1+ import random
2+ import time
3+
4+ import requests
5+ from openpyxl import Workbook
6+ import pymysql .cursors
7+
8+
def get_conn(host='localhost', user='root', password='root', database='python'):
    """Open a connection to the MySQL database that stores scraped rows.

    Called with no arguments this connects exactly as before (local server,
    ``root``/``root``, schema ``python``); the keyword parameters allow other
    deployments to override those settings without editing the function.

    Args:
        host: MySQL server host name.
        user: Account used to log in.
        password: Password for ``user``.
        database: Schema to select after connecting.

    Returns:
        An open ``pymysql`` connection using the ``utf8mb4`` character set and
        ``DictCursor`` as its cursor class. The caller is responsible for
        closing it.

    Raises:
        Whatever ``pymysql.connect`` raises when the server cannot be reached
        or the credentials are rejected.
    """
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        # ``database`` is the supported keyword; ``db`` is a deprecated alias.
        database=database,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
18+
19+
def insert(conn, info):
    """Insert one scraped job posting into the ``python`` table and commit.

    Args:
        conn: Open DB-API connection (as returned by ``get_conn``).
        info: Sequence of seven values, in column order: shortname, fullname,
            industryfield, companySize, salary, city, education.

    Raises:
        Any exception raised by ``execute`` or ``commit`` is re-raised after
        the transaction has been rolled back.
    """
    sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    try:
        # Values are passed separately so the driver escapes them.
        with conn.cursor() as cursor:
            cursor.execute(sql, info)
        conn.commit()
    except Exception:
        # Leave the shared connection in a clean state for the caller
        # instead of mid-transaction, then let the error propagate.
        conn.rollback()
        raise
26+
27+
def get_json(url, page, lang_name, timeout=10):
    """Fetch one page of job postings and return them as rows.

    Sends a form-encoded POST to ``url`` carrying the page number and search
    keyword, then reads ``content -> positionResult -> result`` from the JSON
    reply.

    Args:
        url: Endpoint to POST to.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper indefinitely.

    Returns:
        A list with one 7-item list per posting, ordered as: company short
        name, company full name, industry field, company size, salary, city,
        education. A key missing from a posting is replaced by ``'无'``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the reply body is not valid JSON.
        KeyError: If the reply lacks the expected ``content`` nesting.
    """
    # No explicit Content-Length here: requests computes it from the encoded
    # form body, and a fixed value would not match other pages or keywords.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    }
    form = {'first': 'false', 'pn': page, 'kd': lang_name}

    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    payload = response.json()
    postings = payload['content']['positionResult']['result']

    # Column order must match the INSERT statement and the spreadsheet rows.
    fields = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    return [[job.get(field, '无') for field in fields] for job in postings]
60+
61+
def main(save_to_db=False):
    """Scrape job postings for five cities into an Excel workbook.

    For each city, pages 1 through 30 are requested via ``get_json`` with a
    random 10-20 second pause after each page, and every returned row is
    appended to a single worksheet. The workbook is written to
    ``'python职位信息.xlsx'`` when the run ends, including when it ends early
    because of an error or Ctrl-C, so rows collected so far are not lost.

    Args:
        save_to_db: When true, also open a database connection via
            ``get_conn`` and store every row with ``insert``. Defaults to
            false, in which case no database connection is opened.
    """
    lang_name = 'python'
    cities = ['北京', '上海', '广州', '深圳', '杭州']
    last_page = 30  # pages fetched per city

    wb = Workbook()
    sheet = wb.active
    sheet.title = lang_name
    conn = get_conn() if save_to_db else None

    try:
        for city in cities:
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, last_page + 1):
                rows = get_json(url, page, lang_name)
                # Report the page that was just fetched.
                print(city, 'page', page)
                for row in rows:
                    if conn is not None:
                        insert(conn, tuple(row))
                    sheet.append(row)
                # Random pause between requests.
                time.sleep(random.randint(10, 20))
    finally:
        # Save first so a failing close() cannot discard the collected rows.
        try:
            wb.save('{}职位信息.xlsx'.format(lang_name))
        finally:
            if conn is not None:
                conn.close()
81+
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments