|
import random
import re
import time
from collections import Counter

import pymysql
import requests
from pyecharts import options as opts
from pyecharts.charts import BMap, Map, Geo, Bar, Pie, PictorialBar, Boxplot, WordCloud
from pyecharts.globals import ChartType, ThemeType, SymbolType
| 9 | + |
| 10 | + |
class LgCrawler(object):
    """Crawl "python" job postings from lagou.com into MySQL and chart them.

    ``crawler()`` pages through the lagou.com position search and stores one
    row per posting in the ``jobs`` table.  The remaining public methods each
    run an aggregate query against that table and render a pyecharts chart to
    an HTML file in the current working directory.
    """

    # Both are replaced per instance in ``__init__``; ``close()`` resets them.
    conn = None
    cursor = None

    # Parameterised insert shared by ``insert()`` and ``crawler()``.  The
    # column order must match the tuple produced by ``_job_row()``.
    INSERT_SQL = 'INSERT INTO jobs (positionName,workYear,salary,city,education,positionAdvantage,companyLabelList,financeStage,companySize,industryField,firstType) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

    SEARCH_URL = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    # Seconds to wait for the search endpoint before giving up on a request.
    REQUEST_TIMEOUT = 30

    # Request headers sent with every search call.  The cookie is a captured
    # browser session value and is sent verbatim.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
        'Cookie': 'user_trace_token=20200321120912-e091b8e2-ae3a-4e98-b8cc-7eda56613730; LGUID=20200321120912-103e3b3f-4b2d-4b40-aac8-de6f2151b52a; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1584763752; _ga=GA1.2.707847320.1584763752; _gid=GA1.2.1026377415.1584763752; index_location_city=%E5%85%A8%E5%9B%BD; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22170fb47eec2128-04c4426beb9ea8-396d7406-1764000-170fb47eec46c6%22%2C%22%24device_id%22%3A%22170fb47eec2128-04c4426beb9ea8-396d7406-1764000-170fb47eec46c6%22%7D; sajssdk_2015_cross_new_user=1; X_MIDDLE_TOKEN=b44cae2e06dda98341f7fda429c15d04; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; LGSID=20200321151013-aa659974-2803-4434-83e7-ed146560e5e0; PRE_SITE=; X_HTTP_TOKEN=f05004685d58bcda35257748511c75fb5b02e29508; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1584775254; _gat=1; LGRID=20200321152606-05042c06-9cea-4b97-9b47-908278188949',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest'
    }

    # Separators accepted between industry names in ``industryField``.
    _FIELD_SPLIT = re.compile(r"[,丨、\s]+")
    # A salary bound such as "15k" or "30K"; only the number is captured.
    _SALARY_BOUND = re.compile(r"(\d+)\s*[kK]")
    # A run of word characters; everything else separates words.
    _WORD = re.compile(r"\w+")

    def __init__(self, host="127.0.0.1", user="root", password="12345678", database="lagou"):
        """Open the MySQL connection and a cursor on it.

        The defaults reproduce the previously hard-coded local settings so
        ``LgCrawler()`` keeps working; pass real credentials in any shared
        environment.
        """
        # Keyword arguments are used on purpose.
        # NOTE(review): newer PyMySQL releases appear to reject the positional
        # form - confirm against the pinned version.
        self.conn = pymysql.connect(host=host, user=user, password=password, database=database)
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the connection; safe to call more than once."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def insert(self, row=None):
        """Insert one job row and commit.

        :param row: sequence of the eleven column values in ``INSERT_SQL``
            order (see ``_job_row``).
        :raises ValueError: if ``row`` is omitted; the statement has
            placeholders and cannot be executed without values.
        """
        if row is None:
            raise ValueError("insert() needs the row values for INSERT_SQL")
        self.cursor.execute(self.INSERT_SQL, tuple(row))
        self.conn.commit()

    def query(self, sql, args=None):
        """Execute ``sql`` (optionally with parameters) and return all rows."""
        self.cursor.execute(sql, args)
        return self.cursor.fetchall()

    @staticmethod
    def _job_row(job):
        """Map one posting dict from the search response to an insert tuple."""
        # A null label list in the JSON would make ``join`` raise.
        labels = job['companyLabelList'] or []
        return (
            job['positionName'], job['workYear'], job['salary'], job['city'],
            job['education'], job['positionAdvantage'], "|".join(labels),
            job['financeStage'], job['companySize'], job['industryField'],
            job['firstType'],
        )

    def crawler(self):
        """Fetch every result page for the "python" search and store the jobs.

        Pages are requested until as many postings as the first page's
        ``totalCount`` have been stored, or a page comes back empty.  Each
        page is committed on its own, and the crawler sleeps 15-30 seconds
        between pages.

        :raises requests.HTTPError: if the endpoint answers with an error
            status.
        """
        page = 0
        stored = 0
        total = None  # unknown until the first page has been read
        while total is None or stored < total:
            page += 1
            form = {
                'first': 'true' if page == 1 else 'false',
                'pn': page,
                'kd': 'python'
            }
            response = requests.post(self.SEARCH_URL, headers=self.HEADERS, data=form,
                                     timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            position_result = response.json()['content']['positionResult']

            if total is None:
                total = position_result['totalCount']

            jobs = position_result['result']
            if not jobs:
                # An empty page means there is nothing more to read, even if
                # the reported total has not been reached; stop instead of
                # looping forever.
                break

            self.cursor.executemany(self.INSERT_SQL, [self._job_row(job) for job in jobs])
            self.conn.commit()
            stored += len(jobs)

            if stored < total:
                # Pause only when another request will follow.
                time.sleep(random.randint(15, 30))

    def city(self):
        """Render the per-city heat map and the top-20 cities bar chart."""
        sql = 'select city, count(1) counts from jobs group by city'
        results = self.query(sql)
        (
            Geo()
            .add_schema(maptype="china")
            .add(
                "城市热力图",
                list(results),
                type_=ChartType.HEATMAP,
            )
            .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
            .set_global_opts(
                visualmap_opts=opts.VisualMapOpts(),
            ).render("拉钩城市热力图.html")
        )

        sql = 'select city,counts from (select city, count(1) counts from jobs group by city) a order by counts desc limit 20'
        results = self.query(sql)
        citys = [row[0] for row in results]
        values = [row[1] for row in results]
        (
            Bar()
            .add_xaxis(citys)
            .add_yaxis("各城市的招聘数量 Top 20", values)
            .set_global_opts(
                xaxis_opts=opts.AxisOpts(name_rotate=60, name="城市", axislabel_opts={"rotate": 45})
            ).render("拉钩城市招聘图.html")
        )

    def _render_pie(self, sql, title, label_format, filename):
        """Run a two-column (name, count) query and render it as a pie chart."""
        results = self.query(sql)
        (
            Pie()
            .add("", list(results))
            .set_global_opts(title_opts=opts.TitleOpts(title=title))
            .set_series_opts(label_opts=opts.LabelOpts(formatter=label_format))
            .render(filename)
        )

    def education(self):
        """Render the share of postings per required education level."""
        self._render_pie(
            'select education,count(1) counts from jobs group by education',
            '学历占比', "{b}: {c}", "拉勾学历.html",
        )

    def workYear(self):
        """Render the share of postings per required work experience."""
        self._render_pie(
            'select workYear,count(1) counts from jobs group by workYear',
            '工作经验占比', "{b}: {c},{d}%", "拉勾工作年限.html",
        )

    @classmethod
    def _top_industries(cls, values, limit=20):
        """Count industry names across ``industryField`` values.

        :param values: iterable of raw column values; ``None`` and empty
            strings contribute nothing.
        :param limit: maximum number of entries returned.
        :return: list of ``(name, count)`` sorted by count, then name, both
            descending.
        """
        counts = Counter()
        for value in values:
            counts.update(name for name in cls._FIELD_SPLIT.split(value or "") if name)
        ranked = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        return ranked[:limit]

    def field(self):
        """Render the 20 most frequent industries as a pictorial bar chart."""
        results = self.query('select industryField from jobs')
        top = self._top_industries(row[0] for row in results)
        # Axis labels carry the count because the value labels are hidden.
        location = [name + str(count) for name, count in top]
        values = [count for _, count in top]

        (
            PictorialBar()
            .add_xaxis(location)
            .add_yaxis(
                "",
                values,
                label_opts=opts.LabelOpts(is_show=False),
                symbol_size=18,
                symbol_repeat="fixed",
                symbol_offset=[0, 0],
                is_symbol_clip=True,
                symbol=SymbolType.ROUND_RECT,
            )
            .reversal_axis()
            .set_global_opts(
                title_opts=opts.TitleOpts(title="热门行业"),
                xaxis_opts=opts.AxisOpts(is_show=False),
                yaxis_opts=opts.AxisOpts(
                    axistick_opts=opts.AxisTickOpts(is_show=False),
                    axisline_opts=opts.AxisLineOpts(
                        linestyle_opts=opts.LineStyleOpts(opacity=0)
                    ),
                ),
            )
            .render("拉勾行业.html")
        )

    @classmethod
    def _salary_ranges(cls, rows):
        """Group the distinct salary bounds by work-experience bucket.

        :param rows: iterable of ``(workYear, salary)`` pairs, where salary
            is text such as ``"15k-30k"``.
        :return: dict mapping each workYear to the sorted distinct bounds
            (ints, in thousands).  Buckets with no parseable bound are
            omitted so the chart axes and data stay aligned.
        """
        grouped = {}
        for work_year, salary in rows:
            bounds = grouped.setdefault(work_year, set())
            bounds.update(int(n) for n in cls._SALARY_BOUND.findall(salary or ""))
        return {year: sorted(bounds) for year, bounds in grouped.items() if bounds}

    def salary(self):
        """Render a box plot of salary bounds per work-experience bucket."""
        # The bounds are parsed in Python rather than with SQL ``replace`` so
        # that "15K" or "8k以上" do not break the integer conversion.
        sql = 'SELECT workYear,salary FROM jobs group by workYear,salary order by workYear'
        ranges = self._salary_ranges(self.query(sql))

        c = Boxplot()
        c.add_xaxis(list(ranges.keys()))
        c.add_yaxis("薪资与工作经验", c.prepare_data(list(ranges.values())))
        c.set_global_opts(title_opts=opts.TitleOpts(title="薪资与工作经验"))
        c.render("拉勾薪资.html")

    @classmethod
    def _word_frequencies(cls, rows):
        """Count words across ``(positionAdvantage, companyLabelList)`` rows.

        :return: list of ``(word, count)`` in first-seen order.  ``None``
            columns are treated as empty text.
        """
        counts = Counter()
        for advantage, labels in rows:
            counts.update(cls._WORD.findall(advantage or ""))
            counts.update(cls._WORD.findall(labels or ""))
        return list(counts.items())

    def ciyun(self):
        """Render a word cloud of position advantages and company labels."""
        results = self.query('select positionAdvantage,companyLabelList from jobs')
        words = self._word_frequencies(results)

        (
            WordCloud()
            .add(series_name="热点分析", data_pair=words, word_size_range=[6, 66])
            .set_global_opts(
                title_opts=opts.TitleOpts(
                    title="热点分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
                ),
                tooltip_opts=opts.TooltipOpts(is_show=True),
            )
            .render("拉勾福利.html")
        )

    def companySize(self):
        """Render the share of postings per company size."""
        self._render_pie(
            'select companySize,count(1) counts from jobs group by companySize',
            '企业大小', "{b}: {c},{d}%", "拉勾企业大小.html",
        )

    def financeStage(self):
        """Render the share of postings per company financing stage."""
        self._render_pie(
            'select financeStage,count(1) counts from jobs group by financeStage',
            '企业融资占比', "{b}: {c},{d}%", "拉勾融资.html",
        )
if __name__ == '__main__':
    # Script entry point: build a crawler and render only the industry
    # ("field") chart.
    crawler = LgCrawler()
    crawler.field()
0 commit comments