Commit 5eeff37

Author: mochazi (committed)
Baidu Crawler Update🎉
1 parent 868d2a0 · commit 5eeff37

File tree

2 files changed: +100 −75 lines


README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 # **Python3Webcrawler**
-## **[Bilibili author: -相依-](https://space.bilibili.com/343154012)**  **UPDATE 2021-01-10**
+## **[Bilibili author: -相依-](https://space.bilibili.com/343154012)**  **UPDATE 2021-01-18**
 > **A few carefully chosen crawlers to build a foundation before you learn the Scrapy framework.**
 >> **This project is for study and exchange only. Do not use it for commercial purposes; if anything infringes your rights, please contact us for removal.**
```

百度图片/crawl.py

Lines changed: 99 additions & 74 deletions
```diff
@@ -1,84 +1,109 @@
 #Python3.7
 #encoding = utf-8
 
-import requests,asyncio,aiohttp,os,time
-from urllib import parse
+import requests,json,re,os,traceback,datetime,aiohttp,asyncio
 from uuid import uuid4
+from urllib import parse
 from concurrent.futures import ThreadPoolExecutor
 
-tasks = []
-
 headers = {
-    'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1585789312844_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&sid=&word=miku',
-    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
-    #please fill in your Cookie here
+    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
+    'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E5%88%9D%E9%9F%B3%E6%9C%AA%E6%9D%A5'
 }
 
-session = requests.session()#keep the login session
-session.headers = headers
-
-def get_html(url):#fetch the page
-
-    s= time.time()
-    html = session.get(url)
-
-    if html.status_code == 200:#check the status code
-
-        parse_html(html.json())
-
-    else:
-        print('访问网页错误')
-
-    print('\n\n'+'*'*40+'\n\n')
-    print('程序耗时了: {} 秒'.format(time.time()-s))#print the elapsed time
-    print('\n\n'+'*'*40+'\n\n')
-
-
-def parse_html(html):#parse the response
-
-    data = html['data']
-
-    for i in data:
-        try:
-            img = i['middleURL']
-            print(img)
-            tasks.append(download(img))
-        except Exception as e:
-            print(e)
-
-
-async def download(img_url):
-
-    filename = '下载好的图片'
-    if not os.path.exists(filename):
-        os.makedirs(filename)
-
-    async with aiohttp.ClientSession(headers = headers) as session:
-        async with session.get(img_url) as html:
-
-            with open('./{}/{}.jpg'.format(filename,uuid4()),'wb') as f:
-                f.write(await html.content.read())
-
-
-
-
-if __name__ == '__main__':
-
-    loop = asyncio.get_event_loop()#create the event loop
-
-    name = parse.quote('初音未来')#swap in an input() if you want user-supplied keywords
-
-    with ThreadPoolExecutor(max_workers = 2) as t:
-        for i in range(30,270,30):
-            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com'\
-                  '&ipn=rj&ct=201326592&is=&fp=result&queryWord={}'\
-                  '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic='\
-                  '&hd=&latest=&copyright=&word={}&s=&se=&tab=&width='\
-                  '&height=&face=0&istype=2&qc=&nc=1&fr=&expermode='\
-                  '&force=&pn={}&rn=30'.format(name,name,i)
-            t.submit(get_html,url)
-
-    loop.run_until_complete(asyncio.wait(tasks))
-    loop.close()#close the program
-
+tasks = []
 
+def get_html(url):
+
+    try:
+        html = requests.get(url,headers=headers)
+        json_data = html.text.replace('\\','')#strip the redundant backslashes from the JSON payload
+        json_data = json.loads(json_data)
+        parse_json(json_data)
+
+    except json.decoder.JSONDecodeError:
+
+        #strip the stray double quotes inside the "fromPageTitle" value
+        fromPageTitle = r'"fromPageTitle":"(.*?)",'
+        json_data = replace_data(fromPageTitle,json_data)
+
+        #strip the stray double quotes inside the "fromPageTitleEnc" value
+        fromPageTitle = r'"fromPageTitleEnc":"(.*?)",'
+        json_data = replace_data(fromPageTitle,json_data)
+
+        json_data = json.loads(json_data)
+        write_error(url,flag='已经成功处理')
+        parse_json(json_data)
+
+    except Exception:
+        write_error(url,flag='未能成功处理')
+
+#parse the JSON and collect the image URLs
+def parse_json(json_data):
+    list_data = json_data['data']
+    for data in list_data[:-1]:
+        image_name = data["fromPageTitleEnc"]
+        for image_data in data["replaceUrl"]:
+            image_url = image_data['ObjURL']
+            tasks.append(download(image_url,image_name))
+
+#download one image
+async def download(image_url,image_name):
+
+    black_image = b'GIF89a\x04\x00\x08\x00\x91\x02\x00\xff\xff\xff\x00\x00\x00\xff\xff\xff\x00\x00\x00!\xf9\x04\x01\x00\x00\x02\x00,\x00\x00\x00\x00\x04\x00\x08\x00\x00\x02\x05\x94\x8f\xa9\x8b\x05\x00;'
+
+    filename = './百度图片/下载好的图片'
+    if not os.path.exists(filename):
+        os.makedirs(filename)
+
+    print("[INFO]{} 正在下载图片:{}".format(datetime.datetime.now(),image_name))
+
+    async with aiohttp.ClientSession(headers = headers) as session:
+        async with session.get(image_url) as html:
+
+            uuid_id = uuid4()
+            image_file_name = '{}/{}.jpg'.format(filename,uuid_id)
+
+            #filter out the broken all-black placeholder image and "not found" HTML pages
+            if black_image not in await html.read() and b'<!DOCTYPE html>' not in await html.read():
+
+                with open(image_file_name,'wb') as f:
+                    f.write(await html.read())
+
+                with open('./百度图片/图片映射表.json','a+',encoding='utf-8') as f:
+                    json_data = json.dumps(dict(image_name = image_name,id=str(uuid_id)),ensure_ascii=False)
+                    f.write(json_data + '\n')
+
+#use a regex to drop the malformed double quotes
+def replace_data(re_compile,json_data):
+    re_data = re.compile(re_compile)
+    for i in re_data.findall(json_data):
+        data = i.replace('"','').replace("\\'",'')
+        json_data = json_data.replace(i,data)
+    return json_data
+
+#log the exception
+def write_error(url,flag=None):
+
+    with open('./百度图片/错误日志.txt','a+',encoding='utf-8') as f:
+        f.write('JSON异常是否处理成功:{}\n'.format(flag))
+        f.write('异常时间:{}\n'.format(datetime.datetime.now()))
+        f.write('异常URL:{}\n'.format(url))
+        f.write(traceback.format_exc() + '\n')
+
+if __name__ == "__main__":
+
+    loop = asyncio.get_event_loop()#create the event loop
+    name = parse.quote('初音未来')
+
+    with ThreadPoolExecutor(max_workers = 2) as t:
+        #page through the results, 30 per request
+        for i in range(30,120,30):
+            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592'\
+                  '&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='\
+                  '&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1'\
+                  '&fr=&expermode=&force=&pn={}&rn=30'.format(name,name,i)
+            t.submit(get_html,url)
+
+    loop.run_until_complete(asyncio.wait(tasks))
+    loop.close()#close the loop
```
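
The interesting piece of this rewrite is the repair path in get_html(): Baidu's acjson endpoint sometimes returns pseudo-JSON in which the fromPageTitle / fromPageTitleEnc values contain unescaped double quotes, so json.loads raises JSONDecodeError until replace_data() strips them out. A minimal standalone sketch of the same regex-repair idea; the broken payload below is a hypothetical example, not real Baidu output:

```python
import json
import re

# Hypothetical example of the kind of broken payload the crawler repairs:
# an unescaped quote inside the fromPageTitle value breaks json.loads.
broken = '{"fromPageTitle":"Hatsune "Miku" wallpaper","width":800}'

def repair(pattern, text):
    # For each captured title value, drop the stray quotes so the
    # surrounding JSON string literal becomes parseable again.
    for value in re.findall(pattern, text):
        text = text.replace(value, value.replace('"', ''))
    return text

try:
    data = json.loads(broken)
except json.decoder.JSONDecodeError:
    fixed = repair(r'"fromPageTitle":"(.*?)",', broken)
    data = json.loads(fixed)

print(data['fromPageTitle'])  # -> Hatsune Miku wallpaper
```

The same pattern is applied to both title keys before json.loads is retried, which is why the commit calls replace_data() twice.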

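A smaller detail in download(): the body is awaited via html.read() three times (twice in the filter check, once for the file write). aiohttp caches the payload after the first read(), so the later awaits return the same bytes rather than re-downloading, but reading once into a local variable is the more conventional shape. A sketch of that variant, assuming a placeholder URL and an abbreviated black_image constant (both are illustrations, not from the commit):

```python
import asyncio
import aiohttp

# Abbreviated stand-in for the full placeholder-GIF bytes the commit embeds.
BLACK_IMAGE = b'GIF89a\x04\x00\x08\x00\x91\x02\x00'

async def fetch_image(url, headers=None):
    # Read the body once and reuse the cached bytes for both checks
    # and for the eventual write.
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            body = await resp.read()
            if BLACK_IMAGE in body or b'<!DOCTYPE html>' in body:
                return None  # placeholder image or an HTML error page
            return body

if __name__ == '__main__':
    # Placeholder URL for illustration only.
    loop = asyncio.get_event_loop()
    body = loop.run_until_complete(fetch_image('https://example.com/sample.jpg'))
    loop.close()
```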