 #Python3.7
 #encoding = utf-8
 
-import requests, asyncio, aiohttp, os, time
-from urllib import parse
+import requests, json, re, os, traceback, datetime, aiohttp, asyncio
 from uuid import uuid4
+from urllib import parse
 from concurrent.futures import ThreadPoolExecutor
 
-tasks = []
-
 headers = {
-    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1585789312844_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&sid=&word=miku',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
-    # fill in your Cookie here
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
+    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E5%88%9D%E9%9F%B3%E6%9C%AA%E6%9D%A5'
 }
 
-session = requests.session()  # keep the login session
-session.headers = headers
-
-def get_html(url):  # fetch the page
-
-    s = time.time()
-    html = session.get(url)
-
-    if html.status_code == 200:  # check the status code
-
-        parse_html(html.json())
-
-    else:
-        print('访问网页错误')
-
-    print('\n\n' + '*' * 40 + '\n\n')
-    print('程序耗时了: {} 秒'.format(time.time() - s))  # print how long the request took
-    print('\n\n' + '*' * 40 + '\n\n')
-
-
-def parse_html(html):  # parse the response JSON
-
-    data = html['data']
-
-    for i in data:
-        try:
-            img = i['middleURL']
-            print(img)
-            tasks.append(download(img))
-        except Exception as e:
-            print(e)
-
-
-async def download(img_url):
-
-    filename = '下载好的图片'
-    if not os.path.exists(filename):
-        os.makedirs(filename)
-
-    async with aiohttp.ClientSession(headers=headers) as session:
-        async with session.get(img_url) as html:
-
-            with open('./{}/{}.jpg'.format(filename, uuid4()), 'wb') as f:
-                f.write(await html.content.read())
-
-
-
-
-if __name__ == '__main__':
-
-    loop = asyncio.get_event_loop()  # create the event loop
-
-    name = parse.quote('初音未来')  # could add an input() here to let the user choose the keyword
-
-    with ThreadPoolExecutor(max_workers=2) as t:
-        for i in range(30, 270, 30):
-            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com' \
-                  '&ipn=rj&ct=201326592&is=&fp=result&queryWord={}' \
-                  '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=' \
-                  '&hd=&latest=&copyright=&word={}&s=&se=&tab=&width=' \
-                  '&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=' \
-                  '&force=&pn={}&rn=30'.format(name, name, i)
-            t.submit(get_html, url)
-
-    loop.run_until_complete(asyncio.wait(tasks))
-    loop.close()  # close the loop
-
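+# download() coroutines are collected here by the fetch threads and awaited together at the end of __main__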
+tasks = []
 
+def get_html(url):  # fetch one result page and hand the JSON to parse_json
+
+    try:
+        html = requests.get(url, headers=headers)
+        json_data = html.text.replace('\\', '')  # strip the redundant backslashes from the JSON payload
+        json_data = json.loads(json_data)
+        parse_json(json_data)
+
+    except json.decoder.JSONDecodeError:
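+        # Baidu's JSON sometimes contains unescaped quotes inside the title fields,
+        # which makes json.loads() fail; strip them with a regex and retry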
+
+        # remove the stray double quotes inside the "fromPageTitle" value
+        fromPageTitle = r'"fromPageTitle":"(.*?)",'
+        json_data = replace_data(fromPageTitle, json_data)
+
+        # remove the stray double quotes inside the "fromPageTitleEnc" value
+        fromPageTitleEnc = r'"fromPageTitleEnc":"(.*?)",'
+        json_data = replace_data(fromPageTitleEnc, json_data)
+
+        json_data = json.loads(json_data)
+        write_error(url, flag='已经成功处理')
+        parse_json(json_data)
+
+    except Exception:
+        write_error(url, flag='未能成功处理')
+
+# parse the JSON and collect the image URLs
+def parse_json(json_data):
+    list_data = json_data['data']
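+    # the last entry of "data" is usually an empty placeholder, hence the [:-1] slice below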
+    for data in list_data[:-1]:
+        image_name = data["fromPageTitleEnc"]
+        for image_data in data["replaceUrl"]:
+            image_url = image_data['ObjURL']
+            tasks.append(download(image_url, image_name))
+
+# download a single image asynchronously
+async def download(image_url, image_name):
+
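+    # the byte string below appears to be the tiny all-black placeholder GIF that Baidu
+    # serves for images it can no longer reach; such downloads are skipped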
+    black_image = b'GIF89a\x04\x00\x08\x00\x91\x02\x00\xff\xff\xff\x00\x00\x00\xff\xff\xff\x00\x00\x00!\xf9\x04\x01\x00\x00\x02\x00,\x00\x00\x00\x00\x04\x00\x08\x00\x00\x02\x05\x94\x8f\xa9\x8b\x05\x00;'
+
+    filename = './百度图片/下载好的图片'
+    if not os.path.exists(filename):
+        os.makedirs(filename)
+
+    print("[INFO]{} 正在下载图片:{}".format(datetime.datetime.now(), image_name))
+
+    async with aiohttp.ClientSession(headers=headers) as session:
+        async with session.get(image_url) as html:
+
+            uuid_id = uuid4()
+            image_file_name = '{}/{}.jpg'.format(filename, uuid_id)
+            content = await html.read()
+
+            # skip the broken all-black placeholder images and "image not found" HTML pages
+            if black_image not in content and b'<!DOCTYPE html>' not in content:
+
+                with open(image_file_name, 'wb') as f:
+                    f.write(content)
+
+                with open('./百度图片/图片映射表.json', 'a+', encoding='utf-8') as f:
+                    json_data = json.dumps(dict(image_name=image_name, id=str(uuid_id)), ensure_ascii=False)
+                    f.write(json_data + '\n')
+
+# strip problematic double quotes with a regex
+def replace_data(re_compile, json_data):
+    re_data = re.compile(re_compile)
+    for i in re_data.findall(json_data):
+        data = i.replace('"', '').replace("\\'", '')
+        json_data = json_data.replace(i, data)
+    return json_data
+
+# append the failure details to an error log
+def write_error(url, flag=None):
+
+    os.makedirs('./百度图片', exist_ok=True)  # make sure the log directory exists
+    with open('./百度图片/错误日志.txt', 'a+', encoding='utf-8') as f:
+        f.write('JSON异常是否处理成功:{}\n'.format(flag))
+        f.write('异常时间:{}\n'.format(datetime.datetime.now()))
+        f.write('异常URL:{}\n'.format(url))
+        f.write(traceback.format_exc() + '\n')
+
+if __name__ == "__main__":
+
+    loop = asyncio.get_event_loop()  # create the event loop
+    name = parse.quote('初音未来')
+
+    with ThreadPoolExecutor(max_workers=2) as t:
+        # page through the results, 30 per request
+        for i in range(30, 120, 30):
+            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592' \
+                  '&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=' \
+                  '&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1' \
+                  '&fr=&expermode=&force=&pn={}&rn=30'.format(name, name, i)
+            t.submit(get_html, url)
+
+    loop.run_until_complete(asyncio.wait(tasks))
+    loop.close()  # close the event loop