1+ # -*- coding:utf-8 -*-
2+
3+ import re
4+ import requests
5+ import os
6+ import urllib .request
7+ import ssl
8+
9+ from urllib .parse import urlsplit
10+ from os .path import basename
11+ import json
12+
# NOTE(review): disables TLS certificate verification for every HTTPS request
# in this process — presumably to keep urlretrieve working despite an
# incomplete local CA bundle; confirm this trade-off is intended.
ssl._create_default_https_context = ssl._create_unverified_context

# Browser-like request headers so the Zhihu API treats us as a normal client.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    'Accept-Encoding': 'gzip, deflate',
}
19+
def get_image_url(qid, title):
    """Collect image URLs from all answers of a Zhihu question.

    Pages through the answers API 10 answers at a time. For every answer,
    extracts the ``data-original`` image URLs from the HTML content,
    deduplicates them per answer, and appends the full-resolution ones
    (ending in ``r.jpg``) to ``<title>.txt`` via ``write_file``.

    Parameters:
        qid:   numeric question id (int or str).
        title: question title, used as the output file name.

    Returns:
        None. Stops when the API returns an empty ``data`` list.
    """
    answers_url = 'https://www.zhihu.com/api/v4/questions/' + str(qid) + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={}&limit=10&sort_by=default&platform=desktop'
    offset = 0
    session = requests.Session()

    # Hoisted out of the loop: compile the pattern once instead of per page.
    pic_re = re.compile('data-original="(.*?)"', re.S)

    while True:
        page = session.get(answers_url.format(offset), headers=headers)
        json_text = json.loads(page.text)
        answers = json_text['data']

        offset += 10

        # Empty page means we have walked past the last answer.
        if not answers:
            print('获取图片地址完成')
            return

        for answer in answers:
            tmp_list = []
            pic_urls = re.findall(pic_re, answer['content'])

            for item in pic_urls:
                # Strip JSON escape backslashes (was replace("\\ ", ""),
                # which only removed backslash+space pairs) and drop the
                # query string so duplicates compare equal.
                pic_url = item.replace("\\", "")
                pic_url = pic_url.split('?')[0]

                # Deduplicate within this answer.
                if pic_url not in tmp_list:
                    tmp_list.append(pic_url)

            for pic_url in tmp_list:
                # Keep only the original-resolution images.
                if pic_url.endswith('r.jpg'):
                    print(pic_url)
                    write_file(title, pic_url)
56+
def write_file(title, pic_url):
    """Append one image URL, terminated by a newline, to ``<title>.txt``.

    Parameters:
        title:   question title; the output file is ``<title>.txt``.
        pic_url: the URL string to append.
    """
    file_name = title + '.txt'

    # Context manager guarantees the handle is closed even if the write
    # raises; explicit UTF-8 avoids platform-default-encoding surprises.
    # Original wrote '\n ' (newline + stray space), which corrupted the
    # file for read_file; write a plain newline.
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(pic_url + '\n')
63+
def read_file(title):
    """Return the deduplicated list of URLs stored in ``<title>.txt``.

    Preserves first-seen order. Returns an empty list when the file does
    not exist yet (nothing collected for this question).

    Parameters:
        title: question title; the input file is ``<title>.txt``.

    Returns:
        list[str] of unique, whitespace-stripped URLs.
    """
    file_name = title + '.txt'

    pic_urls = []

    # No file yet — get_image_url has not run for this question.
    if not os.path.exists(file_name):
        return pic_urls

    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the trailing newline and any stray spaces.
            # The original replace("\n ", "") never matched (the newline is
            # the last character of each line), so URLs kept their '\n'.
            url = line.strip()
            if url and url not in pic_urls:
                pic_urls.append(url)

    print("文件中共有{}个不重复的 URL".format(len(pic_urls)))
    return pic_urls
81+
def download_pic(pic_urls, title):
    """Download every URL in ``pic_urls`` into the directory ``title``.

    Skips files that already exist on disk, counts successes / repeats /
    failures, and prints any failed URLs at the end.

    Parameters:
        pic_urls: iterable of image URLs to fetch.
        title:    target directory name (created if missing).

    Returns:
        None. Progress and a summary are written to stdout.
    """
    # Create the target folder on first use.
    if not os.path.exists(title):
        os.makedirs(title)

    error_pic_urls = []
    success_pic_num = 0
    repeat_pic_num = 0

    total = len(pic_urls)

    # enumerate replaces the hand-maintained `index` counter; the original
    # incremented it *before* the success message, so the displayed
    # progress was off by one.
    for index, url in enumerate(pic_urls, start=1):
        # File name = last path component of the URL, inside the folder.
        file_name = os.sep.join((title, basename(urlsplit(url)[2])))

        if os.path.exists(file_name):
            print("图片{}已存在".format(file_name))
            repeat_pic_num += 1
            continue

        try:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt): urlretrieve failures surface as
            # URLError/HTTPError (OSError subclasses) or ValueError
            # for malformed URLs.
            urllib.request.urlretrieve(url, file_name)
        except (OSError, ValueError):
            print("下载{}失败!({}/{})".format(file_name, index, total))
            error_pic_urls.append(url)
        else:
            success_pic_num += 1
            print("下载{}完成!({}/{})".format(file_name, index, total))

    print("图片全部下载完毕!(成功:{}/重复:{}/失败:{})".format(success_pic_num, repeat_pic_num, len(error_pic_urls)))

    if len(error_pic_urls) > 0:
        print('下面打印失败的图片地址')
        for error_url in error_pic_urls:
            print(error_url)
120+
if __name__ == '__main__':

    # Target question: id and title (the title doubles as the URL-list
    # file name and the download directory name).
    question_id = 406321189
    question_title = '你们身边有什么素人美女吗(颜值身材巨好的那种)?'

    # Step 1: crawl the answers and persist the image URLs to disk.
    get_image_url(question_id, question_title)

    # Step 2: load the deduplicated URL list back and download the images.
    saved_urls = read_file(question_title)
    download_pic(saved_urls, question_title)