import csv
import json
import os
from typing import List, Optional
from urllib import parse
from uuid import uuid4

import requests
from bs4 import BeautifulSoup

# Main domain
DOMAIN_URL = 'https://book.douban.com'

# Request headers:
#   user-agent (required)
#   Referer (optional; include it if you have one)
#   Cookie (optional; include it if you are logged in to an account)
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'Referer': 'https://book.douban.com/',
    'Cookie': '<fill in your Cookie>'
}

# Set used to deduplicate crawled results
RESULT_SET_DATA = set()

def get_book_tag_url(split_number: Optional[int] = None) -> List[str]:
    '''
    Fetch the book tag links from the home page.
    params:
        split_number: int --> how many tag links to crawl; defaults to all of them
    return: List[str] --> the tag links to crawl
    '''
    html = requests.get(url=DOMAIN_URL, headers=HEADERS)
    soup = BeautifulSoup(html.text, 'lxml')

    tag_url_list_data = [
        DOMAIN_URL + parse.quote(tag_url['href'])
        for tag_url in soup.select('ul.hot-tags-col5.s ul a')
    ]

    if split_number:
        tag_url_list_data = tag_url_list_data[:split_number]

    return tag_url_list_data

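# Quick illustration of the URL building above, assuming a hypothetical tag '小说':
# parse.quote() percent-encodes the non-ASCII path while leaving '/' intact, so
# '/tag/小说' becomes '/tag/%E5%B0%8F%E8%AF%B4' and the full link is
# 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4'.

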
def parse_book_url_info(
        tag_url_list_data: List[str],
        parse_number: int = 3,
        write_json_type: bool = True,
        write_csv_type: bool = True,
        write_image_type: bool = True
) -> List[dict]:
    '''
    Parse each tag_url, page through its listing, and collect the book details.
    params:
        tag_url_list_data: List[str] --> the book tag links
        parse_number: int --> how many pages to crawl per tag; defaults to 3
        write_json_type: bool --> whether to write the results to a JSON file
        write_csv_type: bool --> whether to write the results to a CSV file
        write_image_type: bool --> whether to download the cover images
    return: List[dict] --> the successfully crawled book details
    '''
    book_info_list_data = []

    for tag_url in tag_url_list_data:

        # Page through the listing; each page holds 20 books,
        # so parse_number=3 yields start=0, 20, 40
        for start in range(0, parse_number * 20, 20):

            # URL of the current page
            page_url = f'{tag_url}?start={start}'

            html = requests.get(url=page_url, headers=HEADERS)
            soup = BeautifulSoup(html.text, 'lxml')

            # Select the book entries
            books = soup.select('li.subject-item')

            for book in books:

                # Book detail link
                book_url = book.select_one('.info h2 a')['href']

                # Title
                title = book.select_one('.info h2 a').text.strip().replace(' ', '').replace('\n', '')

                # Author / publication info
                info = book.select_one('.info div.pub').text.strip().replace(' ', '').replace('\n', '')

                # Rating (missing for rarely rated books, so guard against None)
                star_tag = book.select_one('.rating_nums')
                star = star_tag.text.strip() if star_tag else ''

                # Rating count
                pl = book.select_one('.pl').text.strip().replace(' ', '').replace('\n', '')

                # Short description
                introduce = book.select_one('.info p').text.strip().replace(' ', '').replace('\n', '')

                # Cover image URL
                image_url = book.select_one('.nbg img')['src']

                book_info_result = dict(
                    书本链接=book_url,
                    书名=title,
                    作者=info,
                    评分=star,
                    评价=pl,
                    书本简介=introduce,
                    图片链接=image_url
                )

                # Hash the record to get a dedup key
                result_hash_data = hash(json.dumps(book_info_result, ensure_ascii=False))

                if result_hash_data not in RESULT_SET_DATA:

                    # Remember the record in the dedup set
                    RESULT_SET_DATA.add(result_hash_data)

                    if write_image_type:
                        write_image_book_info(
                            image_url=image_url,
                            image_name=title,
                            headers=HEADERS
                        )

                    # Write to the JSON file if requested
                    if write_json_type:
                        write_json_book_info(book_info_result)

                    # Write to the CSV file if requested
                    if write_csv_type:
                        write_csv_book_info(
                            headers=list(book_info_result.keys()),
                            book_info=list(book_info_result.values())
                        )

                    print(book_info_result)

                    book_info_list_data.append(book_info_result)

    return book_info_list_data

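# Note: hash() on a str is salted per interpreter process (PYTHONHASHSEED), so
# RESULT_SET_DATA only deduplicates within a single run. A minimal sketch of a
# run-stable key, assuming the same record dicts as above (not wired into the
# crawler):
def stable_record_key(record: dict) -> str:
    import hashlib
    # sort_keys makes logically equal dicts serialize identically
    payload = json.dumps(record, ensure_ascii=False, sort_keys=True)
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

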
def write_image_book_info(image_url: str, image_name: str, headers: dict):
    '''
    Save the cover image and append to the image-name mapping JSON file.
    params:
        image_url: str --> image link
        image_name: str --> image name (the book title)
        headers: dict --> request headers
    '''
    # UUID keeps the image file names unique
    uuid_id = uuid4()

    image_dir = './保存图片/图片'

    image_file_name = f'{image_dir}/{uuid_id}.jpg'

    image_map_file_name = './保存图片/image_map_data.json'

    # Create the folder if it does not exist yet
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    response = requests.get(url=image_url, headers=headers)

    # Write the image bytes
    with open(image_file_name, 'wb') as f:
        f.write(response.content)

    # Append one mapping record (book title -> UUID file name) per line
    with open(image_map_file_name, 'a+', encoding='utf-8') as f:
        f.write(json.dumps(dict(image_name=image_name, uuid=str(uuid_id), image_url=image_url), ensure_ascii=False) + '\n')


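# The mapping file written above is JSON Lines (one object per line); a minimal
# sketch for reading it back:
def load_image_map(path: str = './保存图片/image_map_data.json') -> List[dict]:
    with open(path, encoding='utf-8') as f:
        # Skip blank lines and parse each record separately
        return [json.loads(line) for line in f if line.strip()]

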
def write_json_book_info(book_info: dict):
    '''
    Write one book record to the JSON file.
    params:
        book_info: dict --> the successfully crawled book details
    '''
    with open('book_info.json', 'a+', encoding='utf-8') as f:
        # json.dumps() turns the dict into a JSON str;
        # ensure_ascii=False keeps the Chinese text readable instead of \u escapes
        f.write(json.dumps(book_info, ensure_ascii=False) + '\n')


def write_csv_book_info(headers: list, book_info: list):
    '''
    Write one book record to the CSV file (with a header row).
    params:
        headers: list --> CSV header row
        book_info: list --> the successfully crawled book details
    '''
    # Cross-platform note: on Windows, csv.writer emits an extra blank line
    # after each row unless the file is opened with newline='';
    # the option is harmless on platforms that do not show the problem.

    # If the CSV file does not exist yet, create it with the header row
    if not os.path.exists('book_info.csv'):
        with open('book_info.csv', 'a+', encoding='utf-8', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerow(headers)

    # Append the record as one CSV row
    with open('book_info.csv', 'a+', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(book_info)


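# If Excel on Windows shows the Chinese headers garbled, writing the CSV with
# encoding='utf-8-sig' (UTF-8 with a BOM) is a common fix; plain utf-8 is kept
# here to match the JSON output.
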
if __name__ == '__main__':

    book_tag_url = get_book_tag_url(1)

    book_url_info = parse_book_url_info(book_tag_url)
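
# A fuller run might look like this (hypothetical arguments): crawl five tags,
# two pages each, without downloading cover images:
#   parse_book_url_info(get_book_tag_url(5), parse_number=2, write_image_type=False)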