Skip to content

Commit 5e21989

Browse files
committed
add code
1 parent 97b298e commit 5e21989

File tree

2 files changed

+160
-1
lines changed

2 files changed

+160
-1
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import requests
2+
import time
3+
import pandas as pd
4+
from lxml import etree
5+
from pyecharts.charts import Bar
6+
from pyecharts import options as opts
7+
from pyecharts.charts import Line
8+
import jieba
9+
import matplotlib.pyplot as plt
10+
from wordcloud import WordCloud
11+
12+
# Comment-list URL template for Douban subject 35155748. The two `{}` slots
# are filled, in order, with the `start` offset and the `sort` order.
base_url = 'https://movie.douban.com/subject/35155748/comments?start={}&limit=20&status=P&sort={}'

# User-agent string sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# HTTP headers passed to every `requests.get` call in this script.
headers = {
    'User-Agent': _USER_AGENT,
    'Referer': 'https://movie.douban.com',
    # NOTE: replace the '.' placeholder with your own cookie.
    'Cookie': '.',
}
19+
20+
21+
def get_comments():
    """Crawl the comment pages under both sort orders and merge the results.

    For each sort order ('time', then 'new_score') up to 25 pages of 20
    comments are requested. Crawling of a sort order stops at the first page
    that yields no users. The script pauses 5 seconds after every page that
    returned data.

    Returns:
        dict: ``{'users': [...], 'times': [...], 'stars': [...],
        'comments': [...]}`` where the four lists are aligned by position.
    """
    collected = {'users': [], 'times': [], 'stars': [], 'comments': []}

    for sort in ('time', 'new_score'):
        # Human-readable label of the sort order, used only for the log line.
        if sort == 'new_score':
            sort_name = "最热"
        else:
            sort_name = '最新'

        for page in range(25):
            print('准备抓取第 {} 页数据, 排序方式:{}'.format(page + 1, sort_name))
            page_url = base_url.format(page * 20, sort)
            users, stars, times, comments = get_comment_by_url(page_url)
            if not users:
                # An empty page means there is nothing more to fetch for this sort.
                break
            collected['users'].extend(users)
            collected['stars'].extend(stars)
            collected['times'].extend(times)
            collected['comments'].extend(comments)
            # Pause 5 seconds after every successful fetch.
            time.sleep(5)

    # Dump everything that was collected, framed by separator lines.
    separator = "#" * 10
    print(separator)
    print(collected['users'])
    print(collected['stars'])
    print(collected['times'])
    print(collected['comments'])
    print(separator)

    return collected
45+
46+
47+
def get_comment_by_url(url, timeout=10):
    """Fetch one comment-list page and extract the fields of every comment.

    Args:
        url: Fully formatted comment-list URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        tuple: ``(users, stars, times, content_list)``, four lists aligned by
        position. ``stars`` holds the rating digit as a one-character string,
        or ``None`` when no digit could be read. ``times`` holds the stripped
        date text, or ``None`` when the date node is missing.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # User, rating, comment time, comment text.
    users, stars, times, content_list = [], [], [], []
    data = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(data.text)

    # Walk over every comment container on the page.
    for comment in selector.xpath('//div[@class="comment"]'):
        user_nodes = comment.xpath('.//h3/span[2]/a/text()')
        text_nodes = comment.xpath('.//p/span/text()')
        if not user_nodes or not text_nodes:
            # Skip a malformed entry rather than abort the whole page with
            # an IndexError.
            continue

        # The rating digit is read from position 7 of the class attribute.
        # NOTE(review): presumably the class looks like "allstarNN ..." and an
        # unrated comment carries a different class here -- confirm against
        # the live markup. Anything that is not a digit is stored as None so
        # junk characters never reach the numeric code downstream.
        star_classes = comment.xpath('.//h3/span[2]/span[2]/@class')
        digit = star_classes[0][7:8] if star_classes else ''
        star = digit if digit.isdigit() else None

        date_nodes = comment.xpath('.//h3/span[2]/span[3]/text()')
        if date_nodes:
            date_time = date_nodes[0].replace("\n", "").strip()
        else:
            date_time = None

        users.append(user_nodes[0])
        stars.append(star)
        times.append(date_time)
        content_list.append(text_nodes[0].strip())

    return users, stars, times, content_list
68+
69+
70+
def format_data(result, key):
    """Return the column stored under ``key`` as a plain list.

    The column may be a mapping (for example ``{'0': 'a', '1': 'b'}``), in
    which case its values are returned in iteration order. It may also be an
    ordinary iterable such as the lists produced by ``get_comments``, which
    is copied as-is. The original implementation only handled mappings and
    raised AttributeError on lists.

    Args:
        result: Mapping from column name to column data.
        key: Name of the column to extract.

    Returns:
        list: The column's values.

    Raises:
        KeyError: If ``key`` is not present in ``result``.
    """
    column = result[key]
    values = getattr(column, 'values', None)
    # Only treat `values` as the accessor when it is a method (dict-like).
    # Objects that expose `values` as a plain attribute are iterated directly.
    if callable(values):
        return list(values())
    return list(column)
75+
76+
77+
# 数量
78+
def show_num(df):
    """Build a bar chart of the number of comments per comment date.

    Args:
        df: DataFrame with a ``times`` column.

    Returns:
        The value of ``Bar.render_notebook()``. It is returned instead of
        being discarded so that a notebook cell ending in ``show_num(df)``
        can display the chart.
    """
    per_day = df.groupby(['times']).size()

    bar = Bar()
    bar.add_xaxis(per_day.index.tolist())
    bar.add_yaxis("数量 & 时间", per_day.values.tolist())
    bar.set_global_opts(xaxis_opts=opts.AxisOpts(name="评论日期", axislabel_opts={"rotate": 30}))
    return bar.render_notebook()
87+
88+
89+
# 星级
90+
def show_star(df):
    """Build a bar chart of the average star rating per comment date.

    Star values that cannot be converted with ``int()`` (for example ``None``
    or a stray non-digit character) are ignored. Dates left without any
    usable rating are omitted from the chart rather than raising.

    Args:
        df: DataFrame with ``times`` and ``stars`` columns.

    Returns:
        The value of ``Bar.render_notebook()``. It is returned instead of
        being discarded so that a notebook cell ending in ``show_star(df)``
        can display the chart.
    """
    averages = {}
    # Group once instead of re-filtering the whole frame for every date.
    for day, group in df.groupby('times'):
        ratings = []
        for star in group['stars']:
            try:
                ratings.append(int(star))
            except (TypeError, ValueError):
                # Missing or non-numeric rating: leave it out of the average.
                continue
        if ratings:
            averages[day] = round(sum(ratings) / len(ratings), 2)

    bar_star = Bar()
    bar_star.add_xaxis(list(averages))
    bar_star.add_yaxis("星级 & 时间", list(averages.values()))
    bar_star.set_global_opts(xaxis_opts=opts.AxisOpts(name="评论日期", axislabel_opts={"rotate": 30}))
    return bar_star.render_notebook()
104+
105+
106+
# 演员
107+
def show_actor(df):
    """Build a line chart of how often each listed actor name is mentioned.

    Every occurrence of a name inside a comment is counted, so a comment that
    mentions the same name twice contributes two.

    Args:
        df: DataFrame with a ``comments`` column of strings.

    Returns:
        The value of ``Line.render_notebook()``. It is returned instead of
        being discarded so that a notebook cell ending in ``show_actor(df)``
        can display the chart.
    """
    roles = {'张译': 0, '吴京': 0, '李九霄': 0, '魏晨': 0, '邓超': 0}

    for row in df['comments']:
        for name in roles:
            roles[name] += row.count(name)

    line = (
        Line()
        .add_xaxis(list(roles.keys()))
        .add_yaxis('', list(roles.values()))
        .set_global_opts(title_opts=opts.TitleOpts(title=""))
    )

    return line.render_notebook()
123+
124+
125+
# 词云
126+
def show_word_cloud(df, font_path='/System/Library/Fonts/PingFang.ttc'):
    """Render a word cloud built from all comment texts.

    Args:
        df: DataFrame with a ``comments`` column of strings.
        font_path: Font file handed to ``WordCloud``. The default is the
            macOS path the script was written with. Pass a different font
            file when that path does not exist on the machine.
    """
    content = "".join(df['comments'])

    # Segment the text into words and hand them to WordCloud as one
    # comma-separated string.
    cloud_word = ','.join(jieba.cut(content))
    wc = WordCloud(font_path=font_path, background_color="white", scale=2.5,
                   contour_color="lightblue", ).generate(cloud_word)

    plt.figure(figsize=(16, 9))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
142+
143+
144+
if __name__ == '__main__':
    # get_comments() returns a dict of aligned lists keyed by
    # users/times/stars/comments, so it is passed to pandas directly. The
    # earlier detour through format_data() called .values() on those lists
    # and raised AttributeError.
    result = get_comments()
    df = pd.DataFrame(result)
    # Both sort orders are crawled, so a comment may have been collected twice.
    df = df.drop_duplicates()

    # The chart helpers render through render_notebook(); call them from a
    # Jupyter cell rather than from this script.
    # show_num(df)
    # show_star(df)
    # show_actor(df)
    # show_word_cloud(df)

doudou/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Python技术 公众号文章代码库
44

5-
+ [douban-movie-top250](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-02-20-douban-movie-top250):实战|数据分析篇之豆瓣电影 TOP250 示例代码
5+
+ [douban-movie-top250](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-02-20-douban-movie-top250):实战|数据分析篇之豆瓣电影 TOP250
66
+ [duo-la-a-meng](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-03-27-duo-la-a-meng):用 Python 画哆啦 A 梦
77
+ [fund-fixed-investment](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-03-27-found):指数基金定投到底能不能赚钱?Python 来告诉你答案
88
+ [pyecharts](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-03-27-pyechars):Python 图表利器 pyecharts
@@ -16,6 +16,7 @@ Python技术 公众号文章代码库
1616
+ [national-day](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-10-13-national-day):国庆旅游热图
1717
+ [Appium](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-10-20-appium):Appium 神器
1818
+ [163 music](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-11-02-163-music):下载网易云乐库
19+
+ [Chinese People's Volunteer Army](https://github.com/JustDoPython/python-examples/tree/master/doudou/2020-11-10-resisting-us-aid-korea):中国人民志愿军
1920

2021
---
2122

0 commit comments

Comments
 (0)