|
| 1 | +import requests |
| 2 | +import time |
| 3 | +import pandas as pd |
| 4 | +from lxml import etree |
| 5 | +from pyecharts.charts import Bar |
| 6 | +from pyecharts import options as opts |
| 7 | +from pyecharts.charts import Line |
| 8 | +import jieba |
| 9 | +import matplotlib.pyplot as plt |
| 10 | +from wordcloud import WordCloud |
| 11 | + |
# URL template for one page of comments. The first placeholder receives the
# `start` offset and the second the `sort` query value.
base_url = 'https://movie.douban.com/subject/35155748/comments?start={}&limit=20&status=P&sort={}'
# HTTP headers passed to every comment-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'Referer': 'https://movie.douban.com',
    # NOTE: replace the placeholder value below with your own cookie.
    'Cookie': '.'
}
| 19 | + |
| 20 | + |
def get_comments():
    """Collect comments for both sort orders and return them as parallel lists.

    For each sort value ('time', then 'new_score') up to 25 pages are
    requested through ``get_comment_by_url``; paging for a sort order stops
    early as soon as a page yields no users.

    Returns:
        A dict with the keys 'users', 'times', 'stars' and 'comments', each
        mapping to a list accumulated across all fetched pages.
    """
    user_list, star_list, time_list, comment_list = [], [], [], []
    for sort in ['time', 'new_score']:
        # Human-readable label of the sort order, used only in the progress message.
        sort_name = "最热" if sort == 'new_score' else '最新'
        for start in range(25):
            print('准备抓取第 {} 页数据, 排序方式:{}'.format(start + 1, sort_name))
            # The page index is turned into an item offset (20 items per page).
            users, stars, times, comments = get_comment_by_url(base_url.format(start * 20, sort))
            if not users:
                break
            user_list += users
            star_list += stars
            time_list += times
            comment_list += comments
            # Pause 5 seconds after every fetched page.
            time.sleep(5)
    # NOTE(review): the original indentation was lost; these debug prints are
    # assumed to run once after all pages were fetched - confirm.
    print("#" * 10)
    print(user_list)
    print(star_list)
    print(time_list)
    print(comment_list)
    print("#" * 10)

    comments_dic = {'users': user_list, 'times': time_list, 'stars': star_list, 'comments': comment_list}
    return comments_dic
| 45 | + |
| 46 | + |
def get_comment_by_url(url):
    """Fetch one comment page and extract the fields of every comment on it.

    Args:
        url: Fully formatted comment-page URL.

    Returns:
        A 4-tuple of equally long lists ``(users, stars, times, contents)``:
        user names, one-character star values, comment time strings (``None``
        when the time node is absent) and stripped comment texts. Comments
        that lack a user name, a star class or a text node are skipped so
        that the four lists always stay aligned.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    users, stars, times, content_list = [], [], [], []
    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(data.text)
    if selector is None:
        # Nothing parseable came back; report an empty page to the caller.
        return users, stars, times, content_list

    for comment in selector.xpath('//div[@class="comment"]'):
        user_nodes = comment.xpath('.//h3/span[2]/a/text()')
        star_classes = comment.xpath('.//h3/span[2]/span[2]/@class')
        text_nodes = comment.xpath('.//p/span/text()')
        if not (user_nodes and star_classes and text_nodes):
            # Indexing an empty XPath result would raise IndexError and lose
            # the entire page because of a single incomplete comment.
            continue

        time_nodes = comment.xpath('.//h3/span[2]/span[3]/text()')
        if time_nodes:
            date_time = time_nodes[0].replace("\n", "").strip()
        else:
            date_time = None

        users.append(user_nodes[0])
        # The star value is the 8th character of the class attribute
        # (presumably a class such as 'allstarN0' - TODO confirm).
        stars.append(star_classes[0][7:8])
        times.append(date_time)
        content_list.append(text_nodes[0].strip())
    return users, stars, times, content_list
| 68 | + |
| 69 | + |
def format_data(result, key):
    """Return the values stored under ``result[key]`` as a new list.

    Args:
        result: Mapping from column name to column data.
        key: Name of the column to extract.

    Returns:
        A list of the column's values. If the column exposes a callable
        ``values`` (a dict, e.g. data loaded back from JSON) its values are
        used; any other iterable, such as the plain lists produced by
        ``get_comments``, is copied element by element.

    Raises:
        KeyError: if ``key`` is not present in ``result``.
    """
    column = result[key]
    values = getattr(column, 'values', None)
    if callable(values):
        return list(values())
    # Plain sequences have no .values(); calling it used to raise AttributeError.
    return list(column)
| 75 | + |
| 76 | + |
# Comment count
def show_num(df):
    """Build a bar chart of the number of comments per ``times`` value.

    Args:
        df: DataFrame with a 'times' column.

    Returns:
        The object produced by ``Bar.render_notebook()``. It is returned
        (instead of being discarded) so that a notebook can display it when
        the call is the last expression of a cell.
    """
    counts = df.groupby(['times']).size()
    bar = Bar()
    bar.add_xaxis(counts.index.tolist())
    bar.add_yaxis("数量 & 时间", counts.values.tolist())
    bar.set_global_opts(xaxis_opts=opts.AxisOpts(name="评论日期", axislabel_opts={"rotate": 30}))
    return bar.render_notebook()
| 87 | + |
| 88 | + |
# Star rating
def show_star(df):
    """Build a bar chart of the average star value per ``times`` value.

    Args:
        df: DataFrame with a 'times' column and a 'stars' column whose
            entries can be converted with ``int``.

    Returns:
        The object produced by ``Bar.render_notebook()``. It is returned
        (instead of being discarded) so that a notebook can display it when
        the call is the last expression of a cell.

    Raises:
        ValueError: if a star entry cannot be converted to ``int``.
    """
    averages = {}
    # Iterate the groups directly instead of re-filtering the whole frame
    # once per key; every group is non-empty, so the division is safe.
    for day, group in df.groupby('times')['stars']:
        ratings = [int(star) for star in group]
        averages[day] = round(sum(ratings) / len(ratings), 2)

    bar_star = Bar()
    bar_star.add_xaxis(list(averages.keys()))
    bar_star.add_yaxis("星级 & 时间", list(averages.values()))
    bar_star.set_global_opts(xaxis_opts=opts.AxisOpts(name="评论日期", axislabel_opts={"rotate": 30}))
    return bar_star.render_notebook()
| 104 | + |
| 105 | + |
# Actors
def show_actor(df):
    """Build a line chart of how often each listed actor name occurs in the comments.

    Args:
        df: DataFrame with a 'comments' column of strings.

    Returns:
        The object produced by ``Line.render_notebook()``. It is returned
        (instead of being discarded) so that a notebook can display it when
        the call is the last expression of a cell.
    """
    roles = {'张译': 0, '吴京': 0, '李九霄': 0, '魏晨': 0, '邓超': 0}

    for row in df['comments']:
        for name in roles:
            # str.count tallies every non-overlapping occurrence in the comment.
            roles[name] += row.count(name)

    line = (
        Line()
        .add_xaxis(list(roles.keys()))
        .add_yaxis('', list(roles.values()))
        .set_global_opts(title_opts=opts.TitleOpts(title=""))
    )

    return line.render_notebook()
| 123 | + |
| 124 | + |
# Word cloud
def show_word_cloud(df, font_path='/System/Library/Fonts/PingFang.ttc'):
    """Segment all comments with jieba and show them as a word cloud.

    Args:
        df: DataFrame with a 'comments' column of strings.
        font_path: Font file handed to ``WordCloud``. The default is the
            path the script originally hard-coded; pass another font file
            on systems where that path does not exist.
    """
    content = "".join(df['comments'])

    # Word segmentation; the tokens are joined with commas for WordCloud.
    cloud_word = ','.join(jieba.cut(content))
    wc = WordCloud(font_path=font_path, background_color="white", scale=2.5,
                   contour_color="lightblue", ).generate(cloud_word)

    plt.figure(figsize=(16, 9))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
| 142 | + |
| 143 | + |
if __name__ == '__main__':
    # Scrape the comments, normalise every column to a list and load them
    # into a de-duplicated DataFrame.
    result = get_comments()
    column_order = ('users', 'times', 'stars', 'comments')
    comments_dic = {name: format_data(result, name) for name in column_order}
    df = pd.DataFrame(comments_dic).drop_duplicates()

    # Uncomment the chart(s) to produce:
    # show_num(df)
    # show_star(df)
    # show_actor(df)
    # show_word_cloud(df)
0 commit comments