11#!/usr/bin/env python
22# encoding=utf-8
3- import requests ,re
3+ import requests
4+ import re
45import codecs
56from bs4 import BeautifulSoup
67from openpyxl import Workbook
# Listing URL the crawl starts from; next-page hrefs are appended to it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Path of the spreadsheet passed to wb.save().
dest_filename = '电影.xlsx'

# Workbook and its active sheet, which receives one row per movie.
wb = Workbook()
ws1 = wb.active
ws1.title = "电影top250"
@@ -24,56 +25,58 @@ def download_page(url):
def get_li(doc):
    """Parse one listing page and collect the per-movie fields.

    Args:
        doc: HTML markup of a listing page, passed straight to
            ``BeautifulSoup`` with the ``html.parser`` backend.

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``.
        The first four are parallel lists with one entry per ``<li>`` of
        the ``ol.grid_view`` list: the title text, the text node inside
        the ``star`` div that matches '评价' (``None`` when absent), the
        ``rating_num`` text, and the ``inq`` quote text (or '无' when the
        movie has no quote). ``next_url`` is ``DOWNLOAD_URL`` plus the
        href of the "next" link, or ``None`` when there is no such link.
        If the page has no ``ol.grid_view`` at all, the lists are empty.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    name = []       # movie titles
    star_con = []   # rating-count text
    score = []      # rating values
    info_list = []  # one-line quotes

    # A page without the movie list (e.g. an error page) yields no rows
    # instead of crashing on ``None.find_all``.
    ol = soup.find('ol', class_='grid_view')
    items = ol.find_all('li') if ol is not None else []

    # Compiled once; the same pattern is used for every list item.
    rating_count_pattern = re.compile('评价')

    for item in items:
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        level_star = item.find(
            'span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(text=rating_count_pattern)

        # Not every movie carries a quote; keep the lists aligned with a
        # placeholder when it is missing.
        info = item.find('span', attrs={'class': 'inq'})
        if info:
            info_list.append(info.get_text())
        else:
            info_list.append('无')

        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # The pagination span may be missing entirely; treat that the same as
    # "no next link" rather than raising AttributeError.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    if page is not None:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None
5254
5355
def main():
    """Crawl all listing pages and write the collected rows to the workbook.

    Starting at ``DOWNLOAD_URL``, each page is fetched with
    ``download_page`` and parsed with ``get_li`` until ``get_li`` reports
    no next URL. The accumulated titles, rating-count texts, ratings and
    quotes are then written to columns A-D of ``ws1``, one movie per row
    starting at row 1, and the workbook is saved to ``dest_filename``.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # Number rows by position. Looking the row up with name.index() maps
    # every duplicate title onto the first occurrence's row, overwriting
    # it and leaving gaps, and costs a linear scan per row.
    rows = zip(name, star_con, score, info)
    for row, (title, count, rating, quote) in enumerate(rows, start=1):
        ws1['A%s' % row] = title
        ws1['B%s' % row] = count
        ws1['C%s' % row] = rating
        ws1['D%s' % row] = quote
    wb.save(filename=dest_filename)
7779
80+
# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments