Skip to content

Commit aeffaa6

Browse files
author
mochazi
committed
Initial commit🎉
0 parents  commit aeffaa6

File tree

12 files changed

+852
-0
lines changed

12 files changed

+852
-0
lines changed

QQ音乐/crawl.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#Python3.7
2+
#encoding = utf-8
3+
4+
import requests,os,json,math
5+
from urllib import parse
6+
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7+
from db import SQLsession,Song
8+
9+
# Request headers shared by all QQ Music API calls in this module; the
# referer mimics a visit from the singer-list page.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'referer':'https://y.qq.com/portal/singer_list.html',
    # reference link: https://y.qq.com/portal/singer_list.html#page=1&index=1&
}


# Module-wide scoped SQLAlchemy session from db.py; shared by the worker
# threads that insert Song rows.
session = SQLsession()
18+
def myProcess():
    """Drive the crawl with one process per singer-category index.

    QQ Music groups singers into 27 initial-letter categories; only
    category index 2 is submitted here (the full range would be 1..27)
    — presumably narrowed for testing.
    """
    with ProcessPoolExecutor(max_workers=2) as pool:
        for category in range(2, 3):
            pool.submit(get_singer_mid, category)
24+
def get_singer_mid(index):
    """Crawl every singer (name + mid) in one initial-letter category.

    index: category index (1-27) from QQ Music's singer-list page.
    Reads the category's total singer count, then pages through the
    list 80 singers at a time, submitting one thread per singer that
    scrapes that singer's songs via get_singer_data().
    """
    # Probe request: only used to read `total` so we know how many
    # 80-singer pages exist.
    data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
           ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
           '"index":%s,"sin":0,"cur_page":1}}}'%(str(index))

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
          '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
          'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
          '&data={}'.format(parse.quote(data))

    # FIX: the probe request previously omitted `headers`; send the same
    # user-agent/referer as every other request so the API treats it alike.
    html = requests.get(url, headers = headers).json()
    total = html['singerList']['data']['total']  # number of singers in this category

    # FIX: ceil instead of floor so the final partial page (total % 80
    # singers) is not dropped, and clamp to >= 1 so ThreadPoolExecutor
    # never receives max_workers=0 (which raises ValueError).
    pages = max(1, int(math.ceil(int(total) / 80)))

    Thread = ThreadPoolExecutor(max_workers = pages)

    sin = 0  # offset into the singer list; advances 80 per page
    # FIX: range(1, pages) skipped the last page; range(1, pages + 1)
    # covers every page of this category.
    for page in range(1, pages + 1):
        data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
               '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
               'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))

        url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
              '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
              'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
              '&data={}'.format(parse.quote(data))

        html = requests.get(url, headers = headers).json()

        # One thread per singer on this page.
        for sing in html['singerList']['data']['singerlist']:
            Thread.submit(get_singer_data,
                          mid = sing['singer_mid'],
                          singer_name = sing['singer_name'])
        sin += 80
72+
# Crawl one singer's complete song list.
def _song_list_data(http, mid, begin, num):
    """Issue one GetSingerSongList request and return the response's data
    dict (contains 'totalNum' and 'songList')."""
    params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
             '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},'\
             '"module":"musichall.song_list_server"}}' % (str(mid), int(begin), int(num))

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
          'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
          '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))

    return http.get(url, headers = headers).json()['singerSongList']['data']


def _store_and_download(song, singer_name):
    """Persist one song row (rolling back on DB errors) and download its audio."""
    sing_name = song['songInfo']['title']
    songmid = song['songInfo']['mid']
    try:
        session.add(Song(song_name = sing_name,
                         song_singer = singer_name,
                         song_mid = songmid))
        session.commit()
        print('commit')
    except Exception:  # FIX: was a bare except; DB failures roll back the session
        session.rollback()
        print('rollback')  # FIX: typo 'rollbeak'
    print(sing_name, songmid, singer_name)
    download(songmid, sing_name)


def get_singer_data(mid,singer_name):
    """Fetch all songs of the singer identified by `mid`, store each in the
    DB and download the audio.

    The original duplicated the fetch/store/download code across an
    if/else on the song count; both paths collapse into the single
    paging loop below (begin advances 80 per request).
    """
    http = requests.session()

    # Probe request (num=10): only used to read the total song count.
    songs_num = int(_song_list_data(http, mid, 0, 10)['totalNum'])

    # Page through the full list, at most 80 songs per request.
    # FIX: the original >80 branch requested num=songs_num on every page,
    # re-fetching overlapping ranges; num is now capped at the page size.
    for begin in range(0, songs_num, 80):
        for song in _song_list_data(http, mid, begin, min(80, songs_num - begin))['songList']:
            _store_and_download(song, singer_name)
150+
def download(songmid,sing_name):
    """Resolve the streaming URL (vkey) for `songmid` and save the audio as
    ./歌曲/<sing_name>.m4a.

    Prints a message and returns on any failure — an empty/missing purl
    means the (anonymous) user has no access rights or the song was not found.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
        'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
    }

    # vkey request: asks the dispatcher for a playable URL for this songmid.
    data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
           '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
           '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
           '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
          '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
          '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))

    html = requests.get(url, headers = headers)

    try:
        purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']

        url = 'http://ws.stream.qqmusic.qq.com/{}'.format(purl)

        html = requests.get(url, headers = headers)
        html.encoding = 'utf-8'

        filename = '歌曲'  # output directory name ("songs")

        # FIX: exist_ok avoids the check-then-create race of the original
        # os.path.exists / os.makedirs pair.
        os.makedirs(filename, exist_ok = True)

        # NOTE(review): sing_name may contain '/' or other characters invalid
        # in file names — consider sanitising before writing.
        with open('./{}/{}.m4a'.format(filename,sing_name),'wb') as f:
            print('\n正在下载{}歌曲.....\n'.format(sing_name))
            f.write(html.content)

    except Exception:  # FIX: bare except also swallowed SystemExit/KeyboardInterrupt
        print('查询权限失败,或没有查到对应的歌曲')
191+
if __name__ == '__main__':
    # Script entry point: start the process pool that crawls QQ Music.
    myProcess()

QQ音乐/db.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from sqlalchemy import Column,Integer,String,create_engine
2+
from sqlalchemy.orm import sessionmaker,scoped_session
3+
from sqlalchemy.ext.declarative import declarative_base
4+
# SQLAlchemy engine for the local MySQL `test` database.
engine = create_engine('mysql+pymysql://root:root@localhost:3306/test?charset=utf8',
                       max_overflow = 500,#connections allowed beyond pool_size
                       pool_size = 100,#connection-pool size
                       echo = False,#SQL debug logging off
                       )
Base = declarative_base()

class Song(Base):
    """ORM row for one crawled QQ Music song."""
    __tablename__ = 'song'
    # Surrogate primary key.
    song_id = Column(Integer,primary_key = True,autoincrement = True)
    # Song title.
    song_name = Column(String(64))
    # Album name — never set by crawl.py; note 'ablum' is a typo kept for schema compatibility.
    song_ablum = Column(String(64))
    # QQ Music song mid (used to resolve the stream URL).
    song_mid = Column(String(50))
    # Singer name.
    song_singer = Column(String(50))
# Create the table on import if it does not exist yet.
Base.metadata.create_all(engine)

DBsession = sessionmaker(bind = engine)

# Thread-local (scoped) session factory shared by the crawler's threads.
SQLsession = scoped_session(DBsession)

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# **Python3Webcrawler**
2+
## **[哔哩哔哩作者:-相依-](https://space.bilibili.com/343154012)** &emsp;**UPDATE 2020 4 27**
3+
> **精心挑选了几个爬虫,给大家在学Scrapy框架之前打基础。**
4+
>> **该项目仅限学习交流,请勿用于商业用途,如有侵权,请联系删除。**
5+
6+
|**程序依赖**|**安装指令**|**项目使用版本**|
7+
|:----:|:--------:|:--------:|
8+
|**lxml**|**pip install lxml**|**4.5.0**|
9+
|**requests**|**pip install requests**|**2.23.0**|
10+
|**aiohttp**|**pip install aiohttp**|**3.6.2**|
11+
|**sqlalchemy**|**pip install sqlalchemy**|**1.3.16**|
12+
|**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.0**|
13+
14+
* ### **京东&emsp;&emsp;&emsp;[官网地址](https://item.jd.com)**
15+
* ### **房天下&emsp;&emsp;[官网地址](https://www.fang.com)**
16+
* ### **快代理&emsp;&emsp;[官网地址](https://www.kuaidaili.com)**
17+
* ### **QQ音乐 &emsp; [官网地址](https://y.qq.com)**
18+
* ### **百度图片&emsp;[官网地址](https://image.baidu.com)**
19+
* ### **豆瓣读书&emsp;[官网地址](https://book.douban.com)**
20+
* ### **有道翻译&emsp;[官网地址](http://fanyi.youdao.com)**

京东商品信息/crawl.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#Python3.7
2+
#encoding = utf-8
3+
4+
import requests,re,json
5+
from bs4 import BeautifulSoup
6+
from urllib import parse
7+
8+
# URL-encoded search keyword for the JD search pages.
KEYWORD = parse.quote('python')

# Base used to resolve relative / scheme-relative product links.
base = 'https://item.jd.com'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Connection':'keep-alive',
    # reference link: https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python
}
18+
def get_index(url):
    """Scrape one JD search-result page: for every product item, resolve its
    detail URL, read its comment count, then crawl the comments."""
    session = requests.Session()
    session.headers = headers
    html = session.get(url)
    html.encoding = 'GBK'
    soup = BeautifulSoup(html.text,'lxml')
    items = soup.select('li.gl-item')

    for item in items:
        # FIX: select relative to the current item; the original repeated the
        # 'li.gl-item' prefix, which cannot match inside the <li> itself.
        inner_url = item.select('.gl-i-wrap .p-img a')[0].get('href')
        print(inner_url)
        inner_url = parse.urljoin(base,inner_url)  # resolve to an absolute URL

        item_id = get_id(inner_url)

        # Comment count for this product.
        comm_num = get_comm_num(inner_url)
        # FIX: removed the leftover debug line that overwrote inner_url with a
        # hard-coded productId=11993134 comments URL for every item.

        # Fetch the comments themselves.
        if comm_num > 0:
            get_comm(inner_url,comm_num,item_id)
def get_comm(url,comm_num,item_id ):
    """Download up to 99 pages (10 comments each) of product comments and
    print them joined by '|'.

    url: kept for interface compatibility; the comment URL is rebuilt from
    item_id, so this parameter is unused.
    comm_num: total comment count, used to compute how many pages to fetch.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
    }
    parts = []  # accumulated comment texts; FIX: joined at the end instead of quadratic +=

    # JD serves at most 99 comment pages.
    pages = min(comm_num // 10, 99)

    for page in range(0, pages):
        comment_url = 'https://sclub.jd.com/comment/productPageComments.action?'\
                      'callback=fetchJSON_comment98vv4&productId={}&score=0'\
                      '&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'.format(item_id,page)

        json_decoder = requests.get(comment_url,headers=headers).text
        try:
            if json_decoder:
                # The response is JSONP; slice the JSON object out between
                # these two fixed markers before parsing.
                start = json_decoder.find('{"productAttr":null,')
                end = json_decoder.find(',"afterDays":0}]}')+len(',"afterDays":0}]}')

                content = json.loads(json_decoder[start:end])

                for c in content['comments']:
                    parts.append("{}|".format(c['content']))

                print(''.join(parts))
        except Exception as e:
            print(e)

    good_comments = ''.join(parts)
    print(item_id,good_comments)
85+
def get_shop_info(url):  # fetch product/shop info
    """Fetch the product page and look up the shop-name element.

    NOTE(review): appears unfinished — shop_data is never populated,
    shop_name is never used, and nothing is returned.
    """
    shop_data = {}
    html = requests.get(url,headers = headers)
    soup = BeautifulSoup(html.text,'lxml')
    try:
        # NOTE(review): select() returns [] rather than raising on no match,
        # so this try/except likely never triggers.
        shop_name = soup.select('div.mt h3 a')
    except Exception as e:
        raise e
94+
def get_index_lists(html):
    """Print the comment-count <strong> element of every product item on
    the given search-result response."""
    html.encoding = 'utf8'
    page = BeautifulSoup(html.text, 'lxml')
    for entry in page.find_all('li', attrs = {"class": "gl-item"}):
        count = entry.find('div', attrs = {"class": "p-commit"}).strong
        print(count)
102+
def get_comm_num(url):  # fetch the comment count
    """Return the comment count for the product at `url`, or 0 when the
    summary response cannot be parsed."""
    item_id = get_id(url)
    comm_url = 'https://club.jd.com/comment/productCommentSummaries.action?'\
               'referenceIds={}&callback=jQuery3096445'.format(item_id)
    comment = requests.get(comm_url,headers = headers).text
    # The endpoint returns JSONP; slice the JSON object out between markers.
    start = comment.find('{"CommentsCount":')  # start marker
    end = comment.find('"PoorRateStyle":0}]}')+len('"PoorRateStyle":0}]}')  # end marker
    try:
        content = json.loads(comment[start:end])['CommentsCount']
    except (ValueError, KeyError):  # FIX: narrow the original bare except
        return 0
    comm_num = content[0]['CommentCount']
    return comm_num
def get_id(url):  # extract the product id
    """Return the first run of digits in `url` (the JD product id) as a string.

    Raises IndexError if `url` contains no digits (unchanged behavior).
    """
    # FIX: raw string for the regex; renamed local to avoid shadowing builtin id().
    pattern = re.compile(r'\d+')
    matches = pattern.findall(url)
    return matches[0]
123+
124+
if __name__ == '__main__':

    # Crawl JD search-result pages for KEYWORD; JD uses odd page numbers
    # for full result pages, hence the step of 2.
    for i in range(1,30,2):
        url = 'https://search.jd.com/Search?'\
              'keyword={}&page={}'.format(KEYWORD,i)
        get_index(url)

0 commit comments

Comments
 (0)