Skip to content

Commit 9ed0d1a

Browse files
author
mochazi
committed
2020.7.28 commit🎉
1 parent b442130 commit 9ed0d1a

File tree

12 files changed

+707
-63
lines changed

12 files changed

+707
-63
lines changed

README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
# **Python3Webcrawler**
2-
## **[哔哩哔哩作者:-相依-](https://space.bilibili.com/343154012)**  **UPDATE 2020 5 30**
2+
## **[哔哩哔哩作者:-相依-](https://space.bilibili.com/343154012)**  **UPDATE 2020 7 28**
33
> **精心挑选了几个爬虫,给大家在学Scrapy框架之前打基础。**
44
>> **该项目仅限学习交流,请勿用于商业用途,如有侵权,请联系删除。**
55
66
|**程序依赖**|**安装指令**|**项目使用版本**|
77
|:----:|:--------:|:--------:|
8-
|**lxml**|**pip install lxml**|**4.5.0**|
8+
|**lxml**|**pip install lxml**|**4.5.2**|
99
|**aiohttp**|**pip install aiohttp**|**3.6.2**|
10-
|**requests**|**pip install requests**|**2.23.0**|
10+
|**requests**|**pip install requests**|**2.24.0**|
1111
|**PyExecJS**|**pip install PyExecJS**|**1.5.1**|
12-
|**sqlalchemy**|**pip install sqlalchemy**|**1.3.16**|
13-
|**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.0**|
12+
|**sqlalchemy**|**pip install sqlalchemy**|**1.3.18**|
13+
|**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.1**|
14+
|**mysqlconnector**|**pip install mysql-connector-python**|**8.0.21**|
1415

1516
* ### **京东   [官网地址](https://item.jd.com)**
1617
* ### **网易   [官网地址](https://www.163.com/)**

requirements.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
aiohttp==3.6.2
2+
async-timeout==3.0.1
3+
attrs==19.3.0
4+
beautifulsoup4==4.9.1
5+
certifi==2020.6.20
6+
chardet==3.0.4
7+
idna==2.10
8+
lxml==4.5.2
9+
multidict==4.7.6
10+
mysql-connector-python==8.0.21
11+
protobuf==3.12.2
12+
PyExecJS==1.5.1
13+
requests==2.24.0
14+
six==1.15.0
15+
soupsieve==2.0.1
16+
SQLAlchemy==1.3.18
17+
typing-extensions==3.7.4.2
18+
urllib3==1.25.10
19+
yarl==1.5.0

房天下/db.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
from sqlalchemy.ext.declarative import declarative_base
55

66
BASE = declarative_base()#创建基类
7+
8+
#此处没有使用pymysql的驱动
9+
#请安装pip install mysql-connector-python
10+
#engine中的 mysqlconnector 为 mysql官网驱动
711
engine = create_engine(
8-
"mysql+pymysql://root:root@127.0.0.1:3306/test?charset=utf8",#确定编码格式
12+
"mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8",#确定编码格式
913
max_overflow = 500,#超过连接池大小外最多可以创建的链接
1014
pool_size = 100,#连接池大小
1115
echo = False,#调试信息展示

新版QQ音乐/crawl.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#Python3.7
2+
# -*- coding: utf-8 -*-
3+
4+
import execjs,requests,math,os,threading
5+
from urllib import parse
6+
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7+
from db import SQLsession,Song
8+
9+
# Guards the shared database session; worker threads take it around add/commit.
lock = threading.Lock()

# HTTP headers sent with every request made by this module.
# Reference page: https://y.qq.com/portal/singer_list.html#page=1&index=1&
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Referer': 'https://y.qq.com/portal/singer_list.html',
}

# One module-level database session reused by all crawler functions.
session = SQLsession()
18+
19+
def get_sign(data):
    """Return the request signature for *data*.

    The signing routine lives in ``./新版QQ音乐/get_sign.js`` (path is relative
    to the current working directory). The script is read and compiled with
    ``execjs`` on every call, then its ``get_sign`` function is invoked with
    ``data`` and the result is returned unchanged.
    """
    with open('./新版QQ音乐/get_sign.js', 'r', encoding='utf-8') as script_file:
        script_source = script_file.read()

    compiled_script = execjs.compile(script_source)
    return compiled_script.call('get_sign', data)
27+
28+
29+
def myProcess():
    """Crawl all 27 singer index buckets using a small pool of processes.

    Singers are grouped by initial into indexes 1..27. Each index is submitted
    to ``get_singer_mid``; at most 2 worker processes run at a time, and
    leaving the ``with`` block waits for every submitted task to finish.
    """
    singer_indexes = range(1, 28)
    with ProcessPoolExecutor(max_workers=2) as pool:
        for singer_index in singer_indexes:
            pool.submit(get_singer_mid, singer_index)
34+
35+
36+
def get_singer_mid(index):
    """Crawl every singer listed under one index bucket.

    The first singer-list response supplies ``total`` (the number of singers
    in the bucket). The list is then walked 80 singers per page, and each
    singer's ``singer_mid`` / ``singer_name`` is handed to ``get_singer_data``
    on a thread pool.

    Args:
        index: Bucket number (1..27) sent as the ``index`` request parameter.

    The function returns only after every submitted singer task has finished,
    because the thread pool is used as a context manager.
    """
    page_size = 80

    first_page = _fetch_singer_list(index, 0, 1)
    total = first_page['singerList']['data']['total']  # number of singers

    pages = int(total) // page_size  # full pages; one more may hold the remainder

    # ThreadPoolExecutor rejects max_workers <= 0, which happens for buckets
    # holding fewer than one full page of singers.
    worker_count = max(1, pages)

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        # pages + 1 requests so the trailing partial page is included; when
        # total is an exact multiple of the page size the last one is empty.
        for page in range(1, pages + 2):
            sin = (page - 1) * page_size  # offset of the first singer on this page
            listing = _fetch_singer_list(index, sin, page)

            for singer in listing['singerList']['data']['singerlist']:
                pool.submit(get_singer_data,
                            mid=singer['singer_mid'],
                            singer_name=singer['singer_name'])


def _fetch_singer_list(index, sin, cur_page):
    # Build, sign and send one get_singer_list request; return the decoded JSON.
    data = ('{"comm":{"ct":24,"cv":0},"singerList":'
            '{"module":"Music.SingerListServer","method":"get_singer_list","param":'
            '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'
            % (str(index), str(sin), str(cur_page)))
    sign = get_sign(data)

    url = ('https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'
           '&sign={}'
           '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'
           '&notice=0&platform=yqq.json&needNewCode=0'
           '&data={}').format(sign, parse.quote(data))

    return requests.get(url, headers=headers).json()
87+
88+
#获取歌手信息
89+
def get_singer_data(mid, singer_name):
    """Store and download every song of one singer.

    A first request reads ``totalNum`` (the singer's song count). The song
    list is then fetched in pages of 100; each song is inserted into the
    database as a ``Song`` row and passed to ``download``.

    Args:
        mid: The singer's ``singer_mid`` as returned by the singer list.
        singer_name: Display name stored with each song and used in file names.
    """
    page_size = 100

    summary = _fetch_singer_songs(mid, 0, 10)
    songs_num = summary['singerSongList']['data']['totalNum']  # total song count

    for begin in range(0, songs_num, page_size):
        # Request exactly one page per offset. Asking for songs_num entries at
        # every offset would make consecutive pages overlap whenever the
        # server honours "num".
        page = _fetch_singer_songs(mid, begin, page_size)

        for entry in page['singerSongList']['data']['songList']:
            sing_name = entry['songInfo']['title']
            song_mid = entry['songInfo']['mid']

            # The session is shared between threads. Hold the lock for the
            # whole add/commit/rollback sequence and always release it, so a
            # failed commit cannot leave the lock held and stall other workers.
            with lock:
                try:
                    session.add(Song(song_name=sing_name,
                                     song_singer=singer_name,
                                     song_mid=song_mid))
                    session.commit()
                    saved = True
                except Exception:
                    session.rollback()
                    saved = False
            print('commit' if saved else 'rollbeak')

            print('歌手名字:{}\t歌曲名字:{}\t歌曲ID:{}'.format(singer_name, sing_name, song_mid))
            download(song_mid, sing_name, singer_name)


def _fetch_singer_songs(mid, begin, num):
    # Build, sign and send one GetSingerSongList request; return the decoded JSON.
    data = ('{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'
            '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'
            ',"module":"musichall.song_list_server"}}'
            % (str(mid), str(begin), str(num)))
    sign = get_sign(data)

    url = ('https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'
           '&g_tk=5381&sign={}&loginUin=0'
           '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'
           '&data={}').format(sign, parse.quote(data))

    return requests.get(url, headers=headers).json()
144+
145+
146+
def download(song_mid, sing_name, singer_name, qq_number='1641202711'):
    """Download one song and save it as an ``.m4a`` file.

    The function requests a play key for ``song_mid``, reads the ``purl`` from
    the response and fetches the audio from the CDN host. The file is written
    to ``./新版QQ音乐/歌曲/<song> -- <singer>.m4a``.

    Args:
        song_mid: Song identifier sent as ``songmid``.
        sing_name: Song title, used in the file name.
        singer_name: Singer name, used in the file name.
        qq_number: QQ account number sent as ``uin`` / ``loginUin``. Replace
            the default with your own account.

    Failures (missing or empty ``purl``, HTTP errors, file-system errors) are
    reported with a printed message and never raised, so one bad song does not
    stop the crawl.
    """
    failure_message = '查询权限失败,或没有查到对应的歌曲'

    data = ('{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'
            ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'
            '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'
            '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'
            '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'
            % (str(song_mid), str(qq_number), str(qq_number)))

    sign = get_sign(data)
    url = ('https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'
           '&g_tk=1291538537&sign={}&loginUin={}'
           '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0'
           '&platform=yqq.json&needNewCode=0&data={}').format(sign, qq_number, parse.quote(data))

    html = requests.get(url, headers=headers).json()

    try:
        purl = html['req_0']['data']['midurlinfo'][0]['purl']
    except (KeyError, IndexError, TypeError):
        purl = ''

    # With an empty purl the URL below would be just the CDN base path, and
    # whatever that returns would be saved as if it were audio.
    if not purl:
        print(failure_message)
        return

    song_url = 'http://119.147.228.27/amobile.music.tc.qq.com/{}'.format(purl)

    sing_file_name = '{} -- {}'.format(sing_name, singer_name)

    # Path separators inside a title would be read as directories by open().
    safe_file_name = sing_file_name
    for separator in {'/', os.sep}:
        safe_file_name = safe_file_name.replace(separator, '_')

    song_dir = './新版QQ音乐/歌曲'

    try:
        response = requests.get(song_url, headers=headers)
        response.raise_for_status()  # do not save HTTP error pages as audio

        # exist_ok avoids a race between worker threads creating the folder.
        os.makedirs(song_dir, exist_ok=True)

        with open('./新版QQ音乐/歌曲/{}.m4a'.format(safe_file_name), 'wb') as f:
            print('\n正在下载{}歌曲.....\n'.format(sing_file_name))
            f.write(response.content)
    except (requests.RequestException, OSError):
        print(failure_message)
186+
187+
188+
189+
190+
191+
# Start the crawl only when this file is run as a script, not when imported
# (worker processes import this module).
if __name__ == '__main__':
    myProcess()
193+
194+
195+

QQ音乐/db.py renamed to 新版QQ音乐/db.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
from sqlalchemy import Column,Integer,String,create_engine
22
from sqlalchemy.orm import sessionmaker,scoped_session
33
from sqlalchemy.ext.declarative import declarative_base
4-
engine = create_engine('mysql+pymysql://root:root@localhost:3306/test?charset=utf8',
4+
5+
#此处没有使用pymysql的驱动
6+
#请安装pip install mysql-connector-python
7+
#engine中的 mysqlconnector 为 mysql官网驱动
8+
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8',
59
max_overflow = 500,#超过连接池大小外最多可以创建的链接
610
pool_size = 100,#连接池大小
711
echo = False,#调试信息展示

0 commit comments

Comments
 (0)