Skip to content

Commit aeffaa6

Browse files
author
mochazi
committed
Initial commit🎉
0 parents  commit aeffaa6

File tree

12 files changed

+852
-0
lines changed

12 files changed

+852
-0
lines changed

QQ音乐/crawl.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#Python3.7
2+
#encoding = utf-8
3+
4+
import requests,os,json,math
5+
from urllib import parse
6+
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7+
from db import SQLsession,Song
8+
9+
# Request headers shared by all QQ Music API calls in this module; the
# referer mimics a visit from the singer-list page.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'referer':'https://y.qq.com/portal/singer_list.html',
    # reference link: https://y.qq.com/portal/singer_list.html#page=1&index=1&
}


# Module-wide scoped SQLAlchemy session from db.py; shared by the worker
# threads that insert Song rows.
session = SQLsession()
18+
def myProcess():
    """Drive the crawl with one process per singer-category index.

    QQ Music groups singers into 27 initial-letter categories; only
    category index 2 is submitted here (the full range would be 1..27)
    — presumably narrowed for testing.
    """
    with ProcessPoolExecutor(max_workers=2) as pool:
        for category in range(2, 3):
            pool.submit(get_singer_mid, category)
24+
def get_singer_mid(index):
    """Crawl every singer (name + mid) in one initial-letter category.

    index: category index (1-27) from QQ Music's singer-list page.
    Reads the category's total singer count, then pages through the
    list 80 singers at a time, submitting one thread per singer that
    scrapes that singer's songs via get_singer_data().
    """
    # Probe request: only used to read `total` so we know how many
    # 80-singer pages exist.
    data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
           ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
           '"index":%s,"sin":0,"cur_page":1}}}'%(str(index))

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
          '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
          'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
          '&data={}'.format(parse.quote(data))

    # FIX: the probe request previously omitted `headers`; send the same
    # user-agent/referer as every other request so the API treats it alike.
    html = requests.get(url, headers = headers).json()
    total = html['singerList']['data']['total']  # number of singers in this category

    # FIX: ceil instead of floor so the final partial page (total % 80
    # singers) is not dropped, and clamp to >= 1 so ThreadPoolExecutor
    # never receives max_workers=0 (which raises ValueError).
    pages = max(1, int(math.ceil(int(total) / 80)))

    Thread = ThreadPoolExecutor(max_workers = pages)

    sin = 0  # offset into the singer list; advances 80 per page
    # FIX: range(1, pages) skipped the last page; range(1, pages + 1)
    # covers every page of this category.
    for page in range(1, pages + 1):
        data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
               '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
               'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))

        url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
              '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
              'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
              '&data={}'.format(parse.quote(data))

        html = requests.get(url, headers = headers).json()

        # One thread per singer on this page.
        for sing in html['singerList']['data']['singerlist']:
            Thread.submit(get_singer_data,
                          mid = sing['singer_mid'],
                          singer_name = sing['singer_name'])
        sin += 80
72+
# Crawl one singer's complete song list.
def _song_list_data(http, mid, begin, num):
    """Issue one GetSingerSongList request and return the response's data
    dict (contains 'totalNum' and 'songList')."""
    params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
             '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},'\
             '"module":"musichall.song_list_server"}}' % (str(mid), int(begin), int(num))

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
          'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
          '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))

    return http.get(url, headers = headers).json()['singerSongList']['data']


def _store_and_download(song, singer_name):
    """Persist one song row (rolling back on DB errors) and download its audio."""
    sing_name = song['songInfo']['title']
    songmid = song['songInfo']['mid']
    try:
        session.add(Song(song_name = sing_name,
                         song_singer = singer_name,
                         song_mid = songmid))
        session.commit()
        print('commit')
    except Exception:  # FIX: was a bare except; DB failures roll back the session
        session.rollback()
        print('rollback')  # FIX: typo 'rollbeak'
    print(sing_name, songmid, singer_name)
    download(songmid, sing_name)


def get_singer_data(mid,singer_name):
    """Fetch all songs of the singer identified by `mid`, store each in the
    DB and download the audio.

    The original duplicated the fetch/store/download code across an
    if/else on the song count; both paths collapse into the single
    paging loop below (begin advances 80 per request).
    """
    http = requests.session()

    # Probe request (num=10): only used to read the total song count.
    songs_num = int(_song_list_data(http, mid, 0, 10)['totalNum'])

    # Page through the full list, at most 80 songs per request.
    # FIX: the original >80 branch requested num=songs_num on every page,
    # re-fetching overlapping ranges; num is now capped at the page size.
    for begin in range(0, songs_num, 80):
        for song in _song_list_data(http, mid, begin, min(80, songs_num - begin))['songList']:
            _store_and_download(song, singer_name)
150+
def download(songmid,sing_name):
    """Resolve the streaming URL (vkey) for `songmid` and save the audio as
    ./歌曲/<sing_name>.m4a.

    Prints a message and returns on any failure — an empty/missing purl
    means the (anonymous) user has no access rights or the song was not found.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
        'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
    }

    # vkey request: asks the dispatcher for a playable URL for this songmid.
    data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
           '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
           '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
           '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)

    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
          '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
          '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))

    html = requests.get(url, headers = headers)

    try:
        purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']

        url = 'http://ws.stream.qqmusic.qq.com/{}'.format(purl)

        html = requests.get(url, headers = headers)
        html.encoding = 'utf-8'

        filename = '歌曲'  # output directory name ("songs")

        # FIX: exist_ok avoids the check-then-create race of the original
        # os.path.exists / os.makedirs pair.
        os.makedirs(filename, exist_ok = True)

        # NOTE(review): sing_name may contain '/' or other characters invalid
        # in file names — consider sanitising before writing.
        with open('./{}/{}.m4a'.format(filename,sing_name),'wb') as f:
            print('\n正在下载{}歌曲.....\n'.format(sing_name))
            f.write(html.content)

    except Exception:  # FIX: bare except also swallowed SystemExit/KeyboardInterrupt
        print('查询权限失败,或没有查到对应的歌曲')
191+
if __name__ == '__main__':
    # Script entry point: start the process pool that crawls QQ Music.
    myProcess()

QQ音乐/db.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from sqlalchemy import Column,Integer,String,create_engine
2+
from sqlalchemy.orm import sessionmaker,scoped_session
3+
from sqlalchemy.ext.declarative import declarative_base
4+
# SQLAlchemy engine for the local MySQL `test` database.
engine = create_engine('mysql+pymysql://root:root@localhost:3306/test?charset=utf8',
                       max_overflow = 500,#connections allowed beyond pool_size
                       pool_size = 100,#connection-pool size
                       echo = False,#SQL debug logging off
                       )
Base = declarative_base()

class Song(Base):
    """ORM row for one crawled QQ Music song."""
    __tablename__ = 'song'
    # Surrogate primary key.
    song_id = Column(Integer,primary_key = True,autoincrement = True)
    # Song title.
    song_name = Column(String(64))
    # Album name — never set by crawl.py; note 'ablum' is a typo kept for schema compatibility.
    song_ablum = Column(String(64))
    # QQ Music song mid (used to resolve the stream URL).
    song_mid = Column(String(50))
    # Singer name.
    song_singer = Column(String(50))
# Create the table on import if it does not exist yet.
Base.metadata.create_all(engine)

DBsession = sessionmaker(bind = engine)

# Thread-local (scoped) session factory shared by the crawler's threads.
SQLsession = scoped_session(DBsession)

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# **Python3Webcrawler**
2+
## **[哔哩哔哩作者:-相依-](https://space.bilibili.com/343154012)** &emsp;**UPDATE 2020 4 27**
3+
> **精心挑选了几个爬虫,给大家在学Scrapy框架之前打基础。**
4+
>> **该项目仅限学习交流,请勿用于商业用途,如有侵权,请联系删除。**
5+
6+
|**程序依赖**|**安装指令**|**项目使用版本**|
7+
|:----:|:--------:|:--------:|
8+
|**lxml**|**pip install lxml**|**4.5.0**|
9+
|**requests**|**pip install requests**|**2.23.0**|
10+
|**aiohttp**|**pip install aiohttp**|**3.6.2**|
11+
|**sqlalchemy**|**pip install sqlalchemy**|**1.3.16**|
12+
|**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.0**|
13+
14+
* ### **京东&emsp;&emsp;&emsp;[官网地址](https://item.jd.com)**
15+
* ### **房天下&emsp;&emsp;[官网地址](https://www.fang.com)**
16+
* ### **快代理&emsp;&emsp;[官网地址](https://www.kuaidaili.com)**
17+
* ### **QQ音乐 &emsp; [官网地址](https://y.qq.com)**
18+
* ### **百度图片&emsp;[官网地址](https://image.baidu.com)**
19+
* ### **豆瓣读书&emsp;[官网地址](https://book.douban.com)**
20+
* ### **有道翻译&emsp;[官网地址](http://fanyi.youdao.com)**

京东商品信息/crawl.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#Python3.7
2+
#encoding = utf-8
3+
4+
import requests,re,json
5+
from bs4 import BeautifulSoup
6+
from urllib import parse
7+
8+
# URL-encoded search keyword for the JD search pages.
KEYWORD = parse.quote('python')

# Base used to resolve relative / scheme-relative product links.
base = 'https://item.jd.com'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Connection':'keep-alive',
    # reference link: https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python
}
18+
def get_index(url):
    """Scrape one JD search-result page: for every product item, resolve its
    detail URL, read its comment count, then crawl the comments."""
    session = requests.Session()
    session.headers = headers
    html = session.get(url)
    html.encoding = 'GBK'
    soup = BeautifulSoup(html.text,'lxml')
    items = soup.select('li.gl-item')

    for item in items:
        # FIX: select relative to the current item; the original repeated the
        # 'li.gl-item' prefix, which cannot match inside the <li> itself.
        inner_url = item.select('.gl-i-wrap .p-img a')[0].get('href')
        print(inner_url)
        inner_url = parse.urljoin(base,inner_url)  # resolve to an absolute URL

        item_id = get_id(inner_url)

        # Comment count for this product.
        comm_num = get_comm_num(inner_url)
        # FIX: removed the leftover debug line that overwrote inner_url with a
        # hard-coded productId=11993134 comments URL for every item.

        # Fetch the comments themselves.
        if comm_num > 0:
            get_comm(inner_url,comm_num,item_id)
def get_comm(url,comm_num,item_id ):
    """Download up to 99 pages (10 comments each) of product comments and
    print them joined by '|'.

    url: kept for interface compatibility; the comment URL is rebuilt from
    item_id, so this parameter is unused.
    comm_num: total comment count, used to compute how many pages to fetch.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
    }
    parts = []  # accumulated comment texts; FIX: joined at the end instead of quadratic +=

    # JD serves at most 99 comment pages.
    pages = min(comm_num // 10, 99)

    for page in range(0, pages):
        comment_url = 'https://sclub.jd.com/comment/productPageComments.action?'\
                      'callback=fetchJSON_comment98vv4&productId={}&score=0'\
                      '&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'.format(item_id,page)

        json_decoder = requests.get(comment_url,headers=headers).text
        try:
            if json_decoder:
                # The response is JSONP; slice the JSON object out between
                # these two fixed markers before parsing.
                start = json_decoder.find('{"productAttr":null,')
                end = json_decoder.find(',"afterDays":0}]}')+len(',"afterDays":0}]}')

                content = json.loads(json_decoder[start:end])

                for c in content['comments']:
                    parts.append("{}|".format(c['content']))

                print(''.join(parts))
        except Exception as e:
            print(e)

    good_comments = ''.join(parts)
    print(item_id,good_comments)
85+
def get_shop_info(url):  # fetch product/shop info
    """Fetch the product page and look up the shop-name element.

    NOTE(review): appears unfinished — shop_data is never populated,
    shop_name is never used, and nothing is returned.
    """
    shop_data = {}
    html = requests.get(url,headers = headers)
    soup = BeautifulSoup(html.text,'lxml')
    try:
        # NOTE(review): select() returns [] rather than raising on no match,
        # so this try/except likely never triggers.
        shop_name = soup.select('div.mt h3 a')
    except Exception as e:
        raise e
94+
def get_index_lists(html):
    """Print the comment-count <strong> element of every product item on
    the given search-result response."""
    html.encoding = 'utf8'
    page = BeautifulSoup(html.text, 'lxml')
    for entry in page.find_all('li', attrs = {"class": "gl-item"}):
        count = entry.find('div', attrs = {"class": "p-commit"}).strong
        print(count)
102+
def get_comm_num(url):  # fetch the comment count
    """Return the comment count for the product at `url`, or 0 when the
    summary response cannot be parsed."""
    item_id = get_id(url)
    comm_url = 'https://club.jd.com/comment/productCommentSummaries.action?'\
               'referenceIds={}&callback=jQuery3096445'.format(item_id)
    comment = requests.get(comm_url,headers = headers).text
    # The endpoint returns JSONP; slice the JSON object out between markers.
    start = comment.find('{"CommentsCount":')  # start marker
    end = comment.find('"PoorRateStyle":0}]}')+len('"PoorRateStyle":0}]}')  # end marker
    try:
        content = json.loads(comment[start:end])['CommentsCount']
    except (ValueError, KeyError):  # FIX: narrow the original bare except
        return 0
    comm_num = content[0]['CommentCount']
    return comm_num
def get_id(url):  # extract the product id
    """Return the first run of digits in `url` (the JD product id) as a string.

    Raises IndexError if `url` contains no digits (unchanged behavior).
    """
    # FIX: raw string for the regex; renamed local to avoid shadowing builtin id().
    pattern = re.compile(r'\d+')
    matches = pattern.findall(url)
    return matches[0]
123+
124+
if __name__ == '__main__':

    # Crawl JD search-result pages for KEYWORD; JD uses odd page numbers
    # for full result pages, hence the step of 2.
    for i in range(1,30,2):
        url = 'https://search.jd.com/Search?'\
              'keyword={}&page={}'.format(KEYWORD,i)
        get_index(url)

0 commit comments

Comments
 (0)