200字范文,内容丰富有趣,生活中的好帮手!
200字范文 > python3爬虫(4):获取网易云音乐歌手所有歌曲及歌曲的精选评论

python3爬虫(4):获取网易云音乐歌手所有歌曲及歌曲的精选评论

时间:2022-12-05 22:20:05

相关推荐

python3爬虫(4):获取网易云音乐歌手所有歌曲及歌曲的精选评论

1. 需要的python包

pip install pycryptodome
pip install requests
pip install lxml

2. 实践1:爬取所有歌手id

以网易云音乐为例:

"""Crawl all singer ids/names from the NetEase Cloud Music artist category pages.

source: /wanhaiwei/wangyiyun/blob/master/get_all_singer.py
"""
import requests
import re
import csv
import json

# NOTE(review): the blog export stripped the scheme+domain from every URL in
# this article; NetEase Cloud Music lives at https://music.163.com — confirm.
BASE_URL = 'https://music.163.com'


class SingerSpider(object):
    """Fetch one artist-category page and persist (id, name, category) rows."""

    def __init__(self):
        # Last URL fetched; used by the parse/save steps for progress logging.
        self.url = ''
        # Browser-like headers so the site serves the normal HTML page.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                      'image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'music.163.com',
            'Referer': BASE_URL + '/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/66.0.3359.181 Safari/537.36',
        }

    def get_index(self, url):
        """Request module: download *url* and hand the HTML to the parser."""
        self.url = url
        try:
            resp = requests.get(url, headers=self.headers)
            if resp.status_code == 200:
                self.parse_re(resp.text)
            else:
                print('error')
        # The original caught the builtin ConnectionError, which requests'
        # connection failures do NOT inherit from — catch the right one.
        except requests.exceptions.ConnectionError:
            # Naive unconditional retry on transient network failures.
            self.get_index(url)

    def parse_re(self, resp):
        """Parsing module: pull (singer_id, singer_name) pairs out of the HTML."""
        print('start parse {}'.format(self.url))
        tags = re.findall(
            r'<a href=".*?/artist\?id=(\d+)" class="nm nm-icn f-thide s-fc0" '
            r'title=".*?的音乐">(.*?)</a>', resp, re.S)
        # <title> holds the category name, e.g. "华语男歌手 - ...".
        title = re.findall(r'<title>(.*?)-.*?</title>', resp, re.S)
        for tag in tags:
            # self.save_json(tag, title)
            self.save_csv(tag, title)

    def save_csv(self, tag, title):
        """Storage module: append one (id, name, category) row to the csv."""
        print('start save {}'.format(self.url))
        with open('all_singer.csv', 'a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow((tag[0], tag[1], title[0]))
        print('finish spider {}'.format(self.url))

    def save_json(self, tag, title):
        """Alternative storage module: append one record as a JSON line."""
        print('start save {}'.format(self.url))
        s = json.dumps({'id': tag[0], 'name': tag[1], 'title': title[0]},
                       ensure_ascii=False)
        with open('all_singer.json', 'a+', encoding='utf-8') as f:
            # Newline separator so the file is valid JSON-lines (the original
            # concatenated records with no separator).
            f.write(s + '\n')
        print('finish spider {}'.format(self.url))
        print(s)


if __name__ == '__main__':
    # Artist category ids. The original listing had a stray comma where 2003
    # (华语组合/乐队) belongs — a SyntaxError; restored here.
    list1 = [1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003,
             7001, 7002, 7003, 4001, 4002, 4003]
    # 'initial' values: 0 ("其他") plus the ASCII codes for A-Z.
    list2 = [0] + list(range(65, 91))
    for i in list1:
        for j in list2:
            url = BASE_URL + '/discover/artist/cat?id=' + str(i) + '&initial=' + str(j)
            print('start spider {}'.format(url))
            SingerSpider().get_index(url)

效果:

(如果Excel打开乱码,可以先打开Excel,再导入外部源文件,使用 utf-8 编码即可)

第一列: 网易云歌手的id;第二列网易云歌手的名字;第三列歌手所属的类别。

3. 实践2: 爬取网易云上某个歌手的所有歌

# python3
# -*- coding: utf-8 -*-
# @Author : lina
"""Crawl every song of a NetEase Cloud Music singer.

Step 1: fetch the singer's album list page and collect album ids.
Step 2: for each album id, collect the song names/ids it contains.
Step 3: (optional) fetch each song's lyric through the lyric API.
"""
import requests
import lxml.etree as etree
import os
import json
import re

# NOTE(review): the blog export stripped the domain from every URL;
# NetEase Cloud Music lives at https://music.163.com — confirm.
BASE_URL = "https://music.163.com"


def get_album_links(url, album_headers, f_path):
    """Scrape album titles and ids from the singer's album page.

    Writes "title<TAB>id" lines to AlbumInfo.txt under *f_path* and returns
    the list of album ids found.
    """
    album_ids = []
    response_albums = requests.get(url, headers=album_headers)
    # Album titles sit in the cover div's title attribute.
    pattern = re.compile(r'<div class="u-cover u-cover-alb3" title=(.*?)>')
    titles = re.findall(pattern, response_albums.text)
    # Start from clean output files on every run.
    if os.path.exists(f_path + "AlbumInfo.txt"):
        os.remove(f_path + "AlbumInfo.txt")
    if os.path.exists(f_path + "Lyrics.txt"):
        os.remove(f_path + "Lyrics.txt")
    with open(f_path + "AlbumInfo.txt", 'a', encoding='utf8') as f:
        for title in titles:
            # Drop double quotes so the title can be embedded in the id regex.
            title_handle = title.replace('"', '')
            # re.escape fixes the original "sre_constants.error: nothing to
            # repeat" crash when a title contains metacharacters like '+', '?'.
            id_elem = re.compile(
                r'<a href="/album\?id=(.*?)" class="tit s-fc0">%s</a>'
                % re.escape(title_handle))
            album_id = re.findall(id_elem, response_albums.text)  # album id
            if len(album_id) == 1:
                f.write(title + "\t" + str(album_id[0]) + "\n")
                album_ids.append(album_id[0])
            elif len(album_id) == 0:
                print("无对应的id")
            else:
                print("出错错误,一个专辑title对应多个id::", title)
    print("专辑爬取成功")
    return album_ids


def get_lyrics_list(album_ids, lyrics_list_url_current, lyrics_list_headers, f_path):
    """For every album id, scrape its song names/ids into lyricsList.txt.

    album_ids: album ids from get_album_links().
    lyrics_list_url_current: album-page URL prefix; the id is appended.
    lyrics_list_headers: request headers (same as the album-list page).
    f_path: output directory.
    """
    with open(f_path + "lyricsList.txt", 'a', encoding='utf-8') as f:
        for album_id in album_ids:
            url = lyrics_list_url_current + str(album_id)
            print("url is::", url)
            response_lyrics_list = requests.get(url, headers=lyrics_list_headers)
            html_lyrics_list = etree.HTML(response_lyrics_list.text)
            # The hidden <ul class="f-hide"> holds one <a> per song.
            lyric_list = html_lyrics_list.xpath('//ul[@class="f-hide"]//a')
            for lyric in lyric_list:
                html_data = str(lyric.xpath('string(.)'))
                # re.escape keeps song titles containing metacharacters from
                # breaking the pattern (the original script's TODO).
                pattern = re.compile(
                    r'<a href="/song\?id=(\d+?)">%s</a>' % re.escape(html_data))
                items = re.findall(pattern, response_lyrics_list.text)
                if len(items) == 1:
                    f.write(html_data + "\t" + str(items[0]) + "\n")
                elif len(items) == 0:
                    print("无歌曲id")
                else:
                    print("出现错误,一首歌曲的title对一个多个id::", html_data)
                print("歌曲::%s, 歌曲ID::%s 写入文件成功" % (html_data, items))


def get_lyrics(lyrics_headers, f_path):
    """Download the lyric text for every song listed in lyricsList.txt.

    lyrics_headers: request headers for the lyric API.
    f_path: directory containing lyricsList.txt; lyric files are written there.
    """
    with open(f_path + 'lyricsList.txt', 'r', encoding='utf8') as f:
        list_of_line = f.readlines()
    # Count of lyrics actually written (original started at 1: off by one).
    count = 0
    for elem in list_of_line:
        song_name = elem.split('\t')[0]
        # strip() removes the trailing newline that corrupted the API query.
        song_id = elem.split('\t')[1].strip()
        url = BASE_URL + "/api/song/lyric?" + "id=" + str(song_id) + '&lv=1&kv=1&tv=-1'
        response = requests.get(url, headers=lyrics_headers)
        json_content = json.loads(response.text)
        try:
            lyric = json_content['lrc']['lyric']
            # Strip the "[mm:ss.xx]" timestamp tags, keep the bare text.
            pattern = re.compile(r'\[.*\]')
            lrc = str(re.sub(pattern, "", lyric).strip())
            with open(f_path + "歌曲名-" + song_name + ".txt", 'w', encoding='utf-8') as w:
                w.write(lrc)
            count += 1
        except KeyError:
            # Instrumental tracks / missing lyrics have no 'lrc' entry.
            print("歌曲有错误,歌名为:%s。" % song_name)
    print("共爬取歌曲数量为:%s" % count)


def get_all_singers_id(path="../result/all_singer.csv"):
    """Read the singer csv from practice 1 into {singer_id: (name, category)}."""
    singer_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if len(line.split(",")) != 3:
                # Singer names may contain commas; re-join the middle fields.
                parts = line.split(",")
                singer_id = parts[0]
                singer_category = parts[-1]
                singer_name = ",".join(parts[1:-1])
            else:
                singer_id, singer_name, singer_category = line.split(",")
            singer_dict[singer_id] = (singer_name, singer_category)
    return singer_dict


def get_singer_songs_info(singer_id=3686, f_path="./lyrics_罗大佑/"):
    """Crawl every album/song of *singer_id*, writing results under *f_path*.

    f_path was read from an undeclared global in the original; it is now an
    explicit parameter with the original value as its default.
    """
    album_url = BASE_URL + "/artist/album?id=" + str(singer_id) + "&limit=1000&offset=0"
    album_headers = {
        # Fixed the original 'ext/html' typo.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        # Session cookie copied from a browser — replace with your own.
        'Cookie': '_iuqxldmzr_=32; _ntes_nnid=b9e9c8a460bfdbc1250ffc5908f95cad,1543554353478; '
                  '_ntes_nuid=b9e9c8a460bfdbc1250ffc5908f95cad; '
                  '__utma=94650624.345999225.1543554354.1543554354.1543554354.1; __utmc=94650624; '
                  '__utmz=94650624.1543554354.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
                  'WM_TID=N3LQ47ihEYOKTieS18tLGKdN8q6R0iyt; '
                  'JSESSIONID-WYYY=pNCF%2B8xzB2jTGWW7r7JlavTaS0YVMSZBP9THDnXZp86OQ3Aqo5WpW6dr3h6F'
                  'R3hgevYdmOdO8N7aubiagD%2FhBrf%2BYd%2BcXtBehyUotNH%2BCs%5CqZXKRbf4Pyt6fU1tl7UCs'
                  'XBvbe6b5%2BQwZ%5Cuth8Shm4fRFdkApHsDIEA9tuUYQYDB7BYuo%3A1543559635032; '
                  '__utmb=94650624.51.10.1543554354',
        'Host': 'music.163.com',
        'Referer': BASE_URL + '/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # Step 1: all album ids for this singer.
    album_ids = get_album_links(album_url, album_headers, f_path)
    # Step 2: songs per album; album detail pages accept the same headers.
    lyrics_list_url_current = BASE_URL + "/album?id="
    get_lyrics_list(album_ids, lyrics_list_url_current, album_headers, f_path)
    # Step 3 (disabled by default): headers for the lyric API.
    lyrics_headers = {
        'Request URL': BASE_URL + '/weapi/song/lyric?csrf_token=',
        'Request Method': 'POST',
        'Status Code': '200 OK',
        'Remote Address': '59.111.160.195:80',
        'Referrer Policy': 'no-referrer-when-downgrade',
    }
    # get_lyrics(lyrics_headers, f_path)


if __name__ == '__main__':
    # Output directory for the crawl results.
    f_path = "./lyrics_罗大佑/"
    if not os.path.exists(f_path):
        os.mkdir(f_path)
    singer_dict = get_all_singers_id()
    # Crawl the first 20 singers from the csv produced by practice 1.
    for singer_id in list(singer_dict.keys())[:20]:
        print("singer_id:", singer_dict[singer_id])
        get_singer_songs_info(singer_id, f_path)

效果:

以下是罗大佑的一些歌名及歌在网易云上的id.

爬虫关键参数的获取,可以参考:对网易云音乐参数(params,encSecKey)的分析

4. 实践3. 爬取每首歌的精选评论

# python3
# -*- coding: utf-8 -*-
# @Author : lina
"""Crawl the comment count and hot (top-liked) comments of every song.

Adapted from @平胸小仙女's NetCloud_spider3.py (see /question/36081767 and
answer /140287795 on Zhihu) — the weapi POST encryption scheme comes from
there. Random sleeps are inserted between requests because hammering the
site gets the IP banned.
"""
from Crypto.Cipher import AES
import base64
import requests
import re
import json
import time
import random
import os

# NOTE(review): the blog export stripped the domain from every URL;
# NetEase Cloud Music lives at https://music.163.com — confirm.
BASE_URL = "https://music.163.com"

# Result store: key (singer_name, song_name) ->
# (song_id, comment_count[, top_like_comments]).
comments_num_dict = {}

# Browser-captured request headers — replace Cookie with your own session.
headers = {
    'Host': 'music.163.com',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': BASE_URL + '/song?id=347597',
    'Content-Length': '484',
    'Cookie': '__s_=1; _ntes_nnid=f17890f7160fd145486752ebbf2066df,1505221478108; '
              '_ntes_nuid=f17890f7160fd145486752ebbf2066df; '
              'JSESSIONID-WYYY=Z99pE%2BatJVOAGco1d%2FJpojOK94Xe9GHqe0epcCOj23nqP2SlHt1XwzWQ2'
              'FXTwaM2xgIN628qJGj8%2BikzfYkv%2FXAUo%2FSzwMxjdyO9oeQlGKBvH6nYoFpJpVlA%2F8eP5'
              '7fkZAVEsuB9wqkVgdQc2cjIStE1vyfE6SxKAlA8r0sAgOnEun%2BV%3A1512200032388; '
              '_iuqxldmzr_=32; __utma=94650624.1642739310.1512184312.1512184312.1512184312.1; '
              '__utmc=94650624; __utmz=94650624.1512184312.1.1.utmcsr=(direct)|utmccn=(direct)'
              '|utmcmd=(none); playerid=10841206',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/0101 Firefox/56.0',
}

# weapi encryption constants. offset = (page - 1) * 20; 'total' is "true" on
# the first page only. second/third param are the public RSA exponent/modulus
# (the RSA step is precomputed — see get_encSecKey()).
second_param = "010001"
third_param = ("00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b72515"
               "2b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda"
               "92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4"
               "875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7")
forth_param = "0CoJUm6Qyw8W8jud"


def get_params(page):
    """Build the encrypted 'params' field for comment page *page* (1-based)."""
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'  # matches the hardcoded encSecKey below
    if page == 1:
        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        offset = str((page - 1) * 20)
        first_param = ('{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}'
                       % (offset, 'false'))
    # Two nested AES-CBC passes, as the site's JS does.
    h_encText = AES_encrypt(first_param, first_key, iv)
    h_encText = AES_encrypt(h_encText, second_key, iv)
    return h_encText


def get_encSecKey():
    """Return the RSA-encrypted second key, precomputed for key 16*'F'."""
    encSecKey = ("257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f8"
                 "31f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5"
                 "f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c"
                 "28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c")
    return encSecKey


def AES_encrypt(text, key, iv):
    """AES-128-CBC encrypt *text* (PKCS#7 padded), return base64 str."""
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(key.encode("utf-8"), AES.MODE_CBC, iv.encode("utf-8"))
    encrypt_text = encryptor.encrypt(text.encode("utf-8"))
    # b64encode returns bytes; the POST form field must be str.
    return str(base64.b64encode(encrypt_text), encoding="utf-8")


def get_json(url, params, encSecKey):
    """POST the encrypted form and return the raw JSON response body."""
    data = {"params": params, "encSecKey": encSecKey}
    response = requests.post(url, headers=headers, data=data)
    return response.content


def get_comments_num(url, singer_name, song_name, song_id, params, encSecKey):
    """Record the total comment count of one song in comments_num_dict."""
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    comments_num = int(json_dict['total'])
    print("count num is::", comments_num)  # fixed 'isL::' typo
    if comments_num_dict.get((singer_name, song_name)):
        print("comments_num_dict中已有信息::%s, %s" % (singer_name, song_name))
        # Keep whichever song id has the larger comment count (duplicates of
        # the same song across albums).
        if comments_num > comments_num_dict[(singer_name, song_name)][1]:
            comments_num_dict[(singer_name, song_name)] = (song_id, comments_num)
    else:
        comments_num_dict[(singer_name, song_name)] = (song_id, comments_num)


def get_top_like_comments(url, singer_name, song_name, song_id, params, encSecKey):
    """Fetch the hot ("hotComments") comments of one song.

    Extends the song's entry in comments_num_dict to a 3-tuple
    (song_id, comments_num, top_like_comments) and returns the comment list.
    """
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    hot_comments_list = json_dict["hotComments"]
    top_like_comments = [item["content"] for item in hot_comments_list]
    entry = comments_num_dict.get((singer_name, song_name))
    if entry:
        if len(entry) == 2:
            song_id, comments_num = entry
            comments_num_dict[(singer_name, song_name)] = (
                song_id, comments_num, top_like_comments)
        elif len(entry) == 3:
            pass  # already complete
    else:
        comments_num_dict[(singer_name, song_name)] = (
            song_id, len(top_like_comments), top_like_comments)
    return top_like_comments


def auto_create_path(FilePath):
    """Create *FilePath* (and parents) unless it already exists."""
    if os.path.exists(FilePath):
        print('dir exists')
    else:
        print('dir not exists')
        os.makedirs(FilePath)


def write_crapy_result(path="../result/comments_num", singer_name=""):
    """Write one TSV line per song: singer, song, song_id, count, hot comments.

    singer_name was previously read from an undefined global (NameError at
    runtime); it is now an explicit keyword parameter. The directory created
    now matches the directory actually written to.
    """
    auto_create_path(path)
    with open("{}/singer_".format(path) + singer_name + ".txt", 'w', encoding='utf8') as w:
        for key, value in comments_num_dict.items():
            w.write(key[0] + "\t" + key[1] + "\t" + str(value[0]) + "\t"
                    + str(value[1]) + "\t" + str(value[2]) + '\n')


def get_singer_comments_info(f_path, comments_num_dict=None):
    """Crawl comment counts + hot comments for every song in *f_path*.

    f_path: path to a lyricsList.txt with "song<TAB>song_id" lines; the
        singer's name is recovered from the ./lyrics_<name>/ convention.
    comments_num_dict: optional external result store. The original used a
        mutable default ({}) and then ignored the argument; when supplied it
        now replaces the module-level store the helpers write into.
    """
    if comments_num_dict is not None:
        globals()['comments_num_dict'] = comments_num_dict
    start_time = time.time()
    # Make sure the song-list directory exists (derived from f_path instead
    # of the original hardcoded "./lyrics_罗大佑").
    auto_create_path(os.path.dirname(f_path))
    params = get_params(1)
    encSecKey = get_encSecKey()
    singer_name = ""
    song_name = ""
    singer_name_obj = re.match(r'./lyrics_(.*)/lyricsList.txt', f_path)
    if singer_name_obj:
        singer_name = singer_name_obj.group(1)
        print("singer_name is::", singer_name)
    else:
        print("无法提取歌手姓名")
    with open(f_path, 'r', encoding='utf8') as f:
        results = f.readlines()
    for result in results:
        # Random sleep so the site does not ban our IP for hammering it.
        time.sleep(random.uniform(0.001, 1))
        temp_list = [elem for elem in result.split("\t") if elem != '']
        if len(temp_list) == 2:
            song_name = temp_list[0].strip()
            song_id = temp_list[1].strip()
        else:
            print("歌曲出现错误::", result)
            continue  # skip malformed lines instead of reusing stale ids
        print("song_name::%s, song_id::%s" % (song_name, song_id))
        # The R_SO_4_<id> comments endpoint of the song.
        url = (BASE_URL + "/weapi/v1/resource/comments/R_SO_4_"
               + str(song_id) + "?csrf_token=")
        get_comments_num(url, singer_name, song_name, song_id, params, encSecKey)
        get_top_like_comments(url, singer_name, song_name, song_id, params, encSecKey)
    auto_create_path(os.getcwd() + "./comments_num")
    write_crapy_result(singer_name=singer_name)
    end_time = time.time()
    print("程序耗时%f秒." % (end_time - start_time))


if __name__ == "__main__":
    f_path = "./lyrics_罗大佑/lyricsList.txt"
    comments_num_dict = {}
    get_singer_comments_info(f_path, comments_num_dict=comments_num_dict)

效果:

罗大佑叮咛1369739912217['其实罗大佑从不曾用教诲的姿态灌输给我们什么,他用的只是少年的心绪、最无忌的语言、最敏锐的感觉刺穿现实的重重迷雾!', '我觉得……现在听罗大佑的这么少,那一定是这个时代出了些问题', '毛不易唱的是岁月沧桑、酸甜苦辣难免牵绊。\n罗大佑唱的是流金岁月、悲欢离愁一饮而尽。', '太真了,有些人可能就是想捧杀毛毛,别在意,罗大佑真的是大师级别了', '罗大佑的歌就是诗,诗就是歌', '这个时代已经不需要呐喊', ]

完整代码

crawl_music163

欢迎Star!

参考:

Github

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。