在 PyCharm 中安装 requests、lxml、beautifulsoup4（即 BeautifulSoup）这几个包：
pip install lxml
pip install beautifulsoup4
pip install requests
UA伪装
# Request headers carrying a desktop Chrome User-Agent string, to be passed
# along with each HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}
然后用 BeautifulSoup 定位到标签，全部代码如下：
# Crawl a novel: fetch the chapter index page, follow every chapter link found
# under "#list > dl > dd", and append each chapter's title and text to a single
# UTF-8 text file.
from time import sleep

import requests
import lxml  # noqa: F401 -- not referenced directly; backs BeautifulSoup's 'lxml' parser
from bs4 import BeautifulSoup

# NOTE(review): the scheme and host appear to have been stripped from the URLs
# below (and "Host" is empty). Fill in the real site address in `url`, "Host",
# "Referer" and the chapter URL prefix before running; requests does not accept
# a URL without a scheme.
url = '/0/425/'
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Host": "",
    "Referer": "/0/425/",
}

# Seconds to wait for a response; without a timeout a stalled connection would
# block the crawl forever.
REQUEST_TIMEOUT = 10

# Fetch and parse the chapter index.
resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
chapter_items = soup.select('#list > dl > dd')

# The context manager guarantees the output file is flushed and closed even if
# a request fails part-way through the crawl.
with open("斗破苍穹.txt", 'w', encoding='utf-8') as fp:
    for chapter_item in chapter_items:
        anchor = chapter_item.a
        if anchor is None or not anchor.get('href'):
            # A <dd> without a usable link carries no chapter to download.
            continue

        # get_text() also handles anchors with nested markup, where .string
        # would be None and break the string concatenation below.
        title = anchor.get_text()
        title_url = "/0/425/" + anchor['href']

        # Pause between requests so the site is not hammered.
        sleep(1)
        chapter_resp = requests.get(
            url=title_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        chapter_resp.encoding = 'utf-8'
        chapter_soup = BeautifulSoup(chapter_resp.text, 'lxml')

        content_tag = chapter_soup.find('div', id='content')
        if content_tag is None:
            # The page has no <div id="content">; report it and move on rather
            # than crashing with AttributeError.
            print(title + " " + "未找到正文，已跳过")
            continue

        fp.write(title + "\n" + content_tag.text + "\n")
        print(title + " " + "爬取完成")