
Scraping Douyu Female Streamers' Photos with the Scrapy Framework, Based on the Looks Ranking

Posted: 2020-03-01 22:38:03


Spider file: douyugirl.py

# -*- coding: utf-8 -*-
import scrapy
import json
from douyu.items import DouyuItem


class DouyugirlSpider(scrapy.Spider):
    name = 'douyugirl'
    # NOTE: the domain here and the host portion of the API URL were stripped
    # when the post was archived; only the path of the ranking API survives.
    allowed_domains = ['']
    urlone = '/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [urlone + str(offset)]

    def parse(self, response):
        # The API returns JSON; the room list lives under the 'data' key.
        rooms = json.loads(response.body)['data']
        for each in rooms:
            item = DouyuItem()
            item['nickname'] = each['nickname']      # streamer's display name
            item['imageurl'] = each['vertical_src']  # vertical cover photo URL
            yield item
        # Page through the ranking 20 rooms at a time, up to offset 500.
        if self.offset < 500:
            self.offset += 20
            yield scrapy.Request(self.urlone + str(self.offset), callback=self.parse)
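Before wiring this into Scrapy, it can help to poke the ranking API once and confirm the JSON shape the spider relies on. A minimal sketch using requests; the API host is not preserved in this post, so API_HOST below is a placeholder, not the real hostname:

import json
import requests

API_HOST = 'http://api.example.com'  # placeholder: the real Douyu API host was stripped
resp = requests.get(API_HOST + '/api/v1/getVerticalRoom?limit=20&offset=0')
rooms = json.loads(resp.text)['data']
for room in rooms:
    # Each room entry carries exactly the two fields the spider extracts.
    print(room['nickname'], room['vertical_src'])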

settings.py

Be sure to enable ITEM_PIPELINES here, otherwise the pipeline will never run! The full settings.py follows the snippet below.

ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
}
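The value 300 is the pipeline's priority: lower numbers run earlier, and values conventionally fall in the 0-1000 range. A sketch of how a second pipeline would be ordered (StatsPipeline is a hypothetical name, not part of this project):

ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,    # runs first (lower number)
    # 'douyu.pipelines.StatsPipeline': 400,  # hypothetical: would run after
}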

# -*- coding: utf-8 -*-

# Scrapy settings for douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douyu'

SPIDER_MODULES = ['douyu.spiders']
NEWSPIDER_MODULE = 'douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douyu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douyu.middlewares.DouyuSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douyu.middlewares.DouyuDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
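One optional tweak worth considering: the spider fires a few dozen API requests in quick succession, so a small download delay or AutoThrottle keeps the crawl polite. A sketch with illustrative values (these are standard Scrapy settings, but not ones the original post enabled):

# Optional politeness settings (illustrative values):
DOWNLOAD_DELAY = 0.5         # pause between requests to the same site
AUTOTHROTTLE_ENABLED = True  # let Scrapy adapt the delay to server latency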

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    nickname = scrapy.Field()  # streamer's display name
    imageurl = scrapy.Field()  # URL of the vertical cover photo
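For readers new to Scrapy: an Item behaves like a dict with a fixed set of keys, which is why the spider can assign item['nickname'] directly. A quick sketch (the values are made up):

item = DouyuItem()
item['nickname'] = 'some_streamer'               # fine: declared field
item['imageurl'] = 'http://img.example.com/x.jpg'
# item['rank'] = 1  # would raise KeyError: 'rank' is not a declared field
print(dict(item))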

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests


class DouyuPipeline(object):
    def process_item(self, item, spider):
        name = item['nickname']
        url = item['imageurl']
        # Fetch the image bytes; the post uses requests here rather than
        # Scrapy's own downloader.
        data = requests.get(url).content
        os.makedirs('./girl', exist_ok=True)  # the original assumed ./girl already existed
        with open('./girl/%s.jpg' % name, 'wb') as f:
            f.write(data)  # the with-block closes the file; no explicit close needed
        return item
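An alternative worth knowing about: Scrapy ships with a built-in ImagesPipeline that fetches images through the framework's own downloader (with retries and deduplication) instead of blocking on requests inside process_item. A minimal sketch of how this project could use it, assuming Pillow is installed; DouyuImagesPipeline is a name introduced here, not from the original post:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DouyuImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Ask Scrapy's downloader to fetch the cover photo.
        yield scrapy.Request(item['imageurl'], meta={'nickname': item['nickname']})

    def file_path(self, request, response=None, info=None):
        # Save each image under the streamer's nickname.
        return '%s.jpg' % request.meta['nickname']

To switch over, point ITEM_PIPELINES at 'douyu.pipelines.DouyuImagesPipeline' and set IMAGES_STORE = './girl' in settings.py.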

Success! (Run the spider from the project root with: scrapy crawl douyugirl)

The results are shown below.

The complete project is available here and can be run directly after opening: /download/qq_38162763/10587011

That wraps up this blog post.

Bye~
