Python-Scrapy 获取历史双色球开奖号码
文章目录
1-创建项目2-settings文件设置3-Item设置4. 创建Spider5-爬取规则的编写6-pipeline.py文件的编写7-爬取8-数据统计1-创建项目
在终端中输入创建Scrapy项目的命令:
scrapy startproject GetBicolorNumber
2-settings文件设置
# settings.py — Scrapy project settings (reformatted from a single collapsed line).

# Ignore robots.txt so the history pages can be fetched.
ROBOTSTXT_OBEY = False

# Default request headers.
# BUG FIX: the original used the key 'User_Agent' (underscore), which is not
# a valid HTTP header name and would not override Scrapy's default UA; the
# correct header key is 'User-Agent'.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}

# Enable the pipeline that writes scraped items to projects.json.
ITEM_PIPELINES = {
    'GetBicolorNumber.pipelines.GetbicolornumberPipeline': 300,
}

# Suppress routine log output; only warnings and above are shown.
LOG_LEVEL = "WARNING"
3-Item设置
Item设置,设置需要爬取的数据内容,items.py
issue = scrapy.Field() # 旗号time = scrapy.Field() # 开奖具体时间numbers = scrapy.Field() # 中奖号码
4. 创建Spider
创建一个Spider,终端上进入GetBicolorNumber/GetBicolorNumber/spiders目录
输入scrapy genspider bicolor_number www.zhcw.com(genspider命令需要同时提供爬虫名和目标域名)
5-爬取规则的编写
# -*- coding: utf-8 -*-
# spiders/bicolor_number.py — spider that walks the paginated draw-history list.
import scrapy
import time  # kept: the original used it for an (optional) throttle between pages

from ..items import GetbicolornumberItem


class BicolorNumberSpider(scrapy.Spider):
    """Crawl historical two-color-ball (双色球) draws, one list page at a time."""

    name = 'bicolor_number'
    # allowed_domains = ['www.zhcw.com']  # original value was garbled in the source
    # Let 404/500 responses reach the callback instead of being dropped.
    handle_httpstatus_list = [404, 500]
    # NOTE(review): the scheme/host was stripped when this post was extracted;
    # Scrapy requires absolute start URLs. The /zhcw/... path suggests the
    # original host was http://www.zhcw.com — confirm before running.
    start_urls = ['http://www.zhcw.com/zhcw/html/ssq/list_1.html']
    url = "http://www.zhcw.com/zhcw/html/ssq/list_{}.html"
    page = 1

    def parse(self, response):
        print("Crawl 第:{}页".format(self.page))
        # Table rows holding the draw data; skip the two header rows and
        # the trailing pager row.
        datas_xpath = response.xpath('/html/body/table//tr')
        for data in datas_xpath[2:-1]:
            # BUG FIX: the original created ONE item before the loop and
            # mutated it for every row, so each yielded reference pointed at
            # the same object and later rows overwrote earlier ones.
            # A fresh item per row yields independent records.
            item = GetbicolornumberItem()
            item['issue'] = data.xpath("./td[1]/text()").extract_first()
            item['time'] = data.xpath("./td[2]/text()").extract_first()
            item['numbers'] = data.xpath("./td[3]//em/text()").extract()
            yield item
        self.page = self.page + 1
        next_page = self.url.format(self.page)
        # time.sleep(2)  # optional politeness delay
        if self.page <= 145:
            # errback lets us inspect failed requests instead of losing them.
            yield scrapy.Request(next_page, callback=self.parse,
                                 errback=self.after_404)

    def after_404(self, response):
        # Best-effort diagnostics: just report the URL that failed.
        print(response.url)
6-pipeline.py文件的编写
文件保存为projects.json。
# pipelines.py — persist each scraped item to projects.json, one JSON object
# per line (JSON Lines format).
import codecs
import json


class GetbicolornumberPipeline(object):
    """Write items as newline-delimited JSON and close the file on shutdown."""

    def __init__(self):
        # 'w+' truncates any previous run's output; UTF-8 so Chinese text
        # round-trips correctly.
        self.file = codecs.open('projects.json', 'w+', encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        data = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(data)
        return item

    def close_spider(self, spider):
        # BUG FIX: the original defined `spider_closed`, a method name Scrapy
        # never invokes on a pipeline (that name is a signal handler, which was
        # never connected). The hook Scrapy calls automatically is
        # `close_spider`, so the original left the file handle open.
        self.file.close()
7-爬取
命令行进入项目的根目录scrapy crawl bicolor_number
建议:创建一个start.py文件,执行此文件即可
from scrapy import cmdline# 执行爬虫cmdline.execute("scrapy crawl bicolor_number".split())
8-数据统计
获取历史蓝色球和红色球的出现次数
# -*- coding: utf-8 -*-
# Tally how often each red ball and blue ball appeared across all draws
# stored in projects.json (one JSON object per line, as written by the pipeline).
import json
import operator
from collections import Counter


def get_json(file_path):
    """Return the raw lines of the JSON-lines file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as jf:
        json_list = jf.readlines()  # original name `josn_list` was a typo
    return json_list


def get_numbersR_dict(number_list):
    """Count occurrences of each red ball across all draws.

    Each entry of *number_list* is one draw's numbers; every element except
    the last is a red ball.  BUG FIX: the original set a number's count to 0
    the first time it was seen (instead of 1), undercounting every ball by
    one.  Counter counts correctly.
    """
    red_counts = Counter()
    for numbers in number_list:
        red_counts.update(numbers[:-1])
    return dict(red_counts)


def get_numbersB_dict(number_list):
    """Count occurrences of each blue ball (the last element of each draw).

    Same first-occurrence-counted-as-zero bug fixed as in get_numbersR_dict.
    """
    blue_counts = Counter(numbers[-1] for numbers in number_list)
    return dict(blue_counts)


def sort_dictKey(numbers_dict, sort_key):
    """Return (key, count) pairs in the order given by *sort_key*,
    silently skipping keys absent from *numbers_dict*."""
    result = []
    for k in sort_key:
        if k not in numbers_dict:
            continue
        result.append((k, numbers_dict[k]))
    return result


if __name__ == '__main__':
    file_path = r"E:\pyCharm\网络爬虫\test_scrapy\GetBicolorNumber\GetBicolorNumber\projects.json"
    json_list = get_json(file_path)
    number_list = [json.loads(line)['numbers'] for line in json_list]
    # Fixed duplicated word "数据数据" in the original message.
    print("总共有:{}期双色球数据".format(len(number_list)))
    numbersR_dict = get_numbersR_dict(number_list)
    numbersB_dict = get_numbersB_dict(number_list)
    # Sort by occurrence count, most frequent first.
    numbersR_v = sorted(numbersR_dict.items(), key=operator.itemgetter(1), reverse=True)
    numbersB_v = sorted(numbersB_dict.items(), key=operator.itemgetter(1), reverse=True)
    print("红色球出现统计数据:")
    for kv in numbersR_v:
        print(kv[0], ":", kv[1])
    print("蓝色球出现统计数据:")
    for kv in numbersB_v:
        print(kv[0], ":", kv[1])