200字范文,内容丰富有趣,生活中的好帮手!
200字范文 > 爬取去哪儿网酒店信息

爬取去哪儿网酒店信息

时间:2020-06-23 13:17:25

相关推荐

爬取去哪儿网酒店信息

不说太多废话,就简单一句:你们要爬哪里,可以把地点改一下,时间也要改一下;爬取数量请自行修改参数和代码,变化不大。有问题请留言,我就不再赘述分析了(这里我爬取的是上海最近的酒店信息)。

# coding=utf-8
"""Download the Qunar hotel list for a city and append it to a CSV file."""
import csv  # module for storing files (not used by the code below)
import datetime
import json
import time

import pandas as pd  # tabular handling and CSV export
import requests

# Target endpoint.
# NOTE(review): the scheme and host were lost when this script was published
# (the "Host" and "referer" headers below are truncated the same way).
# requests refuses a URL without a scheme, so restore the full address
# before running.
API_URL = '/api/hotel/hotellist'

# Every page of results is appended to this file.
OUTPUT_CSV = 'qunaer9.csv'

# Pause after each follow-up page request.
PAGE_DELAY_SECONDS = 3.1

# Without a timeout a dead proxy would block the crawl forever.
REQUEST_TIMEOUT_SECONDS = 10

# Request headers and cookie sent with every call.
REQUEST_HEADERS = {
    "wx-v": "",
    "content-type": "application/json",
    "Connection": "Keep-Alive",
    "Accept-Encoding": "gzip",
    "wx-q": "",
    "unionid": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
    "openid": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
    "wx-t": "",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 MicroMessenger/6.7.2.1340(0x2607023A) NetType/WIFI Language/zh_CN",
    "charset": "utf-8",
    "referer": "/wx799d4d93a341b368/114/page-frame.html",
    "Host": "",
    "Cookie": "QN48=tc_437f21c62a765ca0_165c198a408_e56b; QN1=qunar; QN66=smart_app; QN1=O5cv+luWLPthsvB1BKl0Ag==",
    "Content-Length": "0",
}

# Candidate HTTP proxies; only DEFAULT_PROXY is used by the requests below.
PROXY_POOL = (
    {'http': 'http://101.132.122.230:3128'},
    {'http': 'http://114.113.126.83:80'},
    {'http': 'http://210.45.123.127:9999'},
    {'http': 'http://118.190.217.182:80'},
    {'http': 'http://120.27.14.125:80'},
    {'http': 'http://118.31.223.194:3128'},
    {'http': 'http://101.37.79.125:3128'},
    {'http': 'http://125.62.26.197:3128'},
    {'http': 'http://218.60.8.98:3129'},
    {'http': 'http://114.215.95.188:3128'},
    {'http': 'http://218.60.8.99:3129'},
    {'http': 'http://218.60.8.83:3129'},
    {'http': 'http://118.190.217.61:80'},
    {'http': 'http://203.86.26.9:3128'},
    {'http': 'http://114.113.126.87:80'},
    {'http': 'http://106.12.32.43:3128'},
)
DEFAULT_PROXY = PROXY_POOL[1]


def _build_query(city, page, check_in, check_out):
    """Return the query parameters for one page of the hotel list."""
    return {
        "city": city,
        "cityUrl": "",
        "page": page,
        "extra": "{}",
        "sort": "",
        "keywords": "",
        "checkOutDate": check_out,
        "checkInDate": check_in,
        "locationAreaFilter": "",
        "comprehensiveFilter": "[]",
        "fixedComprehensiveFilter": "[]",
        "SDKVersion": "2.2.4",
        "wxUnionId": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
        "wxOpenId": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
        "bd_source": "smart_app",
        "bd_origin": "pt-onl-ots-ggjd",
    }


def _fetch_page(city, page, check_in, check_out):
    """Request one result page and return the ``data`` object of the reply."""
    response = requests.post(
        API_URL,
        headers=REQUEST_HEADERS,
        # The fields travel in the query string, as in the original script.
        params=_build_query(city, page, check_in, check_out),
        proxies=DEFAULT_PROXY,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return json.loads(response.text)['data']


def _append_hotels(hotels):
    """Append one page of hotel records to OUTPUT_CSV without a header row."""
    pd.DataFrame(data=hotels).to_csv(OUTPUT_CSV, mode='a', header=False)


def crow_id(city, *, check_in=None, check_out=None):
    """Fetch every result page for *city* and append the hotels to OUTPUT_CSV.

    The first page is requested to learn ``totalPage``; a failure there
    propagates to the caller. Each following page is best-effort: a request,
    decoding or response-shape error is printed and the crawl moves on.

    Args:
        city: City name sent as the ``city`` field of the query.
        check_in: Check-in date string. Defaults to today.
        check_out: Check-out date string. Defaults to tomorrow.

    Raises:
        ValueError: If only one of ``check_in`` / ``check_out`` is given.
    """
    if (check_in is None) != (check_out is None):
        raise ValueError("check_in and check_out must be given together")
    if check_in is None:
        # NOTE(review): the published date literals had lost their year
        # ("-10-29", "-11-1") and differed between page 1 and later pages.
        # YYYY-MM-DD is assumed here - confirm the format the API expects.
        today = datetime.date.today()
        check_in = today.isoformat()
        check_out = (today + datetime.timedelta(days=1)).isoformat()

    first_page = _fetch_page(city, 1, check_in, check_out)
    total_pages = first_page['totalPage']
    print("当前总页数:", total_pages)
    print("Page:%d" % 1)
    _append_hotels(first_page['hotels'])

    # Pages are 1-based and page 1 is already stored, so stop exactly at
    # total_pages instead of requesting one page past the end.
    for page in range(2, total_pages + 1):
        try:
            listing = _fetch_page(city, page, check_in, check_out)
            _append_hotels(listing['hotels'])
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            # One bad page must not abort the crawl; report it and continue.
            print(exc)
        print("Page:%d" % page)
        time.sleep(PAGE_DELAY_SECONDS)


def main():
    """Crawl every city listed in the built-in area table."""
    areas = {"areaObj": {"上海": [{"city": '上海'}]}}
    for entries in areas['areaObj'].values():
        for entry in entries:
            print("开始抓取%s区域:" % (entry['city']))
            crow_id(entry['city'])


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。