200字范文,内容丰富有趣,生活中的好帮手!
200字范文 > java做爬虫解决521错误

java做爬虫解决521错误

时间:2021-04-26 04:41:29

相关推荐

java做爬虫解决521错误

欢迎访问github

最近做爬虫时碰到了521错误。5开头(5xx)的状态码都是服务器端错误,521错误有很大可能是请求头参数不对,比如下面这个

这是错误的

这是正确的

就是这一堆东西可能有哪些少了或是错误,而在爬虫中遇到这个错误,又极大可能是少了Cookie参数,比如在这里就是少了Cookie参数。

展开那个Cookie

Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; _ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245

可以看到就是两个字段,一个yd_cookie,还有一个_ydclearance。那不就是少这两个参数嘛,yd_cookie可以看到在第一次错误的响应头中已经有了

Set-Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; Expires=1544419245; Path=/; HttpOnly

那只要用正则截取一下就好了,这样就只剩下一个_ydclearance。这个怎么解决呢?首先用各种拦截请求的工具(我用的是Firefox)直接看请求

可以看到实际上请求了两次,第一次就是521错误,第二次才是200的正确返回。

看下521错误的返回

<html><body><script language = "javascript">window.onload = setTimeout("fp(38)", 200);function fp(PD) {var qo, mo = "",no = "",oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";eval(qo);qo = 233;do {oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;} while (--qo >= 3);qo = 1;for(;;) {if(qo > 233) break;oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);qo++;}po = "";for(qo = 1; qo < oo.length - 1; qo++)if(qo % 6) po += String.fromCharCode(oo[qo] ^ 
PD);eval("qo=eval;qo(po);");} </script> </body></html>

一堆和密码一样的js代码,不管他,看最后一句

eval("qo=eval;qo(po);");

这句绕来绕去实际上就是eval(po)。把这个方法提取出来,将最后的eval改成return po,并在末尾加上对fp(38)的调用(38就是上面setTimeout里传入的参数)

// fp(PD): decodes the obfuscated byte table `oo` into a JavaScript source string and returns it.
//
// Steps, in the order the code runs them:
//   1. A loop held in a string and run through eval(): for indices 234 down to 2, each byte is
//      negated (mod 256), rotated right by one bit within 8 bits, then has 141 subtracted (mod 256).
//   2. For indices 233 down to 3, each byte has the preceding element subtracted from it (mod 256).
//   3. For indices 1 through 233, each byte has 197 and then 240 added (mod 256) and its high and
//      low nibbles swapped.
//   4. For indices 1 through oo.length - 2, every element whose index is NOT a multiple of 6 is
//      XOR-ed with PD and appended to `po` as a character; elements at multiples of 6 are skipped.
//
// Notes:
//   - `mo` and `no` are declared but never used.
//   - `po` is assigned without `var`, so it is not a local of fp.
//   - The trailing `fp(38);` invokes the function with the same argument that the challenge page
//     passes via setTimeout("fp(38)", 200).
function fp(PD) {var qo, mo = "",no = "",oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0xdb, 0xfd, 0x00, 0x81, 0x62, 0x7f, 0x02, 0x64, 0x25, 0x28, 0x30, 0x50, 0x17, 0x80, 0x01, 0x83, 0x4c, 0xbd, 0xc0, 0x67, 0xb0, 0xd1, 0x79, 0xbc, 0x1e, 0x7f, 0xc8, 0xaf, 0x72, 0x80, 0x41, 0x2a, 0xec, 0xf3, 0xdc, 0xe3, 0xc6, 0x6d, 0xb6, 0x1e, 0x28, 0x14, 0xfc, 0xa4, 0x0c, 0xf4, 0xfe, 0x24, 0x4b, 0x2e, 0xb0, 0x71, 0x7b, 0x7e, 0x5f, 0xe7, 0xca, 0x71, 0x7b, 0xb2, 0x3a, 0xe1, 0xea, 0x91, 0xb4, 0xd4, 0x7c, 0x43, 0xcc, 0x4d, 0x10, 0x88, 0x49, 0xf1, 0xfb, 0x84, 0x8b, 0x6f, 0x98, 0xbf, 0xe8, 0x6f, 0xf8, 0x69, 0x52, 0x79, 0x82, 0x4a, 0xf4, 0x95, 0x98, 0x38, 0xd8, 0x5a, 0x39, 0xeb, 0xee, 0xad, 0xb5, 0xb3, 0xf5, 0x5a, 0xbc, 0xe6, 0x8f, 0x18, 0xc0, 0x58, 0x62, 0x7f, 0x82, 0x43, 0x4d, 0x2d, 0xb5, 0x5d, 0x08, 0xb0, 0xb7, 0x8a, 0x72, 0x59, 0x02, 0xe9, 0x11, 0x55, 0x5e, 0x07, 0x46, 0x44, 0x5f, 0xbf, 0x87, 0x32, 0x53, 0x95, 0x97, 0xa5, 0x28, 0xa9, 0x0c, 0x14, 0x7e, 0x87, 0xf0, 0x58, 0xd9, 0x7a, 0xe4, 0xa4, 0x65, 0xc7, 0x8f, 0x3a, 0xda, 0x00, 0x81, 0xa2, 0x44, 0x4c, 0x96, 0x70, 0xb9, 0x81, 0x2c, 0x6b, 0xee, 0x8e, 0xf0, 0x12, 0x54, 0x93, 0xfd, 0x3f, 0x62, 0xa4, 0x65, 0x66, 0x68, 0xbb, 0xbe, 0x1f, 0x40, 0xaa, 0xcc, 0x1a, 0x5c, 0x1d, 0x9f, 0xc0, 0x42, 0x92, 0xd4, 0x35, 0x3d, 0x60, 0xa2, 0xac, 0x6d, 0x6e, 0x70, 0x71, 0xd3, 0xc6, 0xe5, 0x50, 0x4b, 0x28, 0x46, 0xfb, 0x3b];qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";eval(qo);qo = 233;do {oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;} while (--qo >= 3);qo = 1;for(;;) {if(qo > 233) break;oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);qo++;}po = "";for(qo = 1; qo < oo.length - 1; qo++)if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);return po;}fp(38);

把它放到浏览器中执行下

"document.cookie='_ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245; expires=Mon, 10-Dec-18 05:20:45 GMT; domain=.; path=/'; window.document.location=document.URL"

_ydclearance终于出来了,现在只要把_ydclearance和yd_cookie拼起来set到Cookie中就可以了。

JAVA代码

public class HandleCrawler {public static void setCookie() throws IOException, ScriptException {CloseableHttpResponse response = ApacheHttpUtil.sendGet(Constant.proxyUrl);if(response.getStatusLine().getStatusCode()==521) {String yd_cookie = getYdCookie(response.getAllHeaders());ProxyRequest.logger.info("yd_cookie is :"+yd_cookie);HttpEntity entity = response.getEntity();String html=EntityUtils.toString(entity,"utf-8");String runString = getRunString(html);String fuction = html.substring(html.indexOf("function")).replace("</script> </body></html>",runString+";").replace("eval(\"qo=eval;qo(po);\")","return po");ProxyRequest.logger.info("fuction is :"+fuction);ScriptEngineManager m = new ScriptEngineManager(); //获取JavaScript执行引擎ScriptEngine engine = m.getEngineByName("JavaScript"); //执行JavaScript代码String origin = (String) engine.eval(fuction);ProxyRequest.logger.info("origin ydclearance is :"+origin);String ydclearance = getYdclearance(origin);ProxyRequest.logger.info("ydclearance is :"+ydclearance);Constant.COOKIE = "yd_cookie="+yd_cookie+"; _ydclearance="+ydclearance;}}private static String getYdCookie(Header[] headers){String yd_cookie = null;for(Header header:headers){if (header.getName().equals("Set-Cookie")){yd_cookie = header.getValue();}}Pattern pattern = pile("(?<=yd_cookie=).+?(?=; Expires=)");Matcher matcher = pattern.matcher(yd_cookie);while (matcher.find()){yd_cookie = matcher.group(0);}return yd_cookie;}private static String getYdclearance(String origin){String ydclearance = null;Pattern pattern = pile("(?<=_ydclearance=).+?(?=; expires=)");Matcher matcher = pattern.matcher(origin);while (matcher.find()){ydclearance = matcher.group(0);}return ydclearance;}private static String getRunString(String html){Pattern pattern = pile("(?<=window.onload=setTimeout\\(\").+?(?=\", 200\\))");Matcher matcher = pattern.matcher(html);while (matcher.find()){return matcher.group(0);}return null;}}

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。