JAVA 爬虫问题

V2EX = way to explore

V2EX 是一个关于分享和探索的地方

For Existing Member Sign In

This topic was created 3463 days ago; the information mentioned may have changed since then.

使用 HttpClient 模拟登录之后，将登录成功的 SESSIONID 传给 WebMagic 爬虫，爬虫能够爬取到需登录网站的下载链接，但使用链接下载的时候得到的都是“请重新登陆！”。
问题可能出在登录成功的 sessionid 失效了？

sessionid

登陆

webmagic

链接

10 replies 2016-11-02 18:37:04 +08:00

waytoexplorer

Nov 2, 2016

这个 sessionid 可能会定时变更，爬取的时候也要相应更改

CharlesL

Nov 2, 2016

把模拟登录之后的 cookie 也给 webmagic 带上试试

Jobin0528

Nov 2, 2016

@waytoexplorer 请问这个怎么操作？

Jobin0528

Nov 2, 2016

@CharlesL 带上了

joechan

Nov 2, 2016

应该是你带上的 sessionid 还是每次都不同的

winglight2016

Nov 2, 2016

你用浏览器访问一下，看看登录请求及返回的所有 header 和 response ，肯定是漏了什么地方没模拟对

Jobin0528

Nov 2, 2016

```
// POST handler mapped to "doGrab": fetches a session cookie for the given
// credentials, configures the WebMagic processor with it, and runs the spider
// synchronously (run() is called directly on the spider).
// NOTE(review): this paste is truncated - the method's return statement and
// closing brace are not shown, and the "request" parameter is unused in the
// visible part.
@RequestMapping(value = "doGrab",method = RequestMethod.POST)
public String doGrab(String username, String password, HttpServletRequest request){
try {
// Cookie string produced by the login helper for these credentials.
String cookie = simulationHttpUtil.getCookie(username,password);
// The piece after the first "=" is what gets passed to setSite as the cookie
// value. NOTE(review): presumably the string has the form "NAME=value" -
// confirm against getCookie; a string without "=" makes cookies[1] throw.
String cookies[] = cookie.split("=");
webMagicUtil.setSite(cookies[1]);
webMagicUtil.setCook(cookie);
Spider.create(webMagicUtil)
//Start crawling from this page
.addUrl("http://www.digifilm.com.cn/index.php/member/index")
.addPipeline(new ConsolePipeline())
//Crawl with 5 threads
.thread(5)
//Start the spider
.run();
// Prints the value of the processor's getLength() once the crawl has returned.
System.out.print(webMagicUtil.getLength());

} catch (Exception e) {
// Any failure (login or crawl) is only printed; nothing is rethrown here.
e.printStackTrace();
}

@Component
//给爬虫供给 cookie 的方法
/**
 * Logs in to www.digifilm.com.cn and returns a cookie fragment for the crawler.
 *
 * <p>Steps: fetch the login page and read its hidden {@code __hash__} field, then
 * repeatedly download a captcha image, OCR it, post the login form and probe the
 * member page until the probe response is larger than 7000 bytes (the heuristic
 * this code uses for "logged in").
 *
 * <p>Changes compared with the previous version: locals are declared and used under
 * the same names, each attempt posts exactly one {@code verify} field instead of
 * accumulating one per retry, a missing {@code Content-Length} header no longer
 * causes a NullPointerException, the retry loop is bounded, and responses and the
 * client are closed on every path.
 *
 * @param username account name posted as the {@code username} form field
 * @param password password posted as the {@code password} form field
 * @return the third {@code ;}-separated segment of the string that
 *         {@code setCookie} produced for the login-page response
 * @throws IllegalStateException if the login page has no {@code __hash__} field or
 *         the login is not confirmed within the attempt limit
 * @throws Exception on any HTTP, image-processing or OCR failure
 */
public String getCookie(String username, String password) throws Exception {
    // Upper bound on captcha/login rounds so bad credentials cannot loop forever.
    final int maxLoginAttempts = 30;
    // The member page is treated as "logged in" when it is larger than this many bytes.
    final int loggedInThreshold = 7000;
    final String hashMarker = "<input type=\"hidden\" name=\"__hash__\" value=\"";

    RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();
    try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build()) {
        HttpGet getHomePage = new HttpGet("http://www.digifilm.com.cn/index.php/public/login");
        getHomePage.setHeader("Accept", "text/html,application/xhtml+xml,image/jxr,*/*");
        getHomePage.setHeader("Accept-Encoding", "gzip,deflate");
        getHomePage.setHeader("Accept-Language", "zh-CN");
        getHomePage.setHeader("Connection", "Keep-Alive");
        getHomePage.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393");

        String rec;
        String hashValue;
        try (CloseableHttpResponse response = httpClient.execute(getHomePage)) {
            // NOTE(review): the cookie string is taken from the pre-login response;
            // if the server issues a new session id on login this value is stale - confirm.
            rec = setCookie(response);
            String responseHtml = EntityUtils.toString(response.getEntity());
            int markerAt = responseHtml.indexOf(hashMarker);
            if (markerAt < 0) {
                throw new IllegalStateException("Login page does not contain the hidden __hash__ field");
            }
            String tail = responseHtml.substring(markerAt + hashMarker.length());
            int valueEnd = tail.indexOf("\" />");
            hashValue = valueEnd >= 0 ? tail.substring(0, valueEnd) : tail;
        }

        // Form fields that stay the same for every attempt.
        List<NameValuePair> baseParams = new LinkedList<NameValuePair>();
        baseParams.add(new BasicNameValuePair("__hash__", hashValue));
        baseParams.add(new BasicNameValuePair("password", password));
        baseParams.add(new BasicNameValuePair("username", username));

        for (int attempt = 1; attempt <= maxLoginAttempts; attempt++) {
            // Fetch a fresh captcha image.
            HttpGet getCaptcha = new HttpGet("http://www.digifilm.com.cn/index.php/Verify/verify/?rand=" + Math.random());
            BufferedImage bufferedImage;
            try (CloseableHttpResponse imageResponse = httpClient.execute(getCaptcha);
                 InputStream in = imageResponse.getEntity().getContent()) {
                bufferedImage = imageUtil.imageChange(in);
            }
            // Denoise the image, then run OCR on the cleaned file.
            File file = imageUtil.cleanImage(bufferedImage);
            String text = scanCodeUtil.recognizeText(file);
            System.out.println("扫描后的图片:"+text);

            // Fresh list per attempt: only the current captcha guess is submitted.
            List<NameValuePair> valuePairs = new LinkedList<NameValuePair>(baseParams);
            valuePairs.add(new BasicNameValuePair("verify", text));
            HttpPost post = new HttpPost("http://www.digifilm.com.cn/index.php/public/checklogin");
            post.setEntity(new UrlEncodedFormEntity(valuePairs, Consts.UTF_8));
            httpClient.execute(post).close();

            // Probe a members-only page to find out whether the login took effect.
            HttpGet probe = new HttpGet("http://www.digifilm.com.cn/index.php/member/index");
            try (CloseableHttpResponse probeResponse = httpClient.execute(probe)) {
                long contentLength;
                Header lengthHeader = probeResponse.getFirstHeader("Content-Length");
                if (lengthHeader != null) {
                    contentLength = Long.parseLong(lengthHeader.getValue().trim());
                } else if (probeResponse.getEntity() != null) {
                    // No header (e.g. chunked or decoded content): measure the body instead.
                    contentLength = EntityUtils.toByteArray(probeResponse.getEntity()).length;
                } else {
                    contentLength = 0;
                }
                if (contentLength > loggedInThreshold) {
                    return rec.split(";")[2];
                }
            }
        }
        throw new IllegalStateException("Login was not confirmed after " + maxLoginAttempts + " attempts");
    }
}
}

@Component
public class WebMagicUtil implements PageProcessor {

private int length;
//部分一：抓取网站的相关配置，包括编码、抓取间隔、重试次数、超时时间等
private Site site ;
/**
 * Builds the crawl configuration used by this processor and stores it in {@code site}.
 *
 * <p>Configures 3 retries, a 1000 ms sleep between requests, a timeout of
 * 1000*60*60 ms, 3 cycle retries, the target domain, the session cookie and
 * browser-like request headers.
 *
 * <p>Fix: the header previously named {@code "Accept-Encodin"} is now spelled
 * {@code "Accept-Encoding"}, so the intended header is actually sent.
 *
 * @param cookie value stored under the {@code PHPSESSID} cookie name
 */
public void setSite(String cookie) {
    this.site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setTimeOut(1000 * 60 * 60)
            .setCycleRetryTimes(3)
            // The domain is set before the cookie is added; the original author notes
            // the cookie does not take effect otherwise.
            .setDomain("www.digifilm.com.cn")
            // Session cookie obtained from the simulated login.
            .addCookie("PHPSESSID", cookie)
            // Browser-like headers; the original author notes the site uses request
            // headers to tell browsers from crawlers.
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
            .addHeader("Accept", "text/html, application/xhtml+xml, image/jxr, */*")
            .addHeader("Accept-Encoding", "gzip,deflate")
            .addHeader("Accept-Language", "zh-CN")
            .addHeader("Connection", "Keep-Alive");
    //.addHeader("Referer","http://www.digifilm.com.cn/index.php/public/login");
}

/**
 * Extraction logic run for every fetched page.
 *
 * <ul>
 *   <li>Member index page: queues links to the {@code *_down/index} listing pages.</li>
 *   <li>Listing page: queues links to the detail pages and to further listing pages.</li>
 *   <li>Detail page: stores title/schedule/type/download links as result fields and
 *       downloads every link found.</li>
 * </ul>
 *
 * <p>Fixes: downloads now send the cookies configured on {@code site}; previously they
 * went through a plain URL connection without any session cookie, so the server could
 * not associate them with the logged-in session. The {@code length} counter is
 * incremented under a lock because the spider is started with several threads.
 *
 * @param page the page handed over by the spider
 */
@Override
public void process(Page page) {
    // Start page: collect links to the key listings.
    if (page.getUrl().regex("(.*/index\\.php/member/index)").match()) {
        page.addTargetRequests(page.getHtml().xpath("//div[@class=leaguer]").links().regex("(.*/index\\.php/(\\w+)_down/index)").all());
    }
    // Listing page: collect detail-page links and pagination links.
    if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/index)").match()) {
        page.addTargetRequests(page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlelb']").links().regex("(.*/index\\.php/(\\w+)_down/content/id/.*)").all());
        // Pagination links.
        page.addTargetRequests(page.getHtml().xpath("//div[@class=fanye_1]").links().regex("(.*/index\\.php/(\\w+)_down/index\\?&p=\\d+)").all());
    }
    // Detail page: extract the fields and download every key file.
    if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/content/id/\\w+)").match()) {
        page.putField("filmTitle", page.getHtml().xpath("//div[@class='videoDescri']/span[1]/text()"));
        page.putField("filmSchedule", page.getHtml().xpath("//div[@class='videoDescri']/span[2]/text()"));
        page.putField("filmType", page.getHtml().xpath("//div[@class='videoDescri']/span[3]/text()"));
        page.putField("secretKey", page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlela']/ul/li/a[@class='load']").links().regex("(.*/download\\.php\\?mid=.*)").all());
        List<String> list = page.getResultItems().get("secretKey");
        String cookieHeader = buildCookieHeader();
        for (String url : list) {
            System.out.println(url);
            // downloadFromUrl reports failures itself, so no try/catch is needed here.
            downloadFromUrl(url, "C:\\360Downloads\\Test\\", cookieHeader);
        }
        // process() may run on several spider threads at once.
        synchronized (this) {
            length++;
        }
    }
}

// Joins the cookies configured on the site into a "Cookie" header value; null when there are none.
private String buildCookieHeader() {
    if (site == null) {
        return null;
    }
    java.util.Map<String, String> cookies = site.getCookies();
    if (cookies == null || cookies.isEmpty()) {
        return null;
    }
    StringBuilder header = new StringBuilder();
    for (java.util.Map.Entry<String, String> cookie : cookies.entrySet()) {
        if (header.length() > 0) {
            header.append("; ");
        }
        header.append(cookie.getKey()).append('=').append(cookie.getValue());
    }
    return header.toString();
}

/**
 * Downloads {@code url} into directory {@code dir} without sending any cookie.
 *
 * @param url address of the file to download
 * @param dir target directory
 * @return {@code "Successful!"} on success, {@code "Fault"} if anything failed
 */
public static String downloadFromUrl(String url,String dir) {
    return downloadFromUrl(url, dir, null);
}

/**
 * Downloads {@code url} into directory {@code dir}, optionally sending a
 * {@code Cookie} request header so the server can match the request to a session.
 *
 * @param url address of the file to download
 * @param dir target directory; created (including parents) when missing
 * @param cookieHeader value for the {@code Cookie} header, or {@code null}/empty for none
 * @return {@code "Successful!"} on success, {@code "Fault"} if anything failed
 */
public static String downloadFromUrl(String url, String dir, String cookieHeader) {
    try {
        String fileName = getFileNameFromUrl(url);
        System.out.println(fileName);
        File file = new File(new File(dir), fileName);

        java.net.URLConnection connection = new URL(url).openConnection();
        if (cookieHeader != null && !cookieHeader.isEmpty()) {
            connection.setRequestProperty("Cookie", cookieHeader);
        }
        // Creates missing parent directories, overwrites the target and closes the stream.
        FileUtils.copyInputStreamToFile(connection.getInputStream(), file);
    } catch (Exception e) {
        e.printStackTrace();
        return "Fault";
    }
    return "Successful!";
}

// Per-process sequence number that keeps names unique within the same millisecond.
private static final java.util.concurrent.atomic.AtomicLong FILE_SEQUENCE =
        new java.util.concurrent.atomic.AtomicLong();

/**
 * Generates a file name for a download.
 *
 * <p>The name is the current time in milliseconds, a dash, a running sequence number
 * and the {@code .xml} extension. The sequence number prevents two downloads that
 * start in the same millisecond (the crawler runs several threads) from getting the
 * same name and overwriting each other.
 *
 * @param url address being downloaded; currently not used for the name
 * @return a file name ending in {@code .xml} that is unique within this process
 */
public static String getFileNameFromUrl(String url) {
    return System.currentTimeMillis() + "-" + FILE_SEQUENCE.incrementAndGet() + ".xml";
}

```

Jobin0528

Nov 2, 2016

@livid 可以帮忙删除这条代码么？

zoran

Nov 2, 2016

java 爬虫可以试试 https://github.com/zhuoran/crawler4j 记得点赞~~ ：）

bytenoob

Nov 2, 2016

是不是网站设置了单点登录