public static void main(String[] args) {
String body = "";
String url = "<a href="http://www.google.cn/search?q=%E4%BD%A0%E5%A5%BD" target="_blank">http://www.google.cn/search?q=%E4%BD%A0%E5%A5%BD</a>";
//构造HttpClient的实例
HttpClient httpClient = new HttpClient();
httpClient.setConnectionTimeout(HTTPCONTIMEOUT);//设置连接超时时间
httpClient.setTimeout(HTTPCONTIMEOUT);
String referer="<a href="http://www.google.cn/" target="_blank">http://www.google.cn/</a>";
Header [] cookie_h=null;
AutoSearchBadWordOnWWW search=new AutoSearchBadWordOnWWW();
int TIMEOUT=60000;
for(int i=0;i<1000;i++){
//创建GET方法的实例
GetMethod getMethod = new GetMethod(url);
try {
getMethod.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
//使用系统提供的默认的恢复策略
//getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());
getMethod.addRequestHeader("User-Agent","Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;)");
getMethod.addRequestHeader("Accept", "*/*");
getMethod.addRequestHeader("Accept-Language", "zh-cn");
getMethod.addRequestHeader("Accept-Encoding", "gzip,deflate");
getMethod.addRequestHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
//getMethod.addRequestHeader("Keep-Alive", "300");
getMethod.addRequestHeader("Connection", "Keep-Alive");
getMethod.addRequestHeader("Cache-control", "no-cache");
getMethod.addRequestHeader("Referer",referer);</span>
if(cookie_h!=null){
for(Header c : cookie_h){
getMethod.addRequestHeader(c);
}
}
//执行getMethod
int statusCode = httpClient.executeMethod(getMethod);
if (statusCode != HttpStatus.SC_OK) {
System.err.println("getMethod:" + statusCode);
}
//body = getMethod.getResponseBodyAsString();
cookie_h=getMethod.getResponseHeaders("Set-Cookie");
BufferedReader br=new BufferedReader(new InputStreamReader(getMethod.getResponseBodyAsStream(),<span style="color: #ff0000;">"UTF-8"</span>));//中文语言转码
StringBuffer responseBody=new StringBuffer();
String s="";
while((s=br.readLine())!=null){
responseBody.append(s);
}
body=responseBody.toString();
referer=url;
url=search.getNextUrlByGoogle(body);
//System.out.println("referer:"+referer);
System.out.println(i+" nextUrl:"+url);
search.sleep();
} catch (Exception e) {
e.printStackTrace();
} finally {
getMethod.releaseConnection();
}
}
}
重点:如果频繁访问,则可能会被google封掉。
一般每抓一个网页随机停4-12秒
一旦被 Google 封掉，则可以通过随机生成 httpClient 请求头中的 Accept 参数来解决。
PHP 版本如下:
首先借助 HttpClient.class.php 模拟前端操作（例如浏览页面、登录页面）；
然后得到返回结果借助 simple_html_dom.php 分析返回来的String结果;
抓取示例代码如下:
<?php
require_once 'libs/HttpClient.class.php';
require_once 'libs/simple_html_dom.php';
// Instantiate Sample; the demo logic lives in the constructor of the class defined below.
$mySample = new Sample();
/**
 * Demo scraper built on HttpClient.class.php and simple_html_dom.php.
 *
 * Constructing the object runs the whole demo: it posts a login form to a
 * placeholder host, then fetches a Google search result page, dumps it, lists
 * every link in it and dumps the first <body> and <div> fragments.
 */
class Sample
{
    private $client;          // HttpClient bound to www.google.cn
    private $pageConents;     // raw HTML returned by the search request
    private $htmlContents;    // simple_html_dom tree of $pageConents (false if parsing failed)
    private $htmlBody;        // "<body ...>...</body>" fragment of the page
    private $htmlDiv;         // first "<div ...>...</div>" fragment of the page
    private $htmlFile;        // name of the output file
    private $htmlFileHandler; // handle of the output file (false if it could not be opened)

    /**
     * Runs the demo. Uses __construct so that `new Sample()` still triggers it
     * on PHP versions that no longer honour class-named constructors.
     */
    public function __construct()
    {
        // STEP 1: initialize.
        $this->client = new HttpClient("www.google.cn"); // e.g. http://www.google.cn/search?hl=zh-CN&q=word
        $this->client->setDebug(TRUE);
        $this->htmlFile = "sample.html";
        $this->htmlFileHandler = fopen($this->htmlFile, 'w');

        // STEP 2: fake the User-Agent string for the search requests.
        $this->client->setUserAgent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.3a) Gecko/20021207');

        $this->runLoginDemo();
        $this->runSearchDemo();

        // Nothing is written to the file in this demo; to save the page, call
        // fwrite($this->htmlFileHandler, $this->pageConents) before this point.
        if ($this->htmlFileHandler !== false) {
            fclose($this->htmlFileHandler);
        }
    }

    /**
     * A simple POST request: submits placeholder credentials to the login page.
     * Uses its own client so the Google client keeps pointing at www.google.cn.
     */
    private function runLoginDemo()
    {
        $loginClient = new HttpClient("passport.zzzzzz.com");
        $loginClient->post("/login.php", array('username'=>'xxxx','password'=>'yyyy'));
        // The status is only meaningful after a request has been made.
        if ($loginClient->getStatus() == '404') {
            die('This page is not existed!');
        }
        // Debug output; note that the dump includes the posted credentials.
        var_dump($loginClient);
    }

    /**
     * A simple GET request: fetches a search result page and analyses it.
     */
    private function runSearchDemo()
    {
        if (!$this->client->get('/search', array('hl'=>'zh_CN','q'=>'农业'))) {
            die('An error occured::'.$this->client->getError());
        }
        $this->pageConents = $this->client->getContent();
        var_dump($this->pageConents);
        $this->htmlContents = str_get_html($this->pageConents);

        $this->htmlBody = $this->extractFragment($this->pageConents, "<body", "</body>");
        if ($this->htmlBody !== null) {
            var_dump($this->htmlBody);
            // Find all links in the page; skip if the parser rejected the document.
            if ($this->htmlContents) {
                foreach ($this->htmlContents->find('a') as $element) {
                    echo $element->href . '<br>';
                }
            }
        }

        $this->htmlDiv = $this->extractFragment($this->pageConents, "<div", "</div>");
        if ($this->htmlDiv !== null) {
            var_dump($this->htmlDiv);
        }
    }

    /**
     * Returns the text from the first $openTag through the first $closeTag that
     * follows it (inclusive), matching case-insensitively, or null when either
     * tag is missing.
     */
    private function extractFragment($html, $openTag, $closeTag)
    {
        $start = stripos($html, $openTag);
        if ($start === false) {
            return null;
        }
        // Search from $start so a closing tag that precedes the opening tag is ignored.
        $end = stripos($html, $closeTag, $start);
        if ($end === false) {
            return null;
        }
        return substr($html, $start, $end + strlen($closeTag) - $start);
    }
}
?>