Yangbo's Blog!

Artificial Intelligence Board Game.
Browsing JAVA

Scraping Google search results (抓取Google搜索结果测试)

March 25
public static void main(String[] args) {
String body = "";
String url = "<a href="http://www.google.cn/search?q=%E4%BD%A0%E5%A5%BD" target="_blank">http://www.google.cn/search?q=%E4%BD%A0%E5%A5%BD</a>";
//构造HttpClient的实例
HttpClient httpClient = new HttpClient();
httpClient.setConnectionTimeout(HTTPCONTIMEOUT);//设置连接超时时间
httpClient.setTimeout(HTTPCONTIMEOUT);
String referer="<a href="http://www.google.cn/" target="_blank">http://www.google.cn/</a>";
Header [] cookie_h=null;
AutoSearchBadWordOnWWW search=new AutoSearchBadWordOnWWW();

int TIMEOUT=60000;
for(int i=0;i<1000;i++){
//创建GET方法的实例
GetMethod getMethod = new GetMethod(url);
try {
getMethod.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
//使用系统提供的默认的恢复策略
//getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());
getMethod.addRequestHeader("User-Agent","Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;)");
getMethod.addRequestHeader("Accept", "*/*");
getMethod.addRequestHeader("Accept-Language", "zh-cn");
getMethod.addRequestHeader("Accept-Encoding", "gzip,deflate");
getMethod.addRequestHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
//getMethod.addRequestHeader("Keep-Alive", "300");
getMethod.addRequestHeader("Connection", "Keep-Alive");
getMethod.addRequestHeader("Cache-control", "no-cache");
getMethod.addRequestHeader("Referer",referer);</span>
if(cookie_h!=null){
for(Header c : cookie_h){
getMethod.addRequestHeader(c);
}
}

//执行getMethod
int statusCode = httpClient.executeMethod(getMethod);

if (statusCode != HttpStatus.SC_OK) {
System.err.println("getMethod:" + statusCode);
}
//body = getMethod.getResponseBodyAsString();

cookie_h=getMethod.getResponseHeaders("Set-Cookie");

BufferedReader br=new BufferedReader(new InputStreamReader(getMethod.getResponseBodyAsStream(),<span style="color: #ff0000;">"UTF-8"</span>));//中文语言转码
StringBuffer responseBody=new StringBuffer();
String s="";
while((s=br.readLine())!=null){
responseBody.append(s);
}

body=responseBody.toString();

referer=url;
url=search.getNextUrlByGoogle(body);
//System.out.println("referer:"+referer);
System.out.println(i+" nextUrl:"+url);
search.sleep();
} catch (Exception e) {
e.printStackTrace();
} finally {
getMethod.releaseConnection();
}
}

}

重点:如果频繁访问,则可能会被google封掉。
一般每抓一个网页随机停4-12秒
一旦被Google封掉,则可以通过随机产生httpClient中的header参数Accept来解决。

PHP 版本如下:
首先借助 HttpClient.class.php 模拟前端操作(例如浏览页面,登录页面);
然后得到返回结果借助 simple_html_dom.php 分析返回来的String结果;
抓取示例代码如下:

<?php
require_once 'libs/HttpClient.class.php';
require_once 'libs/simple_html_dom.php';

/**
 * Demo scraper: HttpClient.class.php issues the requests (page views, form logins)
 * and simple_html_dom.php parses the returned HTML string.
 *
 * Constructing the object prepares a www.google.cn client, opens sample.html for
 * writing and then runs the login demo. The search demo is available separately
 * through searchDemo().
 *
 * simple_html_dom can also load a page directly, e.g.
 * file_get_html('http://www.google.com/') followed by ->find('img') or ->find('a').
 */
class Sample
{
	private $client;
	private $pageContents;
	private $htmlContents;
	private $htmlBody;
	private $htmlDiv;
	private $htmlFile;
	private $htmlFileHandler;

	/**
	 * STEP 1: initialize the client and the output file, then run the login demo.
	 */
	public function __construct()
	{
		$this->client = $this->newSearchClient();

		$this->htmlFile = "sample.html";
		// NOTE(review): mode 'w' truncates sample.html, and nothing in this class
		// writes to or closes this handle - confirm it is still wanted.
		$this->htmlFileHandler = fopen($this->htmlFile, 'w');

		$this->loginDemo();
	}

	/**
	 * Builds a debug-enabled client for www.google.cn
	 * (target URL: http://www.google.cn/search?hl=zh-CN&q=word) with a faked User-Agent.
	 */
	private function newSearchClient()
	{
		$client = new HttpClient("www.google.cn");
		$client->setDebug(TRUE);
		// Fake the User Agent string
		$client->setUserAgent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.3a) Gecko/20021207');
		return $client;
	}

	/**
	 * A simple POST request: logs in to passport.zzzzzz.com and dumps the client.
	 * Terminates the script if the current client reports a 404 status.
	 */
	private function loginDemo()
	{
		// NOTE(review): no request has been sent on $this->client before this check,
		// so it is unclear what getStatus() reports here - confirm.
		if ($this->client->getStatus() == '404') {
			die('This page is not existed!');
		}

		$this->client = new HttpClient("passport.zzzzzz.com");
		$this->client->post("/login.php", array('username'=>'xxxx','password'=>'yyyy'));
		var_dump($this->client);
	}

	/**
	 * A simple GET request: fetches /search from www.google.cn, dumps the page,
	 * its <body> element and its first <div> element, and echoes every link href.
	 * Terminates the script when the request fails.
	 */
	public function searchDemo()
	{
		// The login demo replaces $this->client with the passport host, so the
		// search always starts from a fresh www.google.cn client.
		$this->client = $this->newSearchClient();

		if (!$this->client->get('/search', array('hl'=>'zh_CN','q'=>'农业'))) {
			die('An error occured::'.$this->client->getError());
		}

		$this->pageContents = $this->client->getContent();
		var_dump($this->pageContents);
		$this->htmlContents = str_get_html($this->pageContents);

		$this->htmlBody = $this->sliceElement($this->pageContents, "<body", "</body>");
		if ($this->htmlBody !== null) {
			var_dump($this->htmlBody);
			// Find all links in the parsed page; skip when parsing failed.
			if ($this->htmlContents) {
				foreach ($this->htmlContents->find('a') as $element) {
					echo $element->href . '<br>';
				}
			}
		}

		$this->htmlDiv = $this->sliceElement($this->pageContents, "<div", "</div>");
		if ($this->htmlDiv !== null) {
			var_dump($this->htmlDiv);
		}
	}

	/**
	 * Returns the substring of $html running from the first $openTag through the
	 * first $closeTag that follows it (case-insensitive), or null when either
	 * marker is missing.
	 */
	private function sliceElement($html, $openTag, $closeTag)
	{
		$start = stripos($html, $openTag);
		if ($start === false) {
			return null;
		}

		// Search from $start so the computed length can never be negative.
		$end = stripos($html, $closeTag, $start);
		if ($end === false) {
			return null;
		}

		return substr($html, $start, $end + strlen($closeTag) - $start);
	}
}

$mySample = new Sample();

?>

BLOG CALENDAR

September 2010
M T W T F S S
« Apr    
      1  2  3  4  5
 6  7  8  9 10 11 12
13 14 15 16 17 18 19
20 21 22 23 24 25 26
27 28 29 30