做采集的时候,可以使用file_get_contents()去获取网页源代码,但是使用file_get_contents采集速度慢,而且超时时间不好控制;如果采集的页面不存在,需要等待的时间很长。一般来说,curl的速度最快,其次是socket,最后是file_get_contents。现在跟大家分享一个很强大的采集类,它会根据你的服务器当前的配置,自动选择最快的方式,已经封装了curl、socket和file_get_contents。用法很简单:1、采用get方法请求:Http::doGet(网址); //超时时间可忽略,默认是5秒;也可以写成 Http::doGet(网址, 超时时间); 例如 echo Http::doGet('http://www.baidu.com'); 2、采用post方法请求:Http::doPost(网址, 数据, 超时时间); 例如 $url='http://www.canphp.com/test.php'; $data['name']='单骑'; $data['email']='admin@canphp.com'; Http::doPost($url, $data, 10); test.php页面通过 $_POST['name']; $_POST['email']; 接收数据。这个http类不仅可以用来采集,还有一个很强大的作用:模拟php异步多进程。比如有index.php和a.php、b.php、c.php,在index.php中调用 Http::doGet('http://www.canphp.com/a.php', 1); Http::doGet('http://www.canphp.com/b.php', 1); Http::doGet('http://www.canphp.com/c.php', 1); 并在a.php、b.php、c.php程序的头部分别加上 ignore_user_abort(true); 那么就可以实现多进程了。原理:通过curl或socket发送请求给a.php、b.php、c.php,由于超时时间比较短,只是触发了这三个页面,不需要等待数据返回,连接就已中断;但是a.php、b.php、c.php程序中加上了ignore_user_abort(true),会忽略客户端断开连接,继续执行下去。具体案例可以观看很邪恶很强大的av程序。
|
<?php
/**
 * Small HTTP client for fetching remote pages (doGet / doPost).
 *
 * The transport is picked automatically from what the running PHP build
 * offers, in this order of preference: cURL, raw sockets (fsockopen), then
 * the URL stream wrappers (file_get_contents / fopen).
 *
 * All methods report failure by returning false; nothing throws.
 *
 * Note: the cURL transport returns the response body whatever the HTTP status
 * is, while the socket transport returns false for any non-2xx status.
 */
class Http
{
    /**
     * Fetch a URL with an HTTP GET request.
     *
     * @param string    $url     Absolute URL to fetch.
     * @param int|float $timeout Timeout in seconds (default 5).
     *
     * @return string|false Response body, or false on failure / no transport.
     */
    public static function doGet($url, $timeout = 5)
    {
        switch (self::getSupport()) {
            case 1:
                return self::curl($url, '', $timeout);
            case 2:
                return self::socketGet($url, $timeout);
            case 3:
                return self::streamGet($url, $timeout, false);
            case 4:
                return self::streamGet($url, $timeout, true);
            default:
                return false;
        }
    }

    /**
     * Send data to a URL with an HTTP POST request.
     *
     * Only the cURL and socket transports support POST.
     *
     * @param string       $url     Absolute URL to post to.
     * @param array|string $data    Form fields (array) or a pre-encoded body (string).
     * @param int|float    $timeout Timeout in seconds (default 5).
     *
     * @return string|false Response body, or false on failure / no transport.
     */
    public static function doPost($url, $data, $timeout = 5)
    {
        switch (self::getSupport()) {
            case 1:
                // Force POST so that an empty payload is not silently sent as a GET.
                return self::curl($url, $data, $timeout, true);
            case 2:
                return self::socketPost($url, $data, $timeout);
            default:
                return false;
        }
    }

    /**
     * Detect which mechanism for reading remote resources is available.
     *
     * @return int 1 = cURL, 2 = fsockopen, 3 = file_get_contents,
     *             4 = fopen, 0 = nothing usable.
     */
    public static function getSupport()
    {
        if (function_exists('curl_init')) {
            return 1;
        }
        if (function_exists('fsockopen')) {
            return 2;
        }

        // Both stream-wrapper fallbacks need allow_url_fopen to open http:// URLs.
        $urlWrappersEnabled = (bool) ini_get('allow_url_fopen');
        if ($urlWrappersEnabled && function_exists('file_get_contents')) {
            return 3;
        }
        if ($urlWrappersEnabled && function_exists('fopen')) {
            return 4;
        }

        return 0;
    }

    /**
     * Read a complete HTTP response from an open stream and return its body.
     *
     * The stream is always closed before returning.
     *
     * @param resource|null $fsock Open stream positioned at the start of the response.
     *
     * @return string|false Body for a 2xx response; false for any other status,
     *                      a malformed response, or an invalid stream.
     */
    public static function GetHttpContent($fsock = null)
    {
        if (!is_resource($fsock)) {
            return false;
        }

        $raw = '';
        // Compare against false explicitly: a line consisting of "0" is falsy
        // and would otherwise end the loop early.
        while (($line = @fgets($fsock, 2048)) !== false) {
            $raw .= $line;
        }
        fclose($fsock);

        return self::parseResponse($raw);
    }

    /**
     * Perform a GET request over a raw socket.
     *
     * @param string    $url     Absolute URL to fetch.
     * @param int|float $timeout Connect and read timeout in seconds.
     *
     * @return string|false Body of a 2xx response, false otherwise.
     */
    public static function socketGet($url, $timeout = 5)
    {
        return self::socketRequest('GET', $url, $timeout, null);
    }

    /**
     * Perform a form-encoded POST request over a raw socket.
     *
     * @param string       $url       Absolute URL to post to.
     * @param array|string $post_data Form fields (array) or a pre-encoded body (string).
     * @param int|float    $timeout   Connect and read timeout in seconds.
     *
     * @return string|false Body of a 2xx response, false otherwise.
     */
    public static function socketPost($url, $post_data = array(), $timeout = 5)
    {
        $payload = is_array($post_data) ? http_build_query($post_data) : (string) $post_data;

        return self::socketRequest('POST', $url, $timeout, $payload);
    }

    /**
     * Perform a request with cURL.
     *
     * A POST is sent when $data is a non-empty array or when $forcePost is
     * true; otherwise a GET is sent. The body is returned regardless of the
     * HTTP status code.
     *
     * @param string       $url       Absolute URL.
     * @param array|string $data      Form fields (array) or a pre-encoded body (string).
     * @param int|float    $timeout   Connect and total timeout in seconds.
     * @param bool         $forcePost Send a POST even when $data is empty.
     *
     * @return string|false Response body, or false on failure.
     */
    public static function curl($url, $data = array(), $timeout = 5, $forcePost = false)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        if ($forcePost || (is_array($data) && $data)) {
            $fields = is_array($data) ? http_build_query($data) : (string) $data;
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $fields);
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        $result = curl_exec($ch);
        curl_close($ch);

        return $result;
    }

    /**
     * Fetch a URL through PHP's URL stream wrappers, honouring the timeout.
     *
     * @param string    $url      Absolute URL to fetch.
     * @param int|float $timeout  Read timeout in seconds.
     * @param bool      $useFopen Use fopen() instead of file_get_contents().
     *
     * @return string|false Response body, or false on failure.
     */
    private static function streamGet($url, $timeout, $useFopen)
    {
        $context = stream_context_create(array(
            'http' => array(
                'method'  => 'GET',
                'timeout' => (float) $timeout,
            ),
        ));

        if (!$useFopen) {
            return @file_get_contents($url, false, $context);
        }

        $handle = @fopen($url, 'rb', false, $context);
        if (!$handle) {
            return false;
        }
        $content = stream_get_contents($handle);
        fclose($handle);

        return $content;
    }

    /**
     * Open a socket, send one HTTP/1.1 request and return the response body.
     *
     * @param string      $method  HTTP method, e.g. "GET" or "POST".
     * @param string      $url     Absolute URL.
     * @param int|float   $timeout Connect and read timeout in seconds.
     * @param string|null $payload Form-encoded request body, or null for none.
     *
     * @return string|false Body of a 2xx response, false otherwise.
     */
    private static function socketRequest($method, $url, $timeout, $payload)
    {
        $parts = parse_url($url);
        if (!is_array($parts) || !isset($parts['host']) || $parts['host'] === '') {
            return false;
        }

        $host        = $parts['host'];
        $isHttps     = isset($parts['scheme']) && strtolower($parts['scheme']) === 'https';
        $defaultPort = $isHttps ? 443 : 80;
        $port        = isset($parts['port']) ? (int) $parts['port'] : $defaultPort;
        $path        = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $query       = (isset($parts['query']) && $parts['query'] !== '') ? '?' . $parts['query'] : '';

        // https must go through the ssl:// transport; talking plain HTTP to
        // port 80 for an https URL would send the request unencrypted.
        $target = ($isHttps ? 'ssl://' : '') . $host;

        $errno  = 0;
        $errstr = '';
        // fsockopen() returns false on failure (never a negative number).
        $fsock = @fsockopen($target, $port, $errno, $errstr, (float) $timeout);
        if (!$fsock) {
            return false;
        }

        // The fsockopen() timeout only covers connecting; bound reads as well
        // so that a short timeout really does return quickly.
        $seconds      = (int) floor((float) $timeout);
        $microseconds = (int) round(((float) $timeout - $seconds) * 1000000);
        stream_set_timeout($fsock, $seconds, $microseconds);

        $request  = $method . ' ' . $path . $query . " HTTP/1.1\r\n";
        $request .= "Accept: */*\r\n";
        $request .= 'Host: ' . $host . ($port !== $defaultPort ? ':' . $port : '') . "\r\n";
        if ($payload !== null) {
            $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $request .= 'Content-Length: ' . strlen($payload) . "\r\n";
        }
        $request .= "Connection: Close\r\n\r\n";
        if ($payload !== null) {
            // Exactly Content-Length bytes: no trailing CRLFs after the body.
            $request .= $payload;
        }

        // fwrite() on a network stream may write only part of the buffer.
        $pending = $request;
        while ($pending !== '') {
            $written = @fwrite($fsock, $pending);
            if ($written === false || $written === 0) {
                @fclose($fsock);
                return false;
            }
            $pending = (string) substr($pending, $written);
        }

        return self::GetHttpContent($fsock);
    }

    /**
     * Split a raw HTTP response into head and body and validate the status.
     *
     * @param string $raw Full response text (status line, headers, body).
     *
     * @return string|false Decoded body for a 2xx status, false otherwise.
     */
    private static function parseResponse($raw)
    {
        $headEnd = strpos($raw, "\r\n\r\n");
        if ($headEnd === false) {
            return false;
        }

        $head = substr($raw, 0, $headEnd);
        // Cast: on older PHP versions substr() yields false for an empty body.
        $body = (string) substr($raw, $headEnd + 4);

        // A response may consist of a status line with no further headers.
        $lineEnd    = strpos($head, "\r\n");
        $statusLine = ($lineEnd === false) ? $head : substr($head, 0, $lineEnd);

        if (!preg_match('/^HTTP\/\d\.\d\s+(\d{3})(?:\s.*)?$/', $statusLine, $matches)) {
            return false;
        }

        $statusCode = (int) $matches[1];
        if ($statusCode < 200 || $statusCode >= 300) {
            return false;
        }

        // HTTP/1.1 servers may answer with chunked transfer encoding.
        if (preg_match('/^Transfer-Encoding:[^\r\n]*\bchunked\b/mi', $head)) {
            return self::decodeChunked($body);
        }

        return $body;
    }

    /**
     * Decode a body sent with "Transfer-Encoding: chunked".
     *
     * @param string $body Raw chunked body.
     *
     * @return string|false Reassembled body, or false if the chunking is
     *                      malformed or truncated.
     */
    private static function decodeChunked($body)
    {
        $decoded = '';
        $offset  = 0;
        $length  = strlen($body);

        while ($offset < $length) {
            $lineEnd = strpos($body, "\r\n", $offset);
            if ($lineEnd === false) {
                return false;
            }

            // Size line is "<hex-size>[;extensions]".
            $sizeLine  = substr($body, $offset, $lineEnd - $offset);
            $semicolon = strpos($sizeLine, ';');
            if ($semicolon !== false) {
                $sizeLine = substr($sizeLine, 0, $semicolon);
            }
            $sizeHex = trim($sizeLine);
            if (!preg_match('/^[0-9a-fA-F]+$/', $sizeHex)) {
                return false;
            }

            $size = (int) hexdec($sizeHex);
            if ($size === 0) {
                // Last chunk; anything after it is trailers, which are ignored.
                break;
            }

            $offset = $lineEnd + 2;
            if ($offset + $size > $length) {
                return false;
            }

            $decoded .= substr($body, $offset, $size);
            // Skip the chunk data and the CRLF that terminates it.
            $offset += $size + 2;
        }

        return $decoded;
    }
}
?>