skymvc开发手册之蜘蛛库cls_solink.php
skymvc的蜘蛛库cls_solink.php用于抓取指定网页的内容,并提取页面标题和页面中的链接。使用示例如下:
// Load the solink class through the framework loader.
// NOTE(review): the meaning of the two `false` arguments is defined by skymvc's loadClass, not shown here.
$this->loadClass("solink",false,false);
// Create a spider for the page to crawl; "url" is the only option given, so "selfsite" uses the constructor default.
$solink= new solink(array("url"=>"http://www.skymvc.com"));
// Download the page and keep its body in $solink->content.
$solink->get_content();
// Print the text found inside the page's <title> element.
echo $solink->get_title();
// Dump the list of links extracted from the downloaded page.
print_r($solink->get_link());
附:cls_solink.php
<?php
/*
$solink= new solink(array("url"=>"http://www.skymvc.com"));
$solink->get_content();
$solink->get_title();
print_r($solink->get_link());
*/
/**
 * Minimal link spider: downloads one page, converts it to UTF-8 and extracts
 * its title and the links it contains.
 *
 * Typical use:
 *   $solink = new solink(array("url" => "http://www.skymvc.com"));
 *   $solink->get_content();
 *   echo $solink->get_title();
 *   print_r($solink->get_link());
 *
 * The "selfsite" option controls which absolute links get_link() keeps:
 *   1 = only links on exactly the same host as the crawled URL,
 *   0 = links on the main domain and any of its sub-domains,
 *   2 = every link.
 * Relative links always belong to the crawled host and are never filtered.
 */
class solink{
    /** Page body as stored by get_content() (false when the download failed). */
    public $content;
    /** URL of the page being crawled. */
    public $url;
    /** Directory part of the crawled URL's path, without leading/trailing slash ("" for the site root). */
    public $dir;
    /** Host name of the crawled URL. */
    public $host;
    /** Scheme of the crawled URL ("http" or "https"). */
    public $scheme;
    /** Link filter mode, see the class comment. */
    public $selfsite;
    /** Main domain of the crawled host, e.g. "skymvc.com" for "www.skymvc.com". */
    public $domain;
    /** Explicit port of the crawled URL, or "" when the URL has none. */
    public $port;
    /** Path of the crawled URL; needed to resolve query-only links such as "?page=2". */
    private $path="";

    /**
     * @param array $config "url" (page to crawl) and optional "selfsite" (default 0).
     */
    public function __construct($config=array()){
        $this->url=isset($config['url'])?$config['url']:"";
        $this->selfsite=isset($config['selfsite'])?$config['selfsite']:0;
        $this->parseurl();
    }

    /**
     * Re-targets the spider at another page.
     *
     * Note: unlike the constructor, "selfsite" defaults to 2 (keep every link)
     * here; this historical difference is kept for backward compatibility.
     *
     * @param array $config "url" (page to crawl) and optional "selfsite" (default 2).
     */
    public function set($config=array()){
        $this->url=isset($config['url'])?$config['url']:"";
        $this->selfsite=isset($config['selfsite'])?$config['selfsite']:2;
        $this->parseurl();
    }

    /**
     * Downloads $this->url and stores the UTF-8 converted body in $this->content.
     */
    public function get_content(){
        $this->content=$this->toutf8($this->curl_get_contents($this->url));
    }

    /**
     * Extracts the href of every <a ...> tag in $this->content and resolves
     * it with builtlink().
     *
     * @return array List of absolute URLs; empty when nothing usable was found.
     */
    public function get_link(){
        $urls=array();
        // Single-quoted pattern: the original double-quoted one contained raw
        // double quotes and could not even be parsed.
        if(!preg_match_all('/<a[^>]*href\s*=\s*["\']([^"\']*)["\']/iUs',(string)$this->content,$a)){
            return $urls;
        }
        foreach($a[1] as $v){
            $u=$this->builtlink($v);
            if($u){
                $urls[]=$u;
            }
        }
        return $urls;
    }

    /**
     * @return string|false Raw text between <title ...> and </title>, or false when there is none.
     */
    public function get_title(){
        if(preg_match('/<title[^>]*>(.*)<\/title>/iUs',(string)$this->content,$a)){
            return $a[1];
        }
        return false;
    }

    /**
     * Splits $this->url into scheme, host, port, main domain and directory.
     * A URL given without a scheme is treated as "http://...".
     */
    public function parseurl(){
        $url=trim((string)$this->url);
        if($url!=="" && strpos($url,"://")===false){
            $url="http://".ltrim($url,"/");
        }
        $arr=parse_url($url);
        if(!is_array($arr)){
            $arr=array();
        }
        $this->host=isset($arr['host'])?$arr['host']:"";
        $this->scheme=isset($arr['scheme'])?$arr['scheme']:"";
        $this->port=isset($arr['port'])?$arr['port']:"";
        $this->domain=$this->parsedomain($this->host);
        $this->path=isset($arr['path'])?$arr['path']:"";
        // Everything before the last "/" of the path is the directory, so both
        // "/a/b/c.html" and "/a/b/" give "a/b". (dirname() on the full URL kept
        // the scheme and host and produced broken relative links.)
        $cut=strrpos($this->path,"/");
        $this->dir=($cut===false)?"":trim(substr($this->path,0,$cut),"/");
    }

    /**
     * Returns the main domain of a URL or bare host name, e.g. "skymvc.com"
     * for "http://www.skymvc.com/x" and "example.com.cn" for "a.example.com.cn".
     * Hosts whose suffix is not in the known list (IP addresses, "localhost",
     * other TLDs) are returned unchanged.
     *
     * @param string $url URL with scheme, or a bare host name.
     * @return string
     */
    public function parsedomain($url){
        $url=trim((string)$url);
        if($url===""){
            return "";
        }
        $p=parse_url($url);
        $host=(is_array($p) && !empty($p['host']))?$p['host']:$url;
        // Anchored at the end of the host: unanchored, "www.cnblogs.com" matched as "www.cn".
        if(preg_match('/([\w-]+\.(com|cn|net|org|cc|info|me|co|tv|name|tel|so)(\.cn)?)$/i',$host,$c)){
            return $c[1];
        }
        return $host;
    }

    /**
     * Turns an href value into an absolute http(s) URL, applying the
     * "selfsite" filter to links that name a host.
     *
     * Returns "" for links that must be skipped: empty values, "#fragment"
     * links, non-http schemes (javascript:, mailto:, tel:, ...), links
     * rejected by the filter, and relative links when no host is known.
     * Dot segments ("./", "../") are left as they are.
     *
     * @param string $url Raw href value.
     * @return string Absolute URL or "".
     */
    public function builtlink($url){
        $url=trim((string)$url);
        if(empty($url)){
            return "";
        }
        $first=$url[0];
        if($first==="#"){
            return "";
        }
        $absolute=(bool)preg_match('/^https?:\/\//i',$url);
        $protorel=(strncmp($url,"//",2)===0);
        if($absolute || $protorel){
            $a=parse_url($url);
            $linkhost=(is_array($a) && !empty($a['host']))?$a['host']:"";
            if($linkhost!==""){
                if($this->selfsite==1){
                    if(strcasecmp($linkhost,(string)$this->host)!==0) return "";
                }elseif($this->selfsite==0){
                    // Compare host names, not the whole URL, so a foreign URL that
                    // merely mentions our domain in its path or query is rejected.
                    if(!$this->indomain($linkhost)) return "";
                }
            }
            if($protorel){
                return ($this->scheme!==""?$this->scheme:"http").":".$url;
            }
            return $url;
        }
        // Any other "scheme:" prefix is not crawlable over HTTP.
        if(preg_match('/^[a-z][a-z0-9+.\-]*:/i',$url)){
            return "";
        }
        if((string)$this->host===""){
            return "";
        }
        $root=($this->scheme!==""?$this->scheme:"http")."://".$this->host.($this->port?":".$this->port:"");
        if($first==="/"){
            return $root.$url;
        }
        if($first==="?"){
            // Query-only link: same document, different query string.
            return $root.($this->path!==""?$this->path:"/").$url;
        }
        $dir=trim((string)$this->dir,"/");
        return $root."/".($dir!==""?$dir."/":"").$url;
    }

    /**
     * Tells whether $host is the main domain itself or one of its sub-domains.
     */
    private function indomain($host){
        $host=strtolower((string)$host);
        $domain=strtolower((string)$this->domain);
        if($domain===""){
            return false;
        }
        if($host===$domain){
            return true;
        }
        $tail=".".$domain;
        return substr($host,-strlen($tail))===$tail;
    }

    /**
     * Fetches a URL with cURL (30 second timeout, body only).
     *
     * @param string $url
     * @return string|false Response body, or false on failure.
     */
    public function curl_get_contents($url){
        $ch=curl_init();
        if($ch===false){
            return false;
        }
        curl_setopt($ch,CURLOPT_URL,$url);
        curl_setopt($ch,CURLOPT_HEADER,0);
        curl_setopt($ch,CURLOPT_TIMEOUT,30);
        curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
        $content=curl_exec($ch);
        curl_close($ch);
        return $content;
    }

    /**
     * Converts GBK text to UTF-8; text that is already valid UTF-8 (which
     * includes plain ASCII) and non-string values are returned unchanged.
     *
     * UTF-8 is checked first because UTF-8 byte sequences can also happen to
     * be valid GBK, which would make a GBK-first check re-encode a UTF-8 page.
     *
     * @param mixed $str
     * @return mixed UTF-8 string, or the input unchanged when its charset is not recognised.
     */
    public function toutf8($str){
        if(!is_string($str) || $str===""){
            return $str;
        }
        if(preg_match('//u',$str)){
            return $str;
        }
        // Accept GBK only when the conversion round-trips byte for byte.
        $utf8=iconv("gbk","utf-8//IGNORE",$str);
        if($utf8!==false && $str===iconv("utf-8","gbk//IGNORE",$utf8)){
            return $utf8;
        }
        return $str;
    }
}
?>