Forum Moderators: goodroi
Array ($Current_Line)
(
    [0] => http://www.example.com/gfx/
    [1] => http://www.example.com/cgi-bin/
    [2] => http://www.example.com/QuickSand/
    [3] => http://www.example.com/pda/
    [4] => http://www.example.com/zForumFFFFFF/
)
If the current URL was: [example.com...]
how can I preg_match this URL against the disallowed strings?
// The original failed for two reasons: the if-condition was not parenthesised
// (a parse error), and the stored URL contains "/" characters that collide
// with the "/" pattern delimiter ("Unknown modifier" error). preg_quote()
// escapes the delimiter and all regex metacharacters.
if (preg_match("/".preg_quote($current_line[3], "/")."/i", $current_url)) {
    echo "Forbidden";
}
I've tried the preg_match above, but it returns an error message.
<?php
// Demo: fetch robots.txt, collect the Disallow rules that follow the
// "User-agent: *" record, and print them prefixed with the site domain.
// Fixes vs. the original: $current_line/$count are initialised before use,
// and the User-agent/comment checks inspect the robots line just read
// (the original tested $current_line[$count], which was never set, so
// neither check ever fired).
$current_url="http://www.example.com/QuickSand/index.htm";
$robotsdomain="http://www.example.com";
$my_user_agent="User-agent: *"; //my useragent
$robots=file('http://www.example.com/robots.txt');
$current_line=array();   // collected "domain + Disallow path" strings
$count=0;
for ($i=0;$i<sizeof($robots);$i++){
    if (trim($robots[$i])==$my_user_agent){ // rules for agent: *
        for ($checkrules=1;$checkrules<10;$checkrules++){
            if (!isset($robots[$i+$checkrules])) break; // end of file
            $line=trim($robots[$i+$checkrules]);
            if ($line=="") continue;                    // skip blank lines
            // stop when the next User-agent record begins
            if (strpos($line,"User-agent")===0) break;
            // strip trailing "#" comments
            $pos=strpos($line,"#");
            if ($pos!==false) $line=trim(substr($line,0,$pos));
            $current_line[$count]=str_replace("Disallow: ","",$robotsdomain.$line);
            $count++;
        }
    }
}
print_r($current_line);
echo $current_url;
?>
usage: echo robots_allowed($url);
returns 1 if allowed to crawl
returns 0 if not allowed to crawl
(of course robots.txt should be saved once and then retrieved from the database, not fetched anew for every URL)
// Checks $url against the Disallow rules for "User-agent: *" in the
// site's robots.txt. Returns 1 when crawling is allowed, 0 when a
// Disallow rule matches. Relies on Read_Content() to fetch the file.
// (robots.txt should be cached rather than re-fetched for every URL.)
// Fixes vs. the original: all accumulators are initialised; the
// User-agent/comment guards inspect the robots line itself (the original
// tested the never-assigned $current_line[$count], so they were dead code);
// the pattern is preg_quote()d so regex metacharacters in robots.txt cannot
// break or widen the match; an empty "Disallow:" (= allow everything) no
// longer forbids the whole domain; the arbitrary 10/20-line caps are gone.
function robots_allowed($url){
    $xmp=explode("/", $url."/");
    $robotsdomain=trim("http://".$xmp[2]);      // scheme + host of $url
    // The matching scheme compares slash-stripped strings:
    // "domain + disallow path" (flattened) against the flattened URL.
    $stripped_robotsdomain=str_replace("/","",$robotsdomain);
    $stripped_current_url=str_replace("/","",$url);
    $my_user_agent="User-agent: *"; //my useragent
    $robots=explode("\n",Read_Content($robotsdomain.'/robots.txt'));
    $rules=array();                              // flattened disallow patterns
    $total=sizeof($robots);
    for ($i=0;$i<$total;$i++){
        if (trim($robots[$i])!=$my_user_agent) continue; // rules for agent: *
        for ($j=$i+1;$j<$total;$j++){
            $line=trim($robots[$j]);
            if ($line=="") continue;                     // skip blank lines
            if (strpos($line,"User-agent")===0) break;   // next record: stop
            $pos=strpos($line,"#");                      // strip "#" comments
            if ($pos!==false) $line=trim(substr($line,0,$pos));
            if (strpos($line,"Disallow:")!==0) continue; // only Disallow lines
            $path=trim(substr($line,strlen("Disallow:")));
            if ($path=="") continue;          // empty Disallow means allow all
            $rules[]=$stripped_robotsdomain.str_replace("/","",$path);
        }
    }
    foreach ($rules as $rule){
        // preg_quote escapes the "/" delimiter and metacharacters such as
        // "?" or "*" that commonly appear in robots.txt paths.
        if (preg_match("/".preg_quote($rule,"/")."/i",$stripped_current_url)){
            return 0; // a Disallow rule matches: crawling forbidden
        }
    }
    return 1; // no rule matched: crawling allowed
}
// Opens a URL (or local path) and returns its entire contents as a string.
// Returns "" when the resource cannot be opened. Fixes vs. the original:
// $contents was undefined (notice + null return) on fopen failure, and
// reads were capped at 10000 bytes, silently truncating larger files.
function Read_Content($url){// Open a url, return content
    $contents = "";
    $handle=@fopen($url,"r");
    if($handle){
        // Read in chunks until EOF instead of a single capped fread().
        while (!feof($handle)) {
            $chunk = fread($handle, 8192);
            if ($chunk === false) break; // read error: return what we have
            $contents .= $chunk;
        }
        fclose($handle);
    }
    return $contents;
}