Forum Moderators: coopster
<?php
//startpage: index.php
$startpage="http://hal.cs.tuiasi.ro/~dmihaiu/site/index.php";
$storefile="links/index.txt";
$cmparray=array(); //creare array
$links1=getlinks($startpage); //create array with links
for($j=0;$j<count($links1);$j++)
{
array_push($cmparray,$links1[$j]);
}
for($i=0;$i<count($cmparray);$i++)
{
$linksx=getlinks($cmparray[$i]);
for($j=0;$j<count($linksx);$j++)
{
if(array_search($linksx[$j],$cmparray) == FALSE)
{
array_push($cmparray,$linksx[$j]);
}
}
echo "<br>".count($cmparray)."-".$i;
}
echo "<pre>";
print_r($cmparray);
function getlinks($page) //get links from a page, store in array
{
@$startfile=file($page);
@$indexfile=join("",$startfile);
$indexfile=str_replace("\n"," ",$indexfile);
$indexfile=str_replace("</a>","</a>\n",$indexfile);
$indexfile=split("\n",$indexfile);
$dim=count($indexfile);
$links=array();
for($i=0;$i<$dim;$i++)
{
$indexfile[$i]=eregi_replace(".*<a ","<a ",$indexfile[$i]);
eregi("href=[\"']{0,1}([^\"'> ]*)",$indexfile[$i],$req);
$url=trim($req[1]);
if($url!= "" && $url!= "/" && $url!= ''
&& substr($url,0,7)!= "mailto:" && substr($url, 0,11)!= "java script:"
&& substr($url, 0,11)!= "java script:")
{
if(substr($url,0,4) == "www.")
{
$url="http://".$url;
}
if(substr($url,0,1) == "/")
{
$url="http://".$_SERVER['HTTP_HOST']."/~dmihaiu"."$url";
}
if(substr($url,0,1)!= "/" && substr($url,0,7)!= "http://")
{
$url="http://".$_SERVER['HTTP_HOST']."/~dmihaiu/"."$url";
}
preg_match("/^(http://)?([^/]+)/i",$url, $hostname);
$host = $hostname[2];
if(eregi($host,$_SERVER["HTTP_HOST"]))
{
array_push($links,trim($url));
}
}
}
$links1=array_unique($links);
unset($indexfile);
unset($links);
return $links1;
} //end function getlinks
?>
The getlinks() function work corectly, but I can't stop my progam.
What is wrong?
The getlinks() function work corectly, but I can't stop my progam.
If I understand your code, then you start with a given page, fetch all links within this page and push them in $cmparray (with a duplicate in $link1). Then you have a loop with which you traverse through your $cmparray; for every entry in $cmparray you get the links and push them in $cmparray unless this link is already in $cmparray.
This loop now runs (nearly) for eternity, because with every entry in $cmparray you get 5 to 10 new links (average number of links on a webpage) which are pushed on $cmparray again. So your $cmparray is growing while your crawler crawls the whole web.
for($i=0;$i<count($cmparray);$i++)
{
$linksx=getlinks($cmparray[$i]);
for($j=0;$j<count($linksx);$j++)
{
if(array_search($linksx[$j],$cmparray) == FALSE)
{
// Here your array grows in average with about 5-10 entries per loop
array_push($cmparray,$linksx[$j]);
}
}
echo "<br>".count($cmparray)."-".$i;
}