Forum Moderators: coopster

Message Too Old, No Replies

get all links from whole site

without opendir/readdir

         

StarTrek

10:37 am on Mar 16, 2003 (gmt 0)



I want to write an application that extracts all links from a whole site, but it does not work:

<?php

//startpage: index.php

$startpage = "http://hal.cs.tuiasi.ro/~dmihaiu/site/index.php";
$storefile = "links/index.txt";

// Crawl frontier *and* seen-list in one: URLs are appended as they are
// discovered and visited in order (breadth-first over the site).
$cmparray = array();

// Seed the frontier with the links found on the start page.
$links1 = getlinks($startpage);
for ($j = 0; $j < count($links1); $j++) {
    array_push($cmparray, $links1[$j]);
}

// count($cmparray) is re-evaluated on every pass, so links pushed inside
// the loop are crawled too; the loop ends once no unseen link remains.
for ($i = 0; $i < count($cmparray); $i++) {
    $linksx = getlinks($cmparray[$i]);
    for ($j = 0; $j < count($linksx); $j++) {
        // BUG FIX: array_search() returns the matching *index*, and index 0
        // compares loosely equal to FALSE, so "== FALSE" treated a link
        // found at position 0 as "not found" and re-pushed it forever --
        // which is why the program never stopped. in_array() with
        // strict=true is the safe membership test here.
        if (!in_array($linksx[$j], $cmparray, true)) {
            array_push($cmparray, $linksx[$j]);
        }
    }
    echo "<br>" . count($cmparray) . "-" . $i;
}

echo "<pre>";
print_r($cmparray);

/**
 * Fetch $page and return the same-site links found in it, absolutised
 * against http://$_SERVER['HTTP_HOST']/~dmihaiu/.
 *
 * @param string $page URL or local path readable by file()
 * @return array zero-indexed list of unique absolute URLs whose host
 *               occurs in $_SERVER['HTTP_HOST']
 */
function getlinks($page) //get links from a page, store in array
{
    // file() returns FALSE on failure; the original @-suppressed the error
    // and carried on with garbage. Bail out with an empty result instead.
    $startfile = @file($page);
    if ($startfile === false) {
        return array();
    }
    $indexfile = join("", $startfile);

    // Flatten newlines, then put each closing anchor on its own line so a
    // line holds at most the tail of one <a ...>...</a> element.
    $indexfile = str_replace("\n", " ", $indexfile);
    $indexfile = str_replace("</a>", "</a>\n", $indexfile);
    // split()/eregi*() were removed in PHP 7; use explode()/preg_*().
    $lines = explode("\n", $indexfile);

    $links = array();

    foreach ($lines as $line) {
        // Strip everything before the last "<a " on the line (greedy .*).
        $line = preg_replace('/.*<a /i', '<a ', $line);

        // Extract the (optionally quoted) href value. Skip lines without
        // one -- the original reused a stale $req from the previous
        // iteration when eregi() failed to match.
        if (!preg_match('/href=["\']?([^"\'> ]*)/i', $line, $req)) {
            continue;
        }
        $url = trim($req[1]);

        // Ignore empty/root links, mail links and javascript: pseudo-links.
        // (The original tested "java script:" -- with a space -- twice, so
        // real javascript: links slipped through the filter.)
        if ($url == "" || $url == "/"
            || substr($url, 0, 7) == "mailto:"
            || substr($url, 0, 11) == "javascript:") {
            continue;
        }

        // Normalise to an absolute URL under this host.
        if (substr($url, 0, 4) == "www.") {
            $url = "http://" . $url;
        } elseif (substr($url, 0, 1) == "/") {
            // Site-absolute path: anchor it under the user directory.
            $url = "http://" . $_SERVER['HTTP_HOST'] . "/~dmihaiu" . $url;
        } elseif (substr($url, 0, 7) != "http://") {
            // Relative path.
            $url = "http://" . $_SERVER['HTTP_HOST'] . "/~dmihaiu/" . $url;
        }

        // Keep only links that stay on this site. The original pattern
        // "/^(http://)?([^/]+)/i" used unescaped "/" inside "/" delimiters
        // (a PCRE compile error, so the host check never worked), and then
        // fed the raw host into eregi() as a pattern. Use "~" delimiters
        // and a plain case-insensitive substring test instead.
        if (preg_match('~^(http://)?([^/]+)~i', $url, $hostname)
            && stripos($_SERVER['HTTP_HOST'], $hostname[2]) !== false) {
            $links[] = $url;
        }
    }

    // De-duplicate; array_values() re-indexes 0..n-1 because array_unique()
    // alone leaves key gaps that break the caller's indexed for-loops.
    return array_values(array_unique($links));
} //end function getlinks

?>

The getlinks() function works correctly, but I can't stop my program.
What is wrong?

Fischerlaender

12:10 pm on Mar 16, 2003 (gmt 0)

10+ Year Member



The getlinks() function works correctly, but I can't stop my program.

If I understand your code, then you start with a given page, fetch all links within this page and push them into $cmparray (with a duplicate copy in $links1). Then you have a loop with which you traverse your $cmparray; for every entry in $cmparray you get its links and push them into $cmparray unless the link is already in $cmparray.

This loop now runs (nearly) for eternity, because with every entry in $cmparray you get 5 to 10 new links (average number of links on a webpage) which are pushed on $cmparray again. So your $cmparray is growing while your crawler crawls the whole web.

// (Loop quoted from the original post.)
for($i=0;$i<count($cmparray);$i++) 
{
$linksx=getlinks($cmparray[$i]);
for($j=0;$j<count($linksx);$j++)
{
// NOTE(review): array_search() returns the matching index, and index 0 is
// loosely equal to FALSE -- so "== FALSE" also fires for a link found at
// position 0 and re-adds it endlessly. Use "=== FALSE" (or in_array with
// strict=true) for the duplicate check.
if(array_search($linksx[$j],$cmparray) == FALSE)
{
// Here your array grows in average with about 5-10 entries per loop
array_push($cmparray,$linksx[$j]);
}
}
echo "<br>".count($cmparray)."-".$i;
}