Newbie needs help: how to extract a URL from an HTML document

Forum Moderators: coopster

Message Too Old, No Replies

Newbie needs help: how to extract a URL from an HTML document

lematou

4:17 am on Apr 20, 2007 (gmt 0)

Hi,

I'm a newbie in PHP. I've been looking around, but I can't
find anything, or I simply don't understand what I find.

What I want to do must be really simple for some of you.

1. I want to extract a string from an HTML document into a string variable.

2. This string has a specific pattern, but it is not the same every time :

Lets say I want to extract from this source document :
http://www.example.com

This URL :
http://www.example.com/dist.php?s=

After the s= there are numeric characters whose exact length I don't know; it may be s=1 or s=1092093...

Finally, I am trying to find a way to get this number, which is different every time, into a string variable...

Would someone have a clue?

Thanks,

Sebastien

[edited by: eelixduppy at 10:29 am (utc) on April 20, 2007]
[edit reason] use example.com, please [/edit]

kristian nissen

7:28 am on Apr 20, 2007 (gmt 0)

Hi, I once wrote this small robot:

<code>
<?
/**
 * Bot - a minimal single-host web crawler helper.
 *
 * It fetches pages over a raw socket (Request), extracts the links of a page
 * that stay on the start host (GetLink / MapPath) and offers two small helpers
 * for filtering link lists (ApplyFilter) and searching page text (FindInBody).
 */
class Bot {
    /** @var array parse_url() parts of the URL passed to the last Request() call */
    private $current_url;
    /** @var array parse_url() parts of the URL given to the constructor */
    private $current_root;
    /** @var string HTTP method used by Request() */
    private $current_method;
    /** @var int TCP port used by Request() */
    private $current_port_number;

    /** @var mixed not used by the class itself; free for the caller */
    public $current_depth;
    /** @var int when non-zero, FindInBody() echoes each word it looks for */
    public $debug = 0;
    /** @var int|string 200 if the last Request() got a "200" status line, '' otherwise */
    public $http_status = '';

    /**
     * @param string $url start URL; its host defines which links count as internal
     */
    public function __construct($url){
        $root = parse_url($url);

        $this->current_url = array();
        // parse_url() returns false for seriously malformed URLs.
        $this->current_root = is_array($root) ? $root : array();
        $this->current_method = 'GET';
        $this->current_port_number = 80;
    }
    /**
     * Mutator/Accessor methods
     */
    public function SetMethod($m){
        $this->current_method = $m;
    }
    public function SetPortNumber($p){
        $this->current_port_number = $p;
    }
    /**
     * Request
     * Sends one HTTP request and returns the raw response (headers + body).
     *
     * @param string $url absolute URL, or a path that is resolved against the root host
     * @return string raw HTTP response, '' when no host is known or the connection fails
     */
    public function Request($url){
        $parts = parse_url($url);
        $this->current_url = is_array($parts) ? $parts : array();
        // Forget the status of the previous request.
        $this->http_status = '';

        if(isset($this->current_url['host'])){
            $host = $this->current_url['host'];
        } elseif(isset($this->current_root['host'])){
            $host = $this->current_root['host'];
        } else {
            return '';
        }

        $path = isset($this->current_url['path'])? $this->current_url['path'] : '';
        if($path === '' || $path[0] !== '/'){
            $path = "/". $path;
        }
        // Keep the query string, otherwise "dist.php?s=1" is requested as "dist.php".
        if(isset($this->current_url['query']) && $this->current_url['query'] !== ''){
            $path .= "?". $this->current_url['query'];
        }
        $method = $this->current_method;

        // fsockopen() reports failure by returning false (plus a warning); it does not throw.
        $FH = fsockopen($host, $this->current_port_number, $errno, $errstr, 30);
        if($FH === false){
            return '';
        }

        // NOTE(review): an HTTP/1.1 server may answer with a chunked body; the
        // response is returned as received, without de-chunking.
        $request = "$method $path HTTP/1.1\r\n";
        $request .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
        $request .= "Host: $host\r\n";
        $request .= "Connection: Close\r\n\r\n";

        fwrite($FH, $request);
        // Reads until EOF; unlike a feof()/fgets() loop it cannot spin on a stalled socket.
        $received = stream_get_contents($FH);
        fclose($FH);

        $response = ($received === false)? '' : $received;

        // Only the status line (start of the response) decides the status.
        if(preg_match("/^HTTP\/\d\.\d\s+200\b/i", $response)){
            $this->http_status = 200;
        }

        return $response;
    }
    /**
     * GetLink
     * Collects the links of a page that MapPath() accepts. If the page contains
     * a <frame> tag, the frame sources are collected instead of the anchors.
     *
     * @param string $html_source HTML source
     * @return array list of absolute URLs on the root host
     */
    public function GetLink($html_source){
        $links = array();
        if(!isset($this->current_root['host'])){
            return $links;
        }
        $host = $this->current_root['host'];

        $reg = "/<a[^>]+href=[\"']?([^\"'>]+)[\"'>]/i";
        if(preg_match("/<frame[^>]/i", $html_source) > 0){
            $reg = "/<frame[^>]+src=[\"']?([^\"'>]+)[\"'>]/i";
        }

        $m = array();
        if(!preg_match_all($reg, $html_source, $m)){
            return $links;
        }

        foreach($m[1] as $href){
            $mapped = $this->MapPath($host, $href);
            if($mapped !== false){
                $links[] = $mapped;
            }
        }

        return $links;
    }
    /**
     * MapPath
     * Turns a link found in a page into an absolute URL on $base.
     *
     * @param string $base host name that links must belong to
     * @param string $link raw href/src value
     * @return string|false absolute URL, or false for mailto:/javascript: links,
     *                      .pdf files, foreign hosts and anything unrecognised
     */
    public function MapPath($base, $link){
        // Encode whitespace inside the link; surrounding whitespace is dropped.
        $link = preg_replace("/\s+/", "%20", trim($link));

        // Match mail and script ("mailing-list.html" is a normal relative link)
        if(preg_match("/^(mailto|javascript):/i", $link))
            return false;
        // Match .pdf files; must run before the branches below or it is never reached
        if(preg_match("/\.pdf$/i", $link))
            return false;

        // Scheme of the last request, falling back to the start URL, then to http.
        if(isset($this->current_url['scheme'])){
            $scheme = $this->current_url['scheme'];
        } elseif(isset($this->current_root['scheme'])){
            $scheme = $this->current_root['scheme'];
        } else {
            $scheme = 'http';
        }

        // Protocol-relative link ("//host/path"): complete it with the scheme.
        if(strncmp($link, "//", 2) === 0)
            $link = $scheme. ":". $link;

        // Match for different host
        if(preg_match("/^https?:\/\//i", $link)){
            $tmp_url = parse_url($link);
            if(!is_array($tmp_url) || !isset($tmp_url['host']))
                return false;
            // Host names are case-insensitive.
            if(strcasecmp($tmp_url['host'], $base) !== 0)
                return false;
            return $link;
        }
        // Match file and folder
        // NOTE(review): relative links are resolved against the site root, not
        // against the directory of the current page - confirm this is intended.
        if(preg_match("/^[a-z0-9\/]/i", $link)){
            // Drop one leading / so the URL below does not contain "//"
            $link = preg_replace("/^\//", "", $link);
            return "$scheme://$base/$link";
        }
        //Return false as default
        return false;
    }
    /**
     * ApplyFilter
     * Keeps the entries of $arr that match the regular expression $reg.
     *
     * @param string $reg complete PCRE pattern, delimiters included
     * @param array $arr list of strings
     * @return array matching entries, re-indexed from 0
     */
    public function ApplyFilter($reg, $arr){
        $tmp_arr = array();
        foreach($arr as $a){
            if(preg_match($reg, $a) > 0)
                $tmp_arr[] = $a;
        }
        return $tmp_arr;
    }
    /**
     * FindInBody($html, $keyword)
     * Case-insensitive search for literal words in a page.
     *
     * @param string $html text to search
     * @param array|string $word one word or a list of words
     * @return int 1 if at least one word occurs in $html, 0 otherwise
     */
    public function FindInBody($html, $word){
        $found = 0;

        // (array) lets callers pass a single word as a plain string.
        foreach((array) $word as $w){
            if($this->debug){
                echo "WORD $w <br>\n";
            }
            // preg_quote() makes the word literal, so "/" or "(" cannot break the pattern.
            if(preg_match("/(". preg_quote($w, "/"). ")+/i", $html)){
                $found = 1;
                break;
            }
        }

        return $found;
    }
}
?>
</code>

And here is how it's used

<code>
<?
require_once "bot.php";

// Breadth-first crawl of a single site, starting at $start_page.
$start_page = 'http://www.example.com/Om%20Matas/Ledige%20stillinger/';

// Danish for "Reading the page ..."
echo "Læser siden $start_page\n";

// Not used by the crawl loop below.
$words = array("ledige stillinger");

$b = new Bot($start_page);
$link_to_crawl = array($start_page); // queue of URLs still to fetch
$link_crawled = array();             // URLs already fetched

// The start page is fetched by the first loop iteration, so no separate
// request is made up front.
while(count($link_to_crawl) > 0){
    $current_link = array_shift($link_to_crawl);

    echo "Reading from $current_link \n";

    $current_page = $b->Request($current_link);
    $link_on_current_page = $b->GetLink($current_page);

    $link_crawled[] = $current_link;

    foreach($link_on_current_page as $link){
        echo "Current link $link \n";
        // Queue each URL once: skip it if it is already waiting or already fetched.
        if(!in_array($link, $link_to_crawl, true) && !in_array($link, $link_crawled, true)){
            $link_to_crawl[] = $link;
        }
    }
}

?>
</code>

I'm sure it can be optimized and improved, but it works for the project it was created for.

[edited by: eelixduppy at 10:30 am (utc) on April 20, 2007]
[edit reason] example.com [/edit]

lematou

2:29 pm on Apr 20, 2007 (gmt 0)

Hi,

I really appreciate all this... but it seems really complicated for me... sorry, I'm a real beginner!

I know how to get the HTML file into a string, but once I have this string, I'm looking for the pattern that will match this URL:

http://www.example.com/dist.php?s=1
or
http://www.example.com/dist.php?s=129873
or
http://www.example.com/dist.php?s=2983
or
http://www.example.com/dist.php?s=220989803
or
http://www.example.com/dist.php?s=28
or
http://www.example.com/dist.php?s=49088
...

As output, I need the value of s=.

First, how do I get the full URL, with preg_match or otherwise:
http://www.example.com/dist.php?s=?

And then, how do I get only the number after the s=...

Thanks,

Sebastien