Forum Moderators: coopster
I'm a newbie in PHP. I've been looking around but I can't
find anything, or I simply don't understand what I find.
What I want to do must be really simple for some of you.
1. I want to extract a string from an HTML document into a string variable.
2. This string has a specific pattern, but it is not the same every time :
Lets say I want to extract from this source document :
http://www.example.com
This URL :
http://www.example.com/dist.php?s=
After the s=, there are numeric characters whose exact length I don't know; it may be s=1 or s=1092093...
Finally, I'm trying to find a way to get this number, which is different every time, into a string variable...
Would someone have a clue?
Thanks,
Sebastien
[edited by: eelixduppy at 10:29 am (utc) on April 20, 2007]
[edit reason] use example.com, please [/edit]
<code>
<?
class Bot {
private $current_url;
private $current_root;
private $current_method;
private $current_port_number;
public $current_depth;
public $debug = 0;
public $http_status = '';
/**
 * Creates a bot rooted at the given URL.
 *
 * The URL is split with parse_url() and stored as the root. The request
 * method starts as 'GET', the port as 80, and the current URL as an
 * empty array.
 *
 * @param string $url root URL of the site
 */
public function __construct($url){
    // Connection defaults
    $this->current_port_number = 80;
    $this->current_method = 'GET';

    // URL bookkeeping: the root is parsed once, nothing has been requested yet
    $this->current_root = parse_url($url);
    $this->current_url = array();
}
/**
* Mutator/Accessor methods
*/
/**
 * Sets the HTTP method used by later requests.
 *
 * @param string $m method name; stored as given
 */
public function SetMethod($m)
{
    $this->current_method = $m;
}
/**
 * Sets the port number used by later requests.
 *
 * @param int $p port number; stored as given
 */
public function SetPortNumber($p)
{
    $this->current_port_number = $p;
}
/**
* Request
* @param: string url
* returns HTTP response
*/
public function Request($url){
    // Sends one HTTP/1.1 request for $url over a raw socket and returns the
    // complete raw response (status line, headers and body) as a string.
    //
    // - The host comes from $url, falling back to the root URL's host.
    // - The path AND query string of $url are requested ("/dist.php?s=12").
    // - A port given in $url wins over the one set with SetPortNumber().
    // - Returns '' when no host is known or the connection cannot be opened.
    // - $this->http_status is 200 when the status line reports 200 and ''
    //   otherwise; it is reset on every call.
    //
    // NOTE: the socket is opened as plain TCP, so nothing is encrypted even
    // when $url uses the https scheme.
    $this->http_status = '';
    $response = '';

    $this->current_url = parse_url($url);
    if(!is_array($this->current_url)){
        // parse_url() returns false on seriously malformed URLs; keep the
        // property an array so later lookups stay well-defined.
        $this->current_url = array();
    }

    if(isset($this->current_url['host'])){
        $host = $this->current_url['host'];
    } elseif(isset($this->current_root['host'])){
        $host = $this->current_root['host'];
    } else {
        // Nothing to connect to
        return $response;
    }

    // Request target: always starts with "/", and keeps the query string
    $path = isset($this->current_url['path']) ? $this->current_url['path'] : '';
    if($path === '' || $path[0] !== '/'){
        $path = '/' . $path;
    }
    if(isset($this->current_url['query'])){
        $path .= '?' . $this->current_url['query'];
    }

    $method = $this->current_method;
    $port_number = isset($this->current_url['port'])
        ? $this->current_url['port']
        : $this->current_port_number;

    // fsockopen() reports failure by returning false (plus a warning), not
    // by throwing, so the result is checked explicitly.
    $FH = fsockopen($host, $port_number, $errno, $errstr, 30);
    if($FH === false){
        return $response;
    }

    // The Host header only carries the port when it is not the HTTP default
    $host_header = ($port_number == 80) ? $host : "$host:$port_number";

    $request = "$method $path HTTP/1.1\r\n";
    $request .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
    $request .= "Host: $host_header\r\n";
    $request .= "Connection: Close\r\n\r\n";

    if(fwrite($FH, $request) !== false){
        while(!feof($FH)){
            $chunk = fgets($FH, 1024);
            if($chunk === false){
                // Read error or timeout: stop instead of spinning on feof()
                break;
            }
            $response .= $chunk;
        }
    }
    fclose($FH);

    // Only the status line at the very start of the response counts; a
    // "HTTP/1.1 200" appearing inside the body must not be mistaken for it.
    if(preg_match("/^HTTP\/1\.[01] 200/i", $response)){
        $this->http_status = 200;
    }
    return $response;
}
/**
* GetLink
* @param: string html source
*/
public function GetLink($html_source){
$reg = "/<a[^>]+href=[\"¦']?([^\"¦'>]+)[\"¦'>]/i";
if(preg_match("/<frame[^>]/", $html_source) > 0)
$reg = "/<frame[^>]+src=[\"¦']?([^\"¦'>]+)[\"¦'>]/i";
$m = array();
preg_match_all($reg, $html_source, $m);
$l = array();
$scheme = isset($this->current_root['scheme'])? $this->current_root['scheme'] : "http://";
$host = $this->current_root['host'];
$path = isset($this->current_root['path'])? $this->current_root['path'] : '';
if(count($m) > 0){ return $l; //$link = urldecode($link); $found = 0; for($i = 0; $i < count($word_list); $i++){ return $found; And here is how it's used <code> $start_page = 'http://www.example.com/Om%20Matas/Ledige%20stillinger/'; echo "Læser siden $start_page\n"; $words = array("ledige stillinger"); $b = new Bot($start_page); $page = $b->Request($start_page); do{ echo "Reading from $current_link \n"; $current_page = $b->Request($current_link); array_push($link_crawled, $current_link); foreach($link_on_current_page as $link){ ?> Im sure it can be optimized and improved, but it works for the project is was created for. [1][edited by: eelixduppy at 10:30 am (utc) on April 20, 2007]
foreach($m[1] as $href){
$is_base = $this->MapPath($host, $href);
if($is_base!= false){
array_push($l, $is_base);
}
}
}
}
/**
* MapPath
* @param: string url
*/
public function MapPath($base, $link){
    // Turns a raw href/src value into an absolute URL on host $base.
    //
    // Returns the absolute URL as a string, or false when the link should
    // not be followed: PDF documents, non-HTTP schemes (mailto:,
    // javascript:, ...), links to another host, and anything that does not
    // start with a letter, a digit or "/" (e.g. "#top", "?x=1", "../up").
    //
    // NOTE: relative links are resolved against the site root, not against
    // the directory of the page they were found on.

    // Surrounding whitespace is dropped; whitespace inside the link is
    // encoded so the result can be used in a request line.
    $link = preg_replace("/\s+/", "%20", trim($link));

    // Skip .pdf files. This has to run before the branches below, which
    // return for every link that starts with a letter, digit or "/".
    if(preg_match("/\.pdf$/i", $link))
        return false;

    // Scheme of the page being crawled, falling back to the root URL's
    // scheme and finally to plain http when neither is known.
    if(isset($this->current_url['scheme'])){
        $scheme = $this->current_url['scheme'];
    } elseif(isset($this->current_root['scheme'])){
        $scheme = $this->current_root['scheme'];
    } else {
        $scheme = 'http';
    }

    // Protocol-relative link ("//host/path"): complete it with the scheme
    if(strncmp($link, '//', 2) === 0){
        $link = $scheme . ':' . $link;
    }

    // Absolute http(s) link: keep it only when it stays on the same host
    if(preg_match("/^https?:\/\//i", $link)){
        $tmp_url = parse_url($link);
        if(!is_array($tmp_url) || !isset($tmp_url['host']))
            return false;
        if(strcasecmp($tmp_url['host'], $base) !== 0)
            return false;
        return $link;
    }

    // Any other scheme (mailto:, javascript:, tel:, ftp:, ...) is not crawlable
    if(preg_match("/^[a-z][a-z0-9+.\-]*:/i", $link))
        return false;

    // File or folder on this site: starts with a letter, a digit or "/"
    if(preg_match("/^([a-z0-9]|\/)/i", $link)){
        // A leading "/" is dropped because one is added after the host
        return $scheme . "://$base/" . ltrim($link, '/');
    }

    // Return false as default
    return false;
}
/**
* ApplyFilter
* @param: string regexp, array
*/
public function ApplyFilter($reg, $arr){
    // Returns a new, sequentially indexed array holding every element of
    // $arr that matches the regular expression $reg, in original order.
    $kept = array();
    foreach($arr as $candidate){
        if(preg_match($reg, $candidate) !== 1){
            continue;
        }
        $kept[] = $candidate;
    }
    return $kept;
}
/**
* FindInBody($html, $keyword)
*/
public function FindInBody($html, $word){
//$word_list = explode(",", $word);
$word_list = $word;
$word = $word_list[$i];
echo "WORD $word <br>\n";
if(preg_match("/($word)+/i", $html)){
$found = 1;
}
}
}
}
?>
</code>
<?
require_once "bot.php";
$link_to_crawl = array($start_page);
$link_crawled = array();
$current_link = array_shift($link_to_crawl);
$link_on_current_page = $b->GetLink($current_page);
echo "Current link $link \n";
if(!in_array($link, $link_to_crawl) &&!in_array($link, $link_crawled)){
array_push($link_to_crawl, $link);
}
}
}while(count($link_to_crawl) > 0);
</code>
[edit reason] example.com [/edit]
I really appreciate all this... but it seems really complicated for me... sorry, I'm a real beginner!
I know how to get the HTML file into a string, but once I have this string, I'm looking for the pattern that will match this URL:
http://www.example.com/dist.php?s=1
or
http://www.example.com/dist.php?s=129873
or
http://www.example.com/dist.php?s=2983
or
http://www.example.com/dist.php?s=220989803
or
http://www.example.com/dist.php?s=28
or
http://www.example.com/dist.php?s=49088
...
What I need as output is the value after s=.
First, how do I use preg_match (or something else) to get the full
http://www.example.com/dist.php?s=?
And after that, how do I get only the number after the s=...
Thanks,
Sebastien