Forum Moderators: coopster
<?php
/**
 * Minimal cURL-based page fetcher.
 *
 * Usage:
 *   $spider = new Spider();
 *   $spider->fetchPage('http://example.com/');
 *   $html = $spider->getContent();
 */
class Spider {
    private $ch;      // cURL handle, only valid inside fetchPage()
    private $content; // body of the last successful fetch, '' otherwise
    private $binary;  // pass-through value for CURLOPT_BINARYTRANSFER
    private $url;     // last URL requested

    public function __construct() {
        $this->content = '';
        $this->binary  = false;
        $this->url     = '';
    }

    /**
     * Fetch a page and store its body in $this->content.
     *
     * Fixes vs. the original:
     *  - the guard was isset($this->url), which is always true immediately
     *    after the assignment above it; an empty-string check is what was
     *    intended;
     *  - curl_exec() returns false on failure, which would have leaked a
     *    boolean into $content — failures are normalised back to '' so
     *    getContent() always returns a string.
     *
     * @param string $url absolute URL to fetch
     * @return void
     */
    public function fetchPage($url) {
        $this->url = $url;
        if ($this->url !== '') {
            $this->ch = curl_init();
            curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($this->ch, CURLOPT_URL, $this->url);
            curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($this->ch, CURLOPT_BINARYTRANSFER, $this->binary);
            $result = curl_exec($this->ch);
            $this->content = ($result === false) ? '' : $result;
            curl_close($this->ch);
        }
    }

    /** @return string the fetched body, or '' if nothing was fetched */
    public function getContent() {
        return $this->content;
    }
}
?>
Then you could do something like:
$spider = new Spider();
$spider->fetchPage('http://en.wikipedia.org/');
$content = $spider->getContent();

// Match the src attribute wherever it appears inside the <img> tag — the
// original pattern /<img src="..."/ only matched when src was the very
// first attribute, which fails on most real pages (including Wikipedia).
$regx = '/<img\b[^>]*\bsrc\s*=\s*["\']([^"\']+)["\'][^>]*>/i';
$img_src = array();
preg_match_all($regx, $content, $imgs, PREG_PATTERN_ORDER);
// $imgs[1] holds the captured URLs; $imgs[0] would be the full tags.
$img_src = $imgs[1];
This will extract the URLs of all the images on the page using a regex. However, it assumes that src is the first attribute of each img tag — which is not true on Wikipedia — so the code above won't actually work there. But that's the general idea.
For example, this works:
<? GetImagesbyURL('http://rawstory.com/2009/12/gop-strategist-matalin-bush-inherited-911-attacks-clinton/',2); ?>
but this doesn't:
<? GetImagesbyURL('http://sports.yahoo.com/nba/recap?gid=2009122617&prov=ap',2); ?>
How can I add some error handling to capture these failures instead of displaying raw errors?
Here's what I have so far:
<?php
/**
 * Fetch $siteURL and echo an <img> preview (with dimensions) for every
 * image found in the page.
 *
 * Fixes vs. the original:
 *  - the regex line was truncated/garbled (unterminated string literal, a
 *    parse error) — replaced with a working pattern that captures the src
 *    attribute regardless of attribute order;
 *  - getimagesize() emits warnings on dead/unreadable URLs; failures are
 *    now detected and the image skipped instead of printing broken markup
 *    (this is the error-handling the post asks about);
 *  - exit() on fetch failure killed the whole page — replaced with an
 *    early return so the rest of the calling page still renders;
 *  - short open tag <? replaced with the portable <?php.
 *
 * @param string $siteURL  page to scan for images
 * @param mixed  $fileType kept for signature compatibility (unused)
 * @return void
 */
function GetImagesbyURL($siteURL, $fileType) {
    $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_URL, $siteURL);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    $data = curl_exec($ch);
    // curl_exec() returns false on failure (=== avoids treating an empty
    // but successful body as an error).
    if ($data === false) {
        echo "Please try again later.<br />";
        echo "error number:" . curl_errno($ch);
        echo 'error:' . curl_error($ch);
        curl_close($ch);
        return;
    }
    curl_close($ch);

    // Capture the src of every <img> tag, regardless of attribute order.
    $regex = '/<img\b[^>]*\bsrc\s*=\s*["\']([^"\']+)["\'][^>]*>/i';
    preg_match_all($regex, $data, $match);

    foreach ($match[1] as $source) {
        $imgURL = InternetCombineURL($siteURL, $source);
        // getimagesize() warns and returns false on unreadable URLs;
        // skip those instead of echoing broken output.
        $size = @getimagesize($imgURL);
        if ($size === false) {
            continue;
        }
        list($width, $height) = $size;
        echo "<a href='addtag.php'><img src=" . $imgURL . " width='" . $width . "' height='" . $height . "'></a><br>";
        echo $imgURL . "<br>";
        echo "Size: " . $width . "x" . $height . "<br>";
        echo "<br>";
    }
}
/**
 * Resolve a possibly-relative URL against an absolute base URL.
 *
 * Behaviour:
 *  - if $relative already has a scheme (http://...), it is returned as-is;
 *  - if $relative starts with '/', it is resolved from the host root;
 *  - otherwise it is appended to the base URL's directory, with '.' and
 *    '..' path segments collapsed.
 *
 * Fixes vs. the original:
 *  - $relative{0} (curly-brace string offset) is a fatal error in PHP 8 —
 *    replaced with $relative[0];
 *  - extract(parse_url($absolute)) left $path/$user/$pass/$host undefined
 *    whenever the base URL lacked those components (notices, and
 *    dirname(null) warnings) — components are now read with explicit
 *    defaults;
 *  - if($p["scheme"]) raised an undefined-index notice for relative
 *    inputs — replaced with isset().
 *
 * @param string $absolute base URL (e.g. the page the image came from)
 * @param string $relative URL found in the page, possibly relative
 * @return string absolute URL
 */
function InternetCombineUrl($absolute, $relative) {
    $p = parse_url($relative);
    // Already absolute (has its own scheme)? Nothing to combine.
    if (isset($p['scheme'])) {
        return $relative;
    }

    $base   = parse_url($absolute);
    $scheme = isset($base['scheme']) ? $base['scheme'] : '';
    $host   = isset($base['host'])   ? $base['host']   : '';
    $user   = isset($base['user'])   ? $base['user']   : '';
    $pass   = isset($base['pass'])   ? $base['pass']   : '';
    $path   = dirname(isset($base['path']) ? $base['path'] : '');

    if (isset($relative[0]) && $relative[0] == '/') {
        // Root-relative: discard the base path entirely.
        $cparts = array_filter(explode('/', $relative));
    } else {
        // Document-relative: append to the base directory, then collapse
        // '.' and '..' segments.
        $aparts = array_filter(explode('/', $path));
        $rparts = array_filter(explode('/', $relative));
        $cparts = array_merge($aparts, $rparts);
        foreach ($cparts as $i => $part) {
            if ($part == '.') {
                $cparts[$i] = null;
            }
            if ($part == '..') {
                // Drop the '..' and the segment before it.
                $cparts[$i - 1] = null;
                $cparts[$i] = null;
            }
        }
        $cparts = array_filter($cparts);
    }
    $path = implode('/', $cparts);

    // Rebuild: scheme://user[:pass]@host/path
    $url = '';
    if ($scheme) {
        $url = "$scheme://";
    }
    if ($user) {
        $url .= $user;
        if ($pass) {
            $url .= ":$pass";
        }
        $url .= '@';
    }
    if ($host) {
        $url .= "$host/";
    }
    $url .= $path;
    return $url;
}
?>