Forum Moderators: coopster

Message Too Old, No Replies

Image Scraping

         

vincenyc

12:52 am on Dec 27, 2009 (gmt 0)

10+ Year Member




Looking for a script that scrapes only images from a website...specifically jpgs and gifs.

Does anyone have any sites or source they can share?

FourDegreez

11:10 pm on Dec 27, 2009 (gmt 0)

WebmasterWorld Senior Member 10+ Year Member



The humble beginnings of a spider class:

<?php
/**
 * Minimal page-fetching spider built on cURL.
 *
 * Usage: $s = new Spider(); $s->fetchPage($url); $html = $s->getContent();
 */
class Spider {
private $ch;       // cURL handle, created fresh for each fetch
private $content;  // body of the last successful fetch ('' otherwise)
private $binary;   // whether to request a binary-safe transfer
private $url;      // last URL requested

function __construct() {
$this->content = '';
$this->binary = false;
$this->url = '';
}

/**
 * Fetch a page into the internal content buffer.
 *
 * @param string $url absolute URL to fetch
 * @return bool true on success, false on an empty URL or cURL failure
 */
function fetchPage($url) {
$this->url = $url;
// The original checked isset($this->url), which is always true once a
// non-null argument has been assigned; guard against an empty URL instead.
if (empty($this->url)) {
return false;
}
$this->ch = curl_init();
curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($this->ch, CURLOPT_URL, $this->url);
curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($this->ch, CURLOPT_BINARYTRANSFER, $this->binary);
curl_setopt($this->ch, CURLOPT_TIMEOUT, 10); // don't hang forever on a dead host
$result = curl_exec($this->ch);
curl_close($this->ch);
// curl_exec() returns false on failure; keep $content a string so
// getContent() callers never receive a boolean.
if ($result === false) {
$this->content = '';
return false;
}
$this->content = $result;
return true;
}

/** @return string body of the last successful fetch, or '' */
function getContent() {
return $this->content;
}
}
?>

Then you could do something like:

// Fetch a page and extract the src URLs of its <img> tags.
$spider = new Spider();
$spider->fetchPage('http://en.wikipedia.org/');
$content = $spider->getContent();

// Match src anywhere inside the tag (not only as the first attribute, which
// fails on e.g. Wikipedia) and capture the quoted URL in group 1.
$regx = '/<img[^>]+src=["\']([^"\']+)["\']/i';
$img_src = array();
preg_match_all($regx, $content, $imgs, PREG_PATTERN_ORDER);
// Group 1 holds the URLs; $imgs[0] would be the entire <img ...> tags.
$img_src = $imgs[1];

which will extract all the URLs of the images on the page, using regex. This is assuming the page has src as the first attribute of img, which is not true on wikipedia, so the above won't actually work. But there's the general idea.

vincenyc

7:39 am on Dec 28, 2009 (gmt 0)

10+ Year Member



Thanks, but here's what I ended up doing...
The only problem is I get a nasty bug when I try to scrape certain JPGs.

for example...

this works
<? GetImagesbyURL('http://rawstory.com/2009/12/gop-strategist-matalin-bush-inherited-911-attacks-clinton/',2); ?>

this doesn't
<? GetImagesbyURL('http://sports.yahoo.com/nba/recap?gid=2009122617&prov=ap',2); ?>

How can I add some error handling to catch these errors and not display them?
here's what i have so far...

<?
/**
 * Fetch a page and echo an HTML listing of every image of the given type.
 *
 * @param string $siteURL  page to scrape
 * @param int    $fileType 1 = gif, 2 = jpg/jpeg, 3 = png; anything else
 *                         matches all three
 * @return void  output is echoed directly; broken images are skipped silently
 */
function GetImagesbyURL($siteURL,$fileType) {
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$siteURL);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$data = curl_exec($ch);
if (!$data) {
echo "Please try again later.<br />";
echo "error number:" .curl_errno($ch);
echo 'error:' . curl_error($ch);
// return (not exit) so the rest of the page still renders, and close
// the handle first so it is not leaked on the error path.
curl_close($ch);
return;
}
curl_close($ch);

// Map $fileType to the extension(s) to look for.
switch ($fileType) {
case 1:  $ext = 'gif'; break;
case 2:  $ext = 'jpe?g'; break;
case 3:  $ext = 'png'; break;
default: $ext = 'gif|jpe?g|png';
}

// Capture the src attribute of <img> tags whose URL ends with a wanted
// extension. (The original posted regex was truncated/garbled and would
// not even parse.)
$regex = '/<img[^>]+src=["\']?([^"\' >]+\.(?:' . $ext . '))["\' >]/i';

preg_match_all($regex,$data,$match);

for ($i=0; $i < count($match[1]); $i++) {

$source = $match[1][$i];
//echo $source."<br>";

$imgURL = InternetCombineURL($siteURL,$source);

// getimagesize() downloads the image and returns false (emitting a
// warning) for broken or unreachable images -- this is the "nasty bug"
// on certain jpgs. Suppress the warning and skip such images.
$size = @getimagesize($imgURL);
if ($size === false) {
continue;
}
list($width, $height) = $size;

echo "<a href='addtag.php'><img src=".$imgURL." width='".$width."' height='".$height."'></a><br>";
echo $imgURL."<br>";
echo "Size: " . $width . "x" . $height . "<br>";
echo "<br>";

}
}

/**
 * Resolve a possibly-relative URL against an absolute base URL.
 *
 * @param string $absolute base URL (e.g. the page the image came from)
 * @param string $relative absolute URL, root-relative path, or relative path
 * @return string the combined absolute URL
 */
function InternetCombineUrl($absolute, $relative) {
$p = parse_url($relative);
// Already absolute (carries its own scheme): return it untouched.
// isset() avoids an undefined-index notice for scheme-less URLs.
if (isset($p["scheme"]) && $p["scheme"]) return $relative;

// The original used extract(parse_url($absolute)), which leaves
// $scheme/$host/$user/$pass/$path undefined whenever a component is
// missing. Pull each component out explicitly with a '' fallback.
$parts  = parse_url($absolute);
$scheme = isset($parts["scheme"]) ? $parts["scheme"] : '';
$host   = isset($parts["host"])   ? $parts["host"]   : '';
$user   = isset($parts["user"])   ? $parts["user"]   : '';
$pass   = isset($parts["pass"])   ? $parts["pass"]   : '';
$path   = isset($parts["path"])   ? $parts["path"]   : '';

$path = dirname($path);

// Curly-brace string offsets ($relative{0}) were removed in PHP 8;
// use square brackets, and guard against an empty string.
if ($relative !== '' && $relative[0] == '/') {
// Root-relative: discard the base path entirely.
$cparts = array_filter(explode("/", $relative));
}
else {
// Document-relative: join base directory + relative path, then
// fold out "." and ".." segments.
$aparts = array_filter(explode("/", $path));
$rparts = array_filter(explode("/", $relative));
$cparts = array_merge($aparts, $rparts);
foreach($cparts as $i => $part) {
if($part == '.') {
$cparts[$i] = null;
}
if($part == '..') {
// Drop the ".." and the segment preceding it.
$cparts[$i - 1] = null;
$cparts[$i] = null;
}
}
$cparts = array_filter($cparts);
}
$path = implode("/", $cparts);
$url = "";
if($scheme) {
$url = "$scheme://";
}
if($user) {
$url .= "$user";
if($pass) {
$url .= ":$pass";
}
$url .= "@";
}
if($host) {
$url .= "$host/";
}
$url .= $path;
return $url;
}

?>