Forum Moderators: coopster

Message Too Old, No Replies

Image Scraping

         

vincenyc

12:52 am on Dec 27, 2009 (gmt 0)

10+ Year Member




Looking for a script that scrapes only images from a website...specifically jpgs and gifs.

Does anyone have any sites or source they can share?

FourDegreez

11:10 pm on Dec 27, 2009 (gmt 0)

WebmasterWorld Senior Member 10+ Year Member



The humble beginnings of a spider class:

<?php
/**
 * Minimal page-fetching spider built on cURL.
 *
 * Usage: $s = new Spider(); $s->fetchPage($url); $html = $s->getContent();
 */
class Spider {
private $ch;       // cURL handle, created fresh for each fetch
private $content;  // body of the last successful fetch ('' otherwise)
private $binary;   // whether to request a binary-safe transfer
private $url;      // last URL requested

function __construct() {
$this->content = '';
$this->binary = false;
$this->url = '';
}

/**
 * Fetch a page into the internal content buffer.
 *
 * @param string $url absolute URL to fetch
 * @return bool true on success, false on an empty URL or cURL failure
 */
function fetchPage($url) {
$this->url = $url;
// The original checked isset($this->url), which is always true once a
// non-null argument has been assigned; guard against an empty URL instead.
if (empty($this->url)) {
return false;
}
$this->ch = curl_init();
curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($this->ch, CURLOPT_URL, $this->url);
curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($this->ch, CURLOPT_BINARYTRANSFER, $this->binary);
curl_setopt($this->ch, CURLOPT_TIMEOUT, 10); // don't hang forever on a dead host
$result = curl_exec($this->ch);
curl_close($this->ch);
// curl_exec() returns false on failure; keep $content a string so
// getContent() callers never receive a boolean.
if ($result === false) {
$this->content = '';
return false;
}
$this->content = $result;
return true;
}

/** @return string body of the last successful fetch, or '' */
function getContent() {
return $this->content;
}
}
?>

Then you could do something like:

// Fetch a page and extract the src URLs of its <img> tags.
$spider = new Spider();
$spider->fetchPage('http://en.wikipedia.org/');
$content = $spider->getContent();

// Match src anywhere inside the tag (not only as the first attribute, which
// fails on e.g. Wikipedia) and capture the quoted URL in group 1.
$regx = '/<img[^>]+src=["\']([^"\']+)["\']/i';
$img_src = array();
preg_match_all($regx, $content, $imgs, PREG_PATTERN_ORDER);
// Group 1 holds the URLs; $imgs[0] would be the entire <img ...> tags.
$img_src = $imgs[1];

which will extract all the URLs of the images on the page, using regex. This is assuming the page has src as the first attribute of img, which is not true on wikipedia, so the above won't actually work. But there's the general idea.

vincenyc

7:39 am on Dec 28, 2009 (gmt 0)

10+ Year Member



Thanks, but here's what I ended up doing...
The only problem is I get a nasty bug when I try to scrape certain JPGs.

for example...

this works
<? GetImagesbyURL('http://rawstory.com/2009/12/gop-strategist-matalin-bush-inherited-911-attacks-clinton/',2); ?>

this doesn't
<? GetImagesbyURL('http://sports.yahoo.com/nba/recap?gid=2009122617&prov=ap',2); ?>

How can I add some error handling to catch these errors and not display them?
here's what i have so far...

<?
/**
 * Fetch a page and echo an HTML listing of every image of the given type.
 *
 * @param string $siteURL  page to scrape
 * @param int    $fileType 1 = gif, 2 = jpg/jpeg, 3 = png; anything else
 *                         matches all three
 * @return void  output is echoed directly; broken images are skipped silently
 */
function GetImagesbyURL($siteURL,$fileType) {
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$siteURL);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$data = curl_exec($ch);
if (!$data) {
echo "Please try again later.<br />";
echo "error number:" .curl_errno($ch);
echo 'error:' . curl_error($ch);
// return (not exit) so the rest of the page still renders, and close
// the handle first so it is not leaked on the error path.
curl_close($ch);
return;
}
curl_close($ch);

// Map $fileType to the extension(s) to look for.
switch ($fileType) {
case 1:  $ext = 'gif'; break;
case 2:  $ext = 'jpe?g'; break;
case 3:  $ext = 'png'; break;
default: $ext = 'gif|jpe?g|png';
}

// Capture the src attribute of <img> tags whose URL ends with a wanted
// extension. (The original posted regex was truncated/garbled and would
// not even parse.)
$regex = '/<img[^>]+src=["\']?([^"\' >]+\.(?:' . $ext . '))["\' >]/i';

preg_match_all($regex,$data,$match);

for ($i=0; $i < count($match[1]); $i++) {

$source = $match[1][$i];
//echo $source."<br>";

$imgURL = InternetCombineURL($siteURL,$source);

// getimagesize() downloads the image and returns false (emitting a
// warning) for broken or unreachable images -- this is the "nasty bug"
// on certain jpgs. Suppress the warning and skip such images.
$size = @getimagesize($imgURL);
if ($size === false) {
continue;
}
list($width, $height) = $size;

echo "<a href='addtag.php'><img src=".$imgURL." width='".$width."' height='".$height."'></a><br>";
echo $imgURL."<br>";
echo "Size: " . $width . "x" . $height . "<br>";
echo "<br>";

}
}

/**
 * Resolve a possibly-relative URL against an absolute base URL.
 *
 * @param string $absolute base URL (e.g. the page the image came from)
 * @param string $relative absolute URL, root-relative path, or relative path
 * @return string the combined absolute URL
 */
function InternetCombineUrl($absolute, $relative) {
$p = parse_url($relative);
// Already absolute (carries its own scheme): return it untouched.
// isset() avoids an undefined-index notice for scheme-less URLs.
if (isset($p["scheme"]) && $p["scheme"]) return $relative;

// The original used extract(parse_url($absolute)), which leaves
// $scheme/$host/$user/$pass/$path undefined whenever a component is
// missing. Pull each component out explicitly with a '' fallback.
$parts  = parse_url($absolute);
$scheme = isset($parts["scheme"]) ? $parts["scheme"] : '';
$host   = isset($parts["host"])   ? $parts["host"]   : '';
$user   = isset($parts["user"])   ? $parts["user"]   : '';
$pass   = isset($parts["pass"])   ? $parts["pass"]   : '';
$path   = isset($parts["path"])   ? $parts["path"]   : '';

$path = dirname($path);

// Curly-brace string offsets ($relative{0}) were removed in PHP 8;
// use square brackets, and guard against an empty string.
if ($relative !== '' && $relative[0] == '/') {
// Root-relative: discard the base path entirely.
$cparts = array_filter(explode("/", $relative));
}
else {
// Document-relative: join base directory + relative path, then
// fold out "." and ".." segments.
$aparts = array_filter(explode("/", $path));
$rparts = array_filter(explode("/", $relative));
$cparts = array_merge($aparts, $rparts);
foreach($cparts as $i => $part) {
if($part == '.') {
$cparts[$i] = null;
}
if($part == '..') {
// Drop the ".." and the segment preceding it.
$cparts[$i - 1] = null;
$cparts[$i] = null;
}
}
$cparts = array_filter($cparts);
}
$path = implode("/", $cparts);
$url = "";
if($scheme) {
$url = "$scheme://";
}
if($user) {
$url .= "$user";
if($pass) {
$url .= ":$pass";
}
$url .= "@";
}
if($host) {
$url .= "$host/";
}
$url .= $path;
return $url;
}

?>