I want to share this, 'cause I use this forum a lot, so it's correct I give something.. :)
I've never posted/made-public/etc this script anywhere before now.
- Be aware that the script will create a lot of .txt files.
- If you want the output redirected in a file, in *ix use:
'perl google.pl blahblah > fileyouwant.ext'
[redirecting output]
- If you want the help, run simply 'perl google.pl'.
- Of course, if the Google html layout changes, you must change also the regexp in lines 118-119 and maybe also in the subs.. ;)
- Hope the script will not get broken by browser/textarea.. :)
---BEGIN PERL SCRIPT---
use Socket;
$SIG{ALRM} = \&timeout;
$timeout = '10';#--- num. of seconds-timeout in socketting with servers
$ua = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; DigExt)';#-- he
$indent = 1;
$indent = ' ' x $indent; $localized = $ARGV[2]; #---change the below example inserting only your 3 chars legitimate-for-counting words.. #---change below inserting your -no count- words if(!(-s $fileein)) { --------------------------------------------------------------- n 'n' is the amount of results wanted [NOT the amount of pages] example of a command line 'launch': open(RR, $fileein); $numposition = <RR>; close RR; #---initialize all single files.. #----------------------------------- foreach $singk(@keyw) { #------------------------------------ local $count = 0; COUNT: while ($count < $numposition) { #--- a little customization is needed in the URL below.. i.ex. the UTF stuff etc, my $server = 'www.google.com';#--- hehe; change this with co.uk if you want 'en' as language & UK as country of course. $page =~ s/\<font\ssize\=\-1\>\s*\-\s\[\s\<a\shref\=.+?translate.+?\<\/font\>//ig; #------------------------------------ close $handle; } #----------------------------------- exit 0; #--- begin grab sub $descr =~ s/\<br\>//g; my $badcharcheck = $url . $title . $descr; if ($badcharcheck !~ /\</) { if ($rank =~ s/^(\s*HEADER\:.+?\n\n)//s) { $header =~ s/\n(.)/\n$indent $1/sg;#--- some indent if($srank =~ /^((?:.+\n){$klines})/) { my $pcount = $count +1; return 'nnnn'; #--- begin getkeyw sub if (!($totcont = sck($server, $url, $ua))) { $totwords = 0; my $header = ''; $totcont =~ s/^(.+?)\r?\n\r?\n//s;#--- let's strip the header $totcont =~ s/\s*(\w+)\s*/&stat($1)/esg; $totwords = 1 if($totwords < 1); $return .= "$totcont\n\n"; return $return; } #--- begin sck sub #------------------------------- #alarm($timeout); alarm($timeout); socket (SK, PF_INET, SOCK_STREAM, getprotobyname('tcp')); alarm($timeout); select(SK); EOM alarm($timeout); if ($@ =~ /BLAH/) { }; shutdown(SK, 2); #--- begin stat sub #--- here we don't count any 1-2-3 char word, at least if not in the 'legitimate 3 word' index $totwords++; #--- begin timeout sub ---ENDOF PERL SCRIPT--- cminblues [1][edited by: jatar_k at 9:17 pm (utc) on Oct. 27, 2002]
$fileein = $ARGV[0];
$klines = $ARGV;
$klines++;
$klines--;
$localized = '&hl=' . $localized if($localized =~ /^\w+$/);
$chars3 = 'red-god-sea-sky-leg-bed-aol-seo-new';
$chars3 =~ s/[^\w\-]//g;
$chars3 =~ s/^\W//;
$chars3 =~ s/\W$//;
$chars3 =~ s/\-/\¦/g;
$nocount = 'nocount1-nocount2-nocount3';
$nocount =~ s/[^\w\-]//g;
$nocount =~ s/^\W//;
$nocount =~ s/\W$//;
$nocount =~ s/\-/\¦/g;
print "Usage: perl $0 file.ext klines localID
file.ext has this format:
word1
word2
word3 multiple
etc etc
----------------------------------------------------------------
'klines' is the n. of keyw displayed in std output for each site
----------------------------------------------------------------
'localID' [optional] is the ID of your country in Google search
[ex. 'en', 'de', etc]
----------------------------------------------------------------
perl google.pl example.dat 30 de
";
exit 0;
}
chomp $numposition;
while(<RR>) {
my $tmp = $_;
chomp $tmp;
push(@keyw, $tmp);
}
foreach $initial(@keyw) {
my $file = $initial;
$file =~ s/\W/_/g;
$file .= '.txt';
open (W, ">$file");
close W;
}
#------BEGIN KEYWORDS FOR CYCLE-----
#-----------------------------------
$handle = uc $singk;
my $wfile = $singk;
$wfile =~ s/\W/_/g;
$wfile .= '.txt';
open($handle, ">>$wfile");
#------BEGIN POSITIONS FOR CYCLE-----
#------------------------------------
my $pag = -10;
$pag += 10;
my $adjsingk = $singk;
$adjsingk =~ s/\s/+/g;
#--- if we want our query more browser-similar
my $url = '/search?q=' . $adjsingk . '&start=' . $pag . $localized . '&sa=N';
$server =~ s/com$/$localized/ if($localized =~ /^(fr¦de¦it¦ch)$/);
my $page = sck($server, $url, $ua);
$page =~ s/\<p\>\<a\shref\=([^\>]+)\>(.+?)\<\/a\>\s*\<br\>\<font\ssize\=\-1\>(.*?)
(?:\<font\scolor\=\"*\#008000\"?\>¦\<span)/&grab($1,$2,$3)/iseg;
<mod note: the above 2 lines were split for wrapping>
}
#------ENDOF POSITIONS FOR CYCLE-----
#------------------------------------
#------ENDOF KEYWORDS FOR CYCLE-----
#-----------------------------------
sub grab {
my $url = $_[0];
my $title = $_[1];
my $descr = $_[2];
my $header = '';
$title =~ s/\.\.\.//g;
$descr =~ s/\.\.\.//g;
$title =~ s/\<\/?b\>//g;
$descr =~ s/\<\/?b\>//g;
$title =~ s/\s{2,}/ /g;
$descr =~ s/\s{2,}/ /g;
$descr =~ s/(?:\n¦\r)//isg;
my $writeurl = $url;
$writeurl =~ s/^http\:\/\///;
$writeurl =~ s/[^\w\.]/\-/g;
$writeurl =~ s/\-$//;
my $shorthandle = $handle;
$shorthandle =~ s/\.txt$//;
$writeurl .= '-' . $shorthandle . '.txt';
$writeurl =~ s/[^\w\.]/\-/g;
my $rank = getkeyw($url);
open(WU, ">$writeurl");
print WU $rank;
close WU;
$header = $1;
}
print STDOUT $header;#--- comment this out if you don't want to see the headers in the resume/output
my $srank = $rank;
$srank = $1;
}
print STDOUT "[$pcount] **$shorthandle** URL: $url -- TITLE: $title --\nDESCR: $descr\n$srank\n\n\n\n";
print $handle "$url \:\:\:\: $title \:\:\:\: $descr\n-----\n$srank\n-----\n";
$count++;
last COUNT if($count >= $numposition);
}
}
#--- endof grab sub
sub getkeyw {
local %kw;
my $return;
my $totcont;
my $tmp = shift;
$tmp =~ s/^http\:\/\///;
$tmp =~ /^([^\/]+)(\/.*)$/;
my $server = $1;
my $url = $2;
$return .= "--- SITE NOT REACHED ---\n";
return $return;
}
$header = $1;
$header =~ s/\r//g;
$return .= "$indent HEADER:\n$header\n\n" if($header =~ /\w/);
$totcont =~ s/\<style.+?\<\/style\>/ /isg;
$totcont =~ s/\<script.+?\<\/script\>/ /isg;
$totcont =~ s/\<[^\>]+\>/ /sg;
$totcont =~ s/\&\#\d+\;/ /sg;
$totcont =~ s/\&\w+\;/ /sg;
$totcont =~ s/\W/ /sg;
$totcont =~ s/\W($nocount)\W/ /isg;
$totcont =~ s/\s{2,}/ /sg;
foreach $singrank(sort{my $one = $kw{$a}; my $two = $kw{$b}; $two <=> $one;}(keys(%kw))) {
my $kwrank = $kw{$singrank};
my $perc = (($kwrank * 100) / $totwords);
$perc =~ s/(\.\d\d).+$/$1/;
$perc = ' ' x (5 - length($perc)) . $perc;
$kwrank = ' ' x (3 - length($kwrank)) . $kwrank;
my $bd0 = '.' x (30 - length($singrank));
$return .= "$singrank$bd0$kwrank $perc\%\n";
}
#--- endof getkeyw sub
sub sck {
my $response;
my ($server, $url, $ua) = @_;
my $referer = 'http://' . $server . $url;
#---------BEGIN EVAL------------
#-------------------------------
eval {
my $there;
#my $noo = gethostbyname($server);
#alarm(0);
$there = sockaddr_in(80, inet_aton($server));
alarm(0);
return 'nonono' if ($@ =~ /BLAH/);
connect (SK, $there);
alarm(0);
return 'nonono' if ($@ =~ /BLAH/);
$¦ = 1;
print SK <<EOM;
GET $url HTTP/1.0
Referer: $referer
Host: $server
User-Agent: $ua
while (<SK>) {
$response .= $_;
}
alarm(0);
print STDOUT "\n\n-----Socket timeout-----\n\n";
return 'nonono';
}
#-------------------------------
#---------ENDOF EVAL------------
#-------------------------------
select STDOUT;
return $response;
}
#--- endof sck sub
sub stat {
my $kw = shift;
$kw = lc($kw);
#--- comment the line below if you don't want to use this feature
return '' if ((length($kw) < 4)&&($kw !~ /($chars3)/i));
$kw{$kw}++;
return '';
}
#--- endof stat sub
sub timeout {
shutdown(SK, 2);
print STDOUT "Socket TIMEOUT\n";
select (STDOUT);
die "BLAH";
}
#--- endof timeout sub
---BEGIN EXAMPLE.DAT---
30
widget
blue sea
apple pie
bad computer
good geek
damn lamer
money back
girl smile
cup of coffe
---ENDOF EXAMPLE.DAT---