Forum Moderators: coopster & phranque

Message Too Old, No Replies

Perl script analysing google-ranked sites

Headers&keyw "Google-ranked-sites analysis" perl script I wrote.

         

cminblues

6:28 am on Oct 27, 2002 (gmt 0)

10+ Year Member



I've wrote this script some time ago.
Features:
* given a list of keyw [simple 'keyw' or 'keyw1-keyw2-etc'], the script makes a list of the keyw %,
and the server headers of sites ranked in google for these keyw.
[The number of results is a parameter you can decide]

I want to share this, 'cause I use this forum a lot, so it's correct I give something.. :)

I've never posted/made-public/etc this script anywhere before now.

- Be aware that the script will create a lot of .txt files.
- If you want the output redirected in a file, in *ix use:
'perl google.pl blahblah > fileyouwant.ext'
[redirecting output]
- If you want the help, run simply 'perl google.pl'.
- Of course, if the Google html layout changes, you must change also the regexp in lines 118-119 and maybe also in the subs.. ;)
- Hope the script will not get broken by browser/textarea.. :)

---BEGIN PERL SCRIPT---

use Socket;
$SIG{ALRM} = \&timeout;
$timeout = '10';#--- num. of seconds-timeout in socketting with servers
$ua = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; DigExt)';#-- he
$indent = 1;

$indent = ' ' x $indent;
$fileein = $ARGV[0];
$klines = $ARGV;
$klines++;
$klines--;

$localized = $ARGV[2];
$localized = '&hl=' . $localized if($localized =~ /^\w+$/);

#---change the below example inserting only your 3 chars legitimate-for-counting words..
$chars3 = 'red-god-sea-sky-leg-bed-aol-seo-new';
$chars3 =~ s/[^\w\-]//g;
$chars3 =~ s/^\W//;
$chars3 =~ s/\W$//;
$chars3 =~ s/\-/\¦/g;

#---change below inserting your -no count- words
$nocount = 'nocount1-nocount2-nocount3';
$nocount =~ s/[^\w\-]//g;
$nocount =~ s/^\W//;
$nocount =~ s/\W$//;
$nocount =~ s/\-/\¦/g;

if(!(-s $fileein)) {
print "Usage: perl $0 file.ext klines localID

---------------------------------------------------------------
file.ext has this format:

n
word1
word2
word3 multiple
etc etc

'n' is the amount of results wanted [NOT the amount of pages]
----------------------------------------------------------------
'klines' is the n. of keyw displayed in std output for each site
----------------------------------------------------------------
'localID' [optional] is the ID of your country in Google search
[ex. 'en', 'de', etc]
----------------------------------------------------------------

example of a command line 'launch':
perl google.pl example.dat 30 de
";
exit 0;
}

open(RR, $fileein);

$numposition = <RR>;
chomp $numposition;
while(<RR>) {
my $tmp = $_;
chomp $tmp;
push(@keyw, $tmp);
}

close RR;

#---initialize all single files..
foreach $initial(@keyw) {
my $file = $initial;
$file =~ s/\W/_/g;
$file .= '.txt';
open (W, ">$file");
close W;
}

#-----------------------------------
#------BEGIN KEYWORDS FOR CYCLE-----
#-----------------------------------

foreach $singk(@keyw) {
$handle = uc $singk;
my $wfile = $singk;
$wfile =~ s/\W/_/g;
$wfile .= '.txt';
open($handle, ">>$wfile");

#------------------------------------
#------BEGIN POSITIONS FOR CYCLE-----
#------------------------------------

local $count = 0;
my $pag = -10;

COUNT: while ($count < $numposition) {
$pag += 10;
my $adjsingk = $singk;
$adjsingk =~ s/\s/+/g;

#--- a little customization is needed in the URL below.. i.ex. the UTF stuff etc,
#--- if we want our query more browser-similar
my $url = '/search?q=' . $adjsingk . '&start=' . $pag . $localized . '&sa=N';

my $server = 'www.google.com';#--- hehe; change this with co.uk if you want 'en' as language & UK as country of course.
$server =~ s/com$/$localized/ if($localized =~ /^(fr¦de¦it¦ch)$/);
my $page = sck($server, $url, $ua);

$page =~ s/\<font\ssize\=\-1\>\s*\-\s\[\s\<a\shref\=.+?translate.+?\<\/font\>//ig;
$page =~ s/\<p\>\<a\shref\=([^\>]+)\>(.+?)\<\/a\>\s*\<br\>\<font\ssize\=\-1\>(.*?)
(?:\<font\scolor\=\"*\#008000\"?\>¦\<span)/&grab($1,$2,$3)/iseg;
<mod note: the above 2 lines were split for wrapping>
}

#------------------------------------
#------ENDOF POSITIONS FOR CYCLE-----
#------------------------------------

close $handle;

}

#-----------------------------------
#------ENDOF KEYWORDS FOR CYCLE-----
#-----------------------------------

exit 0;

#--- begin grab sub
sub grab {
my $url = $_[0];
my $title = $_[1];
my $descr = $_[2];
my $header = '';

$descr =~ s/\<br\>//g;
$title =~ s/\.\.\.//g;
$descr =~ s/\.\.\.//g;
$title =~ s/\<\/?b\>//g;
$descr =~ s/\<\/?b\>//g;
$title =~ s/\s{2,}/ /g;
$descr =~ s/\s{2,}/ /g;
$descr =~ s/(?:\n¦\r)//isg;

my $badcharcheck = $url . $title . $descr;

if ($badcharcheck !~ /\</) {
my $writeurl = $url;
$writeurl =~ s/^http\:\/\///;
$writeurl =~ s/[^\w\.]/\-/g;
$writeurl =~ s/\-$//;
my $shorthandle = $handle;
$shorthandle =~ s/\.txt$//;
$writeurl .= '-' . $shorthandle . '.txt';
$writeurl =~ s/[^\w\.]/\-/g;
my $rank = getkeyw($url);
open(WU, ">$writeurl");
print WU $rank;
close WU;

if ($rank =~ s/^(\s*HEADER\:.+?\n\n)//s) {
$header = $1;
}

$header =~ s/\n(.)/\n$indent $1/sg;#--- some indent
print STDOUT $header;#--- comment this out if you don't want to see the headers in the resume/output
my $srank = $rank;

if($srank =~ /^((?:.+\n){$klines})/) {
$srank = $1;
}

my $pcount = $count +1;
print STDOUT "[$pcount] **$shorthandle** URL: $url -- TITLE: $title --\nDESCR: $descr\n$srank\n\n\n\n";
print $handle "$url \:\:\:\: $title \:\:\:\: $descr\n-----\n$srank\n-----\n";
$count++;
last COUNT if($count >= $numposition);
}

return 'nnnn';
}
#--- endof grab sub

#--- begin getkeyw sub
sub getkeyw {
local %kw;
my $return;
my $totcont;
my $tmp = shift;
$tmp =~ s/^http\:\/\///;
$tmp =~ /^([^\/]+)(\/.*)$/;
my $server = $1;
my $url = $2;

if (!($totcont = sck($server, $url, $ua))) {
$return .= "--- SITE NOT REACHED ---\n";
return $return;
}

$totwords = 0;

my $header = '';

$totcont =~ s/^(.+?)\r?\n\r?\n//s;#--- let's strip the header
$header = $1;
$header =~ s/\r//g;
$return .= "$indent HEADER:\n$header\n\n" if($header =~ /\w/);
$totcont =~ s/\<style.+?\<\/style\>/ /isg;
$totcont =~ s/\<script.+?\<\/script\>/ /isg;
$totcont =~ s/\<[^\>]+\>/ /sg;
$totcont =~ s/\&\#\d+\;/ /sg;
$totcont =~ s/\&\w+\;/ /sg;
$totcont =~ s/\W/ /sg;
$totcont =~ s/\W($nocount)\W/ /isg;
$totcont =~ s/\s{2,}/ /sg;

$totcont =~ s/\s*(\w+)\s*/&stat($1)/esg;

$totwords = 1 if($totwords < 1);
foreach $singrank(sort{my $one = $kw{$a}; my $two = $kw{$b}; $two <=> $one;}(keys(%kw))) {
my $kwrank = $kw{$singrank};
my $perc = (($kwrank * 100) / $totwords);
$perc =~ s/(\.\d\d).+$/$1/;
$perc = ' ' x (5 - length($perc)) . $perc;
$kwrank = ' ' x (3 - length($kwrank)) . $kwrank;
my $bd0 = '.' x (30 - length($singrank));
$return .= "$singrank$bd0$kwrank $perc\%\n";
}

$return .= "$totcont\n\n";

return $return;

}
#--- endof getkeyw sub

#--- begin sck sub
sub sck {
my $response;
my ($server, $url, $ua) = @_;
my $referer = 'http://' . $server . $url;

#-------------------------------
#---------BEGIN EVAL------------
#-------------------------------
eval {
my $there;

#alarm($timeout);
#my $noo = gethostbyname($server);
#alarm(0);

alarm($timeout);
$there = sockaddr_in(80, inet_aton($server));
alarm(0);
return 'nonono' if ($@ =~ /BLAH/);

socket (SK, PF_INET, SOCK_STREAM, getprotobyname('tcp'));

alarm($timeout);
connect (SK, $there);
alarm(0);
return 'nonono' if ($@ =~ /BLAH/);

select(SK);
$¦ = 1;
print SK <<EOM;
GET $url HTTP/1.0
Referer: $referer
Host: $server
User-Agent: $ua

EOM

alarm($timeout);
while (<SK>) {
$response .= $_;
}
alarm(0);

if ($@ =~ /BLAH/) {
print STDOUT "\n\n-----Socket timeout-----\n\n";
return 'nonono';
}

};
#-------------------------------
#---------ENDOF EVAL------------
#-------------------------------

shutdown(SK, 2);
select STDOUT;
return $response;
}
#--- endof sck sub

#--- begin stat sub
sub stat {
my $kw = shift;
$kw = lc($kw);

#--- here we don't count any 1-2-3 char word, at least if not in the 'legitimate 3 word' index
#--- comment the line below if you don't want to use this feature
return '' if ((length($kw) < 4)&&($kw !~ /($chars3)/i));

$totwords++;
$kw{$kw}++;
return '';
}
#--- endof stat sub

#--- begin timeout sub
sub timeout {
shutdown(SK, 2);
print STDOUT "Socket TIMEOUT\n";
select (STDOUT);
die "BLAH";
}
#--- endof timeout sub

---ENDOF PERL SCRIPT---


---BEGIN EXAMPLE.DAT---
30
widget
blue sea
apple pie
bad computer
good geek
damn lamer
money back
girl smile
cup of coffe
---ENDOF EXAMPLE.DAT---

cminblues

[1][edited by: jatar_k at 9:17 pm (utc) on Oct. 27, 2002]

jatar_k

9:24 pm on Oct 27, 2002 (gmt 0)

WebmasterWorld Administrator 10+ Year Member



interesting script cminblues,

I will have to take a better read when I have some time. I saved it in my useful scripts dir. :)