The lines I am stuck on are:
do {
my $r = $ua->simple_request(HTTP::Request->new("GET", $link));
$r->content_type eq "text/html"? $r->content : "";}
If I add in:
print $r->content, it only works for the first page visited then the script ends?
Below is the full script.
Thx!
#!/usr/bin/perl -w
use strict;
my $VERSION = "0.82";
use LWP::UserAgent;
use HTML::Parser;
use HTML::LinkExtor;
use URI::URL;
my $parser = HTML::Parser->new;
$¦ = 1;
sub spider (%);
spider URL => 'http://www.example.org';
sub spider (%) {
my %args = @_;
my @startlinks = ();
push(@startlinks, $args{URL});
my $ua = LWP::UserAgent->new;
WORKLOOP: while (my $link = shift @startlinks) {
for (my $i = 0; $i< $#startlinks; $i++) {
next WORKLOOP if $link eq $startlinks[$i];
}
print ">>>>> working on $link\n";
HTML::LinkExtor->new(
sub {
my ($t, %a) = @_;
my @links = map { url($_, $link)->abs() }
grep { defined } @a{qw/href img/};
# mark already spidered links for removal
foreach my $start_link (@startlinks) {
my $i = 0;
for (0 .. $#links) {
if ($links[$i++] eq $start_link) {
$links[$i -1] = "'REMOVE'";
}
}
}
# remove already spidered links
@links = sort @links;
for (my $i = 0; $i< $#links; $i++) {
$links[$i] = "'REMOVE'" if $links[$i] eq $links[$i +1];
}
@links = grep { $_ ne "'REMOVE'" } @links;
print "+ $_\n" foreach @links;
push @startlinks, @links if @links;
} ) -> parse(
do {
my $r = $ua->simple_request
(HTTP::Request->new("GET", $link));
$r->content_type eq "text/html"? $r->content : "";}
)
}
}
[edited by: jatar_k at 4:53 pm (utc) on Jan. 9, 2005]
[edit reason] generalized url [/edit]