#!/usr/bin/perl
#
# pullrss.pl - D.M. Ragle and Andy King
#
# Retrieves the top n headlines from a supplied RSS
# file and places them as HREF links in a supplied output
# file. The user that runs the script must have read access
# to the RSS file (write access to the local stored copy of the
# RSS file when an external RSS file is specified), and write
# access to the output file AND the directory in which the
# output file resides.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# See CHANGES.txt for documented changes in versions.
#
# 03/30/01 - Initial release (1.0).
# 04/13/01 - 1.1.
# 07/30/03 - 1.2.
#
#############################################################
use strict;
use HTTP::Request;
use LWP::UserAgent;
use XML::Parser;
use XML::RSS;
use HTML::Template;
use URI::URL ();
# Variables defined in external file
use vars qw($externalRSS
$localRSS
@siteDomains
$hTemplate
$outfile
$siteId
$ADMIN_EMAIL
$numheadlines
$redirectServer);
# Retrieve configuration file/parameters. The built-in default path can
# be overridden by passing a config file as the first command-line argument.
use File::Spec ();
my $cfgFile = "/hsphere/local/home/abelkorz/abelkorzeniowski.com/cgi-bin/pullrss/pullrss.cfg";
$cfgFile = $ARGV[0] if ($ARGV[0]);
die "Specified Config file ($cfgFile) doesn't exist.\n" unless (-e $cfgFile);
# require() searches @INC for relative paths (other than ./ and ../), and
# "." is no longer in @INC on newer perls. Make the path absolute so the
# file that just passed the -e test is the one that actually gets loaded.
$cfgFile = File::Spec->rel2abs($cfgFile);
require $cfgFile or die "Can't retrieve configuration file: $!\n";
# Agent name for external file pulls (read by pullRSS)
my $AGENT_NAME="pullrss/1.0";
# grab and format headlines from the RSS file
my $topHeadlines = pullRSS($externalRSS,$localRSS,$numheadlines,\@siteDomains,$hTemplate);
# Write to a temporary file next to the output file, then rename it into
# place so readers never see a half-written file. Every step is checked:
# buffered write errors only surface at print/close time, and an unchecked
# failure would otherwise replace the last good output with a truncated one.
my $tmpfile = "$outfile.$$";
open(my $out, '>', $tmpfile) or die "Can't open $tmpfile for output: $!\n";
unless ((print {$out} $topHeadlines) && close($out)) {
    my $err = $!;
    unlink($tmpfile);    # don't leave a partial temp file behind
    die "Can't write $tmpfile: $err\n";
}
unless (rename($tmpfile,$outfile)) {
    my $err = $!;
    unlink($tmpfile);
    die "Can't move $tmpfile to $outfile: $err\n";
}
exit;
#################################################################
# pullRSS - Retrieves, formats, and returns headline info in the
# provided template.
#
# externalRSS = URL of external RSS file to retrieve (if empty,
#               only the local file is used)
# localRSS    = file to store RSS info in (or to retrieve info
#               from if the external file is unavailable)
# headlines   = maximum number of headlines to format
# siteDomain  = reference to an array of patterns; an item whose
#               link matches any of them (case-insensitively) is
#               left out of the headline list
# hTemplate   = HTML::Template file for the formatted headlines
#
# Also reads the file-level $AGENT_NAME and the configuration
# globals $siteId and $redirectServer.
#
# RETURNS: formatted HTML string with top headlines
# DIES:    if the template is missing, if the local RSS file is
#          needed but missing, or if the feed has no items
#
sub pullRSS {
    my ($externalRSS,$localRSS,$headlines,$siteDomain,$hTemplate) = @_;
    my $usedExternal=0;

    # the HTML template must exist in the specified location
    die "HTML template ($hTemplate) does not exist.\n" unless (-e $hTemplate);

    # Initialize template object. Set to allow passing of parameters
    # that may not exist within the template, and allow the __FIRST__,
    # __LAST__, __INNER__, and __ODD__ variables within loops.
    my $template=HTML::Template->new(filename          => $hTemplate,
                                     loop_context_vars => 1,
                                     die_on_bad_params => 0);

    # initialize RSS object
    my $rssinfo=XML::RSS->new;

    # attempt to retrieve external RSS file
    my $rssdata="";
    if ($externalRSS) {
        my $url = URI::URL->new($externalRSS);
        $rssdata=fetch_url($url,$AGENT_NAME);
        print " --> Unable to retrieve external RSS file. Trying local RSS file, instead.\n" unless ($rssdata);
    }

    # fall back to the locally stored copy when nothing was fetched
    if (!($rssdata)) {
        die "File \"$localRSS\" doesn't exist, and it needs to.\n" unless (-e $localRSS);
        $rssinfo->parsefile($localRSS);
    } else {
        $rssinfo->parse($rssdata);
        $usedExternal=1;
    }

    my @rssitems=@{$rssinfo->{'items'}};
    die "No headlines exist in specified RSS file.\n" unless (@rssitems >= 1);

    # Zero-padded day and month, used in the redirection logic below.
    # Both come from a single localtime() call so they always describe
    # the same instant (two calls could straddle midnight).
    my ($daynum,$month) = (localtime())[3,4];
    $month  = sprintf("%02d",$month+1);    # localtime months are 0-based
    $daynum = sprintf("%02d",$daynum);

    # Format headlines for insertion in template
    my @loopEntries = ();
    my $goodCount = 0;
    foreach my $item (@rssitems) {
        # stop as soon as the requested number of headlines is collected
        last unless ($goodCount < $headlines);

        my $ilink =$item->{'link'};
        my $ititle=$item->{'title'};
        my $idesc =$item->{'description'};

        # Skip items whose link matches any configured site domain.
        # NOTE: each entry is interpolated as a regular expression, so
        # an unescaped "." in a domain matches any character.
        my $domainOK=1;
        foreach my $sd (@$siteDomain) {
            $domainOK=0 if ($ilink=~m/$sd/i);
        }
        next unless ($domainOK);

        $goodCount++;

        # rebuild the redirection URL if necessary:
        #   <redirectServer>/<siteId>_<n>-<dd>.<mm>/<link without http://>
        if (($siteId) && ($redirectServer)) {
            $ilink=~s/^\s*http\:\/\///si;
            $ilink=$redirectServer."/".$siteId."_".$goodCount."-".$daynum.".".$month."/".$ilink;
        }

        # Build up the hash for the HTML template. The THISLOOP<n> key
        # lets the template test for a specific headline position.
        my %thisRow = (
            'URL'                 => $ilink,
            'TEXT'                => $ititle,
            'DESC'                => $idesc,
            "THISLOOP".$goodCount => 1,
        );
        push(@loopEntries,\%thisRow);
    }

    # if we retrieved an external file, save it in the local
    # backup file in case we need it next time
    $rssinfo->save($localRSS) if ($usedExternal);

    # Insert variables into HTML template. The TEXTINPUT_* values come
    # from the feed's <textinput> element (not from <image>).
    $template->param(CHANNEL_TITLE          => $rssinfo->channel('title'),
                     CHANNEL_LINK           => $rssinfo->channel('link'),
                     CHANNEL_LANGUAGE       => $rssinfo->channel('language'),
                     CHANNEL_DESCRIPTION    => $rssinfo->channel('description'),
                     CHANNEL_RATING         => $rssinfo->channel('rating'),
                     CHANNEL_COPYRIGHT      => $rssinfo->channel('copyright'),
                     CHANNEL_PUBDATE        => $rssinfo->channel('pubDate'),
                     CHANNEL_LASTBUILDDATE  => $rssinfo->channel('lastBuildDate'),
                     CHANNEL_DOCS           => $rssinfo->channel('docs'),
                     CHANNEL_MANAGINGEDITOR => $rssinfo->channel('managingEditor'),
                     CHANNEL_WEBMASTER      => $rssinfo->channel('webMaster'),
                     IMAGE_TITLE            => $rssinfo->image('title'),
                     IMAGE_URL              => $rssinfo->image('url'),
                     IMAGE_LINK             => $rssinfo->image('link'),
                     IMAGE_WIDTH            => $rssinfo->image('width'),
                     IMAGE_HEIGHT           => $rssinfo->image('height'),
                     IMAGE_DESCRIPTION      => $rssinfo->image('description'),
                     TEXTINPUT_TITLE        => $rssinfo->textinput('title'),
                     TEXTINPUT_DESCRIPTION  => $rssinfo->textinput('description'),
                     TEXTINPUT_NAME         => $rssinfo->textinput('name'),
                     TEXTINPUT_LINK         => $rssinfo->textinput('link'),
                     HEADLINE_LOOP          => \@loopEntries);
    return $template->output();
}
#
# end pullRSS
#################################################################
sub fetch_url {
    ##*************************************************************#
    # DESC:   make an HTTP GET request and return the results      #
    # SYNTAX: fetch_url($url,$agent);                              #
    # PARAMS: $url   - URL to fetch                                #
    #         $agent - agent name to send to the Web server        #
    # RETURN: the content of the Web server response on success,   #
    #         or an empty string if the request did not succeed    #
    # NOTE:   the HTTP From header is taken from the configuration #
    #         global $ADMIN_EMAIL                                  #
    #**************************************************************#
    my ($url,$agent) = @_;

    # set up the LWP agent: Agent-Name and HTTP From headers
    my $ua = LWP::UserAgent->new;
    $ua->agent($agent);
    $ua->from($ADMIN_EMAIL);

    # issue the GET request for $url
    my $response = $ua->request(HTTP::Request->new(GET => $url));

    # anything other than a successful response yields an empty string
    return "" unless ($response->is_success);
    return $response->content;
}
[edited by: jatar_k at 9:33 pm (utc) on Sep. 16, 2005]
I am very new to this. Could you tell me how to write a simple script that would perform a simple "search and replace" on a faulty file, e.g. change "&amp;" to "&"?
At first glance:
.....
# grab and format headlines from the RSS file
my $topHeadlines = &pullRSS($externalRSS,$localRSS,$numheadlines,\@siteDomains,$hTemplate);
#Moby_Dim INSERTION ;))) :
$topHeadlines =~ s/&quot;/"/g;
#End of INSERTION
# write to output file
open(OUTFILE,">$outfile.$$") or die "Can't open $outfile.$$ for output: $!\n";
print OUTFILE $topHeadlines;
close OUTFILE;
.....
This should work 100%, provided that "$outfile" is the file you view as the result.