Sitemap generator

The idea of this script is to generate a sitemap.html by scanning directories and parsing the title and meta description tags of the files found, producing a basic page of links to every page on the site.

It's by no means fully functional - it only scans one directory deep, and has trouble with title and description tags that span more than one line, so if anyone can offer any insight into how to improve it it'd be most welcome...

#!/usr/bin/perl

### Configuration Options ###
my $root_directory = "/home/www/html/";              # server path to web site
my $output_file    = "/home/www/html/sitemap.html";  # where the site map is written
my $root_url       = "http://www.mydomain.com";      # prefix for every generated link

# next, a pipe (|) separated list of sub directories to ignore
# while creating the sitemap.
# (the '..' directory is the unix 'up a level' directory, we
# don't want to include that..)
my $excluded_dirs = "..|images|bk2|mailing_list|includes|foo";

### End configuration ###

# Turn the exclusion list into a lookup table. An exact-match hash lookup
# is used instead of interpolating the directory name into a regex, where
# names such as "." or ".." would act as wildcards.
my %is_excluded = map { $_ => 1 } split /\|/, $excluded_dirs;

# open a file to write the site map to.
# OF is deliberately kept as a global (bareword) handle because the
# parse_dir, header and footer subroutines all print to it by name.
open(OF, '>', $output_file)
    or die "Cannot open $output_file for writing: $!\n";

# print the HTML header
header();

# scan one level deep for sub directories ('.' is the root itself);
# sorted so the generated page is stable from run to run
opendir(my $root_dh, $root_directory)
    or die "Cannot read directory $root_directory: $!\n";
my @sub_dirlist = sort grep { -d "$root_directory/$_" } readdir($root_dh);
closedir($root_dh);

# iterate through the list of directories, calling the
# parse_dir subroutine for each one that is not excluded
for my $sub_dir (@sub_dirlist) {
    next if $is_excluded{$sub_dir};
    parse_dir($root_url, $root_directory, $sub_dir);
}

# print out the HTML footer, close output file and quit.
# Buffered write errors only surface at close, so check it.
footer();
close(OF) or die "Error while writing $output_file: $!\n";
exit;

# parse_dir($root_url, $root_directory, $sub_dir)
#
# Writes one section of the site map to the global OF filehandle: a heading
# for the directory followed by one link per .html/.htm file found in it.
#
#   $root_url       - URL prefix placed in front of every link
#   $root_directory - filesystem path of the web root
#   $sub_dir        - directory name relative to the web root ("." = root)
#
# For each page the <title> text is used as the link text (falling back to
# the file name) and the content of <meta name="description" ...> is printed
# underneath when present. Each file is read in one go, so tags that span
# several lines are handled. Unreadable directories and files produce a
# warning instead of aborting the run. Returns nothing.
sub parse_dir {
    my ($root_url, $root_directory, $sub_dir) = @_;

    # Put the directory name into a format suitable for display -
    # could be customised to include your own html
    my $sub_dir_formatted = $sub_dir eq "." ? "/" : "/$sub_dir/";
    print OF "<h2>$sub_dir_formatted</h2>\n";

    # get a sorted list of the html files in this directory
    my $dir_path = "$root_directory/$sub_dir";
    my $dh;
    unless (opendir($dh, $dir_path)) {
        warn "Cannot read directory $dir_path: $!\n";
        return;
    }
    # -f skips directories that merely happen to be named like a page
    my @html_files = sort grep { /\.html?\z/ && -f "$dir_path/$_" } readdir($dh);
    closedir($dh);

    for my $file (@html_files) {
        my $title = "";
        my $desc  = "";

        # open the page and parse it for title and description
        if (open(my $fh, '<', "$dir_path/$file")) {
            my $html = do { local $/; <$fh> };   # slurp the whole file
            close($fh);
            $html = "" unless defined $html;     # empty file

            # /s lets . cross newlines, so multi-line tags are matched
            if ($html =~ m{<title[^>]*>(.*?)</title>}is) {
                $title = $1;
            }
            # expects the name attribute before content, as the original did
            if ($html =~ m{<meta\s+name\s*=\s*(["'])description\1\s+content\s*=\s*(["'])(.*?)\2}is) {
                $desc = $3;
            }

            # collapse line breaks/indentation inside the extracted text
            for ($title, $desc) {
                s/\s+/ /g;
                s/^ //;
                s/ $//;
            }
        }
        else {
            # still list the page, just without title/description
            warn "Cannot read $dir_path/$file: $!\n";
        }

        # print the title, description to output file if we've found them.
        # again, your own html could be substituted here
        my $link_text = length($title) ? $title : $file;
        print OF "<a href=\"$root_url$sub_dir_formatted$file\">$link_text</a><br>\n";

        if (length($desc)) {
            print OF " $desc<br><br>\n";
        }
        else {
            print OF "<br>\n";
        }
    }

    return;
}

# Emits the opening HTML of the site map page (document head and the
# opening <body> tag) to the global OF filehandle. The title and
# description are placeholders - substitute your own.
sub header {
    my @markup = (
        '',
        '<html><head>',
        '<title>foo</title>',
        '<meta name="description" content="bar">',
        '</head><body>',
    );
    print OF join("\n", @markup) . "\n";
}

# Emits the closing HTML of the site map page (closing </body> and
# </html> tags) to the global OF filehandle.
sub footer {
    my $closing_markup = "\n" . '</body></html>' . "\n";
    print OF $closing_markup;
}

Sitemap generator

sugarkane

mivox

sugarkane

mivox

sugarkane

andytt

sugarkane

theperlyking

sugarkane

Join The Conversation

Moderators and Top Contributors

Hot Threads This Week