#!/usr/bin/perl
# pagehit - a perl script which Generates page ranking HTML

# ------------------------------------------------------------------------
# This perl script generates the HTML source code for the page ranking
# pages on my site. Pages are ranked according to the number of hits
# registered since 01-Jun-2002, so older pages have an advantage. When
# reading this source code you might also wish to view the HTML source
# code that this script produces. Here are three examples of page ranking
# URLs:
# 	http://www.pgts.com.au/pgtsj/pgtsj0211a.html
# 	http://www.pgts.com.au/download/download_stats.html
# 	http://www.pgts.com.au/download/humour/humour_stats.html
# 
# Normally I do not put many comments in my scripts. I usually only add
# enough to remind myself what I was thinking at the time I wrote it. This
# probably goes back to a deep seated prejudice that I have against
# interpreters. (See: http://www.pgts.com.au/pgtsj/pgtsj0205a.html) In the
# past I was always concerned about over-burdening an interpreter with too
# many comments. These concerns may have been relevant twenty years ago,
# but today's hardware has capacity to spare. So for the purpose of
# explaining how this script works I have included a plethora of comments.
# This is experimental, since I usually keep documentation (if there is
# any) separate from source code. Hopefully this will be sufficient to
# serve as documentation for this script.
# 
# This code is site specific. When it gets down to the nitty gritty, it is
# unlikely that anyone will write a generic script that deals with
# generation of web pages. The task is, by its nature, unique for each
# business and tends to be highly customised. Still, it might give you
# ideas for your own site.
# 
# I apologise to those of you who still prefer 80 character console mode
# systems. I still work at a text only console, but I have recently taken
# to using Linux distributions, which have a very nice font that results in
# a console that is 100 characters wide. This script won't look very
# pretty on a console that is restricted to 80 characters. You GUI people
# don't have to worry about those previous two sentences, however if you
# are fond of your mouse and X, you might like to try a recent version of
# Konqueror, which will detect the perl shebang (#!/usr/bin/perl), and
# display this page in pretty syntax-specific colours.
# 
# I have to warn you that if you are going to comprehend this code, you
# will need a working knowledge of basic perl regular expressions. You can
# learn about perl regular expressions from the "perlre" man page.
# 
# If you want to contact me, try the following URL:
# 	http://www.pgts.com.au/page04.html
# 
# Gerry Patterson (November Edition PGTS Journal)
# ------------------------------------------------------------------------

# We need this for "basename" (extract base file name from filespec)
use File::Basename;
# Filesystem location of the (apache) DocumentRoot; page URLs are appended
# to this to obtain the file to examine.
$WWWroot = '/MyDocumentRoot';
# Location of the master copy of the page-ranking HTML.
$pageSRC = '/My_cronfiles/pagehit.html';
# Month abbreviations indexed 1..12. Slot 0 holds the placeholder 'xxx',
# which should never appear in output.
@Mth = (
	'xxx',
	'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
	'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
);

# ------------------------------------------------------------------------
# This loop reads stdin. This is the output from the hits script which is
# used to parse the logfile.
# The hits perl script produces three columns:
# 	Col 1 is the type (Visitor/Robot)
# 	Col 2 is the number of hits
# 	Col 3 is the page (with the http://domain portion stripped)
# ------------------------------------------------------------------------
# Read the output of the hits script from stdin (or the files named on the
# command line) and accumulate visitor hits per page in %hit, keyed by the
# normalised page path. Reading stops at the first "Robot" line.
while (my $line = <>){
	# When we get to the list of Robots, we're finished ...
	last if ($line =~ /^Robot/);
	# Also, I'm not interested in how many visitors bookmarked me
	next if ($line =~ /favicon\.ico/);
	chomp $line;
	# This should not happen, since looking at robots.txt is usually
	# considered 'Robot' behaviour -- I left it, just in case ...
	next if ($line =~ /robots\.txt$/);
	# standard awk-style split into the three columns
	my ($type, $hits, $page) = split(' ', $line);
	# Skip blank or truncated lines. Without this check a missing page
	# column makes the file tests below examine $WWWroot itself, and the
	# hits would be credited to "/".
	next unless (defined $page && length $page);
	# The page comes from a log file, so refuse anything that tries to
	# climb out of the DocumentRoot with a ".." path segment.
	next if ($page =~ m{(?:^|/)\.\.(?:/|$)});
	my $fspec = $WWWroot . $page;
	# I am only interested in pages that exist on my server ...
	next unless (-r $fspec);
	# Also I am only interested in lines that have hits > 0
	next unless ($hits > 0);
	# If it's a directory, make sure there is a trailing '/'
	$page .= "/" if (-d $fspec);
	# Collapse every run of '/' to a single one (usually it's a typo - but
	# it still works). A plain s/\/\//\//g leaves "///" as "//", which would
	# create a second hash key for the same page.
	$page =~ s{/+}{/}g;
	# If we reach this statement - it's a valid page, so count it.
	$hit{$page} += $hits;
}

# ------------------------------------------------------------------------
# Now read the HTML source from the master file, $pageSRC. This contains the
# page ranking data between two marker comments as follows:
# <!-- PAGE HIT -->
# ... Data Goes Here ...
# <!-- End PAGE HIT -->
# The data that goes in between the two markers has all been created with
# this script, so we can rely on a reliable standard format. The data
# consists of table rows. Each row has four cells as follows:
# 1. Number of hits
# 2. Page Title/Description (actually a URL)
# 3. Author (or Category)
# 4. Date Published (for articles - creation date for other documents)
# 
# This next piece of code opens $pageSRC and reads each line, storing it
# in the array @HTMLsrc until it reaches the first marker. It then parses
# each row, extracting the contents of the cells. The URL is extracted
# from cell number 2, which is contained in a <a href> tag. The front
# portion of the URL is stripped off and the remaining string is used as a
# hash key, which is stored in the variable $url. There are four hashes,
# one for each cell, %hit, %title, %author and %date. When all the data
# has been extracted (i.e. we reach the second marker), the lines are once
# more pushed into @HTMLsrc.
# ------------------------------------------------------------------------
# Read the master file $pageSRC. Lines outside the two marker comments are
# kept verbatim in @HTMLsrc (the start marker itself is dropped, the end
# marker is kept). Lines between the markers are parsed as table rows and
# their cells are loaded into %hit, %title, %author and %date, keyed by the
# URL with the http://www.pgts.com.au prefix removed.
open (my $pagesrc, '<', $pageSRC) or die "Cannot open $pageSRC: $!";
my $found = 0;
while (my $line = <$pagesrc>){
	# start of data marker ...
	$found++ if ($line =~ /<!-- PAGE HIT -->/);
	# end of data marker ...
	$found = 0 if ($line =~ /<!-- End PAGE HIT -->/);
	# if we are not processing data, push it and loop
	unless ($found){
		push (@HTMLsrc, $line);
		next;
	}
	chomp $line;
	# split on the end of data tag for each cell (</td>)
	my @cell = split (/<\/td>/, $line);
	# if there is no URL, forget it ... The capture stops at the first
	# quote so that a quote later in the cell cannot extend the URL.
	next unless (defined $cell[1]
		&& $cell[1] =~ /http:\/\/www\.pgts\.com\.au([^"\s]+)"/);
	my $url = $1;
	# Now remove HTML tags from each field. Anything between '<' and the
	# next '>' is a tag, whatever characters it contains (including '~'
	# and '@' in an href).
	s/<[^<>]*>//g for @cell;
	# Load the four hashes with cell data. Hits are added to any count
	# already present for this URL.
	$hit{$url} += $cell[0];
	$title{$url} = $cell[1];
	$author{$url} = $cell[2];
	$date{$url} = $cell[3];
}
close ($pagesrc);

# ------------------------------------------------------------------------
# This next section of code loops through the @HTMLsrc array, printing
# each line to stdout, until we reach the first marker. Then we generate a
# data row for each key in the %hit hash. Each URL string arrives from the
# hits script with the leading portion of the URL stripped. If we are
# going to examine the actual file we need to add the DocumentRoot as a
# prefix. Note that if you use aliased directories this algorithm will fail.
# 
# This section of the algorithm is specific to my site. There are a few
# unique aspects to the structure of my site and this script relies on
# this structure. These are as follows:
# 
# 1. The articles for "The PGTS Journal" are all published in the "/pgtsj"
#    folder. This also serves as the archive folder. This means that after
#    an article is published it does not have to be moved to an archive
#    folder, because it is already in the archive folder. What changes is
#    a moving IndexIgnore directive in the .htaccess for the pages in the
#    current edition for the /pgtsj folder. After the issue moves to next
#    month this directive is rotated up the next issue (see naming
#    convention next).
# 
# 2. The articles in the journal all have a name in the format
#    pgtsjYYMMx.html. Where YY is the year, MM is the month and x is an
#    alphanumeric letter (starting at 'a'). This means that the date of
#    publication can be inferred from the name. Of course sometimes I miss
#    the intended publication date (1st of the month).
# 
# 3. As each issue rolls to the new month the front page that was the
#    current page is moved into the archive with a new name in the format
#    pgtsjYYMM.html and linked into the archives. The articles that it
#    references remain in the same location. This means that search
#    engines (like Google) don't get upset because files are moved around.
#    And yet on the face of it the articles "appear" to have been moved
#    into the archive area.
# 
# 4. The descriptions published in the .htaccess files have a single
#    unique description for each page. The entries in the humour section are
#    an exception, however. They have a description which corresponds to
#    the humour "Category".
# 
# With this knowledge the script can assume that a page that has "hits",
# but is not contained in the masterfile (i.e. there is an entry in %hit
# but not in %title) is a relatively new file. It looks first for the
# description in the .htaccess file. If it doesn't find it, then it sets
# the description to the "filename" (i.e. basename).
# ------------------------------------------------------------------------
# Print the page to stdout. Every line of @HTMLsrc is echoed unchanged; just
# before the end marker the start marker and one table row per key of %hit
# are printed. Missing titles, authors and dates are filled in from the
# .htaccess description, the file name, or the file itself.
foreach my $html (@HTMLsrc){
	# look for the second marker (1st one has been omitted from @HTMLsrc)
	if ($html =~ /<!-- End PAGE HIT -->/ ){
		# Now print the data for the page stats
		print "<!-- PAGE HIT -->\n";
		# Sort in reverse hits order (highest first). Equal counts are
		# ordered by URL so that the output is the same on every run.
		foreach my $url (sort { $hit{$b} <=> $hit{$a} || $a cmp $b } keys %hit){
			# full filespec is Document Root + URL
			my $fspec = "$WWWroot$url";
			# get the basename and its directory
			my ($name,$path) = fileparse($fspec);
			# is a new file?
			unless ($title{$url}){
				# yes it is, look for entry in .htaccess
				if ( open (my $htaccess, '<', "$path/.htaccess") ){
					# \Q..\E makes the file name match literally, so the
					# '.' in "page.html" cannot match any character.
					my @desc = map { /^AddDescription\s*\"(.*)\"\s*\Q$name\E$/ ? $1 : () } <$htaccess>;
					close ($htaccess);
					# only trust the description when it is unambiguous
					$title{$url} = $desc[0] if (@desc == 1);
				}
				# If it's a directory set the description to the URL
				# as with all these, this can be edited later ...
				$title{$url} = $url if (-d $fspec);
				$title{$url} = $name unless ($title{$url});
				if ( -f $fspec && $name =~ /^pgtsj([0-9]*)/){
					my $digits = $1;
					# Derive the date from the filename for articles in
					# the Journal, but only when the digits are a
					# plausible YYMM. Otherwise the date is left unset
					# and is taken from the file further down.
					if ($digits =~ /^([0-9][0-9])(0[1-9]|1[0-2])$/){
						$date{$url} = "01-" . $Mth[$2] . "-20" . $1;
					}
					# G. Patterson is default author for journal
					$author{$url} = 'G. Patterson';
				}
				# humour files have category humour
				elsif ( -f $fspec && $path =~ /\/humour\/$/){
					$title{$url} = $name;
					$author{$url} = "Humour";
				}
			}
			$title{$url} =~ s/\s*$//;
			unless ($author{$url}){
				# Directories have the default category "Directory"
				if ( -d $fspec){
					$author{$url} = "Directory";
					$date{$url} = '-';
				}
			}
			# Is the date missing?
			unless ($date{$url}){
				my @s = stat($fspec);
				if (@s){
					# use the file's modification time
					my @t = localtime($s[9]);
					$date{$url} = sprintf "%02d-%s-%04d",
							$t[3],$Mth[$t[4] + 1],$t[5] + 1900;
				}
				else {
					# The file cannot be examined (e.g. it is listed in the
					# master file but has since been removed). Passing the
					# failed stat to localtime would print 01-Jan-1970.
					$date{$url} = '-';
				}
			}
			# print the row
			printf "<tr><td align=right>%d</td><td><a href=\"http://www.pgts.com.au%s\">%s</a></td><td>%s</td><td>%s</td></tr>\n",
				$hit{$url},$url,$title{$url},$author{$url},$date{$url};
		}
	}
	# print the line from @HTMLsrc
	print $html;
}

# ------------------------------------------------------------------------
# I have not included any notifications about copyright or Public Licenses
# etc. That's because the script is customised for my own site. It would
# need considerable modification to work at another site. Still the
# principles could be applied to most businesses.
# 
# In actual fact, I didn't put all these comments into the script. I
# simply included a special comment that has format as follows:
# # include file_name
# 
# And when the script is copied to my publication area, it is sent via
# another perl script which then replaces such statements with all the
# statements in "file_name", prepending a "#" to each line. It also makes
# a few changes to directories ... You didn't think that I really would
# use names like /MyDocumentRoot did you? The main advantage of keeping
# the documentation in separate text files was that it allowed me to
# format it, change it about easily and spell check it.
# 
# Or, maybe it was just that I still have this prejudice about comments
# and interpreters ...
# ------------------------------------------------------------------------