#!/usr/bin/perl
# suspects - search for crawlers
# Gerry Patterson, Nov 2002
use Time::Local;
use Getopt::Std;
require "/Mypath/agent_id";
require "/Mypath/agent_data";

# options:
#   -D  Debug level, where the level = suspicion index (or -1)
#   -l  print a list of suspects on the console
#   -u  run SQL to Update the agents data
#   -U  run SQL to Update the robots data (month end)

# init variables
# Month tables: @Mth maps a month number (0-11) to its three-letter name,
# %mth is the reverse lookup used when parsing log timestamps.
@Mth = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
%mth = map { ($Mth[$_] => $_) } (0 .. $#Mth);
# Page Types
($NULL,$IMAGE,$BIN,$ROBOT,$HTML,$TXT) = (0 .. 5);

# Referer Types
($ANON,$LOCAL,$REMOTE) = (0 .. 2);
# -D takes a value; any other switch is accepted as a boolean $opt_X
getopt('D');
# force the debug level to a number (0 when -D was not given)
$opt_D = 0 + ($opt_D || 0);

# ------------------------------------------------------------------------

# Format an epoch time as a log-style string, dd/Mon/yyyy:hh:mm:ss,
# in the local timezone.  The month name is looked up in the global @Mth.
sub time_str{
	my ($sec,$min,$hour,$mday,$mon,$year) = localtime($_[0]);
	return sprintf("%02d/%s/%04d:%02d:%02d:%02d",
		$mday,$Mth[$mon],1900 + $year,$hour,$min,$sec);
}

# ------------------------------------------------------------------------

# Format an epoch time as "yyyy-mm-dd hh:mm:ss" (local timezone), the form
# used for timestamp values in the generated SQL.
sub psql_time{
	my ($sec,$min,$hour,$mday,$mon,$year) = localtime($_[0]);
	my $date  = sprintf("%04d-%02d-%02d",$year + 1900,$mon + 1,$mday);
	my $clock = sprintf("%02d:%02d:%02d",$hour,$min,$sec);
	return("$date $clock");
}

# ------------------------------------------------------------------------

# Convert a "yyyy-mm-dd hh:mm:ss[+zz]" string to an epoch time.
# A trailing "+digits" zone suffix is dropped; the remaining fields are
# interpreted in the local timezone by timelocal().
sub time_psql{
	(my $stamp = $_[0]) =~ s/\+[0-9]+$//;
	my ($date,$clock) = split(' ',$stamp);
	# build (sec,min,hour,mday,mon,year) in the order timelocal() expects
	my @fields = reverse(split(/-/,$date));
	unshift(@fields,reverse(split(/:/,$clock)));
	# timelocal() wants a zero-based month
	$fields[4] -= 1;
	return timelocal(@fields);
}

# ------------------------------------------------------------------------

# Return the first two dot-separated fields of a dotted quad ("a.b"),
# i.e. its 16-bit network prefix.
sub sub16{
	my ($first,$second) = split(/\./,$_[0]);
	return(join('.',$first,$second));
}

# ------------------------------------------------------------------------

# Load the unconfirmed rows of robot_suspects into the global %NonSuspects.
# Each key is an IP address; the value is a flat array of (start,end)
# epoch-time pairs, one pair per row returned for that address.
# chomp_sql() is defined outside this file -- it is assumed to return one
# tab-separated string per result row.
sub LoadNonSuspects{
	for my $row (chomp_sql("select ip_addr,start_time,end_time from robot_suspects where not confirmed;")){
		my ($ip,$from,$to) = split(/\t/,$row);
		push (@{$NonSuspects{$ip}},time_psql($from),time_psql($to));
	}
}

# ------------------------------------------------------------------------

# Decide whether a hit is exempt from the suspect list.
# Arguments: IP address, epoch time of the hit.
# Returns 1 when the time falls inside (bounds included) any of the
# (start,end) windows stored for that IP in %NonSuspects, otherwise 0.
sub non_suspect{
	my ($ip,$when) = @_;
	my $windows = $NonSuspects{$ip};
	return(0) unless ($windows);
	# windows are stored as a flat list: start0,end0,start1,end1,...
	for (my $k = 0; $k < @{$windows}; $k += 2){
		my ($from,$to) = ($windows->[$k],$windows->[$k+1]);
		return(1) if ($from <= $when && $when <= $to);
	}
	return(0);
}

# ------------------------------------------------------------------------

# Walk the hits recorded for one IP address (the global $IP), group them
# into sessions and score every session with a bit-mapped suspicion index.
#
# Arguments: a list of array refs, one per hit, each holding
#            (hit time, agent id, page type, referer type).
# Returns:   a flat list with three values per closed session:
#            (suspicion index, session start, time of the previous hit).
#
# A session is closed when the gap to the previous hit exceeds 14400
# seconds, or when the last hit of the list is reached.
#
# Suspicion bits:
#    1  exactly one agent in the session and it is flagged in @SusAgent
#    2  more than 10 hits with either no images or nothing but images
#       (cleared again when the single agent is a text-only browser)
#    4  more than one HTML page, no images, no referer on any hit
#    8  robots.txt was requested
#   16  IP already in %SusIP, no images, no referer on any hit
#   32  more than 10 hits at a rate above 1.4 hits per second
#
# Globals read:    $IP, $opt_D, @TxtBrowser, page/referer type constants.
# Globals updated: @SusAgent, @non_suspect, @suspect, %SusIP, %HitIP, and
#                  the per-session scratch variables $prev_time, @Ptype,
#                  @Rtype, $hits, @session_agents, $suspicion, $duration.
#
# NOTE(review): a hit that matches non_suspect() is skipped with 'next',
# which also skips the end-of-list check -- if the final hit is exempt the
# open session is never reported.  Left as is; confirm whether intended.
sub FindCrawler{
	my $session_start;
	my @TD = @_;
	my @result;
	for (my $Ndx = 0; $Ndx <= $#TD; $Ndx++){
		my @v = @{$TD[$Ndx]};
		printf "%-15s %s %s %s\n", $IP,time_str($v[0]),$v[2],$v[3] if ($opt_D == -1);
		unless ($session_start){
			# first hit of a new session: reset the per-session counters
			$prev_time = $session_start = $v[0];
			@Ptype = (0,0,0,0,0,0);
			@Rtype = (0,0,0,0,0,0);
			$hits = 0;
			undef @session_agents;
			$suspicion = 0;
		}
		if (non_suspect($IP,$v[0])){
			# exempt hit: clear the agent's suspect flag and do not count it
			$SusAgent[$v[1]] = 0;
			$non_suspect[$v[1]]++;
			next;
		}
		$session_agents[$v[1]]++;
		$hits++;
		$Ptype[$v[2]]++;
		$Rtype[$v[3]]++;
		$HitIP{$IP}++;
		if ( ( ($v[0] - $prev_time) > 14400) || $Ndx == $#TD){
			# round up the agent_ids
			my @Sagent;
			for my $i(0 .. $#session_agents){push(@Sagent,$i) if ($session_agents[$i])};
			# now, check whether previous session is a crawler suspect
			$duration = $prev_time - $session_start;
			$suspicion |= 1 if (@Sagent == 1 && $SusAgent[$Sagent[0]]);
			$suspicion |= 2 if ($hits>10&&($Ptype[$IMAGE]==0 || $Ptype[$IMAGE]==$hits));
			$suspicion &= 253 if (@Sagent==1 && $TxtBrowser[$Sagent[0]]);
			$suspicion |= 4 if ($Ptype[$HTML]>1 && $Ptype[$IMAGE]==0 && $Rtype[$ANON]==$hits);
			$suspicion |= 8 if ($Ptype[$ROBOT]);
			$suspicion |= 16 if ($SusIP{$IP} && $Ptype[$IMAGE]==0 && $Rtype[$ANON]==$hits);
			# A burst whose hits all carry the same timestamp has zero
			# duration; dividing would abort the whole run, so treat it
			# as exceeding the rate limit instead.
			$suspicion |= 32 if ($hits > 10 &&
				($duration == 0 || ($hits/$duration) > 1.4));
			# include option to Debug suspicion index
			printf "%d %-15s %s %s %d %d %d %d\n", $suspicion, $IP,
				time_str($session_start),
				time_str($v[0]),$Ptype[$IMAGE],$hits,$SusAgent[$Sagent[0]],$Sagent[0]
				if ($suspicion & $opt_D || $opt_D == -1);
			push (@result,$suspicion,$session_start,$prev_time);
			undef $session_start;
			# remember the IP and credit each agent seen in the session
			$SusIP{$IP} += $hits if ($suspicion);
			for my $i (0 .. $#session_agents){
				$suspect[$i] += $session_agents[$i]
					if ($suspicion && $session_agents[$i]);
				$non_suspect[$i] += $session_agents[$i]
					if ($suspicion == 0 && $session_agents[$i]);
			}
		}
		$prev_time = $v[0];
	}
	return (@result);
}

# ------------------------------------------------------------------------

# Parse one line of an Apache 'combined' format access log.
#
# Argument: the raw log line.
# Returns:  (remote host, epoch time, request, status, bytes, referer,
#           agent), or an empty list when the remote host is on the
#           exclusion list below.
# Dies when the line fails the sanity check.
#
# The timestamp is converted with timelocal(), i.e. the wall-clock fields
# are interpreted in the local timezone; the zone offset in the log is only
# used by the sanity check.
sub parse_log{
	my $line = $_[0];
	my @w = split ( ' ', $line);
	# First perform a sanity check
	# My log uses the 'combined' format as follows:
	# remotehost login authuser [date] "request" status bytes "Referer" "Agent"
	# where: remotehost = IP address
	#        [date]     = timestamp and tz (always +1000 or +1100 for VIC)
	#        login      = remote login as per RFC931 (always -)
	#        authuser   = authenticated username (always -)
	#        "request"  = request cmd sent from the remote agent
	#        status     = numeric status returned by apache
	#        bytes      = number of bytes transmitted
	#        "Referer"  = URL of the Referer
	#        "Agent"    = name of the remote user agent
	# This will have to be customised for individual sites
	die "Error insane at line $.:$line" unless
		( $w[0] =~ /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ &&
		  $w[1] =~ /-/ && $w[2] =~ /-/ &&
		  $w[4] =~ /^\+1[01]00\]$/);
	# Exclude these subnets/addresses.  The patterns are anchored so that
	# e.g. 211.2.3.7 or 127.0.0.10 are not dropped by accident.
	return() if ( $w[0] =~ /^127\.0\.0\.1$/    ||
		      $w[0] =~ /^1\.2\.3\./        ||
		      $w[0] =~ /^1\.2\.3\.4[01]$/ );
	# remove the '[' from the date and convert to timestamp with timelocal()
	$w[3] =~ s/^\[//;
	my @t = split(/:/,$w[3]);
	my @d = split( /\//,shift( @t) );
	# (sec,min,hour) from the reversed clock fields, then mday, month, year
	my @Htime = (reverse(@t),$d[0],$mth{$d[1]},$d[2] - 1900);
	my $ltime = timelocal(@Htime);
	# Original HTML cmd, referer and agent are all enclosed in '"'.
	# Split the argument itself rather than relying on the caller's $_.
	my @q = split( '"',$line);
	# extract the status and size
	$q[2] =~ s/^\s+//;
	my @t1 = split(' ',$q[2]);
	return($w[0],$ltime,$q[1],$t1[0],$t1[1],$q[3],$q[5]);
}

# ------------------------------------------------------------------------

# Main pass over the access log (read from the files named on the command
# line, or STDIN).  For every accepted hit it:
#   - classifies the requested page and the referer,
#   - assigns the agent string a numeric id (index into @agent_array),
#   - updates the per-agent counters @Thit (all hits), @Phit (hits other
#     than NULL/IMAGE pages), the per-agent IP set @AgentIP and the
#     per-agent hit times @AgentHit,
#   - for agents not flagged in @RobotArray, appends the hit to
#     %IPdata{IP} as [time, agent id, page type, referer type].
LoadNonSuspects();
# Get a list of text-only browsers
@t = chomp_sql("select agent_id from webagents where text_only;\n");
foreach $i(@t){ $TxtBrowser[$i]++};
# Search for robot suspects
while(<>){
	# parse_log() returns an empty list for excluded hosts
	next unless (($IP,$htime,$cmd,$status,$bytes,$referer,$agent) = parse_log($_));
	# second word of the request is the requested path
	@t = split( ' ',$cmd);
	$page = $t[1];
	unless ($page){
		$page_type = $NULL;
	}
	elsif ($page =~ /\/icons\// || $page =~ /\/images\// ){
		$page_type = $IMAGE;
	}
	elsif ($page =~ /\.exe$/ || $page =~ /\.gz$/ || $page =~ /\.msi$/ || $page =~ /\.zip$/){
		$page_type = $BIN;
	}
	elsif ($page =~ /robots\.txt$/ ){
		$page_type = $ROBOT;
	}
	elsif ($page =~ /\.txt$/ || $page =~ /-HOWTO$/ ){
		$page_type = $TXT;
	}
	else {
		$page_type = $HTML;
	}
	if (length($agent) < 2){
		# it's a worm or an attack -- forget it (for now)
		# Some time (when I get round to it), I will put some code here
		# to analyse behaviour patterns of Microsoft worms
		$agent = "-";
		next;
	}
	# Look up the agent id, registering the agent on first sight.  The
	# test must be for definedness: id 0 is a valid index and would
	# otherwise be re-registered under a new id on every hit.
	$agent_id = $AgentID{$agent};
	unless (defined $agent_id){
		push (@agent_array, $agent);
		$agent_id = $AgentID{$agent} = $#agent_array;
	}
	if ($referer =~ /pgts\.com\.au/ || $referer =~ /^\//){
		$ref_type = $LOCAL;
	}
	elsif (length($referer) > 3){
		$ref_type = $REMOTE;
	}
	else {
		$ref_type = $ANON;
	}
	$Phit[$agent_id]++ unless ( $page_type == $NULL || $page_type == $IMAGE);
	$Thit[$agent_id]++;
	${$AgentIP[$agent_id]}{$IP}++;
	if ($RobotArray[$agent_id]){
		# already flagged in @RobotArray: record the hit time only
		push (@{$AgentHit[$agent_id]},$htime);
		next;
	}
	if ($AgentHit[$agent_id]){
		# Alias the IP, if hit with same subnet/agent_string within 30 sec
		# This looks like a kludge, because it is a kludge -- too bad!
		my @H = @{$AgentHit[$agent_id]};
		my @I = @{$AliasIP[$agent_id]};
		if ( sub16($IP) eq sub16($I[$#I]) && ($htime - $H[$#H]) < 30){
			$IP = $I[$#I];
		}
	}
	push (@{$AliasIP[$agent_id]},$IP);
	push (@{$AgentHit[$agent_id]},$htime);
	my @v = ($htime,$agent_id,$page_type,$ref_type);
	push (@{$IPdata{$IP}},\@v);
}
# Flag suspicious agents in @SusAgent.  An agent is flagged when it made
# more than one hit, is not a text-only browser, and either every hit
# counted as a page hit or none did.
foreach $agent_id (0 .. $#Thit){
	my $total = $Thit[$agent_id];
	my $pages = $Phit[$agent_id];
	next if (!($total > 1));
	# a mix of page and non-page hits looks like an ordinary browser
	if ($pages > 0 && $total != $pages){
		next;
	}
	# number of distinct addresses the agent was seen from
	my $nIP = keys %{$AgentIP[$agent_id]};
	next if ($TxtBrowser[$agent_id]);
	if ($opt_D == -1){
		printf "%5d %5d %5d %s\n",$Thit[$agent_id],$Phit[$agent_id],
			$nIP,$agent_array[$agent_id];
	}
	$SusAgent[$agent_id]++;
}
# the alias table is not needed after the log scan
undef @AliasIP;
# Score every IP address with FindCrawler() and merge runs of consecutive
# suspicious sessions into (IP, start, end) triples appended to @SusData.
foreach $IP (sort keys %IPdata){
	# FindCrawler returns (index, start, end) for each session of this IP
	my @behave = FindCrawler(@{$IPdata{$IP}});
	if (@behave >= 4){
		# several sessions: merge adjacent suspicious ones into one span
		undef $Start;
		my $last = $#behave;
		my $i = 0;
		while ($i < $last){
			if (!$behave[$i]){
				# a clean session ends any span that is still open
				push (@SusData,$IP,$Start,$behave[$i+1]) if ($Start);
				undef $Start;
			}
			else {
				$Start = $behave[$i+1] unless ($Start);
				# the final session closes the span itself
				push (@SusData,$IP,$Start,$behave[$i+2]) if ($i > $last - 3);
			}
			$i += 3;
		}
	}
	elsif ($behave[0]){
		# a single suspicious session is reported as is
		push (@SusData,$IP,$behave[1],$behave[2]);
	}
}
if ($opt_l){
	# -l: show every (IP, start, end) triple of @SusData on the console
	my $i = 0;
	while ($i < $#SusData){
		$IP = $SusData[$i];
		my ($t1,$t2) = map { time_str($_) } @SusData[$i+1,$i+2];
		printf "%-15s %s %s\n", $IP,$t1,$t2;
		$i += 3;
	}
}
if ($opt_u || $opt_U){
	# -u / -U: replace the confirmed rows of robot_suspects with the
	# triples collected in @SusData.  The SQL is written to the OTMP
	# handle; opentmp_sql()/runtmp_sql() are defined outside this file and
	# are assumed to open that handle and to execute the script.
	my $tmpfile = opentmp_sql();
	print OTMP "delete from robot_suspects where confirmed;\n";
	my $i = 0;
	while ($i < $#SusData){
		$IP = $SusData[$i];
		my ($t1,$t2) = map { time_str($_) } @SusData[$i+1,$i+2];
		print OTMP "insert into robot_suspects values ('$IP','$t1','$t2','t');\n";
		$i += 3;
	}
	runtmp_sql($tmpfile);
}
# produce SQL to update the agents table
# For every agent seen in this run: agents whose id is within @AgentName
# get an UPDATE of their hit count, address list and last visit; all other
# agents get an INSERT of a new webagents row.  The SQL is written to the
# OTMP handle and run through runtmp_sql() (both come from outside this
# file).
if ($opt_u || $opt_U){
	my $tmpfile = opentmp_sql();
	# Make a value safe inside a single-quoted SQL string literal.
	# Agent strings come straight from the access log and are untrusted:
	# backslashes are doubled and single quotes are doubled, so that
	# neither can terminate the literal early.
	# NOTE(review): on servers running with standard_conforming_strings
	# enabled a doubled backslash is stored as two characters -- confirm
	# the server setting.
	my $sql_esc = sub {
		my $s = defined($_[0]) ? $_[0] : "";
		$s =~ s/\\/\\\\/g;
		$s =~ s/'/''/g;
		return($s);
	};
	for my $i(0 .. $#agent_array){
		# page hits normally; total hits for agents flagged in @RobotArray
		my $hits = $Phit[$i] + 0;
		$hits = $Thit[$i] + 0 if ($RobotArray[$i]);
		next unless ($AgentIP[$i]);
		my %I = %{$AgentIP[$i]};
		my $IPaddr = join(' ',sort keys %I);
		# too many addresses to list: store just the count, as "(n)"
		$IPaddr = sprintf('(%d)',keys(%I)+0) if (length($IPaddr)>2048);
		my @H = @{$AgentHit[$i]};
		next unless (@H);
		my $utime = psql_time($H[$#H]);
		my $ctime = psql_time($H[0]);
		die "Missing agent stats" unless ($utime);
		if ($i <= $#AgentName){
			print OTMP "update webagents set hit00 = $hits,";
			print OTMP "ip_addr = '$IPaddr',update_date = now(),";
			print OTMP "last_visit = '$utime'";
			print OTMP "\nwhere agent_id = $i;\n";
			next;
		}
		# new agent: derive name, version and OS from the agent string
		my $name = which_browser($agent_array[$i]);
		my $version = "";
		if ($name =~ /(.*)\s+([0-9.]+)$/){
			$name = $1;
			$version = $2;
		}
		my $os = which_OS($agent_array[$i]);
		my $agent_str = $agent_array[$i];
		$agent_str =~ s/\t/&nbsp;/g;
		# escape every string that is derived from the agent string
		$agent_str = $sql_esc->($agent_str);
		$name      = $sql_esc->($name);
		$version   = $sql_esc->($version);
		$os        = $sql_esc->($os);
		print OTMP "insert into webagents(agent_id,agent_string,name,version,os,";
		print OTMP "ip_addr,hit00,last_visit,create_date,update_date,robot_ind) ";
		print OTMP "values ($i,'$agent_str','$name','$version','$os',";
		print OTMP "'$IPaddr',$hits,'$utime','$ctime',now(),0);\n";
	}
	print OTMP "update webagents set text_only = 't' where name in ('Links','Lynx','Elinks','W3m');\n";
	runtmp_sql($tmpfile);
}
if ($opt_U){
	# Give eligible suspects a permanent entry in the robots file
	# NOTE: This usually runs at month-end 
	# (and maybe manually when a few new and obvious robots visit)
	#
	# An agent is eligible when it is not already flagged in @RobotArray,
	# was never counted as non-suspect and was counted as suspect at least
	# once.  The SQL is only written to the OTMP handle: the runtmp_sql()
	# call at the end is commented out, so nothing is executed from here.
	my $tmpfile = opentmp_sql();
	# Make a value safe inside a single-quoted SQL string literal: the
	# e-mail, URL and name below are cut out of the (untrusted) agent
	# string.  Backslashes and single quotes are doubled.
	# NOTE(review): with standard_conforming_strings enabled a doubled
	# backslash is stored as two characters -- confirm the server setting.
	my $sql_esc = sub {
		my $s = defined($_[0]) ? $_[0] : "";
		$s =~ s/\\/\\\\/g;
		$s =~ s/'/''/g;
		return($s);
	};
	for my $i(0 .. $#agent_array){
		next if ($RobotArray[$i]);
		next if ($non_suspect[$i]);
		next unless ($suspect[$i]);
		my %I = %{$AgentIP[$i]};
		my @H = @{$AgentHit[$i]};
		my $name2 = which_robot($agent_array[$i]);
		if ($name2 eq "unknown"){
			# next if (@H < 10);
			# nearly one address per hit: skip this agent
			next if (@H > 1 && ((keys(%I)+0)/(@H+0)) > 0.8);
			# Name an unrecognised robot after the date of its first
			# hit, as "uYYMMDD".  localtime() needs the epoch time (not
			# the formatted string) and already returns the year as an
			# offset from 1900; the result must land in $name2, which is
			# the variable written to the database below.
			my @T = localtime($H[0]);
			$name2 = sprintf "u%02d%02d%02d",$T[5] % 100,$T[4]+1,$T[3];
		}
		my $version = "";
		my $email = "";
		my $url = "";
		my $x = $agent_array[$i];
		# pull a contact URL and e-mail address out of the agent string
		if ($x =~ /(http:\S*)/){
			$url = $1;
		}
		elsif ($x =~ /(www\.\S*)/) {
			$url = 'http://' . $1;
		}
		$url =~ s/\W+$//;
		$email = $1 if ($x =~ /(\S+\@\S+)/);
		$email =~ s/\W+$//;
		$email =~ s/^\W+//;
		# split a trailing version number off the robot name
		if ($name2 =~ /(.*)\s+([0-9.]+)$/){
			$name2 = $1;
			$version = $2;
		}
		$email   = $sql_esc->($email);
		$url     = $sql_esc->($url);
		$name2   = $sql_esc->($name2);
		$version = $sql_esc->($version);
		print OTMP "insert into webrobots values(";
		print OTMP "$i,'$email','$url','f','t',now(),now());\n";
		print OTMP "update webagents set robot_ind = -1,name = '$name2',version = '$version',update_date = now() where agent_id = $i;\n";
	}
	# runtmp_sql($tmpfile);
}
