
duncf at apache
Mar 7, 2004, 3:28 PM
Post #1 of 1
(33 views)
Permalink
|
|
svn commit: rev 7043 - incubator/spamassassin/trunk/tools
|
|
Author: duncf Date: Sun Mar 7 14:28:21 2004 New Revision: 7043 Modified: incubator/spamassassin/trunk/tools/sa-stats.pl Log: Bug 2988: Patch from Bob Apthorpe for sa-stats.pl Modified: incubator/spamassassin/trunk/tools/sa-stats.pl ============================================================================== --- incubator/spamassassin/trunk/tools/sa-stats.pl (original) +++ incubator/spamassassin/trunk/tools/sa-stats.pl Sun Mar 7 14:28:21 2004 @@ -22,19 +22,24 @@ use strict; -#Configuration section +# Configuration section my %opt = (); -$opt{'logfile'} = '/var/log/maillog'; # Log file +$opt{'logfile'} = '/var/log/maillog'; # Log file $opt{'sendmail'} = '/usr/sbin/sendmail'; # Path to sendmail stub $opt{'from'} = 'SpamAssassin System Admin'; # Who is the mail from $opt{'end'} = ""; $opt{'start'} = "today"; +my $diag = ''; +$diag .= "Default options:\n" . join('', map { "opt{$_} => " . ($opt{$_} || '<undefined>') . "\n" } (sort keys %opt)); +# &vdbg($diag); # This won't work until *after* getopt() is called. Duh. + ########################################################## ############# Nothing to edit below here ################# ########################################################## my $VERSION = '$Id$'; +my ($VER_NUM) = '$Revision: 6256 $' =~ m#\$Revision:\s+(\S+)#o; # internal modules (part of core perl distribution) use Getopt::Long; @@ -44,7 +49,11 @@ use Date::Manip; use Parse::Syslog; -my $tstart = time; +my %timing = (); +$timing{'start'} = 0; +$timing{'end'} = 0; +$timing{'hrsinperiod'} = 0; +$timing{'telapsed'} = time; Getopt::Long::Configure("bundling"); GetOptions('logfile|l=s' => \$opt{'logfile'}, @@ -54,6 +63,8 @@ 'debug|D' => \$opt{'debug'}, 'userstats|u' => \$opt{'userstats'}, 'verbose|v' => \$opt{'verbose'}, + 'html|H' => \$opt{'html'}, + 'top|T:25' => \$opt{'topusers'}, 'help|h' => \$opt{'help'}, 'version|V' => \$opt{'version'}, 'start|s=s' => \$opt{'start'}, @@ -69,31 +80,45 @@ exit 0; } -#Local variables -my $mean_spam_score = 0; -my $mean_spam_time = 0; -my $mean_spam_bytes = 0; - -my $mean_ham_score = 0; -my $mean_ham_time = 0; -my $mean_ham_bytes = 0; +# No point in specifying topusers w/o specifying userstats; coerce -u if -T +if ($opt{'topusers'}) { + $opt{'userstats'} = 1; +} + +$diag .= "\nUser options:\n" . join('', map { "opt{$_} => " . ($opt{$_} || '<undefined>') . "\n" } (sort keys %opt)); +&vdbg($diag . "\n"); + +# Local variables -my %stats = (); # %stats is a multidimensional hash with the following structure: -# $stats{'total'}{'bytes'} -# $stats{'total'}{'count'} -# $stats{'total'}{'time'} -# $stats{'total'}{'threshold'} -# $stats{'ham'}{'bytes'} -# $stats{'ham'}{'count'} -# $stats{'ham'}{'time'} -# $stats{'ham'}{'score'} -# $stats{'ham'}{'byhour'} -# $stats{'spam'}{'bytes'} -# $stats{'spam'}{'count'} -# $stats{'spam'}{'time'} -# $stats{'spam'}{'score'} -# $stats{'spam'}{'byhour'} +my %stats = (); + +$stats{'spam'}{'mean_score'} = 0; +$stats{'spam'}{'mean_time'} = 0; +$stats{'spam'}{'mean_bytes'} = 0; + +$stats{'ham'}{'mean_score'} = 0; +$stats{'ham'}{'mean_time'} = 0; +$stats{'ham'}{'mean_bytes'} = 0; + +$stats{'total'}{'bytes'} = 0; +$stats{'total'}{'count'} = 0; +$stats{'total'}{'time'} = 0; +$stats{'total'}{'score'} = 0; +# $stats{'total'}{'byhour'}{$hr} +$stats{'total'}{'threshold'} = 0; + +$stats{'ham'}{'bytes'} = 0; +$stats{'ham'}{'count'} = 0; +$stats{'ham'}{'time'} = 0; +$stats{'ham'}{'score'} = 0; +# $stats{'ham'}{'byhour'}{$hr} + +$stats{'spam'}{'bytes'} = 0; +$stats{'spam'}{'count'} = 0; +$stats{'spam'}{'time'} = 0; +$stats{'spam'}{'score'} = 0; +# $stats{'spam'}{'byhour'}{$hr} my %userstats = (); # $userstats{$recipient}{'total'}{'bytes'} @@ -108,32 +133,47 @@ # $userstats{$recipient}{'spam'}{'time'} # $userstats{$recipient}{'spam'}{'score'} -my ($start, $end) = parse_arg($opt{'start'}, $opt{'end'}); +# my ($start, $end) = parse_arg($opt{'start'}, $opt{'end'}); +($timing{'start'}, $timing{'end'}) = parse_arg($opt{'start'}, $opt{'end'}); + +&vdbg("Timing:\nstart = $timing{'start'} " . UnixDate("epoch " . $timing{'start'}, '%C') + . "\n end = $timing{'end'} " . UnixDate("epoch " . $timing{'end'}, '%C') . "\n\n"); -die "Can't find " . $opt{'logfile'} . " $!\n" unless (-e $opt{'logfile'}); +die "Can't find " . $opt{'logfile'} . " $!\n" unless (-e $opt{'logfile'} || ($opt{'logfile'} eq '-')); + +my $logyear = UnixDate("epoch " . $timing{'start'}, "%Y"); +my $logmonth = UnixDate("epoch " . $timing{'start'}, "%m") - 1; + +&vdbg("Creating log parser: Parse::Syslog->new(" . $opt{'logfile'} + . ", year => $logyear, _last_mon => $logmonth,)\n" + . "Note that _last_mon = month - 1\n\n"); my $parser = Parse::Syslog->new( $opt{'logfile'} , - year => UnixDate("epoch $start", "%Y"), - _last_mon => UnixDate("epoch $start", "%m") - 1); -# Hack for end-of-year support -- sets _last_mon to current month (0 based, not 1 based) + year => $logyear, + _last_mon => $logmonth,); +# Hack for end-of-year support -- sets _last_mon to current month (0 based, not +# 1 based) + +&vdbg("##### Entering parseloop:\n\n"); parseloop: while (my $sl = $parser->next) { + &vdbg('Found log entry at ' . $sl->{'timestamp'} . ' for ' . $sl->{'program'} . ' containing ' . $sl->{'text'} . "\n"); next parseloop unless ($sl->{'program'} eq 'spamd'); if ($sl->{'text'} =~ m/ (clean\smessage|identified\sspam)\s # Status \(([-0-9.]+)\/([-0-9.]+)\)\s # Score, Threshold for\s - ([^:]+):\d+\s # for daf:1000 + ([^:]+):\d+\s # for daf:1000 in\s ([0-9.]+)\sseconds,\s+ ([0-9]+)\sbytes\. /x) { # discard records outside defined analysis interval - next parseloop if ($sl->{'timestamp'} < $start); + next parseloop if ($sl->{'timestamp'} < $timing{'start'}); # We can assume that logs are chronological - last parseloop if ($sl->{'timestamp'} > $end); + last parseloop if ($sl->{'timestamp'} > $timing{'end'}); my $status = $1; my $score = $2; @@ -142,14 +182,14 @@ my $time_processed = $5; my $bytes_processed = $6; - dbg("Found: " . $sl->{'text'} . "\n"); - dbg(" tstamp : " . $sl->{'timestamp'} . "\n"); - dbg(" status: $status\n"); - dbg(" score : $score\n"); - dbg(" thresh: $threshold\n"); - dbg(" recip : $recipient\n"); - dbg(" time : $time_processed\n"); - dbg(" bytes : $bytes_processed\n\n"); + dbg("Found: " . $sl->{'text'} . "\n"); + dbg(" tstamp : " . $sl->{'timestamp'} . "\n"); + dbg(" status: $status\n"); + dbg(" score : $score\n"); + dbg(" thresh: $threshold\n"); + dbg(" recip : $recipient\n"); + dbg(" time : $time_processed\n"); + dbg(" bytes : $bytes_processed\n\n"); my $clean_recipient = lc($recipient); $clean_recipient =~ s#\+[^@]*(@?)#$1#; @@ -178,8 +218,10 @@ # Total score $stats{'total'}{'count'}++; + $stats{'total'}{'score'} += $score; $stats{'total'}{'bytes'} += $bytes_processed; $stats{'total'}{'time'} += $time_processed; + $stats{'total'}{'byhour'}{$abshour}++; $stats{'total'}{'threshold'} += $threshold; if ($opt{'userstats'}) { @@ -200,147 +242,88 @@ } } +&vdbg("##### Exiting parseloop:\n\n"); + #Calculate some numbers -my $threshavg = 0; -my $spampercent = 0; -my $hampercent = 0; -my $bytesperhour = 0; -my $emailperhour = 0; -my $secperhour = 0; - -my $spamcount = $stats{'spam'}{'count'} || 0; -my $hamcount = $stats{'ham'}{'count'} || 0; -my $totalcount = $stats{'total'}{'count'} || 0; - -my $hrsinperiod = (($end-$start) / 3600); - -if ($totalcount > 0) { - if ($spamcount > 0) { - $mean_spam_score = $stats{'spam'}{'score'} / $spamcount; - $mean_spam_time = $stats{'spam'}{'time'} / $spamcount; - $mean_spam_bytes = $stats{'spam'}{'bytes'} / $spamcount; - $spampercent = (($spamcount/$totalcount) * 100); - } - - if ($hamcount > 0) { - $mean_ham_score = $stats{'ham'}{'score'} / $hamcount; - $mean_ham_time = $stats{'ham'}{'time'} / $hamcount; - $mean_ham_bytes = $stats{'ham'}{'bytes'} / $hamcount; - $hampercent = (($hamcount/$totalcount) * 100); - } - - $threshavg = $stats{'total'}{'threshold'} / $totalcount; - $emailperhour = ($totalcount/$hrsinperiod); - $bytesperhour = ($stats{'total'}{'bytes'} / $hrsinperiod); - $secperhour = ($stats{'total'}{'time'} / $hrsinperiod); -} +my %aggregate_stats = (); -my $oldfh; -#Open Sendmail if we are mailing it -if ($opt{'mail'}) { - open (SENDMAIL, "|$opt{'sendmail'} -oi -t -odq") or die "Can't open sendmail: $!\n"; - print SENDMAIL "From: $opt{'from'}\n"; - print SENDMAIL "To: $opt{'mail'}\n"; - print SENDMAIL "Subject: SpamAssassin statistics\n\n"; - $oldfh = select SENDMAIL; -} +$aggregate_stats{'threshavg'} = 0; +$aggregate_stats{'spampercent'} = 0; +$aggregate_stats{'hampercent'} = 0; +$aggregate_stats{'bytesperhour'} = 0; +$aggregate_stats{'emailperhour'} = 0; +$aggregate_stats{'secperhour'} = 0; + +$timing{'hrsinperiod'} = (($timing{'end'} - $timing{'start'}) / 3600); + +if ($stats{'total'}{'count'} > 0) { + $stats{'total'}{'mean_score'} = ($stats{'total'}{'score'} || 0) / $stats{'total'}{'count'}; + + if ($stats{'spam'}{'count'} > 0) { + $stats{'spam'}{'mean_score'} = $stats{'spam'}{'score'} / $stats{'spam'}{'count'}; + $stats{'spam'}{'mean_time'} = $stats{'spam'}{'time'} / $stats{'spam'}{'count'}; + $stats{'spam'}{'mean_bytes'} = $stats{'spam'}{'bytes'} / $stats{'spam'}{'count'}; + $aggregate_stats{'spampercent'} = (($stats{'spam'}{'count'}/$stats{'total'}{'count'}) * 100); + } -my $telapsed = time - $tstart; + if ($stats{'ham'}{'count'} > 0) { + $stats{'ham'}{'mean_score'} = $stats{'ham'}{'score'} / $stats{'ham'}{'count'}; + $stats{'ham'}{'mean_time'} = $stats{'ham'}{'time'} / $stats{'ham'}{'count'}; + $stats{'ham'}{'mean_bytes'} = $stats{'ham'}{'bytes'} / $stats{'ham'}{'count'}; + $aggregate_stats{'hampercent'} = (($stats{'ham'}{'count'}/$stats{'total'}{'count'}) * 100); + } -#Output results -my $rpt = ''; -$rpt .= "Report Title : SpamAssassin - Spam Statistics\n"; -$rpt .= "Report Date : " . strftime("%F", localtime) . "\n"; -$rpt .= "Period Beginning : " . strftime("%c", localtime($start)) . "\n"; -$rpt .= "Period Ending : " . strftime("%c", localtime($end)) . "\n"; -$rpt .= "\n"; -$rpt .= sprintf("Reporting Period : %.2f hrs\n", $hrsinperiod); -$rpt .= "--------------------------------------------------\n"; -$rpt .= "\n"; -$rpt .= "Note: 'ham' = 'nonspam'\n"; -$rpt .= "\n"; -$rpt .= sprintf("Total spam detected : %8d (%7.2f%%)\n", $spamcount, $spampercent || 0); -$rpt .= sprintf("Total ham accepted : %8d (%7.2f%%)\n", $hamcount, $hampercent || 0); -$rpt .= " -------------------\n"; -$rpt .= sprintf("Total emails processed : %8d (%5.f/hr)\n", $totalcount, $emailperhour || 0); -$rpt .= "\n"; -$rpt .= sprintf("Average spam threshold : %11.2f\n", $threshavg || 0); -$rpt .= sprintf("Average spam score : %11.2f\n", $mean_spam_score || 0); -$rpt .= sprintf("Average ham score : %11.2f\n", $mean_ham_score || 0); -$rpt .= "\n"; -$rpt .= sprintf("Spam kbytes processed : %8d (%5.f kb/hr)\n", - $stats{'spam'}{'bytes'}/1024, - $stats{'spam'}{'bytes'}/(1024 * $hrsinperiod)); -$rpt .= sprintf("Ham kbytes processed : %8d (%5.f kb/hr)\n", - $stats{'ham'}{'bytes'}/1024, - $stats{'ham'}{'bytes'}/(1024 * $hrsinperiod)); -$rpt .= sprintf("Total kbytes processed : %8d (%5.f kb/hr)\n", - $stats{'total'}{'bytes'}/1024, $bytesperhour/1024); -$rpt .= "\n"; -$rpt .= sprintf("Spam analysis time : %8d s (%5.f s/hr)\n", - $stats{'spam'}{'time'}, - $stats{'spam'}{'time'}/$hrsinperiod); -$rpt .= sprintf("Ham analysis time : %8d s (%5.f s/hr)\n", - $stats{'ham'}{'time'}, - $stats{'ham'}{'time'}/$hrsinperiod); -$rpt .= sprintf("Total analysis time : %8d s (%5.f s/hr)\n", - $stats{'total'}{'time'}, $secperhour); -$rpt .= "\n\n"; -$rpt .= "Statistics by Hour\n"; -$rpt .= "-------------------------------------\n"; -$rpt .= "Hour Spam Ham\n"; -$rpt .= "-------------- -------- --------\n"; - -my $hour = floor($start/3600); -while ($hour < $end/3600) { - $rpt .= sprintf("%s %8d %8d\n", - strftime("%F, %H", localtime($hour*3600)), - $stats{'spam'}{'byhour'}{$hour} || 0, - $stats{'ham'}{'byhour'}{$hour} || 0); - $hour++; -} -$rpt .= "\n\n"; - -if ($opt{'userstats'}) { - my $usercount = scalar(keys(%userstats)); - if ($usercount > 0) { - my $upper_userlimit = ($usercount > 25) ? 25 : $usercount; - - $rpt .= "Top $upper_userlimit spam victims:\n"; - $rpt .= "User S AvScr H AvScr Count % Count Bytes % Bytes Time % Time\n"; - $rpt .= "-------------------------------- ------- ------- -------- ---------- -------- ---------- -------- ----------\n"; - foreach my $user (sort { - $userstats{$b}{'total'}{'count'} <=> $userstats{$a}{'total'}{'count'} - } keys %userstats) { - $rpt .= sprintf("%-32s %7.2f %7.2f %8d (%7.2f%%) %8d (%7.2f%%) %8d (%7.2f%%)\n", - $user, - ($userstats{$user}{'spam'}{'count'} > 0) ? - $userstats{$user}{'spam'}{'score'} / $userstats{$user}{'spam'}{'count'} : 0, - ($userstats{$user}{'ham'}{'count'} > 0) ? - $userstats{$user}{'ham'}{'score'} / $userstats{$user}{'ham'}{'count'} : 0, - $userstats{$user}{'spam'}{'count'}, - ($userstats{$user}{'total'}{'count'} > 0) ? - $userstats{$user}{'spam'}{'count'} / $userstats{$user}{'total'}{'count'} * 100 : 0, - $userstats{$user}{'spam'}{'bytes'}, - ($userstats{$user}{'total'}{'bytes'} > 0) ? - $userstats{$user}{'spam'}{'bytes'} / $userstats{$user}{'total'}{'bytes'} * 100 : 0, - $userstats{$user}{'spam'}{'time'}, - ($userstats{$user}{'total'}{'time'} > 0) ? - $userstats{$user}{'spam'}{'time'} / $userstats{$user}{'total'}{'time'} * 100 : 0, - ); + $aggregate_stats{'threshavg'} = $stats{'total'}{'threshold'} / $stats{'total'}{'count'}; + $aggregate_stats{'emailperhour'} = ($stats{'total'}{'count'}/$timing{'hrsinperiod'}); + $aggregate_stats{'bytesperhour'} = ($stats{'total'}{'bytes'} / $timing{'hrsinperiod'}); + $aggregate_stats{'secperhour'} = ($stats{'total'}{'time'} / $timing{'hrsinperiod'}); + + foreach my $partition (qw(ham spam total)) { + $stats{$partition}{'d_mean_score'} = sprintf("%.2f", $stats{$partition}{'mean_score'}); + $stats{$partition}{'d_kbytes'} = sprintf("%.2f", $stats{$partition}{'bytes'} / 1024); + $stats{$partition}{'d_mbytes'} = sprintf("%.2f", $stats{$partition}{'bytes'} / (1024 * 1024)); + foreach my $metric (qw(count bytes time)) { + if ($partition eq 'total') { + $stats{'total'}{'percent'}{$metric} = '100%'; + } else { + if (defined($stats{'total'}{$metric}) && ($stats{'total'}{$metric} > 0)) { + $stats{$partition}{'percent'}{$metric} = + sprintf("%.2f%%", 100 * ($stats{$partition}{$metric} || 0) / $stats{'total'}{$metric}); + } else { + $stats{$partition}{'percent'}{$metric} = '-- %'; + } + } + + $stats{$partition}{'perhour'}{$metric} = + sprintf("%.2f", ($stats{$partition}{$metric} || 0) / $timing{'hrsinperiod'}); } + $stats{$partition}{'d_time'} = sprintf("%.1f", $stats{$partition}{'time'}); + $stats{$partition}{'perhour'}{'d_kbytes'} = sprintf("%.2f", $stats{$partition}{'perhour'}{'bytes'} / 1024); + $stats{$partition}{'perhour'}{'d_mbytes'} = sprintf("%.2f", $stats{$partition}{'perhour'}{'bytes'} / (1024 * 1024)); } - $rpt .= "\n"; + } -$rpt .= "Done. Report generated in $telapsed sec.\n"; +$timing{'telapsed'} = time - $timing{'telapsed'}; -print $rpt; +# build report +my $rpt = ''; +if ($opt{'html'}) { + $rpt = &build_html_report(\%timing, \%stats, \%userstats, \%aggregate_stats, ); +} else { + $rpt = &build_text_report(\%timing, \%stats, \%userstats, \%aggregate_stats, ); +} -#Close Senmdmail if it was opened +# send report via mail or just print it out if ($opt{'mail'}) { - select $oldfh; + open (SENDMAIL, "|$opt{'sendmail'} -oi -t -odq") or die "Can't open sendmail: $!\n"; + print SENDMAIL "From: $opt{'from'}\n"; + print SENDMAIL "To: $opt{'mail'}\n"; + print SENDMAIL "Subject: SpamAssassin statistics\n\n"; + print SENDMAIL $rpt; close (SENDMAIL); +} else { + print $rpt; } #All done @@ -350,6 +333,438 @@ # Subroutines ############################################################### ############################################################################# + +######################################## +# Build text report # +######################################## +sub build_text_report { + my $Rh_timing = shift; + my $Rh_stats = shift; + my $Rh_userstats = shift; + my $Rh_aggregate_stats = shift; + + my $rpt = ''; + $rpt .= "Report Title : SpamAssassin - Spam Statistics\n"; + $rpt .= "Report Date : " . strftime("%Y-%m-%d", localtime) . "\n"; + $rpt .= "Period Beginning : " . strftime("%c", localtime($Rh_timing->{'start'})) . "\n"; + $rpt .= "Period Ending : " . strftime("%c", localtime($Rh_timing->{'end'})) . "\n"; + $rpt .= "\n"; + $rpt .= sprintf("Reporting Period : %.2f hrs\n", $Rh_timing->{'hrsinperiod'}); + $rpt .= "--------------------------------------------------\n"; + $rpt .= "\n"; + $rpt .= "Note: 'ham' = 'nonspam'\n"; + $rpt .= "\n"; + $rpt .= sprintf("Total spam detected : %8d (%7.2f%%)\n", $Rh_stats->{'spam'}{'count'}, $Rh_aggregate_stats->{'spampercent'} || 0); + $rpt .= sprintf("Total ham accepted : %8d (%7.2f%%)\n", $Rh_stats->{'ham'}{'count'}, $Rh_aggregate_stats->{'hampercent'} || 0); + $rpt .= " -------------------\n"; + $rpt .= sprintf("Total emails processed : %8d (%5.f/hr)\n", $Rh_stats->{'total'}{'count'}, $Rh_aggregate_stats->{'emailperhour'} || 0); + $rpt .= "\n"; + $rpt .= sprintf("Average spam threshold : %11.2f\n", $Rh_aggregate_stats->{'threshavg'} || 0); + $rpt .= sprintf("Average spam score : %11.2f\n", $Rh_stats->{'spam'}{'mean_score'} || 0); + $rpt .= sprintf("Average ham score : %11.2f\n", $Rh_stats->{'ham'}{'mean_score'} || 0); + $rpt .= "\n"; + $rpt .= sprintf("Spam kbytes processed : %8d (%5.f kb/hr)\n", + $Rh_stats->{'spam'}{'bytes'}/1024, + $Rh_stats->{'spam'}{'bytes'}/(1024 * $Rh_timing->{'hrsinperiod'})); + $rpt .= sprintf("Ham kbytes processed : %8d (%5.f kb/hr)\n", + $Rh_stats->{'ham'}{'bytes'}/1024, + $Rh_stats->{'ham'}{'bytes'}/(1024 * $Rh_timing->{'hrsinperiod'})); + $rpt .= sprintf("Total kbytes processed : %8d (%5.f kb/hr)\n", + $Rh_stats->{'total'}{'bytes'}/1024, $Rh_aggregate_stats->{'bytesperhour'}/1024); + $rpt .= "\n"; + $rpt .= sprintf("Spam analysis time : %8d s (%5.f s/hr)\n", + $Rh_stats->{'spam'}{'time'}, + $Rh_stats->{'spam'}{'time'}/$Rh_timing->{'hrsinperiod'}); + $rpt .= sprintf("Ham analysis time : %8d s (%5.f s/hr)\n", + $Rh_stats->{'ham'}{'time'}, + $Rh_stats->{'ham'}{'time'}/$Rh_timing->{'hrsinperiod'}); + $rpt .= sprintf("Total analysis time : %8d s (%5.f s/hr)\n", + $Rh_stats->{'total'}{'time'}, $Rh_aggregate_stats->{'secperhour'}); + $rpt .= "\n\n"; + $rpt .= "Statistics by Hour\n"; + $rpt .= "----------------------------------------------------\n"; + $rpt .= "Hour Spam Ham\n"; + $rpt .= "------------- ----------------- --------------\n"; + + my $hour = floor($Rh_timing->{'start'}/3600); + + while ($hour < $Rh_timing->{'end'}/3600) { + + my $hourly_spam_percent = 0; + my $hourly_ham_percent = 0; + + if (defined($Rh_stats->{'total'}{'byhour'}{$hour}) && ($Rh_stats->{'total'}{'byhour'}{$hour} > 0)) { + if (!defined($Rh_stats->{'spam'}{'byhour'}{$hour}) || $Rh_stats->{'spam'}{'byhour'}{$hour} == 0) { + $Rh_stats->{'spam'}{'byhour'}{$hour} = 0; + $hourly_ham_percent = 100; + } elsif (!defined($Rh_stats->{'ham'}{'byhour'}{$hour}) || $Rh_stats->{'ham'}{'byhour'}{$hour} == 0) { + $Rh_stats->{'ham'}{'byhour'}{$hour} = 0; + $hourly_spam_percent = 100; + } else { + $hourly_spam_percent = 100 * $Rh_stats->{'spam'}{'byhour'}{$hour} / $Rh_stats->{'total'}{'byhour'}{$hour}; + $hourly_ham_percent = 100 * $Rh_stats->{'ham'}{'byhour'}{$hour} / $Rh_stats->{'total'}{'byhour'}{$hour}; + } + } + + $rpt .= sprintf("%-16s %8d (%3d%%) %8d (%3d%%)\n", + strftime("%Y-%m-%d %H", localtime($hour*3600)), + $Rh_stats->{'spam'}{'byhour'}{$hour} || 0, + $hourly_spam_percent, + $Rh_stats->{'ham'}{'byhour'}{$hour} || 0, + $hourly_ham_percent); + $hour++; + } + $rpt .= "\n\n"; + + if ($opt{'userstats'}) { + my $topusers = 25; + if (defined($opt{'topusers'}) && ($opt{'topusers'} > 0)) { + $topusers = $opt{'topusers'}; + } + my $usercount = scalar(keys(%{$Rh_userstats})); + if ($usercount > 0) { + my $upper_userlimit = ($usercount > $topusers) ? $topusers : $usercount; + + $rpt .= "Top $upper_userlimit spam victims:\n"; + $rpt .= "User S AvScr H AvScr Count % Count Bytes % Bytes Time % Time\n"; + $rpt .= "-------------------------------- ------- ------- -------- ---------- -------- ---------- -------- ----------\n"; + foreach my $user (sort { + $Rh_userstats->{$b}{'total'}{'count'} <=> $Rh_userstats->{$a}{'total'}{'count'} + } keys %{$Rh_userstats}) { + + foreach my $partition (qw(spam ham total)) { + foreach my $metric (qw(score bytes count time)) { + $Rh_userstats->{$user}{$partition}{$metric} ||= 0; + } + } + + $rpt .= sprintf("%-32s %7.2f %7.2f %8d (%7.2f%%) %8d (%7.2f%%) %8d (%7.2f%%)\n", + $user, + ($Rh_userstats->{$user}{'spam'}{'count'} > 0) ? + $Rh_userstats->{$user}{'spam'}{'score'} / $Rh_userstats->{$user}{'spam'}{'count'} : 0, + ($Rh_userstats->{$user}{'ham'}{'count'} > 0) ? + $Rh_userstats->{$user}{'ham'}{'score'} / $Rh_userstats->{$user}{'ham'}{'count'} : 0, + $Rh_userstats->{$user}{'spam'}{'count'}, + ($Rh_userstats->{$user}{'total'}{'count'} > 0) ? + 100 * $Rh_userstats->{$user}{'spam'}{'count'} / $Rh_userstats->{$user}{'total'}{'count'} : 0, + $Rh_userstats->{$user}{'spam'}{'bytes'}, + ($Rh_userstats->{$user}{'total'}{'bytes'} > 0) ? + 100 * $Rh_userstats->{$user}{'spam'}{'bytes'} / $Rh_userstats->{$user}{'total'}{'bytes'} : 0, + $Rh_userstats->{$user}{'spam'}{'time'}, + ($Rh_userstats->{$user}{'total'}{'time'} > 0) ? + 100 * $Rh_userstats->{$user}{'spam'}{'time'} / $Rh_userstats->{$user}{'total'}{'time'} : 0, + ); + } + } + $rpt .= "\n"; + } + + my $codename = $0; + $codename =~ s#^.*/##o; + $rpt .= "Done. Report generated in " . $Rh_timing->{'telapsed'} . " sec by $codename, version $VER_NUM.\n"; + + return $rpt; +} + +######################################## +# Build HTML report # +######################################## +sub build_html_report { + my $Rh_timing = shift; + my $Rh_stats = shift; + my $Rh_userstats = shift; + my $Rh_aggregate_stats = shift; + + my $rpt = ''; + + my $d_now = strftime("%c", localtime(time)); + my $d_start = strftime("%A, %B %e %Y %T %Z", localtime($Rh_timing->{'start'})); + my $d_end = strftime("%A, %B %e %Y %T %Z", localtime($Rh_timing->{'end'})); + my $d_telapsed = $Rh_timing->{'telapsed'}; + my $d_period = sprintf("%.2f", $Rh_timing->{'hrsinperiod'}); + + my $d_mean_thresh = sprintf("%.2f", $Rh_aggregate_stats->{'threshavg'}); + + my $t_spam_overview =<<"T_OVERVIEW"; +<table border="1" summary="Aggregate mail statistics"> +<tr> +<th rowspan="2"></th> +<th colspan="3">Messages</th> +<th colspan="3">Size</th> +<th colspan="3">Time</th> +<th>Mean Score</th> +</tr> + +<tr> +<th>[#]</th> +<th>[#/hr]</th> +<th>[%]</th> +<th>[Kb]</th> +<th>[Kb/hr]</th> +<th>[%]</th> +<th>[s]</th> +<th>[s/hr]</th> +<th>[%]</th> +<th>[#]</th> +</tr> + +<tr> +<th bgcolor="#CCFFCC">Ham</th> +<td align="right">$Rh_stats->{'ham'}{'count'}</td> +<td align="right">$Rh_stats->{'ham'}{'perhour'}{'count'}</td> +<td align="right">$Rh_stats->{'ham'}{'percent'}{'count'}</td> +<td align="right">$Rh_stats->{'ham'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'ham'}{'perhour'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'ham'}{'percent'}{'bytes'}</td> +<td align="right">$Rh_stats->{'ham'}{'d_time'}</td> +<td align="right">$Rh_stats->{'ham'}{'perhour'}{'time'}</td> +<td align="right">$Rh_stats->{'ham'}{'percent'}{'time'}</td> +<td align="right">$Rh_stats->{'ham'}{'d_mean_score'}</td> +</tr> + +<tr> +<th bgcolor="#FFCCCC">Spam</th> +<td align="right">$Rh_stats->{'spam'}{'count'}</td> +<td align="right">$Rh_stats->{'spam'}{'perhour'}{'count'}</td> +<td align="right">$Rh_stats->{'spam'}{'percent'}{'count'}</td> +<td align="right">$Rh_stats->{'spam'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'spam'}{'perhour'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'spam'}{'percent'}{'bytes'}</td> +<td align="right">$Rh_stats->{'spam'}{'d_time'}</td> +<td align="right">$Rh_stats->{'spam'}{'perhour'}{'time'}</td> +<td align="right">$Rh_stats->{'spam'}{'percent'}{'time'}</td> +<td align="right">$Rh_stats->{'spam'}{'d_mean_score'}</td> +</tr> + +<tr> +<th bgcolor="#CCCCFF">Total</th> +<td align="right">$Rh_stats->{'total'}{'count'}</td> +<td align="right">$Rh_stats->{'total'}{'perhour'}{'count'}</td> +<td align="right">$Rh_stats->{'total'}{'percent'}{'count'}</td> +<td align="right">$Rh_stats->{'total'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'total'}{'perhour'}{'d_kbytes'}</td> +<td align="right">$Rh_stats->{'total'}{'percent'}{'bytes'}</td> +<td align="right">$Rh_stats->{'total'}{'d_time'}</td> +<td align="right">$Rh_stats->{'total'}{'perhour'}{'time'}</td> +<td align="right">$Rh_stats->{'total'}{'percent'}{'time'}</td> +<td align="right">$Rh_stats->{'total'}{'d_mean_score'}</td> +</tr> +</table> +T_OVERVIEW + + my $t_hourly =<<"T_HOURLY"; +<table border="0" summary="Hourly ham/spam trends"> +<tr> +<th colspan="3">Statistics by hour</th> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +<td> </td> +</tr> + +<tr> +<th>Hour</th> +<th bgcolor="#FFCCCC">Spam</th> +<th bgcolor="#CCFFCC">Ham</th> +<td colspan="10"></td> +</tr> +T_HOURLY + + my $hour = floor($Rh_timing->{'start'}/3600); + my $prev_day = ''; + + my $null_color = 'bgcolor="#CCCCFF"'; + my $spam_color = 'bgcolor="#FFCCCC"'; + my $ham_color = 'bgcolor="#CCFFCC"'; +# my $spam_mark = qq{<td $spamcolor"> </td>}; +# my $ham_mark = qq{<td $hamcolor"> </td>}; + while ($hour < $Rh_timing->{'end'}/3600) { + foreach my $partition (qw(spam ham total)) { + $Rh_stats->{$partition}{'byhour'}{$hour} = 0 unless + (defined($Rh_stats->{$partition}{'byhour'}{$hour}) && + ($Rh_stats->{$partition}{'byhour'}{$hour} > 0)); + } + + my $curr_hour = strftime("%H:00", localtime($hour*3600)); + my $curr_day = strftime("%Y-%m-%d", localtime($hour*3600)); + if ($curr_day ne $prev_day) { + $curr_hour = "<b>$curr_day $curr_hour</b>"; + } + $prev_day = $curr_day; + + my $ham_fraction = 0; + my $spam_fraction = 0; + my $check_total = $Rh_stats->{'ham'}{'byhour'}{$hour} + $Rh_stats->{'spam'}{'byhour'}{$hour}; + my $tab_graph = qq{<td colspan="10" $null_color> </td>}; + if ($Rh_stats->{'total'}{'byhour'}{$hour} > 0) { + $spam_fraction = int(10 * ($Rh_stats->{'spam'}{'byhour'}{$hour} / $check_total)); + $ham_fraction = 10 - $spam_fraction; + if ($spam_fraction == 10) { + $tab_graph = qq{<td colspan="10" $spam_color> </td>}; + } elsif ($spam_fraction == 9) { + $tab_graph = qq{<td colspan="9" $spam_color> </td>}; + $tab_graph .= "<td $ham_color> </td>"; + } elsif ($spam_fraction == 1) { + $tab_graph = "<td $spam_color> </td>"; + $tab_graph .= qq{<td colspan="9" $ham_color> </td>}; + } elsif ($spam_fraction == 0) { + $tab_graph = qq{<td colspan="10" $ham_color> </td>}; + } else { + $tab_graph = qq{<td colspan="$spam_fraction" $spam_color> </td>}; + $tab_graph .= qq{<td colspan="$ham_fraction" $ham_color> </td>}; + } +# $tab_graph = ($spam_mark x $spam_fraction) . ($ham_mark x $ham_fraction); + } + + $t_hourly .= sprintf(qq{<tr align="right"><td>%s</td><td>%d</td><td>%d</td>%s</tr>\n}, + $curr_hour, + $Rh_stats->{'spam'}{'byhour'}{$hour}, + $Rh_stats->{'ham'}{'byhour'}{$hour}, + $tab_graph,); + + $hour++; + } + $t_hourly .= "</table>\n"; + + my $t_userstats = ''; + if ($opt{'userstats'} && defined($Rh_stats->{'total'}{'count'}) && + ($Rh_stats->{'total'}{'count'} > 0)) { + my $topusers = 25; + if (defined($opt{'topusers'}) && ($opt{'topusers'} > 0)) { + $topusers = $opt{'topusers'}; + } + + my $usercount = scalar(keys(%{$Rh_userstats})); + if ($usercount > 0) { + my $upper_userlimit = ($usercount > $topusers) ? $topusers : $usercount; + + $t_userstats =<<"T_USERSTATS"; +<table border="1" summary="Top $upper_userlimit spam victims"> +<tr> +<th colspan="3">Top $upper_userlimit spam victims</th> +<th colspan="6" bgcolor="#FFCCCC">Spam</th> +</tr> +<tr> +<th rowspan="2">User</th> +<th colspan="2">Avg. Score</th> +<th colspan="2">Messages Received</th> +<th colspan="2">Bytes Received</th> +<th colspan="2">Processing Time</th> +</tr> +<tr> +<th bgcolor="#FFCCCC">Spam</th> +<th bgcolor="#CCFFCC">Ham</th> +<th>[#]</th> +<th>%</th> +<th>[bytes]</th> +<th>%</th> +<th>[s]</th> +<th>%</th> +</tr> +T_USERSTATS + +# $rpt .= "Top $upper_userlimit spam victims:\n"; +# $rpt .= "User S AvScr H AvScr Count % Count Bytes % Bytes Time % Time\n"; +# $rpt .= "-------------------------------- ------- ------- -------- ---------- -------- ---------- -------- ----------\n"; + foreach my $user (sort { + $Rh_userstats->{$b}{'total'}{'count'} <=> $Rh_userstats->{$a}{'total'}{'count'} + } keys %{$Rh_userstats}) { + + foreach my $partition (qw(spam ham total)) { + foreach my $metric (qw(score bytes count)) { + $Rh_userstats->{$user}{$partition}{$metric} ||= 0; + } + } + + my %avg_score = (); + foreach my $partition (qw(ham spam total)) { + + foreach my $metric (qw(count bytes time)) { + $Rh_userstats->{$user}{$partition}{$metric} = 0 unless + (defined($Rh_userstats->{$user}{$partition}{$metric})); + } + + if ($partition ne 'total') { + if (defined($Rh_userstats->{$user}{$partition}{'count'}) + && ($Rh_userstats->{$user}{$partition}{'count'} > 0)) { + $avg_score{$partition} = sprintf('%.2f', + $Rh_userstats->{$user}{$partition}{'score'} + / $Rh_userstats->{$user}{$partition}{'count'}); + } else { + $avg_score{$partition} = 0; + } + } + } + + $t_userstats .= sprintf(qq{<tr align="right"><td align="left">%s</td><td>%.2f</td><td>%.2f</td><td>%d</td><td>%.2f%%</td><td>%d</td><td>%.2f%%</td><td>%d</td><td>%.2f%%</td></tr>\n}, + $user, + $avg_score{'spam'}, + $avg_score{'ham'}, + $Rh_userstats->{$user}{'spam'}{'count'}, + (defined($Rh_userstats->{$user}{'total'}{'count'}) && ($Rh_userstats->{$user}{'total'}{'count'} > 0)) ? + 100 * $Rh_userstats->{$user}{'spam'}{'count'} / $Rh_userstats->{$user}{'total'}{'count'} : 0, + $Rh_userstats->{$user}{'spam'}{'bytes'}, + (defined($Rh_userstats->{$user}{'total'}{'bytes'}) && ($Rh_userstats->{$user}{'total'}{'bytes'} > 0)) ? + 100 * $Rh_userstats->{$user}{'spam'}{'bytes'} / $Rh_userstats->{$user}{'total'}{'bytes'} : 0, + $Rh_userstats->{$user}{'spam'}{'time'}, + (defined($Rh_userstats->{$user}{'total'}{'time'}) && ($Rh_userstats->{$user}{'total'}{'time'} > 0)) ? + 100 * $Rh_userstats->{$user}{'spam'}{'time'} / $Rh_userstats->{$user}{'total'}{'time'}: 0, + ); + } + + $t_userstats .= "</table>\n<hr>\n"; + + } + } + + my $codename = $0; + $codename =~ s#^.*/##o; + + $rpt .=<<"HTMLPAGE"; +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> +<head> +<title>SpamAssassin Statistics: $d_start - $d_end</title> +</head> +<body> +<h1>SpamAssassin Statistics:</h1> +<p> +Period of <b>$d_period</b> hour(s) extending from<br> +<b>$d_start</b> to<br> +<b>$d_end</b> +</p> +<p> +Generated on <b>$d_now</b> in $d_telapsed second(s) by $codename, version $VER_NUM. +</p> +<hr> +<p> +Note: 'ham' = 'nonspam' +</p> +<p> +The mean spam threshold score is $d_mean_thresh; mail scoring below the threshold is ham, mail scoring at or above the threshold is spam. +</p> +$t_spam_overview +<hr> +$t_hourly +<hr> +$t_userstats +<p> +Generated on <b>$d_now</b> in $d_telapsed second(s) by $codename, version $VER_NUM. +</p> +</body> +</html> +HTMLPAGE + + return $rpt; +} + + ######################################## # Process parms # ######################################## @@ -392,47 +807,96 @@ } sub dbg { - my $msg = shift; + print STDERR @_ if ($opt{debug}); +} - if ($opt{debug}) { - print STDERR $msg; - } +sub vdbg { + print STDERR @_ if ($opt{debug} && $opt{verbose}); } __END__ =head1 NAME - + sa-stats.pl - Builds received spam/ham report from mail log - + =head1 VERSION - - $Revision: 1.5 $ - + + $Revision: 1.17 $ + =head1 SYNOPSIS - + Usage: sa-stats.pl [options] Options: - -l, --logfile=filename logfile to read (default: /var/log/maillog) + -l, --logfile=filename logfile to read + (default: /var/log/maillog) -s, --start Sets date/time for start of reporting period -e, --end Sets date/time for end of reporting period - -u, --userstats Generates stats for the top 25 spam victims + -u, --userstats Generates stats for the top spam victims + (default is 25; see -T) + -H, --html Generates HTML report + (default: plain text) + -T, --top=# Display top # spam victims + (# defaults to 25; -T implies -u) -h, --help Displays this message -V, --version Display version info --mail=emailaddress Sends report to emailaddress - --sendmail=/path/to/sendmail Location of sendmail binary (default: /usr/sbin/sendmail) - --from=emailaddress Sets From: field of mail - -v, --verbose Sets verbose mode + --sendmail=/path/to/sendmail Location of sendmail binary + (default: /usr/sbin/sendmail) + --from=emailaddress Sets From: field of mail + -v, --verbose Sets verbose mode (requires -D) -D, --debug Sets debug mode =head1 DESCRIPTION Creates simple text report of spam/ham detected by SpamAssassin by -parsing the mail log (generally /var/log/maillog) +parsing spamd entries in the mail log (generally /var/log/maillog) + +=head1 EXAMPLES + +To generate a text report from midnight to present using /var/log/maillog: + + ./sa-stats.pl -s 'midnight' -e 'now' > sa_stats.txt + +To generate an HTML report including the top 5 spam victims for the month of +January 2004 from compressed mail logs: + + gunzip -c /var/log/maillog-200401*.gz | ./sa-stats.pl -H -T 5 -l - \ + -s '2001-01-01 00:00:00' -e '2004-01-31 23:59:59' > jan_2004_stats.html + +Note the use of '-' as a filename to represent STDIN. + +To generate a text report with per-user stats from yesterday, reading from +/var/log/mail and turning on all debugging output: + + ./sa-stats.pl -v -D -u -l /var/log/mail \ + -s 'yesterday midnight' 1>stats.txt 2>stats.err + +=head1 TIPS + +=over 4 + +=item * + +Are you running spamd? Currently sa-stats.pl only reads syslog entries from +spamd; it doesn't work with MTA-level calls to Mail::SpamAssassin or with logs +generated by the spamassassin perl script. + +=item * + +Are there spamd entries in your mail log? Use 'grep spamd /var/log/maillog' to find out. + +=item * + +Are there spamd entries in your mail log within the analysis interval? Run +'sa-stats.pl -v -D ...' to see the entries that are found and discarded as well +as to see the actual analysis interval. + +=back =head1 DEPENDENCIES - + =over 4 =item * @@ -456,37 +920,46 @@ Parse::Syslog; =back - + =head1 BUGS - + +=over 4 + =item * Because of poor year handling in Parse::Syslog, the script may not work well when the log file dates back to the previous year. - + +=back + =head1 TO DO =over 4 - + =item * + Find bugs =item * + Fix bugs =item * + Don't call /usr/sbin/sendmail directly; use Mail::Internet or Net::SMTP or other standard module =item * -Add support for piped-in logs, compressed logs (see gzopen() from Compress::Zlib) + +Add support for compressed logs (see gzopen() from Compress::Zlib) =item * -Have --verbose and --debug actually do something. + +Have --verbose work without --debug =back - + =head1 AUTHORS - + Brad Rathbun <brad [at] computechnv> http://www.computechnv.com/ Bob Apthorpe <apthorpe+sa [at] cynistar> http://www.cynistar.net/~apthorpe/ @@ -495,7 +968,6 @@ =head1 SEE ALSO -Mail::SpamAssassin - -=cut +Mail::SpamAssassin, Date::Manip, spamd(1) +=cut
|