Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: SpamAssassin: commits

svn commit: r1511229 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

 

 

SpamAssassin commits RSS feed   Index | Next | Previous | View Threaded


mmartinec at apache

Aug 7, 2013, 2:47 AM

Post #1 of 1 (15 views)
Permalink
svn commit: r1511229 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Author: mmartinec
Date: Wed Aug 7 09:47:13 2013
New Revision: 1511229

URL: http://svn.apache.org/r1511229
Log:
Bug 6963: Anybody cares for a saved millisecond or two in computing bayes probabilities for tokens?

Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1511229&r1=1511228&r2=1511229&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Aug 7 09:47:13 2013
@@ -705,14 +705,17 @@ sub scan {
{ my $timer = $self->{main}->time_method('b_tok_get_all');
$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
}
- my %pw;

my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
+
+ my $probabilities_ref =
+ $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
+
+ my %pw;
foreach my $tokendata (@{$tokensdata}) {
- my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
- my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
+ my $prob = shift(@$probabilities_ref);
next unless defined $prob;
-
+ my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
$pw{$token} = {
prob => $prob,
spam_count => $tok_spam,
@@ -721,15 +724,17 @@ sub scan {
};
}

+ my @pw_keys = keys %pw;
+
# If none of the tokens were found in the DB, we're going to skip
# this message...
- if (!%pw) {
+ if (!@pw_keys) {
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
goto skip;
}

my $tcount_total = keys %{$msgtokens};
- my $tcount_learned = keys %pw;
+ my $tcount_learned = scalar @pw_keys;

# Figure out the message receive time (used as atime below)
# If the message atime comes back as being in the future, something's
@@ -739,50 +744,47 @@ sub scan {
my $now = time;
$msgatime = $now if ( $msgatime > $now );

- # now take the $count most significant tokens and calculate probs using
- # Robinson's formula.
- my $count = N_SIGNIFICANT_TOKENS;
- my @sorted;
-
my @touch_tokens;
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];

- my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), keys %pw);
+ my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
my $log_each_token = (would_log('dbg', 'bayes') > 1);

- foreach my $tok (sort {
- $tok_strength{$b} <=> $tok_strength{$a}
- } keys %pw)
- {
- if ($count-- < 0) { last; }
- next if ($tok_strength{$tok} <
- $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
+ # now take the most significant tokens and calculate probs using
+ # Robinson's formula.
+
+ @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
+
+ if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
+
+ my @sorted;
+ foreach my $tok (@pw_keys) {
+ next if $tok_strength{$tok} <
+ $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

- my $pw = $pw{$tok}->{prob};
+ my $pw_tok = $pw{$tok};
+ my $pw_prob = $pw_tok->{prob};

# What's more expensive, scanning headers for HAMMYTOKENS and
# SPAMMYTOKENS tags that aren't there or collecting data that
# won't be used? Just collecting the data is certainly simpler.
#
my $raw_token = $msgtokens->{$tok} || "(unknown)";
- my $s = $pw{$tok}->{spam_count};
- my $n = $pw{$tok}->{ham_count};
- my $a = $pw{$tok}->{atime};
+ my $s = $pw_tok->{spam_count};
+ my $n = $pw_tok->{ham_count};
+ my $a = $pw_tok->{atime};

- if ($pw < 0.5) {
- push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];
- } else {
- push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
- }
+ push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
+ [$raw_token, $pw_prob, $s, $n, $a] );

- push (@sorted, $pw);
+ push(@sorted, $pw_prob);

# update the atime on this token, it proved useful
push(@touch_tokens, $tok);

if ($log_each_token) {
- dbg("bayes: token '$raw_token' => $pw");
+ dbg("bayes: token '$raw_token' => $pw_prob");
}
}

@@ -1408,60 +1410,78 @@ sub _tokenize_mail_addrs {

###########################################################################

-# compute the probability that a token is spammish
-sub _compute_prob_for_token {
- my ($self, $token, $ns, $nn, $s, $n) = @_;
+# compute the probability that a token is spammish for each token
+sub _compute_prob_for_all_tokens {
+ my ($self, $tokensdata, $ns, $nn) = @_;
+ my @probabilities;

- # we allow the caller to give us the token information, just
- # to save a potentially expensive lookup
- if (!defined($s) || !defined($n)) {
- ($s, $n, undef) = $self->{store}->tok_get ($token);
- }
-
- return if ($s == 0 && $n == 0);
+ return if !$ns || !$nn;

+ my $threshold = 1; # ignore low-freq tokens below this s+n threshold
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
- return if ($s + $n < 10); # ignore low-freq tokens
+ $threshold = 10;
}
-
if (!$self->{use_hapaxes}) {
- return if ($s + $n < 2);
+ $threshold = 2;
}

- return if ( $ns == 0 || $nn == 0 );
-
- my $ratios = ($s / $ns);
- my $ration = ($n / $nn);
+ foreach my $tokendata (@{$tokensdata}) {
+ my $s = $tokendata->[1]; # spam count
+ my $n = $tokendata->[2]; # ham count
+ my $prob;
+
+ no warnings 'uninitialized'; # treat undef as zero in addition
+ if ($s + $n >= $threshold) {
+ # ignoring low-freq tokens, also covers the (!$s && !$n) case
+
+ # my $ratios = $s / $ns;
+ # my $ration = $n / $nn;
+ # $prob = $ratios / ($ration + $ratios);
+ #
+ $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
+
+ if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
+ # use Robinson's f(x) equation for low-n tokens, instead of just
+ # ignoring them
+ my $robn = $s + $n;
+ $prob =
+ ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
+ /
+ ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
+ }
+ }

- my $prob;
+ # 'log_raw_counts' is used to log the raw data for the Bayes equations
+ # during a mass-check, allowing the S and X constants to be optimized
+ # quickly without requiring re-tokenization of the messages for each
+ # attempt. There's really no need for this code to be uncommented in
+ # normal use, however. It has never been publicly documented, so
+ # commenting it out is fine. ;)
+ #
+ ## if ($self->{log_raw_counts}) {
+ ## $self->{raw_counts} .= " s=$s,n=$n ";
+ ## }

- if ($ratios == 0 && $ration == 0) {
- warn "bayes: oops? ratios == ration == 0";
- return;
- } else {
- $prob = ($ratios) / ($ration + $ratios);
+ push(@probabilities, $prob);
}
+ return \@probabilities;
+}

- if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
- # use Robinson's f(x) equation for low-n tokens, instead of just
- # ignoring them
- my $robn = $s+$n;
- $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
- /
- ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
- }
-
- # 'log_raw_counts' is used to log the raw data for the Bayes equations during
- # a mass-check, allowing the S and X constants to be optimized quickly
- # without requiring re-tokenization of the messages for each attempt. There's
- # really no need for this code to be uncommented in normal use, however. It
- # has never been publicly documented, so commenting it out is fine. ;)
+# compute the probability that a token is spammish
+sub _compute_prob_for_token {
+ my ($self, $token, $ns, $nn, $s, $n) = @_;

- ## if ($self->{log_raw_counts}) {
- ## $self->{raw_counts} .= " s=$s,n=$n ";
- ## }
+ # we allow the caller to give us the token information, just
+ # to save a potentially expensive lookup
+ if (!defined($s) || !defined($n)) {
+ ($s, $n, undef) = $self->{store}->tok_get($token);
+ }
+ return if !$s && !$n;
+
+ my $probabilities_ref =
+ $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);

- return $prob;
+ return $probabilities_ref->[0];
}

###########################################################################

SpamAssassin commits RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.