Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Wikipedia: Mediawiki-CVS

SVN: [37807] trunk/extensions/AbuseFilter/AbuseFilter.class.php

 

 

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded


werdna at svn

Jul 17, 2008, 7:18 PM

Post #1 of 1 (32 views)
Permalink
SVN: [37807] trunk/extensions/AbuseFilter/AbuseFilter.class.php

Revision: 37807
Author: werdna
Date: 2008-07-18 02:18:58 +0000 (Fri, 18 Jul 2008)

Log Message:
-----------
AbuseFilter:
* Improve normalisation: use AntiSpoof.
* Allow callers to prevent the use of a condition counter. It's kinda annoying in batch processes.
* Improved caching of tokenisation - reduces average time to process a condition from 4ms right down to 200us
* Some new modifiers, caching of modifier data.

Modified Paths:
--------------
trunk/extensions/AbuseFilter/AbuseFilter.class.php

Modified: trunk/extensions/AbuseFilter/AbuseFilter.class.php
===================================================================
--- trunk/extensions/AbuseFilter/AbuseFilter.class.php 2008-07-18 02:16:50 UTC (rev 37806)
+++ trunk/extensions/AbuseFilter/AbuseFilter.class.php 2008-07-18 02:18:58 UTC (rev 37807)
@@ -8,9 +8,12 @@
public static $condCheckCount = array();
public static $condMatchCount = array();
public static $statsStoragePeriod = 86400;
- public static $modifierWords = array( 'norm', 'supernorm', 'lcase', 'length', 'specialratio', 'htmldecode', 'htmlencode', 'urlencode', 'urldecode' );
+ public static $modifierWords = array( 'norm', 'supernorm', 'lcase', 'length', 'specialratio', 'htmldecode', 'htmlencode', 'urlencode', 'urldecode', 'htmlfullencode', 'simplenorm' ); // Keep in sync with the branches of modifyValue().
public static $operatorWords = array( 'eq', 'neq', 'gt', 'lt', 'regex', 'contains' );
public static $validJoinConditions = array( '!', '|', '&' );
+ public static $condLimitEnabled = true;
+ public static $tokenCache = array();
+ public static $modifyCache = array();

public static function generateUserVars( $user ) {
$vars = array();
@@ -28,6 +31,11 @@

return $vars;
}
+
+ public static function disableConditionLimit() {
+ // For use in batch scripts and the like: clears the flag that the $wgAbuseFilterConditionLimit checks require before bailing out.
+ self::$condLimitEnabled = false;
+ }

public static function generateTitleVars( $title, $prefix ) {
$vars = array();
@@ -51,7 +59,7 @@
$fname = __METHOD__;

global $wgAbuseFilterConditionLimit;
- if (self::$condCount > $wgAbuseFilterConditionLimit) {
+ if (self::$condCount > $wgAbuseFilterConditionLimit && self::$condLimitEnabled) {
return false;
}

@@ -88,7 +96,7 @@
}

// We've hit the limit.
- if (self::$condCount > $wgAbuseFilterConditionLimit) {
+ if (self::$condCount > $wgAbuseFilterConditionLimit && self::$condLimitEnabled) {
return false;
}

@@ -145,8 +153,12 @@
// Get the rest of the string after the operator.
$parameters = explode( ' ', $conds, $wordNum+2);
$parameters = trim($parameters[$wordNum+1]);
-
- if (in_array( $parameters, array_keys( $vars ) )) {
+
+ list($firstWord,$rest) = explode( ' ', $parameters, 2 );
+ if (in_array( $firstWord, self::$modifierWords ) && in_array( $rest, array_keys($vars))) {
+ // Allow the compare target to be modified, too.
+ $parameters = self::modifyValue( $firstWord, $vars[$rest] );
+ } elseif (in_array( $parameters, array_keys( $vars ) )) {
$parameters = $vars[$parameters];
}

@@ -167,6 +179,12 @@

public static function tokeniseList( $list ) {
wfProfileIn( __METHOD__ );
+
+ if (isset(self::$tokenCache[$list])) {
+ wfProfileOut( __METHOD__ ); // Close the profiling section opened above before the cached early return.
+ return self::$tokenCache[$list];
+ }
+
// Parse it, character by character.
$escapeNext = false;
$listLevel = 0;
@@ -220,6 +238,8 @@

// Put any leftovers in
$allTokens[] = $thisToken;
+
+ self::$tokenCache[$list] = $allTokens;

wfProfileOut( __METHOD__ );

@@ -227,26 +247,41 @@
}

public static function modifyValue( $modifier, $value ) {
+ if (isset(self::$modifyCache[$modifier][$value])) // Memoised result of an earlier call with the same modifier and value.
+ return self::$modifyCache[$modifier][$value];
+
+ $val = null; // Unrecognised modifiers match no branch below; define the result so it is never read unset.
if ($modifier == 'norm') {
- return self::normalise( $value );
+ $val = self::normalise( $value );
} elseif ($modifier == 'supernorm') {
- return self::superNormalise( $value );
+ $val = self::superNormalise( $value );
} elseif ($modifier == 'lcase') {
- return strtolower($value);
+ $val = strtolower($value);
} elseif ($modifier == 'length') {
- return strlen($value);
+ $val = strlen($value);
} elseif ($modifier == 'specialratio') {
$specialsonly = preg_replace('/\w/', '', $value );
- return (strlen($specialsonly) / strlen($value));
+ $val = strlen($value) ? (strlen($specialsonly) / strlen($value)) : 0; // Empty input has no characters to rate; avoid dividing by zero.
} elseif ($modifier == 'htmlencode') {
- return htmlspecialchars($value);
+ $val = htmlspecialchars($value);
} elseif ($modifier == 'htmldecode') {
- return htmlspecialchars_decode($value);
+ $val = htmlspecialchars_decode($value);
} elseif ($modifier == 'urlencode') {
- return urlencode($value);
+ $val = urlencode($value);
} elseif ($modifier == 'urldecode') {
- return urldecode($value);
+ $val = urldecode($value);
+ } elseif ($modifier == 'htmlfullencode') {
+ $val = htmlentities( $value );
+ } elseif ($modifier == 'simplenorm') {
+ $val = preg_replace( '/[\d\W]+/', '', $value );
+ $val = strtolower( $val ); // Lower-case the stripped text, not the raw input, so the preg_replace result is kept.
}
+
+ if (isset(self::$modifyCache[$modifier]) && count(self::$modifyCache[$modifier]) > 1000) { // Bound the cache: count this modifier's entries, not the (never-set) entry for $value.
+ self::$modifyCache = array();
+ }
+
+ return self::$modifyCache[$modifier][$value] = $val;
}

public static function checkOperator( $operator, $value, $parameters ) {
@@ -269,19 +304,18 @@

public static function superNormalise( $text ) {
$text = self::normalise( $text );
- $text = preg_split( '//', $text, -1, PREG_SPLIT_NO_EMPTY ); // Split to a char array.
+ $text = AntiSpoof::stringToList($text); // Split to a char array (replaces the old preg_split).
sort($text);
$text = array_unique( $text ); // Remove duplicate characters.
- $text = implode( '', $text );
+ $text = AntiSpoof::listToString( $text ); // Join the sorted, de-duplicated list back into a string (replaces the old implode).

return $text;
}

public static function normalise( $text ) {
$old_text = $text;
- $text = preg_replace( '/\W/', '', $text ); // Remove any special characters.
$text = strtolower($text);
- $text = preg_split( '//', $text, -1, PREG_SPLIT_NO_EMPTY ); // Split to a char array.
+ $text = AntiSpoof::stringToList( $text );
$text = AntiSpoof::equivString( $text ); // Normalise

// Remove repeated characters, but not all duplicates.
@@ -293,7 +327,9 @@
}
}

- $text = implode('', $text ); // Sort in alphabetical order, put back as it was.
+ $text = AntiSpoof::listToString( $text ); // Sort in alphabetical order, put back as it was.
+
+ $text = preg_replace( '/\W/', '', $text ); // Remove any special characters.

return $text;
}
@@ -328,9 +364,12 @@
$filter_matched[$row->af_id] = false;
}
}
+
+ // Don't store stats if the cond limit is disabled.
+ // It's probably a batch process or similar.
+ if (self::$condLimitEnabled) // Record only while the limit is active; the negated test did the opposite of the comment above.
+ self::recordStats( $filter_matched );

- self::recordStats( $filter_matched );
-
if (count($blocking_filters) == 0 ) {
// No problems.
return true;



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS[at]lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact lists@gossamer-threads.com
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.