Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Wikipedia: Mediawiki-CVS

SVN: [59326] branches/lucene-search-2.1

 

 

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded


rainman at svn

Nov 21, 2009, 5:24 PM

Post #1 of 1 (188 views)
Permalink
SVN: [59326] branches/lucene-search-2.1

http://www.mediawiki.org/wiki/Special:Code/MediaWiki/59326

Revision: 59326
Author: rainman
Date: 2009-11-22 01:24:50 +0000 (Sun, 22 Nov 2009)

Log Message:
-----------
A couple of changes to Andrew's original implementation:
* never use namespace names in the index, since they can change and have aliases: convert ThreadPage into the 0:Mainpage format
* parse ondiscussionpage: similarly to prefix: and add it as a special clause to the query
* convert the mixed-type mwdumper hashtable into an all-string format
* update mwdumper to the latest version

Modified Paths:
--------------
branches/lucene-search-2.1/lib/mwdumper.jar
branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java

Modified: branches/lucene-search-2.1/lib/mwdumper.jar
===================================================================
(Binary files differ)

Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
===================================================================
--- branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java 2009-11-22 01:21:03 UTC (rev 59325)
+++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java 2009-11-22 01:24:50 UTC (rev 59326)
@@ -146,6 +146,11 @@
protected boolean isInTitle = false;
protected int isInTitleLevel = 0;

+ /** Raw fields to append to queries like ondiscussionpage */
+ protected HashMap<String,String> rawFields = new HashMap<String,String>();
+
+ Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>();
+
protected Pattern urlPattern = Pattern.compile("(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?");

/** default operator (must = AND, should = OR) for boolean queries */
@@ -361,6 +366,9 @@
tokens = new ArrayList<Token>();
this.namespacePolicy = nsPolicy;
disableTitleAliases = true;
+ keywordFieldMapping = new Hashtable<String,String>();
+ keywordFieldMapping.put("inthread", "ThreadAncestor");
+ keywordFieldMapping.put("ondiscussionpage", "ThreadPage");
initNamespaces();
this.stopWords = new HashSet<String>();
if(stopWords != null)
@@ -425,8 +433,7 @@
HashSet<String> fields = getFields(queryText);
HashSet<NamespaceFilter> ret = new HashSet<NamespaceFilter>();
List ThreadingKeywords = new ArrayList();
- ThreadingKeywords.add("inthread");
- ThreadingKeywords.add("ondiscussionpage");
+ ThreadingKeywords.add("inthread");

for(String field : fields){
field = field.toLowerCase();
@@ -667,8 +674,7 @@
List<String> fieldOperators = new ArrayList<String>();
fieldOperators.add("intitle");
fieldOperators.add("incategory");
- fieldOperators.add("inthread");
- fieldOperators.add("ondiscussionpage");
+ fieldOperators.add("inthread");

return fieldOperators;
}
@@ -753,10 +759,8 @@

/** Make a lucene term from string */
private Term makeTerm(String t){
- Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>();
- keywordFieldMapping.put("inthread", "ThreadAncestor");
- keywordFieldMapping.put("ondiscussionpage", "ThreadPage");

+
if(currentField == null)
return new Term(defaultField,builder.isExactCase()? t : t.toLowerCase());
else if(defaultField.equals("contents") && isInTitle)
@@ -1292,8 +1296,50 @@
return queryText;
}

+ /**
+ * Extract a raw field clause (e.g. ondiscussionpage:) from the query.
+ * Only occurrences outside of quoted phrases are recognized, and everything
+ * from the field name to the end of the query is taken as the field value.
+ * A value of the form [2]:title is rewritten to 2:title, any other value
+ * gets the main namespace prefix 0:
+ *
+ * @param queryText the query text to search
+ * @param field field name including the colon (like "ondiscussionpage:")
+ * @return [0] - queryText with field part deleted
+ * [1] - the field value in ns:title form, null if field was not found
+ */
+ public static String[] extractRawField(String queryText, String field){
+ int start = 0;
+ while(start < queryText.length()){
+ int end = indexOf(queryText,'"',start); // begin of phrase
+ // search from start, so that text inside already skipped phrases is not matched
+ int inx = queryText.indexOf(field,start);
+ if(inx >=0 && inx < end){
+ // the value runs to the end of the query
+ String value = queryText.substring(inx+field.length());
+
+ String full;
+ if(value.startsWith("[") && value.contains("]:")){
+ // convert from [2]:query to 2:query form
+ full = value.replace("[","").replace("]:",":");
+ } else // default to main namespace
+ full = "0:"+value;
+
+ return new String[]{ queryText.substring(0,inx), full };
+ }
+ start = end+1;
+ if(start < queryText.length()){
+ // skip phrase
+ start = indexOf(queryText,'"',start) + 1;
+ }
+ }
+
+ // field does not occur outside of phrases
+ return new String[]{ queryText, null };
+ }
+
/** Like string.indexOf but return end of string instead of -1 when needle is not found */
- protected int indexOf(String string, char needle, int start){
+ protected static int indexOf(String string, char needle, int start){
int inx = string.indexOf(needle,start);
if(inx == -1)
return string.length();
@@ -1394,6 +1440,17 @@
this.namespacePolicy = options.policy;
defaultBoost = CONTENTS_BOOST;
defaultAliasBoost = ALIAS_BOOST;
+
+ this.rawFields = new HashMap<String,String>();
+ // parse out raw queries
+ for(String field : new String[] {"ondiscussionpage:"}){
+ String[] ret = extractRawField(queryText, field);
+ queryText = ret[0];
+ if( ret[1] != null )
+ this.rawFields.put(field,ret[1]);
+ }
+
+
Query qc = parseRaw(queryText);
ParsedWords words = parsedWords;
this.namespacePolicy = defaultPolicy;
@@ -1436,6 +1493,17 @@
if(redirectMatch != null)
full.add(redirectMatch, Occur.SHOULD);

+ // add raw fields as global constrains
+ for(Entry<String,String> e : rawFields.entrySet()){
+ String field = e.getKey();
+ if(field.endsWith(":"))
+ field = field.substring(0, field.length()-1);
+ // find target field in the index, e.g. ondiscussionpage -> ThreadPage
+ String targetField = keywordFieldMapping.get(field);
+ if( targetField != null)
+ full.add(new TermQuery(new Term(targetField, e.getValue())),Occur.MUST);
+ }
+
// init global scaling of articles
ArticleScaling scale = new ArticleScaling.None();
// based on age
@@ -1449,6 +1517,7 @@
}

}
+
// additional rank
AggregateInfo rank = iid.useAdditionalRank()? new AggregateInfoImpl() : null;
ArticleNamespaceScaling nsScale = iid.getNamespaceScaling();

Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java
===================================================================
--- branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java 2009-11-22 01:21:03 UTC (rev 59325)
+++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java 2009-11-22 01:24:50 UTC (rev 59326)
@@ -4,6 +4,7 @@
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
@@ -13,8 +14,11 @@
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
+import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
import org.mediawiki.importer.DumpWriter;
import org.mediawiki.importer.Page;
import org.mediawiki.importer.Revision;
@@ -111,7 +115,7 @@
Article article = new Article(page.Id,page.Title.Namespace,
page.Title.Text,revision.Text,redirectTo,references,
redirectTargetNamespace,0,redirects,rel,anchors,date,
- page.DiscussionThreadingInfo);
+ processLiquidThreadInfo(page.DiscussionThreadingInfo));
// index
if(indexWriter != null)
indexWriter.addArticle(article);
@@ -125,6 +129,25 @@
throw new IOException("stopped");
}

+ /** Convert raw LQT properties into an all-string table, titles become <ns>:<title> */
+ public static Hashtable<String, String> processLiquidThreadInfo(Hashtable info){
+ Hashtable<String,String> converted = new Hashtable<String,String>();
+ for(Enumeration keys = info.keys(); keys.hasMoreElements(); ){
+ String name = (String)keys.nextElement();
+ Object raw = info.get(name);
+ // default: plain string form of the value
+ String text = raw.toString();
+ if(raw instanceof org.mediawiki.importer.Title){
+ // titles are stored in <ns>:<title> format (where ns is integer)
+ org.mediawiki.importer.Title title = (org.mediawiki.importer.Title) raw;
+ text = title.Namespace+":"+title.Text;
+ }
+ converted.put(name, text);
+ }
+
+ return converted;
+ }
+
public void close() throws IOException {
// nop
}

Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
===================================================================
--- branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java 2009-11-22 01:21:03 UTC (rev 59325)
+++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java 2009-11-22 01:24:50 UTC (rev 59326)
@@ -18,6 +18,7 @@
import org.wikimedia.lsearch.beans.Redirect;
import org.wikimedia.lsearch.config.GlobalConfiguration;
import org.wikimedia.lsearch.config.IndexId;
+import org.wikimedia.lsearch.importer.DumpImporter;
import org.wikimedia.lsearch.index.IndexUpdateRecord;
import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
import org.wikimedia.lsearch.ranks.LinksBuilder;
@@ -83,7 +84,7 @@
revision.Text, redirectTo, references, 0, 0,
redirects, new ArrayList<RelatedTitle>(),
new Hashtable<String,Integer>(), date,
- page.DiscussionThreadingInfo );
+ DumpImporter.processLiquidThreadInfo(page.DiscussionThreadingInfo) );
log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects);
records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE));
log.debug(iid+": Update for "+article);

Modified: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java
===================================================================
--- branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java 2009-11-22 01:21:03 UTC (rev 59325)
+++ branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java 2009-11-22 01:24:50 UTC (rev 59326)
@@ -180,4 +180,9 @@
e.printStackTrace();
}
}
+
+ public void testExtractRawFields(){
+ for(String value : new String[]{"eh heh", "eh \"heh\""}) // value is kept verbatim, quotes included
+ assertEquals("[something , 0:"+value+"]", Arrays.toString(WikiQueryParser.extractRawField("something ondiscussionpage:"+value, "ondiscussionpage:")));
+ }
}



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS [at] lists
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.