
rainman at svn
Nov 21, 2009, 5:24 PM
Post #1 of 1
(188 views)
Permalink
|
|
SVN: [59326] branches/lucene-search-2.1
|
|
http://www.mediawiki.org/wiki/Special:Code/MediaWiki/59326 Revision: 59326 Author: rainman Date: 2009-11-22 01:24:50 +0000 (Sun, 22 Nov 2009) Log Message: ----------- Couple of changes to original andrews implementation: * never use namespace names in index, since they can change and have aliases: convert ThreadPage into 0:Mainpage format * parse ondiscussionpage similar to prefix: and add it as a special clause to query * convert mwdumper mixed hashtable into all-string format * update mwdumper to latest version Modified Paths: -------------- branches/lucene-search-2.1/lib/mwdumper.jar branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java Modified: branches/lucene-search-2.1/lib/mwdumper.jar =================================================================== (Binary files differ) Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java =================================================================== --- branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java 2009-11-22 01:21:03 UTC (rev 59325) +++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java 2009-11-22 01:24:50 UTC (rev 59326) @@ -146,6 +146,11 @@ protected boolean isInTitle = false; protected int isInTitleLevel = 0; + /** Raw fields to append to queries like ondiscussionpage */ + protected HashMap<String,String> rawFields = new HashMap<String,String>(); + + Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>(); + protected Pattern urlPattern = Pattern.compile("(\\w+:{0,1}\\w*@)?(\\S+)(:[0-9]+)?(\\/|\\/([\\w#!:.?+=&%@!\\-\\/]))?"); /** default operator (must = AND, should = OR) for boolean queries */ @@ -361,6 +366,9 @@ tokens = new ArrayList<Token>(); this.namespacePolicy = nsPolicy; disableTitleAliases = true; + keywordFieldMapping = new Hashtable<String,String>(); + keywordFieldMapping.put("inthread", "ThreadAncestor"); + keywordFieldMapping.put("ondiscussionpage", "ThreadPage"); initNamespaces(); this.stopWords = new HashSet<String>(); if(stopWords != null) @@ -425,8 +433,7 @@ HashSet<String> fields = getFields(queryText); HashSet<NamespaceFilter> ret = new HashSet<NamespaceFilter>(); List ThreadingKeywords = new ArrayList(); - ThreadingKeywords.add("inthread"); - ThreadingKeywords.add("ondiscussionpage"); + ThreadingKeywords.add("inthread"); for(String field : fields){ field = field.toLowerCase(); @@ -667,8 +674,7 @@ List<String> fieldOperators = new ArrayList<String>(); fieldOperators.add("intitle"); fieldOperators.add("incategory"); - fieldOperators.add("inthread"); - fieldOperators.add("ondiscussionpage"); + fieldOperators.add("inthread"); return fieldOperators; } @@ -753,10 +759,8 @@ /** Make a lucene term from string */ private Term makeTerm(String t){ - Hashtable<String,String> keywordFieldMapping = new Hashtable<String,String>(); - keywordFieldMapping.put("inthread", "ThreadAncestor"); - keywordFieldMapping.put("ondiscussionpage", "ThreadPage"); + if(currentField == null) return new Term(defaultField,builder.isExactCase()? t : t.toLowerCase()); else if(defaultField.equals("contents") && isInTitle) @@ -1292,8 +1296,50 @@ return queryText; } + /** + * Extract prefix: field from the query and put it into prefixFilter + * variable for later retrieval + * + * @param queryText + * @param field (like "ondiscussionthread:") + * @return [0] - queryText with field part deleted + * [1] - the field part + */ + public static String[] extractRawField(String queryText, String field){ + ArrayList<String> filters = new ArrayList<String>(); + int start = 0; + while(start < queryText.length()){ + int end = indexOf(queryText,'"',start); // begin of phrase + int inx = queryText.indexOf(field); + if(inx >=0 && inx < end){ + String prefix = queryText.substring(inx+field.length()); + + String full = null; + if(prefix.startsWith("[") && prefix.contains("]:")){ + // convert from [2]:query to 2:query form + full = prefix.replace("[","").replace("]:",":"); + } else // default to main namespace + full = "0:"+prefix ; + + // add lowercase nonempty prefixes + if(full != null && full.length()>0) + filters.add(full); + + return new String[]{ queryText.substring(0,inx), full }; + + } + start = end+1; + if(start < queryText.length()){ + // skip phrase + start = indexOf(queryText,'"',start) + 1; + } + } + + return new String[]{ queryText, null }; + } + /** Like string.indexOf but return end of string instead of -1 when needle is not found */ - protected int indexOf(String string, char needle, int start){ + protected static int indexOf(String string, char needle, int start){ int inx = string.indexOf(needle,start); if(inx == -1) return string.length(); @@ -1394,6 +1440,17 @@ this.namespacePolicy = options.policy; defaultBoost = CONTENTS_BOOST; defaultAliasBoost = ALIAS_BOOST; + + this.rawFields = new HashMap<String,String>(); + // parse out raw queries + for(String field : new String[] {"ondiscussionpage:"}){ + String[] ret = extractRawField(queryText, field); + queryText = ret[0]; + if( ret[1] != null ) + this.rawFields.put(field,ret[1]); + } + + Query qc = parseRaw(queryText); ParsedWords words = parsedWords; this.namespacePolicy = defaultPolicy; @@ -1436,6 +1493,17 @@ if(redirectMatch != null) full.add(redirectMatch, Occur.SHOULD); + // add raw fields as global constrains + for(Entry<String,String> e : rawFields.entrySet()){ + String field = e.getKey(); + if(field.endsWith(":")) + field = field.substring(0, field.length()-1); + // find target field in the index, e.g. ondiscussionpage -> ThreadPage + String targetField = keywordFieldMapping.get(field); + if( targetField != null) + full.add(new TermQuery(new Term(targetField, e.getValue())),Occur.MUST); + } + // init global scaling of articles ArticleScaling scale = new ArticleScaling.None(); // based on age @@ -1449,6 +1517,7 @@ } } + // additional rank AggregateInfo rank = iid.useAdditionalRank()? new AggregateInfoImpl() : null; ArticleNamespaceScaling nsScale = iid.getNamespaceScaling(); Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java =================================================================== --- branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java 2009-11-22 01:21:03 UTC (rev 59325) +++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/DumpImporter.java 2009-11-22 01:24:50 UTC (rev 59326) @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.Calendar; import java.util.Date; +import java.util.Enumeration; import java.util.HashMap; import java.util.Hashtable; import java.util.Iterator; @@ -13,8 +14,11 @@ import java.util.regex.Pattern; import org.apache.log4j.Logger; +import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; import org.mediawiki.importer.DumpWriter; import org.mediawiki.importer.Page; import org.mediawiki.importer.Revision; @@ -111,7 +115,7 @@ Article article = new Article(page.Id,page.Title.Namespace, page.Title.Text,revision.Text,redirectTo,references, redirectTargetNamespace,0,redirects,rel,anchors,date, - page.DiscussionThreadingInfo); + processLiquidThreadInfo(page.DiscussionThreadingInfo)); // index if(indexWriter != null) indexWriter.addArticle(article); @@ -125,6 +129,25 @@ throw new IOException("stopped"); } + /** Process LQT properties, convert titles into correct format */ + public static Hashtable<String, String> processLiquidThreadInfo(Hashtable info){ + Enumeration e = info.keys(); + Hashtable<String,String> res = new Hashtable<String,String>(); + while (e.hasMoreElements()) { + String key = (String)e.nextElement(); + Object rawvalue = info.get(key); + String value = rawvalue.toString(); + if(rawvalue instanceof org.mediawiki.importer.Title){ + // put titles into <ns>:<title> format (where ns is integer) + org.mediawiki.importer.Title t = (org.mediawiki.importer.Title) rawvalue; + value = t.Namespace+":"+t.Text; + } + res.put(key, value); + } + + return res; + } + public void close() throws IOException { // nop } Modified: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java =================================================================== --- branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java 2009-11-22 01:21:03 UTC (rev 59325) +++ branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java 2009-11-22 01:24:50 UTC (rev 59326) @@ -18,6 +18,7 @@ import org.wikimedia.lsearch.beans.Redirect; import org.wikimedia.lsearch.config.GlobalConfiguration; import org.wikimedia.lsearch.config.IndexId; +import org.wikimedia.lsearch.importer.DumpImporter; import org.wikimedia.lsearch.index.IndexUpdateRecord; import org.wikimedia.lsearch.interoperability.RMIMessengerClient; import org.wikimedia.lsearch.ranks.LinksBuilder; @@ -83,7 +84,7 @@ revision.Text, redirectTo, references, 0, 0, redirects, new ArrayList<RelatedTitle>(), new Hashtable<String,Integer>(), date, - page.DiscussionThreadingInfo ); + DumpImporter.processLiquidThreadInfo(page.DiscussionThreadingInfo) ); log.debug("Collected "+article+" with rank "+references+" and "+redirects.size()+" redirects: "+redirects); records.add(new IndexUpdateRecord(iid,article,IndexUpdateRecord.Action.UPDATE)); log.debug(iid+": Update for "+article); Modified: branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java =================================================================== --- branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java 2009-11-22 01:21:03 UTC (rev 59325) +++ branches/lucene-search-2.1/test/org/wikimedia/lsearch/analyzers/WikiQueryParserTest.java 2009-11-22 01:24:50 UTC (rev 59326) @@ -180,4 +180,9 @@ e.printStackTrace(); } } + + public void testExtractRawFields(){ + assertEquals("[something , 0:eh heh]", Arrays.toString(WikiQueryParser.extractRawField("something ondiscussionpage:eh heh", "ondiscussionpage:"))); + assertEquals("[something , 0:eh \"heh\"]", Arrays.toString(WikiQueryParser.extractRawField("something ondiscussionpage:eh \"heh\"", "ondiscussionpage:"))); + } } _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS [at] lists https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
|