
marvin at rectangular
Aug 18, 2006, 12:23 PM
Post #1 of 1
(124 views)
Permalink
|
|
Negate Operator bug fixed.
|
|
Greets, The "negate operator bug" in QueryParser -- which also affected other multi-field queries with required/negated terms -- has been stomped. QueryParser's interface has changed slightly: the 'default_field' parameter has been deprecated and is being replaced by the plural 'fields', which should be an arrayref. my $query_parser = KinoSearch::QueryParser::QueryParser->new( analyzer => $analyzer, fields => [ 'body', 'title' ], ); I decided not to subclass QueryParser, as Java Lucene does with MultiFieldQueryParser, since the ability to spec multiple fields is really the only difference between those two classes, and I think that it's more common to be searching multiple fields than just one. A happy secondary effect is that Searcher's _prepare_simple_search method is now a little easier to grok. Tony had suggested that it would be nice if the transition from searching on strings to creating Query objects wasn't quite so steep; it's just been rounded off a bit. Since the svn trunk has now taken on the UTF-8 challenge and broken backwards compatibility, a maintenance branch has been split off from the 0.12 release. Version 0.13, to be released shortly, will be taken from this branch, as will all future 0.1x releases. What is now in trunk will probably come out as 0.20. Marvin Humphrey -- I'm looking for a part-time job. 
slothbear:~/projects/ksmaint marvin$ svn diff Index: lib/KinoSearch/QueryParser/QueryParser.pm =================================================================== --- lib/KinoSearch/QueryParser/QueryParser.pm (revision 1032) +++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy) @@ -9,7 +9,8 @@ # constructor args / members analyzer => undef, default_boolop => 'OR', - default_field => undef, + default_field => undef, # back compat + fields => undef, # members bool_groups => {}, phrases => {}, @@ -41,6 +42,20 @@ # create labels which won't appear in search strings $self->{phrase_re} = qr/^(_phrase$randstring\d+)/; $self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/; + + # verify fields param + my $fields = + defined $self->{fields} + ? $self->{fields} + : [ $self->{default_field} ]; + croak("Required parameter 'fields' not supplied as arrayref") + unless ( defined $fields + and reftype($fields) eq 'ARRAY' ); + $self->{fields} = $fields; + + # verify analyzer + croak("Missing required param 'analyzer'") + unless a_isa_b( $self->{analyzer}, 'KinoSearch::Analysis::Analyzer' ); } # regex matching a quoted string @@ -83,7 +98,7 @@ sub parse { my ( $self, $qstring_orig ) = @_; $qstring_orig = '' unless defined $qstring_orig; - my $default_field = $self->{default_field}; + my $default_fields = $self->{fields}; my $default_boolop = $self->{default_boolop}; my @clauses; @@ -124,7 +139,7 @@ } # set the field - my $field = s/^$field_re// ? $1 : $default_field; + my $fields = s/^$field_re// ? [$1] : $default_fields; # if a phrase label is detected... 
if (s/$self->{phrase_re}//) { @@ -133,24 +148,11 @@ # retreive the text and analyze it my $orig_phrase_text = delete $self->{phrases}{$1}; my $token_texts = $self->_analyze($orig_phrase_text); - - # create a TermQuery, a PhraseQuery, or nothing - if ( @$token_texts == 1 ) { - my $term = KinoSearch::Index::Term->new( $field, - $token_texts->[0] ); - $query = KinoSearch::Search::TermQuery->new( term => $term ); + if (@$token_texts) { + my $query = $self->_get_field_query( $fields, $token_texts ); + push @clauses, { query => $query, occur => $occur } + if defined $query; } - elsif ( @$token_texts > 1 ) { - $query = KinoSearch::Search::PhraseQuery->new; - for my $token_text (@$token_texts) { - $query->add_term( - KinoSearch::Index::Term->new( $field, $token_text ), - ); - } - } - - push @clauses, { query => $query, occur => $occur } - if defined $query; } # if a label indicating a bool group is detected... elsif (s/$self->{bool_group_re}//) { @@ -162,12 +164,9 @@ # what's left is probably a term elsif (s/([^"(\s]+)//) { my $token_texts = $self->_analyze($1); - my @terms = map { KinoSearch::Index::Term->new( $field, $_ ) } - grep { $_ ne '' } - @$token_texts; - for my $term (@terms) { - my $query - = KinoSearch::Search::TermQuery->new( term => $term ); + @$token_texts = grep { $_ ne '' } @$token_texts; + if (@$token_texts) { + my $query = $self->_get_field_query( $fields, $token_texts ); push @clauses, { occur => $occur, query => $query }; } } @@ -190,6 +189,50 @@ } } +# Wrap a TermQuery/PhraseQuery to deal with multiple fields. 
+sub _get_field_query { + my ( $self, $fields, $token_texts ) = @_; + + my @queries = grep { defined $_ } + map { $self->_gen_single_field_query( $_, $token_texts ) } @ $fields; + + if ( @queries == 0 ) { + return; + } + elsif ( @queries == 1 ) { + return $queries[0]; + } + else { + my $wrapper_query = KinoSearch::Search::BooleanQuery->new; + for my $query (@queries) { + $wrapper_query->add_clause( + query => $query, + occur => 'SHOULD', + ); + } + return $wrapper_query; + } +} + +# Create a TermQuery, a PhraseQuery, or nothing. +sub _gen_single_field_query { + my ( $self, $field, $token_texts ) = @_; + + if ( @$token_texts == 1 ) { + my $term = KinoSearch::Index::Term->new( $field, $token_texts->[0] ); + return KinoSearch::Search::TermQuery->new( term => $term ); + } + elsif ( @$token_texts > 1 ) { + my $phrase_query = KinoSearch::Search::PhraseQuery->new; + for my $token_text (@$token_texts) { + $phrase_query->add_term( + KinoSearch::Index::Term->new( $field, $token_text ), + ); + } + return $phrase_query; + } +} + # break a string into tokens sub _analyze { my ( $self, $string ) = @_; @@ -247,8 +290,8 @@ =head1 SYNOPSIS my $query_parser = KinoSearch::QueryParser::QueryParser->new( - analyzer => $analyzer, - default_field => 'bodytext', + analyzer => $analyzer, + fields => [ 'bodytext' ], ); my $query = $query_parser->parse( $query_string ); my $hits = $searcher->search( query => $query ); @@ -286,7 +329,7 @@ Field-specific terms, in the form of C<fieldname:termtext>. (The field specified by fieldname will be used instead of the QueryParser's default -field). +fields). =back @@ -295,9 +338,9 @@ =head2 new my $query_parser = KinoSearch::QueryParser::QueryParser->new( - analyzer => $analyzer, # required - default_field => 'bodytext', # required - default_boolop => 'AND', # default: 'OR' + analyzer => $analyzer, # required + fields => [ 'bodytext' ], # required + default_boolop => 'AND', # default: 'OR' ); Constructor. 
Takes hash-style parameters: @@ -313,12 +356,15 @@ =item * -B<default_field> - the name of the (only) field which will be searched -against. If you need to search multiple fields, you need multiple QueryParser -objects. +B<fields> - the names of the fields which will be searched against. Must be +supplied as an arrayref. =item * +B<default_field> - deprecated. Use C<fields> instead. + +=item * + B<default_boolop> - two possible values: 'AND' and 'OR'. The default is 'OR', which means: return documents which match any of the query terms. If you want only documents which match all of the query terms, set this to 'AND'. Index: lib/KinoSearch/Searcher.pm =================================================================== --- lib/KinoSearch/Searcher.pm (revision 1032) +++ lib/KinoSearch/Searcher.pm (working copy) @@ -82,23 +82,13 @@ sub _prepare_simple_search { my ( $self, $query_string ) = @_; - # add each parsed query as a boolean clause to a super-query - my $super_query = KinoSearch::Search::BooleanQuery->new; my $indexed_field_names = $self->{reader}->get_field_names( indexed => 1 ); - for my $field_name (@$indexed_field_names) { - my $query_parser = KinoSearch::QueryParser::QueryParser->new( - default_field => $field_name, - analyzer => $self->{analyzer}, - ); - my $sub_query = $query_parser->parse($query_string); - $super_query->add_clause( - query => $sub_query, - occur => 'SHOULD', - ); - } - - return $super_query; + my $query_parser = KinoSearch::QueryParser::QueryParser->new( + fields => $indexed_field_names, + analyzer => $self->{analyzer}, + ); + return $query_parser->parse($query_string); } my %search_hit_collector_args = ( Index: MANIFEST =================================================================== --- MANIFEST (revision 1032) +++ MANIFEST (working copy) @@ -145,6 +145,7 @@ t/601-queryparser.t t/602-boosts.t t/603-query_boosts.t +t/604-simple_search.t t/701-uscon.t t/999-remove_invindexes.t t/benchmarks/extract_reuters.plx
|