
peter at peknet
May 14, 2007, 6:58 AM
Post #7 of 8
(2103 views)
Permalink
|
|
Re: Analyzer API mods (was API request for KS::InvIndexer...)
[In reply to]
|
|
Of the remaining tasks, the attached patch addresses these: > * Add perfunctory tests for analyze_field to the relevant test files. > o 150-polyanalyzer.t > o 151-analyzer.t > o 153-lc_normalizer.t > o 154-tokenizer.t > o 155-stopalizer.t > o 156-stemmer.t > > > * Change SegWriter to use analyze_field. [NOTE: Marvin did this] > * Add optimized analyze_field implementations to LCNormalizer and > PolyAnalyzer. > * Add optimized analyze_field implementation to Tokenizer. This one's > harder because it requires some advanced XS. > > * Copy and paste the utf8ify code into StringHelper.pm. > * Add some tests to verify that it works. > * Replace calls to utf8::upgrade with utf8ify. > * We'll skip moving the utf8 conversion from InvIndexer to the > Analyzers for now, since that has other implications. > I'm not convinced that the approach I took in Tokenizer's XS was optimal, but it passes all tests. Still TODO: > * Test that you can mod a document's contents, using code nearly > identical to what will end up in the Swish/KS glue eventually. > * Expand Analyzer's docs with regard to subclassing. I expect I'll be able to do both of those once I think a little more about what I want Swish to do. -- Peter Karman . http://peknet.com/ . 
peter [at] peknet -------------- next part -------------- Index: buildlib/KinoTestUtils.pm =================================================================== --- buildlib/KinoTestUtils.pm (revision 2433) +++ buildlib/KinoTestUtils.pm (working copy) @@ -150,6 +150,13 @@ @got = $analyzer->analyze_raw($source); Test::More::is_deeply( \@got, $expected, "analyze_raw: $message" ); + + $batch = $analyzer->analyze_field({content => $source}, 'content'); + @got = (); + while ( my $token = $batch->next ) { + push @got, $token->get_text; + } + Test::More::is_deeply( \@got, $expected, "analyze_field: $message" ); } 1; Index: t/508-hits.t =================================================================== --- t/508-hits.t (revision 2433) +++ t/508-hits.t (working copy) @@ -8,7 +8,7 @@ use KinoSearch::Searcher; use KinoTestUtils qw( create_invindex ); -my @docs = ( 'a b', 'a a b', 'a a a b', 'x' ); +my @docs = ( 'a b', 'a a b', 'a a a b', 'x' ); my $invindex = create_invindex(@docs); my $searcher = KinoSearch::Searcher->new( invindex => $invindex, ); Index: t/154-tokenizer.t =================================================================== --- t/154-tokenizer.t (revision 2433) +++ t/154-tokenizer.t (working copy) @@ -1,7 +1,7 @@ use strict; use warnings; -use Test::More tests => 8; +use Test::More tests => 9; use KinoSearch::Analysis::Tokenizer; use KinoSearch::Analysis::TokenBatch; @@ -51,3 +51,13 @@ [ 'a', ' ', 'b', ' ', 'c' ], "no freakout when fed multiple tokens" ); + +$batch->reset; +$tokenizer = KinoSearch::Analysis::Tokenizer->new(); +$batch + = $tokenizer->analyze_field( { monroe => 'some like it hot' }, 'monroe' ); +@token_texts = (); +while ( my $token = $batch->next ) { + push @token_texts, $token->get_text; +} +is_deeply( \@token_texts, [ 'some', 'like', 'it', 'hot' ], "analyze_field" ); Index: t/601-queryparser.t =================================================================== --- t/601-queryparser.t (revision 2433) +++ t/601-queryparser.t (working copy) @@ 
-39,7 +39,7 @@ sub analyzer { KinoSearch::Analysis::Tokenizer->new } package main; -use Test::More tests => 210; +use Test::More tests => 212; use KinoSearch::QueryParser::QueryParser; use KinoSearch::Analysis::PolyAnalyzer; @@ -47,7 +47,7 @@ use KinoSearch::InvIndexer; use KinoSearch::Searcher; use KinoSearch::Store::RAMFolder; -use KinoSearch::Util::StringHelper qw( utf8_flag_on ); +use KinoSearch::Util::StringHelper qw( utf8_flag_on utf8ify ); use KinoTestUtils qw( create_invindex ); @@ -197,6 +197,16 @@ $hits = $searcher->search( query => $motorhead ); is( $hits->total_hits, 1, "QueryParser parses UTF-8 strings correctly" ); +$motorhead = "Mot\xF6rhead"; +utf8ify($motorhead); +$unicode_invindex = create_invindex($motorhead); +$searcher = KinoSearch::Searcher->new( invindex => $unicode_invindex, ); + +$hits = $searcher->search( query => 'Mot' ); +is( $hits->total_hits, 0, "Pre-test - indexing worked properly" ); +$hits = $searcher->search( query => $motorhead ); +is( $hits->total_hits, 1, "QueryParser utf8ifys UTF-8 strings correctly" ); + my $mf_folder = KinoSearch::Store::RAMFolder->new; my $mf_schema = MultiFieldSchema->new; my $mf_invindex = KinoSearch::InvIndex->create( Index: t/519-and_or_scorer.t =================================================================== --- t/519-and_or_scorer.t (revision 2433) +++ t/519-and_or_scorer.t (working copy) @@ -123,7 +123,7 @@ my $score_docs = $hc->get_hit_queue->score_docs; my @by_score_then_num = map { $_->get_doc_num } sort { - $b->get_score <=> $a->get_score + $b->get_score <=> $a->get_score || $a->get_doc_num <=> $b->get_doc_num } @$score_docs; my @by_num = sort { $a <=> $b } @by_score_then_num; Index: t/155-stopalizer.t =================================================================== --- t/155-stopalizer.t (revision 2433) +++ t/155-stopalizer.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 6; +use Test::More tests => 8; use KinoTestUtils qw( test_analyzer ); use 
KinoSearch::Analysis::Stopalizer; Index: t/016-varray.t =================================================================== --- t/016-varray.t (revision 2433) +++ t/016-varray.t (working copy) @@ -10,7 +10,7 @@ my ( $varray, @orig, @got ); $varray = KinoSearch::Util::VArray->new( capacity => 0 ); -@orig = 1 .. 10; +@orig = 1 .. 10; $varray->push( KinoSearch::Util::ByteBuf->new($_) ) for @orig; is( $varray->get_size, 10, "get_size after pushing 10 elements" ); Index: t/215-term_vectors.t =================================================================== --- t/215-term_vectors.t (revision 2433) +++ t/215-term_vectors.t (working copy) @@ -47,12 +47,12 @@ $invindexer->finish; my $searcher = KinoSearch::Searcher->new( invindex => $invindex ); -my $doc_vec = $searcher->fetch_doc_vec(0); +my $doc_vec = $searcher->fetch_doc_vec(0); my $term_vector = $doc_vec->term_vector( "content", "foo" ); ok( defined $term_vector, "successfully retrieved term vector" ); -$doc_vec = $searcher->fetch_doc_vec(1); +$doc_vec = $searcher->fetch_doc_vec(1); $term_vector = $doc_vec->term_vector( 'content', 'mañana' ); ok( defined $term_vector, "utf-8 term vector retrieved" ); Index: t/151-analyzer.t =================================================================== --- t/151-analyzer.t (revision 2433) +++ t/151-analyzer.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 5; +use Test::More tests => 6; use KinoSearch::Analysis::Analyzer; use KinoTestUtils qw( utf8_test_strings test_analyzer ); Index: t/505-hit_queue.t =================================================================== --- t/505-hit_queue.t (revision 2433) +++ t/505-hit_queue.t (working copy) @@ -26,7 +26,8 @@ } @docs_and_scores; my @correct_order = sort { - $b->get_score <=> $a->get_score or $a->get_doc_num <=> $b->get_doc_num + $b->get_score <=> $a->get_score + or $a->get_doc_num <=> $b->get_doc_num } @score_docs; my @correct_docs = map { $_->get_doc_num } @correct_order; my 
@correct_scores = map { $_->get_score } @correct_order; Index: t/153-lc_normalizer.t =================================================================== --- t/153-lc_normalizer.t (revision 2433) +++ t/153-lc_normalizer.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 3; +use Test::More tests => 4; use KinoTestUtils qw( test_analyzer ); use KinoSearch::Analysis::LCNormalizer; Index: t/518-or_scorer.t =================================================================== --- t/518-or_scorer.t (revision 2433) +++ t/518-or_scorer.t (working copy) @@ -83,7 +83,7 @@ perform_search( [ 'a' .. $_ ] ) for 'a' .. 'z'; sub perform_search { - my $letters = shift; + my $letters = shift; my $letter_string = join ' ', @$letters; my $subscorers @@ -125,8 +125,8 @@ my @doc_nums = keys %{ $letters{$letter} }; $counts{$_} += 1 for @doc_nums; } - my @by_count_then_num = - sort { $counts{$b} <=> $counts{$a} || $a <=> $b } + my @by_count_then_num + = sort { $counts{$b} <=> $counts{$a} || $a <=> $b } keys %counts; my @by_num = sort { $a <=> $b } @by_count_then_num; @@ -139,7 +139,7 @@ my $score_docs = $hc->get_hit_queue->score_docs; my @by_score_then_num = map { $_->get_doc_num } sort { - $b->get_score <=> $a->get_score + $b->get_score <=> $a->get_score || $a->get_doc_num <=> $b->get_doc_num } @$score_docs; my @by_num = sort { $a <=> $b } @by_score_then_num; Index: t/012-priority_queue.t =================================================================== --- t/012-priority_queue.t (revision 2433) +++ t/012-priority_queue.t (working copy) @@ -30,7 +30,7 @@ ); 1 while defined $pq->pop; # empty queue; -$pq = KinoSearch::Util::PriorityQueue->new( max_size => 5 ); +$pq = KinoSearch::Util::PriorityQueue->new( max_size => 5 ); @prioritized = (); $pq->insert($_) for ( 1 .. 10, -3, 1590 .. 1600, 5 ); @@ -50,4 +50,3 @@ $pq->insert( splice( @nums, $tick, 1 ) ); } is_deeply( $pq->pop_all, [ reverse 1 .. 
100 ], "random order insertion" ); - Index: t/156-stemmer.t =================================================================== --- t/156-stemmer.t (revision 2433) +++ t/156-stemmer.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 6; +use Test::More tests => 8; use KinoTestUtils qw( test_analyzer ); use KinoSearch::Analysis::Stemmer; Index: t/514-and_scorer.t =================================================================== --- t/514-and_scorer.t (revision 2433) +++ t/514-and_scorer.t (working copy) @@ -19,7 +19,7 @@ push @docs, ('c d x'); my $invindex = create_invindex(@docs); -my $searcher = KinoSearch::Searcher->new( invindex => $invindex, ); +my $searcher = KinoSearch::Searcher->new( invindex => $invindex, ); my $similarity = KinoSearch::Search::Similarity->new; my $c_query = KinoSearch::Search::TermQuery->new( @@ -96,4 +96,3 @@ } return \@doc_nums; } - Index: t/150-polyanalyzer.t =================================================================== --- t/150-polyanalyzer.t (revision 2433) +++ t/150-polyanalyzer.t (working copy) @@ -2,7 +2,7 @@ use warnings; use lib 'buildlib'; -use Test::More tests => 15; +use Test::More tests => 20; use KinoTestUtils qw( test_analyzer ); Index: t/013-bit_vector.t =================================================================== --- t/013-bit_vector.t (revision 2433) +++ t/013-bit_vector.t (working copy) @@ -60,7 +60,7 @@ } } -my @set_1 = ( 1 .. 3, 10, 20, 30 ); +my @set_1 = ( 1 .. 3, 10, 20, 30 ); my @set_2 = ( 2 .. 10, 25 .. 
35 ); $bit_vec = KinoSearch::Util::BitVector->new; Index: lib/KinoSearch/QueryParser/QueryParser.pm =================================================================== --- lib/KinoSearch/QueryParser/QueryParser.pm (revision 2433) +++ lib/KinoSearch/QueryParser/QueryParser.pm (working copy) @@ -3,6 +3,7 @@ package KinoSearch::QueryParser::QueryParser; use KinoSearch::Util::ToolSet; +use KinoSearch::Util::StringHelper qw( utf8ify ); use base qw( KinoSearch::Util::Class ); our %instance_vars = ( @@ -100,7 +101,7 @@ sub parse { my ( $self, $qstring_orig ) = @_; $qstring_orig = '' unless defined $qstring_orig; - utf8::upgrade($qstring_orig); + utf8ify($qstring_orig); my $default_fields = $self->{fields}; my $default_boolop = $self->{default_boolop}; my @clauses; Index: lib/KinoSearch/Analysis/PolyAnalyzer.pm =================================================================== --- lib/KinoSearch/Analysis/PolyAnalyzer.pm (revision 2433) +++ lib/KinoSearch/Analysis/PolyAnalyzer.pm (working copy) @@ -18,7 +18,7 @@ use KinoSearch::Analysis::Stemmer; sub init_instance { - my $self = shift; + my $self = shift; my $language = $self->{language} = lc( $self->{language} ); # create a default set of analyzers if language was specified @@ -61,6 +61,24 @@ } } +sub analyze_field { + my $analyzers = $_[0]->{analyzers}; + + if ( !@$analyzers ) { + return KinoSearch::Analysis::TokenBatch->new( + text => $_[1]->{ $_[2] } ); + } + elsif ( @$analyzers == 1 ) { + return $analyzers->[0]->analyze_field( $_[1], $_[2] ); + } + else { + my $batch = $analyzers->[0]->analyze_field( $_[1], $_[2] ); + $batch = $_->analyze_batch($batch) + for @{$analyzers}[ 1 .. 
$#$analyzers ]; + return $batch; + } +} + 1; __END__ Index: lib/KinoSearch/Analysis/Tokenizer.pm =================================================================== --- lib/KinoSearch/Analysis/Tokenizer.pm (revision 2433) +++ lib/KinoSearch/Analysis/Tokenizer.pm (working copy) @@ -25,12 +25,13 @@ MODULE = KinoSearch PACKAGE = KinoSearch::Analysis::Tokenizer kino_TokenBatch* -_do_analyze(self_hv, batch_or_text_sv) +_do_analyze(self_hv, batch_or_text_sv, ...) HV *self_hv; SV *batch_or_text_sv; ALIAS: analyze_batch = 1 analyze_text = 2 + analyze_field = 3 CODE: { kino_TokenBatch *batch = NULL; @@ -40,11 +41,30 @@ chy_u32_t num_code_points = 0; SV *wrapper = sv_newmortal(); RETVAL = kino_TokenBatch_new(NULL); - + char *string = NULL; + STRLEN string_len = 0; + if (ix == 1) { EXTRACT_STRUCT( batch_or_text_sv, batch, kino_TokenBatch*, "KinoSearch::Analysis::TokenBatch"); } + if (ix == 2) { + string = SvPVutf8( ST(1), string_len ); + } + if (ix == 3) { + if (items != 3) + CONFESS("analyze_text() takes 2 arguments, got %d", items - 1); + if (!SvROK(batch_or_text_sv)) + CONFESS("first argument to analyze_text() must be hash ref"); + + STRLEN len; + char *field_name = SvPV(ST(2), len); + string = SvPVutf8(extract_sv( + (HV*)SvRV(batch_or_text_sv), + field_name, + len), + string_len); + } /* extract regexp struct from qr// entity */ if (SvROK(token_re)) { @@ -63,7 +83,6 @@ SvUTF8_on(wrapper); while (1) { - STRLEN len; char *string_beg; char *string_end; char *string_arg; @@ -72,20 +91,20 @@ kino_Token *token = Kino_TokenBatch_Next(batch); if (token == NULL) break; - len = token->len; + string_len = token->len; string_beg = token->text; - string_end = string_beg + len; + string_end = string_beg + string_len; string_arg = string_beg; } else { - string_beg = SvPVutf8( ST(1), len ); - string_end = string_beg + len; + string_beg = string; + string_end = string_beg + string_len; string_arg = string_beg; } /* wrap the string in an SV to please the regex engine */ SvPVX(wrapper) = 
string_beg; - SvCUR_set(wrapper, len); + SvCUR_set(wrapper, string_len); SvPOK_on(wrapper); while ( @@ -128,7 +147,7 @@ REFCOUNT_DEC(new_token); } - if (ix == 2) /* analyze_text only runs one loop iter */ + if (ix > 1) /* analyze_text and analyze_field only run one loop iter */ break; } } Index: lib/KinoSearch/Analysis/Stopalizer.pm =================================================================== --- lib/KinoSearch/Analysis/Stopalizer.pm (revision 2433) +++ lib/KinoSearch/Analysis/Stopalizer.pm (working copy) @@ -16,7 +16,7 @@ use Lingua::StopWords; sub init_instance { - my $self = shift; + my $self = shift; my $language = $self->{language} = lc( $self->{language} ); # verify a supplied stoplist @@ -139,4 +139,3 @@ See L<KinoSearch> version 0.20. =cut - Index: lib/KinoSearch/Analysis/LCNormalizer.pm =================================================================== --- lib/KinoSearch/Analysis/LCNormalizer.pm (revision 2433) +++ lib/KinoSearch/Analysis/LCNormalizer.pm (working copy) @@ -31,6 +31,12 @@ return $_[0]->analyze_batch($batch); } +sub analyze_field { + my $batch = KinoSearch::Analysis::TokenBatch->new( + text => lc( $_[1]->{ $_[2] } ) ); + return $_[0]->analyze_batch($batch); +} + 1; __END__ Index: lib/KinoSearch/Index/SegWriter.pm =================================================================== --- lib/KinoSearch/Index/SegWriter.pm (revision 2433) +++ lib/KinoSearch/Index/SegWriter.pm (working copy) @@ -3,6 +3,7 @@ package KinoSearch::Index::SegWriter; use KinoSearch::Util::ToolSet; +use KinoSearch::Util::StringHelper qw( utf8ify ); use base qw( KinoSearch::Util::Class ); our %instance_vars = ( @@ -91,7 +92,7 @@ # upgrade fields that aren't binary to utf8 if ( !$field_spec->binary ) { - utf8::upgrade( $doc->{$field_name} ); + utf8ify( $doc->{$field_name} ); } next unless $field_spec->indexed; Index: lib/KinoSearch/Util/StringHelper.pm =================================================================== --- lib/KinoSearch/Util/StringHelper.pm 
(revision 2433) +++ lib/KinoSearch/Util/StringHelper.pm (working copy) @@ -9,6 +9,7 @@ utf8_flag_off to_base36 from_base36 + utf8ify ); 1; @@ -62,6 +63,19 @@ RETVAL = strtol(str, NULL, 36); OUTPUT: RETVAL +=for comment + +Upgrade a SV to UTF8, converting Latin1 if necessary. Equivalent to utf8::upgrade(). + +=cut + +void +utf8ify(sv) + SV *sv; +PPCODE: + sv_utf8_upgrade(sv); + + __POD__ =begin devdocs
|