
marvin at rectangular
Sep 17, 2008, 8:01 PM
Post #1 of 1
(2108 views)
Permalink
|
|
r3878 - in trunk: c_src/KinoSearch/Posting perl perl/lib/KinoSearch/Posting perl/t
|
|
Author: creamyg Date: 2008-09-17 20:01:28 -0700 (Wed, 17 Sep 2008) New Revision: 3878 Added: trunk/perl/t/400-match_posting.t Modified: trunk/c_src/KinoSearch/Posting/MatchPosting.bp trunk/c_src/KinoSearch/Posting/MatchPosting.c trunk/c_src/KinoSearch/Posting/ScorePosting.bp trunk/c_src/KinoSearch/Posting/ScorePosting.c trunk/perl/MANIFEST trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm Log: Finish a preliminary implementation of MatchPosting. This version includes freq, which is not ideal -- but the primary rationale is testing, not a public API. Modified: trunk/c_src/KinoSearch/Posting/MatchPosting.bp =================================================================== --- trunk/c_src/KinoSearch/Posting/MatchPosting.bp 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/c_src/KinoSearch/Posting/MatchPosting.bp 2008-09-18 03:01:28 UTC (rev 3878) @@ -2,8 +2,6 @@ /** Match but not score documents. * - * TODO: This class is not yet fully implemented. - * * Use MatchPosting for fields which only need to be matched, not scored. For * instance, if you need to determine that that a query matches a particular * category, but don't want the match to contribute to the document score, use @@ -13,6 +11,7 @@ extends KinoSearch::Posting { Similarity *sim; + u32_t freq; /* Constructor. */ @@ -23,14 +22,31 @@ init(MatchPosting *self, Similarity *similarity); void + Destroy(MatchPosting *self); + + incremented MatchPosting* + Clone(MatchPosting *self); + + void + Read_Record(MatchPosting *self, InStream *instream); + + incremented RawPosting* + Read_Raw(MatchPosting *self, InStream *instream, i32_t last_doc_num, + CharBuf *term_text, MemoryPool *mem_pool); + + void + Add_Inversion_To_Pool(MatchPosting *self, PostingPool *post_pool, + Inversion *inversion, FieldSpec *fspec, + i32_t doc_num, float doc_boost, + float length_norm); + + void Reset(MatchPosting *self, i32_t doc_num); public incremented MatchPostingScorer* Make_Scorer(MatchPosting *self, Similarity *sim, PostingList *plist, Compiler *compiler); - void - Destroy(MatchPosting *self); } class KinoSearch::Posting::MatchPostingScorer cnick MatchPostScorer Modified: trunk/c_src/KinoSearch/Posting/MatchPosting.c =================================================================== --- trunk/c_src/KinoSearch/Posting/MatchPosting.c 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/c_src/KinoSearch/Posting/MatchPosting.c 2008-09-18 03:01:28 UTC (rev 3878) @@ -1,10 +1,22 @@ #include "KinoSearch/Util/ToolSet.h" #include "KinoSearch/Posting/MatchPosting.h" +#include "KinoSearch/Analysis/Inversion.h" +#include "KinoSearch/Analysis/Token.h" +#include "KinoSearch/FieldSpec.h" #include "KinoSearch/Index/PostingList.h" +#include "KinoSearch/Index/PostingPool.h" +#include "KinoSearch/Posting/RawPosting.h" #include "KinoSearch/Search/Similarity.h" #include "KinoSearch/Search/Compiler.h" +#include "KinoSearch/Store/InStream.h" +#include "KinoSearch/Util/MemoryPool.h" +#define MAX_RAW_POSTING_LEN(_text_len) \ + ( sizeof(RawPosting) \ + + _text_len + 1 /* term text content */ \ + ) + MatchPosting* MatchPost_new(Similarity *sim) { @@ -26,12 +38,72 @@ FREE_OBJ(self); } +MatchPosting* +MatchPost_clone(MatchPosting *self) +{ + MatchPosting *evil_twin = (MatchPosting*)VTable_Make_Obj(self->_); + return MatchPost_init(evil_twin, self->sim); +} + void MatchPost_reset(MatchPosting *self, i32_t doc_num) { self->doc_num = doc_num; } +void +MatchPost_read_record(MatchPosting *self, InStream *instream) +{ + const u32_t doc_code = InStream_Read_C32(instream); + const u32_t doc_delta = doc_code >> 1; + + /* Apply delta doc and retrieve freq. */ + self->doc_num += doc_delta; + if (doc_code & 1) + self->freq = 1; + else + self->freq = InStream_Read_C32(instream); +} + +RawPosting* +MatchPost_read_raw(MatchPosting *self, InStream *instream, i32_t last_doc_num, + CharBuf *term_text, MemoryPool *mem_pool) +{ + const size_t text_size = CB_Get_Size(term_text); + const u32_t doc_code = InStream_Read_C32(instream); + const u32_t delta_doc = doc_code >> 1; + const i32_t doc_num = last_doc_num + delta_doc; + const u32_t freq = (doc_code & 1) + ? 1 + : InStream_Read_C32(instream); + size_t raw_post_bytes = MAX_RAW_POSTING_LEN(text_size); + void *const allocation = MemPool_Grab(mem_pool, raw_post_bytes); + + return RawPost_new(allocation, doc_num, freq, term_text->ptr, text_size); +} + +void +MatchPost_add_inversion_to_pool(MatchPosting *self, PostingPool *post_pool, + Inversion *inversion, FieldSpec *fspec, + i32_t doc_num, float doc_boost, + float length_norm) +{ + MemoryPool *mem_pool = post_pool->mem_pool; + Token **tokens; + u32_t freq; + + Inversion_Reset(inversion); + while ( (tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL ) { + Token *token = *tokens; + u32_t raw_post_bytes = MAX_RAW_POSTING_LEN(token->len); + RawPosting *raw_posting = RawPost_new( + MemPool_Grab(mem_pool, raw_post_bytes), doc_num, freq, + token->text, token->len + ); + PostPool_Add_Elem(post_pool, (Obj*)raw_posting); + } +} + MatchPostingScorer* MatchPost_make_scorer(MatchPosting *self, Similarity *sim, PostingList *plist, Compiler *compiler) Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.bp =================================================================== --- trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/c_src/KinoSearch/Posting/ScorePosting.bp 2008-09-18 03:01:28 UTC (rev 3878) @@ -9,7 +9,6 @@ class KinoSearch::Posting::ScorePosting cnick ScorePost extends KinoSearch::Posting::MatchPosting { - u32_t freq; float weight; u32_t *prox; u32_t prox_cap; Modified: trunk/c_src/KinoSearch/Posting/ScorePosting.c =================================================================== --- trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/c_src/KinoSearch/Posting/ScorePosting.c 2008-09-18 03:01:28 UTC (rev 3878) @@ -46,8 +46,7 @@ ScorePosting* ScorePost_clone(ScorePosting *self) { - VTable *vtable = self->_; - ScorePosting *evil_twin = (ScorePosting*)CREATE(NULL, (*vtable)); + ScorePosting *evil_twin = (ScorePosting*)VTable_Make_Obj(self->_); ScorePost_init(evil_twin, self->sim); if (self->freq) { Modified: trunk/perl/MANIFEST =================================================================== --- trunk/perl/MANIFEST 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/perl/MANIFEST 2008-09-18 03:01:28 UTC (rev 3878) @@ -307,6 +307,7 @@ t/308-simple.t t/309-span.t t/310-heat_map.t +t/400-match_posting.t t/501-termquery.t t/502-phrasequery.t t/504-similarity.t Modified: trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm =================================================================== --- trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/perl/lib/KinoSearch/Posting/MatchPosting.pm 2008-09-18 03:01:28 UTC (rev 3878) @@ -18,6 +18,7 @@ { "KinoSearch::Posting::MatchPosting" => { make_constructors => ["new"], + make_getters => [qw( freq )], # make_pod => { # synopsis => $synopsis, # } Modified: trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm =================================================================== --- trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm 2008-09-18 01:18:15 UTC (rev 3877) +++ trunk/perl/lib/KinoSearch/Posting/ScorePosting.pm 2008-09-18 03:01:28 UTC (rev 3878) @@ -19,7 +19,7 @@ { "KinoSearch::Posting::ScorePosting" => { make_constructors => ["new"], - make_getters => [qw( freq weight )], + make_getters => [qw( weight )], # make_pod => { # synopsis => $synopsis, # } Added: trunk/perl/t/400-match_posting.t =================================================================== --- trunk/perl/t/400-match_posting.t (rev 0) +++ trunk/perl/t/400-match_posting.t 2008-09-18 03:01:28 UTC (rev 3878) @@ -0,0 +1,78 @@ +use strict; +use warnings; +use lib 'buildlib'; + +package MatchSchema::MatchOnly; +use base qw( KinoSearch::FieldSpec::TextField ); +use KinoSearch::Posting::MatchPosting; + +sub posting { + if ( @_ == 2 ) { + return KinoSearch::Posting::MatchPosting->new( similarity => $_[1] ); + } + else { + shift; + return KinoSearch::Posting::MatchPosting->new(@_); + } +} + +package MatchSchema; +use base qw( KinoSearch::Schema ); +use KinoSearch::Analysis::Tokenizer; + +our %fields = ( content => 'MatchSchema::MatchOnly', ); + +sub analyzer { KinoSearch::Analysis::Tokenizer->new } + +package main; + +use KinoSearch::Test::TestUtils qw( get_uscon_docs ); +use KinoSearch::Test::TestSchema; +use Test::More tests => 6; + +my $uscon_docs = get_uscon_docs(); +my $match_invindex = make_index( MatchSchema->new, $uscon_docs ); +my $score_invindex + = make_index( KinoSearch::Test::TestSchema->new, $uscon_docs ); + +my $match_searcher = KinoSearch::Searcher->new( invindex => $match_invindex ); +my $score_searcher = KinoSearch::Searcher->new( invindex => $score_invindex ); + +for (qw( land of the free )) { + my $match_got = hit_nums_array( $match_searcher, $_ ); + my $score_got = hit_nums_array( $score_searcher, $_ ); + is_deeply( $match_got, $score_got, "same hits for '$_'" ); +} + +my $qstring = '"the legislature"'; +my $should_have_hits = hit_nums_array( $score_searcher, $qstring ); +my $should_be_empty = hit_nums_array( $match_searcher, $qstring ); +ok( scalar @$should_have_hits, "successfully scored phrase $qstring" ); +ok( !scalar @$should_be_empty, "no hits matched for phrase $qstring" ); + +sub make_index { + my ( $schema, $docs ) = @_; + my $folder = KinoSearch::Store::RAMFolder->new; + my $invindex = KinoSearch::InvIndex->clobber( + schema => $schema, + folder => $folder, + ); + + my $invindexer = KinoSearch::InvIndexer->new( invindex => $invindex, ); + $invindexer->add_doc( { content => $_->{bodytext} } ) for values %$docs; + $invindexer->finish; + return $invindex; +} + +sub hit_nums_array { + my ( $searcher, $query_string ) = @_; + my $query = $searcher->glean_query($query_string); + + my $bit_vec = KinoSearch::Util::BitVector->new( + capacity => $searcher->max_docs + 1 ); + my $bit_collector = KinoSearch::Search::HitCollector::BitCollector->new( + bit_vector => $bit_vec, ); + $searcher->collect( query => $query, collector => $bit_collector ); + return $bit_vec->to_array->to_arrayref; +} + _______________________________________________ kinosearch-commits mailing list kinosearch-commits [at] rectangular http://www.rectangular.com/mailman/listinfo/kinosearch-commits
|