
marvin at rectangular
Jul 31, 2008, 8:57 AM
Post #1 of 1
(386 views)
Permalink
|
|
r3687 - in trunk: c_src/KinoSearch/Highlight perl/lib perl/lib/KinoSearch/Highlight perl/t
|
|
Author: creamyg Date: 2008-07-31 08:57:38 -0700 (Thu, 31 Jul 2008) New Revision: 3687 Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp trunk/c_src/KinoSearch/Highlight/Highlighter.c trunk/perl/lib/KinoSearch.pm trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm trunk/perl/t/303-highlighter.t Log: Represent sentence boundaries in the highlighter using an array of Spans, communicating both offset and length rather than an array of integers communicating offset only. Replace Find_Sentence_Boundaries() with Find_Sentences(). Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp =================================================================== --- trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 03:01:32 UTC (rev 3686) +++ trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-31 15:57:38 UTC (rev 3687) @@ -26,9 +26,11 @@ public incremented CharBuf* Encode(Highlighter *self, CharBuf *text); - /** Find the sentence boundaries within the specified range, returning - * them as an array of offsets (offset from 0, not from - * <code>start</code>). + /** Find sentence boundaries within the specified range, returning them as + * an array of Spans. The "offset" of each Span indicates the start of + * the sentence, and is measured from 0, not from <code>offset</code>. + * The Span's "length" member indicates the sentence length in code + * points. * * @param text The string to scan. * @param offset The place to start looking for offsets, measured in @@ -37,9 +39,9 @@ * scan. The default value of 0 is a sentinel which indicates to scan * until the end of the string. 
*/ - incremented IntMap* - Find_Sentence_Boundaries(Highlighter *self, CharBuf *text, i32_t offset = 0, - i32_t length = 0); + incremented VArray* + Find_Sentences(Highlighter *self, CharBuf *text, i32_t offset = 0, + i32_t length = 0); public incremented CharBuf* Highlight(Highlighter *self, const CharBuf *text); @@ -75,7 +77,7 @@ i32_t Raw_Excerpt(Highlighter *self, const CharBuf *field_val, const CharBuf *fragment, CharBuf *raw_excerpt, i32_t top, - IntMap *edges); + VArray *sentences); /* Take the text in raw_excerpt, add highlight tags, encode, and place the * result into [highlighted]. Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.c =================================================================== --- trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 03:01:32 UTC (rev 3686) +++ trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-31 15:57:38 UTC (rev 3687) @@ -10,7 +10,6 @@ #include "KinoSearch/Search/Span.h" #include "KinoSearch/Index/DocVector.h" #include "KinoSearch/Util/ByteBuf.h" -#include "KinoSearch/Util/IntMap.h" #include "KinoSearch/Util/Native.h" /* If Highlighter_Encode has been overridden, return its output. 
If not, @@ -102,11 +101,11 @@ self->excerpt_length * 0.6666); i32_t top = Highlighter_Find_Best_Fragment(self, (CharBuf*)field_val, (ViewCharBuf*)&fragment, heat_map); - IntMap *edges = Highlighter_Find_Sentence_Boundaries(self, + VArray *sentences = Highlighter_Find_Sentences(self, (CharBuf*)field_val, top, self->window_width); top = Highlighter_Raw_Excerpt(self, (CharBuf*)field_val, - (CharBuf*)&fragment, raw_excerpt, top, edges); + (CharBuf*)&fragment, raw_excerpt, top, sentences); VA_Sort(score_spans, Span_compare); Highlighter_highlight_excerpt(self, score_spans, raw_excerpt, highlighted, top); @@ -115,7 +114,7 @@ REFCOUNT_DEC(score_spans); REFCOUNT_DEC(heat_map); REFCOUNT_DEC(raw_excerpt); - REFCOUNT_DEC(edges); + REFCOUNT_DEC(sentences); return highlighted; } @@ -194,7 +193,7 @@ i32_t Highlighter_raw_excerpt(Highlighter *self, const CharBuf *field_val, const CharBuf *fragment, CharBuf *raw_excerpt, - i32_t top, IntMap *edges) + i32_t top, VArray *sentences) { bool_t found_starting_edge = false; bool_t found_ending_edge = false; @@ -203,11 +202,12 @@ i32_t this_excerpt_len; /* Try to find a starting sentence boundary. */ - if (edges->size) { + if (sentences->size) { u32_t i; - for (i = 0; i < edges->size; i++) { - i32_t candidate = IntMap_Get(edges, i); + for (i = 0; i < sentences->size; i++) { + Span *sentence = (Span*)VA_Fetch(sentences, i); + i32_t candidate = sentence->offset; if (candidate < top){ continue; @@ -235,13 +235,14 @@ } /* Try to end on a sentence boundary (but don't try very hard). */ - if(edges->size) { + if(sentences->size) { u32_t i; ZombieCharBuf start_trimmed = ZCB_make(fragment); ZCB_Nip(&start_trimmed, start - top); - for (i = edges->size; i--; ) { - i32_t last_edge = IntMap_Get(edges, i); + for (i = sentences->size; i--; ) { + Span *sentence = (Span*)VA_Fetch(sentences, i); + i32_t last_edge = sentence->offset + sentence->length; if (last_edge <= start) { /* Sanity. 
*/ @@ -393,78 +394,97 @@ REFCOUNT_DEC(encode_buf); } -static INLINE void -add_bound(i32_t pos, ByteBuf *bounds_bb) { - if (bounds_bb->cap - bounds_bb->len < sizeof(i32_t)) { - BB_Grow(bounds_bb, bounds_bb->len + 10 * sizeof(i32_t)); - } - *(i32_t*)BBEND(bounds_bb) = pos; - bounds_bb->len += sizeof(u32_t); +static void +close_sentence(VArray *sentences, Span **sentence_ptr, i32_t sentence_end) +{ + Span *sentence = *sentence_ptr; + i32_t length = sentence_end - Span_Get_Offset(sentence); + Span_Set_Length(sentence, length); + VA_Push(sentences, (Obj*)sentence); + REFCOUNT_DEC(sentence); + *sentence_ptr = NULL; } -IntMap* -Highlighter_find_sentence_boundaries(Highlighter *self, CharBuf *text, - i32_t offset, i32_t length) +VArray* +Highlighter_find_sentences(Highlighter *self, CharBuf *text, i32_t offset, + i32_t length) { - ByteBuf *bounds_bb = BB_new(10 * sizeof(u32_t)); - i32_t max = length == 0 - ? I32_MAX - : offset + length; - ZombieCharBuf substring = ZCB_make(text); - i32_t pos = ZCB_Trim_Top(&substring); + /* When [sentence] is NULL, that means a sentence start has not yet been + * found. When it is a Span object, we have a start, but we haven't found + * an end. Once we find the end, we add the sentence to the [sentences] + * array and set [sentence] back to NULL to indicate that we're looking + * for a start once more. + */ + Span *sentence = NULL; + VArray *sentences = VA_new(10); + i32_t excerpt_end = length == 0 + ? I32_MAX + : offset + length; + ZombieCharBuf fragment = ZCB_make(text); + i32_t pos; UNUSED_VAR(self); - if (offset <= pos) { + /* Our first task will be to find a sentence that either starts at the top + * of the fragment, or overlaps its start. Starting at 0 -- i.e. the top + * of the field -- is a special case. We define the first non-whitespace + * character to begin a sentence, rather than look for the first character + * following a period and whitespace. 
Everywhere else, we have to define + * sentence starts based on a sentence end that has just passed by. + */ + if (offset == 0) { + pos = ZCB_Trim_Top(&fragment); /* Assume that first non-whitespace character begins a sentence. */ - if (pos < max && ZCB_Get_Size(&substring) > 0) { - add_bound(pos, bounds_bb); + if (pos < excerpt_end && ZCB_Get_Size(&fragment) > 0) { + sentence = Span_new(pos, 0, 0.0); } } - pos += ZCB_Nip(&substring, offset - pos); + else { + pos = ZCB_Nip(&fragment, offset); + } - while (pos < max) { - u32_t code_point = ZCB_Code_Point_At(&substring, 0); + while (1) { + u32_t code_point = ZCB_Code_Point_At(&fragment, 0); if (!code_point) { - /* End of substring. Add a bound if it's also the end of the field, + /* End of fragment. If we have a sentence open, close it, * then bail. */ - if (substring.ptr == CBEND(text)) - add_bound(pos, bounds_bb); - + if (sentence) close_sentence(sentences, &sentence, pos); break; } else if (code_point == '.') { u32_t whitespace_count; - pos += ZCB_Nip(&substring, 1); /* advance past "." */ + pos += ZCB_Nip(&fragment, 1); /* advance past "." */ - if (pos == max && ZCB_Get_Size(&substring) == 0) { + if (pos == excerpt_end && ZCB_Get_Size(&fragment) == 0) { /* Period ending the field string. */ - add_bound(pos, bounds_bb); + if (sentence) close_sentence(sentences, &sentence, pos); break; } - else if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) { + else if (0 != (whitespace_count = ZCB_Trim_Top(&fragment))) { + /* We've found a period followed by whitespace. Close out the + * existing sentence, if there is one. */ + if (sentence) close_sentence(sentences, &sentence, pos); + /* Advance past whitespace. */ pos += whitespace_count; - if (pos < max && ZCB_Get_Size(&substring) > 0) { - /* Not at the end of the string? Then success! */ - add_bound(pos, bounds_bb); + if (pos < excerpt_end && ZCB_Get_Size(&fragment) > 0) { + /* Not at the end of the string? Then we've found a + * sentence start. 
*/ + sentence = Span_new(pos, 0, 0.0); } } + + /* We may not have reached the end of the field yet, but it's + * entirely possible that our last sentence overlapped the end of + * the fragment -- in which case, it's time to bail. */ + if (pos >= excerpt_end) break; } else { - ZCB_Nip(&substring, 1); + ZCB_Nip(&fragment, 1); pos++; } } - { - u32_t num_bounds = bounds_bb->len / sizeof(u32_t); - IntMap *retval; - i32_t *ints = MALLOCATE(num_bounds, i32_t); - memcpy(ints, bounds_bb->ptr, bounds_bb->len); - retval = IntMap_new(ints, num_bounds); - REFCOUNT_DEC(bounds_bb); - return retval; - } + return sentences; } CharBuf* Modified: trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm =================================================================== --- trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-31 03:01:32 UTC (rev 3686) +++ trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-31 15:57:38 UTC (rev 3687) @@ -18,7 +18,7 @@ get_pre_tag set_post_tag get_post_tag - _fsb|find_sentence_boundaries ) + _find_sentences|find_sentences ) ], make_getters => [qw( searchable query excerpt_length compiler field )], Modified: trunk/perl/lib/KinoSearch.pm =================================================================== --- trunk/perl/lib/KinoSearch.pm 2008-07-31 03:01:32 UTC (rev 3686) +++ trunk/perl/lib/KinoSearch.pm 2008-07-31 15:57:38 UTC (rev 3687) @@ -392,9 +392,9 @@ { package KinoSearch::Highlight::Highlighter; - sub find_sentence_boundaries { + sub find_sentences { my $self = shift; - return $self->_fsb(@_)->to_arrayref; + return $self->_find_sentences(@_)->to_pobj; } } Modified: trunk/perl/t/303-highlighter.t =================================================================== --- trunk/perl/t/303-highlighter.t 2008-07-31 03:01:32 UTC (rev 3686) +++ trunk/perl/t/303-highlighter.t 2008-07-31 15:57:38 UTC (rev 3687) @@ -81,7 +81,7 @@ my $top = $hl->_find_best_fragment( fragment => $target, field_val => $field_val, - heat_map => make_heat_map( [ 2, 1 ] ), + 
heat_map => make_heat_map( [ 2, 1, 1.0 ] ), ); is( $target->to_perl, "$phi $phi b", "Find_Best_Fragment" ); is( $top, 2, "correct offset returned by Find_Best_Fragment" ); @@ -90,7 +90,7 @@ $top = $hl->_find_best_fragment( fragment => $target, field_val => $field_val, - heat_map => make_heat_map( [ 2, 1 ] ), + heat_map => make_heat_map( [ 2, 1, 1.0 ] ), ); is( $target->to_perl, $field_val->to_perl, "Find_Best_Fragment returns whole field when field is short" ); @@ -100,7 +100,7 @@ $top = $hl->_find_best_fragment( fragment => $target, field_val => $field_val, - heat_map => make_heat_map( [ 6, 2 ] ), + heat_map => make_heat_map( [ 6, 2, 1.0 ] ), ); is( $target->to_perl, "b$phi$phi", "Find_Best_Fragment shifts left to deal with overrun" ); @@ -110,7 +110,7 @@ $top = $hl->_find_best_fragment( fragment => $target, field_val => $field_val, - heat_map => make_heat_map( [ 0, 1 ] ), + heat_map => make_heat_map( [ 0, 1, 1.0 ] ), ); is( $target->to_perl, "a$phi" . "bcd", @@ -132,9 +132,9 @@ fragment => "Ook. Urk.", raw_excerpt => $target, top => 0, - edges => make_int_map( 0, 6 ), + sentences => make_spans( [ 0, 4, 0 ], [ 6, 4, 0 ] ), ); -is( $target->to_perl, "Ook. ", "Raw_Excerpt at top" ); +is( $target->to_perl, "Ook.", "Raw_Excerpt at top" ); is( $top, 0, "top still 0" ); $target = make_cb(""); @@ -143,29 +143,30 @@ fragment => ". Urk. I", raw_excerpt => $target, top => 3, - edges => make_int_map( 6, 12 ), + sentences => make_spans( [ 6, 4, 0 ], [ 12, 4, 0 ] ), ); -is( $target->to_perl, "Urk. 
", "Raw_Excerpt in middle, with 2 bounds" ); +is( $target->to_perl, "Urk.", "Raw_Excerpt in middle, with 2 bounds" ); is( $top, 6, "top in the middle modified by Raw_Excerpt" ); -$target = make_cb(""); -$top = $hl->_raw_excerpt( - field_val => "Ook urk ick iz", - fragment => "ick iz", +$target = make_cb(""); +$field_val = "Ook urk ick i."; +$top = $hl->_raw_excerpt( + field_val => $field_val, + fragment => "ick i.", raw_excerpt => $target, top => 8, - edges => make_int_map(14), + sentences => make_spans( [ 0, length($field_val), 0 ] ), ); -is( $target->to_perl, "... iz", "Ellipsis at top" ); +is( $target->to_perl, "... i.", "Ellipsis at top" ); is( $top, 8, "top correct when leading ellipsis inserted" ); $target = make_cb(""); -$top = $hl->_raw_excerpt( - field_val => "Urk. Iz no good.", +$field_val = "Urk. Iz no good.", $top = $hl->_raw_excerpt( + field_val => $field_val, fragment => " Iz no go", raw_excerpt => $target, top => 4, - edges => make_int_map(6), + sentences => make_spans( [ 6, length($field_val) - 6, 0 ] ), ); is( $target->to_perl, "Iz...", "Ellipsis at end" ); is( $top, 6, "top trimmed" ); @@ -189,7 +190,7 @@ $target = make_cb(""); $hl->_highlight_excerpt( raw_excerpt => "$phi $phi $phi", - spans => make_spans( [ 2, 1 ] ), + spans => make_spans( [ 2, 1, 1.0 ] ), top => 0, highlighted => $target, ); @@ -202,7 +203,7 @@ $target = make_cb(""); $hl->_highlight_excerpt( raw_excerpt => "$phi $phi $phi", - spans => make_spans( [ 3, 1 ] ), + spans => make_spans( [ 3, 1, 1.0 ] ), top => 1, highlighted => $target, ); @@ -298,74 +299,86 @@ "... but not another field" ); -my $sentences = 'This is a sentence. ' x 15; +my $sentence_text = 'This is a sentence. 
' x 15; $hl = KinoSearch::Highlight::Highlighter->new( searchable => $searcher, query => $q, field => 'content', ); -is_deeply( - $hl->find_sentence_boundaries( - text => $sentences, - offset => 101, - length => 50, - ), - [ 120, 140 ], - 'find_sentence_boundaries in list context with explicit args' +my $sentences = $hl->find_sentences( + text => $sentence_text, + offset => 101, + length => 50, ); is_deeply( - $hl->find_sentence_boundaries( - text => $sentences, - offset => 101, - length => 4, - ), - [], - 'fsb with explicit args, finding nothing' + spans_to_arg_array($sentences), + [ [ 120, 19, 0 ], [ 140, 19, 0 ] ], + 'find_sentences with explicit args' ); -is_deeply( - $hl->find_sentence_boundaries( text => $sentences ), - [ 0, 20, 40, 60, 80, 100, 120, 140, - 160, 180, 200, 220, 240, 260, 280, 300 - ], - 'fsb with default offset and length' + +$sentences = $hl->find_sentences( + text => $sentence_text, + offset => 101, + length => 4, ); +is_deeply( spans_to_arg_array($sentences), + [], 'find_sentences with explicit args, finding nothing' ); + +my @expected; +for my $i ( 0 .. 
14 ) { + push @expected, [ $i * 20, 19, 0 ]; +} +$sentences = $hl->find_sentences( text => $sentence_text ); +is_deeply( spans_to_arg_array($sentences), + \@expected, 'find_sentences with default offset and length' ); + +$sentences = $hl->find_sentences( text => ' Foo' ); is_deeply( - $hl->find_sentence_boundaries( text => ' Foo' ), - [ 1, 4 ], + spans_to_arg_array($sentences), + [ [ 1, 3, 0 ] ], "Skip leading whitespace but get first sentence" ); $hl = MyHighlighter->new( - searchable => $searcher, - query => "blind", - field => 'content', + searchable => $searcher, + query => "blind", + field => 'content', ); $hits = $searcher->search( query => 'blind' ); $hit = $hits->fetch_hit; -like( $hl->create_excerpt($hit), qr/\*wise\*/, - "override both Encode() and Highlight()" ); +like( $hl->create_excerpt($hit), + qr/\*wise\*/, "override both Encode() and Highlight()" ); sub make_cb { return KinoSearch::Util::CharBuf->new(shift); } +sub make_heat_map { + return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) ); +} + +sub make_span { + return KinoSearch::Search::Span->new( + offset => $_[0], + length => $_[1], + weight => $_[2], + ); +} + sub make_spans { - my $spans = KinoSearch::Util::VArray->new( capacity => @_ / 2 ); + my $spans = KinoSearch::Util::VArray->new( capacity => scalar @_ ); for my $span_spec (@_) { - my $span = KinoSearch::Search::Span->new( - offset => $span_spec->[0], - length => $span_spec->[1], - weight => 1, - ); - $spans->push($span); + $spans->push( make_span( @{$span_spec}[ 0 .. 
2 ] ) ); } return $spans; } -sub make_heat_map { - return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) ); +sub spans_to_arg_array { + my $spans = shift; + my @out; + for (@$spans) { + push @out, [ $_->get_offset, $_->get_length, $_->get_weight ]; + } + return \@out; } -sub make_int_map { - return KinoSearch::Util::IntMap->new( ints => [@_] ); -} _______________________________________________ kinosearch-commits mailing list kinosearch-commits [at] rectangular http://www.rectangular.com/mailman/listinfo/kinosearch-commits
|