
marvin at rectangular
Jul 15, 2008, 5:33 PM
Views: 138
Permalink
|
|
r3600 - in trunk: c_src/KinoSearch/Highlight perl/lib/KinoSearch/Highlight perl/t
|
|
Author: creamyg Date: 2008-07-15 17:33:14 -0700 (Tue, 15 Jul 2008) New Revision: 3600 Modified: trunk/c_src/KinoSearch/Highlight/HeatMap.bp trunk/c_src/KinoSearch/Highlight/HeatMap.c trunk/c_src/KinoSearch/Highlight/Highlighter.bp trunk/c_src/KinoSearch/Highlight/Highlighter.c trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm trunk/perl/t/303-highlighter.t Log: Port Highlighter::create_excerpt to C, breaking it up into three main subroutines and changing the algorithm slightly to emphasize sentence boundaries when defining where snippets begin and end. (Note: the new algorithm is still a little buggy.) Modified: trunk/c_src/KinoSearch/Highlight/HeatMap.bp =================================================================== --- trunk/c_src/KinoSearch/Highlight/HeatMap.bp 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/c_src/KinoSearch/Highlight/HeatMap.bp 2008-07-16 00:33:14 UTC (rev 3600) @@ -8,6 +8,11 @@ static incremented HeatMap* new(VArray *spans, u32_t window = 133); + /** + * @param spans An array of HighlightSpans, which need not be sorted. + * @param window The greatest distance between which heat points may + * reinforce each other. + */ static HeatMap* init(HeatMap *self, VArray *spans, u32_t window = 133); @@ -51,6 +56,9 @@ incremented VArray* Generate_Proximity_Boosts(HeatMap *self, VArray *spans); + u32_t + Hottest(HeatMap *self); + void Destroy(HeatMap *self); } Modified: trunk/c_src/KinoSearch/Highlight/HeatMap.c =================================================================== --- trunk/c_src/KinoSearch/Highlight/HeatMap.c 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/c_src/KinoSearch/Highlight/HeatMap.c 2008-07-16 00:33:14 UTC (rev 3600) @@ -2,6 +2,7 @@ #include "KinoSearch/Highlight/HeatMap.h" #include "KinoSearch/Highlight/HighlightSpan.h" +#include "KinoSearch/Util/Native.h" HeatMap* HeatMap_new(VArray *spans, u32_t window) @@ -13,21 +14,29 @@ HeatMap* HeatMap_init(HeatMap *self, VArray *spans, u32_t window) { + VArray *spans_copy = VA_Shallow_Copy(spans); VArray *spans_plus_boosts; self->spans = NULL; self->window = window; - VA_Sort(spans, HeatMap_compare_spans); - spans_plus_boosts = HeatMap_Generate_Proximity_Boosts(self, spans); - VA_Push_VArray(spans_plus_boosts, spans); + VA_Sort(spans_copy, HeatMap_compare_spans); + spans_plus_boosts = HeatMap_Generate_Proximity_Boosts(self, spans_copy); + VA_Push_VArray(spans_plus_boosts, spans_copy); VA_Sort(spans_plus_boosts, HeatMap_compare_spans); self->spans = HeatMap_Flatten_Spans(self, spans_plus_boosts); + REFCOUNT_DEC(spans_copy); REFCOUNT_DEC(spans_plus_boosts); return self; } +u32_t +HeatMap_hottest(HeatMap *self) +{ + return (u32_t)Native_callback_i(self, "_hottest", 0); +} + int HeatMap_compare_spans(const void *va, const void *vb) { Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp =================================================================== --- trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-16 00:33:14 UTC (rev 3600) @@ -6,6 +6,8 @@ Query *query; CharBuf *field; u32_t excerpt_length; + u32_t window_width; + u32_t slop; CharBuf *pre_tag; CharBuf *post_tag; Compiler *compiler; @@ -18,6 +20,9 @@ init(Highlighter *self, Searchable *searchable, Query *query, const CharBuf *field, u32_t excerpt_length = 200); + CharBuf* + Create_Excerpt(Highlighter *self, HitDoc *hit_doc); + incremented CharBuf* Encode(Highlighter *self, CharBuf *text); @@ -50,8 +55,38 @@ CharBuf* Get_Post_Tag(Highlighter *self); - + + /* Decide based on heat map the best fragment of field to concentrate on. + * Place the result into [fragment] and return its offset in code points + * from the top of the field. + * + * (Helper function for Create_Excerpt only exposed for testing purposes.) + */ + u32_t + Find_Best_Fragment(Highlighter *self, const CharBuf *field_val, + ViewCharBuf *fragment, HeatMap *heat_map); + + /* Take the fragment and determine the best edges for it based on sentence + * boundaries when possible. Add ellipses when boundaries cannot be + * found. + * + * (Helper function for Create_Excerpt only exposed for testing purposes.) + */ + u32_t + Raw_Excerpt(Highlighter *self, const CharBuf *field_val, + const CharBuf *fragment, CharBuf *raw_excerpt, u32_t top, + IntMap *edges); + + /* Take the text in raw_excerpt, add highlight tags, encode, and place the + * result into [highlighted]. + * + * (Helper function for Create_Excerpt only exposed for testing purposes.) + */ void + Highlight_Excerpt(Highlighter *self, VArray *spans, + CharBuf *raw_excerpt, CharBuf *highlighted, u32_t top); + + void Destroy(Highlighter *self); } Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.c =================================================================== --- trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-16 00:33:14 UTC (rev 3600) @@ -1,10 +1,13 @@ #include "KinoSearch/Util/ToolSet.h" #include "KinoSearch/Highlight/Highlighter.h" +#include "KinoSearch/Doc/HitDoc.h" #include "KinoSearch/Search/Compiler.h" #include "KinoSearch/Search/Query.h" #include "KinoSearch/Search/Searchable.h" +#include "KinoSearch/Highlight/HeatMap.h" #include "KinoSearch/Highlight/HighlightSpan.h" +#include "KinoSearch/Index/DocVector.h" #include "KinoSearch/Util/ByteBuf.h" #include "KinoSearch/Util/IntMap.h" #include "KinoSearch/Util/Native.h" @@ -27,6 +30,8 @@ self->compiler = Query_Make_Compiler(query, searchable, Query_Get_Boost(query)); self->excerpt_length = excerpt_length; + self->slop = excerpt_length / 3; + self->window_width = excerpt_length + (self->slop * 2); self->pre_tag = CB_new_from_trusted_utf8("<strong>", 8); self->post_tag = CB_new_from_trusted_utf8("</strong>", 9); return self; @@ -54,6 +59,319 @@ CharBuf* Highlighter_get_post_tag(Highlighter *self) { return self->post_tag; } +CharBuf* +Highlighter_create_excerpt(Highlighter *self, HitDoc *hit_doc) +{ + ZombieCharBuf field_val_zcb = ZCB_BLANK; + ZombieCharBuf *field_val = (ZombieCharBuf*)HitDoc_Extract(hit_doc, + self->field, (ViewCharBuf*)&field_val_zcb); + + if (!field_val || !OBJ_IS_A(field_val, CHARBUF)) { + return NULL; + } + else if (!field_val->len) { + /* Empty string yields empty string. */ + return CB_new(0); + } + else { + ZombieCharBuf fragment = ZCB_make((CharBuf*)field_val); + CharBuf *raw_excerpt = CB_new(self->excerpt_length + 10); + CharBuf *highlighted = CB_new(self->excerpt_length * 1.5); + DocVector *doc_vec = Searchable_Fetch_Doc_Vec(self->searchable, + HitDoc_Get_Doc_Num(hit_doc)); + VArray *score_spans = Compiler_Highlight_Spans(self->compiler, + self->searchable, doc_vec, self->field); + HeatMap *heat_map = HeatMap_new(score_spans, + self->excerpt_length * 0.6666); + u32_t top = Highlighter_Find_Best_Fragment(self, (CharBuf*)field_val, + (ViewCharBuf*)&fragment, heat_map); + IntMap *edges = Highlighter_Find_Sentence_Boundaries(self, + (CharBuf*)field_val, top, self->window_width); + + top = Highlighter_Raw_Excerpt(self, (CharBuf*)field_val, + (CharBuf*)&fragment, raw_excerpt, top, edges); + VA_Sort(score_spans, HeatMap_compare_spans); + Highlighter_highlight_excerpt(self, score_spans, raw_excerpt, + highlighted, top); + + REFCOUNT_DEC(doc_vec); + REFCOUNT_DEC(score_spans); + REFCOUNT_DEC(heat_map); + REFCOUNT_DEC(raw_excerpt); + + return highlighted; + } +} + +static u32_t +hottest(HeatMap *heat_map) +{ + u32_t i; + float max_score = 0.0f; + u32_t retval = 0; + for (i = heat_map->spans->size; i--; ) { + HighlightSpan *span = (HighlightSpan*)VA_Fetch(heat_map->spans, i); + if (span->weight >= max_score) { + retval = span->start_offset; + max_score = span->weight; + } + } + return retval; +} + +u32_t +Highlighter_find_best_fragment(Highlighter *self, const CharBuf *field_val, + ViewCharBuf *fragment, HeatMap *heat_map) +{ + /* Window is 1.66 * excerpt_length, with the loc in the middle. */ + u32_t best_location = hottest(heat_map); + + if (best_location < self->slop) { + /* If the beginning of the string falls within the window centered + * around the hottest point in the field, start the fragment at the + * beginning. */ + u32_t top; + ViewCB_Assign(fragment, (CharBuf*)field_val); + top = ViewCB_Trim_Top(fragment); + ViewCB_Truncate(fragment, self->window_width); + return top; + } + else { + u32_t top = best_location - self->slop; + u32_t chars_left; + u32_t overrun; + + ViewCB_Assign(fragment, (CharBuf*)field_val); + ViewCB_Nip(fragment, top); + top += ViewCB_Trim_Top(fragment); + chars_left = ViewCB_Truncate(fragment, self->excerpt_length); + overrun = self->excerpt_length - chars_left; + + if (!overrun) { + /* We've found an acceptable window. */ + ViewCB_Assign(fragment, (CharBuf*)field_val); + ViewCB_Nip(fragment, top); + top += ViewCB_Trim_Top(fragment); + ViewCB_Truncate(fragment, self->window_width); + return top; + } + else if (overrun > top) { + /* The field is very short, so make the whole field the + * "fragment". */ + ViewCB_Assign(fragment, (CharBuf*)field_val); + return ViewCB_Trim_Top(fragment); + } + else { + /* The fragment is too close to the end, so slide it back. */ + top -= overrun; + ViewCB_Assign(fragment, (CharBuf*)field_val); + ViewCB_Nip(fragment, top); + top += ViewCB_Trim_Top(fragment); + ViewCB_Truncate(fragment, self->excerpt_length); + return top; + } + } +} + +u32_t +Highlighter_raw_excerpt(Highlighter *self, const CharBuf *field_val, + const CharBuf *fragment, CharBuf *raw_excerpt, + u32_t top, IntMap *edges) +{ + bool_t found_starting_edge = false; + bool_t found_ending_edge = false; + u32_t start = top; + u32_t end = 0; + u32_t this_excerpt_len; + + /* Try to find a starting sentence boundary. */ + if (edges->size) { + u32_t i; + + for (i = 0; i < edges->size; i++) { + u32_t candidate = (u32_t)IntMap_Get(edges, i); + + if (candidate < top){ + continue; + } + else if (candidate == top) { + /* Bingo! The fragment already starts on a boundary. */ + found_starting_edge = true; + start = top; + } + else { + /* Try to start on the first sentence boundary, but only if + * there's enough material left after it in the fragment. */ + ZombieCharBuf temp = ZCB_make(fragment); + u32_t chars_left; + + ZCB_Nip(&temp, candidate - top); + chars_left = ZCB_Truncate(&temp, self->excerpt_length); + if (chars_left >= self->excerpt_length) { + start = candidate; + found_starting_edge = true; + } + } + break; + } + } + + /* Try to end on a sentence boundary (but don't try very hard). */ + if(edges->size) { + u32_t i; + ZombieCharBuf start_trimmed = ZCB_make(fragment); + ZCB_Nip(&start_trimmed, start - top); + + for (i = edges->size; i--; ) { + u32_t last_edge = (u32_t)IntMap_Get(edges, i); + + if (last_edge <= start) { + /* sanity. */ + break; + } + else if (last_edge - start > self->excerpt_length) { + continue; + } + else { + ZombieCharBuf temp = ZCB_make((CharBuf*)&start_trimmed); + u32_t chars_left = ZCB_Nip(&temp, last_edge - start); + if (chars_left > (self->excerpt_length * 0.6666)) { + found_ending_edge = true; + end = last_edge; + } + else { + ZCB_Trim_Tail(&temp); + if (ZCB_Get_Len(&temp) == 0) { + /* Short, but ending on a boundary already. */ + found_ending_edge = true; + end = last_edge; + } + } + } + } + } + this_excerpt_len = found_ending_edge + ? end - start + : self->excerpt_length; + if (!this_excerpt_len) return start; + + if (found_starting_edge) { + ZombieCharBuf temp = ZCB_make((CharBuf*)field_val); + ZCB_Nip(&temp, start); + ZCB_Truncate(&temp, this_excerpt_len); + CB_Copy(raw_excerpt, (CharBuf*) &temp); + } + /* If not starting on a sentence boundary, prepend an ellipsis. */ + else { + ZombieCharBuf temp = ZCB_make((CharBuf*)field_val); + const size_t ELLIPSIS_LEN = 4; /* Three dots and a space. */ + + /* If the excerpt is already shorter than the spec'd length, we might + * not need to make room. */ + this_excerpt_len += ELLIPSIS_LEN; + + /* Move the start back one in case the character right before the + * excerpt starts is whitespace. */ + if (start) { + this_excerpt_len += 1; + start -= 1; + ZCB_Nip(&temp, start); + } + + do { + u32_t code_point = ZCB_Nip_One(&temp); + start++; + this_excerpt_len--; + + if (StrHelp_is_whitespace(code_point)) { + if (!found_ending_edge) { + /* If we still need room, we'll lop it off the end since + * we don't know a solid end point yet. */ + break; + } + else if (this_excerpt_len <= self->excerpt_length) { + break; + } + } + } while (ZCB_Get_Len(&temp)); + + ZCB_Truncate(&temp, self->excerpt_length - ELLIPSIS_LEN); + CB_Cat_Trusted_Str(raw_excerpt, "... ", ELLIPSIS_LEN); + CB_Cat(raw_excerpt, (CharBuf*)&temp); + start -= ELLIPSIS_LEN; + } + + /* If excerpt doesn't end on a sentence boundary, tack on an ellipsis. */ + if (!found_ending_edge) { + CB_Truncate(raw_excerpt, self->excerpt_length - 2); + do { + u32_t code_point = CB_Code_Point_From(raw_excerpt, 1); + CB_Chop(raw_excerpt, 1); + if (StrHelp_is_whitespace(code_point)) { + CB_Trim_Tail(raw_excerpt); + break; + } + } while (CB_Get_Len(raw_excerpt)); + CB_Cat_Trusted_Str(raw_excerpt, "...", 3); + } + + return start; +} + +void +Highlighter_highlight_excerpt(Highlighter *self, VArray *spans, + CharBuf *raw_excerpt, CharBuf *highlighted, + u32_t top) +{ + u32_t i; + i32_t last_end = 0; + ZombieCharBuf temp = ZCB_make(raw_excerpt); + + for (i = 0; i < spans->size; i++) { + HighlightSpan *span = (HighlightSpan*)VA_Fetch(spans, i); + if (span->start_offset < top) { + continue; + } + else { + i32_t relative_start = span->start_offset - top; + i32_t relative_end = span->end_offset - top; + + if (relative_start > last_end) { + CharBuf *encoded; + i32_t non_highlighted_len = relative_start - last_end; + ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp); + ZCB_Truncate(&to_cat, non_highlighted_len); + encoded = Highlighter_Encode(self, (CharBuf*)&to_cat); + CB_Cat(highlighted, (CharBuf*)encoded); + ZCB_Nip(&temp, non_highlighted_len); + REFCOUNT_DEC(encoded); + } + if (relative_end > relative_start) { + CharBuf *encoded; + CharBuf *hl_frag; + i32_t highlighted_len = relative_end - relative_start; + ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp); + ZCB_Truncate(&to_cat, highlighted_len); + encoded = Highlighter_Encode(self, (CharBuf*)&to_cat); + hl_frag = Highlighter_Highlight(self, encoded); + CB_Cat(highlighted, hl_frag); + ZCB_Nip(&temp, highlighted_len); + REFCOUNT_DEC(encoded); + REFCOUNT_DEC(hl_frag); + } + last_end = relative_end; + } + } + + /* Last text, beyond last highlight span. */ + { + CharBuf *encoded = Highlighter_Encode(self, (CharBuf*)&temp); + CB_Cat(highlighted, encoded); + REFCOUNT_DEC(encoded); + } + CB_Trim_Tail(highlighted); +} + static INLINE void add_bound(u32_t pos, ByteBuf *bounds_bb) { if (bounds_bb->cap - bounds_bb->len < sizeof(u32_t)) { @@ -68,37 +386,41 @@ u32_t offset, u32_t length) { ByteBuf *bounds_bb = BB_new(10 * sizeof(u32_t)); - u32_t pos = offset; u32_t max = length == 0 ? I32_MAX : offset + length; - ZombieCharBuf substring = ZCB_BLANK; + ZombieCharBuf substring = ZCB_make(text); + u32_t pos = ZCB_Trim_Top(&substring); UNUSED_VAR(self); - ZCB_Assign(&substring, text); - if (offset == 0) { - /* If offset is zero, assume that the first non-whitespace character - * is the beginning of a sentence. */ - pos += ZCB_Trim_Top(&substring); + if (offset <= pos) { + /* Assume that first non-whitespace character begins a sentence. */ if (pos < max && ZCB_Get_Len(&substring) > 0) { add_bound(pos, bounds_bb); } } - else { - ZCB_Nip(&substring, offset); - } + pos += ZCB_Nip(&substring, offset - pos); while (pos < max) { u32_t code_point = ZCB_Code_Point_At(&substring, 0); if (!code_point) { - /* End of string, so bail. */ + /* End of substring. Add a bound if it's also the end of the field, + * then bail. */ + if (substring.ptr == CBEND(text)) + add_bound(pos, bounds_bb); + break; } else if (code_point == '.') { u32_t whitespace_count; pos += ZCB_Nip(&substring, 1); /* advance past "." */ - if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) { + if (pos == max && ZCB_Get_Len(&substring) == 0) { + /* Period ending the field string. */ + add_bound(pos, bounds_bb); + break; + } + else if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) { /* Advance past whitespace. */ pos += whitespace_count; if (pos < max && ZCB_Get_Len(&substring) > 0) { Modified: trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm =================================================================== --- trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm 2008-07-16 00:33:14 UTC (rev 3600) @@ -27,7 +27,7 @@ return $self; } -sub hottest { +sub _hottest { return $sorted_loc{ ${+shift} }[-1]; } @@ -62,6 +62,7 @@ qw( calc_proximity_boost _generate_proximity_boosts|generate_proximity_boosts _flatten_spans|flatten_spans + hottest ) ], make_getters => [qw( spans window )], Modified: trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm =================================================================== --- trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-16 00:33:14 UTC (rev 3600) @@ -28,136 +28,6 @@ return $self; } -sub create_excerpt { - my ( $self, $hitdoc ) = @_; - my $excerpt_field = $self->get_field; - my $excerpt_length = $self->get_excerpt_length; - my $searchable = $self->get_searchable; - my $token_re = $token_re{$$self}; - - # scoring window is 1.66 * excerpt_length, with the loc in the middle - my $limit = int( $excerpt_length / 3 ); - - # retrieve the text from the chosen field - my $text = $hitdoc->{$excerpt_field}; - return unless defined $text; - my $text_length = length $text; - my $orig_length = $text_length; - return '' unless $text_length; - - # determine the rough boundaries of the excerpt - my @posits = $self->get_compiler->highlight_spans( - searchable => $searchable, - field => $excerpt_field, - doc_vec => $searchable->fetch_doc_vec( $hitdoc->get_doc_num ), - ); - @posits = sort { $a->get_start_offset <=> $b->get_start_offset } @posits; - my $best_location = KinoSearch::Highlight::HeatMap->new( - spans => \@posits, - window => $limit*2 - )->hottest || 0; # undef --> 0 - my $top = $best_location - $limit; - - # expand the excerpt if the best location is near the end - $top - = $text_length - $excerpt_length < $top - ? $text_length - $excerpt_length - : $top; - - # if the best starting point is the very beginning, cool... - if ( $top <= 0 ) { - $top = 0; - } - # ... otherwise ... - else { - my $bounds = $self->find_sentence_boundaries( - text => $text, - offset => $top, - length => $limit - ); - if( @$bounds) { - $top = $bounds->[0]; - $text = substr $text, $top; - } - # no sentence boundary, so we'll need an ellipsis - else { - # lop off $top characters - $text = substr( $text, $top ); - - # skip past possible partial tokens, prepend an ellipsis - if ($text =~ s/ - \A - ( - .{0,$limit}? # don't go outside the window - $token_re # match possible partial token - .*? # ... and any junk following that token - ) - (?=$token_re) # just before the start of a full token... - /... /xsm # ... insert an ellipsis - ) - { - $top += length($1); - $top -= 4 # three dots and a space - } - } - } - - # remove possible partial tokens from the end of the excerpt - $text = substr( $text, 0, $excerpt_length + 1 ); - if ( length($text) > $excerpt_length ) { - my $extra_char = chop $text; - # if the extra char wasn't part of a token, we aren't splitting one - if ( $extra_char =~ $token_re ) { - $text =~ s/$token_re$//; # if this is unsuccessful, that's fine - } - } - - # if the excerpt doesn't end with a full stop, end with an an ellipsis - if ( $orig_length > length($text) and $text !~ /\.\s*\Z/xsm ) { - $text =~ s/\W+\Z//xsm; - while ( length($text) + 4 > $excerpt_length ) { - my $extra_char = chop $text; - if ( $extra_char =~ $token_re ) { - $text =~ s/\W+$token_re\Z//xsm; # if unsuccessful, that's fine - } - $text =~ s/\W+\Z//xsm; - } - $text .= ' ...'; - } - - # remap locations now that we know the starting and ending bytes - $text_length = length($text); - my @relative_starts = map { $_->get_start_offset - $top } @posits; - my @relative_ends = map { $_->get_end_offset - $top } @posits; - - # get rid of pairs with at least one member outside the text - while ( @relative_starts and $relative_starts[0] < 0 ) { - shift @relative_starts; - shift @relative_ends; - } - while ( @relative_ends and $relative_ends[-1] > $text_length ) { - pop @relative_starts; - pop @relative_ends; - } - - # insert highlight tags - my $output_text = ''; - my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 ); - while (@relative_starts) { - $end = shift @relative_ends; - $start = shift @relative_starts; - my $not_highlighted = substr( $text, $last_end, $start - $last_end ); - $output_text .= $self->encode($not_highlighted); - my $highlighted = substr( $text, $start, $end - $start ); - $output_text .= $self->highlight( $self->encode($highlighted) ); - $last_end = $end; - } - my $last_text = substr( $text, $last_end ); - $output_text .= $self->encode($last_text); - - return $output_text; -} - sub do_encode { return encode_entities( $_[1] ) } sub find_sentence_boundaries { @@ -175,6 +45,10 @@ bind_methods => [. qw( highlight encode + create_excerpt + _find_best_fragment|find_best_fragment + _raw_excerpt|raw_excerpt + _highlight_excerpt|highlight_excerpt set_pre_tag get_pre_tag set_post_tag Modified: trunk/perl/t/303-highlighter.t =================================================================== --- trunk/perl/t/303-highlighter.t 2008-07-16 00:29:31 UTC (rev 3599) +++ trunk/perl/t/303-highlighter.t 2008-07-16 00:33:14 UTC (rev 3600) @@ -19,7 +19,7 @@ package main; -use Test::More tests => 15; +use Test::More tests => 33; binmode( STDOUT, ":utf8" ); @@ -30,7 +30,7 @@ use KinoSearch::Store::RAMFolder; my $phi = "\x{03a6}"; -my $encoded_phi = "φ"; +my $encoded_phi = "Φ"; my $string = '1 2 3 4 5 ' x 20; # 200 characters $string .= "$phi a b c d x y z h i j k "; @@ -52,33 +52,174 @@ my $searcher = KinoSearch::Searcher->new( invindex => $invindex, ); -my $q = qq|"x y z" AND $phi|; +my $q = qq|"x y z" AND $phi|; my $hits = $searcher->search( query => $q ); -my $hit = $hits->fetch_hit; -my $hl = KinoSearch::Highlight::Highlighter->new( +my $hl = KinoSearch::Highlight::Highlighter->new( + searchable => $searcher, + query => $q, + field => 'content', + excerpt_length => 3, +); + +my $target = KinoSearch::Util::ViewCharBuf->_new(""); + +my $field_val = make_cb("a $phi $phi b c"); +my $top = $hl->_find_best_fragment( + fragment => $target, + field_val => $field_val, + heat_map => make_heat_map( [ 2, 3 ] ), +); +is( $target->to_perl, "$phi $phi b", "Find_Best_Fragment" ); +is( $top, 2, "correct offset returned by Find_Best_Fragment" ); + +$field_val = make_cb("aa$phi"); +$top = $hl->_find_best_fragment( + fragment => $target, + field_val => $field_val, + heat_map => make_heat_map( [ 2, 3 ] ), +); +is( $target->to_perl, $field_val->to_perl, + "Find_Best_Fragment returns whole field when field is short" ); +is( $top, 0, "correct offset" ); + +$field_val = make_cb("aaaab$phi$phi"); +$top = $hl->_find_best_fragment( + fragment => $target, + field_val => $field_val, + heat_map => make_heat_map( [ 6, 8 ] ), +); +is( $target->to_perl, "b$phi$phi", + "Find_Best_Fragment shifts left to deal with overrun" ); +is( $top, 4, "correct offset" ); + +$field_val = make_cb( "a$phi" . "bcde" ); +$top = $hl->_find_best_fragment( + fragment => $target, + field_val => $field_val, + heat_map => make_heat_map( [ 0, 2 ] ), +); +is( $target->to_perl, + "a$phi" . "bcd", + "Find_Best_Fragment start at field beginning" +); +is( $top, 0, "correct offset" ); +undef $target; + +$hl = KinoSearch::Highlight::Highlighter->new( + searchable => $searcher, + query => $q, + field => 'content', + excerpt_length => 6, +); + +$target = make_cb(""); +$top = $hl->_raw_excerpt( + field_val => "Ook. Urk. Ick. ", + fragment => "Ook. Urk.", + raw_excerpt => $target, + top => 0, + edges => make_int_map( 0, 6 ), +); +is( $target->to_perl, "Ook. ", "Raw_Excerpt at top" ); +is( $top, 0, "top still 0" ); + +$target = make_cb(""); +$top = $hl->_raw_excerpt( + field_val => "Ook. Urk. Ick. ", + fragment => ". Urk. I", + raw_excerpt => $target, + top => 3, + edges => make_int_map( 6, 12 ), +); +is( $target->to_perl, "Urk. ", "Raw_Excerpt in middle, with 2 bounds" ); +is( $top, 6, "top in the middle modified by Raw_Excerpt" ); + +$target = make_cb(""); +$top = $hl->_raw_excerpt( + field_val => "Ook urk ick iz", + fragment => "ick iz", + raw_excerpt => $target, + top => 8, + edges => make_int_map(14), +); +is( $target->to_perl, "... iz", "Ellipsis at top" ); +is( $top, 8, "top correct when leading ellipsis inserted" ); + +$target = make_cb(""); +$top = $hl->_raw_excerpt( + field_val => "Urk. Iz no good.", + fragment => " Iz no go", + raw_excerpt => $target, + top => 4, + edges => make_int_map(6), +); +is( $target->to_perl, "Iz...", "Ellipsis at end" ); +is( $top, 6, "top trimmed" ); + +$hl = KinoSearch::Highlight::Highlighter->new( + searchable => $searcher, + query => $q, + field => 'content', + excerpt_length => 3, +); + +$target = make_cb(""); +$hl->_highlight_excerpt( + raw_excerpt => 'a b c', + spans => make_spans( [ 2, 3 ] ), + top => 0, + highlighted => $target, +); +is( $target->to_perl, "a <strong>b</strong> c", "basic Highlight_Excerpt" ); + +$target = make_cb(""); +$hl->_highlight_excerpt( + raw_excerpt => "$phi $phi $phi", + spans => make_spans( [ 2, 3 ] ), + top => 0, + highlighted => $target, +); +like( + $target->to_perl, + qr#$encoded_phi <strong>$encoded_phi</strong> $encoded_phi#i, + "encode invoked by Highlight_Excerpt" +); + +$target = make_cb(""); +$hl->_highlight_excerpt( + raw_excerpt => "$phi $phi $phi", + spans => make_spans( [ 3, 4 ] ), + top => 1, + highlighted => $target, +); +like( + $target->to_perl, + qr#^$encoded_phi <strong>$encoded_phi</strong> $encoded_phi$#i, + "Highlight_Excerpt pays attention to offset" +); + +$hl = KinoSearch::Highlight::Highlighter->new( searchable => $searcher, query => $q, field => 'content', ); -my $excerpt = $hl->create_excerpt( $hit ); -like( $excerpt, - qr/$encoded_phi.*?z/i, "excerpt contains all relevant terms" ); + +my $hit = $hits->fetch_hit; +my $excerpt = $hl->create_excerpt($hit); +like( $excerpt, qr/$encoded_phi.*?z/i, + "excerpt contains all relevant terms" ); +like( $excerpt, qr#<strong>x y z</strong>#, "highlighter tagged the phrase" ); like( $excerpt, - qr#<strong>x y z</strong>#, - "highlighter tagged the phrase" -); -like( - $excerpt, qr#<strong>$encoded_phi</strong>#i, "highlighter tagged the single term" ); -$hl->set_pre_tag("\e[.1m"); $hl->set_post_tag("\e[.0m"); +$hl->set_pre_tag("\e[.1m"); +$hl->set_post_tag("\e[.0m"); like( - $hl->create_excerpt( $hit ), - qr#\e\[.1m$encoded_phi\e\[.0m#i, - "set_pre_tag and set_post_tag", + $hl->create_excerpt($hit), + qr#\e\[.1m$encoded_phi\e\[.0m#i, "set_pre_tag and set_post_tag", ); like( $hl->create_excerpt( $hits->fetch_hit() ), @@ -91,8 +232,9 @@ query => $q, field => 'content', ); -like( $hl->create_excerpt( $hits->fetch_hit() ), - qr/x y z/, +$excerpt = $hl->create_excerpt( $hits->fetch_hit() ); +$excerpt =~ s#</?strong>##g; +like( $excerpt, qr/x y z/, "query with same word in both phrase and term doesn't cause freakout" ); $hits = $searcher->search( query => $q = 'blind' ); @@ -101,8 +243,10 @@ searchable => $searcher, query => $q, field => 'content', - )->create_excerpt( $hits->fetch_hit() ), - qr/quot/, "HTML entity encoded properly" ); + )->create_excerpt( $hits->fetch_hit() ), + qr/quot/, + "HTML entity encoded properly" +); $hits = $searcher->search( query => $q = 'why' ); unlike( @@ -110,8 +254,10 @@ searchable => $searcher, query => $q, field => 'content', - )->create_excerpt( $hits->fetch_hit() ), - qr/\.\.\./, "no ellipsis for short excerpt" ); + )->create_excerpt( $hits->fetch_hit() ), + qr/\.\.\./, + "no ellipsis for short excerpt" +); my $term_query = KinoSearch::Search::TermQuery->new( field => 'content', @@ -124,15 +270,18 @@ searchable => $searcher, query => $term_query, field => 'content', - )->create_excerpt( $hit ), - qr/strong/, "specify field highlights correct field..." ); + )->create_excerpt($hit), + qr/strong/, + "specify field highlights correct field..." +); unlike( KinoSearch::Highlight::Highlighter->new( searchable => $searcher, query => $term_query, field => 'alt', - )->create_excerpt( $hit ), - qr/strong/, "... but not another field" + )->create_excerpt($hit), + qr/strong/, + "... but not another field" ); my $sentences = 'This is a sentence. ' x 15; @@ -150,7 +299,8 @@ [ 120, 140 ], 'find_sentence_boundaries in list context with explicit args' ); -is_deeply( $hl->find_sentence_boundaries( +is_deeply( + $hl->find_sentence_boundaries( text => $sentences, offset => 101, length => 4, @@ -160,11 +310,38 @@ ); is_deeply( $hl->find_sentence_boundaries( text => $sentences ), - [. 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280 ], - 'fsb in list context with default offset and length' + [. 0, 20, 40, 60, 80, 100, 120, 140, + 160, 180, 200, 220, 240, 260, 280, 300 + ], + 'fsb with default offset and length' ); -is_deeply( $hl->find_sentence_boundaries( text => ' Foo' ), [1], - "Skip leading whitespace but get first sentence"); -is_deeply( $hl->find_sentence_boundaries( text => 'Foo. Foo. ' ), [0, 5], - "No start at end even after period and whitespace" ); +is_deeply( + $hl->find_sentence_boundaries( text => ' Foo' ), + [ 1, 4 ], + "Skip leading whitespace but get first sentence" +); +sub make_cb { + return KinoSearch::Util::CharBuf->new(shift); +} + +sub make_spans { + my $spans = KinoSearch::Util::VArray->new( capacity => @_ / 2 ); + for my $span_spec (@_) { + my $hl_span = KinoSearch::Highlight::HighlightSpan->new( + start_offset => $span_spec->[0], + end_offset => $span_spec->[1], + weight => 1, + ); + $spans->push($hl_span); + } + return $spans; +} + +sub make_heat_map { + return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) ); +} + +sub make_int_map { + return KinoSearch::Util::IntMap->new( ints => [@_] ); +} _______________________________________________ kinosearch-commits mailing list kinosearch-commits [at] rectangular http://www.rectangular.com/mailman/listinfo/kinosearch-commits
|