Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: kinosearch: commits
r3600 - in trunk: c_src/KinoSearch/Highlight perl/lib/KinoSearch/Highlight perl/t
 

Index | Next | Previous | View Flat


marvin at rectangular

Jul 15, 2008, 5:33 PM


Views: 138
Permalink
r3600 - in trunk: c_src/KinoSearch/Highlight perl/lib/KinoSearch/Highlight perl/t

Author: creamyg
Date: 2008-07-15 17:33:14 -0700 (Tue, 15 Jul 2008)
New Revision: 3600

Modified:
trunk/c_src/KinoSearch/Highlight/HeatMap.bp
trunk/c_src/KinoSearch/Highlight/HeatMap.c
trunk/c_src/KinoSearch/Highlight/Highlighter.bp
trunk/c_src/KinoSearch/Highlight/Highlighter.c
trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm
trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm
trunk/perl/t/303-highlighter.t
Log:
Port Highlighter::create_excerpt to C, breaking it up into three main
subroutines and changing the algorithm slightly to emphasize sentence
boundaries when defining where snippets begin and end. (Note: the new
algorithm is still a little buggy.)


Modified: trunk/c_src/KinoSearch/Highlight/HeatMap.bp
===================================================================
--- trunk/c_src/KinoSearch/Highlight/HeatMap.bp 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/c_src/KinoSearch/Highlight/HeatMap.bp 2008-07-16 00:33:14 UTC (rev 3600)
@@ -8,6 +8,11 @@
static incremented HeatMap*
new(VArray *spans, u32_t window = 133);

+ /**
+ * @param spans An array of HighlightSpans, which need not be sorted.
+ * @param window The greatest distance between which heat points may
+ * reinforce each other.
+ */
static HeatMap*
init(HeatMap *self, VArray *spans, u32_t window = 133);

@@ -51,6 +56,9 @@
incremented VArray*
Generate_Proximity_Boosts(HeatMap *self, VArray *spans);

+ u32_t
+ Hottest(HeatMap *self);
+
void
Destroy(HeatMap *self);
}

Modified: trunk/c_src/KinoSearch/Highlight/HeatMap.c
===================================================================
--- trunk/c_src/KinoSearch/Highlight/HeatMap.c 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/c_src/KinoSearch/Highlight/HeatMap.c 2008-07-16 00:33:14 UTC (rev 3600)
@@ -2,6 +2,7 @@

#include "KinoSearch/Highlight/HeatMap.h"
#include "KinoSearch/Highlight/HighlightSpan.h"
+#include "KinoSearch/Util/Native.h"

HeatMap*
HeatMap_new(VArray *spans, u32_t window)
@@ -13,21 +14,29 @@
HeatMap*
HeatMap_init(HeatMap *self, VArray *spans, u32_t window)
{
+ VArray *spans_copy = VA_Shallow_Copy(spans);
VArray *spans_plus_boosts;

self->spans = NULL;
self->window = window;

- VA_Sort(spans, HeatMap_compare_spans);
- spans_plus_boosts = HeatMap_Generate_Proximity_Boosts(self, spans);
- VA_Push_VArray(spans_plus_boosts, spans);
+ VA_Sort(spans_copy, HeatMap_compare_spans);
+ spans_plus_boosts = HeatMap_Generate_Proximity_Boosts(self, spans_copy);
+ VA_Push_VArray(spans_plus_boosts, spans_copy);
VA_Sort(spans_plus_boosts, HeatMap_compare_spans);
self->spans = HeatMap_Flatten_Spans(self, spans_plus_boosts);
+ REFCOUNT_DEC(spans_copy);
REFCOUNT_DEC(spans_plus_boosts);

return self;
}

+u32_t
+HeatMap_hottest(HeatMap *self)
+{
+ return (u32_t)Native_callback_i(self, "_hottest", 0);
+}
+
int
HeatMap_compare_spans(const void *va, const void *vb)
{

Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.bp
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.bp 2008-07-16 00:33:14 UTC (rev 3600)
@@ -6,6 +6,8 @@
Query *query;
CharBuf *field;
u32_t excerpt_length;
+ u32_t window_width;
+ u32_t slop;
CharBuf *pre_tag;
CharBuf *post_tag;
Compiler *compiler;
@@ -18,6 +20,9 @@
init(Highlighter *self, Searchable *searchable, Query *query,
const CharBuf *field, u32_t excerpt_length = 200);

+ CharBuf*
+ Create_Excerpt(Highlighter *self, HitDoc *hit_doc);
+
incremented CharBuf*
Encode(Highlighter *self, CharBuf *text);

@@ -50,8 +55,38 @@

CharBuf*
Get_Post_Tag(Highlighter *self);
-
+
+ /* Decide based on heat map the best fragment of field to concentrate on.
+ * Place the result into [fragment] and return its offset in code points
+ * from the top of the field.
+ *
+ * (Helper function for Create_Excerpt only exposed for testing purposes.)
+ */
+ u32_t
+ Find_Best_Fragment(Highlighter *self, const CharBuf *field_val,
+ ViewCharBuf *fragment, HeatMap *heat_map);
+
+ /* Take the fragment and determine the best edges for it based on sentence
+ * boundaries when possible. Add ellipses when boundaries cannot be
+ * found.
+ *
+ * (Helper function for Create_Excerpt only exposed for testing purposes.)
+ */
+ u32_t
+ Raw_Excerpt(Highlighter *self, const CharBuf *field_val,
+ const CharBuf *fragment, CharBuf *raw_excerpt, u32_t top,
+ IntMap *edges);
+
+ /* Take the text in raw_excerpt, add highlight tags, encode, and place the
+ * result into [highlighted].
+ *
+ * (Helper function for Create_Excerpt only exposed for testing purposes.)
+ */
void
+ Highlight_Excerpt(Highlighter *self, VArray *spans,
+ CharBuf *raw_excerpt, CharBuf *highlighted, u32_t top);
+
+ void
Destroy(Highlighter *self);
}


Modified: trunk/c_src/KinoSearch/Highlight/Highlighter.c
===================================================================
--- trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/c_src/KinoSearch/Highlight/Highlighter.c 2008-07-16 00:33:14 UTC (rev 3600)
@@ -1,10 +1,13 @@
#include "KinoSearch/Util/ToolSet.h"

#include "KinoSearch/Highlight/Highlighter.h"
+#include "KinoSearch/Doc/HitDoc.h"
#include "KinoSearch/Search/Compiler.h"
#include "KinoSearch/Search/Query.h"
#include "KinoSearch/Search/Searchable.h"
+#include "KinoSearch/Highlight/HeatMap.h"
#include "KinoSearch/Highlight/HighlightSpan.h"
+#include "KinoSearch/Index/DocVector.h"
#include "KinoSearch/Util/ByteBuf.h"
#include "KinoSearch/Util/IntMap.h"
#include "KinoSearch/Util/Native.h"
@@ -27,6 +30,8 @@
self->compiler = Query_Make_Compiler(query, searchable,
Query_Get_Boost(query));
self->excerpt_length = excerpt_length;
+ self->slop = excerpt_length / 3;
+ self->window_width = excerpt_length + (self->slop * 2);
self->pre_tag = CB_new_from_trusted_utf8("<strong>", 8);
self->post_tag = CB_new_from_trusted_utf8("</strong>", 9);
return self;
@@ -54,6 +59,319 @@
CharBuf*
Highlighter_get_post_tag(Highlighter *self) { return self->post_tag; }

+CharBuf*
+Highlighter_create_excerpt(Highlighter *self, HitDoc *hit_doc)
+{
+ ZombieCharBuf field_val_zcb = ZCB_BLANK;
+ ZombieCharBuf *field_val = (ZombieCharBuf*)HitDoc_Extract(hit_doc,
+ self->field, (ViewCharBuf*)&field_val_zcb);
+
+ if (!field_val || !OBJ_IS_A(field_val, CHARBUF)) {
+ return NULL;
+ }
+ else if (!field_val->len) {
+ /* Empty string yields empty string. */
+ return CB_new(0);
+ }
+ else {
+ ZombieCharBuf fragment = ZCB_make((CharBuf*)field_val);
+ CharBuf *raw_excerpt = CB_new(self->excerpt_length + 10);
+ CharBuf *highlighted = CB_new(self->excerpt_length * 1.5);
+ DocVector *doc_vec = Searchable_Fetch_Doc_Vec(self->searchable,
+ HitDoc_Get_Doc_Num(hit_doc));
+ VArray *score_spans = Compiler_Highlight_Spans(self->compiler,
+ self->searchable, doc_vec, self->field);
+ HeatMap *heat_map = HeatMap_new(score_spans,
+ self->excerpt_length * 0.6666);
+ u32_t top = Highlighter_Find_Best_Fragment(self, (CharBuf*)field_val,
+ (ViewCharBuf*)&fragment, heat_map);
+ IntMap *edges = Highlighter_Find_Sentence_Boundaries(self,
+ (CharBuf*)field_val, top, self->window_width);
+
+ top = Highlighter_Raw_Excerpt(self, (CharBuf*)field_val,
+ (CharBuf*)&fragment, raw_excerpt, top, edges);
+ VA_Sort(score_spans, HeatMap_compare_spans);
+ Highlighter_highlight_excerpt(self, score_spans, raw_excerpt,
+ highlighted, top);
+
+ REFCOUNT_DEC(doc_vec);
+ REFCOUNT_DEC(score_spans);
+ REFCOUNT_DEC(heat_map);
+ REFCOUNT_DEC(raw_excerpt);
+
+ return highlighted;
+ }
+}
+
+static u32_t
+hottest(HeatMap *heat_map)
+{
+ u32_t i;
+ float max_score = 0.0f;
+ u32_t retval = 0;
+ for (i = heat_map->spans->size; i--; ) {
+ HighlightSpan *span = (HighlightSpan*)VA_Fetch(heat_map->spans, i);
+ if (span->weight >= max_score) {
+ retval = span->start_offset;
+ max_score = span->weight;
+ }
+ }
+ return retval;
+}
+
+u32_t
+Highlighter_find_best_fragment(Highlighter *self, const CharBuf *field_val,
+ ViewCharBuf *fragment, HeatMap *heat_map)
+{
+ /* Window is 1.66 * excerpt_length, with the loc in the middle. */
+ u32_t best_location = hottest(heat_map);
+
+ if (best_location < self->slop) {
+ /* If the beginning of the string falls within the window centered
+ * around the hottest point in the field, start the fragment at the
+ * beginning. */
+ u32_t top;
+ ViewCB_Assign(fragment, (CharBuf*)field_val);
+ top = ViewCB_Trim_Top(fragment);
+ ViewCB_Truncate(fragment, self->window_width);
+ return top;
+ }
+ else {
+ u32_t top = best_location - self->slop;
+ u32_t chars_left;
+ u32_t overrun;
+
+ ViewCB_Assign(fragment, (CharBuf*)field_val);
+ ViewCB_Nip(fragment, top);
+ top += ViewCB_Trim_Top(fragment);
+ chars_left = ViewCB_Truncate(fragment, self->excerpt_length);
+ overrun = self->excerpt_length - chars_left;
+
+ if (!overrun) {
+ /* We've found an acceptable window. */
+ ViewCB_Assign(fragment, (CharBuf*)field_val);
+ ViewCB_Nip(fragment, top);
+ top += ViewCB_Trim_Top(fragment);
+ ViewCB_Truncate(fragment, self->window_width);
+ return top;
+ }
+ else if (overrun > top) {
+ /* The field is very short, so make the whole field the
+ * "fragment". */
+ ViewCB_Assign(fragment, (CharBuf*)field_val);
+ return ViewCB_Trim_Top(fragment);
+ }
+ else {
+ /* The fragment is too close to the end, so slide it back. */
+ top -= overrun;
+ ViewCB_Assign(fragment, (CharBuf*)field_val);
+ ViewCB_Nip(fragment, top);
+ top += ViewCB_Trim_Top(fragment);
+ ViewCB_Truncate(fragment, self->excerpt_length);
+ return top;
+ }
+ }
+}
+
+u32_t
+Highlighter_raw_excerpt(Highlighter *self, const CharBuf *field_val,
+ const CharBuf *fragment, CharBuf *raw_excerpt,
+ u32_t top, IntMap *edges)
+{
+ bool_t found_starting_edge = false;
+ bool_t found_ending_edge = false;
+ u32_t start = top;
+ u32_t end = 0;
+ u32_t this_excerpt_len;
+
+ /* Try to find a starting sentence boundary. */
+ if (edges->size) {
+ u32_t i;
+
+ for (i = 0; i < edges->size; i++) {
+ u32_t candidate = (u32_t)IntMap_Get(edges, i);
+
+ if (candidate < top){
+ continue;
+ }
+ else if (candidate == top) {
+ /* Bingo! The fragment already starts on a boundary. */
+ found_starting_edge = true;
+ start = top;
+ }
+ else {
+ /* Try to start on the first sentence boundary, but only if
+ * there's enough material left after it in the fragment. */
+ ZombieCharBuf temp = ZCB_make(fragment);
+ u32_t chars_left;
+
+ ZCB_Nip(&temp, candidate - top);
+ chars_left = ZCB_Truncate(&temp, self->excerpt_length);
+ if (chars_left >= self->excerpt_length) {
+ start = candidate;
+ found_starting_edge = true;
+ }
+ }
+ break;
+ }
+ }
+
+ /* Try to end on a sentence boundary (but don't try very hard). */
+ if(edges->size) {
+ u32_t i;
+ ZombieCharBuf start_trimmed = ZCB_make(fragment);
+ ZCB_Nip(&start_trimmed, start - top);
+
+ for (i = edges->size; i--; ) {
+ u32_t last_edge = (u32_t)IntMap_Get(edges, i);
+
+ if (last_edge <= start) {
+ /* sanity. */
+ break;
+ }
+ else if (last_edge - start > self->excerpt_length) {
+ continue;
+ }
+ else {
+ ZombieCharBuf temp = ZCB_make((CharBuf*)&start_trimmed);
+ u32_t chars_left = ZCB_Nip(&temp, last_edge - start);
+ if (chars_left > (self->excerpt_length * 0.6666)) {
+ found_ending_edge = true;
+ end = last_edge;
+ }
+ else {
+ ZCB_Trim_Tail(&temp);
+ if (ZCB_Get_Len(&temp) == 0) {
+ /* Short, but ending on a boundary already. */
+ found_ending_edge = true;
+ end = last_edge;
+ }
+ }
+ }
+ }
+ }
+ this_excerpt_len = found_ending_edge
+ ? end - start
+ : self->excerpt_length;
+ if (!this_excerpt_len) return start;
+
+ if (found_starting_edge) {
+ ZombieCharBuf temp = ZCB_make((CharBuf*)field_val);
+ ZCB_Nip(&temp, start);
+ ZCB_Truncate(&temp, this_excerpt_len);
+ CB_Copy(raw_excerpt, (CharBuf*) &temp);
+ }
+ /* If not starting on a sentence boundary, prepend an ellipsis. */
+ else {
+ ZombieCharBuf temp = ZCB_make((CharBuf*)field_val);
+ const size_t ELLIPSIS_LEN = 4; /* Three dots and a space. */
+
+ /* If the excerpt is already shorter than the spec'd length, we might
+ * not need to make room. */
+ this_excerpt_len += ELLIPSIS_LEN;
+
+ /* Move the start back one in case the character right before the
+ * excerpt starts is whitespace. */
+ if (start) {
+ this_excerpt_len += 1;
+ start -= 1;
+ ZCB_Nip(&temp, start);
+ }
+
+ do {
+ u32_t code_point = ZCB_Nip_One(&temp);
+ start++;
+ this_excerpt_len--;
+
+ if (StrHelp_is_whitespace(code_point)) {
+ if (!found_ending_edge) {
+ /* If we still need room, we'll lop it off the end since
+ * we don't know a solid end point yet. */
+ break;
+ }
+ else if (this_excerpt_len <= self->excerpt_length) {
+ break;
+ }
+ }
+ } while (ZCB_Get_Len(&temp));
+
+ ZCB_Truncate(&temp, self->excerpt_length - ELLIPSIS_LEN);
+ CB_Cat_Trusted_Str(raw_excerpt, "... ", ELLIPSIS_LEN);
+ CB_Cat(raw_excerpt, (CharBuf*)&temp);
+ start -= ELLIPSIS_LEN;
+ }
+
+ /* If excerpt doesn't end on a sentence boundary, tack on an ellipsis. */
+ if (!found_ending_edge) {
+ CB_Truncate(raw_excerpt, self->excerpt_length - 2);
+ do {
+ u32_t code_point = CB_Code_Point_From(raw_excerpt, 1);
+ CB_Chop(raw_excerpt, 1);
+ if (StrHelp_is_whitespace(code_point)) {
+ CB_Trim_Tail(raw_excerpt);
+ break;
+ }
+ } while (CB_Get_Len(raw_excerpt));
+ CB_Cat_Trusted_Str(raw_excerpt, "...", 3);
+ }
+
+ return start;
+}
+
+void
+Highlighter_highlight_excerpt(Highlighter *self, VArray *spans,
+ CharBuf *raw_excerpt, CharBuf *highlighted,
+ u32_t top)
+{
+ u32_t i;
+ i32_t last_end = 0;
+ ZombieCharBuf temp = ZCB_make(raw_excerpt);
+
+ for (i = 0; i < spans->size; i++) {
+ HighlightSpan *span = (HighlightSpan*)VA_Fetch(spans, i);
+ if (span->start_offset < top) {
+ continue;
+ }
+ else {
+ i32_t relative_start = span->start_offset - top;
+ i32_t relative_end = span->end_offset - top;
+
+ if (relative_start > last_end) {
+ CharBuf *encoded;
+ i32_t non_highlighted_len = relative_start - last_end;
+ ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp);
+ ZCB_Truncate(&to_cat, non_highlighted_len);
+ encoded = Highlighter_Encode(self, (CharBuf*)&to_cat);
+ CB_Cat(highlighted, (CharBuf*)encoded);
+ ZCB_Nip(&temp, non_highlighted_len);
+ REFCOUNT_DEC(encoded);
+ }
+ if (relative_end > relative_start) {
+ CharBuf *encoded;
+ CharBuf *hl_frag;
+ i32_t highlighted_len = relative_end - relative_start;
+ ZombieCharBuf to_cat = ZCB_make((CharBuf*)&temp);
+ ZCB_Truncate(&to_cat, highlighted_len);
+ encoded = Highlighter_Encode(self, (CharBuf*)&to_cat);
+ hl_frag = Highlighter_Highlight(self, encoded);
+ CB_Cat(highlighted, hl_frag);
+ ZCB_Nip(&temp, highlighted_len);
+ REFCOUNT_DEC(encoded);
+ REFCOUNT_DEC(hl_frag);
+ }
+ last_end = relative_end;
+ }
+ }
+
+ /* Last text, beyond last highlight span. */
+ {
+ CharBuf *encoded = Highlighter_Encode(self, (CharBuf*)&temp);
+ CB_Cat(highlighted, encoded);
+ REFCOUNT_DEC(encoded);
+ }
+ CB_Trim_Tail(highlighted);
+}
+
static INLINE void
add_bound(u32_t pos, ByteBuf *bounds_bb) {
if (bounds_bb->cap - bounds_bb->len < sizeof(u32_t)) {
@@ -68,37 +386,41 @@
u32_t offset, u32_t length)
{
ByteBuf *bounds_bb = BB_new(10 * sizeof(u32_t));
- u32_t pos = offset;
u32_t max = length == 0
? I32_MAX
: offset + length;
- ZombieCharBuf substring = ZCB_BLANK;
+ ZombieCharBuf substring = ZCB_make(text);
+ u32_t pos = ZCB_Trim_Top(&substring);
UNUSED_VAR(self);

- ZCB_Assign(&substring, text);
- if (offset == 0) {
- /* If offset is zero, assume that the first non-whitespace character
- * is the beginning of a sentence. */
- pos += ZCB_Trim_Top(&substring);
+ if (offset <= pos) {
+ /* Assume that first non-whitespace character begins a sentence. */
if (pos < max && ZCB_Get_Len(&substring) > 0) {
add_bound(pos, bounds_bb);
}
}
- else {
- ZCB_Nip(&substring, offset);
- }
+ pos += ZCB_Nip(&substring, offset - pos);

while (pos < max) {
u32_t code_point = ZCB_Code_Point_At(&substring, 0);
if (!code_point) {
- /* End of string, so bail. */
+ /* End of substring. Add a bound if it's also the end of the field,
+ * then bail. */
+ if (substring.ptr == CBEND(text))
+ add_bound(pos, bounds_bb);
+
break;
}
else if (code_point == '.') {
u32_t whitespace_count;
pos += ZCB_Nip(&substring, 1); /* advance past "." */

- if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) {
+ if (pos == max && ZCB_Get_Len(&substring) == 0) {
+ /* Period ending the field string. */
+ add_bound(pos, bounds_bb);
+ break;
+ }
+ else if (0 != (whitespace_count = ZCB_Trim_Top(&substring))) {
/* Advance past whitespace. */
pos += whitespace_count;
if (pos < max && ZCB_Get_Len(&substring) > 0) {

Modified: trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/perl/lib/KinoSearch/Highlight/HeatMap.pm 2008-07-16 00:33:14 UTC (rev 3600)
@@ -27,7 +27,7 @@
return $self;
}

-sub hottest {
+sub _hottest {
return $sorted_loc{ ${+shift} }[-1];
}

@@ -62,6 +62,7 @@
qw( calc_proximity_boost
_generate_proximity_boosts|generate_proximity_boosts
_flatten_spans|flatten_spans
+ hottest
)
],
make_getters => [qw( spans window )],

Modified: trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm
===================================================================
--- trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/perl/lib/KinoSearch/Highlight/Highlighter.pm 2008-07-16 00:33:14 UTC (rev 3600)
@@ -28,136 +28,6 @@
return $self;
}

-sub create_excerpt {
- my ( $self, $hitdoc ) = @_;
- my $excerpt_field = $self->get_field;
- my $excerpt_length = $self->get_excerpt_length;
- my $searchable = $self->get_searchable;
- my $token_re = $token_re{$$self};
-
- # scoring window is 1.66 * excerpt_length, with the loc in the middle
- my $limit = int( $excerpt_length / 3 );
-
- # retrieve the text from the chosen field
- my $text = $hitdoc->{$excerpt_field};
- return unless defined $text;
- my $text_length = length $text;
- my $orig_length = $text_length;
- return '' unless $text_length;
-
- # determine the rough boundaries of the excerpt
- my @posits = $self->get_compiler->highlight_spans(
- searchable => $searchable,
- field => $excerpt_field,
- doc_vec => $searchable->fetch_doc_vec( $hitdoc->get_doc_num ),
- );
- @posits = sort { $a->get_start_offset <=> $b->get_start_offset } @posits;
- my $best_location = KinoSearch::Highlight::HeatMap->new(
- spans => \@posits,
- window => $limit*2
- )->hottest || 0; # undef --> 0
- my $top = $best_location - $limit;
-
- # expand the excerpt if the best location is near the end
- $top
- = $text_length - $excerpt_length < $top
- ? $text_length - $excerpt_length
- : $top;
-
- # if the best starting point is the very beginning, cool...
- if ( $top <= 0 ) {
- $top = 0;
- }
- # ... otherwise ...
- else {
- my $bounds = $self->find_sentence_boundaries(
- text => $text,
- offset => $top,
- length => $limit
- );
- if( @$bounds) {
- $top = $bounds->[0];
- $text = substr $text, $top;
- }
- # no sentence boundary, so we'll need an ellipsis
- else {
- # lop off $top characters
- $text = substr( $text, $top );
-
- # skip past possible partial tokens, prepend an ellipsis
- if ($text =~ s/
- \A
- (
- .{0,$limit}? # don't go outside the window
- $token_re # match possible partial token
- .*? # ... and any junk following that token
- )
- (?=$token_re) # just before the start of a full token...
- /... /xsm # ... insert an ellipsis
- )
- {
- $top += length($1);
- $top -= 4 # three dots and a space
- }
- }
- }
-
- # remove possible partial tokens from the end of the excerpt
- $text = substr( $text, 0, $excerpt_length + 1 );
- if ( length($text) > $excerpt_length ) {
- my $extra_char = chop $text;
- # if the extra char wasn't part of a token, we aren't splitting one
- if ( $extra_char =~ $token_re ) {
- $text =~ s/$token_re$//; # if this is unsuccessful, that's fine
- }
- }
-
- # if the excerpt doesn't end with a full stop, end with an an ellipsis
- if ( $orig_length > length($text) and $text !~ /\.\s*\Z/xsm ) {
- $text =~ s/\W+\Z//xsm;
- while ( length($text) + 4 > $excerpt_length ) {
- my $extra_char = chop $text;
- if ( $extra_char =~ $token_re ) {
- $text =~ s/\W+$token_re\Z//xsm; # if unsuccessful, that's fine
- }
- $text =~ s/\W+\Z//xsm;
- }
- $text .= ' ...';
- }
-
- # remap locations now that we know the starting and ending bytes
- $text_length = length($text);
- my @relative_starts = map { $_->get_start_offset - $top } @posits;
- my @relative_ends = map { $_->get_end_offset - $top } @posits;
-
- # get rid of pairs with at least one member outside the text
- while ( @relative_starts and $relative_starts[0] < 0 ) {
- shift @relative_starts;
- shift @relative_ends;
- }
- while ( @relative_ends and $relative_ends[-1] > $text_length ) {
- pop @relative_starts;
- pop @relative_ends;
- }
-
- # insert highlight tags
- my $output_text = '';
- my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
- while (@relative_starts) {
- $end = shift @relative_ends;
- $start = shift @relative_starts;
- my $not_highlighted = substr( $text, $last_end, $start - $last_end );
- $output_text .= $self->encode($not_highlighted);
- my $highlighted = substr( $text, $start, $end - $start );
- $output_text .= $self->highlight( $self->encode($highlighted) );
- $last_end = $end;
- }
- my $last_text = substr( $text, $last_end );
- $output_text .= $self->encode($last_text);
-
- return $output_text;
-}
-
sub do_encode { return encode_entities( $_[1] ) }

sub find_sentence_boundaries {
@@ -175,6 +45,10 @@
bind_methods => [
qw( highlight
encode
+ create_excerpt
+ _find_best_fragment|find_best_fragment
+ _raw_excerpt|raw_excerpt
+ _highlight_excerpt|highlight_excerpt
set_pre_tag
get_pre_tag
set_post_tag

Modified: trunk/perl/t/303-highlighter.t
===================================================================
--- trunk/perl/t/303-highlighter.t 2008-07-16 00:29:31 UTC (rev 3599)
+++ trunk/perl/t/303-highlighter.t 2008-07-16 00:33:14 UTC (rev 3600)
@@ -19,7 +19,7 @@

package main;

-use Test::More tests => 15;
+use Test::More tests => 33;

binmode( STDOUT, ":utf8" );

@@ -30,7 +30,7 @@
use KinoSearch::Store::RAMFolder;

my $phi = "\x{03a6}";
-my $encoded_phi = "&phi;";
+my $encoded_phi = "&Phi;";

my $string = '1 2 3 4 5 ' x 20; # 200 characters
$string .= "$phi a b c d x y z h i j k ";
@@ -52,33 +52,174 @@

my $searcher = KinoSearch::Searcher->new( invindex => $invindex, );

-my $q = qq|"x y z" AND $phi|;
+my $q = qq|"x y z" AND $phi|;
my $hits = $searcher->search( query => $q );
-my $hit = $hits->fetch_hit;
-my $hl = KinoSearch::Highlight::Highlighter->new(
+my $hl = KinoSearch::Highlight::Highlighter->new(
+ searchable => $searcher,
+ query => $q,
+ field => 'content',
+ excerpt_length => 3,
+);
+
+my $target = KinoSearch::Util::ViewCharBuf->_new("");
+
+my $field_val = make_cb("a $phi $phi b c");
+my $top = $hl->_find_best_fragment(
+ fragment => $target,
+ field_val => $field_val,
+ heat_map => make_heat_map( [ 2, 3 ] ),
+);
+is( $target->to_perl, "$phi $phi b", "Find_Best_Fragment" );
+is( $top, 2, "correct offset returned by Find_Best_Fragment" );
+
+$field_val = make_cb("aa$phi");
+$top = $hl->_find_best_fragment(
+ fragment => $target,
+ field_val => $field_val,
+ heat_map => make_heat_map( [ 2, 3 ] ),
+);
+is( $target->to_perl, $field_val->to_perl,
+ "Find_Best_Fragment returns whole field when field is short" );
+is( $top, 0, "correct offset" );
+
+$field_val = make_cb("aaaab$phi$phi");
+$top = $hl->_find_best_fragment(
+ fragment => $target,
+ field_val => $field_val,
+ heat_map => make_heat_map( [ 6, 8 ] ),
+);
+is( $target->to_perl, "b$phi$phi",
+ "Find_Best_Fragment shifts left to deal with overrun" );
+is( $top, 4, "correct offset" );
+
+$field_val = make_cb( "a$phi" . "bcde" );
+$top = $hl->_find_best_fragment(
+ fragment => $target,
+ field_val => $field_val,
+ heat_map => make_heat_map( [ 0, 2 ] ),
+);
+is( $target->to_perl,
+ "a$phi" . "bcd",
+ "Find_Best_Fragment start at field beginning"
+);
+is( $top, 0, "correct offset" );
+undef $target;
+
+$hl = KinoSearch::Highlight::Highlighter->new(
+ searchable => $searcher,
+ query => $q,
+ field => 'content',
+ excerpt_length => 6,
+);
+
+$target = make_cb("");
+$top = $hl->_raw_excerpt(
+ field_val => "Ook. Urk. Ick. ",
+ fragment => "Ook. Urk.",
+ raw_excerpt => $target,
+ top => 0,
+ edges => make_int_map( 0, 6 ),
+);
+is( $target->to_perl, "Ook. ", "Raw_Excerpt at top" );
+is( $top, 0, "top still 0" );
+
+$target = make_cb("");
+$top = $hl->_raw_excerpt(
+ field_val => "Ook. Urk. Ick. ",
+ fragment => ". Urk. I",
+ raw_excerpt => $target,
+ top => 3,
+ edges => make_int_map( 6, 12 ),
+);
+is( $target->to_perl, "Urk. ", "Raw_Excerpt in middle, with 2 bounds" );
+is( $top, 6, "top in the middle modified by Raw_Excerpt" );
+
+$target = make_cb("");
+$top = $hl->_raw_excerpt(
+ field_val => "Ook urk ick iz",
+ fragment => "ick iz",
+ raw_excerpt => $target,
+ top => 8,
+ edges => make_int_map(14),
+);
+is( $target->to_perl, "... iz", "Ellipsis at top" );
+is( $top, 8, "top correct when leading ellipsis inserted" );
+
+$target = make_cb("");
+$top = $hl->_raw_excerpt(
+ field_val => "Urk. Iz no good.",
+ fragment => " Iz no go",
+ raw_excerpt => $target,
+ top => 4,
+ edges => make_int_map(6),
+);
+is( $target->to_perl, "Iz...", "Ellipsis at end" );
+is( $top, 6, "top trimmed" );
+
+$hl = KinoSearch::Highlight::Highlighter->new(
+ searchable => $searcher,
+ query => $q,
+ field => 'content',
+ excerpt_length => 3,
+);
+
+$target = make_cb("");
+$hl->_highlight_excerpt(
+ raw_excerpt => 'a b c',
+ spans => make_spans( [ 2, 3 ] ),
+ top => 0,
+ highlighted => $target,
+);
+is( $target->to_perl, "a <strong>b</strong> c", "basic Highlight_Excerpt" );
+
+$target = make_cb("");
+$hl->_highlight_excerpt(
+ raw_excerpt => "$phi $phi $phi",
+ spans => make_spans( [ 2, 3 ] ),
+ top => 0,
+ highlighted => $target,
+);
+like(
+ $target->to_perl,
+ qr#$encoded_phi <strong>$encoded_phi</strong> $encoded_phi#i,
+ "encode invoked by Highlight_Excerpt"
+);
+
+$target = make_cb("");
+$hl->_highlight_excerpt(
+ raw_excerpt => "$phi $phi $phi",
+ spans => make_spans( [ 3, 4 ] ),
+ top => 1,
+ highlighted => $target,
+);
+like(
+ $target->to_perl,
+ qr#^$encoded_phi <strong>$encoded_phi</strong> $encoded_phi$#i,
+ "Highlight_Excerpt pays attention to offset"
+);
+
+$hl = KinoSearch::Highlight::Highlighter->new(
searchable => $searcher,
query => $q,
field => 'content',
);
-my $excerpt = $hl->create_excerpt( $hit );
-like( $excerpt,
- qr/$encoded_phi.*?z/i, "excerpt contains all relevant terms" );
+
+my $hit = $hits->fetch_hit;
+my $excerpt = $hl->create_excerpt($hit);
+like( $excerpt, qr/$encoded_phi.*?z/i,
+ "excerpt contains all relevant terms" );
+like( $excerpt, qr#<strong>x y z</strong>#, "highlighter tagged the phrase" );
like(
$excerpt,
- qr#<strong>x y z</strong>#,
- "highlighter tagged the phrase"
-);
-like(
- $excerpt,
qr#<strong>$encoded_phi</strong>#i,
"highlighter tagged the single term"
);

-$hl->set_pre_tag("\e[.1m"); $hl->set_post_tag("\e[.0m");
+$hl->set_pre_tag("\e[.1m");
+$hl->set_post_tag("\e[.0m");
like(
- $hl->create_excerpt( $hit ),
- qr#\e\[.1m$encoded_phi\e\[.0m#i,
- "set_pre_tag and set_post_tag",
+ $hl->create_excerpt($hit),
+ qr#\e\[.1m$encoded_phi\e\[.0m#i, "set_pre_tag and set_post_tag",
);

like( $hl->create_excerpt( $hits->fetch_hit() ),
@@ -91,8 +232,9 @@
query => $q,
field => 'content',
);
-like( $hl->create_excerpt( $hits->fetch_hit() ),
- qr/x y z/,
+$excerpt = $hl->create_excerpt( $hits->fetch_hit() );
+$excerpt =~ s#</?strong>##g;
+like( $excerpt, qr/x y z/,
"query with same word in both phrase and term doesn't cause freakout" );

$hits = $searcher->search( query => $q = 'blind' );
@@ -101,8 +243,10 @@
searchable => $searcher,
query => $q,
field => 'content',
- )->create_excerpt( $hits->fetch_hit() ),
- qr/quot/, "HTML entity encoded properly" );
+ )->create_excerpt( $hits->fetch_hit() ),
+ qr/quot/,
+ "HTML entity encoded properly"
+);

$hits = $searcher->search( query => $q = 'why' );
unlike(
@@ -110,8 +254,10 @@
searchable => $searcher,
query => $q,
field => 'content',
- )->create_excerpt( $hits->fetch_hit() ),
- qr/\.\.\./, "no ellipsis for short excerpt" );
+ )->create_excerpt( $hits->fetch_hit() ),
+ qr/\.\.\./,
+ "no ellipsis for short excerpt"
+);

my $term_query = KinoSearch::Search::TermQuery->new(
field => 'content',
@@ -124,15 +270,18 @@
searchable => $searcher,
query => $term_query,
field => 'content',
- )->create_excerpt( $hit ),
- qr/strong/, "specify field highlights correct field..." );
+ )->create_excerpt($hit),
+ qr/strong/,
+ "specify field highlights correct field..."
+);
unlike(
KinoSearch::Highlight::Highlighter->new(
searchable => $searcher,
query => $term_query,
field => 'alt',
- )->create_excerpt( $hit ),
- qr/strong/, "... but not another field"
+ )->create_excerpt($hit),
+ qr/strong/,
+ "... but not another field"
);

my $sentences = 'This is a sentence. ' x 15;
@@ -150,7 +299,8 @@
[ 120, 140 ],
'find_sentence_boundaries in list context with explicit args'
);
-is_deeply( $hl->find_sentence_boundaries(
+is_deeply(
+ $hl->find_sentence_boundaries(
text => $sentences,
offset => 101,
length => 4,
@@ -160,11 +310,38 @@
);
is_deeply(
$hl->find_sentence_boundaries( text => $sentences ),
- [. 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280 ],
- 'fsb in list context with default offset and length'
+ [. 0, 20, 40, 60, 80, 100, 120, 140,
+ 160, 180, 200, 220, 240, 260, 280, 300
+ ],
+ 'fsb with default offset and length'
);
-is_deeply( $hl->find_sentence_boundaries( text => ' Foo' ), [1],
- "Skip leading whitespace but get first sentence");
-is_deeply( $hl->find_sentence_boundaries( text => 'Foo. Foo. ' ), [0, 5],
- "No start at end even after period and whitespace" );
+is_deeply(
+ $hl->find_sentence_boundaries( text => ' Foo' ),
+ [ 1, 4 ],
+ "Skip leading whitespace but get first sentence"
+);

+sub make_cb {
+ return KinoSearch::Util::CharBuf->new(shift);
+}
+
+sub make_spans {
+ my $spans = KinoSearch::Util::VArray->new( capacity => @_ / 2 );
+ for my $span_spec (@_) {
+ my $hl_span = KinoSearch::Highlight::HighlightSpan->new(
+ start_offset => $span_spec->[0],
+ end_offset => $span_spec->[1],
+ weight => 1,
+ );
+ $spans->push($hl_span);
+ }
+ return $spans;
+}
+
+sub make_heat_map {
+ return KinoSearch::Highlight::HeatMap->new( spans => make_spans(@_) );
+}
+
+sub make_int_map {
+ return KinoSearch::Util::IntMap->new( ints => [@_] );
+}


_______________________________________________
kinosearch-commits mailing list
kinosearch-commits [at] rectangular
http://www.rectangular.com/mailman/listinfo/kinosearch-commits

Subject User Time
r3600 - in trunk: c_src/KinoSearch/Highlight perl/lib/KinoSearch/Highlight perl/t marvin at rectangular Jul 15, 2008, 5:33 PM

  Index | Next | Previous | View Flat
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.