user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] searchidx: fix -Lmedium for IDs and filenames
@ 2021-03-13 15:40 Eric Wong
  2021-03-13 22:43 ` Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2021-03-13 15:40 UTC (permalink / raw)
  To: meta

This fixes "m:", "l:", "dfn:", and "n:" search prefixes under
indexlevel=medium when mixed with indexlevel=full inboxish.
We need positional data for Message-IDs, List-Id, and filenames
for exact matches, though we still want to support wildcards.

Fortunately, the storage cost is still small as these prefixes
tend to be small compared to message bodies.  These are NOT
boolean terms since wildcard support and partial matching are
desired.
---
 I noticed this while I was working on another patch, and
 it took forever to figure out why m: wasn't working for me.

 lib/PublicInbox/SearchIdx.pm | 47 ++++++++++++++++++++++++++++--------
 t/lei_xsearch.t              | 31 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3372bea5..772f5a64 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,6 +22,7 @@ use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn nodatacow_dir);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Address;
 our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
 	index_text term_generator add_val is_bad_blob);
 my $X = \%PublicInbox::Search::X;
@@ -158,22 +159,44 @@ sub term_generator ($) { # write-only
 	}
 }
 
+sub index_phrase ($$$$) {
+	my ($self, $text, $wdf_inc, $prefix) = @_;
+
+	my $tg = term_generator($self);
+	$tg->index_text($text, $wdf_inc, $prefix);
+	$tg->increase_termpos;
+}
+
 sub index_text ($$$$) {
 	my ($self, $text, $wdf_inc, $prefix) = @_;
-	my $tg = term_generator($self); # man Search::Xapian::TermGenerator
 
 	if ($self->{indexlevel} eq 'full') {
-		$tg->index_text($text, $wdf_inc, $prefix);
-		$tg->increase_termpos;
+		index_phrase($self, $text, $wdf_inc, $prefix);
 	} else {
+		my $tg = term_generator($self);
 		$tg->index_text_without_positions($text, $wdf_inc, $prefix);
 	}
 }
 
 sub index_headers ($$) {
 	my ($self, $smsg) = @_;
-	my @x = (from => 'A', # Author
-		subject => 'S', to => 'XTO', cc => 'XCC');
+	my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author
+	while (my ($field, $pfx) = splice(@x, 0, 2)) {
+		my $val = $smsg->{$field};
+		next if $val eq '';
+		# include "(comments)" after the address, too, so not using
+		# PublicInbox::Address::names or pairs
+		index_text($self, $val, 1, $pfx);
+
+		# we need positional info for email addresses since they
+		# can be considered phrases
+		if ($self->{indexlevel} eq 'medium') {
+			for my $addr (PublicInbox::Address::emails($val)) {
+				index_phrase($self, $addr, 1, $pfx);
+			}
+		}
+	}
+	@x = (subject => 'S');
 	while (my ($field, $pfx) = splice(@x, 0, 2)) {
 		my $val = $smsg->{$field};
 		index_text($self, $val, 1, $pfx) if $val ne '';
@@ -186,7 +209,11 @@ sub index_diff_inc ($$$$) {
 		index_text($self, join("\n", @$xnq), 1, 'XNQ');
 		@$xnq = ();
 	}
-	index_text($self, $text, 1, $pfx);
+	if ($pfx eq 'XDFN') {
+		index_phrase($self, $text, 1, $pfx);
+	} else {
+		index_text($self, $text, 1, $pfx);
+	}
 }
 
 sub index_old_diff_fn {
@@ -292,7 +319,7 @@ sub index_xapian { # msg_iter callback
 	my $ct = $part->content_type || 'text/plain';
 	my $fn = $part->filename;
 	if (defined $fn && $fn ne '') {
-		index_text($self, $fn, 1, 'XFN');
+		index_phrase($self, $fn, 1, 'XFN');
 	}
 	if ($part->{is_submsg}) {
 		my $mids = mids_for_index($part);
@@ -330,20 +357,20 @@ sub index_list_id ($$$) {
 		$l =~ /<([^>]+)>/ or next;
 		my $lid = lc $1;
 		$doc->add_boolean_term('G' . $lid);
-		index_text($self, $lid, 1, 'XL'); # probabilistic
+		index_phrase($self, $lid, 1, 'XL'); # probabilistic
 	}
 }
 
 sub index_ids ($$$$) {
 	my ($self, $doc, $hdr, $mids) = @_;
 	for my $mid (@$mids) {
-		index_text($self, $mid, 1, 'XM');
+		index_phrase($self, $mid, 1, 'XM');
 
 		# because too many Message-IDs are prefixed with
 		# "Pine.LNX."...
 		if ($mid =~ /\w{12,}/) {
 			my @long = ($mid =~ /(\w{3,}+)/g);
-			index_text($self, join(' ', @long), 1, 'XM');
+			index_phrase($self, join(' ', @long), 1, 'XM');
 		}
 	}
 	$doc->add_boolean_term('Q' . $_) for @$mids;
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
index f865ff43..5bfbcfe6 100644
--- a/t/lei_xsearch.t
+++ b/t/lei_xsearch.t
@@ -78,4 +78,35 @@ is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back');
 is($lxs->search, $lxs, '->search works');
 is($lxs->over, undef, '->over fails');
 
+{
+	$lxs = PublicInbox::LeiXSearch->new;
+	my $v2ibx = PublicInbox::InboxWritable->new({
+		inboxdir => "$home/v2full",
+		name => 'v2full',
+		version => 2,
+		indexlevel => 'full',
+		-primary_address => 'v2full@example.com',
+	}, {});
+	my $im = $v2ibx->importer(0);
+	$im->add(eml_load('t/plack-qp.eml'));
+	$im->done;
+	my $v1ibx = PublicInbox::InboxWritable->new({
+		inboxdir => "$home/v1medium",
+		name => 'v1medium',
+		version => 1,
+		indexlevel => 'medium',
+		-primary_address => 'v1medium@example.com',
+	}, {});
+	$im = $v1ibx->importer(0);
+	$im->add(eml_load('t/utf8.eml'));
+	$im->done;
+	$lxs->prepare_external($v1ibx);
+	$lxs->prepare_external($v2ibx);
+	for my $loc ($lxs->locals) {
+		$lxs->attach_external($loc);
+	}
+	my $mset = $lxs->mset('m:testmessage@example.com');
+	is($mset->size, 1, 'got m: match on medium+full XSearch mix');
+}
+
 done_testing;

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] searchidx: fix -Lmedium for IDs and filenames
  2021-03-13 15:40 [PATCH] searchidx: fix -Lmedium for IDs and filenames Eric Wong
@ 2021-03-13 22:43 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2021-03-13 22:43 UTC (permalink / raw)
  To: meta

Eric Wong <e@80x24.org> wrote:
>  sub index_headers ($$) {
>  	my ($self, $smsg) = @_;
> -	my @x = (from => 'A', # Author
> -		subject => 'S', to => 'XTO', cc => 'XCC');
> +	my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author
> +	while (my ($field, $pfx) = splice(@x, 0, 2)) {
> +		my $val = $smsg->{$field};
> +		next if $val eq '';
> +		# include "(comments)" after the address, too, so not using
> +		# PublicInbox::Address::names or pairs
> +		index_text($self, $val, 1, $pfx);
> +
> +		# we need positional info for email addresses since they
> +		# can be considered phrases
> +		if ($self->{indexlevel} eq 'medium') {
> +			for my $addr (PublicInbox::Address::emails($val)) {
> +				index_phrase($self, $addr, 1, $pfx);
> +			}
> +		}
> +	}

I forgot to note that email addresses are also handled as phrases
unconditionally.  In any case, pushed as
commit 64b557420689476493d752968d99ab8ae62bad9a

    searchidx: fix -Lmedium for IDs and filenames

    This fixes "m:", "l:", "f:", "t:", "c:", "dfn:", and "n:" search
    prefixes under indexlevel=medium when mixed with indexlevel=full
    inboxish.  We need positional data for Message-IDs, List-Id,
    email addresses and filenames for exact matches, though we still
    want to support wildcards.

    Fortunately, the storage cost is still small as these prefixes
    tend to be small compared to message bodies.  These are NOT
    boolean terms since wildcard support and partial matching are
    desired.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-03-13 22:43 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-13 15:40 [PATCH] searchidx: fix -Lmedium for IDs and filenames Eric Wong
2021-03-13 22:43 ` Eric Wong

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git