user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [RFC 6/7] index: allow search/lookups on X-Alt-Message-ID
  2019-10-24  0:12  7% [PATCH 0/7] redundant header madness Eric Wong
@ 2019-10-24  0:12  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2019-10-24  0:12 UTC (permalink / raw)
  To: meta

Since we replace extra Message-ID headers with X-Alt-Message-ID
to placate NNTP clients, we should allow searching and indexing
on X-Alt-Message-ID just like we do with Message-ID.
---
 lib/PublicInbox/MID.pm       | 27 +++++++++++++++++++++------
 lib/PublicInbox/OverIdx.pm   |  4 ++--
 lib/PublicInbox/SearchIdx.pm |  4 ++--
 t/mid.t                      |  7 ++++++-
 t/v2writable.t               | 16 ++++++++++++++++
 5 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 14089f91..d7a42c38 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -7,7 +7,7 @@ use strict;
 use warnings;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC
-	mids references/;
+	mids references mids_for_index/;
 use URI::Escape qw(uri_escape_utf8);
 use Digest::SHA qw/sha1_hex/;
 require PublicInbox::Address;
@@ -54,11 +54,10 @@ sub mid2path {
 # Only for v1 code paths:
 sub mid_mime ($) { mids($_[0]->header_obj)->[0] }
 
-sub mids ($) {
-	my ($hdr) = @_;
+# only intended for Message-ID and X-Alt-Message-ID
+sub extract_mids {
 	my @mids;
-	my @v = $hdr->header_raw('Message-Id');
-	foreach my $v (@v) {
+	for my $v (@_) {
 		my @cur = ($v =~ /<([^>]+)>/sg);
 		if (@cur) {
 			push(@mids, @cur);
@@ -66,7 +65,23 @@ sub mids ($) {
 			push(@mids, $v);
 		}
 	}
-	uniq_mids(\@mids);
+	\@mids;
+}
+
+sub mids ($) {
+	my ($hdr) = @_;
+	my @mids = $hdr->header_raw('Message-Id');
+	uniq_mids(extract_mids(@mids));
+}
+
+# we allow searching on X-Alt-Message-ID since PublicInbox::NNTP uses them
+# to placate some clients, and we want to ensure NNTP-only clients can
+# import and index without relying on HTTP endpoints
+sub mids_for_index ($) {
+	my ($hdr) = @_;
+	my @mids = $hdr->header_raw('Message-Id');
+	my @alts = $hdr->header_raw('X-Alt-Message-ID');
+	uniq_mids(extract_mids(@mids, @alts));
 }
 
 # last References should be IRT, but some mail clients do things
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 01ca6f11..189bd21d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -13,7 +13,7 @@ use warnings;
 use base qw(PublicInbox::Over);
 use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
-use PublicInbox::MID qw/id_compress mids references/;
+use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::SearchMsg qw(subject_normalized);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
@@ -256,7 +256,7 @@ sub add_overview {
 		lines => $lines,
 		blob => $oid,
 	}, 'PublicInbox::SearchMsg';
-	my $mids = mids($mime->header_obj);
+	my $mids = mids_for_index($mime->header_obj);
 	my $refs = parse_references($smsg, $mid0, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index aed3875a..b2d71a1f 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -12,7 +12,7 @@ use warnings;
 use base qw(PublicInbox::Search PublicInbox::Lock);
 use PublicInbox::MIME;
 use PublicInbox::InboxWritable;
-use PublicInbox::MID qw/mid_clean id_compress mid_mime mids/;
+use PublicInbox::MID qw/mid_clean id_compress mid_mime mids_for_index/;
 use PublicInbox::MsgIter;
 use Carp qw(croak);
 use POSIX qw(strftime);
@@ -344,7 +344,7 @@ sub add_xapian ($$$$$) {
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $mids = mids($mime->header_obj);
+	my $mids = mids_for_index($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
 		$self->_msgmap_init;
diff --git a/t/mid.t b/t/mid.t
index 9ad10a99..98b0c200 100644
--- a/t/mid.t
+++ b/t/mid.t
@@ -1,7 +1,7 @@
 # Copyright (C) 2016-2019 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use Test::More;
-use PublicInbox::MID qw(mid_escape mids references);
+use PublicInbox::MID qw(mid_escape mids references mids_for_index);
 
 is(mid_escape('foo!@(bar)'), 'foo!@(bar)');
 is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
@@ -10,6 +10,7 @@ is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
 {
 	use Email::MIME;
 	my $mime = Email::MIME->create;
+	$mime->header_set('X-Alt-Message-ID', '<alt-id-for-nntp>');
 	$mime->header_set('Message-Id', '<mid-1@a>');
 	is_deeply(['mid-1@a'], mids($mime->header_obj), 'mids in common case');
 	$mime->header_set('Message-Id', '<mid-1@a>', '<mid-2@b>');
@@ -40,6 +41,10 @@ is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
 	$mime->header_set('To', 'u@example.com');
 	$mime->header_set('References', '<hello> <world> <n> <u@example.com>');
 	is_deeply(references($mime->header_obj), [qw(hello world)]);
+
+	is_deeply([qw(helloworld alt-id-for-nntp)],
+		mids_for_index($mime->header_obj),
+		'X-Alt-Message-ID can be indexed');
 }
 
 done_testing();
diff --git a/t/v2writable.t b/t/v2writable.t
index c2daac2f..2b825768 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -115,6 +115,7 @@ if ('ensure git configs are correct') {
 
 {
 	$mime->header_set('Message-Id', '<abcde@1>', '<abcde@2>');
+	$mime->header_set('X-Alt-Message-Id', '<alt-id-for-nntp>');
 	$mime->header_set('References', '<zz-mid@b>');
 	ok($im->add($mime), 'message with multiple Message-ID');
 	$im->done;
@@ -127,6 +128,21 @@ if ('ensure git configs are correct') {
 	is($mset2->size, 1, 'message found by second MID');
 	is((($mset1->items)[0])->get_docid, (($mset2->items)[0])->get_docid,
 		'same document') if ($mset1->size);
+
+	my $alt = $srch->reopen->query('m:alt-id-for-nntp', { mset => 1 });
+	is($alt->size, 1, 'message found by alt MID (NNTP)');
+	is((($alt->items)[0])->get_docid, (($mset1->items)[0])->get_docid,
+		'same document') if ($mset1->size);
+	$mime->header_set('X-Alt-Message-Id');
+
+	my %uniq;
+	for my $mid (qw(abcde@1 abcde@2 alt-id-for-nntp)) {
+		my $msgs = $ibx->over->get_thread($mid);
+		my $key = join(' ', sort(map { $_->{num} } @$msgs));
+		$uniq{$key}++;
+	}
+	is(scalar(keys(%uniq)), 1, 'all alt Message-ID queries give same smsg');
+	is_deeply([values(%uniq)], [3], '3 queries, 3 results');
 }
 
 {

^ permalink raw reply related	[relevance 5%]

* [PATCH 0/7] redundant header madness
@ 2019-10-24  0:12  7% Eric Wong
  2019-10-24  0:12  5% ` [RFC 6/7] index: allow search/lookups on X-Alt-Message-ID Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2019-10-24  0:12 UTC (permalink / raw)
  To: meta

Garbage in, garbage out.  Since we try to never drop messages in
v2, we now better support horrible messages with redundant and
confusing headers in the HTML view and search indices.

I'm a little iffy on adding more support for X-Alt-Message-IDs
because it's a gross hack; but so was injecting new Message-IDs
to support different messages with different content.

And I very much want NNTP clients to be able to build partial
mirrors without relying on expensive git clones.  Those mirrors
should still be able to look up messages by whatever Message-IDs
each message was known by.

Eric Wong (7):
  search: support multiple From/To/Cc/Subject headers
  view: display redundant headers in permalink
  view: move '<' and '>' outside <a>
  view: improve warning for multiple Message-IDs
  linkify: support adding "(raw)" link for Message-IDs

  # next two are RFCs but I'm leaning heavily on supporting them:
  index: allow search/lookups on X-Alt-Message-ID
  view: show X-Alt-Message-ID in permalink view, too

 lib/PublicInbox/Linkify.pm   |  31 ++++++++++
 lib/PublicInbox/MID.pm       |  27 +++++++--
 lib/PublicInbox/OverIdx.pm   |   4 +-
 lib/PublicInbox/SearchIdx.pm |   4 +-
 lib/PublicInbox/SearchMsg.pm |   4 +-
 lib/PublicInbox/View.pm      | 106 ++++++++++++++++++++---------------
 t/mid.t                      |   7 ++-
 t/psgi_v2.t                  |   2 +-
 t/v2reindex.t                |  16 ++++--
 t/v2writable.t               |  16 ++++++
 10 files changed, 153 insertions(+), 64 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2019-10-24  0:12  7% [PATCH 0/7] redundant header madness Eric Wong
2019-10-24  0:12  5% ` [RFC 6/7] index: allow search/lookups on X-Alt-Message-ID Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).