user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/3] improve external Message-ID handling
@ 2015-09-03  1:57 Eric Wong
  2015-09-03  1:57 ` [PATCH 1/3] view: include ghost messages in thread views Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Eric Wong @ 2015-09-03  1:57 UTC (permalink / raw)
  To: meta

Yet another Xapian schema change.  We no longer compress Message-IDs
in Xapian, so looking for cross-posted messages in external archives
should be possible.

Eric Wong (3):
      view: include ghost messages in thread views
      search: disable Message-ID compression in Xapian
      ExtMsg: 300 to external mailing list archives

 lib/PublicInbox/ExtMsg.pm    | 39 +++++++++++++++++++++++++------
 lib/PublicInbox/Feed.pm      | 29 ++++++++---------------
 lib/PublicInbox/Search.pm    |  6 ++---
 lib/PublicInbox/SearchIdx.pm | 37 +++++++++++------------------
 lib/PublicInbox/SearchMsg.pm |  4 ++--
 lib/PublicInbox/View.pm      | 55 ++++++++++++++++++++++++++++++++++++--------
 public-inbox.cgi             |  1 +
 t/cgi.t                      |  6 ++---
 t/search.t                   |  3 +--
 9 files changed, 111 insertions(+), 69 deletions(-)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/3] view: include ghost messages in thread views
  2015-09-03  1:57 [PATCH 0/3] improve external Message-ID handling Eric Wong
@ 2015-09-03  1:57 ` Eric Wong
  2015-09-03  2:04   ` Eric Wong
  2015-09-03  1:57 ` [PATCH 2/3] search: disable Message-ID compression in Xapian Eric Wong
  2015-09-03  1:57 ` [PATCH 3/3] ExtMsg: 300 to external mailing list archives Eric Wong
  2 siblings, 1 reply; 5+ messages in thread
From: Eric Wong @ 2015-09-03  1:57 UTC (permalink / raw)
  To: meta

We'll be expanding our ghost message lookup facilities, so
it makes sense to generate links to them even if they are
currently unknown.
---
 lib/PublicInbox/View.pm | 55 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e18895f..8a02725 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -19,6 +19,7 @@ use constant MAX_INLINE_QUOTED => 12; # half an 80x24 terminal
 use constant MAX_TRUNC_LEN => 72;
 use constant PRE_WRAP => "<pre\nstyle=\"white-space:pre-wrap\">";
 use constant T_ANCHOR => '#u';
+use constant INDENT => '  ';
 
 *ascii_html = *PublicInbox::Hval::ascii_html;
 
@@ -101,7 +102,7 @@ sub index_entry {
 	my $ts = _msg_date($mime);
 	my $rv = "<table\nsummary=l$level><tr>";
 	if ($level) {
-		$rv .= '<td><pre>' . ('  ' x $level) . '</pre></td>';
+		$rv .= '<td><pre>' . (INDENT x $level) . '</pre></td>';
 	}
 	$rv .= "<td\nid=s$midx>" . PRE_WRAP;
 	$rv .= "<b\nid=\"$id\">$subj</b>\n";
@@ -594,22 +595,50 @@ sub __thread_entry {
 	my ($cb, $git, $state, $mime, $level) = @_;
 
 	# lazy load the full message from mini_mime:
-	my $path = mid2path(mid_clean($mime->header('Message-ID')));
-	$mime = eval { Email::MIME->new($git->cat_file("HEAD:$path")) };
-	if ($mime) {
-		if ($state->{anchor_idx} == 0) {
-			thread_html_head($cb, $mime);
+	$mime = eval {
+		my $path = mid2path(mid_clean($mime->header('Message-ID')));
+		Email::MIME->new($git->cat_file('HEAD:'.$path));
+	} or return;
+
+	if ($state->{anchor_idx} == 0) {
+		thread_html_head($cb, $mime, $state);
+	}
+
+	if (my $ghost = delete $state->{ghost}) {
+		# n.b. ghost messages may only be parents, not children
+		foreach my $g (@$ghost) {
+			my $mid = PublicInbox::Hval->new_msgid($g->[0]);
+			my $pfx = INDENT x $g->[1];
+			my $href = $mid->as_href;
+			my $html = $mid->as_html;
+			$$cb->write("<table><tr><td>$pfx</td><td>" .
+					PRE_WRAP .
+					'[parent not found: &lt;' .
+					qq{<a\nhref="../../$href/">}.
+					"$html</a>&gt;]</pre></td></table>");
 		}
-		index_entry($$cb, $mime, $level, $state);
 	}
+	index_entry($$cb, $mime, $level, $state);
+	1;
+}
+
+sub __ghost_entry {
+	my ($state, $node, $level) = @_;
+	my $ghost = $state->{ghost} ||= [];
+	push @$ghost, [ $node->messageid, $level ];
 }
 
 sub thread_entry {
 	my ($cb, $git, $state, $node, $level) = @_;
 	return unless $node;
 	if (my $mime = $node->message) {
-		__thread_entry($cb, $git, $state, $mime, $level);
+		unless (__thread_entry($cb, $git, $state, $mime, $level)) {
+			__ghost_entry($state, $node, $level);
+		}
+	} else {
+		__ghost_entry($state, $node, $level);
 	}
+
 	thread_entry($cb, $git, $state, $node->child, $level + 1);
 	thread_entry($cb, $git, $state, $node->next, $level);
 }
@@ -651,7 +680,7 @@ sub _msg_date {
 
 sub _inline_header {
 	my ($dst, $state, $upfx, $mime, $level) = @_;
-	my $pfx = '  ' x $level;
+	my $pfx = INDENT x $level;
 
 	my $cur = $state->{cur};
 	my $mid = $mime->header('Message-ID');
@@ -705,6 +734,14 @@ sub inline_dump {
 			$state->{parent} = $mid;
 		}
 		_inline_header($dst, $state, $upfx, $mime, $level);
+	} else {
+		my $pfx = INDENT x $level;
+		my $v = PublicInbox::Hval->new_msgid($node->messageid, 1);
+		my $html = $v->as_html;
+		my $href = $v->as_href;
+		$$dst .= $pfx . '` [parent not found: &lt;' .
+				qq{<a\nhref="$upfx../$href/">}.
+				"$html</a>&gt];\n";
 	}
 	inline_dump($dst, $state, $upfx, $node->child, $level+1);
 	inline_dump($dst, $state, $upfx, $node->next, $level);
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/3] search: disable Message-ID compression in Xapian
  2015-09-03  1:57 [PATCH 0/3] improve external Message-ID handling Eric Wong
  2015-09-03  1:57 ` [PATCH 1/3] view: include ghost messages in thread views Eric Wong
@ 2015-09-03  1:57 ` Eric Wong
  2015-09-03  1:57 ` [PATCH 3/3] ExtMsg: 300 to external mailing list archives Eric Wong
  2 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-09-03  1:57 UTC (permalink / raw)
  To: meta

We'll continue to compress long Message-IDs in URLs (which we know
about), but we will store entire Message-IDs in the Xapian database
to make lookups in external databases easier.
---
 lib/PublicInbox/ExtMsg.pm    |  9 ++++-----
 lib/PublicInbox/Search.pm    |  6 +++---
 lib/PublicInbox/SearchIdx.pm | 37 +++++++++++++------------------------
 lib/PublicInbox/SearchMsg.pm |  4 ++--
 t/search.t                   |  3 +--
 5 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 1c0887c..bdbff78 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -12,7 +12,6 @@ sub ext_msg {
 	my $pi_config = $ctx->{pi_config};
 	my $listname = $ctx->{listname};
 	my $mid = $ctx->{mid};
-	my $cmid = mid_compress($mid);
 
 	eval { require PublicInbox::Search };
 	my $have_xap = $@ ? 0 : 1;
@@ -35,13 +34,13 @@ sub ext_msg {
 		if ($have_xap) {
 			my $doc_id = eval {
 				my $s = PublicInbox::Search->new($git_dir);
-				$s->find_unique_doc_id('mid', $cmid);
+				$s->find_unique_doc_id('mid', $mid);
 			};
 			if ($@) {
 				# xapian not configured for this repo
 			} else {
 				# maybe we found it!
-				return r302($url, $cmid) if (defined $doc_id);
+				return r302($url, $mid) if (defined $doc_id);
 
 				# no point in trying the fork fallback if we
 				# know Xapian is up-to-date but missing the
@@ -55,7 +54,7 @@ sub ext_msg {
 	}
 
 	# Xapian not installed or configured for some repos
-	my $path = "HEAD:" . mid2path($cmid);
+	my $path = "HEAD:" . mid2path($mid);
 
 	foreach my $n (@nox) {
 		my @cmd = ('git', "--git-dir=$n->{git_dir}", 'cat-file',
@@ -70,7 +69,7 @@ sub ext_msg {
 			my $type = eval { local $/; <$fh> };
 			close $fh;
 			if ($? == 0 && $type eq "blob\n") {
-				return r302($n->{url}, $cmid);
+				return r302($n->{url}, $mid);
 			}
 		}
 	}
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8b32ef3..eb49f72 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -26,14 +26,15 @@ use constant {
 	# 6 - preserve References: order in document data
 	# 7 - remove references and inreplyto terms
 	# 8 - remove redundant/unneeded document data
-	SCHEMA_VERSION => 8,
+	# 9 - disable Message-ID compression
+	SCHEMA_VERSION => 9,
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
 # setup prefixes
 my %bool_pfx_internal = (
 	type => 'T', # "mail" or "ghost"
-	mid => 'Q', # uniQue id (Message-ID or mid_compress)
+	mid => 'Q', # uniQue id (Message-ID)
 );
 
 my %bool_pfx_external = (
@@ -171,7 +172,6 @@ sub date_range_processor {
 sub lookup_message {
 	my ($self, $mid) = @_;
 	$mid = mid_clean($mid);
-	$mid = mid_compress($mid);
 
 	my $doc_id = $self->find_unique_doc_id('mid', $mid);
 	my $smsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 32e0714..ee85268 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -41,8 +41,7 @@ sub add_message {
 	my $db = $self->{xdb};
 
 	my $doc_id;
-	my $mid_orig = mid_clean($mime->header('Message-ID'));
-	my $mid = mid_compress($mid_orig);
+	my $mid = mid_clean($mime->header('Message-ID'));
 	my $was_ghost = 0;
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 
@@ -139,7 +138,7 @@ sub add_message {
 	};
 
 	if ($@) {
-		warn "failed to index message <$mid_orig>: $@\n";
+		warn "failed to index message <$mid>: $@\n";
 		return undef;
 	}
 	$doc_id;
@@ -147,11 +146,10 @@ sub add_message {
 
 # returns deleted doc_id on success, undef on missing
 sub remove_message {
-	my ($self, $mid_orig) = @_;
+	my ($self, $mid) = @_;
 	my $db = $self->{xdb};
 	my $doc_id;
-	$mid_orig = mid_clean($mid_orig);
-	my $mid = mid_compress($mid_orig);
+	$mid = mid_clean($mid);
 
 	eval {
 		$doc_id = $self->find_unique_doc_id('mid', $mid);
@@ -159,7 +157,7 @@ sub remove_message {
 	};
 
 	if ($@) {
-		warn "failed to remove message <$mid_orig>: $@\n";
+		warn "failed to remove message <$mid>: $@\n";
 		return undef;
 	}
 	$doc_id;
@@ -204,32 +202,24 @@ sub link_message {
 sub link_message_to_parents {
 	my ($self, $smsg) = @_;
 	my $doc = $smsg->{doc};
-	my $mid = mid_compress($smsg->mid);
+	my $mid = $smsg->mid;
 	my $mime = $smsg->mime;
 	my $refs = $mime->header('References');
 	my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
-	my $irt = $mime->header('In-Reply-To');
-	if ($irt) {
-		$irt = mid_compress(mid_clean($irt));
-
-		# maybe some crazies will try to make a circular reference:
-		if ($irt eq $mid) {
-			$irt = undef;
-		} else {
-			# last References should be $irt
-			# we will de-dupe later
-			push @refs, $irt;
-		}
+	if (my $irt = $mime->header('In-Reply-To')) {
+		# last References should be $irt
+		# we will de-dupe later
+		push @refs, mid_clean($irt);
 	}
 
 	my $tid;
 	if (@refs) {
-		my @crefs = map { mid_compress($_) } @refs;
 		my %uniq = ($mid => 1);
+		my @orig_refs = @refs;
+		@refs = ();
 
 		# prevent circular references via References: here:
-		@refs = ();
-		foreach my $ref (@crefs) {
+		foreach my $ref (@orig_refs) {
 			next if $uniq{$ref};
 			$uniq{$ref} = 1;
 			push @refs, $ref;
@@ -342,7 +332,6 @@ sub _resolve_mid_to_tid {
 sub create_ghost {
 	my ($self, $mid, $tid) = @_;
 
-	$mid = mid_compress($mid);
 	$tid = $self->next_thread_id unless defined $tid;
 
 	my $doc = Search::Xapian::Document->new;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 1821b07..3891823 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -9,7 +9,7 @@ use Email::Address qw//;
 use Email::Simple qw//;
 use POSIX qw//;
 use Date::Parse qw/str2time/;
-use PublicInbox::MID qw/mid_clean mid_compress/;
+use PublicInbox::MID qw/mid_clean/;
 use Encode qw/find_encoding/;
 my $enc_utf8 = find_encoding('UTF-8');
 our $PFX2TERM_RE = undef;
@@ -167,7 +167,7 @@ sub _extract_mid {
 	my ($self) = @_;
 
 	my $mid = $self->mime->header('Message-ID');
-	$mid ? mid_compress(mid_clean($mid)) : $mid;
+	defined $mid ? mid_clean($mid) : $mid;
 }
 
 sub mime {
diff --git a/t/search.t b/t/search.t
index 02189ac..4ad8a31 100644
--- a/t/search.t
+++ b/t/search.t
@@ -192,7 +192,6 @@ sub filter_mids {
 	$rw_commit->();
 	$ro->reopen;
 	my $long_mid = 'last' . ('x' x 60). '@s';
-	my $long_midc = Digest::SHA::sha1_hex($long_mid);
 
 	my $long = Email::MIME->create(
 		header_str => [
@@ -232,7 +231,7 @@ sub filter_mids {
 	$ro->reopen;
 	my $t = $ro->get_thread('root@s');
 	is($t->{total}, 4, "got all 4 mesages in thread");
-	my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_midc);
+	my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_mid);
 	@res = filter_mids($t);
 	is_deeply(\@res, \@exp, "get_thread works");
 }
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/3] ExtMsg: 300 to external mailing list archives
  2015-09-03  1:57 [PATCH 0/3] improve external Message-ID handling Eric Wong
  2015-09-03  1:57 ` [PATCH 1/3] view: include ghost messages in thread views Eric Wong
  2015-09-03  1:57 ` [PATCH 2/3] search: disable Message-ID compression in Xapian Eric Wong
@ 2015-09-03  1:57 ` Eric Wong
  2 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-09-03  1:57 UTC (permalink / raw)
  To: meta

Since cross-posting is inevitable, we shall link to external
message archives for interoperability.
---
 lib/PublicInbox/ExtMsg.pm | 30 ++++++++++++++++++++++++++++--
 public-inbox.cgi          |  1 +
 t/cgi.t                   |  6 +++---
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index bdbff78..3e0e6e4 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -7,6 +7,14 @@ use URI::Escape qw(uri_escape_utf8);
 use PublicInbox::Hval;
 use PublicInbox::MID qw/mid_compress mid2path/;
 
+# TODO: user-configurable
+our @EXT_URL = (
+	'http://mid.gmane.org/%s',
+	'https://lists.debian.org/msgid-search/%s',
+	'http://mid.mail-archive.com/%s',
+	'http://marc.info/?i=%s',
+);
+
 sub ext_msg {
 	my ($ctx) = @_;
 	my $pi_config = $ctx->{pi_config};
@@ -74,9 +82,27 @@ sub ext_msg {
 		}
 	}
 
-	# Fall back to external repos
+	my $code = 404;
+	my $h = PublicInbox::Hval->new_msgid($mid, 1);
+	my $href = $h->as_href;
+	my $html = $h->as_html;
+	my $title = "Message-ID &lt;$html&gt; not found";
+
+	# Fall back to external repos if configured
+	my $s = "<html><head><title>$title</title>" .
+		"</head><body><pre><b>$title</b>";
+
+	if (@EXT_URL) {
+		$code = 300;
+		$s .= "\n\nPerhaps try an external site:\n\n";
+		foreach my $u (@EXT_URL) {
+			my $r = sprintf($u, $href);
+			my $t = sprintf($u, $html);
+			$s .= qq{<a\nhref="$r">$t</a>\n};
+		}
+	}
 
-	[404, ['Content-Type'=>'text/plain'], ['Not found']];
+	[300, ['Content-Type'=>'text/html; charset=UTF-8'], [$s]];
 }
 
 # Redirect to another public-inbox which is mapped by $pi_config
diff --git a/public-inbox.cgi b/public-inbox.cgi
index 1fcc04f..b095621 100755
--- a/public-inbox.cgi
+++ b/public-inbox.cgi
@@ -17,6 +17,7 @@ BEGIN {
 
 	%HTTP_CODES = (
 		200 => 'OK',
+		300 => 'Multiple Choices',
 		301 => 'Moved Permanently',
 		302 => 'Found',
 		404 => 'Not Found',
diff --git a/t/cgi.t b/t/cgi.t
index a6600c2..b0af8ae 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -160,19 +160,19 @@ EOF
 	like($res->{body}, qr/Message-Id: <blahblah\@example\.com>/,
 		"mid raw hit");
 	$res = cgi_run("/test/blahblah\@example.con/raw");
-	like($res->{head}, qr/Status: 404 Not Found/, "mid raw miss");
+	like($res->{head}, qr/Status: 300 Multiple Choices/, "mid raw miss");
 
 	$res = cgi_run("/test/blahblah\@example.com/");
 	like($res->{body}, qr/\A<html>/, "mid html hit");
 	like($res->{head}, qr/Status: 200 OK/, "200 response");
 	$res = cgi_run("/test/blahblah\@example.con/");
-	like($res->{head}, qr/Status: 404 Not Found/, "mid html miss");
+	like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss");
 
 	$res = cgi_run("/test/blahblah\@example.com/f/");
 	like($res->{body}, qr/\A<html>/, "mid html");
 	like($res->{head}, qr/Status: 200 OK/, "200 response");
 	$res = cgi_run("/test/blahblah\@example.con/f/");
-	like($res->{head}, qr/Status: 404 Not Found/, "mid html miss");
+	like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss");
 
 	$res = cgi_run("/test/");
 	like($res->{body}, qr/slashy%2Fasdf%40example\.com/,
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/3] view: include ghost messages in thread views
  2015-09-03  1:57 ` [PATCH 1/3] view: include ghost messages in thread views Eric Wong
@ 2015-09-03  2:04   ` Eric Wong
  0 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-09-03  2:04 UTC (permalink / raw)
  To: meta

Eric Wong <e@80x24.org> wrote:
> +				"$html</a>&gt];\n";

Oops, will squash the following:

--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -741,7 +741,7 @@ sub inline_dump {
 		my $href = $v->as_href;
 		$$dst .= $pfx . '` [parent not found: &lt;' .
 				qq{<a\nhref="$upfx../$href/">}.
-				"$html</a>&gt];\n";
+				"$html</a>&gt;]\n";
 	}
 	inline_dump($dst, $state, $upfx, $node->child, $level+1);
 	inline_dump($dst, $state, $upfx, $node->next, $level);

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-09-03  2:04 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-03  1:57 [PATCH 0/3] improve external Message-ID handling Eric Wong
2015-09-03  1:57 ` [PATCH 1/3] view: include ghost messages in thread views Eric Wong
2015-09-03  2:04   ` Eric Wong
2015-09-03  1:57 ` [PATCH 2/3] search: disable Message-ID compression in Xapian Eric Wong
2015-09-03  1:57 ` [PATCH 3/3] ExtMsg: 300 to external mailing list archives Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).