user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 24/34] searchidx: store the primary MID in doc data for NNTP
Date: Tue,  6 Mar 2018 08:42:32 +0000	[thread overview]
Message-ID: <20180306084242.19988-25-e@80x24.org> (raw)
In-Reply-To: <20180306084242.19988-1-e@80x24.org>

We can't rely on header order for Message-ID after all
since we fall back to existing MIDs if they exist and
are unseen.  This lets us use SearchMsg->mid to get the
MID we associated with the NNTP article number to ensure
all NNTP article lookups roundtrip correctly.
---
 lib/PublicInbox/SearchIdx.pm     |  6 ++--
 lib/PublicInbox/SearchIdxPart.pm |  8 ++---
 lib/PublicInbox/SearchMsg.pm     |  9 ++---
 lib/PublicInbox/V2Writable.pm    | 34 +++++++++++--------
 t/search-thr-index.t             |  2 +-
 t/v2writable.t                   | 73 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 107 insertions(+), 25 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index a70e1eb..71469a9 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -302,7 +302,8 @@ sub index_body ($$$) {
 }
 
 sub add_message {
-	my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object
+	# mime = Email::MIME object
+	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
 	my $doc_id;
 	my $mids = mids($mime->header_obj);
 	my $skel = $self->{skeleton};
@@ -370,7 +371,8 @@ sub add_message {
 
 		# populates smsg->references for smsg->to_doc_data
 		my $refs = parse_references($smsg);
-		my $data = $smsg->to_doc_data($blob);
+		$mid0 = $mids->[0] unless defined $mid0;
+		my $data = $smsg->to_doc_data($oid, $mid0);
 		foreach my $mid (@$mids) {
 			$tg->index_text($mid, 1, 'XM');
 		}
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
index 2f577ec..6d8cb2a 100644
--- a/lib/PublicInbox/SearchIdxPart.pm
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -51,7 +51,7 @@ sub partition_worker_loop ($$$) {
 			$xdb = $txn = undef;
 		} else {
 			chomp $line;
-			my ($len, $artnum, $object_id) = split(/ /, $line);
+			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
 			$xdb ||= $self->_xdb_acquire;
 			if (!$txn) {
 				$xdb->begin_transaction;
@@ -61,7 +61,7 @@ sub partition_worker_loop ($$$) {
 			$n == $len or die "short read: $n != $len\n";
 			my $mime = PublicInbox::MIME->new(\$msg);
 			$artnum = int($artnum);
-			$self->add_message($mime, $n, $artnum, $object_id);
+			$self->add_message($mime, $n, $artnum, $oid, $mid0);
 		}
 	}
 	warn "$$ still in transaction\n" if $txn;
@@ -70,9 +70,9 @@ sub partition_worker_loop ($$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $len, $msgref, $artnum, $object_id) = @_;
+	my ($self, $len, $msgref, $artnum, $object_id, $mid0) = @_;
 	my $w = $self->{w};
-	print $w "$len $artnum $object_id\n", $$msgref or die
+	print $w "$len $artnum $object_id $mid0\n", $$msgref or die
 		"failed to write partition $!\n";
 	$w->flush or die "failed to flush: $!\n";
 }
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 93e6fd8..a62a649 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -31,13 +31,14 @@ sub get_val ($$) {
 
 sub load_from_data ($$) {
 	my ($self) = $_[0]; # data = $_[1]
-	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $_[1]);
+	my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]);
 	$self->{subject} = $subj;
 	$self->{from} = $from;
 	$self->{references} = $refs;
 	$self->{to} = $to;
 	$self->{cc} = $cc;
 	$self->{blob} = $blob;
+	$self->{mid} = $mid0;
 }
 
 sub load_expand {
@@ -120,11 +121,11 @@ sub ts {
 }
 
 sub to_doc_data {
-	my ($self, $blob) = @_;
+	my ($self, $oid, $mid0) = @_;
 	my @rows = ($self->subject, $self->from, $self->references,
 			$self->to, $self->cc);
-	push @rows, $blob if defined $blob;
-	join("\n", @rows);
+	$oid = '' unless defined $oid;
+	join("\n", @rows, $oid, $mid0);
 }
 
 sub references {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index c73d859..caabc8e 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -62,8 +62,10 @@ sub add {
 	# leaking FDs to it...
 	$self->idx_init;
 
-	my $num = num_for($self, $mime);
+	my $mid0;
+	my $num = num_for($self, $mime, \$mid0);
 	defined $num or return; # duplicate
+	defined $mid0 or die "BUG: $mid0 undefined\n";
 	my $im = $self->importer;
 	my $cmt = $im->add($mime);
 	$cmt = $im->get_mark($cmt);
@@ -73,7 +75,7 @@ sub add {
 	my $nparts = $self->{partitions};
 	my $part = $num % $nparts;
 	my $idx = $self->idx_part($part);
-	$idx->index_raw($len, $msgref, $num, $oid);
+	$idx->index_raw($len, $msgref, $num, $oid, $mid0);
 	my $n = $self->{transact_bytes} += $len;
 	if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
 		$self->checkpoint;
@@ -83,12 +85,15 @@ sub add {
 }
 
 sub num_for {
-	my ($self, $mime) = @_;
+	my ($self, $mime, $mid0) = @_;
 	my $mids = mids($mime->header_obj);
 	if (@$mids) {
 		my $mid = $mids->[0];
 		my $num = $self->{skel}->{mm}->mid_insert($mid);
-		return $num if defined($num); # common case
+		if (defined $num) { # common case
+			$$mid0 = $mid;
+			return $num;
+		};
 
 		# crap, Message-ID is already known, hope somebody just resent:
 		$self->done; # write barrier, clears $self->{skel}
@@ -111,38 +116,39 @@ sub num_for {
 			$num = $self->{skel}->{mm}->mid_insert($m);
 			if (defined $num) {
 				warn "alternative <$m> for <$mid> found\n";
+				$$mid0 = $m;
 				return $num;
 			}
 		}
 	}
 	# none of the existing Message-IDs are good, generate a new one:
-	num_for_harder($self, $mime);
+	num_for_harder($self, $mime, $mid0);
 }
 
 sub num_for_harder {
-	my ($self, $mime) = @_;
+	my ($self, $mime, $mid0) = @_;
 
 	my $hdr = $mime->header_obj;
 	my $dig = content_digest($mime);
-	my $mid = $dig->clone->hexdigest . '@localhost';
-	my $num = $self->{skel}->{mm}->mid_insert($mid);
+	$$mid0 = $dig->clone->hexdigest . '@localhost';
+	my $num = $self->{skel}->{mm}->mid_insert($$mid0);
 	unless (defined $num) {
 		# it's hard to spoof the last Received: header
 		my @recvd = $hdr->header_raw('Received');
 		$dig->add("Received: $_") foreach (@recvd);
-		$mid = $dig->clone->hexdigest . '@localhost';
-		$num = $self->{skel}->{mm}->mid_insert($mid);
+		$$mid0 = $dig->clone->hexdigest . '@localhost';
+		$num = $self->{skel}->{mm}->mid_insert($$mid0);
 
 		# fall back to a random Message-ID and give up determinism:
 		until (defined($num)) {
 			$dig->add(rand);
-			$mid = $dig->clone->hexdigest . '@localhost';
-			warn "using random Message-ID <$mid> as fallback\n";
-			$num = $self->{skel}->{mm}->mid_insert($mid);
+			$$mid0 = $dig->clone->hexdigest . '@localhost';
+			warn "using random Message-ID <$$mid0> as fallback\n";
+			$num = $self->{skel}->{mm}->mid_insert($$mid0);
 		}
 	}
 	my @cur = $hdr->header_raw('Message-Id');
-	$hdr->header_set('Message-Id', "<$mid>", @cur);
+	$hdr->header_set('Message-Id', "<$$mid0>", @cur);
 	$num;
 }
 
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
index c3534f6..6c6e4c5 100644
--- a/t/search-thr-index.t
+++ b/t/search-thr-index.t
@@ -41,8 +41,8 @@ foreach (reverse split(/\n\n/, $data)) {
 	$mime->header_set('From' => 'bw@g');
 	$mime->header_set('To' => 'git@vger.kernel.org');
 	my $bytes = bytes::length($mime->as_string);
-	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored');
 	my $mid = $mime->header('Message-Id');
+	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid);
 	push @mids, $mid;
 	ok($doc_id, 'message added: '. $mid);
 }
diff --git a/t/v2writable.t b/t/v2writable.t
index f95b2e7..bf8ae5e 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -93,5 +93,78 @@ ok($im->add($mime), 'ordinary message added');
 	ok($found[1]->{doc_id} > 0, 'doc_id is positive');
 }
 
+SKIP: {
+	use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD);
+	use Net::NNTP;
+	use IO::Socket;
+	use Socket qw(SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY);
+	eval { require Danga::Socket };
+	skip "Danga::Socket missing $@", 2 if $@;
+	my $err = "$mainrepo/stderr.log";
+	my $out = "$mainrepo/stdout.log";
+	my %opts = (
+		LocalAddr => '127.0.0.1',
+		ReuseAddr => 1,
+		Proto => 'tcp',
+		Type => SOCK_STREAM,
+		Listen => 1024,
+	);
+	my $group = 'inbox.comp.test.v2writable';
+	my $pi_config = "$mainrepo/pi_config";
+	open my $fh, '>', $pi_config or die "open: $!\n";
+	print $fh <<EOF
+[publicinbox "test-v2writable"]
+	mainrepo = $mainrepo
+	version = 2
+	address = test\@example.com
+	newsgroup = $group
+EOF
+	;
+	close $fh or die "close: $!\n";
+	my $sock = IO::Socket::INET->new(%opts);
+	ok($sock, 'sock created');
+	my $pid;
+	my $len;
+	END { kill 'TERM', $pid if defined $pid };
+	$! = 0;
+	my $fl = fcntl($sock, F_GETFD, 0);
+	ok(! $!, 'no error from fcntl(F_GETFD)');
+	is($fl, FD_CLOEXEC, 'cloexec set by default (Perl behavior)');
+	$pid = fork;
+	if ($pid == 0) {
+		use POSIX qw(dup2);
+		$ENV{PI_CONFIG} = $pi_config;
+		# pretend to be systemd
+		fcntl($sock, F_SETFD, $fl &= ~FD_CLOEXEC);
+		dup2(fileno($sock), 3) or die "dup2 failed: $!\n";
+		$ENV{LISTEN_PID} = $$;
+		$ENV{LISTEN_FDS} = 1;
+		my $nntpd = 'blib/script/public-inbox-nntpd';
+		exec $nntpd, "--stdout=$out", "--stderr=$err";
+		die "FAIL: $!\n";
+	}
+	ok(defined $pid, 'forked nntpd process successfully');
+	$! = 0;
+	fcntl($sock, F_SETFD, $fl |= FD_CLOEXEC);
+	ok(! $!, 'no error from fcntl(F_SETFD)');
+	my $host_port = $sock->sockhost . ':' . $sock->sockport;
+	my $n = Net::NNTP->new($host_port);
+	$n->group($group);
+	my $x = $n->xover('1-');
+	my %uniq;
+	foreach my $num (sort { $a <=> $b } keys %$x) {
+		my $mid = $x->{$num}->[3];
+		is($uniq{$mid}++, 0, "MID for $num is unique in XOVER");
+		is_deeply($n->xhdr('Message-ID', $num),
+			 { $num => $mid }, "XHDR lookup OK on num $num");
+		is_deeply($n->xhdr('Message-ID', $mid),
+			 { $mid => $mid }, "XHDR lookup OK on MID $num");
+	}
+	my %nn;
+	foreach my $mid (@{$n->newnews(0, $group)}) {
+		is($nn{$mid}++, 0, "MID is unique in NEWNEWS");
+	}
+	is_deeply([sort keys %nn], [sort keys %uniq]);
+};
 
 done_testing();
-- 
EW


  parent reply	other threads:[~2018-03-06  8:42 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-06  8:42 [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 01/34] v2writable: delete ::Import obj when ->done Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 02/34] search: remove informational "warning" message Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 03/34] searchidx: add PID to error message when die-ing Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 04/34] content_id: special treatment for Message-Id headers Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 05/34] evcleanup: disable outside of daemon Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 06/34] v2writable: deduplicate detection on add Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 07/34] evcleanup: do not create event loop if nothing was registered Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 08/34] mid: add `mids' and `references' methods for extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 09/34] content_id: use `mids' and `references' for MID extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 10/34] searchidx: use new `references' method for parsing References Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 11/34] content_id: no need to be human-friendly Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 12/34] v2writable: inject new Message-IDs on true duplicates Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 13/34] search: revert to using 'Q' as a uniQue id per-Xapian conventions Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 14/34] searchidx: support indexing multiple MIDs Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 15/34] mid: be strict with References, but loose on Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 16/34] searchidx: avoid excessive XNQ indexing with diffs Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 17/34] searchidxskeleton: add a note about locking Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 18/34] v2writable: generated Message-ID goes first Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 19/34] searchidx: use add_boolean_term for internal terms Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 20/34] searchidx: add NNTP article number as a searchable term Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 21/34] mid: truncate excessively long MIDs early Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 22/34] nntp: use NNTP article numbers for lookups Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 23/34] nntp: fix NEWNEWS command Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-06  8:42 ` [PATCH 25/34] import: consolidate object info for v2 imports Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 26/34] v2: avoid redundant/repeated configs for git partition repos Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 27/34] INSTALL: document more optional dependencies Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 28/34] search: favor skeleton DB for lookup_mail Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 29/34] search: each_smsg_by_mid uses skeleton if available Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 30/34] v2writable: remove unnecessary skeleton commit Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 31/34] favor Received: date over Date: header globally Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 32/34] import: fall back to Sender for extracting name and email Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 33/34] scripts/import_vger_from_mbox: perform mboxrd or mboxo escaping Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 34/34] v2writable: detect and use previous partition count Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:53 ` [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180306084242.19988-25-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).