user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 06/34] v2writable: deduplicate detection on add
Date: Tue,  6 Mar 2018 08:42:14 +0000	[thread overview]
Message-ID: <20180306084242.19988-7-e@80x24.org> (raw)
In-Reply-To: <20180306084242.19988-1-e@80x24.org>

This is a bit expensive in a multi-process situation because
we need to make our indices and packs visible to the read-only
pieces.
---
 lib/PublicInbox/Search.pm     | 16 ++++++++++
 lib/PublicInbox/SearchIdx.pm  |  9 ++++--
 lib/PublicInbox/V2Writable.pm | 68 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 21c72b6..c074410 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -371,6 +371,22 @@ sub lookup_mail { # no ghosts!
 	});
 }
 
+sub each_smsg_by_mid {
+	my ($self, $mid, $cb) = @_;
+	$mid = mid_clean($mid);
+	my $xdb = $self->{xdb};
+	# XXX retry_reopen isn't necessary for V2Writable, but the PSGI
+	# interface will need it...
+	my ($head, $tail) = $self->find_doc_ids('XMID' . $mid);
+	for (; $head->nequal($tail); $head->inc) {
+		my $doc_id = $head->get_docid;
+		my $doc = $xdb->get_document($doc_id);
+		my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+		$smsg->{doc_id} = $doc_id;
+		$cb->($smsg) or return;
+	}
+}
+
 sub find_unique_doc_id {
 	my ($self, $termval) = @_;
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ec3a6f3..ed52e38 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -284,7 +284,11 @@ sub add_message {
 	my $db = $self->{xdb};
 
 	my ($doc_id, $old_tid);
-	my $mid = mid_clean(mid_mime($mime));
+	my @mids = mid_mime($mime);
+	if (@mids > 1) {
+		warn "Multi-MID: ( ",join(' | ', @mids)," )\n";
+	}
+	my $mid = mid_clean($mids[0]);
 	my $skel = $self->{skeleton};
 
 	eval {
@@ -512,13 +516,12 @@ sub unindex_blob {
 }
 
 sub index_mm {
-	my ($self, $mime, $warn_existing) = @_;
+	my ($self, $mime) = @_;
 	my $mid = mid_clean(mid_mime($mime));
 	my $mm = $self->{mm};
 	my $num = $mm->mid_insert($mid);
 	return $num if defined $num;
 
-	warn "<$mid> reused\n" if $warn_existing;
 	# fallback to num_for since filters like RubyLang set the number
 	$mm->num_for($mid);
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0470fb0..57cb7d3 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -11,6 +11,9 @@ use PublicInbox::SearchIdxSkeleton;
 use PublicInbox::MIME;
 use PublicInbox::Git;
 use PublicInbox::Import;
+use PublicInbox::MID qw(mid_clean mid_mime);
+use PublicInbox::ContentId qw(content_id);
+use PublicInbox::Inbox;
 
 # an estimate of the post-packed size to the raw uncompressed size
 my $PACKING_FACTOR = 0.4;
@@ -46,22 +49,40 @@ sub new {
 # mimics Import::add and wraps it for v2
 sub add {
 	my ($self, $mime, $check_cb) = @_;
-	my $existing = $self->lookup_content($mime);
 
-	if ($existing) {
-		return undef if $existing->type eq 'mail'; # duplicate
+	# spam check:
+	if ($check_cb) {
+		$mime = $check_cb->($mime) or return;
 	}
 
-	my $im = $self->importer;
+	# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
+	# as does SQLite 3.4.1+ (released in 2007-07-20), and
+	# Xapian 1.3.2+ (released 2015-03-15).
+	# For the most part, we can spawn git-fast-import without
+	# leaking FDs to it...
+	$self->idx_init;
+
+	my $mid = mid_clean(mid_mime($mime));
+	my $num = $self->{skel}->{mm}->mid_insert($mid);
+	if (!defined($num)) { # mid is already known
+		$self->done; # ensure all subprocesses are done writing
+
+		my $existing = $self->lookup_content($mime);
+		warn "<$mid> resent\n" if $existing;
+		return if $existing; # easy, don't store duplicates
+
+		# reuse NNTP article number?
+		warn "<$mid> reused for mismatched content\n";
+		$self->idx_init;
+		$num = $self->{skel}->{mm}->num_for($mid);
+	}
 
-	# im->add returns undef if check_cb fails
-	my $cmt = $im->add($mime, $check_cb) or return;
+	my $im = $self->importer;
+	my $cmt = $im->add($mime);
 	$cmt = $im->get_mark($cmt);
 	my $oid = $im->{last_object_id};
 	my ($len, $msgref) = @{$im->{last_object}};
 
-	$self->idx_init;
-	my $num = $self->{skel}->index_mm($mime, 1);
 	my $nparts = $self->{partitions};
 	my $part = $num % $nparts;
 	my $idx = $self->idx_part($part);
@@ -83,6 +104,12 @@ sub idx_part {
 sub idx_init {
 	my ($self) = @_;
 	return if $self->{idx_parts};
+	my $ibx = $self->{-inbox};
+
+	# do not leak read-only FDs to child processes, we only have these
+	# FDs for duplicate detection so they should not be
+	# frequently activated.
+	delete $ibx->{$_} foreach (qw(git mm search));
 
 	# first time initialization, first we create the skeleton pipe:
 	my $skel = $self->{skel} = PublicInbox::SearchIdxSkeleton->new($self);
@@ -241,7 +268,30 @@ sub import_init {
 }
 
 sub lookup_content {
-	undef # TODO
+	my ($self, $mime) = @_;
+	my $ibx = $self->{-inbox};
+
+	my $srch = $ibx->search;
+	my $cid = content_id($mime);
+	my $found;
+	my $mid = mid_mime($mime);
+	$srch->each_smsg_by_mid($mid, sub {
+		my ($smsg) = @_;
+		$smsg->load_expand;
+		my $msg = $ibx->msg_by_smsg($smsg);
+		if (!defined($msg)) {
+			warn "broken smsg for $mid\n";
+			return 1; # continue
+		}
+		my $cur = PublicInbox::MIME->new($msg);
+		if (content_id($cur) eq $cid) {
+			$smsg->{mime} = $cur;
+			$found = $smsg;
+			return 0; # break out of loop
+		}
+		1; # continue
+	});
+	$found;
 }
 
 sub atfork_child {
-- 
EW


  parent reply	other threads:[~2018-03-06  8:42 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-06  8:42 [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 01/34] v2writable: delete ::Import obj when ->done Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 02/34] search: remove informational "warning" message Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 03/34] searchidx: add PID to error message when die-ing Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 04/34] content_id: special treatment for Message-Id headers Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 05/34] evcleanup: disable outside of daemon Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-06  8:42 ` [PATCH 07/34] evcleanup: do not create event loop if nothing was registered Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 08/34] mid: add `mids' and `references' methods for extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 09/34] content_id: use `mids' and `references' for MID extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 10/34] searchidx: use new `references' method for parsing References Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 11/34] content_id: no need to be human-friendly Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 12/34] v2writable: inject new Message-IDs on true duplicates Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 13/34] search: revert to using 'Q' as a uniQue id per-Xapian conventions Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 14/34] searchidx: support indexing multiple MIDs Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 15/34] mid: be strict with References, but loose on Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 16/34] searchidx: avoid excessive XNQ indexing with diffs Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 17/34] searchidxskeleton: add a note about locking Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 18/34] v2writable: generated Message-ID goes first Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 19/34] searchidx: use add_boolean_term for internal terms Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 20/34] searchidx: add NNTP article number as a searchable term Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 21/34] mid: truncate excessively long MIDs early Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 22/34] nntp: use NNTP article numbers for lookups Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 23/34] nntp: fix NEWNEWS command Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 24/34] searchidx: store the primary MID in doc data for NNTP Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 25/34] import: consolidate object info for v2 imports Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 26/34] v2: avoid redundant/repeated configs for git partition repos Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 27/34] INSTALL: document more optional dependencies Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 28/34] search: favor skeleton DB for lookup_mail Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 29/34] search: each_smsg_by_mid uses skeleton if available Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 30/34] v2writable: remove unnecessary skeleton commit Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 31/34] favor Received: date over Date: header globally Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 32/34] import: fall back to Sender for extracting name and email Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 33/34] scripts/import_vger_from_mbox: perform mboxrd or mboxo escaping Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 34/34] v2writable: detect and use previous partition count Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:53 ` [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180306084242.19988-7-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).