user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH] v2writable: drop SQLite-based multi_mid_q_new
@ 2020-04-20  9:59  7% Eric Wong
  0 siblings, 0 replies; 4+ results
From: Eric Wong @ 2020-04-20  9:59 UTC (permalink / raw)
  To: meta

We switched to the SDBM-based queue to store author/committer
info last month.

Fixes: c7acdfe78bda5bf3 ("v2: SDBM-based multi Message-ID queue")
---
 lib/PublicInbox/V2Writable.pm | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1c78ef24..feab606d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -980,18 +980,6 @@ sub check_unindexed ($$$) {
 	}
 }
 
-# reuse Msgmap to store num => oid mapping (rather than num => mid)
-sub multi_mid_q_new () {
-	my ($fh, $fn) = tempfile('multi_mid-XXXXXXX', EXLOCK => 0, TMPDIR => 1);
-	my $multi_mid = PublicInbox::Msgmap->new_file($fn, 1);
-	$multi_mid->{dbh}->do('PRAGMA synchronous = OFF');
-	# for Msgmap->DESTROY:
-	$multi_mid->{tmp_name} = $fn;
-	$multi_mid->{pid} = $$;
-	close $fh or die "failed to close $fn: $!";
-	$multi_mid
-}
-
 sub multi_mid_q_push ($$$) {
 	my ($self, $sync, $oid) = @_;
 	my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;

^ permalink raw reply related	[relevance 7%]

* [PATCH] t/multi-mid: allow test to run w/o Xapian
@ 2020-03-30 19:42  6% Eric Wong
  0 siblings, 0 replies; 4+ results
From: Eric Wong @ 2020-03-30 19:42 UTC (permalink / raw)
  To: meta

While the v1 inbox in this test is created without Xapian,
the v2 inbox in this test defaults to having Xapian enabled
regardless of whether it's installed or not.

Fixes: c7acdfe78bda5bf3 ("v2: SDBM-based multi Message-ID queue")
---
 t/multi-mid.t | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/multi-mid.t b/t/multi-mid.t
index 87240c2c..31a8fd74 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -67,7 +67,7 @@ for my $order ([$bad, $good], [$good, $bad]) {
 
 	system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
 		"$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
-	$cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+	$cmd = [ '-init', '-Lbasic', '-V2', 'v2c', "$tmpdir/v2-clone",
 		'http://example.com/v2c', 'v2c@example.com' ];
 	ok(run_script($cmd, $env), 'init clone');
 	$cmd = [ '-index', "$tmpdir/v2-clone" ];

^ permalink raw reply related	[relevance 6%]

* [PATCH 9/9] v2: SDBM-based multi Message-ID queue
  2020-03-20  8:18  6% ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
@ 2020-03-20  8:18  4%   ` Eric Wong
  0 siblings, 0 replies; 4+ results
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

This lets us store author and committer times for deferred
indexing messages with ambiguous Message-IDs.  This allows
us to reproducibly reindex messages with the git commit
and author times when a rare message lacks Received and/or
Date headers while having ambiguous Message-IDs.
---
 MANIFEST                         |  1 +
 lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    | 23 +++++++------
 t/multi-mid.t                    | 27 +++++++++++++--
 4 files changed, 95 insertions(+), 13 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm

diff --git a/MANIFEST b/MANIFEST
index ec80c90f..f077d722 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MsgIter.pm
 lib/PublicInbox/MsgTime.pm
 lib/PublicInbox/Msgmap.pm
+lib/PublicInbox/MultiMidQueue.pm
 lib/PublicInbox/NNTP.pm
 lib/PublicInbox/NNTPD.pm
 lib/PublicInbox/NNTPdeflate.pm
diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000);
+my $O_CLOEXEC = $e{$^O} // 0;
+
+sub new {
+	my ($class) = @_;
+	my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1);
+	my $base = $tmpdir->dirname . '/q';
+	my %sdbm;
+	my $flags = O_RDWR|O_CREAT;
+	if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) {
+		if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) {
+			die "could not tie ($base): $!";
+		}
+		$O_CLOEXEC = 0;
+	}
+
+	bless {
+		cur => 1,
+		min => 1,
+		max => 0,
+		sdbm => \%sdbm,
+		tmpdir => $tmpdir,
+	}, $class;
+}
+
+sub set_oid {
+	my ($self, $i, $oid, $v2w) = @_;
+	$self->{max} = $i if $i > $self->{max};
+	$self->{min} = $i if $i < $self->{min};
+	$self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}";
+}
+
+sub get_oid {
+	my ($self, $i, $v2w) = @_;
+	my $rec = $self->{sdbm}->{$i} or return;
+	my ($oid, $autime, $cotime) = split(/\t/, $rec);
+	$v2w->{autime} = $autime;
+	$v2w->{cotime} = $cotime;
+	$oid
+}
+
+sub push_oid {
+	my ($self, $oid, $v2w) = @_;
+	set_oid($self, $self->{cur}++, $oid, $v2w);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
 	$multi_mid
 }
 
-sub multi_mid_q_push ($$) {
-	my ($sync, $oid) = @_;
-	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) {
+	my ($self, $sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
 	if ($sync->{reindex}) { # no regen on reindex
-		$multi_mid->mid_insert($oid);
+		$multi_mid->push_oid($oid, $self);
 	} else {
 		my $num = $sync->{regen}--;
 		die "BUG: ran out of article numbers" if $num <= 0;
-		$multi_mid->mid_set($num, $oid);
+		$multi_mid->set_oid($num, $oid, $self);
 	}
 }
 
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			multi_mid_q_push($sync, $oid);
+			multi_mid_q_push($self, $sync, $oid);
 		}
 		return;
 	}
@@ -1352,19 +1353,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-		my ($min, $max) = $multi_mid->minmax;
+		my $min = $multi_mid->{min};
+		my $max = $multi_mid->{max};
 		if ($sync->{reindex}) {
 			# we may need to create new Message-IDs if mirrors
 			# were initially indexed with old versions
 			for (my $i = $max; $i >= $min; $i--) {
-				my $oid = $multi_mid->mid_for($i);
+				my $oid;
+				$oid = $multi_mid->get_oid($i, $self) or next;
 				next unless defined $oid;
 				reindex_oid_m($self, $sync, $git, $oid);
 			}
 		} else { # regen on initial index
 			for my $num ($min..$max) {
-				my $oid = $multi_mid->mid_for($num);
-				next unless defined $oid;
+				my $oid;
+				$oid = $multi_mid->get_oid($num, $self) or next;
 				reindex_oid_m($self, $sync, $git, $oid, $num);
 			}
 		}
diff --git a/t/multi-mid.t b/t/multi-mid.t
index df865efb..87240c2c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
 use Test::More;
 use PublicInbox::MIME;
 use PublicInbox::TestCommon;
@@ -7,6 +8,7 @@ use PublicInbox::InboxWritable;
 require_git(2.6);
 require_mods(qw(DBD::SQLite));
 require PublicInbox::SearchIdx;
+my $delay = $ENV{TEST_DELAY_CONVERT};
 
 my $addr = 'test@example.com';
 my $bad = PublicInbox::MIME->new(<<EOF);
@@ -14,14 +16,12 @@ Message-ID: <a\@example.com>
 Message-ID: <b\@example.com>
 From: a\@example.com
 To: $addr
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 Subject: bad
 
 EOF
 
 my $good = PublicInbox::MIME->new(<<EOF);
 Message-ID: <b\@example.com>
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 From: b\@example.com
 To: $addr
 Subject: good
@@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) {
 		indexlevel => 'basic',
 		-primary_address => $addr,
 	}, my $creat_opt = {});
+	my @old;
 	if ('setup v1 inbox') {
 		my $im = $ibx->importer(0);
-		ok($im->add($_), 'added '.$_->header('Subject')) for @$order;
+		for (@$order) {
+			ok($im->add($_), 'added '.$_->header('Subject'));
+			sleep($delay) if $delay;
+		}
 		$im->done;
 		my $s = PublicInbox::SearchIdx->new($ibx, 1);
 		$s->index_sync;
 		$before = [ $ibx->mm->minmax ];
+		@old = ($ibx->over->get_art(1), $ibx->over->get_art(2));
 		$ibx->cleanup;
 	}
 	my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') };
@@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) {
 	$ibx->{inboxdir} = "$tmpdir/v2";
 	is_deeply([$ibx->mm->minmax], $before,
 		'min, max article numbers unchanged');
+
+	my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2, \@old, 'v2 conversion times match');
+
+	system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
+		"$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
+	$cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+		'http://example.com/v2c', 'v2c@example.com' ];
+	ok(run_script($cmd, $env), 'init clone');
+	$cmd = [ '-index', "$tmpdir/v2-clone" ];
+	sleep($delay) if $delay;
+	ok(run_script($cmd, $env), 'index the clone');
+	$ibx->cleanup;
+	$ibx->{inboxdir} = "$tmpdir/v2-clone";
+	my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2c, \@old, 'v2 clone times match');
 }
 
 done_testing();

^ permalink raw reply related	[relevance 4%]

* [PATCH 0/9] preserve time and date of initial commit
  @ 2020-03-20  8:18  6% ` Eric Wong
  2020-03-20  8:18  4%   ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong
  0 siblings, 1 reply; 4+ results
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

For messages lacking Date and/or Received headers, search
queries for "d:YYYYMMDD..YYYYMMDD" ranges can be unreliable in
mirrors, as can the $INBOX_URL/?t=$TIMESTAMP query which
only hits SQLite.

Yes, this ended up being a lot of work to deal with corner case
messages (probably most of which are spam), but there's also a
lot of internal cleanups which made the end result easier to
follow, I think...

The main fix is actually in 1/9, but it's gross.

Patch 2/9 fixes a small window where a race can happen and
cause searches to be off by a minute.

Patches 3-8 clean up the mess left in 1 and 2.

Finally, patch 9 fixes the corner-case-of-corner-cases for
dealing with multi-MID messages which require a one-off queue to
store the git commit/author times instead of overloading msgmap.

Eric Wong (9):
  index: use git commit times on missing Date/Received
  v2writable: preserve timestamps from import if generated
  rename PublicInbox::SearchMsg => PublicInbox::Smsg
  smsg: to_doc_data: use existing fields
  overidx: parse_references: less error-prone args
  *idx: pass $smsg in more places instead of many args
  v2: pass smsg in more places
  *idx: pass smsg in even more places
  v2: SDBM-based multi Message-ID queue

 Documentation/mknews.perl                   |  4 +-
 Documentation/technical/data_structures.txt |  4 +-
 MANIFEST                                    |  4 +-
 lib/PublicInbox/ExtMsg.pm                   |  3 +-
 lib/PublicInbox/Feed.pm                     |  4 +-
 lib/PublicInbox/Import.pm                   | 19 +++--
 lib/PublicInbox/Inbox.pm                    |  2 +-
 lib/PublicInbox/Mbox.pm                     |  4 +-
 lib/PublicInbox/MsgTime.pm                  | 12 +--
 lib/PublicInbox/MultiMidQueue.pm            | 57 +++++++++++++
 lib/PublicInbox/NNTP.pm                     | 14 ++--
 lib/PublicInbox/Over.pm                     |  8 +-
 lib/PublicInbox/OverIdx.pm                  | 30 +++----
 lib/PublicInbox/Search.pm                   |  8 +-
 lib/PublicInbox/SearchIdx.pm                | 68 ++++++++++-----
 lib/PublicInbox/SearchIdxShard.pm           | 26 ++++--
 lib/PublicInbox/SearchView.pm               |  8 +-
 lib/PublicInbox/{SearchMsg.pm => Smsg.pm}   | 19 +++--
 lib/PublicInbox/SolverGit.pm                |  2 +-
 lib/PublicInbox/V2Writable.pm               | 84 ++++++++++++-------
 lib/PublicInbox/View.pm                     |  2 +-
 t/import.t                                  | 14 ++--
 t/index-git-times.t                         | 93 +++++++++++++++++++++
 t/multi-mid.t                               | 27 +++++-
 t/search-thr-index.t                        | 17 +++-
 t/thread-cycle.t                            |  2 +-
 26 files changed, 386 insertions(+), 149 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm
 rename lib/PublicInbox/{SearchMsg.pm => Smsg.pm} (92%)
 create mode 100644 t/index-git-times.t

^ permalink raw reply	[relevance 6%]

Results 1-4 of 4 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-03-05  5:13     [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-20  8:18  6% ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20  8:18  4%   ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong
2020-03-30 19:42  6% [PATCH] t/multi-mid: allow test to run w/o Xapian Eric Wong
2020-04-20  9:59  7% [PATCH] v2writable: drop SQLite-based multi_mid_q_new Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).