user/dev discussion of public-inbox itself
 help / color / Atom feed
* [PATCH] index: use git commit times on missing Date/Received
@ 2020-03-05  3:23 Eric Wong
  2020-03-05  5:13 ` Eric Wong
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Wong @ 2020-03-05  3:23 UTC (permalink / raw)
  To: meta

When indexing messages without Date: and/or Received: headers,
fall back to using timestamps originally recorded by git in the
commit object.  This allows git mirrors to preserve the import
datestamp and timestamp of a message according to what was fed
into git, instead of blindly falling back to the current time.
---
 MANIFEST                          |  1 +
 lib/PublicInbox/MsgTime.pm        | 12 ++--
 lib/PublicInbox/OverIdx.pm        | 10 +++-
 lib/PublicInbox/SearchIdx.pm      | 17 ++++--
 lib/PublicInbox/SearchIdxShard.pm | 15 +++--
 lib/PublicInbox/V2Writable.pm     | 12 ++--
 t/index-git-times.t               | 93 +++++++++++++++++++++++++++++++
 7 files changed, 138 insertions(+), 22 deletions(-)
 create mode 100644 t/index-git-times.t

diff --git a/MANIFEST b/MANIFEST
index 265ad909..f9230cd3 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -242,6 +242,7 @@ t/httpd.t
 t/hval.t
 t/import.t
 t/inbox.t
+t/index-git-times.t
 t/indexlevels-mirror-v1.t
 t/indexlevels-mirror.t
 t/init.t
diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm
index 8703d7bc..bd7ef811 100644
--- a/lib/PublicInbox/MsgTime.pm
+++ b/lib/PublicInbox/MsgTime.pm
@@ -167,21 +167,21 @@ sub msg_date_only ($) {
 }
 
 # Favors Received header for sorting globally
-sub msg_timestamp ($) {
-	my ($hdr) = @_; # Email::MIME::Header
+sub msg_timestamp ($;$) {
+	my ($hdr, $fallback) = @_; # Email::MIME::Header
 	my $ret;
 	$ret = msg_received_at($hdr) and return time_response($ret);
 	$ret = msg_date_only($hdr) and return time_response($ret);
-	wantarray ? (time, '+0000') : time;
+	time_response([ $fallback // time, '+0000' ]);
 }
 
 # Favors the Date: header for display and sorting within a thread
-sub msg_datestamp ($) {
-	my ($hdr) = @_; # Email::MIME::Header
+sub msg_datestamp ($;$) {
+	my ($hdr, $fallback) = @_; # Email::MIME::Header
 	my $ret;
 	$ret = msg_date_only($hdr) and return time_response($ret);
 	$ret = msg_received_at($hdr) and return time_response($ret);
-	wantarray ? (time, '+0000') : time;
+	time_response([ $fallback // time, '+0000' ]);
 }
 
 1;
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 0549c68b..9ee6d613 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -15,6 +15,7 @@ use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
 use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::SearchMsg qw(subject_normalized);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
 
@@ -246,7 +247,7 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+	my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_;
 	my $lines = $mime->body_raw =~ tr!\n!\n!;
 	my $smsg = bless {
 		mime => $mime,
@@ -255,7 +256,8 @@ sub add_overview {
 		lines => $lines,
 		blob => $oid,
 	}, 'PublicInbox::SearchMsg';
-	my $mids = mids_for_index($mime->header_obj);
+	my $hdr = $mime->header_obj;
+	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $mid0, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
@@ -266,7 +268,9 @@ sub add_overview {
 	my $dd = $smsg->to_doc_data($oid, $mid0);
 	utf8::encode($dd);
 	$dd = compress($dd);
-	my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ];
+	my $ds = msg_timestamp($hdr, $times->{autime});
+	my $ts = msg_datestamp($hdr, $times->{cotime});
+	my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ];
 	add_over($self, $values);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c33a48c3..261deb84 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -19,6 +19,7 @@ use POSIX qw(strftime);
 use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::Git qw(git_unquote);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 my $X = \%PublicInbox::Search::X;
 my ($DB_CREATE_OR_OPEN, $DB_OPEN);
 use constant {
@@ -308,10 +309,13 @@ sub index_xapian { # msg_iter callback
 sub add_xapian ($$$$$$) {
 	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
 	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $hdr = $mime->header_obj;
+	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
+	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
 	my $doc = $X->{Document}->new;
 	my $subj = $smsg->subject;
-	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-	my @ds = gmtime($smsg->ds);
+	add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
+	my @ds = gmtime($smsg->{ds});
 	my $yyyymmdd = strftime('%Y%m%d', @ds);
 	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
 	my $dt = strftime('%Y%m%d%H%M%S', @ds);
@@ -375,7 +379,8 @@ sub add_message {
 			add_xapian($self, $mime, $num, $oid, $mids, $mid0);
 		}
 		if (my $over = $self->{over}) {
-			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
+			$over->add_overview($mime, $bytes, $num, $oid, $mid0,
+						$self);
 		}
 	};
 
@@ -596,6 +601,10 @@ sub read_log {
 		} elsif ($line =~ /^commit ($h40)/o) {
 			$latest = $1;
 			$newest ||= $latest;
+		} elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) {
+			$self->{over}->{autime} = $self->{autime} = $1;
+		} elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) {
+			$self->{over}->{cotime} = $self->{cotime} = $1;
 		}
 	}
 	close($log) or die "git log failed: \$?=$?";
@@ -651,7 +660,7 @@ sub _git_log {
 		$self->{regen_down} = $high + $fcount;
 	}
 
-	$git->popen(qw/log --no-notes --no-color --no-renames
+	$git->popen(qw/log --pretty=raw --no-notes --no-color --no-renames
 				--raw -r --no-abbrev/, $range);
 }
 
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index ee176e50..c69e0cc8 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -67,12 +67,15 @@ sub shard_worker_loop ($$$$$) {
 			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
-			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
+			my ($len, $artnum, $oid, $mid0, $autime, $cotime) =
+							split(/ /, $line);
 			$self->begin_txn_lazy;
 			my $n = read($r, my $msg, $len) or die "read: $!\n";
 			$n == $len or die "short read: $n != $len\n";
 			my $mime = PublicInbox::MIME->new(\$msg);
 			$artnum = int($artnum);
+			$self->{autime} = $autime;
+			$self->{cotime} = $cotime;
 			$self->add_message($mime, $n, $artnum, $oid, $mid0);
 		}
 	}
@@ -81,13 +84,17 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_;
+	my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime, $times) = @_;
+	my $at = $times->{autime};
+	my $ct = $times->{cotime};
 	if (my $w = $self->{w}) {
-		print $w "$bytes $artnum $oid $mid0\n", $$msgref or die
-			"failed to write shard $!\n";
+		print $w "$bytes $artnum $oid $mid0 $at $ct\n", $$msgref or
+			die "failed to write shard $!\n";
 	} else {
 		$$msgref = undef;
 		$self->begin_txn_lazy;
+		$self->{autime} = $at;
+		$self->{cotime} = $ct;
 		$self->add_message($mime, $bytes, $artnum, $oid, $mid0);
 	}
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b42e6a13..dea9fff9 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -150,9 +150,9 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$$$$) {
 	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self);
 	my $idx = idx_shard($self, $num % $self->{shards});
-	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
 	my $n = $self->{transact_bytes} += $len;
 	$n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
 }
@@ -1266,15 +1266,17 @@ sub index_epoch ($$$) {
 		$pr->("$i.git indexing $range\n");
 	}
 
-	my @cmd = qw(log --raw -r --pretty=tformat:%H
+	my @cmd = qw(log --raw -r --pretty=tformat:%H.%at.%ct
 			--no-notes --no-color --no-abbrev --no-renames);
 	my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
 	my $cmt;
 	while (<$fh>) {
 		chomp;
 		$self->{current_info} = "$i.git $_";
-		if (/\A$x40$/o && !defined($cmt)) {
-			$cmt = $_;
+		if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o && !defined($cmt)) {
+			$cmt = $1;
+			$self->{autime} = $2;
+			$self->{cotime} = $3;
 		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
 			reindex_oid($self, $sync, $git, $1);
 		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
diff --git a/t/index-git-times.t b/t/index-git-times.t
new file mode 100644
index 00000000..2e9e88e8
--- /dev/null
+++ b/t/index-git-times.t
@@ -0,0 +1,93 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Import;
+use PublicInbox::Config;
+use File::Path qw(remove_tree);
+
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::Over';
+
+my ($tmpdir, $for_destroy) = tmpdir();
+local $ENV{PI_CONFIG} = "$tmpdir/cfg";
+my $v1dir = "$tmpdir/v1";
+my $addr = 'x@example.com';
+run_script(['-init', '--indexlevel=medium', 'v1', $v1dir,
+		'http://example.com/x', $addr])
+	or die "init failed";
+
+{
+	my $data = <<'EOF';
+blob
+mark :1
+data 133
+From: timeless <t@example.com>
+To: x <x@example.com>
+Subject: can I haz the time?
+Message-ID: <19700101000000-1234@example.com>
+
+plz
+
+reset refs/heads/master
+commit refs/heads/master
+mark :2
+author timeless <t@example.com> 749520000 +0100
+committer x <x@example.com> 1285977600 -0100
+data 20
+can I haz the time?
+M 100644 :1 53/256f6177504c2878d3a302ef5090dacf5e752c
+
+EOF
+	pipe(my($r, $w)) or die;
+	length($data) <= 512 or die "data too large to fit in POSIX pipe";
+	print $w $data or die;
+	close $w or die;
+	my $cmd = ['git', "--git-dir=$v1dir", 'fast-import', '--quiet'];
+	PublicInbox::Import::run_die($cmd, undef, { 0 => $r });
+}
+
+run_script(['-index', $v1dir]) or die 'v1 index failed';
+my $smsg;
+{
+	my $cfg = PublicInbox::Config->new;
+	my $ibx = $cfg->lookup($addr);
+	$smsg = $ibx->over->get_art(1);
+	is($smsg->{ds}, 749520000, 'datestamp from git author time');
+	is($smsg->{ts}, 1285977600, 'timestamp from git committer time');
+	my $res = $ibx->search->query("m:$smsg->{mid}");
+	is(scalar @$res, 1, 'got one result for m:');
+	is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp');
+	$res = $ibx->search->query('d:19931002..19931002');
+	is(scalar @$res, 1, 'got one result for d:');
+	is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp');
+}
+SKIP: {
+	require_git(2.6, 1) or skip('git 2.6+ required for v2', 10);
+	my $v2dir = "$tmpdir/v2";
+	run_script(['-convert', $v1dir, $v2dir]) or die 'v2 conversion failed';
+
+	my $check_v2 = sub {
+		my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir,
+				address => $addr});
+		my $v2smsg = $ibx->over->get_art(1);
+		is($v2smsg->{ds}, $smsg->{ds},
+			'v2 datestamp from git author time');
+		is($v2smsg->{ts}, $smsg->{ts},
+			'v2 timestamp from git committer time');
+		my $res = $ibx->search->query("m:$smsg->{mid}");
+		is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp');
+		$res = $ibx->search->query('d:19931002..19931002');
+		is(scalar @$res, 1, 'got one result for d:');
+		is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp');
+	};
+	$check_v2->();
+	remove_tree($v2dir);
+
+	# test non-parallelized conversion
+	run_script(['-convert', '-j0', $v1dir, $v2dir]) or
+		die 'v2 conversion failed';
+	$check_v2->();
+}
+
+done_testing;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] index: use git commit times on missing Date/Received
  2020-03-05  3:23 [PATCH] index: use git commit times on missing Date/Received Eric Wong
@ 2020-03-05  5:13 ` Eric Wong
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Wong @ 2020-03-05  5:13 UTC (permalink / raw)
  To: meta

Erm... sent prematurely :x  Warns in tests.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 0/9] preserve time and date of initial commit
  2020-03-05  5:13 ` Eric Wong
@ 2020-03-20  8:18   ` Eric Wong
  2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
                       ` (8 more replies)
  0 siblings, 9 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

For messages lacking Date and/or Received headers, search
queries for "d:YYYYMMDD..YYYYMMDD" ranges can be unreliable in
mirrors, as can the $INBOX_URL/?t=$TIMESTAMP query which
only hits SQLite.

Yes, this ended up being a lot of work to deal with corner case
messages (probably most of which are spam), but there's also a
lot of internal cleanups which made the end result easier to
follow, I think...

The main fix is actually in 1/9, but it's gross.

Patch 2/9 fixes a small window where a race can happen and
cause searches to be off by a minute.

Patches 3-8 cleanup the mess left in 1 and 2,

Finally, patch 9 fixes the corner-case-of-corner-cases for
dealing with multi-MID messages which require a one-off queue to
store the git commit/author times instead of overloading msgmap.

Eric Wong (9):
  index: use git commit times on missing Date/Received
  v2writable: preserve timestamps from import if generated
  rename PublicInbox::SearchMsg => PublicInbox::Smsg
  smsg: to_doc_data: use existing fields
  overidx: parse_references: less error-prone args
  *idx: pass $smsg in more places instead of many args
  v2: pass smsg in more places
  *idx: pass smsg in even more places
  v2: SDBM-based multi Message-ID queue

 Documentation/mknews.perl                   |  4 +-
 Documentation/technical/data_structures.txt |  4 +-
 MANIFEST                                    |  4 +-
 lib/PublicInbox/ExtMsg.pm                   |  3 +-
 lib/PublicInbox/Feed.pm                     |  4 +-
 lib/PublicInbox/Import.pm                   | 19 +++--
 lib/PublicInbox/Inbox.pm                    |  2 +-
 lib/PublicInbox/Mbox.pm                     |  4 +-
 lib/PublicInbox/MsgTime.pm                  | 12 +--
 lib/PublicInbox/MultiMidQueue.pm            | 57 +++++++++++++
 lib/PublicInbox/NNTP.pm                     | 14 ++--
 lib/PublicInbox/Over.pm                     |  8 +-
 lib/PublicInbox/OverIdx.pm                  | 30 +++----
 lib/PublicInbox/Search.pm                   |  8 +-
 lib/PublicInbox/SearchIdx.pm                | 68 ++++++++++-----
 lib/PublicInbox/SearchIdxShard.pm           | 26 ++++--
 lib/PublicInbox/SearchView.pm               |  8 +-
 lib/PublicInbox/{SearchMsg.pm => Smsg.pm}   | 19 +++--
 lib/PublicInbox/SolverGit.pm                |  2 +-
 lib/PublicInbox/V2Writable.pm               | 84 ++++++++++++-------
 lib/PublicInbox/View.pm                     |  2 +-
 t/import.t                                  | 14 ++--
 t/index-git-times.t                         | 93 +++++++++++++++++++++
 t/multi-mid.t                               | 27 +++++-
 t/search-thr-index.t                        | 17 +++-
 t/thread-cycle.t                            |  2 +-
 26 files changed, 386 insertions(+), 149 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm
 rename lib/PublicInbox/{SearchMsg.pm => Smsg.pm} (92%)
 create mode 100644 t/index-git-times.t

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/9] index: use git commit times on missing Date/Received
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
                       ` (7 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

When indexing messages without Date: and/or Received: headers,
fall back to using timestamps originally recorded by git in the
commit object.  This allows git mirrors to preserve the import
datestamp and timestamp of a message according to what was fed
into git, instead of blindly falling back to the current time.
---
 MANIFEST                          |  1 +
 lib/PublicInbox/MsgTime.pm        | 12 ++--
 lib/PublicInbox/OverIdx.pm        | 10 +++-
 lib/PublicInbox/SearchIdx.pm      | 17 ++++--
 lib/PublicInbox/SearchIdxShard.pm | 15 +++--
 lib/PublicInbox/V2Writable.pm     | 12 ++--
 t/index-git-times.t               | 93 +++++++++++++++++++++++++++++++
 7 files changed, 138 insertions(+), 22 deletions(-)
 create mode 100644 t/index-git-times.t

diff --git a/MANIFEST b/MANIFEST
index 265ad909..f9230cd3 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -242,6 +242,7 @@ t/httpd.t
 t/hval.t
 t/import.t
 t/inbox.t
+t/index-git-times.t
 t/indexlevels-mirror-v1.t
 t/indexlevels-mirror.t
 t/init.t
diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm
index 8703d7bc..bd7ef811 100644
--- a/lib/PublicInbox/MsgTime.pm
+++ b/lib/PublicInbox/MsgTime.pm
@@ -167,21 +167,21 @@ sub msg_date_only ($) {
 }
 
 # Favors Received header for sorting globally
-sub msg_timestamp ($) {
-	my ($hdr) = @_; # Email::MIME::Header
+sub msg_timestamp ($;$) {
+	my ($hdr, $fallback) = @_; # Email::MIME::Header
 	my $ret;
 	$ret = msg_received_at($hdr) and return time_response($ret);
 	$ret = msg_date_only($hdr) and return time_response($ret);
-	wantarray ? (time, '+0000') : time;
+	time_response([ $fallback // time, '+0000' ]);
 }
 
 # Favors the Date: header for display and sorting within a thread
-sub msg_datestamp ($) {
-	my ($hdr) = @_; # Email::MIME::Header
+sub msg_datestamp ($;$) {
+	my ($hdr, $fallback) = @_; # Email::MIME::Header
 	my $ret;
 	$ret = msg_date_only($hdr) and return time_response($ret);
 	$ret = msg_received_at($hdr) and return time_response($ret);
-	wantarray ? (time, '+0000') : time;
+	time_response([ $fallback // time, '+0000' ]);
 }
 
 1;
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 0549c68b..9ee6d613 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -15,6 +15,7 @@ use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
 use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::SearchMsg qw(subject_normalized);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
 
@@ -246,7 +247,7 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+	my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_;
 	my $lines = $mime->body_raw =~ tr!\n!\n!;
 	my $smsg = bless {
 		mime => $mime,
@@ -255,7 +256,8 @@ sub add_overview {
 		lines => $lines,
 		blob => $oid,
 	}, 'PublicInbox::SearchMsg';
-	my $mids = mids_for_index($mime->header_obj);
+	my $hdr = $mime->header_obj;
+	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $mid0, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
@@ -266,7 +268,9 @@ sub add_overview {
 	my $dd = $smsg->to_doc_data($oid, $mid0);
 	utf8::encode($dd);
 	$dd = compress($dd);
-	my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ];
+	my $ds = msg_timestamp($hdr, $times->{autime});
+	my $ts = msg_datestamp($hdr, $times->{cotime});
+	my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ];
 	add_over($self, $values);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c33a48c3..261deb84 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -19,6 +19,7 @@ use POSIX qw(strftime);
 use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::Git qw(git_unquote);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 my $X = \%PublicInbox::Search::X;
 my ($DB_CREATE_OR_OPEN, $DB_OPEN);
 use constant {
@@ -308,10 +309,13 @@ sub index_xapian { # msg_iter callback
 sub add_xapian ($$$$$$) {
 	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
 	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $hdr = $mime->header_obj;
+	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
+	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
 	my $doc = $X->{Document}->new;
 	my $subj = $smsg->subject;
-	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-	my @ds = gmtime($smsg->ds);
+	add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
+	my @ds = gmtime($smsg->{ds});
 	my $yyyymmdd = strftime('%Y%m%d', @ds);
 	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
 	my $dt = strftime('%Y%m%d%H%M%S', @ds);
@@ -375,7 +379,8 @@ sub add_message {
 			add_xapian($self, $mime, $num, $oid, $mids, $mid0);
 		}
 		if (my $over = $self->{over}) {
-			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
+			$over->add_overview($mime, $bytes, $num, $oid, $mid0,
+						$self);
 		}
 	};
 
@@ -596,6 +601,10 @@ sub read_log {
 		} elsif ($line =~ /^commit ($h40)/o) {
 			$latest = $1;
 			$newest ||= $latest;
+		} elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) {
+			$self->{over}->{autime} = $self->{autime} = $1;
+		} elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) {
+			$self->{over}->{cotime} = $self->{cotime} = $1;
 		}
 	}
 	close($log) or die "git log failed: \$?=$?";
@@ -651,7 +660,7 @@ sub _git_log {
 		$self->{regen_down} = $high + $fcount;
 	}
 
-	$git->popen(qw/log --no-notes --no-color --no-renames
+	$git->popen(qw/log --pretty=raw --no-notes --no-color --no-renames
 				--raw -r --no-abbrev/, $range);
 }
 
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index ee176e50..74c624a4 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -67,12 +67,15 @@ sub shard_worker_loop ($$$$$) {
 			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
-			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
+			my ($len, $artnum, $oid, $mid0, $autime, $cotime) =
+							split(/ /, $line);
 			$self->begin_txn_lazy;
 			my $n = read($r, my $msg, $len) or die "read: $!\n";
 			$n == $len or die "short read: $n != $len\n";
 			my $mime = PublicInbox::MIME->new(\$msg);
 			$artnum = int($artnum);
+			$self->{autime} = $autime;
+			$self->{cotime} = $cotime;
 			$self->add_message($mime, $n, $artnum, $oid, $mid0);
 		}
 	}
@@ -81,13 +84,17 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_;
+	my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime, $times) = @_;
+	my $at = $times->{autime} // time;
+	my $ct = $times->{cotime} // time;
 	if (my $w = $self->{w}) {
-		print $w "$bytes $artnum $oid $mid0\n", $$msgref or die
-			"failed to write shard $!\n";
+		print $w "$bytes $artnum $oid $mid0 $at $ct\n", $$msgref or
+			die "failed to write shard $!\n";
 	} else {
 		$$msgref = undef;
 		$self->begin_txn_lazy;
+		$self->{autime} = $at;
+		$self->{cotime} = $ct;
 		$self->add_message($mime, $bytes, $artnum, $oid, $mid0);
 	}
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b42e6a13..f1842843 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -150,9 +150,9 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$$$$) {
 	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self);
 	my $idx = idx_shard($self, $num % $self->{shards});
-	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
 	my $n = $self->{transact_bytes} += $len;
 	$n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
 }
@@ -1266,15 +1266,17 @@ sub index_epoch ($$$) {
 		$pr->("$i.git indexing $range\n");
 	}
 
-	my @cmd = qw(log --raw -r --pretty=tformat:%H
+	my @cmd = qw(log --raw -r --pretty=tformat:%H.%at.%ct
 			--no-notes --no-color --no-abbrev --no-renames);
 	my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
 	my $cmt;
 	while (<$fh>) {
 		chomp;
 		$self->{current_info} = "$i.git $_";
-		if (/\A$x40$/o && !defined($cmt)) {
-			$cmt = $_;
+		if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o) {
+			$cmt //= $1;
+			$self->{autime} = $2;
+			$self->{cotime} = $3;
 		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
 			reindex_oid($self, $sync, $git, $1);
 		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
diff --git a/t/index-git-times.t b/t/index-git-times.t
new file mode 100644
index 00000000..2e9e88e8
--- /dev/null
+++ b/t/index-git-times.t
@@ -0,0 +1,93 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::Import;
+use PublicInbox::Config;
+use File::Path qw(remove_tree);
+
+require_mods(qw(DBD::SQLite Search::Xapian));
+use_ok 'PublicInbox::Over';
+
+my ($tmpdir, $for_destroy) = tmpdir();
+local $ENV{PI_CONFIG} = "$tmpdir/cfg";
+my $v1dir = "$tmpdir/v1";
+my $addr = 'x@example.com';
+run_script(['-init', '--indexlevel=medium', 'v1', $v1dir,
+		'http://example.com/x', $addr])
+	or die "init failed";
+
+{
+	my $data = <<'EOF';
+blob
+mark :1
+data 133
+From: timeless <t@example.com>
+To: x <x@example.com>
+Subject: can I haz the time?
+Message-ID: <19700101000000-1234@example.com>
+
+plz
+
+reset refs/heads/master
+commit refs/heads/master
+mark :2
+author timeless <t@example.com> 749520000 +0100
+committer x <x@example.com> 1285977600 -0100
+data 20
+can I haz the time?
+M 100644 :1 53/256f6177504c2878d3a302ef5090dacf5e752c
+
+EOF
+	pipe(my($r, $w)) or die;
+	length($data) <= 512 or die "data too large to fit in POSIX pipe";
+	print $w $data or die;
+	close $w or die;
+	my $cmd = ['git', "--git-dir=$v1dir", 'fast-import', '--quiet'];
+	PublicInbox::Import::run_die($cmd, undef, { 0 => $r });
+}
+
+run_script(['-index', $v1dir]) or die 'v1 index failed';
+my $smsg;
+{
+	my $cfg = PublicInbox::Config->new;
+	my $ibx = $cfg->lookup($addr);
+	$smsg = $ibx->over->get_art(1);
+	is($smsg->{ds}, 749520000, 'datestamp from git author time');
+	is($smsg->{ts}, 1285977600, 'timestamp from git committer time');
+	my $res = $ibx->search->query("m:$smsg->{mid}");
+	is(scalar @$res, 1, 'got one result for m:');
+	is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp');
+	$res = $ibx->search->query('d:19931002..19931002');
+	is(scalar @$res, 1, 'got one result for d:');
+	is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp');
+}
+SKIP: {
+	require_git(2.6, 1) or skip('git 2.6+ required for v2', 10);
+	my $v2dir = "$tmpdir/v2";
+	run_script(['-convert', $v1dir, $v2dir]) or die 'v2 conversion failed';
+
+	my $check_v2 = sub {
+		my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir,
+				address => $addr});
+		my $v2smsg = $ibx->over->get_art(1);
+		is($v2smsg->{ds}, $smsg->{ds},
+			'v2 datestamp from git author time');
+		is($v2smsg->{ts}, $smsg->{ts},
+			'v2 timestamp from git committer time');
+		my $res = $ibx->search->query("m:$smsg->{mid}");
+		is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp');
+		$res = $ibx->search->query('d:19931002..19931002');
+		is(scalar @$res, 1, 'got one result for d:');
+		is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp');
+	};
+	$check_v2->();
+	remove_tree($v2dir);
+
+	# test non-parallelized conversion
+	run_script(['-convert', '-j0', $v1dir, $v2dir]) or
+		die 'v2 conversion failed';
+	$check_v2->();
+}
+
+done_testing;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/9] v2writable: preserve timestamps from import
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
  2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
                       ` (6 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

While v2 indexing is triggered immediately after writing the
commit to the git repository, there may be a gap between when
PublicInbox::Import generates a timestamp and when
PublicInbox::SearchIdx sees the message.  So follow the mirror
indexing behavior and take the to-be-indexed (time|date)stamps
directly from the git commit.
---
 lib/PublicInbox/Import.pm     | 12 ++++++++----
 lib/PublicInbox/V2Writable.pm |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 68dc0c7e..3853ff2b 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -274,8 +274,8 @@ sub git_timestamp {
 	"$ts $zone";
 }
 
-sub extract_cmt_info ($) {
-	my ($mime) = @_;
+sub extract_cmt_info ($;$) {
+	my ($mime, $v2w) = @_;
 
 	my $sender = '';
 	my $from = $mime->header('From');
@@ -325,6 +325,10 @@ sub extract_cmt_info ($) {
 	utf8::encode($subject);
 	my $at = git_timestamp(my @at = msg_datestamp($hdr));
 	my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
+	if ($v2w) { # set fallbacks in case message had no date
+		$v2w->{autime} = $at[0];
+		$v2w->{cotime} = $ct[0];
+	}
 	($name, $email, $at, $ct, $subject);
 }
 
@@ -370,9 +374,9 @@ sub clean_tree_v2 ($$$) {
 # returns undef on duplicate
 # returns the :MARK of the most recent commit
 sub add {
-	my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
+	my ($self, $mime, $check_cb, $v2w) = @_; # mime = Email::MIME
 
-	my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+	my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $v2w);
 	my $path_type = $self->{path_type};
 	my $path;
 	if ($path_type eq '2/38') {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index f1842843..d39a6f89 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -176,7 +176,7 @@ sub _add {
 	defined $num or return; # duplicate
 	defined $mid0 or die "BUG: $mid0 undefined\n";
 	my $im = $self->importer;
-	my $cmt = $im->add($mime);
+	my $cmt = $im->add($mime, undef, $self); # sets $self->{(au|co)time}
 	$cmt = $im->get_mark($cmt);
 	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
  2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
  2020-03-20  8:18     ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
                       ` (5 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

Since the introduction of over.sqlite3, SearchMsg is not tied to
our search functionality in any way, so stop confusing ourselves
and future hackers by just calling it "PublicInbox::Smsg".

Add a missing "use" in ExtMsg while we're at it.
---
 Documentation/mknews.perl                   |  4 ++--
 Documentation/technical/data_structures.txt |  4 ++--
 MANIFEST                                    |  2 +-
 lib/PublicInbox/ExtMsg.pm                   |  3 ++-
 lib/PublicInbox/Feed.pm                     |  4 ++--
 lib/PublicInbox/Inbox.pm                    |  2 +-
 lib/PublicInbox/Mbox.pm                     |  4 ++--
 lib/PublicInbox/NNTP.pm                     | 14 +++++++-------
 lib/PublicInbox/Over.pm                     |  8 ++++----
 lib/PublicInbox/OverIdx.pm                  |  4 ++--
 lib/PublicInbox/Search.pm                   |  8 ++++----
 lib/PublicInbox/SearchIdx.pm                |  4 ++--
 lib/PublicInbox/SearchView.pm               |  8 ++++----
 lib/PublicInbox/{SearchMsg.pm => Smsg.pm}   | 12 +++++++-----
 lib/PublicInbox/SolverGit.pm                |  2 +-
 lib/PublicInbox/View.pm                     |  2 +-
 t/thread-cycle.t                            |  2 +-
 17 files changed, 45 insertions(+), 42 deletions(-)
 rename lib/PublicInbox/{SearchMsg.pm => Smsg.pm} (93%)

diff --git a/Documentation/mknews.perl b/Documentation/mknews.perl
index d803ca77..adb83832 100755
--- a/Documentation/mknews.perl
+++ b/Documentation/mknews.perl
@@ -103,7 +103,7 @@ sub mime2txt {
 
 sub mime2html {
 	my ($out, $mime, $ctx) = @_;
-	my $smsg = bless { mime => $mime }, 'PublicInbox::SearchMsg';
+	my $smsg = bless { mime => $mime }, 'PublicInbox::Smsg';
 	print $out PublicInbox::View::index_entry($smsg, $ctx, 1) or die;
 }
 
@@ -147,7 +147,7 @@ EOF
 
 sub mime2atom  {
 	my ($out, $astream, $mime, $ctx) = @_;
-	my $smsg = bless { mime => $mime }, 'PublicInbox::SearchMsg';
+	my $smsg = bless { mime => $mime }, 'PublicInbox::Smsg';
 	if (defined(my $str = $astream->feed_entry($smsg))) {
 		print $out $str or die;
 	}
diff --git a/Documentation/technical/data_structures.txt b/Documentation/technical/data_structures.txt
index 4de83a77..08dfc7ac 100644
--- a/Documentation/technical/data_structures.txt
+++ b/Documentation/technical/data_structures.txt
@@ -47,7 +47,7 @@ Per-message classes
   Our PublicInbox::V2Writable class may have two objects of this
   type in memory at-a-time for deduplication.
 
-* PublicInbox::SearchMsg - small message skeleton
+* PublicInbox::Smsg - small message skeleton
   Used by: PublicInbox::{NNTP,WWW,SearchIdx}
   Common abbreviation: $smsg
 
@@ -69,7 +69,7 @@ Per-message classes
   JWZ's algorithm: <https://www.jwz.org/doc/threading.html>.
   This holds a $smsg and is only used for message threading.
   This wrapper class may go away in the future and handled
-  directly by PublicInbox::SearchMsg to save memory.
+  directly by PublicInbox::Smsg to save memory.
 
   As with $smsg objects, there may be hundreds or thousands
   of these objects in memory at-a-time.
diff --git a/MANIFEST b/MANIFEST
index f9230cd3..ec80c90f 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -145,10 +145,10 @@ lib/PublicInbox/SaPlugin/ListMirror.pod
 lib/PublicInbox/Search.pm
 lib/PublicInbox/SearchIdx.pm
 lib/PublicInbox/SearchIdxShard.pm
-lib/PublicInbox/SearchMsg.pm
 lib/PublicInbox/SearchThread.pm
 lib/PublicInbox/SearchView.pm
 lib/PublicInbox/Sigfd.pm
+lib/PublicInbox/Smsg.pm
 lib/PublicInbox/SolverGit.pm
 lib/PublicInbox/Spamcheck.pm
 lib/PublicInbox/Spamcheck/Spamc.pm
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 44884ad2..4d753a7e 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -10,6 +10,7 @@ use strict;
 use warnings;
 use PublicInbox::Hval qw(ascii_html prurl mid_href);
 use PublicInbox::WwwStream;
+use PublicInbox::Smsg;
 our $MIN_PARTIAL_LEN = 16;
 
 # TODO: user-configurable
@@ -29,7 +30,7 @@ our @EXT_URL = map { ascii_html($_) } (
 sub PARTIAL_MAX () { 100 }
 
 sub mids_from_mset { # Search::retry_reopen callback
-	[ map { PublicInbox::SearchMsg::from_mitem($_)->mid } $_[0]->items ];
+	[ map { PublicInbox::Smsg::from_mitem($_)->mid } $_[0]->items ];
 }
 
 sub search_partial ($$) {
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index ae69af36..07347c63 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -8,7 +8,7 @@ use warnings;
 use PublicInbox::MIME;
 use PublicInbox::View;
 use PublicInbox::WwwAtomStream;
-use PublicInbox::SearchMsg; # this loads w/o Search::Xapian
+use PublicInbox::Smsg; # this loads w/o Search::Xapian
 
 sub generate_i {
 	my ($ctx) = @_;
@@ -142,7 +142,7 @@ sub recent_msgs {
 	}
 
 	$ctx->{next_page} = "r=$last_commit" if $last_commit;
-	[ map { bless {blob => $_ }, 'PublicInbox::SearchMsg' } @oids ];
+	[ map { bless {blob => $_ }, 'PublicInbox::Smsg' } @oids ];
 }
 
 1;
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 06ce9ebf..4f27d1bb 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -346,7 +346,7 @@ sub smsg_by_mid ($$) {
 	# favor the Message-ID we used for the NNTP article number:
 	defined(my $num = mid2num($self, $mid)) or return;
 	my $smsg = $over->get_art($num) or return;
-	PublicInbox::SearchMsg::psgi_cull($smsg);
+	PublicInbox::Smsg::psgi_cull($smsg);
 }
 
 sub msg_by_mid ($$;$) {
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 5693d30b..4f632d63 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -12,7 +12,7 @@ use strict;
 use warnings;
 use PublicInbox::MID qw/mid_escape/;
 use PublicInbox::Hval qw/to_filename/;
-use PublicInbox::SearchMsg;
+use PublicInbox::Smsg;
 use Email::Simple;
 use Email::MIME::Encode;
 
@@ -204,7 +204,7 @@ sub results_cb {
 	my $srch = $ctx->{srch};
 	while (1) {
 		while (my $mi = (($mset->items)[$ctx->{iter}++])) {
-			my $smsg = PublicInbox::SearchMsg::from_mitem($mi,
+			my $smsg = PublicInbox::Smsg::from_mitem($mi,
 								$srch) or next;
 			return $smsg;
 		}
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 84335d30..277657e6 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -714,11 +714,11 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin
 sub over_header_for {
 	my ($over, $num, $field) = @_;
 	my $smsg = $over->get_art($num) or return;
-	return PublicInbox::SearchMsg::date($smsg) if $field eq 'date';
+	return PublicInbox::Smsg::date($smsg) if $field eq 'date';
 	$smsg->{$field};
 }
 
-sub searchmsg_range_i {
+sub smsg_range_i {
 	my ($self, $beg, $end, $field) = @_;
 	my $over = $self->{ng}->over;
 	my $msgs = $over->query_xover($$beg, $end);
@@ -732,7 +732,7 @@ sub searchmsg_range_i {
 	$$beg = $msgs->[-1]->{num} + 1;
 }
 
-sub hdr_searchmsg ($$$$) {
+sub hdr_smsg ($$$$) {
 	my ($self, $xhdr, $field, $range) = @_;
 	if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID
 		my ($ng, $n) = mid_lookup($self, $1);
@@ -744,7 +744,7 @@ sub hdr_searchmsg ($$$$) {
 		my $r = get_range($self, $range);
 		return $r unless ref $r;
 		more($self, $xhdr ? r221 : r225);
-		long_response($self, \&searchmsg_range_i, @$r, $field);
+		long_response($self, \&smsg_range_i, @$r, $field);
 	}
 }
 
@@ -757,9 +757,9 @@ sub do_hdr ($$$;$) {
 		hdr_xref($self, $xhdr, $range);
 	} elsif ($sub =~ /\A(?:subject|references|date|from|to|cc|
 				bytes|lines)\z/x) {
-		hdr_searchmsg($self, $xhdr, $sub, $range);
+		hdr_smsg($self, $xhdr, $sub, $range);
 	} elsif ($sub =~ /\A:(bytes|lines)\z/) {
-		hdr_searchmsg($self, $xhdr, $1, $range);
+		hdr_smsg($self, $xhdr, $1, $range);
 	} else {
 		$xhdr ? (r221 . "\r\n.") : "503 HDR not permitted on $header";
 	}
@@ -831,7 +831,7 @@ sub over_line ($$$$) {
 	my $s = join("\t", $num,
 		$smsg->{subject},
 		$smsg->{from},
-		PublicInbox::SearchMsg::date($smsg),
+		PublicInbox::Smsg::date($smsg),
 		"<$smsg->{mid}>",
 		$smsg->{references},
 		$smsg->{bytes},
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index b9b02f96..286fb7f6 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -9,7 +9,7 @@ use strict;
 use warnings;
 use DBI;
 use DBD::SQLite;
-use PublicInbox::SearchMsg;
+use PublicInbox::Smsg;
 use Compress::Zlib qw(uncompress);
 use constant DEFAULT_LIMIT => 1000;
 
@@ -41,14 +41,14 @@ sub connect { $_[0]->{dbh} ||= $_[0]->dbh_new }
 
 sub load_from_row ($;$) {
 	my ($smsg, $cull) = @_;
-	bless $smsg, 'PublicInbox::SearchMsg';
+	bless $smsg, 'PublicInbox::Smsg';
 	if (defined(my $data = delete $smsg->{ddd})) {
 		$data = uncompress($data);
 		utf8::decode($data);
-		PublicInbox::SearchMsg::load_from_data($smsg, $data);
+		PublicInbox::Smsg::load_from_data($smsg, $data);
 
 		# saves over 600K for 1000+ message threads
-		PublicInbox::SearchMsg::psgi_cull($smsg) if $cull;
+		PublicInbox::Smsg::psgi_cull($smsg) if $cull;
 	}
 	$smsg
 }
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 9ee6d613..fd521bdd 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -14,7 +14,7 @@ use base qw(PublicInbox::Over);
 use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
 use PublicInbox::MID qw/id_compress mids_for_index references/;
-use PublicInbox::SearchMsg qw(subject_normalized);
+use PublicInbox::Smsg qw(subject_normalized);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
@@ -255,7 +255,7 @@ sub add_overview {
 		bytes => $bytes,
 		lines => $lines,
 		blob => $oid,
-	}, 'PublicInbox::SearchMsg';
+	}, 'PublicInbox::Smsg';
 	my $hdr = $mime->header_obj;
 	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $mid0, $mids);
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7f901125..9a394404 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -12,7 +12,7 @@ use constant TS => 0;  # Received: header in Unix time
 use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI
 use constant DT => 2; # Date: YYYYMMDDHHMMSS
 
-use PublicInbox::SearchMsg;
+use PublicInbox::Smsg;
 use PublicInbox::Over;
 my $QP_FLAGS;
 our %X = map { $_ => 0 } qw(BoolWeight Database Enquire
@@ -36,8 +36,8 @@ sub load_xapian () {
 		$ENQ_ASCENDING = $x eq 'Xapian' ?
 				1 : Search::Xapian::ENQ_ASCENDING();
 
-		# for SearchMsg:
-		*PublicInbox::SearchMsg::sortable_unserialise =
+		# for Smsg:
+		*PublicInbox::Smsg::sortable_unserialise =
 						$Xap.'::sortable_unserialise';
 		# n.b. FLAG_PURE_NOT is expensive not suitable for a public
 		# website as it could become a denial-of-service vector
@@ -279,7 +279,7 @@ sub _enquire_once { # retry_reopen callback
 	my $limit = $opts->{limit} || 50;
 	my $mset = $enquire->get_mset($offset, $limit);
 	return $mset if $opts->{mset};
-	my @msgs = map { PublicInbox::SearchMsg::from_mitem($_) } $mset->items;
+	my @msgs = map { PublicInbox::Smsg::from_mitem($_) } $mset->items;
 	return \@msgs unless wantarray;
 
 	($mset->get_matches_estimated, \@msgs)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 261deb84..6e6c6424 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -308,7 +308,7 @@ sub index_xapian { # msg_iter callback
 
 sub add_xapian ($$$$$$) {
 	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
-	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $smsg = PublicInbox::Smsg->new($mime);
 	my $hdr = $mime->header_obj;
 	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
 	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
@@ -465,7 +465,7 @@ sub remove_by_oid {
 	for (; $head != $tail; $head++) {
 		my $docid = $head->get_docid;
 		my $doc = $db->get_document($docid);
-		my $smsg = PublicInbox::SearchMsg->wrap($mid);
+		my $smsg = PublicInbox::Smsg->wrap($mid);
 		$smsg->load_expand($doc);
 		if ($smsg->{blob} eq $oid) {
 			push(@delete, $docid);
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 287b0a28..4fbf59ef 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -6,7 +6,7 @@ package PublicInbox::SearchView;
 use strict;
 use warnings;
 use URI::Escape qw(uri_unescape uri_escape);
-use PublicInbox::SearchMsg;
+use PublicInbox::Smsg;
 use PublicInbox::Hval qw(ascii_html obfuscate_addrs mid_href);
 use PublicInbox::View;
 use PublicInbox::WwwAtomStream;
@@ -100,7 +100,7 @@ sub mset_summary {
 	foreach my $m ($mset->items) {
 		my $rank = sprintf("%${pad}d", $m->get_rank + 1);
 		my $pct = get_pct($m);
-		my $smsg = PublicInbox::SearchMsg::from_mitem($m, $srch);
+		my $smsg = PublicInbox::Smsg::from_mitem($m, $srch);
 		unless ($smsg) {
 			eval {
 				$m = "$m ".$m->get_docid . " expired\n";
@@ -260,7 +260,7 @@ sub load_msgs {
 	my ($mset) = @_;
 	[ map {
 		my $mi = $_;
-		my $smsg = PublicInbox::SearchMsg::from_mitem($mi);
+		my $smsg = PublicInbox::Smsg::from_mitem($mi);
 		$smsg->{pct} = get_pct($mi);
 		$smsg;
 	} ($mset->items) ]
@@ -338,7 +338,7 @@ sub adump_i {
 	my ($ctx) = @_;
 	while (my $mi = shift @{$ctx->{items}}) {
 		my $smsg = eval {
-			PublicInbox::SearchMsg::from_mitem($mi, $ctx->{srch});
+			PublicInbox::Smsg::from_mitem($mi, $ctx->{srch});
 		} or next;
 		$ctx->{-inbox}->smsg_mime($smsg) and return $smsg;
 	}
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/Smsg.pm
similarity index 93%
rename from lib/PublicInbox/SearchMsg.pm
rename to lib/PublicInbox/Smsg.pm
index 84fe4802..7a47703a 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -1,11 +1,13 @@
 # Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# based on notmuch, but with no concept of folders, files or flags
 #
-# Wraps a document inside our Xapian search index.
-# There may be many of these objects loaded in memory at once
-# for large threads in our WWW UI.
-package PublicInbox::SearchMsg;
+# A small/skeleton/slim representation of a message.
+
+# This used to be "SearchMsg", but we split out overview
+# indexing into over.sqlite3 so it's not just "search".  There
+# may be many of these objects loaded in memory at once for
+# large threads in our WWW UI and the NNTP range responses.
+package PublicInbox::Smsg;
 use strict;
 use warnings;
 use base qw(Exporter);
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index 34669dbe..c3a6c9a4 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -42,7 +42,7 @@ my $MAX_PATCH = 9999;
 #	hdr_lines => string of various header lines for mode information
 #	mode_a => original mode of oid_a (string, not integer),
 #	ibx => PublicInbox::Inbox object containing the diff
-#	smsg => PublicInbox::SearchMsg object containing diff
+#	smsg => PublicInbox::Smsg object containing diff
 #	path_a => pre-image path
 #	path_b => post-image path
 #	n => numeric path of the patch (relative to worktree)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 91443f55..5baaffaf 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -18,7 +18,7 @@ use PublicInbox::Reply;
 use PublicInbox::ViewDiff qw(flush_diff);
 use POSIX qw(strftime);
 use Time::Local qw(timegm);
-use PublicInbox::SearchMsg qw(subject_normalized);
+use PublicInbox::Smsg qw(subject_normalized);
 use constant COLS => 72;
 use constant INDENT => '  ';
 use constant TCHILD => '` ';
diff --git a/t/thread-cycle.t b/t/thread-cycle.t
index 21309bd2..e9ea0a27 100644
--- a/t/thread-cycle.t
+++ b/t/thread-cycle.t
@@ -24,7 +24,7 @@ sub make_objs {
 			'References' => $msg->{references},
 		]);
 		push @simples, $simple;
-		bless $msg, 'PublicInbox::SearchMsg'
+		bless $msg, 'PublicInbox::Smsg'
 	} @_;
 	(\@simples, \@msgs);
 }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 4/9] smsg: to_doc_data: use existing fields
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (2 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
                       ` (4 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

No need to pass extra parameters to this method, since
smsg has universal meanings for {blob} and {mid}.
---
 lib/PublicInbox/OverIdx.pm   | 2 +-
 lib/PublicInbox/SearchIdx.pm | 4 +++-
 lib/PublicInbox/Smsg.pm      | 7 +++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index fd521bdd..9640f9d1 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -265,7 +265,7 @@ sub add_overview {
 		$xpath = subject_path($subj);
 		$xpath = id_compress($xpath);
 	}
-	my $dd = $smsg->to_doc_data($oid, $mid0);
+	my $dd = $smsg->to_doc_data;
 	utf8::encode($dd);
 	$dd = compress($dd);
 	my $ds = msg_timestamp($hdr, $times->{autime});
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 6e6c6424..c0578809 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -340,7 +340,9 @@ sub add_xapian ($$$$$$) {
 	}
 	$smsg->{to} = $smsg->{cc} = '';
 	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
-	my $data = $smsg->to_doc_data($oid, $mid0);
+	$smsg->{blob} = $oid;
+	$smsg->{mid} = $mid0;
+	my $data = $smsg->to_doc_data;
 	$doc->set_data($data);
 	if (my $altid = $self->{-altid}) {
 		foreach my $alt (@$altid) {
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 7a47703a..5eb0723f 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -34,16 +34,15 @@ sub get_val ($$) {
 }
 
 sub to_doc_data {
-	my ($self, $oid, $mid0) = @_;
-	$oid = '' unless defined $oid;
+	my ($self) = @_;
 	join("\n",
 		$self->subject,
 		$self->from,
 		$self->references,
 		$self->to,
 		$self->cc,
-		$oid,
-		$mid0,
+		$self->{blob},
+		$self->{mid},
 		$self->{bytes} // '',
 		$self->{lines} // ''
 	);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 5/9] overidx: parse_references: less error-prone args
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (3 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
                       ` (3 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

Favor `$smsg->{mid}' instead of `$mid0' to reduce parameters
down-the-line, but favor passing the Email::MIME::Header object
around instead of relying on the bloat-prone `$smsg->{mime}'
and calling ->header_obj on it.
---
 lib/PublicInbox/OverIdx.pm   | 8 +++-----
 lib/PublicInbox/SearchIdx.pm | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 9640f9d1..f49dfa00 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -216,15 +216,13 @@ sub link_refs {
 }
 
 sub parse_references ($$$) {
-	my ($smsg, $mid0, $mids) = @_;
-	my $mime = $smsg->{mime};
-	my $hdr = $mime->header_obj;
+	my ($smsg, $hdr, $mids) = @_;
 	my $refs = references($hdr);
 	push(@$refs, @$mids) if scalar(@$mids) > 1;
 	return $refs if scalar(@$refs) == 0;
 
 	# prevent circular references here:
-	my %seen = ( $mid0 => 1 );
+	my %seen = ( $smsg->{mid} => 1 );
 	my @keep;
 	foreach my $ref (@$refs) {
 		if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) {
@@ -258,7 +256,7 @@ sub add_overview {
 	}, 'PublicInbox::Smsg';
 	my $hdr = $mime->header_obj;
 	my $mids = mids_for_index($hdr);
-	my $refs = parse_references($smsg, $mid0, $mids);
+	my $refs = parse_references($smsg, $hdr, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
 	if ($subj ne '') {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c0578809..32be9c3f 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -339,9 +339,9 @@ sub add_xapian ($$$$$$) {
 		}
 	}
 	$smsg->{to} = $smsg->{cc} = '';
-	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
 	$smsg->{blob} = $oid;
 	$smsg->{mid} = $mid0;
+	PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
 	my $data = $smsg->to_doc_data;
 	$doc->set_data($data);
 	if (my $altid = $self->{-altid}) {

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 6/9] *idx: pass $smsg in more places instead of many args
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (4 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
                       ` (2 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

We can pass blessed PublicInbox::Smsg objects to internal
indexing APIs instead of having long parameter lists in some
places.  The end goal is to avoid parsing redundant information
each step of the way and hopefully make things more
understandable.
---
 lib/PublicInbox/OverIdx.pm        | 14 +++-------
 lib/PublicInbox/SearchIdx.pm      | 45 ++++++++++++++++++-------------
 lib/PublicInbox/SearchIdxShard.pm | 16 +++++++++--
 lib/PublicInbox/V2Writable.pm     |  8 +++++-
 t/search-thr-index.t              | 17 ++++++++++--
 5 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index f49dfa00..2d71956d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -245,15 +245,9 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_;
-	my $lines = $mime->body_raw =~ tr!\n!\n!;
-	my $smsg = bless {
-		mime => $mime,
-		mid => $mid0,
-		bytes => $bytes,
-		lines => $lines,
-		blob => $oid,
-	}, 'PublicInbox::Smsg';
+	my ($self, $mime, $smsg, $times) = @_;
+	$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+	$smsg->{mime} = $mime; # XXX temporary?
 	my $hdr = $mime->header_obj;
 	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $hdr, $mids);
@@ -268,7 +262,7 @@ sub add_overview {
 	$dd = compress($dd);
 	my $ds = msg_timestamp($hdr, $times->{autime});
 	my $ts = msg_datestamp($hdr, $times->{cotime});
-	my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ];
+	my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ];
 	add_over($self, $values);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 32be9c3f..5ca819c3 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -306,9 +306,9 @@ sub index_xapian { # msg_iter callback
 	index_body($self, $_, /\A>/ ? 0 : $doc) for @sections;
 }
 
-sub add_xapian ($$$$$$) {
-	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
-	my $smsg = PublicInbox::Smsg->new($mime);
+sub add_xapian ($$$$) {
+	my ($self, $mime, $smsg, $mids) = @_;
+	$smsg->{mime} = $mime; # XXX dangerous
 	my $hdr = $mime->header_obj;
 	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
 	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
@@ -338,9 +338,7 @@ sub add_xapian ($$$$$$) {
 			index_text($self, join(' ', @long), 1, 'XM');
 		}
 	}
-	$smsg->{to} = $smsg->{cc} = '';
-	$smsg->{blob} = $oid;
-	$smsg->{mid} = $mid0;
+	$smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
 	PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
 	my $data = $smsg->to_doc_data;
 	$doc->set_data($data);
@@ -355,7 +353,7 @@ sub add_xapian ($$$$$$) {
 		}
 	}
 	$doc->add_boolean_term('Q' . $_) foreach @$mids;
-	$self->{xdb}->replace_document($num, $doc);
+	$self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
 sub _msgmap_init ($) {
@@ -369,20 +367,25 @@ sub _msgmap_init ($) {
 
 sub add_message {
 	# mime = Email::MIME object
-	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $mids = mids_for_index($mime->header_obj);
-	$mid0 //= $mids->[0]; # v1 compatibility
-	$num //= do { # v1
+	$smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat
+	$smsg->{mid} //= $mids->[0]; # v1 compatibility
+	$smsg->{num} //= do { # v1
 		_msgmap_init($self);
 		index_mm($self, $mime);
 	};
 	eval {
-		if (need_xapian($self)) {
-			add_xapian($self, $mime, $num, $oid, $mids, $mid0);
+		# order matters, overview stores every possible piece of
+		# data in doc_data (deflated).  Xapian only stores a subset
+		# of the fields which exist in over.sqlite3.  We may stop
+		# storing doc_data in Xapian sometime after we get multi-inbox
+		# search working.
+		if (my $over = $self->{over}) { # v1 only
+			$over->add_overview($mime, $smsg, $self);
 		}
-		if (my $over = $self->{over}) {
-			$over->add_overview($mime, $bytes, $num, $oid, $mid0,
-						$self);
+		if (need_xapian($self)) {
+			add_xapian($self, $mime, $smsg, $mids);
 		}
 	};
 
@@ -390,7 +393,7 @@ sub add_message {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$num;
+	$smsg->{num};
 }
 
 # returns begin and end PostingIterator
@@ -530,9 +533,10 @@ sub unindex_mm {
 }
 
 sub index_both {
-	my ($self, $mime, $bytes, $blob) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $num = index_mm($self, $mime);
-	add_message($self, $mime, $bytes, $num, $blob);
+	$smsg->{num} = $num;
+	add_message($self, $mime, $smsg);
 }
 
 sub unindex_both {
@@ -595,8 +599,11 @@ sub read_log {
 				next;
 			}
 			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+			my $smsg = bless {}, 'PublicInbox::Smsg';
 			batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
-			$add_cb->($self, $mime, $bytes, $blob);
+			$smsg->{blob} = $blob;
+			$smsg->{bytes} = $bytes;
+			$add_cb->($self, $mime, $smsg);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
 			$D{$blob} = 1;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 74c624a4..d29e6090 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -76,7 +76,13 @@ sub shard_worker_loop ($$$$$) {
 			$artnum = int($artnum);
 			$self->{autime} = $autime;
 			$self->{cotime} = $cotime;
-			$self->add_message($mime, $n, $artnum, $oid, $mid0);
+			my $smsg = bless {
+				bytes => $len,
+				num => $artnum,
+				blob => $oid,
+				mid => $mid0,
+			}, 'PublicInbox::Smsg';
+			$self->add_message($mime, $smsg);
 		}
 	}
 	$self->worker_done;
@@ -95,7 +101,13 @@ sub index_raw {
 		$self->begin_txn_lazy;
 		$self->{autime} = $at;
 		$self->{cotime} = $ct;
-		$self->add_message($mime, $bytes, $artnum, $oid, $mid0);
+		my $smsg = bless {
+			bytes => $bytes,
+			num => $artnum,
+			blob => $oid,
+			mid => $mid0,
+		}, 'PublicInbox::Smsg';
+		$self->add_message($mime, $smsg);
 	}
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index d39a6f89..34dd139b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -150,7 +150,13 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$$$$) {
 	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self);
+	my $smsg = bless {
+		bytes => $len,
+		num => $num,
+		blob => $oid,
+		mid => $mid0,
+	}, 'PublicInbox::Smsg';
+	$self->{over}->add_overview($mime, $smsg, $self);
 	my $idx = idx_shard($self, $num % $self->{shards});
 	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
 	my $n = $self->{transact_bytes} += $len;
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
index 6a5fd919..f073304a 100644
--- a/t/search-thr-index.t
+++ b/t/search-thr-index.t
@@ -9,6 +9,7 @@ use PublicInbox::MID qw(mids);
 use Email::MIME;
 require_mods(qw(DBD::SQLite Search::Xapian));
 require PublicInbox::SearchIdx;
+require PublicInbox::Smsg;
 require PublicInbox::Inbox;
 my ($tmpdir, $for_destroy) = tmpdir();
 my $git_dir = "$tmpdir/a.git";
@@ -45,7 +46,13 @@ foreach (reverse split(/\n\n/, $data)) {
 	$mime->header_set('To' => 'git@vger.kernel.org');
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => ++$num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	push @mids, $mid;
 	ok($doc_id, 'message added: '. $mid);
 }
@@ -86,7 +93,13 @@ SELECT tid FROM over WHERE num = ? LIMIT 1
 
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, $num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => $num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	ok($doc_id, 'message reindexed'. $mid);
 	is($doc_id, $num, "article number unchanged: $num");
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 7/9] v2: pass smsg in more places
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (5 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
  2020-03-20  8:18     ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

We can pass fewer order-dependent args to V2Writable::do_idx and
SearchIdxShard::index_raw by passing the smsg object, instead.
---
 lib/PublicInbox/SearchIdxShard.pm | 27 ++++++----------
 lib/PublicInbox/V2Writable.pm     | 52 +++++++++++++++++++------------
 2 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index d29e6090..21e81b16 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -67,20 +67,19 @@ sub shard_worker_loop ($$$$$) {
 			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
-			my ($len, $artnum, $oid, $mid0, $autime, $cotime) =
+			my ($bytes, $num, $blob, $mid, $autime, $cotime) =
 							split(/ /, $line);
 			$self->begin_txn_lazy;
-			my $n = read($r, my $msg, $len) or die "read: $!\n";
-			$n == $len or die "short read: $n != $len\n";
+			my $n = read($r, my $msg, $bytes) or die "read: $!\n";
+			$n == $bytes or die "short read: $n != $bytes\n";
 			my $mime = PublicInbox::MIME->new(\$msg);
-			$artnum = int($artnum);
 			$self->{autime} = $autime;
 			$self->{cotime} = $cotime;
 			my $smsg = bless {
-				bytes => $len,
-				num => $artnum,
-				blob => $oid,
-				mid => $mid0,
+				bytes => $bytes,
+				num => $num + 0,
+				blob => $blob,
+				mid => $mid,
 			}, 'PublicInbox::Smsg';
 			$self->add_message($mime, $smsg);
 		}
@@ -90,23 +89,17 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime, $times) = @_;
+	my ($self, $msgref, $mime, $smsg, $times) = @_;
 	my $at = $times->{autime} // time;
 	my $ct = $times->{cotime} // time;
 	if (my $w = $self->{w}) {
-		print $w "$bytes $artnum $oid $mid0 $at $ct\n", $$msgref or
-			die "failed to write shard $!\n";
+		print $w join(' ', @$smsg{qw(bytes num blob mid)}, $at, $ct),
+			"\n", $$msgref or die "failed to write shard $!\n";
 	} else {
 		$$msgref = undef;
 		$self->begin_txn_lazy;
 		$self->{autime} = $at;
 		$self->{cotime} = $ct;
-		my $smsg = bless {
-			bytes => $bytes,
-			num => $artnum,
-			blob => $oid,
-			mid => $mid0,
-		}, 'PublicInbox::Smsg';
 		$self->add_message($mime, $smsg);
 	}
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 34dd139b..b5332da4 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -148,18 +148,12 @@ sub add {
 }
 
 # indexes a message, returns true if checkpointing is needed
-sub do_idx ($$$$$$$) {
-	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	my $smsg = bless {
-		bytes => $len,
-		num => $num,
-		blob => $oid,
-		mid => $mid0,
-	}, 'PublicInbox::Smsg';
+sub do_idx ($$$$) {
+	my ($self, $msgref, $mime, $smsg) = @_;
 	$self->{over}->add_overview($mime, $smsg, $self);
-	my $idx = idx_shard($self, $num % $self->{shards});
-	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
-	my $n = $self->{transact_bytes} += $len;
+	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
+	$idx->index_raw($msgref, $mime, $smsg, $self);
+	my $n = $self->{transact_bytes} += $smsg->{bytes};
 	$n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
 }
 
@@ -186,8 +180,10 @@ sub _add {
 	$cmt = $im->get_mark($cmt);
 	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
 
-	my ($oid, $len, $msgref) = @{$im->{last_object}};
-	if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) {
+	my $msgref;
+	my $smsg = bless { mid => $mid0, num => $num }, 'PublicInbox::Smsg';
+	($smsg->{blob}, $smsg->{bytes}, $msgref) = @{$im->{last_object}};
+	if (do_idx($self, $msgref, $mime, $smsg)) {
 		$self->checkpoint;
 	}
 
@@ -557,17 +553,21 @@ W: $list
 	}
 
 	# make sure we really got the OID:
-	my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid);
-	$oid eq $expect_oid or die "BUG: $expect_oid not found after replace";
+	my ($blob, $type, $bytes) = $self->{-inbox}->git->check($expect_oid);
+	$blob eq $expect_oid or die "BUG: $expect_oid not found after replace";
 
 	# don't leak FDs to Xapian:
 	$self->{-inbox}->git->cleanup;
 
 	# reindex modified messages:
 	for my $smsg (@$need_reindex) {
-		my $num = $smsg->{num};
-		my $mid0 = $smsg->{mid};
-		do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0);
+		my $new_smsg = bless {
+			blob => $blob,
+			bytes => $bytes,
+			num => $smsg->{num},
+			mid => $smsg->{mid},
+		}, 'PublicInbox::Smsg';
+		do_idx($self, \$raw, $new_mime, $new_smsg);
 	}
 	$rewritten->{rewrites};
 }
@@ -955,7 +955,13 @@ sub reindex_oid_m ($$$$;$) {
 		}
 	}
 	$sync->{nr}++;
-	if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) {
+	my $smsg = bless {
+		bytes => $len,
+		num => $num,
+		blob => $oid,
+		mid => $mid0,
+	}, 'PublicInbox::Smsg';
+	if (do_idx($self, $msgref, $mime, $smsg)) {
 		reindex_checkpoint($self, $sync, $git);
 	}
 }
@@ -1051,7 +1057,13 @@ sub reindex_oid ($$$$) {
 	$sync->{mm_tmp}->mid_delete($mid0) or
 		die "failed to delete <$mid0> for article #$num\n";
 	$sync->{nr}++;
-	if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) {
+	my $smsg = bless {
+		bytes => $len,
+		num => $num,
+		blob => $oid,
+		mid => $mid0,
+	}, 'PublicInbox::Smsg';
+	if (do_idx($self, $msgref, $mime, $smsg)) {
 		reindex_checkpoint($self, $sync, $git);
 	}
 }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 8/9] *idx: pass smsg in even more places
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (6 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  2020-03-20  8:18     ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

We can finally get rid of the awkward, ad-hoc use of V2Writable,
SearchIdx, and OverIdx args for passing {cotime} and {autime}
between classes.

We'll still use those git time fields internally within
V2Writable and SearchIdx for (re)indexing, but that's not
worth avoiding as a fallback.
---
 lib/PublicInbox/Import.pm         | 19 ++++++++++---------
 lib/PublicInbox/OverIdx.pm        |  8 ++------
 lib/PublicInbox/SearchIdx.pm      | 16 ++++++++++------
 lib/PublicInbox/SearchIdxShard.pm | 14 +++++---------
 lib/PublicInbox/V2Writable.pm     | 15 ++++++++-------
 t/import.t                        | 14 ++++++--------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 3853ff2b..c72c1e92 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -275,7 +275,7 @@ sub git_timestamp {
 }
 
 sub extract_cmt_info ($;$) {
-	my ($mime, $v2w) = @_;
+	my ($mime, $smsg) = @_;
 
 	my $sender = '';
 	my $from = $mime->header('From');
@@ -325,9 +325,9 @@ sub extract_cmt_info ($;$) {
 	utf8::encode($subject);
 	my $at = git_timestamp(my @at = msg_datestamp($hdr));
 	my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
-	if ($v2w) { # set fallbacks in case message had no date
-		$v2w->{autime} = $at[0];
-		$v2w->{cotime} = $ct[0];
+	if ($smsg) {
+		$smsg->{ds} = $at[0];
+		$smsg->{ts} = $ct[0];
 	}
 	($name, $email, $at, $ct, $subject);
 }
@@ -374,9 +374,9 @@ sub clean_tree_v2 ($$$) {
 # returns undef on duplicate
 # returns the :MARK of the most recent commit
 sub add {
-	my ($self, $mime, $check_cb, $v2w) = @_; # mime = Email::MIME
+	my ($self, $mime, $check_cb, $smsg) = @_; # mime = Email::MIME
 
-	my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $v2w);
+	my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
 	my $path_type = $self->{path_type};
 	my $path;
 	if ($path_type eq '2/38') {
@@ -406,9 +406,10 @@ sub add {
 	print $w $raw_email, "\n" or wfail;
 
 	# v2: we need this for Xapian
-	if ($self->{want_object_info}) {
-		my $oid = $self->get_mark(":$blob");
-		$self->{last_object} = [ $oid, $n, \$raw_email ];
+	if ($smsg) {
+		$smsg->{blob} = $self->get_mark(":$blob");
+		$smsg->{bytes} = $n;
+		$smsg->{-raw_email} = \$raw_email;
 	}
 	my $ref = $self->{ref};
 	my $commit = $self->{mark}++;
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 2d71956d..acbf2c8d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -15,7 +15,6 @@ use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
 use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::Smsg qw(subject_normalized);
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
 
@@ -245,7 +244,7 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $smsg, $times) = @_;
+	my ($self, $mime, $smsg) = @_;
 	$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
 	$smsg->{mime} = $mime; # XXX temporary?
 	my $hdr = $mime->header_obj;
@@ -260,10 +259,7 @@ sub add_overview {
 	my $dd = $smsg->to_doc_data;
 	utf8::encode($dd);
 	$dd = compress($dd);
-	my $ds = msg_timestamp($hdr, $times->{autime});
-	my $ts = msg_datestamp($hdr, $times->{cotime});
-	my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ];
-	add_over($self, $values);
+	add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]);
 }
 
 sub add_over {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 5ca819c3..44b05813 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -310,8 +310,6 @@ sub add_xapian ($$$$) {
 	my ($self, $mime, $smsg, $mids) = @_;
 	$smsg->{mime} = $mime; # XXX dangerous
 	my $hdr = $mime->header_obj;
-	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
-	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
 	my $doc = $X->{Document}->new;
 	my $subj = $smsg->subject;
 	add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
@@ -368,13 +366,19 @@ sub _msgmap_init ($) {
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $smsg) = @_;
-	my $mids = mids_for_index($mime->header_obj);
+	my $hdr = $mime->header_obj;
+	my $mids = mids_for_index($hdr);
 	$smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat
 	$smsg->{mid} //= $mids->[0]; # v1 compatibility
 	$smsg->{num} //= do { # v1
 		_msgmap_init($self);
 		index_mm($self, $mime);
 	};
+
+	# v1 and tests only:
+	$smsg->{ds} //= msg_datestamp($hdr, $self->{autime});
+	$smsg->{ts} //= msg_timestamp($hdr, $self->{cotime});
+
 	eval {
 		# order matters, overview stores every possible piece of
 		# data in doc_data (deflated).  Xapian only stores a subset
@@ -382,7 +386,7 @@ sub add_message {
 		# storing doc_data in Xapian sometime after we get multi-inbox
 		# search working.
 		if (my $over = $self->{over}) { # v1 only
-			$over->add_overview($mime, $smsg, $self);
+			$over->add_overview($mime, $smsg);
 		}
 		if (need_xapian($self)) {
 			add_xapian($self, $mime, $smsg, $mids);
@@ -611,9 +615,9 @@ sub read_log {
 			$latest = $1;
 			$newest ||= $latest;
 		} elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) {
-			$self->{over}->{autime} = $self->{autime} = $1;
+			$self->{autime} = $1;
 		} elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) {
-			$self->{over}->{cotime} = $self->{cotime} = $1;
+			$self->{cotime} = $1;
 		}
 	}
 	close($log) or die "git log failed: \$?=$?";
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 21e81b16..2b48b1b4 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -67,19 +67,19 @@ sub shard_worker_loop ($$$$$) {
 			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
-			my ($bytes, $num, $blob, $mid, $autime, $cotime) =
+			my ($bytes, $num, $blob, $mid, $ds, $ts) =
 							split(/ /, $line);
 			$self->begin_txn_lazy;
 			my $n = read($r, my $msg, $bytes) or die "read: $!\n";
 			$n == $bytes or die "short read: $n != $bytes\n";
 			my $mime = PublicInbox::MIME->new(\$msg);
-			$self->{autime} = $autime;
-			$self->{cotime} = $cotime;
 			my $smsg = bless {
 				bytes => $bytes,
 				num => $num + 0,
 				blob => $blob,
 				mid => $mid,
+				ds => $ds,
+				ts => $ts,
 			}, 'PublicInbox::Smsg';
 			$self->add_message($mime, $smsg);
 		}
@@ -89,17 +89,13 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $msgref, $mime, $smsg, $times) = @_;
-	my $at = $times->{autime} // time;
-	my $ct = $times->{cotime} // time;
+	my ($self, $msgref, $mime, $smsg) = @_;
 	if (my $w = $self->{w}) {
-		print $w join(' ', @$smsg{qw(bytes num blob mid)}, $at, $ct),
+		print $w join(' ', @$smsg{qw(bytes num blob mid ds ts)}),
 			"\n", $$msgref or die "failed to write shard $!\n";
 	} else {
 		$$msgref = undef;
 		$self->begin_txn_lazy;
-		$self->{autime} = $at;
-		$self->{cotime} = $ct;
 		$self->add_message($mime, $smsg);
 	}
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b5332da4..b45d2722 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,6 +19,7 @@ use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -150,9 +151,11 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
 	my ($self, $msgref, $mime, $smsg) = @_;
-	$self->{over}->add_overview($mime, $smsg, $self);
+	$smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime});
+	$smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime});
+	$self->{over}->add_overview($mime, $smsg);
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
-	$idx->index_raw($msgref, $mime, $smsg, $self);
+	$idx->index_raw($msgref, $mime, $smsg);
 	my $n = $self->{transact_bytes} += $smsg->{bytes};
 	$n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
 }
@@ -176,13 +179,12 @@ sub _add {
 	defined $num or return; # duplicate
 	defined $mid0 or die "BUG: $mid0 undefined\n";
 	my $im = $self->importer;
-	my $cmt = $im->add($mime, undef, $self); # sets $self->{(au|co)time}
+	my $smsg = bless { mid => $mid0, num => $num }, 'PublicInbox::Smsg';
+	my $cmt = $im->add($mime, undef, $smsg); # sets $smsg->{ds|ts|blob}
 	$cmt = $im->get_mark($cmt);
 	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
 
-	my $msgref;
-	my $smsg = bless { mid => $mid0, num => $num }, 'PublicInbox::Smsg';
-	($smsg->{blob}, $smsg->{bytes}, $msgref) = @{$im->{last_object}};
+	my $msgref = delete $smsg->{-raw_email};
 	if (do_idx($self, $msgref, $mime, $smsg)) {
 		$self->checkpoint;
 	}
@@ -793,7 +795,6 @@ sub import_init {
 	my ($self, $git, $packed_bytes, $tmp) = @_;
 	my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
 	$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
-	$im->{want_object_info} = 1;
 	$im->{lock_path} = undef;
 	$im->{path_type} = 'v2';
 	$self->{im} = $im unless $tmp;
diff --git a/t/import.t b/t/import.t
index b88d308e..703aa362 100644
--- a/t/import.t
+++ b/t/import.t
@@ -28,15 +28,13 @@ my $mime = PublicInbox::MIME->create(
 	body => "hello world\n",
 );
 my $v2 = require_git(2.6, 1);
-
-$im->{want_object_info} = 1 if $v2;
-like($im->add($mime), qr/\A:\d+\z/, 'added one message');
+my $smsg = {} if $v2;
+like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
 
 if ($v2) {
-	my $info = $im->{last_object};
-	like($info->[0], qr/\A[a-f0-9]{40}\z/, 'got last object_id');
-	is($mime->as_string, ${$info->[2]}, 'string matches');
-	is($info->[1], length(${$info->[2]}), 'length matches');
+	like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
+	is($mime->as_string, ${$smsg->{-raw_email}}, 'string matches');
+	is($smsg->{bytes}, length(${$smsg->{-raw_email}}), 'length matches');
 	my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
 	my $in = tempfile();
 	print $in $mime->as_string or die "write failed: $!";
@@ -48,7 +46,7 @@ if ($v2) {
 	is($?, 0, 'hash-object');
 	seek($out, 0, SEEK_SET);
 	chomp(my $hashed_obj = <$out>);
-	is($hashed_obj, $info->[0], "last object_id matches exp");
+	is($hashed_obj, $smsg->{blob}, "blob object_id matches exp");
 }
 
 $im->done;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 9/9] v2: SDBM-based multi Message-ID queue
  2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
                       ` (7 preceding siblings ...)
  2020-03-20  8:18     ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
@ 2020-03-20  8:18     ` Eric Wong
  8 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

This lets us store author and committer times for deferred
indexing messages with ambiguous Message-IDs.  This allows
us to reproducibly reindex messages with the git commit
and author times when a rare message lacks Received and/or
Date headers while having ambiguous Message-IDs.
---
 MANIFEST                         |  1 +
 lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    | 23 +++++++------
 t/multi-mid.t                    | 27 +++++++++++++--
 4 files changed, 95 insertions(+), 13 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm

diff --git a/MANIFEST b/MANIFEST
index ec80c90f..f077d722 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MsgIter.pm
 lib/PublicInbox/MsgTime.pm
 lib/PublicInbox/Msgmap.pm
+lib/PublicInbox/MultiMidQueue.pm
 lib/PublicInbox/NNTP.pm
 lib/PublicInbox/NNTPD.pm
 lib/PublicInbox/NNTPdeflate.pm
diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000);
+my $O_CLOEXEC = $e{$^O} // 0;
+
+sub new {
+	my ($class) = @_;
+	my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1);
+	my $base = $tmpdir->dirname . '/q';
+	my %sdbm;
+	my $flags = O_RDWR|O_CREAT;
+	if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) {
+		if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) {
+			die "could not tie ($base): $!";
+		}
+		$O_CLOEXEC = 0;
+	}
+
+	bless {
+		cur => 1,
+		min => 1,
+		max => 0,
+		sdbm => \%sdbm,
+		tmpdir => $tmpdir,
+	}, $class;
+}
+
+sub set_oid {
+	my ($self, $i, $oid, $v2w) = @_;
+	$self->{max} = $i if $i > $self->{max};
+	$self->{min} = $i if $i < $self->{min};
+	$self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}";
+}
+
+sub get_oid {
+	my ($self, $i, $v2w) = @_;
+	my $rec = $self->{sdbm}->{$i} or return;
+	my ($oid, $autime, $cotime) = split(/\t/, $rec);
+	$v2w->{autime} = $autime;
+	$v2w->{cotime} = $cotime;
+	$oid
+}
+
+sub push_oid {
+	my ($self, $oid, $v2w) = @_;
+	set_oid($self, $self->{cur}++, $oid, $v2w);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
 	$multi_mid
 }
 
-sub multi_mid_q_push ($$) {
-	my ($sync, $oid) = @_;
-	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) {
+	my ($self, $sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
 	if ($sync->{reindex}) { # no regen on reindex
-		$multi_mid->mid_insert($oid);
+		$multi_mid->push_oid($oid, $self);
 	} else {
 		my $num = $sync->{regen}--;
 		die "BUG: ran out of article numbers" if $num <= 0;
-		$multi_mid->mid_set($num, $oid);
+		$multi_mid->set_oid($num, $oid, $self);
 	}
 }
 
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			multi_mid_q_push($sync, $oid);
+			multi_mid_q_push($self, $sync, $oid);
 		}
 		return;
 	}
@@ -1352,19 +1353,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-		my ($min, $max) = $multi_mid->minmax;
+		my $min = $multi_mid->{min};
+		my $max = $multi_mid->{max};
 		if ($sync->{reindex}) {
 			# we may need to create new Message-IDs if mirrors
 			# were initially indexed with old versions
 			for (my $i = $max; $i >= $min; $i--) {
-				my $oid = $multi_mid->mid_for($i);
+				my $oid;
+				$oid = $multi_mid->get_oid($i, $self) or next;
 				next unless defined $oid;
 				reindex_oid_m($self, $sync, $git, $oid);
 			}
 		} else { # regen on initial index
 			for my $num ($min..$max) {
-				my $oid = $multi_mid->mid_for($num);
-				next unless defined $oid;
+				my $oid;
+				$oid = $multi_mid->get_oid($num, $self) or next;
 				reindex_oid_m($self, $sync, $git, $oid, $num);
 			}
 		}
diff --git a/t/multi-mid.t b/t/multi-mid.t
index df865efb..87240c2c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
 use Test::More;
 use PublicInbox::MIME;
 use PublicInbox::TestCommon;
@@ -7,6 +8,7 @@ use PublicInbox::InboxWritable;
 require_git(2.6);
 require_mods(qw(DBD::SQLite));
 require PublicInbox::SearchIdx;
+my $delay = $ENV{TEST_DELAY_CONVERT};
 
 my $addr = 'test@example.com';
 my $bad = PublicInbox::MIME->new(<<EOF);
@@ -14,14 +16,12 @@ Message-ID: <a\@example.com>
 Message-ID: <b\@example.com>
 From: a\@example.com
 To: $addr
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 Subject: bad
 
 EOF
 
 my $good = PublicInbox::MIME->new(<<EOF);
 Message-ID: <b\@example.com>
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 From: b\@example.com
 To: $addr
 Subject: good
@@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) {
 		indexlevel => 'basic',
 		-primary_address => $addr,
 	}, my $creat_opt = {});
+	my @old;
 	if ('setup v1 inbox') {
 		my $im = $ibx->importer(0);
-		ok($im->add($_), 'added '.$_->header('Subject')) for @$order;
+		for (@$order) {
+			ok($im->add($_), 'added '.$_->header('Subject'));
+			sleep($delay) if $delay;
+		}
 		$im->done;
 		my $s = PublicInbox::SearchIdx->new($ibx, 1);
 		$s->index_sync;
 		$before = [ $ibx->mm->minmax ];
+		@old = ($ibx->over->get_art(1), $ibx->over->get_art(2));
 		$ibx->cleanup;
 	}
 	my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') };
@@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) {
 	$ibx->{inboxdir} = "$tmpdir/v2";
 	is_deeply([$ibx->mm->minmax], $before,
 		'min, max article numbers unchanged');
+
+	my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2, \@old, 'v2 conversion times match');
+
+	system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
+		"$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
+	$cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+		'http://example.com/v2c', 'v2c@example.com' ];
+	ok(run_script($cmd, $env), 'init clone');
+	$cmd = [ '-index', "$tmpdir/v2-clone" ];
+	sleep($delay) if $delay;
+	ok(run_script($cmd, $env), 'index the clone');
+	$ibx->cleanup;
+	$ibx->{inboxdir} = "$tmpdir/v2-clone";
+	my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2c, \@old, 'v2 clone times match');
 }
 
 done_testing();

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, back to index

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-05  3:23 [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-05  5:13 ` Eric Wong
2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
2020-03-20  8:18     ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
2020-03-20  8:18     ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
2020-03-20  8:18     ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
2020-03-20  8:18     ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
2020-03-20  8:18     ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
2020-03-20  8:18     ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
2020-03-20  8:18     ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
2020-03-20  8:18     ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git