From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 628EC1F619 for ; Thu, 5 Mar 2020 03:23:55 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] index: use git commit times on missing Date/Received Date: Thu, 5 Mar 2020 03:23:55 +0000 Message-Id: <20200305032355.26584-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: When indexing messages without Date: and/or Received: headers, fall back to using timestamps originally recorded by git in the commit object. This allows git mirrors to preserve the import datestamp and timestamp of a message according to what was fed into git, instead of blindly falling back to the current time. --- MANIFEST | 1 + lib/PublicInbox/MsgTime.pm | 12 ++-- lib/PublicInbox/OverIdx.pm | 10 +++- lib/PublicInbox/SearchIdx.pm | 17 ++++-- lib/PublicInbox/SearchIdxShard.pm | 15 +++-- lib/PublicInbox/V2Writable.pm | 12 ++-- t/index-git-times.t | 93 +++++++++++++++++++++++++++++++ 7 files changed, 138 insertions(+), 22 deletions(-) create mode 100644 t/index-git-times.t diff --git a/MANIFEST b/MANIFEST index 265ad909..f9230cd3 100644 --- a/MANIFEST +++ b/MANIFEST @@ -242,6 +242,7 @@ t/httpd.t t/hval.t t/import.t t/inbox.t +t/index-git-times.t t/indexlevels-mirror-v1.t t/indexlevels-mirror.t t/init.t diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 8703d7bc..bd7ef811 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -167,21 +167,21 @@ sub msg_date_only ($) { } # Favors Received header for sorting globally -sub msg_timestamp ($) { - my ($hdr) = @_; # Email::MIME::Header +sub msg_timestamp ($;$) { + my ($hdr, $fallback) = @_; # Email::MIME::Header my $ret; $ret = msg_received_at($hdr) and return time_response($ret); $ret = msg_date_only($hdr) and return time_response($ret); - wantarray ? (time, '+0000') : time; + time_response([ $fallback // time, '+0000' ]); } # Favors the Date: header for display and sorting within a thread -sub msg_datestamp ($) { - my ($hdr) = @_; # Email::MIME::Header +sub msg_datestamp ($;$) { + my ($hdr, $fallback) = @_; # Email::MIME::Header my $ret; $ret = msg_date_only($hdr) and return time_response($ret); $ret = msg_received_at($hdr) and return time_response($ret); - wantarray ? (time, '+0000') : time; + time_response([ $fallback // time, '+0000' ]); } 1; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 0549c68b..9ee6d613 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -15,6 +15,7 @@ use IO::Handle; use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::SearchMsg qw(subject_normalized); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use Compress::Zlib qw(compress); use PublicInbox::Search; @@ -246,7 +247,7 @@ sub subject_path ($) { } sub add_overview { - my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; + my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_; my $lines = $mime->body_raw =~ tr!\n!\n!; my $smsg = bless { mime => $mime, @@ -255,7 +256,8 @@ sub add_overview { lines => $lines, blob => $oid, }, 'PublicInbox::SearchMsg'; - my $mids = mids_for_index($mime->header_obj); + my $hdr = $mime->header_obj; + my $mids = mids_for_index($hdr); my $refs = parse_references($smsg, $mid0, $mids); my $subj = $smsg->subject; my $xpath; @@ -266,7 +268,9 @@ sub add_overview { my $dd = $smsg->to_doc_data($oid, $mid0); utf8::encode($dd); $dd = compress($dd); - my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ]; + my $ds = msg_timestamp($hdr, $times->{autime}); + my $ts = msg_datestamp($hdr, $times->{cotime}); + my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ]; add_over($self, $values); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c33a48c3..261deb84 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -19,6 +19,7 @@ use POSIX qw(strftime); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn); use PublicInbox::Git qw(git_unquote); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); my $X = \%PublicInbox::Search::X; my ($DB_CREATE_OR_OPEN, $DB_OPEN); use constant { @@ -308,10 +309,13 @@ sub index_xapian { # msg_iter callback sub add_xapian ($$$$$$) { my ($self, $mime, $num, $oid, $mids, $mid0) = @_; my $smsg = PublicInbox::SearchMsg->new($mime); + my $hdr = $mime->header_obj; + $smsg->{ds} = msg_datestamp($hdr, $self->{autime}); + $smsg->{ts} = msg_timestamp($hdr, $self->{cotime}); my $doc = $X->{Document}->new; my $subj = $smsg->subject; - add_val($doc, PublicInbox::Search::TS(), $smsg->ts); - my @ds = gmtime($smsg->ds); + add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); + my @ds = gmtime($smsg->{ds}); my $yyyymmdd = strftime('%Y%m%d', @ds); add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); my $dt = strftime('%Y%m%d%H%M%S', @ds); @@ -375,7 +379,8 @@ sub add_message { add_xapian($self, $mime, $num, $oid, $mids, $mid0); } if (my $over = $self->{over}) { - $over->add_overview($mime, $bytes, $num, $oid, $mid0); + $over->add_overview($mime, $bytes, $num, $oid, $mid0, + $self); } }; @@ -596,6 +601,10 @@ sub read_log { } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; $newest ||= $latest; + } elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) { + $self->{over}->{autime} = $self->{autime} = $1; + } elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) { + $self->{over}->{cotime} = $self->{cotime} = $1; } } close($log) or die "git log failed: \$?=$?"; @@ -651,7 +660,7 @@ sub _git_log { $self->{regen_down} = $high + $fcount; } - $git->popen(qw/log --no-notes --no-color --no-renames + $git->popen(qw/log --pretty=raw --no-notes --no-color --no-renames --raw -r --no-abbrev/, $range); } diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index ee176e50..c69e0cc8 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -67,12 +67,15 @@ sub shard_worker_loop ($$$$$) { $self->remove_by_oid($oid, $mid); } else { chomp $line; - my ($len, $artnum, $oid, $mid0) = split(/ /, $line); + my ($len, $artnum, $oid, $mid0, $autime, $cotime) = + split(/ /, $line); $self->begin_txn_lazy; my $n = read($r, my $msg, $len) or die "read: $!\n"; $n == $len or die "short read: $n != $len\n"; my $mime = PublicInbox::MIME->new(\$msg); $artnum = int($artnum); + $self->{autime} = $autime; + $self->{cotime} = $cotime; $self->add_message($mime, $n, $artnum, $oid, $mid0); } } @@ -81,13 +84,17 @@ sub shard_worker_loop ($$$$$) { # called by V2Writable sub index_raw { - my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; + my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime, $times) = @_; + my $at = $times->{autime}; + my $ct = $times->{cotime}; if (my $w = $self->{w}) { - print $w "$bytes $artnum $oid $mid0\n", $$msgref or die - "failed to write shard $!\n"; + print $w "$bytes $artnum $oid $mid0 $at $ct\n", $$msgref or + die "failed to write shard $!\n"; } else { $$msgref = undef; $self->begin_txn_lazy; + $self->{autime} = $at; + $self->{cotime} = $ct; $self->add_message($mime, $bytes, $artnum, $oid, $mid0); } } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b42e6a13..dea9fff9 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -150,9 +150,9 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$$$$) { my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); + $self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self); my $idx = idx_shard($self, $num % $self->{shards}); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); + $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self); my $n = $self->{transact_bytes} += $len; $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } @@ -1266,15 +1266,17 @@ sub index_epoch ($$$) { $pr->("$i.git indexing $range\n"); } - my @cmd = qw(log --raw -r --pretty=tformat:%H + my @cmd = qw(log --raw -r --pretty=tformat:%H.%at.%ct --no-notes --no-color --no-abbrev --no-renames); my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range); my $cmt; while (<$fh>) { chomp; $self->{current_info} = "$i.git $_"; - if (/\A$x40$/o && !defined($cmt)) { - $cmt = $_; + if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o && !defined($cmt)) { + $cmt = $1; + $self->{autime} = $2; + $self->{cotime} = $3; } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { reindex_oid($self, $sync, $git, $1); } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { diff --git a/t/index-git-times.t b/t/index-git-times.t new file mode 100644 index 00000000..2e9e88e8 --- /dev/null +++ b/t/index-git-times.t @@ -0,0 +1,93 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use Test::More; +use PublicInbox::TestCommon; +use PublicInbox::Import; +use PublicInbox::Config; +use File::Path qw(remove_tree); + +require_mods(qw(DBD::SQLite Search::Xapian)); +use_ok 'PublicInbox::Over'; + +my ($tmpdir, $for_destroy) = tmpdir(); +local $ENV{PI_CONFIG} = "$tmpdir/cfg"; +my $v1dir = "$tmpdir/v1"; +my $addr = 'x@example.com'; +run_script(['-init', '--indexlevel=medium', 'v1', $v1dir, + 'http://example.com/x', $addr]) + or die "init failed"; + +{ + my $data = <<'EOF'; +blob +mark :1 +data 133 +From: timeless +To: x +Subject: can I haz the time? +Message-ID: <19700101000000-1234@example.com> + +plz + +reset refs/heads/master +commit refs/heads/master +mark :2 +author timeless 749520000 +0100 +committer x 1285977600 -0100 +data 20 +can I haz the time? +M 100644 :1 53/256f6177504c2878d3a302ef5090dacf5e752c + +EOF + pipe(my($r, $w)) or die; + length($data) <= 512 or die "data too large to fit in POSIX pipe"; + print $w $data or die; + close $w or die; + my $cmd = ['git', "--git-dir=$v1dir", 'fast-import', '--quiet']; + PublicInbox::Import::run_die($cmd, undef, { 0 => $r }); +} + +run_script(['-index', $v1dir]) or die 'v1 index failed'; +my $smsg; +{ + my $cfg = PublicInbox::Config->new; + my $ibx = $cfg->lookup($addr); + $smsg = $ibx->over->get_art(1); + is($smsg->{ds}, 749520000, 'datestamp from git author time'); + is($smsg->{ts}, 1285977600, 'timestamp from git committer time'); + my $res = $ibx->search->query("m:$smsg->{mid}"); + is(scalar @$res, 1, 'got one result for m:'); + is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp'); + $res = $ibx->search->query('d:19931002..19931002'); + is(scalar @$res, 1, 'got one result for d:'); + is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp'); +} +SKIP: { + require_git(2.6, 1) or skip('git 2.6+ required for v2', 10); + my $v2dir = "$tmpdir/v2"; + run_script(['-convert', $v1dir, $v2dir]) or die 'v2 conversion failed'; + + my $check_v2 = sub { + my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir, + address => $addr}); + my $v2smsg = $ibx->over->get_art(1); + is($v2smsg->{ds}, $smsg->{ds}, + 'v2 datestamp from git author time'); + is($v2smsg->{ts}, $smsg->{ts}, + 'v2 timestamp from git committer time'); + my $res = $ibx->search->query("m:$smsg->{mid}"); + is($res->[0]->{ds}, $smsg->{ds}, 'Xapian stored datestamp'); + $res = $ibx->search->query('d:19931002..19931002'); + is(scalar @$res, 1, 'got one result for d:'); + is($res->[0]->{ds}, $smsg->{ds}, 'Xapian search on datestamp'); + }; + $check_v2->(); + remove_tree($v2dir); + + # test non-parallelized conversion + run_script(['-convert', '-j0', $v1dir, $v2dir]) or + die 'v2 conversion failed'; + $check_v2->(); +} + +done_testing;