diff options
author | Eric Wong <e@yhbt.net> | 2020-03-20 08:18:13 +0000 |
---|---|---|
committer | Eric Wong <e@yhbt.net> | 2020-03-22 09:00:23 +0000 |
commit | 8e81d6f0d44198717ae540421a09824d75c9bb6d (patch) | |
tree | 754d15673307a90dcba5aa4cb088f7cbcd2ece49 /lib/PublicInbox | |
parent | c29b2b7ded47def906cf00e3baad65c102304120 (diff) | |
download | public-inbox-8e81d6f0d44198717ae540421a09824d75c9bb6d.tar.gz |
When indexing messages without Date: and/or Received: headers, fall back to using timestamps originally recorded by git in the commit object. This allows git mirrors to preserve the import datestamp and timestamp of a message according to what was fed into git, instead of blindly falling back to the current time.
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- | lib/PublicInbox/MsgTime.pm | 12 |
-rw-r--r-- | lib/PublicInbox/OverIdx.pm | 10 |
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 17 |
-rw-r--r-- | lib/PublicInbox/SearchIdxShard.pm | 15 |
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 12 |
5 files changed, 44 insertions, 22 deletions
diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 8703d7bc..bd7ef811 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -167,21 +167,21 @@ sub msg_date_only ($) { } # Favors Received header for sorting globally -sub msg_timestamp ($) { - my ($hdr) = @_; # Email::MIME::Header +sub msg_timestamp ($;$) { + my ($hdr, $fallback) = @_; # Email::MIME::Header my $ret; $ret = msg_received_at($hdr) and return time_response($ret); $ret = msg_date_only($hdr) and return time_response($ret); - wantarray ? (time, '+0000') : time; + time_response([ $fallback // time, '+0000' ]); } # Favors the Date: header for display and sorting within a thread -sub msg_datestamp ($) { - my ($hdr) = @_; # Email::MIME::Header +sub msg_datestamp ($;$) { + my ($hdr, $fallback) = @_; # Email::MIME::Header my $ret; $ret = msg_date_only($hdr) and return time_response($ret); $ret = msg_received_at($hdr) and return time_response($ret); - wantarray ? (time, '+0000') : time; + time_response([ $fallback // time, '+0000' ]); } 1; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 0549c68b..9ee6d613 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -15,6 +15,7 @@ use IO::Handle; use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::SearchMsg qw(subject_normalized); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use Compress::Zlib qw(compress); use PublicInbox::Search; @@ -246,7 +247,7 @@ sub subject_path ($) { } sub add_overview { - my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; + my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_; my $lines = $mime->body_raw =~ tr!\n!\n!; my $smsg = bless { mime => $mime, @@ -255,7 +256,8 @@ sub add_overview { lines => $lines, blob => $oid, }, 'PublicInbox::SearchMsg'; - my $mids = mids_for_index($mime->header_obj); + my $hdr = $mime->header_obj; + my $mids = mids_for_index($hdr); my $refs = 
parse_references($smsg, $mid0, $mids); my $subj = $smsg->subject; my $xpath; @@ -266,7 +268,9 @@ sub add_overview { my $dd = $smsg->to_doc_data($oid, $mid0); utf8::encode($dd); $dd = compress($dd); - my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ]; + my $ds = msg_timestamp($hdr, $times->{autime}); + my $ts = msg_datestamp($hdr, $times->{cotime}); + my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ]; add_over($self, $values); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c33a48c3..261deb84 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -19,6 +19,7 @@ use POSIX qw(strftime); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn); use PublicInbox::Git qw(git_unquote); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); my $X = \%PublicInbox::Search::X; my ($DB_CREATE_OR_OPEN, $DB_OPEN); use constant { @@ -308,10 +309,13 @@ sub index_xapian { # msg_iter callback sub add_xapian ($$$$$$) { my ($self, $mime, $num, $oid, $mids, $mid0) = @_; my $smsg = PublicInbox::SearchMsg->new($mime); + my $hdr = $mime->header_obj; + $smsg->{ds} = msg_datestamp($hdr, $self->{autime}); + $smsg->{ts} = msg_timestamp($hdr, $self->{cotime}); my $doc = $X->{Document}->new; my $subj = $smsg->subject; - add_val($doc, PublicInbox::Search::TS(), $smsg->ts); - my @ds = gmtime($smsg->ds); + add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); + my @ds = gmtime($smsg->{ds}); my $yyyymmdd = strftime('%Y%m%d', @ds); add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); my $dt = strftime('%Y%m%d%H%M%S', @ds); @@ -375,7 +379,8 @@ sub add_message { add_xapian($self, $mime, $num, $oid, $mids, $mid0); } if (my $over = $self->{over}) { - $over->add_overview($mime, $bytes, $num, $oid, $mid0); + $over->add_overview($mime, $bytes, $num, $oid, $mid0, + $self); } }; @@ -596,6 +601,10 @@ sub read_log { } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; $newest ||= $latest; + } elsif ($line =~ 
/^author .*? ([0-9]+) [\-\+][0-9]+$/) { + $self->{over}->{autime} = $self->{autime} = $1; + } elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) { + $self->{over}->{cotime} = $self->{cotime} = $1; } } close($log) or die "git log failed: \$?=$?"; @@ -651,7 +660,7 @@ sub _git_log { $self->{regen_down} = $high + $fcount; } - $git->popen(qw/log --no-notes --no-color --no-renames + $git->popen(qw/log --pretty=raw --no-notes --no-color --no-renames --raw -r --no-abbrev/, $range); } diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index ee176e50..74c624a4 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -67,12 +67,15 @@ sub shard_worker_loop ($$$$$) { $self->remove_by_oid($oid, $mid); } else { chomp $line; - my ($len, $artnum, $oid, $mid0) = split(/ /, $line); + my ($len, $artnum, $oid, $mid0, $autime, $cotime) = + split(/ /, $line); $self->begin_txn_lazy; my $n = read($r, my $msg, $len) or die "read: $!\n"; $n == $len or die "short read: $n != $len\n"; my $mime = PublicInbox::MIME->new(\$msg); $artnum = int($artnum); + $self->{autime} = $autime; + $self->{cotime} = $cotime; $self->add_message($mime, $n, $artnum, $oid, $mid0); } } @@ -81,13 +84,17 @@ sub shard_worker_loop ($$$$$) { # called by V2Writable sub index_raw { - my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; + my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime, $times) = @_; + my $at = $times->{autime} // time; + my $ct = $times->{cotime} // time; if (my $w = $self->{w}) { - print $w "$bytes $artnum $oid $mid0\n", $$msgref or die - "failed to write shard $!\n"; + print $w "$bytes $artnum $oid $mid0 $at $ct\n", $$msgref or + die "failed to write shard $!\n"; } else { $$msgref = undef; $self->begin_txn_lazy; + $self->{autime} = $at; + $self->{cotime} = $ct; $self->add_message($mime, $bytes, $artnum, $oid, $mid0); } } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 
b42e6a13..f1842843 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -150,9 +150,9 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$$$$) { my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); + $self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self); my $idx = idx_shard($self, $num % $self->{shards}); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); + $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self); my $n = $self->{transact_bytes} += $len; $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } @@ -1266,15 +1266,17 @@ sub index_epoch ($$$) { $pr->("$i.git indexing $range\n"); } - my @cmd = qw(log --raw -r --pretty=tformat:%H + my @cmd = qw(log --raw -r --pretty=tformat:%H.%at.%ct --no-notes --no-color --no-abbrev --no-renames); my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range); my $cmt; while (<$fh>) { chomp; $self->{current_info} = "$i.git $_"; - if (/\A$x40$/o && !defined($cmt)) { - $cmt = $_; + if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o) { + $cmt //= $1; + $self->{autime} = $2; + $self->{cotime} = $3; } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { reindex_oid($self, $sync, $git, $1); } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { |