From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 94CD21FAE8 for ; Thu, 22 Mar 2018 09:40:16 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 05/13] use both Date: and Received: times Date: Thu, 22 Mar 2018 09:40:07 +0000 Message-Id: <20180322094015.14422-6-e@80x24.org> In-Reply-To: <20180322094015.14422-1-e@80x24.org> References: <20180322094015.14422-1-e@80x24.org> List-Id: We want to rely on Date: to sort messages within individual threads since it keeps messages from git-send-email(1) sorted. However, since developers occasionally have the clock set wrong on their machines, sort overall messages by the newest date in a Received: header so the landing page isn't forever polluted by messages from the future. This also gives us determinism for commit times in most cases, as we'll used the Received: timestamp there, as well. --- lib/PublicInbox/Import.pm | 17 ++++---- lib/PublicInbox/MsgTime.pm | 80 ++++++++++++++++++++++++------------ lib/PublicInbox/Search.pm | 5 ++- lib/PublicInbox/SearchIdx.pm | 6 ++- lib/PublicInbox/SearchIdxSkeleton.pm | 4 +- lib/PublicInbox/SearchMsg.pm | 26 ++++++------ lib/PublicInbox/SearchView.pm | 6 +-- lib/PublicInbox/View.pm | 30 +++++++------- 8 files changed, 103 insertions(+), 71 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index e50f115..d69934b 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -11,7 +11,7 @@ use base qw(PublicInbox::Lock); use PublicInbox::Spawn qw(spawn); use PublicInbox::MID qw(mids mid_mime mid2path); use PublicInbox::Address; -use PublicInbox::MsgTime qw(msg_timestamp); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::ContentId qw(content_digest); use PublicInbox::MDA; @@ -244,9 +244,8 @@ sub remove { (($self->{tip} = ":$commit"), $cur); } -sub parse_date ($) { - my ($mime) = @_; - my ($ts, $zone) = msg_timestamp($mime->header_obj); +sub git_timestamp { + my ($ts, $zone) = @_; $ts = 0 if $ts < 0; # git uses unsigned times "$ts $zone"; } @@ -295,7 +294,11 @@ sub add { my ($self, $mime, $check_cb) = @_; # mime = Email::MIME my ($name, $email) = extract_author_info($mime); - my $date_raw = parse_date($mime); + my $hdr = $mime->header_obj; + my @at = msg_datestamp($hdr); + my @ct = msg_timestamp($hdr); + my $author_time_raw = git_timestamp(@at); + my $commit_time_raw = git_timestamp(@ct); my $subject = $mime->header('Subject'); $subject = '(no subject)' unless defined $subject; my $path_type = $self->{path_type}; @@ -349,8 +352,8 @@ sub add { utf8::encode($subject); print $w "commit $ref\nmark :$commit\n", - "author $name <$email> $date_raw\n", - "committer $self->{ident} ", now_raw(), "\n" or wfail; + "author $name <$email> $author_time_raw\n", + "committer $self->{ident} $commit_time_raw\n" or wfail; print $w "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 87664f4..4295e87 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -4,48 +4,76 @@ package PublicInbox::MsgTime; use strict; use warnings; use base qw(Exporter); -our @EXPORT_OK = qw(msg_timestamp); +our @EXPORT_OK = qw(msg_timestamp msg_datestamp); use Date::Parse qw(str2time); use Time::Zone qw(tz_offset); -sub msg_timestamp ($) { +sub zone_clamp ($) { + my ($zone) = @_; + $zone ||= '+0000'; + # "-1200" is the furthest westermost zone offset, + # but git fast-import is liberal so we use "-1400" + if ($zone >= 1400 || $zone <= -1400) { + warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; + $zone = '+0000'; + } + $zone; +} + +sub time_response ($) { + my ($ret) = @_; + wantarray ? @$ret : $ret->[0]; +} + +sub msg_received_at ($) { my ($hdr) = @_; # Email::MIME::Header - my ($ts, $zone); - my $mid; my @recvd = $hdr->header_raw('Received'); + my ($ts, $zone); foreach my $r (@recvd) { $zone = undef; $r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+ \d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next; $zone = $2; $ts = eval { str2time($1) } and last; - $mid ||= $hdr->header_raw('Message-ID'); + my $mid = $hdr->header_raw('Message-ID'); warn "no date in $mid Received: $r\n"; } - unless (defined $ts) { - my @date = $hdr->header_raw('Date'); - foreach my $d (@date) { - $zone = undef; - $ts = eval { str2time($d) }; - if ($@) { - $mid ||= $hdr->header_raw('Message-ID'); - warn "bad Date: $d in $mid: $@\n"; - } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) { - $zone = $1; - } + defined $ts ? [ $ts, zone_clamp($zone) ] : undef; +} + +sub msg_date_only ($) { + my ($hdr) = @_; # Email::MIME::Header + my @date = $hdr->header_raw('Date'); + my ($ts, $zone); + foreach my $d (@date) { + $zone = undef; + $ts = eval { str2time($d) }; + if ($@) { + my $mid = $hdr->header_raw('Message-ID'); + warn "bad Date: $d in $mid: $@\n"; + } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) { + $zone = $1; } } - $ts = time unless defined $ts; - return $ts unless wantarray; + defined $ts ? [ $ts, zone_clamp($zone) ] : undef; +} - $zone ||= '+0000'; - # "-1200" is the furthest westermost zone offset, - # but git fast-import is liberal so we use "-1400" - if ($zone >= 1400 || $zone <= -1400) { - warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; - $zone = '+0000'; - } - ($ts, $zone); +# Favors Received header for sorting globally +sub msg_timestamp ($) { + my ($hdr) = @_; # Email::MIME::Header + my $ret; + $ret = msg_received_at($hdr) and return time_response($ret); + $ret = msg_date_only($hdr) and return time_response($ret); + wantarray ? (time, '+0000') : time; +} + +# Favors the Date: header for display and sorting within a thread +sub msg_datestamp ($) { + my ($hdr) = @_; # Email::MIME::Header + my $ret; + $ret = msg_date_only($hdr) and return time_response($ret); + $ret = msg_received_at($hdr) and return time_response($ret); + wantarray ? (time, '+0000') : time; } 1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 7e7c989..f08b987 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -8,11 +8,12 @@ use strict; use warnings; # values for searching -use constant TS => 0; # timestamp +use constant DS => 0; # Date: header in Unix time use constant NUM => 1; # NNTP article number use constant BYTES => 2; # :bytes as defined in RFC 3977 use constant LINES => 3; # :lines as defined in RFC 3977 -use constant YYYYMMDD => 4; # for searching in the WWW UI +use constant TS => 4; # Received: header in Unix time +use constant YYYYMMDD => 5; # for searching in the WWW UI use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 3d80b00..ef723a4 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -134,7 +134,9 @@ sub add_values ($$) { my $lines = $values->[PublicInbox::Search::LINES]; add_val($doc, PublicInbox::Search::LINES, $lines); - my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + my $ds = $values->[PublicInbox::Search::DS]; + add_val($doc, PublicInbox::Search::DS, $ds); + my $yyyymmdd = strftime('%Y%m%d', gmtime($ds)); add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); } @@ -298,7 +300,7 @@ sub add_message { } my $lines = $mime->body_raw =~ tr!\n!\n!; - my @values = ($smsg->ts, $num, $bytes, $lines); + my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts); add_values($doc, \@values); my $tg = $self->term_generator; diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index ba43969..78a1730 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -121,18 +121,16 @@ sub remote_remove { die $err if $err; } -# values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ] +# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ] sub index_skeleton_real ($$) { my ($self, $values) = @_; my $doc_data = pop @$values; my $xpath = pop @$values; my $mids = pop @$values; - my $ts = $values->[PublicInbox::Search::TS]; my $smsg = PublicInbox::SearchMsg->new(undef); my $doc = $smsg->{doc}; PublicInbox::SearchIdx::add_values($doc, $values); $doc->set_data($doc_data); - $smsg->{ts} = $ts; $smsg->load_from_data($doc_data); my $num = $values->[PublicInbox::Search::NUM]; my @refs = ($smsg->references =~ /<([^>]+)>/g); diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index a1cd0c2..de1fd13 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -9,7 +9,7 @@ use warnings; use Search::Xapian; use PublicInbox::MID qw/mid_clean mid_mime/; use PublicInbox::Address; -use PublicInbox::MsgTime qw(msg_timestamp); +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); sub new { my ($class, $mime) = @_; @@ -46,6 +46,7 @@ sub load_expand { my $doc = $self->{doc}; my $data = $doc->get_data or return; $self->{ts} = get_val($doc, &PublicInbox::Search::TS); + $self->{ds} = get_val($doc, &PublicInbox::Search::DS); utf8::decode($data); load_from_data($self, $data); $self; @@ -53,12 +54,8 @@ sub load_expand { sub load_doc { my ($class, $doc) = @_; - my $data = $doc->get_data or return; - my $ts = get_val($doc, &PublicInbox::Search::TS); - utf8::decode($data); - my $self = bless { doc => $doc, ts => $ts }, $class; - load_from_data($self, $data); - $self + my $self = bless { doc => $doc }, $class; + $self->load_expand; } # :bytes and :lines metadata in RFC 3977 @@ -91,9 +88,9 @@ my @MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); sub date ($) { my ($self) = @_; - my $ts = $self->{ts}; - return unless defined $ts; - my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ts); + my $ds = $self->{ds}; + return unless defined $ds; + my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ds); "$DoW[$wday], " . sprintf("%02d $MoY[$mon] %04d %02d:%02d:%02d +0000", $mday, $year+1900, $hour, $min, $sec); @@ -119,9 +116,12 @@ sub from_name { sub ts { my ($self) = @_; - $self->{ts} ||= eval { - msg_timestamp($self->{mime}->header_obj); - } || 0; + $self->{ts} ||= eval { msg_timestamp($self->{mime}->header_obj) } || 0; +} + +sub ds { + my ($self) = @_; + $self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0; } sub to_doc_data { diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 53b88c3..55c588c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -117,11 +117,11 @@ sub mset_summary { obfuscate_addrs($obfs_ibx, $s); obfuscate_addrs($obfs_ibx, $f); } - my $ts = PublicInbox::View::fmt_ts($smsg->ts); + my $date = PublicInbox::View::fmt_ts($smsg->ds); my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href}; $$res .= qq{$rank. }. $s . "\n"; - $$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n"; + $$res .= "$pfx - by $f @ $date UTC [$pct%]\n\n"; } $$res .= search_nav_bot($mset, $q); *noop; @@ -227,7 +227,7 @@ sub mset_thread { } ($mset->items) ]}); my $r = $q->{r}; my $rootset = PublicInbox::SearchThread::thread($msgs, - $r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ts, + $r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ds, $srch); my $skel = search_nav_bot($mset, $q). "
";
 	my $inbox = $ctx->{-inbox};
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index f811f4f..18882af 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -6,7 +6,7 @@
 package PublicInbox::View;
 use strict;
 use warnings;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_datestamp);
 use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
 use PublicInbox::Linkify;
 use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
@@ -735,7 +735,7 @@ sub load_results {
 sub thread_results {
 	my ($msgs, $srch) = @_;
 	require PublicInbox::SearchThread;
-	PublicInbox::SearchThread::thread($msgs, *sort_ts, $srch);
+	PublicInbox::SearchThread::thread($msgs, *sort_ds, $srch);
 }
 
 sub missing_thread {
@@ -746,7 +746,7 @@ sub missing_thread {
 
 sub _msg_date {
 	my ($hdr) = @_;
-	fmt_ts(msg_timestamp($hdr));
+	fmt_ts(msg_datestamp($hdr));
 }
 
 sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
@@ -782,7 +782,7 @@ sub skel_dump {
 	my $obfs_ibx = $ctx->{-obfs_ibx};
 	obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx;
 
-	my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level);
+	my $d = fmt_ts($smsg->{ds}) . ' ' . indent_for($level) . th_pfx($level);
 	my $attr = $f;
 	$ctx->{first_level} ||= $level;
 
@@ -863,10 +863,10 @@ sub _skel_ghost {
 	$$dst .= $d;
 }
 
-sub sort_ts {
+sub sort_ds {
 	[ sort {
-		(eval { $a->topmost->{smsg}->ts } || 0) <=>
-		(eval { $b->topmost->{smsg}->ts } || 0)
+		(eval { $a->topmost->{smsg}->ds } || 0) <=>
+		(eval { $b->topmost->{smsg}->ds } || 0)
 	} @{$_[0]} ];
 }
 
@@ -877,21 +877,21 @@ sub acc_topic {
 	my $srch = $ctx->{srch};
 	my $mid = $node->{id};
 	my $x = $node->{smsg} || $srch->lookup_mail($mid);
-	my ($subj, $ts);
+	my ($subj, $ds);
 	my $topic;
 	if ($x) {
 		$subj = $x->subject;
 		$subj = $srch->subject_normalized($subj);
-		$ts = $x->ts;
+		$ds = $x->ds;
 		if ($level == 0) {
-			$topic = [ $ts, 1, { $subj => $mid }, $subj ];
+			$topic = [ $ds, 1, { $subj => $mid }, $subj ];
 			$ctx->{-cur_topic} = $topic;
 			push @{$ctx->{order}}, $topic;
 			return;
 		}
 
 		$topic = $ctx->{-cur_topic}; # should never be undef
-		$topic->[0] = $ts if $ts > $topic->[0];
+		$topic->[0] = $ds if $ds > $topic->[0];
 		$topic->[1]++;
 		my $seen = $topic->[2];
 		if (scalar(@$topic) == 3) { # parent was a ghost
@@ -910,7 +910,7 @@ sub acc_topic {
 
 sub dump_topics {
 	my ($ctx) = @_;
-	my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ]
+	my $order = delete $ctx->{order}; # [ ds, subj1, subj2, subj3, ... ]
 	if (!@$order) {
 		$ctx->{-html_tip} = '
[No topics in range]
'; return 404; @@ -923,14 +923,14 @@ sub dump_topics { # sort by recency, this allows new posts to "bump" old topics... foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { - my ($ts, $n, $seen, $top, @ex) = @$topic; + my ($ds, $n, $seen, $top, @ex) = @$topic; @$topic = (); next unless defined $top; # ghost topic my $mid = delete $seen->{$top}; my $href = mid_escape($mid); my $prev_subj = [ split(/ /, $top) ]; $top = PublicInbox::Hval->new($top)->as_html; - $ts = fmt_ts($ts); + $ds = fmt_ts($ds); # $n isn't the total number of posts on the topic, # just the number of posts in the current results window @@ -946,7 +946,7 @@ sub dump_topics { my $mbox = qq(mbox.gz); my $atom = qq(Atom); my $s = "$top\n" . - " $ts UTC $n - $mbox / $atom\n"; + " $ds UTC $n - $mbox / $atom\n"; for (my $i = 0; $i < scalar(@ex); $i += 2) { my $level = $ex[$i]; my $subj = $ex[$i + 1]; -- EW