From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 1998A1F623 for ; Fri, 20 Mar 2020 08:18:23 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 8/9] *idx: pass smsg in even more places Date: Fri, 20 Mar 2020 08:18:20 +0000 Message-Id: <20200320081821.21715-9-e@yhbt.net> In-Reply-To: <20200320081821.21715-1-e@yhbt.net> References: <20200320081821.21715-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We can finally get rid of the awkward, ad-hoc use of V2Writable, SearchIdx, and OverIdx args for passing {cotime} and {autime} between classes. We'll still use those git time fields internally within V2Writable and SearchIdx as a fallback for (re)indexing; that internal use is not worth avoiding. 
--- lib/PublicInbox/Import.pm | 19 ++++++++++--------- lib/PublicInbox/OverIdx.pm | 8 ++------ lib/PublicInbox/SearchIdx.pm | 16 ++++++++++------ lib/PublicInbox/SearchIdxShard.pm | 14 +++++--------- lib/PublicInbox/V2Writable.pm | 15 ++++++++------- t/import.t | 14 ++++++-------- 6 files changed, 41 insertions(+), 45 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 3853ff2b..c72c1e92 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -275,7 +275,7 @@ sub git_timestamp { } sub extract_cmt_info ($;$) { - my ($mime, $v2w) = @_; + my ($mime, $smsg) = @_; my $sender = ''; my $from = $mime->header('From'); @@ -325,9 +325,9 @@ sub extract_cmt_info ($;$) { utf8::encode($subject); my $at = git_timestamp(my @at = msg_datestamp($hdr)); my $ct = git_timestamp(my @ct = msg_timestamp($hdr)); - if ($v2w) { # set fallbacks in case message had no date - $v2w->{autime} = $at[0]; - $v2w->{cotime} = $ct[0]; + if ($smsg) { + $smsg->{ds} = $at[0]; + $smsg->{ts} = $ct[0]; } ($name, $email, $at, $ct, $subject); } @@ -374,9 +374,9 @@ sub clean_tree_v2 ($$$) { # returns undef on duplicate # returns the :MARK of the most recent commit sub add { - my ($self, $mime, $check_cb, $v2w) = @_; # mime = Email::MIME + my ($self, $mime, $check_cb, $smsg) = @_; # mime = Email::MIME - my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $v2w); + my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg); my $path_type = $self->{path_type}; my $path; if ($path_type eq '2/38') { @@ -406,9 +406,10 @@ sub add { print $w $raw_email, "\n" or wfail; # v2: we need this for Xapian - if ($self->{want_object_info}) { - my $oid = $self->get_mark(":$blob"); - $self->{last_object} = [ $oid, $n, \$raw_email ]; + if ($smsg) { + $smsg->{blob} = $self->get_mark(":$blob"); + $smsg->{bytes} = $n; + $smsg->{-raw_email} = \$raw_email; } my $ref = $self->{ref}; my $commit = $self->{mark}++; diff --git a/lib/PublicInbox/OverIdx.pm 
b/lib/PublicInbox/OverIdx.pm index 2d71956d..acbf2c8d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -15,7 +15,6 @@ use IO::Handle; use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); -use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use Compress::Zlib qw(compress); use PublicInbox::Search; @@ -245,7 +244,7 @@ sub subject_path ($) { } sub add_overview { - my ($self, $mime, $smsg, $times) = @_; + my ($self, $mime, $smsg) = @_; $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!; $smsg->{mime} = $mime; # XXX temporary? my $hdr = $mime->header_obj; @@ -260,10 +259,7 @@ sub add_overview { my $dd = $smsg->to_doc_data; utf8::encode($dd); $dd = compress($dd); - my $ds = msg_timestamp($hdr, $times->{autime}); - my $ts = msg_datestamp($hdr, $times->{cotime}); - my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ]; - add_over($self, $values); + add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]); } sub add_over { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5ca819c3..44b05813 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -310,8 +310,6 @@ sub add_xapian ($$$$) { my ($self, $mime, $smsg, $mids) = @_; $smsg->{mime} = $mime; # XXX dangerous my $hdr = $mime->header_obj; - $smsg->{ds} = msg_datestamp($hdr, $self->{autime}); - $smsg->{ts} = msg_timestamp($hdr, $self->{cotime}); my $doc = $X->{Document}->new; my $subj = $smsg->subject; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); @@ -368,13 +366,19 @@ sub _msgmap_init ($) { sub add_message { # mime = Email::MIME object my ($self, $mime, $smsg) = @_; - my $mids = mids_for_index($mime->header_obj); + my $hdr = $mime->header_obj; + my $mids = mids_for_index($hdr); $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat $smsg->{mid} //= $mids->[0]; # v1 compatibility $smsg->{num} //= do { # 
v1 _msgmap_init($self); index_mm($self, $mime); }; + + # v1 and tests only: + $smsg->{ds} //= msg_datestamp($hdr, $self->{autime}); + $smsg->{ts} //= msg_timestamp($hdr, $self->{cotime}); + eval { # order matters, overview stores every possible piece of # data in doc_data (deflated). Xapian only stores a subset @@ -382,7 +386,7 @@ sub add_message { # storing doc_data in Xapian sometime after we get multi-inbox # search working. if (my $over = $self->{over}) { # v1 only - $over->add_overview($mime, $smsg, $self); + $over->add_overview($mime, $smsg); } if (need_xapian($self)) { add_xapian($self, $mime, $smsg, $mids); @@ -611,9 +615,9 @@ sub read_log { $latest = $1; $newest ||= $latest; } elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{over}->{autime} = $self->{autime} = $1; + $self->{autime} = $1; } elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{over}->{cotime} = $self->{cotime} = $1; + $self->{cotime} = $1; } } close($log) or die "git log failed: \$?=$?"; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 21e81b16..2b48b1b4 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -67,19 +67,19 @@ sub shard_worker_loop ($$$$$) { $self->remove_by_oid($oid, $mid); } else { chomp $line; - my ($bytes, $num, $blob, $mid, $autime, $cotime) = + my ($bytes, $num, $blob, $mid, $ds, $ts) = split(/ /, $line); $self->begin_txn_lazy; my $n = read($r, my $msg, $bytes) or die "read: $!\n"; $n == $bytes or die "short read: $n != $bytes\n"; my $mime = PublicInbox::MIME->new(\$msg); - $self->{autime} = $autime; - $self->{cotime} = $cotime; my $smsg = bless { bytes => $bytes, num => $num + 0, blob => $blob, mid => $mid, + ds => $ds, + ts => $ts, }, 'PublicInbox::Smsg'; $self->add_message($mime, $smsg); } @@ -89,17 +89,13 @@ sub shard_worker_loop ($$$$$) { # called by V2Writable sub index_raw { - my ($self, $msgref, $mime, $smsg, $times) = @_; - my $at = $times->{autime} // 
time; - my $ct = $times->{cotime} // time; + my ($self, $msgref, $mime, $smsg) = @_; if (my $w = $self->{w}) { - print $w join(' ', @$smsg{qw(bytes num blob mid)}, $at, $ct), + print $w join(' ', @$smsg{qw(bytes num blob mid ds ts)}), "\n", $$msgref or die "failed to write shard $!\n"; } else { $$msgref = undef; $self->begin_txn_lazy; - $self->{autime} = $at; - $self->{cotime} = $ct; $self->add_message($mime, $smsg); } } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b5332da4..b45d2722 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -19,6 +19,7 @@ use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::SearchIdx; +use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use IO::Handle; # ->autoflush use File::Temp qw(tempfile); @@ -150,9 +151,11 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; - $self->{over}->add_overview($mime, $smsg, $self); + $smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime}); + $smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime}); + $self->{over}->add_overview($mime, $smsg); my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); - $idx->index_raw($msgref, $mime, $smsg, $self); + $idx->index_raw($msgref, $mime, $smsg); my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } @@ -176,13 +179,12 @@ sub _add { defined $num or return; # duplicate defined $mid0 or die "BUG: $mid0 undefined\n"; my $im = $self->importer; - my $cmt = $im->add($mime, undef, $self); # sets $self->{(au|co)time} + my $smsg = bless { mid => $mid0, num => $num }, 'PublicInbox::Smsg'; + my $cmt = $im->add($mime, undef, $smsg); # sets $smsg->{ds|ts|blob} $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - my $msgref; - my $smsg = bless { mid => $mid0, 
num => $num }, 'PublicInbox::Smsg'; - ($smsg->{blob}, $smsg->{bytes}, $msgref) = @{$im->{last_object}}; + my $msgref = delete $smsg->{-raw_email}; if (do_idx($self, $msgref, $mime, $smsg)) { $self->checkpoint; } @@ -793,7 +795,6 @@ sub import_init { my ($self, $git, $packed_bytes, $tmp) = @_; my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox}); $im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR); - $im->{want_object_info} = 1; $im->{lock_path} = undef; $im->{path_type} = 'v2'; $self->{im} = $im unless $tmp; diff --git a/t/import.t b/t/import.t index b88d308e..703aa362 100644 --- a/t/import.t +++ b/t/import.t @@ -28,15 +28,13 @@ my $mime = PublicInbox::MIME->create( body => "hello world\n", ); my $v2 = require_git(2.6, 1); - -$im->{want_object_info} = 1 if $v2; -like($im->add($mime), qr/\A:\d+\z/, 'added one message'); +my $smsg = {} if $v2; +like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) { - my $info = $im->{last_object}; - like($info->[0], qr/\A[a-f0-9]{40}\z/, 'got last object_id'); - is($mime->as_string, ${$info->[2]}, 'string matches'); - is($info->[1], length(${$info->[2]}), 'length matches'); + like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id'); + is($mime->as_string, ${$smsg->{-raw_email}}, 'string matches'); + is($smsg->{bytes}, length(${$smsg->{-raw_email}}), 'length matches'); my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); my $in = tempfile(); print $in $mime->as_string or die "write failed: $!"; @@ -48,7 +46,7 @@ if ($v2) { is($?, 0, 'hash-object'); seek($out, 0, SEEK_SET); chomp(my $hashed_obj = <$out>); - is($hashed_obj, $info->[0], "last object_id matches exp"); + is($hashed_obj, $smsg->{blob}, "blob object_id matches exp"); } $im->done;