From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 9B3631F621 for ; Fri, 20 Mar 2020 08:18:22 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 6/9] *idx: pass $smsg in more places instead of many args Date: Fri, 20 Mar 2020 08:18:18 +0000 Message-Id: <20200320081821.21715-7-e@yhbt.net> In-Reply-To: <20200320081821.21715-1-e@yhbt.net> References: <20200320081821.21715-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We can pass blessed PublicInbox::Smsg objects to internal indexing APIs instead of having long parameter lists in some places. The end goal is to avoid parsing redundant information each step of the way and hopefully make things more understandable. --- lib/PublicInbox/OverIdx.pm | 14 +++------- lib/PublicInbox/SearchIdx.pm | 45 ++++++++++++++++++------------- lib/PublicInbox/SearchIdxShard.pm | 16 +++++++++-- lib/PublicInbox/V2Writable.pm | 8 +++++- t/search-thr-index.t | 17 ++++++++++-- 5 files changed, 66 insertions(+), 34 deletions(-) diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index f49dfa00..2d71956d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -245,15 +245,9 @@ sub subject_path ($) { } sub add_overview { - my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_; - my $lines = $mime->body_raw =~ tr!\n!\n!; - my $smsg = bless { - mime => $mime, - mid => $mid0, - bytes => $bytes, - lines => $lines, - blob => $oid, - }, 'PublicInbox::Smsg'; + my ($self, $mime, $smsg, $times) = @_; + $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!; + $smsg->{mime} = $mime; # XXX temporary? my $hdr = $mime->header_obj; my $mids = mids_for_index($hdr); my $refs = parse_references($smsg, $hdr, $mids); @@ -268,7 +262,7 @@ sub add_overview { $dd = compress($dd); my $ds = msg_timestamp($hdr, $times->{autime}); my $ts = msg_datestamp($hdr, $times->{cotime}); - my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ]; + my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ]; add_over($self, $values); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 32be9c3f..5ca819c3 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -306,9 +306,9 @@ sub index_xapian { # msg_iter callback index_body($self, $_, /\A>/ ? 0 : $doc) for @sections; } -sub add_xapian ($$$$$$) { - my ($self, $mime, $num, $oid, $mids, $mid0) = @_; - my $smsg = PublicInbox::Smsg->new($mime); +sub add_xapian ($$$$) { + my ($self, $mime, $smsg, $mids) = @_; + $smsg->{mime} = $mime; # XXX dangerous my $hdr = $mime->header_obj; $smsg->{ds} = msg_datestamp($hdr, $self->{autime}); $smsg->{ts} = msg_timestamp($hdr, $self->{cotime}); @@ -338,9 +338,7 @@ sub add_xapian ($$$$$$) { index_text($self, join(' ', @long), 1, 'XM'); } } - $smsg->{to} = $smsg->{cc} = ''; - $smsg->{blob} = $oid; - $smsg->{mid} = $mid0; + $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids); my $data = $smsg->to_doc_data; $doc->set_data($data); @@ -355,7 +353,7 @@ sub add_xapian ($$$$$$) { } } $doc->add_boolean_term('Q' . $_) foreach @$mids; - $self->{xdb}->replace_document($num, $doc); + $self->{xdb}->replace_document($smsg->{num}, $doc); } sub _msgmap_init ($) { @@ -369,20 +367,25 @@ sub _msgmap_init ($) { sub add_message { # mime = Email::MIME object - my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; + my ($self, $mime, $smsg) = @_; my $mids = mids_for_index($mime->header_obj); - $mid0 //= $mids->[0]; # v1 compatibility - $num //= do { # v1 + $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat + $smsg->{mid} //= $mids->[0]; # v1 compatibility + $smsg->{num} //= do { # v1 _msgmap_init($self); index_mm($self, $mime); }; eval { - if (need_xapian($self)) { - add_xapian($self, $mime, $num, $oid, $mids, $mid0); + # order matters, overview stores every possible piece of + # data in doc_data (deflated). Xapian only stores a subset + # of the fields which exist in over.sqlite3. We may stop + # storing doc_data in Xapian sometime after we get multi-inbox + # search working. + if (my $over = $self->{over}) { # v1 only + $over->add_overview($mime, $smsg, $self); } - if (my $over = $self->{over}) { - $over->add_overview($mime, $bytes, $num, $oid, $mid0, - $self); + if (need_xapian($self)) { + add_xapian($self, $mime, $smsg, $mids); } }; @@ -390,7 +393,7 @@ sub add_message { warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } - $num; + $smsg->{num}; } # returns begin and end PostingIterator @@ -530,9 +533,10 @@ sub unindex_mm { } sub index_both { - my ($self, $mime, $bytes, $blob) = @_; + my ($self, $mime, $smsg) = @_; my $num = index_mm($self, $mime); - add_message($self, $mime, $bytes, $num, $blob); + $smsg->{num} = $num; + add_message($self, $mime, $smsg); } sub unindex_both { @@ -595,8 +599,11 @@ sub read_log { next; } my $mime = do_cat_mail($git, $blob, \$bytes) or next; + my $smsg = bless {}, 'PublicInbox::Smsg'; batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); - $add_cb->($self, $mime, $bytes, $blob); + $smsg->{blob} = $blob; + $smsg->{bytes} = $bytes; + $add_cb->($self, $mime, $smsg); } elsif ($line =~ /$delmsg/o) { my $blob = $1; $D{$blob} = 1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 74c624a4..d29e6090 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -76,7 +76,13 @@ sub shard_worker_loop ($$$$$) { $artnum = int($artnum); $self->{autime} = $autime; $self->{cotime} = $cotime; - $self->add_message($mime, $n, $artnum, $oid, $mid0); + my $smsg = bless { + bytes => $len, + num => $artnum, + blob => $oid, + mid => $mid0, + }, 'PublicInbox::Smsg'; + $self->add_message($mime, $smsg); } } $self->worker_done; @@ -95,7 +101,13 @@ sub index_raw { $self->begin_txn_lazy; $self->{autime} = $at; $self->{cotime} = $ct; - $self->add_message($mime, $bytes, $artnum, $oid, $mid0); + my $smsg = bless { + bytes => $bytes, + num => $artnum, + blob => $oid, + mid => $mid0, + }, 'PublicInbox::Smsg'; + $self->add_message($mime, $smsg); } } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index d39a6f89..34dd139b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -150,7 +150,13 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$$$$) { my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self); + my $smsg = bless { + bytes => $len, + num => $num, + blob => $oid, + mid => $mid0, + }, 'PublicInbox::Smsg'; + $self->{over}->add_overview($mime, $smsg, $self); my $idx = idx_shard($self, $num % $self->{shards}); $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self); my $n = $self->{transact_bytes} += $len; diff --git a/t/search-thr-index.t b/t/search-thr-index.t index 6a5fd919..f073304a 100644 --- a/t/search-thr-index.t +++ b/t/search-thr-index.t @@ -9,6 +9,7 @@ use PublicInbox::MID qw(mids); use Email::MIME; require_mods(qw(DBD::SQLite Search::Xapian)); require PublicInbox::SearchIdx; +require PublicInbox::Smsg; require PublicInbox::Inbox; my ($tmpdir, $for_destroy) = tmpdir(); my $git_dir = "$tmpdir/a.git"; @@ -45,7 +46,13 @@ foreach (reverse split(/\n\n/, $data)) { $mime->header_set('To' => 'git@vger.kernel.org'); my $bytes = bytes::length($mime->as_string); my $mid = mids($mime->header_obj)->[0]; - my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid); + my $smsg = bless { + bytes => $bytes, + num => ++$num, + mid => $mid, + blob => '', + }, 'PublicInbox::Smsg'; + my $doc_id = $rw->add_message($mime, $smsg); push @mids, $mid; ok($doc_id, 'message added: '. $mid); } @@ -86,7 +93,13 @@ SELECT tid FROM over WHERE num = ? LIMIT 1 my $bytes = bytes::length($mime->as_string); my $mid = mids($mime->header_obj)->[0]; - my $doc_id = $rw->add_message($mime, $bytes, $num, 'ignored', $mid); + my $smsg = bless { + bytes => $bytes, + num => $num, + mid => $mid, + blob => '', + }, 'PublicInbox::Smsg'; + my $doc_id = $rw->add_message($mime, $smsg); ok($doc_id, 'message reindexed'. $mid); is($doc_id, $num, "article number unchanged: $num");