From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5EFDD1F855 for ; Fri, 5 Aug 2016 01:03:00 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] searchmsg: add git object ID to doc_data Date: Fri, 5 Aug 2016 01:03:00 +0000 Message-Id: <20160805010300.7053-1-e@80x24.org> List-Id: Doing git tree lookups based on the SHA-1 of the Message-ID is expensive as trees get larger; instead, use the SHA-1 object ID directly. This drastically reduces the amount of time spent in the "git cat-file --batch" process for fetching the /$INBOX/all.mbox.gz endpoint on the ~800MB git@vger.kernel.org mirror. This retains backwards compatibility and allows existing indices to be transparently upgraded without performance degradation. 
--- lib/PublicInbox/Inbox.pm | 12 ++++++++++++ lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/SearchIdx.pm | 24 +++++++++++++----------- lib/PublicInbox/SearchMsg.pm | 20 ++++++++++++++++---- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 4fbbb52..e552cd4 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -162,6 +162,18 @@ sub msg_by_path ($$;$) { $str; } +sub msg_by_smsg ($$;$) { + my ($self, $smsg, $ref) = @_; + + # backwards compat to fallback to msg_by_mid + # TODO: remove if we bump SCHEMA_VERSION in Search.pm: + defined(my $blob = $smsg->blob) or return msg_by_mid($self, $smsg->mid); + + my $str = git($self)->cat_file($blob, $ref); + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; + $str; +} + sub path_check { my ($self, $path) = @_; git($self)->check('HEAD:'.$path); diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index d2c0954..1e3de5b 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -129,7 +129,7 @@ sub getline { my $gz = $self->{gz}; do { while (defined(my $smsg = shift @{$self->{msgs}})) { - my $msg = eval { $ibx->msg_by_mid($smsg->mid) } or next; + my $msg = eval { $ibx->msg_by_smsg($smsg) } or next; $msg = Email::Simple->new($msg); $gz->write(PublicInbox::Mbox::msg_str($ctx, $msg)); my $bref = $self->{buf}; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f0a3687..f8249c5 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -91,7 +91,7 @@ sub add_val { } sub add_message { - my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; my ($doc_id, $old_tid); @@ -170,7 +170,7 @@ sub add_message { }); link_message($self, $smsg, $old_tid); - $doc->set_data($smsg->to_doc_data); + $doc->set_data($smsg->to_doc_data($blob)); if (defined $doc_id) { 
$db->replace_document($doc_id, $doc); } else { @@ -279,8 +279,8 @@ sub link_message { } sub index_blob { - my ($self, $git, $mime, $bytes, $num) = @_; - $self->add_message($mime, $bytes, $num); + my ($self, $git, $mime, $bytes, $num, $blob) = @_; + $self->add_message($mime, $bytes, $num, $blob); } sub unindex_blob { @@ -300,9 +300,9 @@ sub unindex_mm { } sub index_mm2 { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $git, $mime, $bytes, $blob) = @_; my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $git, $mime, $bytes, $num, $blob); } sub unindex_mm2 { @@ -312,9 +312,9 @@ sub unindex_mm2 { } sub index_both { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $git, $mime, $bytes, $blob) = @_; my $num = index_mm($self, $git, $mime); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $git, $mime, $bytes, $num, $blob); } sub unindex_both { @@ -355,10 +355,12 @@ sub rlog { my $line; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1, \$bytes) or next; - $add_cb->($self, $git, $mime, $bytes); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + $add_cb->($self, $git, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { - my $mime = do_cat_mail($git, $1) or next; + my $blob = $1; + my $mime = do_cat_mail($git, $blob) or next; $del_cb->($self, $git, $mime); } elsif ($line =~ /^commit ($h40)/o) { if (defined $max && --$max <= 0) { diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 4b0b645..9d873c4 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -38,7 +38,7 @@ sub load_doc { my $data = $doc->get_data or return; my $ts = get_val($doc, &PublicInbox::Search::TS); utf8::decode($data); - my ($subj, $from, $refs, $to, $cc) = split(/\n/, $data); + my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data); bless { doc => $doc, subject => $subj, @@ 
-47,6 +47,7 @@ sub load_doc { references => $refs, to => $to, cc => $cc, + blob => $blob, }, $class; } @@ -105,9 +106,11 @@ sub ts { } sub to_doc_data { - my ($self) = @_; - join("\n", $self->subject, $self->from, $self->references, - $self->to, $self->cc); + my ($self, $blob) = @_; + my @rows = ($self->subject, $self->from, $self->references, + $self->to, $self->cc); + push @rows, $blob if defined $blob; + join("\n", @rows); } sub references { @@ -185,6 +188,15 @@ sub mid ($;$) { sub _extract_mid { mid_clean(mid_mime($_[0]->mime)) } +sub blob { + my ($self, $x40) = @_; + if (defined $x40) { + $self->{blob} = $x40; + } else { + $self->{blob}; + } +} + sub mime { my ($self, $mime) = @_; if (defined $mime) { -- EW