From 95acd5901491e4f333f5d2bbeed6fb5e6b53e07c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 2 Aug 2016 10:02:54 +0000 Subject: searchmsg: add git object ID to doc_data Doing git tree lookups based on the SHA-1 of the Message-ID is expensive as trees get larger; instead, use the SHA-1 object ID directly. This drastically reduces the amount of time spent in the "git cat-file --batch" process for fetching the /$INBOX/all.mbox.gz endpoint on the ~800MB git@vger.kernel.org mirror. This retains backwards compatibility and allows existing indices to be transparently upgraded without performance degradation. --- lib/PublicInbox/SearchIdx.pm | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f0a3687d..f8249c50 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -91,7 +91,7 @@ sub add_val { } sub add_message { - my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; my ($doc_id, $old_tid); @@ -170,7 +170,7 @@ sub add_message { }); link_message($self, $smsg, $old_tid); - $doc->set_data($smsg->to_doc_data); + $doc->set_data($smsg->to_doc_data($blob)); if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { @@ -279,8 +279,8 @@ sub link_message { } sub index_blob { - my ($self, $git, $mime, $bytes, $num) = @_; - $self->add_message($mime, $bytes, $num); + my ($self, $git, $mime, $bytes, $num, $blob) = @_; + $self->add_message($mime, $bytes, $num, $blob); } sub unindex_blob { @@ -300,9 +300,9 @@ sub unindex_mm { } sub index_mm2 { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $git, $mime, $bytes, $blob) = @_; my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $git, $mime, $bytes, $num, $blob); } 
sub unindex_mm2 { @@ -312,9 +312,9 @@ sub unindex_mm2 { } sub index_both { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $git, $mime, $bytes, $blob) = @_; my $num = index_mm($self, $git, $mime); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $git, $mime, $bytes, $num, $blob); } sub unindex_both { @@ -355,10 +355,12 @@ sub rlog { my $line; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1, \$bytes) or next; - $add_cb->($self, $git, $mime, $bytes); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + $add_cb->($self, $git, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { - my $mime = do_cat_mail($git, $1) or next; + my $blob = $1; + my $mime = do_cat_mail($git, $blob) or next; $del_cb->($self, $git, $mime); } elsif ($line =~ /^commit ($h40)/o) { if (defined $max && --$max <= 0) { -- cgit v1.2.3-24-ge0c7