From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS6315 166.70.0.0/16 X-Spam-Status: No, score=-3.7 required=3.0 tests=AWL,BAYES_00, RCVD_IN_DNSWL_LOW,SPF_PASS shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from out02.mta.xmission.com (out02.mta.xmission.com [166.70.13.232]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by dcvr.yhbt.net (Postfix) with ESMTPS id 8E8CC208E9; Wed, 18 Jul 2018 16:54:01 +0000 (UTC) Received: from in01.mta.xmission.com ([166.70.13.51]) by out02.mta.xmission.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.87) (envelope-from ) id 1ffpiC-0007wa-Vz; Wed, 18 Jul 2018 10:54:01 -0600 Received: from [97.119.167.31] (helo=x220.int.ebiederm.org) by in01.mta.xmission.com with esmtpsa (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.87) (envelope-from ) id 1ffpiB-00065S-1n; Wed, 18 Jul 2018 10:54:00 -0600 From: "Eric W. Biederman" To: Eric Wong Cc: meta@public-inbox.org, "Eric W. Biederman" Date: Wed, 18 Jul 2018 11:53:24 -0500 Message-Id: <20180718165325.19834-2-ebiederm@xmission.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <87pnzkh4fx.fsf_-_@xmission.com> References: <87pnzkh4fx.fsf_-_@xmission.com> X-XM-SPF: eid=1ffpiB-00065S-1n;;;mid=<20180718165325.19834-2-ebiederm@xmission.com>;;;hst=in01.mta.xmission.com;;;ip=97.119.167.31;;;frm=ebiederm@xmission.com;;;spf=neutral X-XM-AID: U2FsdGVkX19wA2Cj8zsIQ2QKFS3G/5RgtjJKF9OtvtM= X-SA-Exim-Connect-IP: 97.119.167.31 X-SA-Exim-Mail-From: ebiederm@xmission.com Subject: [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional X-SA-Exim-Version: 4.2.1 (built Thu, 05 May 2016 13:38:54 -0600) X-SA-Exim-Scanned: Yes (on in01.mta.xmission.com) List-Id: Create a new method add_xapian that holds all of the code to create Xapian indexes. 
The creation of this method simply involved identifying the relevant code and moving it from add_message. A call is added to add_xapian from add_message to keep everything working as it currently does. The new call is made conditional upon index levels of 'full' and 'medium'. These are the index levels that index positions and terms, the two things public-inbox uses Xapian to index. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 172 ++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 83 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index b19618c71508..8978914ab087 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -268,10 +268,95 @@ sub index_body ($$$) { @$lines = (); } +sub add_xapian ($$$$$) { + my ($self, $mime, $num, $oid, $mids, $mid0) = @_; + my $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + my $subj = $smsg->subject; + add_val($doc, PublicInbox::Search::TS(), $smsg->ts); + my @ds = gmtime($smsg->ds); + my $yyyymmdd = strftime('%Y%m%d', @ds); + add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); + my $dt = strftime('%Y%m%d%H%M%S', @ds); + add_val($doc, PublicInbox::Search::DT(), $dt); + + my $tg = $self->term_generator; + + $tg->set_document($doc); + $self->index_text($subj, 1, 'S') if $subj; + $self->index_users($smsg); + + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $self->index_text($fn, 1, 'XFN'); + } + + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; + + my (@orig, @quot); + my $body = $part->body; + my 
@lines = split(/\n/, $body); + while (defined(my $l = shift @lines)) { + if ($l =~ /^>/) { + $self->index_body(\@orig, $doc) if @orig; + push @quot, $l; + } else { + $self->index_body(\@quot, 0) if @quot; + push @orig, $l; + } + } + $self->index_body(\@quot, 0) if @quot; + $self->index_body(\@orig, $doc) if @orig; + }); + + foreach my $mid (@$mids) { + $self->index_text($mid, 1, 'XM'); + + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + $self->index_text(join(' ', @long), 1, 'XM'); + } + } + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids); + my $data = $smsg->to_doc_data($oid, $mid0); + $doc->set_data($data); + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $pfx = $alt->{xprefix}; + foreach my $mid (@$mids) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_boolean_term($pfx . $id); + } + } + } + $doc->add_boolean_term('Q' . 
$_) foreach @$mids; + $self->{xdb}->replace_document($num, $doc); +} + sub add_message { # mime = Email::MIME object my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; - my $doc_id; + my $xapianlevels = qr/\A(?:full|medium)\z/; my $mids = mids($mime->header_obj); $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility unless (defined $num) { # v1 @@ -279,98 +364,19 @@ sub add_message { $num = index_mm($self, $mime); } eval { - my $smsg = PublicInbox::SearchMsg->new($mime); - my $doc = $smsg->{doc}; - my $subj = $smsg->subject; - add_val($doc, PublicInbox::Search::TS(), $smsg->ts); - my @ds = gmtime($smsg->ds); - my $yyyymmdd = strftime('%Y%m%d', @ds); - add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); - my $dt = strftime('%Y%m%d%H%M%S', @ds); - add_val($doc, PublicInbox::Search::DT(), $dt); - - my $tg = $self->term_generator; - - $tg->set_document($doc); - $self->index_text($subj, 1, 'S') if $subj; - $self->index_users($smsg); - - msg_iter($mime, sub { - my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || 'text/plain'; - my $fn = $part->filename; - if (defined $fn && $fn ne '') { - $self->index_text($fn, 1, 'XFN'); - } - - return if $ct =~ m!\btext/x?html\b!i; - - my $s = eval { $part->body_str }; - if ($@) { - if ($ct =~ m!\btext/plain\b!i) { - # Try to assume UTF-8 because Alpine - # seems to do wacky things and set - # charset=X-UNKNOWN - $part->charset_set('UTF-8'); - $s = eval { $part->body_str }; - $s = $part->body if $@; - } - } - defined $s or return; - - my (@orig, @quot); - my $body = $part->body; - my @lines = split(/\n/, $body); - while (defined(my $l = shift @lines)) { - if ($l =~ /^>/) { - $self->index_body(\@orig, $doc) if @orig; - push @quot, $l; - } else { - $self->index_body(\@quot, 0) if @quot; - push @orig, $l; - } - } - $self->index_body(\@quot, 0) if @quot; - $self->index_body(\@orig, $doc) if @orig; - }); - - foreach my $mid (@$mids) { - $self->index_text($mid, 1, 'XM'); - - # because too many Message-IDs are 
prefixed with - # "Pine.LNX."... - if ($mid =~ /\w{12,}/) { - my @long = ($mid =~ /(\w{3,}+)/g); - $self->index_text(join(' ', @long), 1, 'XM'); - } + if ($self->{indexlevel} =~ $xapianlevels) { + $self->add_xapian($mime, $num, $oid, $mids, $mid0) } - $smsg->{to} = $smsg->{cc} = ''; - PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids); - my $data = $smsg->to_doc_data($oid, $mid0); - $doc->set_data($data); - if (my $altid = $self->{-altid}) { - foreach my $alt (@$altid) { - my $pfx = $alt->{xprefix}; - foreach my $mid (@$mids) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_boolean_term($pfx . $id); - } - } - } - if (my $over = $self->{over}) { $over->add_overview($mime, $bytes, $num, $oid, $mid0); } - $doc->add_boolean_term('Q' . $_) foreach @$mids; - $self->{xdb}->replace_document($doc_id = $num, $doc); }; if ($@) { warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } - $doc_id; + $num; } # returns begin and end PostingIterator -- 2.17.1