From 3f2d9e30255824da52bca422d07e97e0859500e2 Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Sat, 3 Mar 2018 04:00:09 +0000 Subject: searchidx: support indexing multiple MIDs It's possible to have a message handle multiple terms; so use this feature to ensure messages with multiple MIDs can be found by either one. --- lib/PublicInbox/Search.pm | 1 - lib/PublicInbox/SearchIdx.pm | 121 ++++++++++++++++++++++------------- lib/PublicInbox/SearchIdxSkeleton.pm | 26 ++------ lib/PublicInbox/SearchMsg.pm | 7 ++ 4 files changed, 91 insertions(+), 64 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 74f406ad..fb7a126a 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -373,7 +373,6 @@ sub lookup_mail { # no ghosts! sub each_smsg_by_mid { my ($self, $mid, $cb) = @_; - $mid = mid_clean($mid); my $xdb = $self->{xdb}; # XXX retry_reopen isn't necessary for V2Writable, but the PSGI # interface will need it... diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 61dc0577..1c107282 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -281,29 +281,19 @@ sub index_body ($$$) { sub add_message { my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object - my $db = $self->{xdb}; - - my ($doc_id, $old_tid); - my @mids = mid_mime($mime); - if (@mids > 1) { - warn "Multi-MID: ( ",join(' | ', @mids)," )\n"; - } - my $mid = mid_clean($mids[0]); + my $doc_id; + my $mids = mids($mime->header_obj); my $skel = $self->{skeleton}; eval { - die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; - my $smsg = $self->lookup_message($mid); - if ($smsg) { - # convert a ghost to a regular message - # it will also clobber any existing regular message - $doc_id = $smsg->{doc_id}; - $old_tid = $smsg->thread_id unless $skel; - } - $smsg = PublicInbox::SearchMsg->new($mime); + my $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; - $doc->add_term('Q' . $mid); - + foreach my $mid (@$mids) { + # FIXME: may be abused to prevent archival + length($mid) > MAX_MID_SIZE and + die 'Message-ID too long'; + $doc->add_term('Q' . $mid); + } my $subj = $smsg->subject; my $xpath; if ($subj ne '') { @@ -366,31 +356,30 @@ sub add_message { # populates smsg->references for smsg->to_doc_data my $refs = parse_references($smsg); my $data = $smsg->to_doc_data($blob); - if ($skel) { - push @values, $mid, $xpath, $data; - $skel->index_skeleton(\@values); - } else { - link_message($self, $smsg, $refs, $old_tid); + foreach my $mid (@$mids) { + $tg->index_text($mid, 1, 'XM'); } - $tg->index_text($mid, 1, 'XM'); $doc->set_data($data); - if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_term($alt->{xprefix} . $id); + foreach my $mid (@$mids) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . $id); + } } } - if (defined $doc_id) { - $db->replace_document($doc_id, $doc); + if ($skel) { + push @values, $mids, $xpath, $data; + $skel->index_skeleton(\@values); + $doc_id = $self->{xdb}->add_document($doc); } else { - $doc_id = $db->add_document($doc); + $doc_id = link_and_save($self, $doc, $mids, $refs); } }; if ($@) { - warn "failed to index message <$mid>: $@\n"; + warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } $doc_id; @@ -467,27 +456,62 @@ sub parse_references ($) { \@keep; } -sub link_message { - my ($self, $smsg, $refs, $old_tid) = @_; +sub link_doc { + my ($self, $doc, $refs, $old_tid) = @_; my $tid; if (@$refs) { - # first ref *should* be the thread root, # but we can never trust clients to do the right thing my $ref = shift @$refs; - $tid = $self->_resolve_mid_to_tid($ref); - $self->merge_threads($tid, $old_tid) if defined $old_tid; + $tid = resolve_mid_to_tid($self, $ref); + merge_threads($self, $tid, $old_tid) if defined $old_tid; # the rest of the refs should point to this tid: foreach $ref (@$refs) { - my $ptid = $self->_resolve_mid_to_tid($ref); + my $ptid = resolve_mid_to_tid($self, $ref); merge_threads($self, $tid, $ptid); } } else { $tid = defined $old_tid ? $old_tid : $self->next_thread_id; } - $smsg->{doc}->add_term('G' . $tid); + $doc->add_term('G' . $tid); + $tid; +} + +sub link_and_save { + my ($self, $doc, $mids, $refs) = @_; + my $db = $self->{xdb}; + my $old_tid; + my $doc_id; + my $vivified = 0; + foreach my $mid (@$mids) { + $self->each_smsg_by_mid($mid, sub { + my ($cur) = @_; + my $type = $cur->type; + my $cur_tid = $cur->thread_id; + $old_tid = $cur_tid unless defined $old_tid; + if ($type eq 'mail') { + # do not break existing mail messages, + # just merge the threads + merge_threads($self, $old_tid, $cur_tid); + return 1; + } + if ($type ne 'ghost') { + die "<$mid> has a bad type: $type\n"; + } + my $tid = link_doc($self, $doc, $refs, $old_tid); + $old_tid = $tid unless defined $old_tid; + $doc_id = $cur->{doc_id}; + $self->{xdb}->replace_document($doc_id, $doc); + ++$vivified; + 1; + }); + } + # not really important, but we return any vivified ghost docid, here: + return $doc_id if defined $doc_id; + link_doc($self, $doc, $refs, $old_tid); + $self->{xdb}->add_document($doc); } sub index_git_blob_id { @@ -709,11 +733,22 @@ sub _index_sync { } # this will create a ghost as necessary -sub _resolve_mid_to_tid { +sub resolve_mid_to_tid { my ($self, $mid) = @_; + my $tid; + $self->each_smsg_by_mid($mid, sub { + my ($smsg) = @_; + my $cur_tid = $smsg->thread_id; + if (defined $tid) { + merge_threads($self, $tid, $cur_tid); + } else { + $tid = $smsg->thread_id; + } + 1; + }); + return $tid if defined $tid; - my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid); - $smsg->thread_id; + $self->create_ghost($mid)->thread_id; } sub create_ghost { diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 333f965d..063c83e8 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -92,34 +92,20 @@ sub index_skeleton_real ($$) { my ($self, $values) = @_; my $doc_data = pop @$values; my $xpath = pop @$values; - my $mid = pop @$values; + my $mids = pop @$values; my $ts = $values->[PublicInbox::Search::TS]; - my $smsg = $self->lookup_message($mid); - my ($old_tid, $doc_id); - if ($smsg) { - # convert a ghost to a regular message - # it will also clobber any existing regular message - $doc_id = $smsg->{doc_id}; - $old_tid = $smsg->thread_id; - } else { - $smsg = PublicInbox::SearchMsg->new(undef); - $smsg->{mid} = $mid; - } + my $smsg = PublicInbox::SearchMsg->new(undef); my $doc = $smsg->{doc}; $doc->add_term('XPATH' . $xpath) if defined $xpath; - $doc->add_term('Q' . $mid); + foreach my $mid (@$mids) { + $doc->add_term('Q' . $mid); + } PublicInbox::SearchIdx::add_values($doc, $values); $doc->set_data($doc_data); $smsg->{ts} = $ts; $smsg->load_from_data($doc_data); my @refs = ($smsg->references =~ /<([^>]+)>/g); - $self->link_message($smsg, \@refs, $old_tid); - my $db = $self->{xdb}; - if (defined $doc_id) { - $db->replace_document($doc_id, $doc); - } else { - $doc_id = $db->add_document($doc); - } + $self->link_and_save($doc, $mids, \@refs); } 1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 014f4909..a5565345 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -176,4 +176,11 @@ sub path { $self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path } +sub type { + my ($self) = @_; + my $type = $self->{type}; + return $type if defined $type; + $self->{type} = _get_term_val($self, 'T', qr/\AT/); +} + 1; -- cgit v1.2.3-24-ge0c7