From 3fc59df0d633a17e0c5e43d633d12e8772c06ec3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 10 Jan 2017 21:40:37 +0000 Subject: introduce PublicInbox::MIME wrapper class This should fix problems with multipart messages where text/plain parts lack a header. cf. git clone --mirror https://github.com/rjbs/Email-MIME.git refs/pull/28/head In the future, we may still introduce a streaming interface to reduce memory usage on large emails. --- lib/PublicInbox/SearchIdx.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 87ee0d46..d63dd7c7 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -10,7 +10,7 @@ package PublicInbox::SearchIdx; use strict; use warnings; use Fcntl qw(:flock :DEFAULT); -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); @@ -400,7 +400,7 @@ sub do_cat_mail { my $str = $git->cat_file($blob, $sizeref); # fixup bugs from import: $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - Email::MIME->new($str); + PublicInbox::MIME->new($str); }; $@ ? undef : $mime; } -- cgit v1.2.3-24-ge0c7 From 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 6 Feb 2017 19:54:25 +0000 Subject: searchidx: deal with empty In-Reply-To and References headers In some messages, these headers exist, but have empty values. Do not let empty values throw off our search indexer to tie threads together, as it can make nonsensical threads grouped to a Message-Id of "" (empty string). See for an example of such a message. 
Thanks-to: Johannes Schindelin --- lib/PublicInbox/SearchIdx.pm | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index d63dd7c7..1142ca7a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -292,11 +292,15 @@ sub link_message { my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; my $refs = $hdr->header_raw('References'); - my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); + my @refs = defined $refs ? ($refs =~ /<([^>]+)>/g) : (); my $irt = $hdr->header_raw('In-Reply-To'); if (defined $irt) { - $irt = mid_clean($irt); - $irt = undef if $mid eq $irt; + if ($irt eq '') { + $irt = undef; + } else { + $irt = mid_clean($irt); + $irt = undef if $mid eq $irt; + } } my $tid; -- cgit v1.2.3-24-ge0c7 From 5d91adedf5f33ef1cb87df2a86306ddf370b4f8d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 6 Feb 2017 21:08:13 +0000 Subject: searchidx: reindex clobbers old thread IDs We cannot always reuse thread IDs since our threading logic may change as bugs are fixed. 
--- lib/PublicInbox/SearchIdx.pm | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1142ca7a..bc003c6c 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -157,6 +157,10 @@ sub add_message { # it will also clobber any existing regular message $doc_id = $smsg->{doc_id}; $old_tid = $smsg->thread_id; + + # no need to remove_term for old_tid, we use a new + # doc to replace the old one when reindexing: + $old_tid = undef if $self->{reindex}; } $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; @@ -464,7 +468,7 @@ sub _git_log { sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; - my $reindex = $opts->{reindex}; + $self->{reindex} = $opts->{reindex}; my ($mkey, $last_commit, $lx, $xlog); $self->{git}->batch_prepare; my $xdb = _xdb_acquire($self); @@ -474,7 +478,7 @@ sub _index_sync { $mkey = 'last_commit'; $last_commit = $xdb->get_metadata('last_commit'); $lx = $last_commit; - if ($reindex) { + if ($self->{reindex}) { $lx = ''; $mkey = undef if $last_commit ne ''; } -- cgit v1.2.3-24-ge0c7 From 6e83825a9e49ca68694c20ddfed54368d5f3e075 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 6 Feb 2017 21:37:26 +0000 Subject: Revert "searchidx: reindex clobbers old thread IDs" Oops, that's broken, too. I guess the only way to reindex after fixing the thread detection is to start from scratch. This reverts commit 5d91adedf5f33ef1cb87df2a86306ddf370b4f8d. 
--- lib/PublicInbox/SearchIdx.pm | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index bc003c6c..1142ca7a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -157,10 +157,6 @@ sub add_message { # it will also clobber any existing regular message $doc_id = $smsg->{doc_id}; $old_tid = $smsg->thread_id; - - # no need to remove_term for old_tid, we use a new - # doc to replace the old one when reindexing: - $old_tid = undef if $self->{reindex}; } $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; @@ -468,7 +464,7 @@ sub _git_log { sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; - $self->{reindex} = $opts->{reindex}; + my $reindex = $opts->{reindex}; my ($mkey, $last_commit, $lx, $xlog); $self->{git}->batch_prepare; my $xdb = _xdb_acquire($self); @@ -478,7 +474,7 @@ sub _index_sync { $mkey = 'last_commit'; $last_commit = $xdb->get_metadata('last_commit'); $lx = $last_commit; - if ($self->{reindex}) { + if ($reindex) { $lx = ''; $mkey = undef if $last_commit ne ''; } -- cgit v1.2.3-24-ge0c7