From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS16276 167.114.0.0/16 X-Spam-Status: No, score=-2.2 required=3.0 tests=AWL,BAYES_00,RCVD_IN_XBL shortcircuit=no autolearn=no version=3.3.2 X-Original-To: meta@public-inbox.org Received: from 80x24.org (mars.m3l.io [167.114.185.125]) by dcvr.yhbt.net (Postfix) with ESMTP id B2104200EC for ; Thu, 3 Sep 2015 01:57:23 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/3] search: disable Message-ID compression in Xapian Date: Thu, 3 Sep 2015 01:57:11 +0000 Message-Id: <1441245432-16378-3-git-send-email-e@80x24.org> In-Reply-To: <1441245432-16378-1-git-send-email-e@80x24.org> References: <1441245432-16378-1-git-send-email-e@80x24.org> List-Id: We'll continue to compress long Message-IDs in URLs (which we know about), but we will store entire Message-IDs in the Xapian database to facilitate ease-of-lookups in external databases. --- lib/PublicInbox/ExtMsg.pm | 9 ++++----- lib/PublicInbox/Search.pm | 6 +++--- lib/PublicInbox/SearchIdx.pm | 37 +++++++++++++------------------------ lib/PublicInbox/SearchMsg.pm | 4 ++-- t/search.t | 3 +-- 5 files changed, 23 insertions(+), 36 deletions(-) diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 1c0887c..bdbff78 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -12,7 +12,6 @@ sub ext_msg { my $pi_config = $ctx->{pi_config}; my $listname = $ctx->{listname}; my $mid = $ctx->{mid}; - my $cmid = mid_compress($mid); eval { require PublicInbox::Search }; my $have_xap = $@ ? 0 : 1; @@ -35,13 +34,13 @@ sub ext_msg { if ($have_xap) { my $doc_id = eval { my $s = PublicInbox::Search->new($git_dir); - $s->find_unique_doc_id('mid', $cmid); + $s->find_unique_doc_id('mid', $mid); }; if ($@) { # xapian not configured for this repo } else { # maybe we found it! - return r302($url, $cmid) if (defined $doc_id); + return r302($url, $mid) if (defined $doc_id); # no point in trying the fork fallback if we # know Xapian is up-to-date but missing the @@ -55,7 +54,7 @@ sub ext_msg { } # Xapian not installed or configured for some repos - my $path = "HEAD:" . mid2path($cmid); + my $path = "HEAD:" . mid2path($mid); foreach my $n (@nox) { my @cmd = ('git', "--git-dir=$n->{git_dir}", 'cat-file', @@ -70,7 +69,7 @@ sub ext_msg { my $type = eval { local $/; <$fh> }; close $fh; if ($? == 0 && $type eq "blob\n") { - return r302($n->{url}, $cmid); + return r302($n->{url}, $mid); } } } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 8b32ef3..eb49f72 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -26,14 +26,15 @@ use constant { # 6 - preserve References: order in document data # 7 - remove references and inreplyto terms # 8 - remove redundant/unneeded document data - SCHEMA_VERSION => 8, + # 9 - disable Message-ID compression + SCHEMA_VERSION => 9, QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; # setup prefixes my %bool_pfx_internal = ( type => 'T', # "mail" or "ghost" - mid => 'Q', # uniQue id (Message-ID or mid_compress) + mid => 'Q', # uniQue id (Message-ID) ); my %bool_pfx_external = ( @@ -171,7 +172,6 @@ sub date_range_processor { sub lookup_message { my ($self, $mid) = @_; $mid = mid_clean($mid); - $mid = mid_compress($mid); my $doc_id = $self->find_unique_doc_id('mid', $mid); my $smsg; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 32e0714..ee85268 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -41,8 +41,7 @@ sub add_message { my $db = $self->{xdb}; my $doc_id; - my $mid_orig = mid_clean($mime->header('Message-ID')); - my $mid = mid_compress($mid_orig); + my $mid = mid_clean($mime->header('Message-ID')); my $was_ghost = 0; my $ct_msg = $mime->header('Content-Type') || 'text/plain'; @@ -139,7 +138,7 @@ sub add_message { }; if ($@) { - warn "failed to index message <$mid_orig>: $@\n"; + warn "failed to index message <$mid>: $@\n"; return undef; } $doc_id; @@ -147,11 +146,10 @@ sub add_message { # returns deleted doc_id on success, undef on missing sub remove_message { - my ($self, $mid_orig) = @_; + my ($self, $mid) = @_; my $db = $self->{xdb}; my $doc_id; - $mid_orig = mid_clean($mid_orig); - my $mid = mid_compress($mid_orig); + $mid = mid_clean($mid); eval { $doc_id = $self->find_unique_doc_id('mid', $mid); @@ -159,7 +157,7 @@ sub remove_message { }; if ($@) { - warn "failed to remove message <$mid_orig>: $@\n"; + warn "failed to remove message <$mid>: $@\n"; return undef; } $doc_id; @@ -204,32 +202,24 @@ sub link_message { sub link_message_to_parents { my ($self, $smsg) = @_; my $doc = $smsg->{doc}; - my $mid = mid_compress($smsg->mid); + my $mid = $smsg->mid; my $mime = $smsg->mime; my $refs = $mime->header('References'); my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - my $irt = $mime->header('In-Reply-To'); - if ($irt) { - $irt = mid_compress(mid_clean($irt)); - - # maybe some crazies will try to make a circular reference: - if ($irt eq $mid) { - $irt = undef; - } else { - # last References should be $irt - # we will de-dupe later - push @refs, $irt; - } + if (my $irt = $mime->header('In-Reply-To')) { + # last References should be $irt + # we will de-dupe later + push @refs, mid_clean($irt); } my $tid; if (@refs) { - my @crefs = map { mid_compress($_) } @refs; my %uniq = ($mid => 1); + my @orig_refs = @refs; + @refs = (); # prevent circular references via References: here: - @refs = (); - foreach my $ref (@crefs) { + foreach my $ref (@orig_refs) { next if $uniq{$ref}; $uniq{$ref} = 1; push @refs, $ref; @@ -342,7 +332,6 @@ sub _resolve_mid_to_tid { sub create_ghost { my ($self, $mid, $tid) = @_; - $mid = mid_compress($mid); $tid = $self->next_thread_id unless defined $tid; my $doc = Search::Xapian::Document->new; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 1821b07..3891823 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -9,7 +9,7 @@ use Email::Address qw//; use Email::Simple qw//; use POSIX qw//; use Date::Parse qw/str2time/; -use PublicInbox::MID qw/mid_clean mid_compress/; +use PublicInbox::MID qw/mid_clean/; use Encode qw/find_encoding/; my $enc_utf8 = find_encoding('UTF-8'); our $PFX2TERM_RE = undef; @@ -167,7 +167,7 @@ sub _extract_mid { my ($self) = @_; my $mid = $self->mime->header('Message-ID'); - $mid ? mid_compress(mid_clean($mid)) : $mid; + defined $mid ? mid_clean($mid) : $mid; } sub mime { diff --git a/t/search.t b/t/search.t index 02189ac..4ad8a31 100644 --- a/t/search.t +++ b/t/search.t @@ -192,7 +192,6 @@ sub filter_mids { $rw_commit->(); $ro->reopen; my $long_mid = 'last' . ('x' x 60). '@s'; - my $long_midc = Digest::SHA::sha1_hex($long_mid); my $long = Email::MIME->create( header_str => [ @@ -232,7 +231,7 @@ sub filter_mids { $ro->reopen; my $t = $ro->get_thread('root@s'); is($t->{total}, 4, "got all 4 mesages in thread"); - my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_midc); + my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_mid); @res = filter_mids($t); is_deeply(\@res, \@exp, "get_thread works"); } -- EW