* [PATCH 2/3] search: disable Message-ID compression in Xapian
2015-09-03 1:57 7% [PATCH 0/3] improve external Message-ID handling Eric Wong
@ 2015-09-03 1:57 5% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2015-09-03 1:57 UTC (permalink / raw)
To: meta
We'll continue to compress long Message-IDs in URLs (which we know
about), but we will store entire, uncompressed Message-IDs in the
Xapian database to make lookups in external databases easier.
---
lib/PublicInbox/ExtMsg.pm | 9 ++++-----
lib/PublicInbox/Search.pm | 6 +++---
lib/PublicInbox/SearchIdx.pm | 37 +++++++++++++------------------------
lib/PublicInbox/SearchMsg.pm | 4 ++--
t/search.t | 3 +--
5 files changed, 23 insertions(+), 36 deletions(-)
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 1c0887c..bdbff78 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -12,7 +12,6 @@ sub ext_msg {
my $pi_config = $ctx->{pi_config};
my $listname = $ctx->{listname};
my $mid = $ctx->{mid};
- my $cmid = mid_compress($mid);
eval { require PublicInbox::Search };
my $have_xap = $@ ? 0 : 1;
@@ -35,13 +34,13 @@ sub ext_msg {
if ($have_xap) {
my $doc_id = eval {
my $s = PublicInbox::Search->new($git_dir);
- $s->find_unique_doc_id('mid', $cmid);
+ $s->find_unique_doc_id('mid', $mid);
};
if ($@) {
# xapian not configured for this repo
} else {
# maybe we found it!
- return r302($url, $cmid) if (defined $doc_id);
+ return r302($url, $mid) if (defined $doc_id);
# no point in trying the fork fallback if we
# know Xapian is up-to-date but missing the
@@ -55,7 +54,7 @@ sub ext_msg {
}
# Xapian not installed or configured for some repos
- my $path = "HEAD:" . mid2path($cmid);
+ my $path = "HEAD:" . mid2path($mid);
foreach my $n (@nox) {
my @cmd = ('git', "--git-dir=$n->{git_dir}", 'cat-file',
@@ -70,7 +69,7 @@ sub ext_msg {
my $type = eval { local $/; <$fh> };
close $fh;
if ($? == 0 && $type eq "blob\n") {
- return r302($n->{url}, $cmid);
+ return r302($n->{url}, $mid);
}
}
}
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8b32ef3..eb49f72 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -26,14 +26,15 @@ use constant {
# 6 - preserve References: order in document data
# 7 - remove references and inreplyto terms
# 8 - remove redundant/unneeded document data
- SCHEMA_VERSION => 8,
+ # 9 - disable Message-ID compression
+ SCHEMA_VERSION => 9,
QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
};
# setup prefixes
my %bool_pfx_internal = (
type => 'T', # "mail" or "ghost"
- mid => 'Q', # uniQue id (Message-ID or mid_compress)
+ mid => 'Q', # uniQue id (Message-ID)
);
my %bool_pfx_external = (
@@ -171,7 +172,6 @@ sub date_range_processor {
sub lookup_message {
my ($self, $mid) = @_;
$mid = mid_clean($mid);
- $mid = mid_compress($mid);
my $doc_id = $self->find_unique_doc_id('mid', $mid);
my $smsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 32e0714..ee85268 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -41,8 +41,7 @@ sub add_message {
my $db = $self->{xdb};
my $doc_id;
- my $mid_orig = mid_clean($mime->header('Message-ID'));
- my $mid = mid_compress($mid_orig);
+ my $mid = mid_clean($mime->header('Message-ID'));
my $was_ghost = 0;
my $ct_msg = $mime->header('Content-Type') || 'text/plain';
@@ -139,7 +138,7 @@ sub add_message {
};
if ($@) {
- warn "failed to index message <$mid_orig>: $@\n";
+ warn "failed to index message <$mid>: $@\n";
return undef;
}
$doc_id;
@@ -147,11 +146,10 @@ sub add_message {
# returns deleted doc_id on success, undef on missing
sub remove_message {
- my ($self, $mid_orig) = @_;
+ my ($self, $mid) = @_;
my $db = $self->{xdb};
my $doc_id;
- $mid_orig = mid_clean($mid_orig);
- my $mid = mid_compress($mid_orig);
+ $mid = mid_clean($mid);
eval {
$doc_id = $self->find_unique_doc_id('mid', $mid);
@@ -159,7 +157,7 @@ sub remove_message {
};
if ($@) {
- warn "failed to remove message <$mid_orig>: $@\n";
+ warn "failed to remove message <$mid>: $@\n";
return undef;
}
$doc_id;
@@ -204,32 +202,24 @@ sub link_message {
sub link_message_to_parents {
my ($self, $smsg) = @_;
my $doc = $smsg->{doc};
- my $mid = mid_compress($smsg->mid);
+ my $mid = $smsg->mid;
my $mime = $smsg->mime;
my $refs = $mime->header('References');
my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
- my $irt = $mime->header('In-Reply-To');
- if ($irt) {
- $irt = mid_compress(mid_clean($irt));
-
- # maybe some crazies will try to make a circular reference:
- if ($irt eq $mid) {
- $irt = undef;
- } else {
- # last References should be $irt
- # we will de-dupe later
- push @refs, $irt;
- }
+ if (my $irt = $mime->header('In-Reply-To')) {
+ # last References should be $irt
+ # we will de-dupe later
+ push @refs, mid_clean($irt);
}
my $tid;
if (@refs) {
- my @crefs = map { mid_compress($_) } @refs;
my %uniq = ($mid => 1);
+ my @orig_refs = @refs;
+ @refs = ();
# prevent circular references via References: here:
- @refs = ();
- foreach my $ref (@crefs) {
+ foreach my $ref (@orig_refs) {
next if $uniq{$ref};
$uniq{$ref} = 1;
push @refs, $ref;
@@ -342,7 +332,6 @@ sub _resolve_mid_to_tid {
sub create_ghost {
my ($self, $mid, $tid) = @_;
- $mid = mid_compress($mid);
$tid = $self->next_thread_id unless defined $tid;
my $doc = Search::Xapian::Document->new;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 1821b07..3891823 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -9,7 +9,7 @@ use Email::Address qw//;
use Email::Simple qw//;
use POSIX qw//;
use Date::Parse qw/str2time/;
-use PublicInbox::MID qw/mid_clean mid_compress/;
+use PublicInbox::MID qw/mid_clean/;
use Encode qw/find_encoding/;
my $enc_utf8 = find_encoding('UTF-8');
our $PFX2TERM_RE = undef;
@@ -167,7 +167,7 @@ sub _extract_mid {
my ($self) = @_;
my $mid = $self->mime->header('Message-ID');
- $mid ? mid_compress(mid_clean($mid)) : $mid;
+ defined $mid ? mid_clean($mid) : $mid;
}
sub mime {
diff --git a/t/search.t b/t/search.t
index 02189ac..4ad8a31 100644
--- a/t/search.t
+++ b/t/search.t
@@ -192,7 +192,6 @@ sub filter_mids {
$rw_commit->();
$ro->reopen;
my $long_mid = 'last' . ('x' x 60). '@s';
- my $long_midc = Digest::SHA::sha1_hex($long_mid);
my $long = Email::MIME->create(
header_str => [
@@ -232,7 +231,7 @@ sub filter_mids {
$ro->reopen;
my $t = $ro->get_thread('root@s');
is($t->{total}, 4, "got all 4 mesages in thread");
- my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_midc);
+ my @exp = sort($long_reply_mid, 'root@s', 'last@s', $long_mid);
@res = filter_mids($t);
is_deeply(\@res, \@exp, "get_thread works");
}
--
EW
^ permalink raw reply related [relevance 5%]
* [PATCH 0/3] improve external Message-ID handling
@ 2015-09-03 1:57 7% Eric Wong
2015-09-03 1:57 5% ` [PATCH 2/3] search: disable Message-ID compression in Xapian Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2015-09-03 1:57 UTC (permalink / raw)
To: meta
Yet another Xapian schema change. We no longer compress Message-IDs
in Xapian, so looking for cross-posted messages in external archives
should be possible.
Eric Wong (3):
view: include ghost messages in thread views
search: disable Message-ID compression in Xapian
ExtMsg: 300 to external mailing list archives
lib/PublicInbox/ExtMsg.pm | 39 +++++++++++++++++++++++++------
lib/PublicInbox/Feed.pm | 29 ++++++++---------------
lib/PublicInbox/Search.pm | 6 ++---
lib/PublicInbox/SearchIdx.pm | 37 +++++++++++------------------
lib/PublicInbox/SearchMsg.pm | 4 ++--
lib/PublicInbox/View.pm | 55 ++++++++++++++++++++++++++++++++++++--------
public-inbox.cgi | 1 +
t/cgi.t | 6 ++---
t/search.t | 3 +--
9 files changed, 111 insertions(+), 69 deletions(-)
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2015-09-03 1:57 7% [PATCH 0/3] improve external Message-ID handling Eric Wong
2015-09-03 1:57 5% ` [PATCH 2/3] search: disable Message-ID compression in Xapian Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).