* [PATCH 5/5] search: bump SCHEMA_VERSION to 4
2015-08-18 1:21 6% ` [PATCH 3/5] search: common Subject: normalization for Re: prefixes Eric Wong
@ 2015-08-18 1:21 7% ` Eric Wong
1 sibling, 0 replies; 2+ results
From: Eric Wong @ 2015-08-18 1:21 UTC (permalink / raw)
To: meta
The following two commits affect indexing behavior, so
change the schema version to avoid compatibility problems
or missing messages:
search: common Subject: normalization for Re: prefixes
search: avoid creating ghosts for circular References
---
lib/PublicInbox/Search.pm | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index d767941..b9f283f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -22,7 +22,8 @@ use constant {
# 1 - subject_path is lower-cased
# 2 - subject_path is mid_compressed in the index, only
# 3 - message-ID is compressed if it includes '%' (hack!)
- SCHEMA_VERSION => 3,
+ # 4 - change "Re: " normalization, avoid circular Reference ghosts
+ SCHEMA_VERSION => 4,
QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
};
--
EW
^ permalink raw reply related [relevance 7%]
* [PATCH 3/5] search: common Subject: normalization for Re: prefixes
@ 2015-08-18 1:21 6% ` Eric Wong
2015-08-18 1:21 7% ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong
1 sibling, 0 replies; 2+ results
From: Eric Wong @ 2015-08-18 1:21 UTC (permalink / raw)
To: meta
Drop German ("Aw:") support since it is non-standard and is not
supported by Mail::Thread.  Non-English prefixes are also more likely
to conflict with the ("subsection:") prefixes that are common in Free
Software development, where English is the common language.
Anyway, we don't filter "Vs: " (Finnish) or "Sv: "
(Norwegian, Swedish, Danish, Icelandic) either.
ref:
https://en.wikipedia.org/wiki/RE_(e-mail)#Abbreviations_in_other_languages
---
lib/PublicInbox/Search.pm | 6 +++++-
lib/PublicInbox/View.pm | 19 +++++++------------
2 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index db86301..6a05ce7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -10,6 +10,10 @@ require PublicInbox::View;
use Email::MIME;
use PublicInbox::MID qw/mid_clean mid_compressed/;
+# This is English-only, everything else is non-standard and may be confused as
+# a prefix common in patch emails
+our $REPLY_RE = qr/^re:\s+/i;
+
use constant {
TS => 0,
# SCHEMA_VERSION history
@@ -490,7 +494,7 @@ sub subject_path {
$subj =~ s/\A\s+//;
$subj =~ s/\s+\z//;
- $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
+ $subj =~ s/$REPLY_RE//igo; # remove reply prefix
$subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g;
lc($subj);
}
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index b0b8e14..7122a38 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -457,6 +457,7 @@ sub html_footer {
if (my $c = $res->{count}) {
$c = $c == 1 ? '1 followup' : "$c followups";
$idx .= "\n$c:\n";
+ $res->{srch} = $srch;
thread_followups(\$idx, $mime, $res);
} else {
$idx .= "\n(no followups, yet)\n";
@@ -493,13 +494,14 @@ sub anchor_for {
sub simple_dump {
my ($dst, $root, $node, $level) = @_;
+ # $root = [ Root Message-ID, \%seen, $srch ];
my $pfx = ' ' x $level;
$$dst .= $pfx;
if (my $x = $node->message) {
my $mid = $x->header('Message-ID');
if ($root->[0] ne $mid) {
my $s = $x->header('Subject');
- my $h = hash_subj($s);
+ my $h = $root->[2]->subject_path($s);
if ($root->[1]->{$h}) {
$s = '';
} else {
@@ -525,15 +527,6 @@ sub simple_dump {
simple_dump($dst, $root, $node->next, $level) if $node->next;
}
-sub hash_subj {
- my ($subj) = @_;
- $subj =~ s/\A\s+//;
- $subj =~ s/\s+\z//;
- $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
- $subj =~ s/\s+/ /;
- Digest::SHA::sha1($subj);
-}
-
sub thread_followups {
my ($dst, $root, $res) = @_;
my @msgs = map { $_->mini_mime } @{$res->{msgs}};
@@ -542,8 +535,10 @@ sub thread_followups {
my $th = PublicInbox::Thread->new($root, @msgs);
$th->thread;
$th->order(*PublicInbox::Thread::sort_ts);
- $root = [ $root->header('Message-ID'),
- { hash_subj($root->header('Subject')) => 1 } ];
+ my $srch = $res->{srch};
+ my $subj = $srch->subject_path($root->header('Subject'));
+ my %seen = ($subj => 1);
+ $root = [ $root->header('Message-ID'), \%seen, $srch ];
simple_dump($dst, $root, $_, 0) for $th->rootset;
}
--
EW
^ permalink raw reply related [relevance 6%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2015-08-18 1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
2015-08-18 1:21 6% ` [PATCH 3/5] search: common Subject: normalization for Re: prefixes Eric Wong
2015-08-18 1:21 7% ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).