From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.5 required=3.0 tests=ALL_TRUSTED,BAYES_00, RP_MATCHES_RCVD,URIBL_BLOCKED shortcircuit=no autolearn=unavailable version=3.3.2 X-Original-To: meta@public-inbox.org Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A55EF1F81C for ; Tue, 18 Aug 2015 01:21:10 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/5] search: common Subject: normalization for Re: prefixes Date: Tue, 18 Aug 2015 01:21:08 +0000 Message-Id: <1439860870-8086-3-git-send-email-e@80x24.org> In-Reply-To: <1439860870-8086-1-git-send-email-e@80x24.org> References: <1439860870-8086-1-git-send-email-e@80x24.org> List-Id: Drop German ("Aw:") support since it's non-standard and is not supported by Mail::Thread, and non-English prefixes are more likely to conflict with prefixes used in Free Software development where ("subsection:") prefixes are common and English is the common language. Anyway, we don't filter "Vs: " (Finnish) or "Sv: " (Norwegian, Swedish, Danish, Icelandic), either. 
ref: https://en.wikipedia.org/wiki/RE_(e-mail)#Abbreviations_in_other_languages --- lib/PublicInbox/Search.pm | 6 +++++- lib/PublicInbox/View.pm | 19 +++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index db86301..6a05ce7 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -10,6 +10,10 @@ require PublicInbox::View; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compressed/; +# This is English-only, everything else is non-standard and may be confused as +# a prefix common in patch emails +our $REPLY_RE = qr/^re:\s+/i; + use constant { TS => 0, # SCHEMA_VERSION history @@ -490,7 +494,7 @@ sub subject_path { $subj =~ s/\A\s+//; $subj =~ s/\s+\z//; - $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) + $subj =~ s/$REPLY_RE//igo; # remove reply prefix $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; lc($subj); } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index b0b8e14..7122a38 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -457,6 +457,7 @@ sub html_footer { if (my $c = $res->{count}) { $c = $c == 1 ? 
'1 followup' : "$c followups"; $idx .= "\n$c:\n"; + $res->{srch} = $srch; thread_followups(\$idx, $mime, $res); } else { $idx .= "\n(no followups, yet)\n"; @@ -493,13 +494,14 @@ sub anchor_for { sub simple_dump { my ($dst, $root, $node, $level) = @_; + # $root = [ Root Message-ID, \%seen, $srch ]; my $pfx = ' ' x $level; $$dst .= $pfx; if (my $x = $node->message) { my $mid = $x->header('Message-ID'); if ($root->[0] ne $mid) { my $s = $x->header('Subject'); - my $h = hash_subj($s); + my $h = $root->[2]->subject_path($s); if ($root->[1]->{$h}) { $s = ''; } else { @@ -525,15 +527,6 @@ sub simple_dump { simple_dump($dst, $root, $node->next, $level) if $node->next; } -sub hash_subj { - my ($subj) = @_; - $subj =~ s/\A\s+//; - $subj =~ s/\s+\z//; - $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) - $subj =~ s/\s+/ /; - Digest::SHA::sha1($subj); -} - sub thread_followups { my ($dst, $root, $res) = @_; my @msgs = map { $_->mini_mime } @{$res->{msgs}}; @@ -542,8 +535,10 @@ sub thread_followups { my $th = PublicInbox::Thread->new($root, @msgs); $th->thread; $th->order(*PublicInbox::Thread::sort_ts); - $root = [ $root->header('Message-ID'), - { hash_subj($root->header('Subject')) => 1 } ]; + my $srch = $res->{srch}; + my $subj = $srch->subject_path($root->header('Subject')); + my %seen = ($subj => 1); + $root = [ $root->header('Message-ID'), \%seen, $srch ]; simple_dump($dst, $root, $_, 0) for $th->rootset; } -- EW