user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/5] search: common Subject: normalization for Re: prefixes
Date: Tue, 18 Aug 2015 01:21:08 +0000	[thread overview]
Message-ID: <1439860870-8086-3-git-send-email-e@80x24.org> (raw)
In-Reply-To: <1439860870-8086-1-git-send-email-e@80x24.org>

Drop German ("Aw:") support: it is non-standard and is not
supported by Mail::Thread.  Non-English prefixes are also more
likely to conflict with the ("subsection:") prefixes that are
common in Free Software development, where English is the
common language.

Anyway, we don't filter "Vs: " (Finnish) or "Sv: "
(Norwegian, Swedish, Danish, Icelandic) either.

ref:
https://en.wikipedia.org/wiki/RE_(e-mail)#Abbreviations_in_other_languages
---
 lib/PublicInbox/Search.pm |  6 +++++-
 lib/PublicInbox/View.pm   | 19 +++++++------------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index db86301..6a05ce7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -10,6 +10,10 @@ require PublicInbox::View;
 use Email::MIME;
 use PublicInbox::MID qw/mid_clean mid_compressed/;
 
+# This is English-only, everything else is non-standard and may be confused as
+# a prefix common in patch emails
+our $REPLY_RE = qr/^re:\s+/i;
+
 use constant {
 	TS => 0,
 	# SCHEMA_VERSION history
@@ -490,7 +494,7 @@ sub subject_path {
 
 	$subj =~ s/\A\s+//;
 	$subj =~ s/\s+\z//;
-	$subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
+	$subj =~ s/$REPLY_RE//igo; # remove reply prefix
 	$subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g;
 	lc($subj);
 }
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index b0b8e14..7122a38 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -457,6 +457,7 @@ sub html_footer {
 		if (my $c = $res->{count}) {
 			$c = $c == 1 ? '1 followup' : "$c followups";
 			$idx .= "\n$c:\n";
+			$res->{srch} = $srch;
 			thread_followups(\$idx, $mime, $res);
 		} else {
 			$idx .= "\n(no followups, yet)\n";
@@ -493,13 +494,14 @@ sub anchor_for {
 
 sub simple_dump {
 	my ($dst, $root, $node, $level) = @_;
+	# $root = [ Root Message-ID, \%seen, $srch ];
 	my $pfx = '  ' x $level;
 	$$dst .= $pfx;
 	if (my $x = $node->message) {
 		my $mid = $x->header('Message-ID');
 		if ($root->[0] ne $mid) {
 			my $s = $x->header('Subject');
-			my $h = hash_subj($s);
+			my $h = $root->[2]->subject_path($s);
 			if ($root->[1]->{$h}) {
 				$s = '';
 			} else {
@@ -525,15 +527,6 @@ sub simple_dump {
 	simple_dump($dst, $root, $node->next, $level) if $node->next;
 }
 
-sub hash_subj {
-	my ($subj) = @_;
-	$subj =~ s/\A\s+//;
-	$subj =~ s/\s+\z//;
-	$subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
-	$subj =~ s/\s+/ /;
-	Digest::SHA::sha1($subj);
-}
-
 sub thread_followups {
 	my ($dst, $root, $res) = @_;
 	my @msgs = map { $_->mini_mime } @{$res->{msgs}};
@@ -542,8 +535,10 @@ sub thread_followups {
 	my $th = PublicInbox::Thread->new($root, @msgs);
 	$th->thread;
 	$th->order(*PublicInbox::Thread::sort_ts);
-	$root = [ $root->header('Message-ID'),
-		  { hash_subj($root->header('Subject')) => 1 } ];
+	my $srch = $res->{srch};
+	my $subj = $srch->subject_path($root->header('Subject'));
+	my %seen = ($subj => 1);
+	$root = [ $root->header('Message-ID'), \%seen, $srch ];
 	simple_dump($dst, $root, $_, 0) for $th->rootset;
 }
 
-- 
EW


  parent reply	other threads:[~2015-08-18  1:21 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
2015-08-18  1:21 ` [PATCH 2/5] search: avoid creating ghosts for circular References Eric Wong
2015-08-18  1:21 ` Eric Wong [this message]
2015-08-18  1:21 ` [PATCH 4/5] search: expose $PublicInbox::Search::LANG variable Eric Wong
2015-08-18  1:21 ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1439860870-8086-3-git-send-email-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).