about summary refs log tree commit homepage
diff options
context:
space:
mode:
author: Eric Wong <e@80x24.org> 2021-10-06 10:12:21 +0000
committer: Eric Wong <e@80x24.org> 2021-10-06 11:23:51 +0000
commit: e785573c556572a675407796604e9c0a9965ea9f (patch)
tree: 007e58811f281de2d8d51433021c85440bbc2163
parent: 89193578d21f847478f844e9e85495b9cae8842b (diff)
download: public-inbox-e785573c556572a675407796604e9c0a9965ea9f.tar.gz
This should bring us closer to the "Base subject" definition in
IMAP ORDEREDSUBJECT (RFC 5256 2.1).  Larger changes may cause
some breakage (until --reindex).  But for now, a reindex will
prevent non-ASCII subjects from being normalized to the
same fuzzy "thread" in the thread view.
-rw-r--r-- lib/PublicInbox/OverIdx.pm 7
-rw-r--r-- lib/PublicInbox/Smsg.pm 2
2 files changed, 6 insertions, 3 deletions
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 2e3d4534..0c8a4d9e 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -243,12 +243,13 @@ sub link_refs {
         $tid;
 }
 
-# normalize subjects so they are suitable as pathnames for URLs
-# XXX: consider for removal
+# normalize subjects somewhat, they used to be ASCII-only but now
+# we use \w for UTF-8 support.  We may still drop it entirely and
+# rely on Xapian for subject matches...
 sub subject_path ($) {
         my ($subj) = @_;
         $subj = subject_normalized($subj);
-        $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g;
+        $subj =~ s![^\w\.~/\-]+!_!g;
         lc($subj);
 }
 
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index da8ce590..fb28eff7 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -145,6 +145,8 @@ sub internaldate { # for IMAP
 
 our $REPLY_RE = qr/^re:\s+/i;
 
+# TODO: see RFC 5256 sec 2.1 "Base Subject" and evaluate compatibility
+# w/ existing indices...
 sub subject_normalized ($) {
         my ($subj) = @_;
         $subj =~ s/\A\s+//s; # no leading space