user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 1/5] view: cleaner Message-ID filtering for References
@ 2015-08-18  1:21 Eric Wong
  2015-08-18  1:21 ` [PATCH 2/5] search: avoid creating ghosts for circular References Eric Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Eric Wong @ 2015-08-18  1:21 UTC (permalink / raw)
  To: meta

Avoid compiling a weird and potentially fragile regexp every
time and use the same logic as our search module to dedupe
References.
---
 lib/PublicInbox/View.pm | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 6fbc366..b0b8e14 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -395,10 +395,19 @@ sub headers_to_html_header {
 
 	my $refs = $header_obj->header_raw('References');
 	if ($refs) {
-		$refs =~ s/\s*\Q$irt\E\s*// if (defined $irt);
-		my @refs = ($refs =~ /<([^>]+)>/g);
+		# avoid redundant URLs wasting bandwidth
+		my %seen;
+		$seen{mid_clean($irt)} = 1 if defined $irt;
+		my @refs;
+		my @raw_refs = ($refs =~ /<([^>]+)>/g);
+		foreach my $ref (@raw_refs) {
+			next if $seen{$ref};
+			$seen{$ref} = 1;
+			push @refs, linkify_ref($ref);
+		}
+
 		if (@refs) {
-			$rv .= 'References: '. linkify_refs(@refs) . "\n";
+			$rv .= 'References: '. join(' ', @refs) . "\n";
 		}
 	}
 
@@ -466,13 +475,11 @@ sub html_footer {
 	"$irt<a\nhref=\"" . ascii_html($href) . '">reply</a>' . $idx;
 }
 
-sub linkify_refs {
-	join(' ', map {
-		my $v = PublicInbox::Hval->new_msgid($_);
-		my $html = $v->as_html;
-		my $href = $v->as_href;
-		"&lt;<a\nhref=\"$href.html\">$html</a>&gt;";
-	} @_);
+sub linkify_ref {
+	my $v = PublicInbox::Hval->new_msgid($_[0]);
+	my $html = $v->as_html;
+	my $href = $v->as_href;
+	"&lt;<a\nhref=\"$href.html\">$html</a>&gt;";
 }
 
 sub anchor_for {
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] search: avoid creating ghosts for circular References
  2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
@ 2015-08-18  1:21 ` Eric Wong
  2015-08-18  1:21 ` [PATCH 3/5] search: common Subject: normalization for Re: prefixes Eric Wong
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-08-18  1:21 UTC (permalink / raw)
  To: meta

Some mail software incorrectly creates circular references
and causes us to create ghosts before the actual mail doc
is created.
---
 lib/PublicInbox/Search.pm | 43 ++++++++++++++++++++++++++++++-------------
 t/search.t                | 19 +++++++++++++++++++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 617c267..db86301 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -79,8 +79,8 @@ sub add_message {
 	my $db = $self->{xdb};
 
 	my $doc_id;
-	my $mid = mid_clean($mime->header_obj->header_raw('Message-ID'));
-	$mid = mid_compressed($mid);
+	my $mid_orig = mid_clean($mime->header_obj->header_raw('Message-ID'));
+	my $mid = mid_compressed($mid_orig);
 	my $was_ghost = 0;
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 	my $enc_msg = PublicInbox::View::enc_for($ct_msg);
@@ -176,7 +176,7 @@ sub add_message {
 	};
 
 	if ($@) {
-		warn "failed to index message <$mid>: $@\n";
+		warn "failed to index message <$mid_orig>: $@\n";
 		return undef;
 	}
 	$doc_id;
@@ -184,11 +184,11 @@ sub add_message {
 
 # returns deleted doc_id on success, undef on missing
 sub remove_message {
-	my ($self, $mid) = @_;
+	my ($self, $mid_orig) = @_;
 	my $db = $self->{xdb};
 	my $doc_id;
-	$mid = mid_clean($mid);
-	$mid = mid_compressed($mid);
+	$mid_orig = mid_clean($mid_orig);
+	my $mid = mid_compressed($mid_orig);
 
 	eval {
 		$doc_id = $self->find_unique_doc_id('mid', $mid);
@@ -196,7 +196,7 @@ sub remove_message {
 	};
 
 	if ($@) {
-		warn "failed to remove message <$mid>: $@\n";
+		warn "failed to remove message <$mid_orig>: $@\n";
 		return undef;
 	}
 	$doc_id;
@@ -347,16 +347,33 @@ sub link_message_to_parents {
 		if ($irt =~ /<([^>]+)>/) {
 			$irt = $1;
 		}
-		push @refs, $irt;
+
+		# maybe some crazies will try to make a circular reference:
+		if ($irt eq $mid) {
+			$irt = undef;
+		} else {
+			push @refs, $irt;
+		}
 	}
 
 	my $tid;
 	if (@refs) {
-		@refs = map { mid_compressed($_) } @refs;
-		my %uniq;
-		@refs = grep { !$uniq{$_}++ } @refs; # uniq
-
-		$doc->add_term(xpfx('inreplyto') . $refs[-1]);
+		my @crefs = map { mid_compressed($_) } @refs;
+		my %uniq = ($mid => 1);
+
+		# prevent circular references via References: here:
+		@refs = ();
+		foreach my $ref (@crefs) {
+			next if $uniq{$ref};
+			$uniq{$ref} = 1;
+			push @refs, $ref;
+		}
+		$irt = undef if (defined $irt && !$uniq{$irt});
+	}
+	if (@refs) {
+		if (defined $irt) {
+			$doc->add_term(xpfx('inreplyto') . $irt);
+		}
 
 		my $ref_pfx = xpfx('references');
 
diff --git a/t/search.t b/t/search.t
index 0ad0886..55abe9e 100644
--- a/t/search.t
+++ b/t/search.t
@@ -243,6 +243,25 @@ sub filter_mids {
 		"quoted result returned if nothing else");
 }
 
+# circular references
+{
+	my $doc_id = $rw->add_message(Email::MIME->create(
+		header_str => [
+			Date => 'Sat, 02 Oct 2010 00:00:01 +0000',
+			Subject => 'Circle',
+			'Message-ID' => '<circle@a>',
+			'References' => '<circle@a>',
+			'In-Reply-To' => '<circle@a>',
+			From => 'Circle <circle@example.com>',
+			To => 'list@example.com',
+		],
+		body => "LOOP!\n"));
+	ok($doc_id > 0, "doc_id defined with circular reference");
+	my $smsg = $rw->lookup_message('circle@a');
+	$smsg->ensure_metadata;
+	is($smsg->{references}, undef, "no references created");
+}
+
 done_testing();
 
 1;
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] search: common Subject: normalization for Re: prefixes
  2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
  2015-08-18  1:21 ` [PATCH 2/5] search: avoid creating ghosts for circular References Eric Wong
@ 2015-08-18  1:21 ` Eric Wong
  2015-08-18  1:21 ` [PATCH 4/5] search: expose $PublicInbox::Search::LANG variable Eric Wong
  2015-08-18  1:21 ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-08-18  1:21 UTC (permalink / raw)
  To: meta

Drop German ("Aw:") support since it's non-standard and is not
supported by Mail::Thread.  Non-English prefixes are also more
likely to conflict with prefixes used in Free Software
development, where ("subsection:") prefixes are common and
English is the common language.

In any case, we don't filter "Vs: " (Finnish) or "Sv: "
(Norwegian, Swedish, Danish, Icelandic), either.

ref:
https://en.wikipedia.org/wiki/RE_(e-mail)#Abbreviations_in_other_languages
---
 lib/PublicInbox/Search.pm |  6 +++++-
 lib/PublicInbox/View.pm   | 19 +++++++------------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index db86301..6a05ce7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -10,6 +10,10 @@ require PublicInbox::View;
 use Email::MIME;
 use PublicInbox::MID qw/mid_clean mid_compressed/;
 
+# This is English-only, everything else is non-standard and may be confused as
+# a prefix common in patch emails
+our $REPLY_RE = qr/^re:\s+/i;
+
 use constant {
 	TS => 0,
 	# SCHEMA_VERSION history
@@ -490,7 +494,7 @@ sub subject_path {
 
 	$subj =~ s/\A\s+//;
 	$subj =~ s/\s+\z//;
-	$subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
+	$subj =~ s/$REPLY_RE//igo; # remove reply prefix
 	$subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g;
 	lc($subj);
 }
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index b0b8e14..7122a38 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -457,6 +457,7 @@ sub html_footer {
 		if (my $c = $res->{count}) {
 			$c = $c == 1 ? '1 followup' : "$c followups";
 			$idx .= "\n$c:\n";
+			$res->{srch} = $srch;
 			thread_followups(\$idx, $mime, $res);
 		} else {
 			$idx .= "\n(no followups, yet)\n";
@@ -493,13 +494,14 @@ sub anchor_for {
 
 sub simple_dump {
 	my ($dst, $root, $node, $level) = @_;
+	# $root = [ Root Message-ID, \%seen, $srch ];
 	my $pfx = '  ' x $level;
 	$$dst .= $pfx;
 	if (my $x = $node->message) {
 		my $mid = $x->header('Message-ID');
 		if ($root->[0] ne $mid) {
 			my $s = $x->header('Subject');
-			my $h = hash_subj($s);
+			my $h = $root->[2]->subject_path($s);
 			if ($root->[1]->{$h}) {
 				$s = '';
 			} else {
@@ -525,15 +527,6 @@ sub simple_dump {
 	simple_dump($dst, $root, $node->next, $level) if $node->next;
 }
 
-sub hash_subj {
-	my ($subj) = @_;
-	$subj =~ s/\A\s+//;
-	$subj =~ s/\s+\z//;
-	$subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German)
-	$subj =~ s/\s+/ /;
-	Digest::SHA::sha1($subj);
-}
-
 sub thread_followups {
 	my ($dst, $root, $res) = @_;
 	my @msgs = map { $_->mini_mime } @{$res->{msgs}};
@@ -542,8 +535,10 @@ sub thread_followups {
 	my $th = PublicInbox::Thread->new($root, @msgs);
 	$th->thread;
 	$th->order(*PublicInbox::Thread::sort_ts);
-	$root = [ $root->header('Message-ID'),
-		  { hash_subj($root->header('Subject')) => 1 } ];
+	my $srch = $res->{srch};
+	my $subj = $srch->subject_path($root->header('Subject'));
+	my %seen = ($subj => 1);
+	$root = [ $root->header('Message-ID'), \%seen, $srch ];
 	simple_dump($dst, $root, $_, 0) for $th->rootset;
 }
 
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] search: expose $PublicInbox::Search::LANG variable
  2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
  2015-08-18  1:21 ` [PATCH 2/5] search: avoid creating ghosts for circular References Eric Wong
  2015-08-18  1:21 ` [PATCH 3/5] search: common Subject: normalization for Re: prefixes Eric Wong
@ 2015-08-18  1:21 ` Eric Wong
  2015-08-18  1:21 ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-08-18  1:21 UTC (permalink / raw)
  To: meta

This makes it easier to reconfigure for non-English users.
---
 lib/PublicInbox/Search.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 6a05ce7..d767941 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -13,6 +13,7 @@ use PublicInbox::MID qw/mid_clean mid_compressed/;
 # This is English-only, everything else is non-standard and may be confused as
 # a prefix common in patch emails
 our $REPLY_RE = qr/^re:\s+/i;
+our $LANG = 'english';
 
 use constant {
 	TS => 0,
@@ -22,7 +23,6 @@ use constant {
 	# 2 - subject_path is mid_compressed in the index, only
 	# 3 - message-ID is compressed if it includes '%' (hack!)
 	SCHEMA_VERSION => 3,
-	LANG => 'english',
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
@@ -266,7 +266,7 @@ sub do_enquire {
 }
 
 # read-write
-sub stemmer { Search::Xapian::Stem->new(LANG) }
+sub stemmer { Search::Xapian::Stem->new($LANG) }
 
 # read-only
 sub qp {
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] search: bump SCHEMA_VERSION to 4
  2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
                   ` (2 preceding siblings ...)
  2015-08-18  1:21 ` [PATCH 4/5] search: expose $PublicInbox::Search::LANG variable Eric Wong
@ 2015-08-18  1:21 ` Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2015-08-18  1:21 UTC (permalink / raw)
  To: meta

The following two commits affect indexing behavior, so
change the schema version to avoid compatibility problems
or missing messages:

	search: common Subject: normalization for Re: prefixes
	search: avoid creating ghosts for circular References
---
 lib/PublicInbox/Search.pm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index d767941..b9f283f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -22,7 +22,8 @@ use constant {
 	# 1 - subject_path is lower-cased
 	# 2 - subject_path is mid_compressed in the index, only
 	# 3 - message-ID is compressed if it includes '%' (hack!)
-	SCHEMA_VERSION => 3,
+	# 4 - change "Re: " normalization, avoid circular Reference ghosts
+	SCHEMA_VERSION => 4,
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-08-18  1:21 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-18  1:21 [PATCH 1/5] view: cleaner Message-ID filtering for References Eric Wong
2015-08-18  1:21 ` [PATCH 2/5] search: avoid creating ghosts for circular References Eric Wong
2015-08-18  1:21 ` [PATCH 3/5] search: common Subject: normalization for Re: prefixes Eric Wong
2015-08-18  1:21 ` [PATCH 4/5] search: expose $PublicInbox::Search::LANG variable Eric Wong
2015-08-18  1:21 ` [PATCH 5/5] search: bump SCHEMA_VERSION to 4 Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).