user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Cc: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
Subject: [PATCH 07/12] v2: improve deduplication checks
Date: Wed, 18 Apr 2018 09:13:11 +0000	[thread overview]
Message-ID: <20180418091316.29114-8-e@80x24.org> (raw)
In-Reply-To: <20180418091316.29114-1-e@80x24.org>

First off, decode text portions of messages since some archived
mail I got was converted from quoted-printable or base-64 to
8bit by the original recipient.  Attempting to merge them with
my own archives (which had no conversion done) led to
unnecessary duplicates showing up.

Then, normalize CRLF line endings in text portions to LF.

In the headers, we relax the content_id hashing to ignore quotes
and lower-case domain names in To, Cc, and From headers since
some mail processors will alter them.

Finally, I've discovered Email::MIME->new($mime->as_string)
does not always round-trip reliably, so we calculate the
content_id twice on user-supplied messages.
---
 lib/PublicInbox/ContentId.pm  | 67 +++++++++++++++++++++++++++++------
 lib/PublicInbox/V2Writable.pm | 35 +++++++++++++-----
 t/content_id.t                | 10 ++++++
 3 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
index 279eec0..b1d27eb 100644
--- a/lib/PublicInbox/ContentId.pm
+++ b/lib/PublicInbox/ContentId.pm
@@ -7,10 +7,19 @@ use warnings;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/content_id content_digest/;
 use PublicInbox::MID qw(mids references);
+use PublicInbox::MsgIter;
 
 # not sure if less-widely supported hash families are worth bothering with
 use Digest::SHA;
 
+sub digest_addr ($$$) {
+	my ($dig, $h, $v) = @_;
+	$v =~ tr/"//d;
+	$v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge;
+	utf8::encode($v);
+	$dig->add("$h\0$v\0");
+}
+
 sub content_digest ($) {
 	my ($mime) = @_;
 	my $dig = Digest::SHA->new(256);
@@ -27,24 +36,62 @@ sub content_digest ($) {
 	}
 	foreach my $mid (@{references($hdr)}) {
 		next if $seen{$mid};
-		$dig->add('ref: '.$mid);
+		$dig->add("ref\0$mid\0");
 	}
 
 	# Only use Sender: if From is not present
 	foreach my $h (qw(From Sender)) {
-		my @v = $hdr->header_raw($h);
+		my @v = $hdr->header($h);
 		if (@v) {
-			$dig->add("$h: $_") foreach @v;
-			last;
+			digest_addr($dig, $h, $_) foreach @v;
 		}
 	}
-
-	# Content-* headers are often no-ops, so maybe we don't need them
-	foreach my $h (qw(Subject Date To Cc)) {
-		my @v = $hdr->header_raw($h);
-		$dig->add("$h: $_") foreach @v;
+	foreach my $h (qw(Subject Date)) {
+		my @v = $hdr->header($h);
+		foreach my $v (@v) {
+			utf8::encode($v);
+			$dig->add("$h\0$v\0");
+		}
+	}
+	# Some mail processors will add " to unquoted names that were
+	# not in the original message.  For the purposes of deduplication,
+	# do not take it into account:
+	foreach my $h (qw(To Cc)) {
+		my @v = $hdr->header($h);
+		digest_addr($dig, $h, $_) foreach @v;
 	}
-	$dig->add($mime->body_raw);
+	msg_iter($mime, sub {
+		my ($part, $depth, @idx) = @{$_[0]};
+		$dig->add("\0$depth:".join('.', @idx)."\0");
+		my $fn = $part->filename;
+		if (defined $fn) {
+			utf8::encode($fn);
+			$dig->add("fn\0$fn\0");
+		}
+		my @d = $part->header('Content-Description');
+		foreach my $d (@d) {
+			utf8::encode($d);
+			$dig->add("d\0$d\0");
+		}
+		$dig->add("b\0");
+		my $ct = $part->content_type || 'text/plain';
+		my $s = eval { $part->body_str };
+		if ($@ && $ct =~ m!\btext/plain\b!i) {
+			# Try to assume UTF-8 because Alpine
+			# seems to do wacky things and set
+			# charset=X-UNKNOWN
+			$part->charset_set('UTF-8');
+			$s = eval { $part->body_str };
+		}
+		if (defined $s) {
+			$s =~ s/\r\n/\n/gs;
+			$s =~ s/\s*\z//s;
+			utf8::encode($s);
+		} else {
+			$s = $part->body;
+		}
+		$dig->add($s);
+	});
 	$dig;
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0dcdeda..e9fd502 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -259,12 +259,32 @@ sub purge_oids {
 	$purges;
 }
 
+sub content_ids ($) {
+	my ($mime) = @_;
+	my @cids = ( content_id($mime) );
+
+	# Email::MIME->as_string doesn't always round-trip, so we may
+	# use a second content_id
+	my $rt = content_id(PublicInbox::MIME->new(\($mime->as_string)));
+	push @cids, $rt if $cids[0] ne $rt;
+	\@cids;
+}
+
+sub content_matches ($$) {
+	my ($cids, $existing) = @_;
+	my $cid = content_id($existing);
+	foreach (@$cids) {
+		return 1 if $_ eq $cid
+	}
+	0
+}
+
 sub remove_internal {
 	my ($self, $mime, $cmt_msg, $purge) = @_;
 	$self->idx_init;
 	my $im = $self->importer unless $purge;
 	my $over = $self->{over};
-	my $cid = content_id($mime);
+	my $cids = content_ids($mime);
 	my $parts = $self->{idx_parts};
 	my $mm = $self->{mm};
 	my $removed;
@@ -287,7 +307,7 @@ sub remove_internal {
 			}
 			my $orig = $$msg;
 			my $cur = PublicInbox::MIME->new($msg);
-			if (content_id($cur) eq $cid) {
+			if (content_matches($cids, $cur)) {
 				$smsg->{mime} = $cur;
 				$gone{$smsg->{num}} = [ $smsg, \$orig ];
 			}
@@ -572,8 +592,7 @@ sub get_blob ($$) {
 sub lookup_content {
 	my ($self, $mime, $mid) = @_;
 	my $over = $self->{over};
-	my $cid = content_id($mime);
-	my $found;
+	my $cids = content_ids($mime);
 	my ($id, $prev);
 	while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
 		my $msg = get_blob($self, $smsg);
@@ -582,16 +601,16 @@ sub lookup_content {
 			next;
 		}
 		my $cur = PublicInbox::MIME->new($msg);
-		if (content_id($cur) eq $cid) {
+		if (content_matches($cids, $cur)) {
 			$smsg->{mime} = $cur;
-			$found = $smsg;
-			last;
+			return $smsg;
 		}
 
+
 		# XXX DEBUG_DIFF is experimental and may be removed
 		diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};
 	}
-	$found;
+	undef;
 }
 
 sub atfork_child {
diff --git a/t/content_id.t b/t/content_id.t
index adcdb6c..01ce65e 100644
--- a/t/content_id.t
+++ b/t/content_id.t
@@ -22,4 +22,14 @@ my $orig = content_id($mime);
 my $reload = content_id(Email::MIME->new($mime->as_string));
 is($orig, $reload, 'content_id matches after serialization');
 
+foreach my $h (qw(From To Cc)) {
+	my $n = '"Quoted N\'Ame" <foo@EXAMPLE.com>';
+	$mime->header_str_set($h, "$n");
+	my $q = content_id($mime);
+	is($n, $mime->header($h), "content_id does not mutate $h:");
+	$mime->header_str_set($h, 'Quoted N\'Ame <foo@example.com>');
+	my $nq = content_id($mime);
+	is($nq, $q, "quotes ignored in $h:");
+}
+
 done_testing();
-- 
EW


  parent reply	other threads:[~2018-04-18  9:13 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-18  9:13 [PATCH 00/12] better dedupe, contiguous article numbers Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 01/12] feed: respect feedmax, again Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 02/12] v1: remove articles from overview DB Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 03/12] compact: do not merge v2 repos by default Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 04/12] v2writable: reduce partititions by one Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 05/12] search: preserve References in Xapian smsg for x=t view Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 06/12] v2: generate better Message-IDs for duplicates Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-04-18  9:13 ` [PATCH 08/12] import: cat_blob drops leading 'From ' lines like Inbox Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 09/12] searchidx: regenerate and avoid article number gaps on full index Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 10/12] extmsg: remove expensive git path checks Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 11/12] use %H consistently to disable abbreviations Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 12/12] searchidx: increase term positions for all text terms Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180418091316.29114-8-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).