From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E4AC71F51A; Wed, 18 Apr 2018 09:13:19 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Cc: "Eric Wong (Contractor, The Linux Foundation)" Subject: [PATCH 06/12] v2: generate better Message-IDs for duplicates Date: Wed, 18 Apr 2018 09:13:10 +0000 Message-Id: <20180418091316.29114-7-e@80x24.org> In-Reply-To: <20180418091316.29114-1-e@80x24.org> References: <20180418091316.29114-1-e@80x24.org> List-Id: While hunting duplicates, I noticed a leading '-' in some Message-IDs as a result of RFC4648 encoding. While '-' seems allowed by RFC5322 and is URL-friendly (RFC4648), Message-IDs with a leading '-' are uncommon and are more difficult to use as arguments for command-line tools. So prefix generated Message-IDs with a datestamp, which avoids the leading '-' and at least gives readers some sense of the message's age. Also shorten the "localhost" hostname to "z" to save space. 
--- MANIFEST | 1 + lib/PublicInbox/Import.pm | 18 ++++++------ lib/PublicInbox/V2Writable.pm | 6 ++-- scripts/dupe-finder | 54 +++++++++++++++++++++++++++++++++++ t/v2writable.t | 5 ++-- 5 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 scripts/dupe-finder diff --git a/MANIFEST b/MANIFEST index 58b3634..00a0970 100644 --- a/MANIFEST +++ b/MANIFEST @@ -124,6 +124,7 @@ script/public-inbox-watch script/public-inbox.cgi scripts/dc-dlvr scripts/dc-dlvr.pre +scripts/dupe-finder scripts/edit-sa-prefs scripts/import_maildir scripts/import_slrnspool diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 9e8900f..c7a96e1 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -14,6 +14,7 @@ use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::ContentId qw(content_digest); use PublicInbox::MDA; +use POSIX qw(strftime); sub new { my ($class, $git, $name, $email, $ibx) = @_; @@ -330,7 +331,7 @@ sub v1_mid0 ($) { my $mids = mids($hdr); if (!scalar(@$mids)) { # spam often has no Message-Id - my $mid0 = digest2mid(content_digest($mime)); + my $mid0 = digest2mid(content_digest($mime), $hdr); append_mid($hdr, $mid0); return $mid0; } @@ -445,18 +446,19 @@ sub atfork_child { } } -sub digest2mid ($) { - my ($dig) = @_; +sub digest2mid ($$) { + my ($dig, $hdr) = @_; my $b64 = $dig->clone->b64digest; # Make our own URLs nicer: # See "Base 64 Encoding with URL and Filename Safe Alphabet" in RFC4648 $b64 =~ tr!+/=!-_!d; - # We can make this more meaningful with a date prefix or other things, - # but this is only needed for crap that fails to generate a Message-ID - # or reuses one. In other words, it's usually spammers who hit this - # so they don't deserve nice Message-IDs :P - $b64 . '@localhost'; + # Add a date prefix to prevent a leading '-' in case that trips + # up some tools (e.g. 
if a Message-ID were a expected as a + # command-line arg) + my $dt = msg_datestamp($hdr); + $dt = POSIX::strftime('%Y%m%d%H%M%S', gmtime($dt)); + "$dt.$b64" . '@z'; } sub clean_purge_buffer { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 66f8a8a..0dcdeda 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -174,19 +174,19 @@ sub num_for_harder { my $hdr = $mime->header_obj; my $dig = content_digest($mime); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); my $num = $self->{mm}->mid_insert($$mid0); unless (defined $num) { # it's hard to spoof the last Received: header my @recvd = $hdr->header_raw('Received'); $dig->add("Received: $_") foreach (@recvd); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); $num = $self->{mm}->mid_insert($$mid0); # fall back to a random Message-ID and give up determinism: until (defined($num)) { $dig->add(rand); - $$mid0 = PublicInbox::Import::digest2mid($dig); + $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr); warn "using random Message-ID <$$mid0> as fallback\n"; $num = $self->{mm}->mid_insert($$mid0); } diff --git a/scripts/dupe-finder b/scripts/dupe-finder new file mode 100644 index 0000000..1402237 --- /dev/null +++ b/scripts/dupe-finder @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +# +# ad-hoc tool for finding duplicates, unstable! 
+use strict; +use warnings; +use PublicInbox::Inbox; +use PublicInbox::Over; +use PublicInbox::Search; +use PublicInbox::Config; +my $repo = shift; +my $ibx; +if (index($repo, '@') > 0) { + $ibx = PublicInbox::Config->new->lookup($repo); +} elsif (-d $repo) { + $ibx = { mainrepo => $repo, address => 'unnamed@example.com' }; + $ibx = PublicInbox::Inbox->new($ibx); +} else { + $ibx = PublicInbox::Config->new->lookup_name($repo); +} +$ibx or die "No inbox"; +$ibx->search or die "search not available for inbox"; +my $dbh = $ibx->search->{over_ro}->connect; +my $over = PublicInbox::Over->new($dbh->sqlite_db_filename); + +sub emit ($) { + my ($nums) = @_; + foreach my $n (@$nums) { + my $smsg = $over->get_art($n) or next; + print STDERR "$n $smsg->{blob} $smsg->{mid}\n"; + my $msg = $ibx->msg_by_smsg($smsg) or next; + print "From $smsg->{blob}\@$n Thu Jan 1 00:00:00 1970\n"; + $$msg =~ s/^(>*From )/>$1/gm; + print $$msg, "\n"; + } +} + +my $sth = $dbh->prepare(<<''); +SELECT id,num FROM id2num WHERE num > 0 ORDER BY id + +$sth->execute; +my $prev_id = -1; +my ($id, $num, @nums); +while (1) { + ($id, $num) = $sth->fetchrow_array; + defined $id or last; + if ($prev_id != $id) { + emit(\@nums) if scalar(@nums) > 1; + @nums = (); + } + $prev_id = $id; + push @nums, $num; +} diff --git a/t/v2writable.t b/t/v2writable.t index 85fb6a6..d37fb06 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -68,7 +68,7 @@ if ('ensure git configs are correct') { [ $sec->header_obj->header_raw('Message-Id') ], 'no new Message-Id added'); - my $sane_mid = qr/\A<[\w\-]+\@localhost>\z/; + my $sane_mid = qr/\A<[\w\-\.]+\@\w+>\z/; @warn = (); $mime->header_set('Message-Id', ''); $mime->body_set('different'); @@ -82,7 +82,8 @@ if ('ensure git configs are correct') { @warn = (); $mime->header_set('Message-Id', ''); $mime->body_set('this one needs a random mid'); - my $gen = PublicInbox::Import::digest2mid(content_digest($mime)); + my $hdr = $mime->header_obj; + my $gen = 
PublicInbox::Import::digest2mid(content_digest($mime), $hdr); unlike($gen, qr![\+/=]!, 'no URL-unfriendly chars in Message-Id'); my $fake = PublicInbox::MIME->new($mime->as_string); $fake->header_set('Message-Id', "<$gen>"); -- EW