From 69329215485cf2ab9d8cd1fa7faf65d8ec42dc0b Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Wed, 18 Apr 2018 09:13:10 +0000 Subject: v2: generate better Message-IDs for duplicates While hunting duplicates, I noticed a leading '-' in some Message-IDs as a result of RFC4648 encoding. While '-' seems allowed by RFC5322 and URL-friendly (RFC4648), they are uncommon and make using Message-IDs as arguments for command-line tools more difficult. So prefix them with a datestamp to at least give readers some sense of the age. And shorten the "localhost" hostname to "z" to save space. --- scripts/dupe-finder | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 scripts/dupe-finder (limited to 'scripts') diff --git a/scripts/dupe-finder b/scripts/dupe-finder new file mode 100644 index 00000000..14022379 --- /dev/null +++ b/scripts/dupe-finder @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +# +# ad-hoc tool for finding duplicates, unstable! 
use strict;
use warnings;
use PublicInbox::Inbox;
use PublicInbox::Over;
use PublicInbox::Search;
use PublicInbox::Config;

# Usage: dupe-finder INBOX
#
# INBOX is resolved in this order:
#   1. contains an '@' after its first character -> Config->lookup()
#      (presumably an inbox address; confirm against PublicInbox::Config)
#   2. names an existing directory -> opened directly as an unnamed inbox
#   3. anything else -> Config->lookup_name()
#
# Every group of id2num rows sharing one `id' but having more than one
# `num' is written to stdout as a sequence of "From "-separated messages;
# a one-line summary of each message goes to stderr.
my $repo = shift;
defined $repo or die "Usage: $0 <address|inbox-dir|inbox-name>\n";

my $ibx;
if (index($repo, '@') > 0) {
	$ibx = PublicInbox::Config->new->lookup($repo);
} elsif (-d $repo) {
	# no configuration entry needed: build an inbox around the directory
	$ibx = PublicInbox::Inbox->new({
		mainrepo => $repo,
		address => 'unnamed@example.com',
	});
} else {
	$ibx = PublicInbox::Config->new->lookup_name($repo);
}
$ibx or die "No inbox";
$ibx->search or die "search not available for inbox";

# Reuse the inbox's own read-only handle to locate the SQLite file, then
# open a separate PublicInbox::Over object on it for article lookups.
my $dbh = $ibx->search->{over_ro}->connect;
my $over = PublicInbox::Over->new($dbh->sqlite_db_filename);

# emit(\@nums)
#
# Dump every article whose number is listed in the array ref.
# For each number:
#   - look it up with $over->get_art; skip it when nothing is returned
#   - log "<num> <blob> <mid>" to STDERR
#   - load the message with $ibx->msg_by_smsg; skip it when that fails
#   - print a "From <blob>@<num> ..." separator line, then the message
#     with every line matching /^>*From / escaped by one extra '>', then
#     a newline
# The return value is not meaningful.
sub emit {
	my ($nums) = @_;
	foreach my $n (@$nums) {
		my $smsg = $over->get_art($n) or next;
		print STDERR "$n $smsg->{blob} $smsg->{mid}\n";
		my $msg = $ibx->msg_by_smsg($smsg) or next;
		print "From $smsg->{blob}\@$n Thu Jan 1 00:00:00 1970\n";
		# escape body lines that would otherwise look like separators
		$$msg =~ s/^(>*From )/>$1/gm;
		print $$msg, "\n";
	}
	return;
}

# Rows arrive ordered by id, so all nums belonging to one id are adjacent.
my $sth = $dbh->prepare(
	q{SELECT id,num FROM id2num WHERE num > 0 ORDER BY id}
);
$sth->execute;

my $prev_id = -1; # sentinel; NOTE(review): assumes real ids are never -1
my @nums;
while (my ($id, $num) = $sth->fetchrow_array) {
	# a row without an id ends the scan, exactly as before
	defined $id or last;
	if ($prev_id != $id) {
		# id changed: the previous group is complete
		emit(\@nums) if @nums > 1;
		@nums = ();
	}
	$prev_id = $id;
	push @nums, $num;
}

# The loop only flushes a group when it sees the *next* id, so the group
# still pending when the cursor runs out must be flushed here; without
# this, duplicates under the highest id were silently dropped.
emit(\@nums) if @nums > 1;