about summary refs log tree commit homepage
path: root/scripts
diff options
context:
space:
mode:
author: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-18 09:13:10 +0000
committer: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-18 09:14:14 +0000
commit: 69329215485cf2ab9d8cd1fa7faf65d8ec42dc0b (patch)
tree: dcf2a469711f8e61ce428f521d5e57d13a2e97c7 /scripts
parent: 2a49233ab00a366251974824658a20bf68e519da (diff)
download: public-inbox-69329215485cf2ab9d8cd1fa7faf65d8ec42dc0b.tar.gz
While hunting duplicates, I noticed a leading '-' in some
Message-IDs as a result of RFC4648 encoding.  While '-' seems
allowed by RFC5322 and URL-friendly (RFC4648), they are uncommon
and make using Message-IDs as arguments for command-line tools
more difficult.  So prefix them with a datestamp to at least
give readers some sense of the age.  And shorten the "localhost"
hostname to "z" to save space.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/dupe-finder 54
1 files changed, 54 insertions, 0 deletions
diff --git a/scripts/dupe-finder b/scripts/dupe-finder
new file mode 100644
index 00000000..14022379
--- /dev/null
+++ b/scripts/dupe-finder
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# ad-hoc tool for finding duplicates, unstable!
+use strict;
+use warnings;
+use PublicInbox::Inbox;
+use PublicInbox::Over;
+use PublicInbox::Search;
+use PublicInbox::Config;
+# Resolve the inbox named by the first command-line argument.  It may be
+# an address (an '@' anywhere past the first character), an existing
+# directory, or otherwise a name handed to lookup_name.
+my $repo = shift;
+# Fail early with a usage hint instead of tripping "uninitialized"
+# warnings in index() and -d below when no argument was given.
+defined $repo or die "Usage: $0 ADDRESS|INBOX_DIR|NAME\n";
+my $ibx;
+if (index($repo, '@') > 0) {
+        $ibx = PublicInbox::Config->new->lookup($repo);
+} elsif (-d $repo) {
+        # a bare directory is wrapped directly, with a placeholder address
+        $ibx = PublicInbox::Inbox->new({
+                mainrepo => $repo,
+                address => 'unnamed@example.com',
+        });
+} else {
+        $ibx = PublicInbox::Config->new->lookup_name($repo);
+}
+$ibx or die "No inbox";
+$ibx->search or die "search not available for inbox";
+# $dbh is used for the raw id2num query; $over is built from the
+# database filename that $dbh reports and is used for article lookups.
+my $dbh = $ibx->search->{over_ro}->connect;
+my $over = PublicInbox::Over->new($dbh->sqlite_db_filename);
+
+# emit(\@nums): for each article number in the array ref, log
+# "num blob mid" to STDERR and print the message to STDOUT behind a
+# synthetic "From " separator line.  Body lines starting with optional
+# '>' characters followed by "From " get one more '>' prepended.
+# Articles that get_art or msg_by_smsg cannot return are skipped.
+sub emit ($) {
+        my ($artnums) = @_;
+        for my $n (@$artnums) {
+                my $smsg = $over->get_art($n);
+                next unless $smsg;
+                print STDERR "$n $smsg->{blob} $smsg->{mid}\n";
+                # msg_by_smsg yields a reference to the raw message text
+                my $raw = $ibx->msg_by_smsg($smsg);
+                next unless $raw;
+                print "From $smsg->{blob}\@$n Thu Jan  1 00:00:00 1970\n";
+                $$raw =~ s/^(>*From )/>$1/gm;
+                print $$raw, "\n";
+        }
+}
+
+# Scan id2num ordered by id so rows sharing an id are adjacent; every
+# run of two or more article numbers under one id is handed to emit().
+my $sth = $dbh->prepare(<<'');
+SELECT id,num FROM id2num WHERE num > 0 ORDER BY id

+$sth->execute;
+my $prev_id = -1;
+my @nums;
+# fetchrow_array returns an empty list once the rows are exhausted,
+# which makes the list assignment false and ends the loop.
+while (my ($id, $num) = $sth->fetchrow_array) {
+        defined $id or last; # an undefined id stops the scan
+        if ($prev_id != $id) {
+                # id changed: flush the previous group if it has duplicates
+                emit(\@nums) if scalar(@nums) > 1;
+                @nums = ();
+        }
+        $prev_id = $id;
+        push @nums, $num;
+}
+# The loop only flushes when a new id shows up, so the group belonging
+# to the last id must be flushed here or its duplicates are lost.
+emit(\@nums) if scalar(@nums) > 1;