user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
blob 04714cbd9424dd4fe341b64e4d85f1e5caf93008 1442 bytes (raw)
name: scripts/dupe-finder 	 # note: path name is non-authoritative(*)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
 
#!/usr/bin/perl -w
# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
#
# ad-hoc tool for finding duplicates, unstable!
use strict;
use warnings;
use PublicInbox::Inbox;
use PublicInbox::Over;
use PublicInbox::Search;
use PublicInbox::Config;
# Resolve the inbox named by the first command-line argument and open
# its "over" SQLite database.
#
# The argument is interpreted in this order:
#   1. contains an '@' after the first character: Config->lookup
#      (presumably an inbox address -- confirm against PublicInbox::Config)
#   2. an existing directory: used directly as the inboxdir of an
#      ad-hoc, unnamed inbox
#   3. anything else: Config->lookup_name
my $repo = shift;
# Fail early with a usage hint; otherwise an undef $repo triggers
# "uninitialized" warnings below and ends in a misleading "No inbox".
defined $repo or die "Usage: $0 INBOX_DIR|ADDRESS|NAME\n";
my $ibx;
if (index($repo, '@') > 0) {
	$ibx = PublicInbox::Config->new->lookup($repo);
} elsif (-d $repo) {
	$ibx = { inboxdir => $repo, address => 'unnamed@example.com' };
	$ibx = PublicInbox::Inbox->new($ibx);
} else {
	$ibx = PublicInbox::Config->new->lookup_name($repo);
}
$ibx or die "No inbox";
$ibx->search or die "search not available for inbox";
# $dbh is the read-only handle held by the search object; $over is a
# separate PublicInbox::Over instance opened on the same SQLite file.
my $dbh = $ibx->search->{over_ro}->dbh;
my $over = PublicInbox::Over->new($dbh->sqlite_db_filename);

# emit(\@nums): dump every article in the given array ref of article
# numbers.
#
# For each number, the overview entry is fetched via $over->get_art;
# numbers with no entry are skipped.  A "num blob mid" summary line goes
# to STDERR.  If the message body can be loaded, it is written to STDOUT
# behind a "From <blob>@<num> <epoch date>" separator line, with any
# body line starting with optional '>' characters plus "From " getting
# one more '>' prepended, followed by a blank line.
sub emit ($) {
	my ($article_nums) = @_;
	for my $art (@{$article_nums}) {
		my $smsg = $over->get_art($art);
		next unless $smsg;
		print STDERR "$art $smsg->{blob} $smsg->{mid}\n";

		my $raw = $ibx->msg_by_smsg($smsg);
		next unless $raw;
		print "From $smsg->{blob}\@$art Thu Jan  1 00:00:00 1970\n";
		# escape embedded "From " lines so they are not read as separators
		$$raw =~ s/^(>*From )/>$1/gm;
		print $$raw, "\n";
	}
}

# Walk the id2num table ordered by id and emit every run of two or more
# article numbers that share the same id (i.e. duplicates).
# NOTE(review): "id" presumably identifies a Message-ID row -- confirm
# against the PublicInbox::Over schema.
my $sth = $dbh->prepare(<<'');
SELECT id,num FROM id2num WHERE num > 0 ORDER BY id

$sth->execute;
my $prev_id = -1;
my ($id, $num, @nums);
while (1) {
	($id, $num) = $sth->fetchrow_array;
	defined $id or last;
	# id changed: the previous run is complete, flush it if it had dupes
	if ($prev_id != $id) {
		emit(\@nums) if scalar(@nums) > 1;
		@nums = ();
	}
	$prev_id = $id;
	push @nums, $num;
}
# Flush the final run: the loop only emits when a *new* id is seen, so
# without this the last group of duplicates would be silently dropped.
emit(\@nums) if scalar(@nums) > 1;

debug log:

solving 04714cbd ...
found 04714cbd in https://80x24.org/public-inbox.git

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of their contents.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).