user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
blob 6e971b9b07d319d51a98d2badb7934f885374882 2849 bytes (raw)
name: t/lei_dedupe.t 	 # note: path name is non-authoritative(*)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
 
#!perl -w
# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
use strict;
use v5.10.1;
use Test::More;
use PublicInbox::TestCommon;
use PublicInbox::Eml;
use PublicInbox::Smsg;
require_mods(qw(DBD::SQLite));
use_ok 'PublicInbox::LeiDedupe';
# Fixtures shared by every strategy exercised below.
# $eml is the reference message; $different is loaded from another file
# and then given $eml's Message-ID, so the two agree on Message-ID only
# because of the header_set call here.
my $eml = eml_load('t/plack-qp.eml');
my $different = eml_load('t/msg_iter-order.eml');
my $mid = $eml->header_raw('Message-ID');
$different->header_set('Message-ID', $mid);

# Smsg counterpart of $eml, used for the is_smsg_dup assertions.
my $smsg = bless({ ds => time() }, 'PublicInbox::Smsg');
$smsg->populate($eml);
# Default any of these fields still undefined after populate() to ''.
# NOTE(review): presumably so the dedupe code never sees undef -- confirm.
for my $field (qw(to cc references)) {
	$smsg->{$field} //= '';
}

# Strategy 'none': no message is ever expected to be reported as a
# duplicate, however many times it is offered.
my $lei = { opt => { dedupe => 'none' } };
my $dd = PublicInbox::LeiDedupe->new($lei);
$dd->prepare_dedupe;
for my $nth (qw(1st 2nd)) {
	ok(!$dd->is_dup($eml), "$nth is_dup w/o dedupe");
}
ok(!$dd->is_dup($different), 'different is_dup w/o dedupe');
for my $round (1, 2) {
	ok(!$dd->is_smsg_dup($smsg), "smsg dedupe none $round");
}

# An unset strategy (labelled 'default' in test names) and an explicit
# 'content' strategy are held to the same expectations: a repeated message
# is a duplicate, while $different is not, despite sharing a Message-ID.
for my $strategy (undef, 'content') {
	my $label = defined($strategy) ? $strategy : 'default';
	$lei->{opt}->{dedupe} = $strategy;
	# fresh dedupe object per strategy so nothing seen earlier carries over
	$dd = PublicInbox::LeiDedupe->new($lei);
	$dd->prepare_dedupe;
	ok(!$dd->is_dup($eml), "1st is_dup with $label dedupe");
	ok($dd->is_dup($eml), "2nd seen with $label dedupe");
	ok(!$dd->is_dup($different), "different is_dup with $label dedupe");
	ok(!$dd->is_smsg_dup($smsg), "is_smsg_dup pass w/ $label dedupe");
	ok($dd->is_smsg_dup($smsg), "is_smsg_dup reject w/ $label dedupe");
}
# An unknown strategy name must make the constructor die with a message
# that mentions both "unsupported" and the offending name.
$lei->{opt}->{dedupe} = 'bogus';
eval { PublicInbox::LeiDedupe->new($lei) };
my $ctor_err = $@; # capture right away, before anything can clobber $@
like($ctor_err, qr/unsupported.*bogus/, 'died on bogus strategy');

# Strategy 'mid': the assertions below expect $different to count as
# already seen after only $eml was offered; the two carry the same
# Message-ID (copied onto $different during fixture setup).
$lei->{opt}->{dedupe} = 'mid';
$dd = PublicInbox::LeiDedupe->new($lei);
$dd->prepare_dedupe;
ok(!$dd->is_dup($eml), '1st is_dup with mid dedupe');
ok($dd->is_dup($eml), '2nd seen with mid dedupe');
ok($dd->is_dup($different), 'different seen with mid dedupe');
# the Smsg path is tracked separately here: first offer passes, second
# is rejected, even though $eml was already seen via is_dup above
ok(!$dd->is_smsg_dup($smsg), 'smsg mid dedupe pass');
ok($dd->is_smsg_dup($smsg), 'smsg mid dedupe reject');

# Strategy 'oid': exercised first without an OID argument, then with an
# explicit OID passed as the second argument to is_dup, and finally via
# is_smsg_dup using the smsg's {blob} field.
$lei->{opt}->{dedupe} = 'oid';
$dd = PublicInbox::LeiDedupe->new($lei);
$dd->prepare_dedupe;

# --augment won't have OIDs, so is_dup gets the message only:
ok(!$dd->is_dup($eml), '1st is_dup with oid dedupe (augment)');
ok($dd->is_dup($eml), '2nd seen with oid dedupe (augment)');
ok(!$dd->is_dup($different), 'different is_dup with oid dedupe (augment)');
# $different was recorded by the call above; adding a Status header
# afterwards must not make it look like a new message
$different->header_set('Status', 'RO');
ok($dd->is_dup($different), 'different seen with oid dedupe Status removed');

# With an explicit OID the message content is not what decides the result:
ok(!$dd->is_dup($eml, '01d'), '1st is_dup with oid dedupe');
ok($dd->is_dup($different, '01d'), 'different content ignored if oid matches');
ok($dd->is_dup($eml, '01D'), 'case insensitive oid comparison :P');
# '01dbad' merely starts with the already-seen '01d'; it must not match
ok(!$dd->is_dup($eml, '01dbad'), 'longer oid sharing a seen prefix is not a dup');

# is_smsg_dup with a blob set on the smsg: first offer passes, repeat rejected
$smsg->{blob} = 'dead';
ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe pass');
ok($dd->is_smsg_dup($smsg), 'smsg dedupe reject');

done_testing;

debug log:

solving 6e971b9b ...
found 6e971b9b in https://80x24.org/public-inbox.git

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of their contents.^

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).