From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A9D811FA12 for ; Sun, 10 Jan 2021 12:15:19 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 02/22] lei q: deduplicate smsg Date: Sun, 10 Jan 2021 12:14:59 +0000 Message-Id: <20210110121519.17044-3-e@80x24.org> In-Reply-To: <20210110121519.17044-1-e@80x24.org> References: <20210110121519.17044-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We don't want duplicate messages in results overviews, either. --- lib/PublicInbox/LeiDedupe.pm | 29 ++++++++++++++++++++++++++++- lib/PublicInbox/LeiQuery.pm | 5 +++++ t/lei_dedupe.t | 14 ++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index c4e5dffb..58eee533 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -33,12 +33,24 @@ sub _regen_oid ($) { sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } +sub smsg_hash ($) { + my ($smsg) = @_; + my $dig = Digest::SHA->new(256); + my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); + utf8::encode($x); + $dig->add($x); + $dig->digest; +} + # the paranoid option sub dedupe_oid () { my $skv = PublicInbox::SharedKV->new; ($skv, sub { # may be called in a child process my ($eml, $oid) = @_; $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(_oidbin($smsg->{blob}), ''); }); } @@ -51,6 +63,12 @@ sub dedupe_mid () { my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // content_hash($eml); $skv->set_maybe($mid, ''); + }, sub { + my ($smsg) = @_; + my $mid = $smsg->{mid}; + $mid = undef if $mid eq ''; + $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob}); + $skv->set_maybe($mid, ''); }); } @@ -60,11 +78,15 @@ sub dedupe_content () { ($skv, sub { # may be called in a child process my ($eml) = @_; # oid = $_[1], ignored $skv->set_maybe(content_hash($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(smsg_hash($smsg), ''); }); } # no deduplication at all -sub dedupe_none () { (undef, sub { 1 }) } +sub true { 1 } +sub dedupe_none () { (undef, \&true, \&true) } sub new { my ($cls, $lei, $dst) = @_; @@ -85,6 +107,11 @@ sub is_dup { !$self->[1]->($eml, $oid); } +sub is_smsg_dup { + my ($self, $smsg) = @_; + !$self->[2]->($smsg); +} + sub prepare_dedupe { my ($self) = @_; my $skv = $self->[0]; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm index d14da1bc..f69dccad 100644 --- a/lib/PublicInbox/LeiQuery.pm +++ b/lib/PublicInbox/LeiQuery.pm @@ -69,6 +69,8 @@ sub lei_q { } @argv); $opt->{limit} //= 10000; my $lxs; + require PublicInbox::LeiDedupe; + my $dd = PublicInbox::LeiDedupe->new($self); # --local is enabled by default my @src = $opt->{'local'} ? ($sto->search) : (); @@ -135,6 +137,7 @@ sub lei_q { delete @$smsg{qw(tid num)}; # only makes sense if single src chomp($buf = $json->encode(_smsg_unbless($smsg))); }; + $dd->prepare_dedupe; for my $src (@src) { my $srch = $src->search; my $over = $src->over; @@ -145,6 +148,7 @@ sub lei_q { if ($smsg_for) { for my $it ($mset->items) { my $smsg = $smsg_for->($srch, $it) or next; + next if $dd->is_smsg_dup($smsg); $self->out($buf .= $ORS) if defined $buf; $smsg->{relevance} = get_pct($it); $emit_cb->($smsg); @@ -160,6 +164,7 @@ sub lei_q { while ($over && $over->expand_thread($ctx)) { for my $n (@{$ctx->{xids}}) { my $t = $over->get_art($n) or next; + next if $dd->is_smsg_dup($t); if (my $p = delete $n2p{$t->{num}}) { $t->{relevance} = $p; } diff --git a/t/lei_dedupe.t b/t/lei_dedupe.t index b5e2b8f9..6e971b9b 100644 --- a/t/lei_dedupe.t +++ b/t/lei_dedupe.t @@ -6,12 +6,16 @@ use v5.10.1; use Test::More; use PublicInbox::TestCommon; use PublicInbox::Eml; +use PublicInbox::Smsg; require_mods(qw(DBD::SQLite)); use_ok 'PublicInbox::LeiDedupe'; my $eml = eml_load('t/plack-qp.eml'); my $mid = $eml->header_raw('Message-ID'); my $different = eml_load('t/msg_iter-order.eml'); $different->header_set('Message-ID', $mid); +my $smsg = bless { ds => time }, 'PublicInbox::Smsg'; +$smsg->populate($eml); +$smsg->{$_} //= '' for (qw(to cc references)) ; my $lei = { opt => { dedupe => 'none' } }; my $dd = PublicInbox::LeiDedupe->new($lei); @@ -19,6 +23,8 @@ $dd->prepare_dedupe; ok(!$dd->is_dup($eml), '1st is_dup w/o dedupe'); ok(!$dd->is_dup($eml), '2nd is_dup w/o dedupe'); ok(!$dd->is_dup($different), 'different is_dup w/o dedupe'); +ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 1'); +ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 2'); for my $strat (undef, 'content') { $lei->{opt}->{dedupe} = $strat; @@ -28,6 +34,8 @@ for my $strat (undef, 'content') { ok(!$dd->is_dup($eml), "1st is_dup with $desc dedupe"); ok($dd->is_dup($eml), "2nd seen with $desc dedupe"); ok(!$dd->is_dup($different), "different is_dup with $desc dedupe"); + ok(!$dd->is_smsg_dup($smsg), "is_smsg_dup pass w/ $desc dedupe"); + ok($dd->is_smsg_dup($smsg), "is_smsg_dup reject w/ $desc dedupe"); } $lei->{opt}->{dedupe} = 'bogus'; eval { PublicInbox::LeiDedupe->new($lei) }; @@ -39,6 +47,8 @@ $dd->prepare_dedupe; ok(!$dd->is_dup($eml), '1st is_dup with mid dedupe'); ok($dd->is_dup($eml), '2nd seen with mid dedupe'); ok($dd->is_dup($different), 'different seen with mid dedupe'); +ok(!$dd->is_smsg_dup($smsg), 'smsg mid dedupe pass'); +ok($dd->is_smsg_dup($smsg), 'smsg mid dedupe reject'); $lei->{opt}->{dedupe} = 'oid'; $dd = PublicInbox::LeiDedupe->new($lei); @@ -56,4 +66,8 @@ ok($dd->is_dup($different, '01d'), 'different content ignored if oid matches'); ok($dd->is_dup($eml, '01D'), 'case insensitive oid comparison :P'); ok(!$dd->is_dup($eml, '01dbad'), 'case insensitive oid comparison :P'); +$smsg->{blob} = 'dead'; +ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe pass'); +ok($dd->is_smsg_dup($smsg), 'smsg dedupe reject'); + done_testing;