From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 02/22] lei q: deduplicate smsg
Date: Sun, 10 Jan 2021 12:14:59 +0000 [thread overview]
Message-ID: <20210110121519.17044-3-e@80x24.org> (raw)
In-Reply-To: <20210110121519.17044-1-e@80x24.org>
We don't want duplicate messages in result overviews, either.
---
lib/PublicInbox/LeiDedupe.pm | 29 ++++++++++++++++++++++++++++-
lib/PublicInbox/LeiQuery.pm | 5 +++++
t/lei_dedupe.t | 14 ++++++++++++++
3 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm
index c4e5dffb..58eee533 100644
--- a/lib/PublicInbox/LeiDedupe.pm
+++ b/lib/PublicInbox/LeiDedupe.pm
@@ -33,12 +33,24 @@ sub _regen_oid ($) {
sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef }
+sub smsg_hash ($) {
+ my ($smsg) = @_;
+ my $dig = Digest::SHA->new(256);
+ my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)});
+ utf8::encode($x);
+ $dig->add($x);
+ $dig->digest;
+}
+
# the paranoid option
sub dedupe_oid () {
my $skv = PublicInbox::SharedKV->new;
($skv, sub { # may be called in a child process
my ($eml, $oid) = @_;
$skv->set_maybe(_oidbin($oid) // _regen_oid($eml), '');
+ }, sub {
+ my ($smsg) = @_;
+ $skv->set_maybe(_oidbin($smsg->{blob}), '');
});
}
@@ -51,6 +63,12 @@ sub dedupe_mid () {
my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) //
content_hash($eml);
$skv->set_maybe($mid, '');
+ }, sub {
+ my ($smsg) = @_;
+ my $mid = $smsg->{mid};
+ $mid = undef if $mid eq '';
+ $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob});
+ $skv->set_maybe($mid, '');
});
}
@@ -60,11 +78,15 @@ sub dedupe_content () {
($skv, sub { # may be called in a child process
my ($eml) = @_; # oid = $_[1], ignored
$skv->set_maybe(content_hash($eml), '');
+ }, sub {
+ my ($smsg) = @_;
+ $skv->set_maybe(smsg_hash($smsg), '');
});
}
# no deduplication at all
-sub dedupe_none () { (undef, sub { 1 }) }
+sub true { 1 }
+sub dedupe_none () { (undef, \&true, \&true) }
sub new {
my ($cls, $lei, $dst) = @_;
@@ -85,6 +107,11 @@ sub is_dup {
!$self->[1]->($eml, $oid);
}
+sub is_smsg_dup {
+ my ($self, $smsg) = @_;
+ !$self->[2]->($smsg);
+}
+
sub prepare_dedupe {
my ($self) = @_;
my $skv = $self->[0];
diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm
index d14da1bc..f69dccad 100644
--- a/lib/PublicInbox/LeiQuery.pm
+++ b/lib/PublicInbox/LeiQuery.pm
@@ -69,6 +69,8 @@ sub lei_q {
} @argv);
$opt->{limit} //= 10000;
my $lxs;
+ require PublicInbox::LeiDedupe;
+ my $dd = PublicInbox::LeiDedupe->new($self);
# --local is enabled by default
my @src = $opt->{'local'} ? ($sto->search) : ();
@@ -135,6 +137,7 @@ sub lei_q {
delete @$smsg{qw(tid num)}; # only makes sense if single src
chomp($buf = $json->encode(_smsg_unbless($smsg)));
};
+ $dd->prepare_dedupe;
for my $src (@src) {
my $srch = $src->search;
my $over = $src->over;
@@ -145,6 +148,7 @@ sub lei_q {
if ($smsg_for) {
for my $it ($mset->items) {
my $smsg = $smsg_for->($srch, $it) or next;
+ next if $dd->is_smsg_dup($smsg);
$self->out($buf .= $ORS) if defined $buf;
$smsg->{relevance} = get_pct($it);
$emit_cb->($smsg);
@@ -160,6 +164,7 @@ sub lei_q {
while ($over && $over->expand_thread($ctx)) {
for my $n (@{$ctx->{xids}}) {
my $t = $over->get_art($n) or next;
+ next if $dd->is_smsg_dup($t);
if (my $p = delete $n2p{$t->{num}}) {
$t->{relevance} = $p;
}
diff --git a/t/lei_dedupe.t b/t/lei_dedupe.t
index b5e2b8f9..6e971b9b 100644
--- a/t/lei_dedupe.t
+++ b/t/lei_dedupe.t
@@ -6,12 +6,16 @@ use v5.10.1;
use Test::More;
use PublicInbox::TestCommon;
use PublicInbox::Eml;
+use PublicInbox::Smsg;
require_mods(qw(DBD::SQLite));
use_ok 'PublicInbox::LeiDedupe';
my $eml = eml_load('t/plack-qp.eml');
my $mid = $eml->header_raw('Message-ID');
my $different = eml_load('t/msg_iter-order.eml');
$different->header_set('Message-ID', $mid);
+my $smsg = bless { ds => time }, 'PublicInbox::Smsg';
+$smsg->populate($eml);
+$smsg->{$_} //= '' for (qw(to cc references)) ;
my $lei = { opt => { dedupe => 'none' } };
my $dd = PublicInbox::LeiDedupe->new($lei);
@@ -19,6 +23,8 @@ $dd->prepare_dedupe;
ok(!$dd->is_dup($eml), '1st is_dup w/o dedupe');
ok(!$dd->is_dup($eml), '2nd is_dup w/o dedupe');
ok(!$dd->is_dup($different), 'different is_dup w/o dedupe');
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 1');
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 2');
for my $strat (undef, 'content') {
$lei->{opt}->{dedupe} = $strat;
@@ -28,6 +34,8 @@ for my $strat (undef, 'content') {
ok(!$dd->is_dup($eml), "1st is_dup with $desc dedupe");
ok($dd->is_dup($eml), "2nd seen with $desc dedupe");
ok(!$dd->is_dup($different), "different is_dup with $desc dedupe");
+ ok(!$dd->is_smsg_dup($smsg), "is_smsg_dup pass w/ $desc dedupe");
+ ok($dd->is_smsg_dup($smsg), "is_smsg_dup reject w/ $desc dedupe");
}
$lei->{opt}->{dedupe} = 'bogus';
eval { PublicInbox::LeiDedupe->new($lei) };
@@ -39,6 +47,8 @@ $dd->prepare_dedupe;
ok(!$dd->is_dup($eml), '1st is_dup with mid dedupe');
ok($dd->is_dup($eml), '2nd seen with mid dedupe');
ok($dd->is_dup($different), 'different seen with mid dedupe');
+ok(!$dd->is_smsg_dup($smsg), 'smsg mid dedupe pass');
+ok($dd->is_smsg_dup($smsg), 'smsg mid dedupe reject');
$lei->{opt}->{dedupe} = 'oid';
$dd = PublicInbox::LeiDedupe->new($lei);
@@ -56,4 +66,8 @@ ok($dd->is_dup($different, '01d'), 'different content ignored if oid matches');
ok($dd->is_dup($eml, '01D'), 'case insensitive oid comparison :P');
ok(!$dd->is_dup($eml, '01dbad'), 'case insensitive oid comparison :P');
+$smsg->{blob} = 'dead';
+ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe pass');
+ok($dd->is_smsg_dup($smsg), 'smsg dedupe reject');
+
done_testing;
next prev parent reply other threads:[~2021-01-10 12:15 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-01-10 12:14 [PATCH 00/22] lei query overview views Eric Wong
2021-01-10 12:14 ` [PATCH 01/22] lei query + pagination sorta working Eric Wong
2021-01-10 12:14 ` Eric Wong [this message]
2021-01-10 12:15 ` [PATCH 03/22] ds: block signals when reaping Eric Wong
2021-01-10 12:15 ` [PATCH 04/22] ipc: add support for asynchronous callbacks Eric Wong
2021-01-10 12:15 ` [PATCH 05/22] cmd_ipc: send FDs with buffer payload Eric Wong
2021-01-10 12:15 ` [PATCH 06/22] ipc: avoid excessive evals Eric Wong
2021-01-10 12:15 ` [PATCH 07/22] ipc: work queue support via SOCK_SEQPACKET Eric Wong
2021-01-10 12:15 ` [PATCH 08/22] ipc: eliminate ipc_worker_stop method Eric Wong
2021-01-10 12:15 ` [PATCH 09/22] ipc: wq: support dynamic worker count change Eric Wong
2021-01-10 12:15 ` [PATCH 10/22] ipc: drop -ipc_parent_pid field Eric Wong
2021-01-10 12:15 ` [PATCH 11/22] ipc: DESTROY and wq_workers methods Eric Wong
2021-01-10 12:15 ` [PATCH 12/22] lei: rename $w to $wpager for warning message Eric Wong
2021-01-10 12:15 ` [PATCH 13/22] lei: fix oneshot TTY detection by passing STD*{GLOB} Eric Wong
2021-01-10 12:15 ` [PATCH 14/22] lei: query: ensure pager exit is instantaneous Eric Wong
2021-01-10 12:15 ` [PATCH 15/22] ipc: start supporting sending/receiving more than 3 FDs Eric Wong
2021-01-10 12:15 ` [PATCH 16/22] ipc: fix IO::FDPass use with a worker limit of 1 Eric Wong
2021-01-10 12:15 ` [PATCH 17/22] ipc: drop unused fields, default sighandlers for wq Eric Wong
2021-01-10 12:15 ` [PATCH 18/22] lei: get rid of client {pid} field Eric Wong
2021-01-10 12:15 ` [PATCH 19/22] lei: fork + FD cleanup Eric Wong
2021-01-10 12:15 ` [PATCH 20/22] lei: run pager in client script Eric Wong
2021-01-10 12:15 ` [PATCH 21/22] lei_xsearch: transfer 4 FDs internally, drop IO::FDPass Eric Wong
2021-01-10 12:15 ` [PATCH 22/22] lei: query: restore JSON output overview Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210110121519.17044-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).