From f4cf089b427d07bedb80fcfbe79d84234ad92a75 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 10 Jan 2021 12:14:59 +0000 Subject: lei q: deduplicate smsg We don't want duplicate messages in results overviews, either. --- lib/PublicInbox/LeiDedupe.pm | 29 ++++++++++++++++++++++++++++- lib/PublicInbox/LeiQuery.pm | 5 +++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index c4e5dffb..58eee533 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -33,12 +33,24 @@ sub _regen_oid ($) { sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } +sub smsg_hash ($) { + my ($smsg) = @_; + my $dig = Digest::SHA->new(256); + my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); + utf8::encode($x); + $dig->add($x); + $dig->digest; +} + # the paranoid option sub dedupe_oid () { my $skv = PublicInbox::SharedKV->new; ($skv, sub { # may be called in a child process my ($eml, $oid) = @_; $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(_oidbin($smsg->{blob}), ''); }); } @@ -51,6 +63,12 @@ sub dedupe_mid () { my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // content_hash($eml); $skv->set_maybe($mid, ''); + }, sub { + my ($smsg) = @_; + my $mid = $smsg->{mid}; + $mid = undef if $mid eq ''; + $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob}); + $skv->set_maybe($mid, ''); }); } @@ -60,11 +78,15 @@ sub dedupe_content () { ($skv, sub { # may be called in a child process my ($eml) = @_; # oid = $_[1], ignored $skv->set_maybe(content_hash($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(smsg_hash($smsg), ''); }); } # no deduplication at all -sub dedupe_none () { (undef, sub { 1 }) } +sub true { 1 } +sub dedupe_none () { (undef, \&true, \&true) } sub new { my ($cls, $lei, $dst) = @_; @@ -85,6 +107,11 @@ sub is_dup { !$self->[1]->($eml, $oid); } +sub is_smsg_dup { + my ($self, $smsg) = @_; + !$self->[2]->($smsg); +} + sub prepare_dedupe { my ($self) = @_; my $skv = $self->[0]; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm index d14da1bc..f69dccad 100644 --- a/lib/PublicInbox/LeiQuery.pm +++ b/lib/PublicInbox/LeiQuery.pm @@ -69,6 +69,8 @@ sub lei_q { } @argv); $opt->{limit} //= 10000; my $lxs; + require PublicInbox::LeiDedupe; + my $dd = PublicInbox::LeiDedupe->new($self); # --local is enabled by default my @src = $opt->{'local'} ? ($sto->search) : (); @@ -135,6 +137,7 @@ sub lei_q { delete @$smsg{qw(tid num)}; # only makes sense if single src chomp($buf = $json->encode(_smsg_unbless($smsg))); }; + $dd->prepare_dedupe; for my $src (@src) { my $srch = $src->search; my $over = $src->over; @@ -145,6 +148,7 @@ sub lei_q { if ($smsg_for) { for my $it ($mset->items) { my $smsg = $smsg_for->($srch, $it) or next; + next if $dd->is_smsg_dup($smsg); $self->out($buf .= $ORS) if defined $buf; $smsg->{relevance} = get_pct($it); $emit_cb->($smsg); @@ -160,6 +164,7 @@ sub lei_q { while ($over && $over->expand_thread($ctx)) { for my $n (@{$ctx->{xids}}) { my $t = $over->get_art($n) or next; + next if $dd->is_smsg_dup($t); if (my $p = delete $n2p{$t->{num}}) { $t->{relevance} = $p; } -- cgit v1.2.3-24-ge0c7