about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2021-01-10 12:14:59 +0000
committer Eric Wong <e@80x24.org> 2021-01-12 03:51:42 +0000
commit f4cf089b427d07bedb80fcfbe79d84234ad92a75 (patch)
tree 2aa3f5dea1dc8dc7ae754ea80b27738142cb8820 /lib/PublicInbox
parent 4ff570e5c3cfb33aff3ca6ad674958d9dd2abda9 (diff)
download public-inbox-f4cf089b427d07bedb80fcfbe79d84234ad92a75.tar.gz
We don't want duplicate messages in results overviews, either.
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- lib/PublicInbox/LeiDedupe.pm 29
-rw-r--r-- lib/PublicInbox/LeiQuery.pm 5
2 files changed, 33 insertions, 1 deletion
diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm
index c4e5dffb..58eee533 100644
--- a/lib/PublicInbox/LeiDedupe.pm
+++ b/lib/PublicInbox/LeiDedupe.pm
@@ -33,12 +33,24 @@ sub _regen_oid ($) {
 
 sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef }
 
+sub smsg_hash ($) {
+        my ($smsg) = @_;
+        my $dig = Digest::SHA->new(256);
+        my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)});
+        utf8::encode($x);
+        $dig->add($x);
+        $dig->digest;
+}
+
 # the paranoid option
 sub dedupe_oid () {
         my $skv = PublicInbox::SharedKV->new;
         ($skv, sub { # may be called in a child process
                 my ($eml, $oid) = @_;
                 $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), '');
+        }, sub {
+                my ($smsg) = @_;
+                $skv->set_maybe(_oidbin($smsg->{blob}), '');
         });
 }
 
@@ -51,6 +63,12 @@ sub dedupe_mid () {
                 my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) //
                         content_hash($eml);
                 $skv->set_maybe($mid, '');
+        }, sub {
+                my ($smsg) = @_;
+                my $mid = $smsg->{mid};
+                $mid = undef if $mid eq '';
+                $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob});
+                $skv->set_maybe($mid, '');
         });
 }
 
@@ -60,11 +78,15 @@ sub dedupe_content () {
         ($skv, sub { # may be called in a child process
                 my ($eml) = @_; # oid = $_[1], ignored
                 $skv->set_maybe(content_hash($eml), '');
+        }, sub {
+                my ($smsg) = @_;
+                $skv->set_maybe(smsg_hash($smsg), '');
         });
 }
 
 # no deduplication at all
-sub dedupe_none () { (undef, sub { 1 }) }
+sub true { 1 }
+sub dedupe_none () { (undef, \&true, \&true) }
 
 sub new {
         my ($cls, $lei, $dst) = @_;
@@ -85,6 +107,11 @@ sub is_dup {
         !$self->[1]->($eml, $oid);
 }
 
+sub is_smsg_dup {
+        my ($self, $smsg) = @_;
+        !$self->[2]->($smsg);
+}
+
 sub prepare_dedupe {
         my ($self) = @_;
         my $skv = $self->[0];
diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm
index d14da1bc..f69dccad 100644
--- a/lib/PublicInbox/LeiQuery.pm
+++ b/lib/PublicInbox/LeiQuery.pm
@@ -69,6 +69,8 @@ sub lei_q {
         } @argv);
         $opt->{limit} //= 10000;
         my $lxs;
+        require PublicInbox::LeiDedupe;
+        my $dd = PublicInbox::LeiDedupe->new($self);
 
         # --local is enabled by default
         my @src = $opt->{'local'} ? ($sto->search) : ();
@@ -135,6 +137,7 @@ sub lei_q {
                 delete @$smsg{qw(tid num)}; # only makes sense if single src
                 chomp($buf = $json->encode(_smsg_unbless($smsg)));
         };
+        $dd->prepare_dedupe;
         for my $src (@src) {
                 my $srch = $src->search;
                 my $over = $src->over;
@@ -145,6 +148,7 @@ sub lei_q {
                 if ($smsg_for) {
                         for my $it ($mset->items) {
                                 my $smsg = $smsg_for->($srch, $it) or next;
+                                next if $dd->is_smsg_dup($smsg);
                                 $self->out($buf .= $ORS) if defined $buf;
                                 $smsg->{relevance} = get_pct($it);
                                 $emit_cb->($smsg);
@@ -160,6 +164,7 @@ sub lei_q {
                         while ($over && $over->expand_thread($ctx)) {
                                 for my $n (@{$ctx->{xids}}) {
                                         my $t = $over->get_art($n) or next;
+                                        next if $dd->is_smsg_dup($t);
                                         if (my $p = delete $n2p{$t->{num}}) {
                                                 $t->{relevance} = $p;
                                         }