about summary refs log tree commit homepage
path: root/lib/PublicInbox/V2Writable.pm
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-01-05 01:29:10 +0000
committerEric Wong <e@80x24.org>2021-01-09 18:09:54 +0000
commit392533147f50061d93cb9ed82abf98067dde5472 (patch)
treef214c7dd7cc8c6a8121e71b454d1ad3dd005270c /lib/PublicInbox/V2Writable.pm
parent348e7a39627fd40d7cc0ca1be01412da51a71352 (diff)
downloadpublic-inbox-392533147f50061d93cb9ed82abf98067dde5472.tar.gz
We've always temporarily unindexed messages before reindexing
them again if there's discontiguous history.

This change improves the mechanism we use to prevent NNTP and
IMAP clients from seeing duplicate messages.

Previously, we relied on mapping Message-IDs to NNTP article
numbers to ensure clients would not see the same message twice.
This worked for most messages, but not for messages with
reused or duplicate Message-IDs.

Instead of relying on Message-IDs as a key, we now rely on the
git blob object ID for exact content matching.  This allows
truly different messages to show up for NNTP|IMAP clients, while
still preventing those clients from seeing the same message again.
Diffstat (limited to 'lib/PublicInbox/V2Writable.pm')
-rw-r--r--lib/PublicInbox/V2Writable.pm21
1 files changed, 14 insertions, 7 deletions
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 459c7e86..54004fd7 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -888,12 +888,16 @@ sub index_oid { # cat_async callback
         }
 
         # {unindexed} is unlikely
-        if ((my $unindexed = $arg->{unindexed}) && scalar(@$mids) == 1) {
-                $num = delete($unindexed->{$mids->[0]});
+        if (my $unindexed = $arg->{unindexed}) {
+                my $oidbin = pack('H*', $oid);
+                my $u = $unindexed->{$oidbin};
+                ($num, $mid0) = splice(@$u, 0, 2) if $u;
                 if (defined $num) {
-                        $mid0 = $mids->[0];
                         $self->{mm}->mid_set($num, $mid0);
-                        delete($arg->{unindexed}) if !keys(%$unindexed);
+                        if (scalar(@$u) == 0) { # done with current OID
+                                delete $unindexed->{$oidbin};
+                                delete($arg->{unindexed}) if !keys(%$unindexed);
+                        }
                 }
         }
         if (!defined($num)) { # reuse if reindexing (or duplicates)
@@ -1160,10 +1164,13 @@ sub unindex_oid ($$;$) { # git->cat_async callback
                         warn "BUG: multiple articles linked to $oid\n",
                                 join(',',sort keys %gone), "\n";
                 }
-                foreach my $num (keys %gone) {
+                # reuse (num => mid) mapping in ascending numeric order
+                for my $num (sort { $a <=> $b } keys %gone) {
+                        $num += 0;
                         if ($unindexed) {
                                 my $mid0 = $mm->mid_for($num);
-                                $unindexed->{$mid0} = $num;
+                                my $oidbin = pack('H*', $oid);
+                                push @{$unindexed->{$oidbin}}, $num, $mid0;
                         }
                         $mm->num_delete($num);
                 }
@@ -1179,7 +1186,7 @@ sub git { $_[0]->{ibx}->git }
 sub unindex_todo ($$$) {
         my ($self, $sync, $unit) = @_;
         my $unindex_range = delete($unit->{unindex_range}) // return;
-        my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num
+        my $unindexed = $sync->{unindexed} //= {}; # $oidbin => [$num, $mid0]
         my $before = scalar keys %$unindexed;
         # order does not matter, here:
         my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color