diff options
author | Eric Wong <e@80x24.org> | 2021-01-05 01:29:10 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2021-01-09 18:09:54 +0000 |
commit | 392533147f50061d93cb9ed82abf98067dde5472 (patch) | |
tree | f214c7dd7cc8c6a8121e71b454d1ad3dd005270c /lib/PublicInbox/V2Writable.pm | |
parent | 348e7a39627fd40d7cc0ca1be01412da51a71352 (diff) | |
download | public-inbox-392533147f50061d93cb9ed82abf98067dde5472.tar.gz |
We've always temporarily unindexed messages before reindexing them again if there's discontiguous history. This change improves the mechanism we use to prevent NNTP and IMAP clients from seeing duplicate messages. Previously, we relied on mapping Message-IDs to NNTP article numbers to ensure clients would not see the same message twice. This worked for most messages, but not for messages with reused or duplicate Message-IDs. Instead of relying on Message-IDs as a key, we now rely on the git blob object ID for exact content matching. This allows truly different messages to show up for NNTP and IMAP clients, while still preventing those clients from seeing the same message again.
Diffstat (limited to 'lib/PublicInbox/V2Writable.pm')
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 21 |
1 files changed, 14 insertions, 7 deletions
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 459c7e86..54004fd7 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -888,12 +888,16 @@ sub index_oid { # cat_async callback } # {unindexed} is unlikely - if ((my $unindexed = $arg->{unindexed}) && scalar(@$mids) == 1) { - $num = delete($unindexed->{$mids->[0]}); + if (my $unindexed = $arg->{unindexed}) { + my $oidbin = pack('H*', $oid); + my $u = $unindexed->{$oidbin}; + ($num, $mid0) = splice(@$u, 0, 2) if $u; if (defined $num) { - $mid0 = $mids->[0]; $self->{mm}->mid_set($num, $mid0); - delete($arg->{unindexed}) if !keys(%$unindexed); + if (scalar(@$u) == 0) { # done with current OID + delete $unindexed->{$oidbin}; + delete($arg->{unindexed}) if !keys(%$unindexed); + } } } if (!defined($num)) { # reuse if reindexing (or duplicates) @@ -1160,10 +1164,13 @@ sub unindex_oid ($$;$) { # git->cat_async callback warn "BUG: multiple articles linked to $oid\n", join(',',sort keys %gone), "\n"; } - foreach my $num (keys %gone) { + # reuse (num => mid) mapping in ascending numeric order + for my $num (sort { $a <=> $b } keys %gone) { + $num += 0; if ($unindexed) { my $mid0 = $mm->mid_for($num); - $unindexed->{$mid0} = $num; + my $oidbin = pack('H*', $oid); + push @{$unindexed->{$oidbin}}, $num, $mid0; } $mm->num_delete($num); } @@ -1179,7 +1186,7 @@ sub git { $_[0]->{ibx}->git } sub unindex_todo ($$$) { my ($self, $sync, $unit) = @_; my $unindex_range = delete($unit->{unindex_range}) // return; - my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num + my $unindexed = $sync->{unindexed} //= {}; # $oidbin => [$num, $mid0] my $before = scalar keys %$unindexed; # order does not matter, here: my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color |