From 392533147f50061d93cb9ed82abf98067dde5472 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 5 Jan 2021 01:29:10 +0000 Subject: v2writable: exact discontiguous history handling We've always temporarily unindexeded messages before reindexing them again if there's discontiguous history. This change improves the mechanism we use to prevent NNTP and IMAP clients from seeing duplicate messages. Previously, we relied on mapping Message-IDs to NNTP article numbers to ensure clients would not see the same message twice. This worked for most messages, but not for for messages with reused or duplicate Message-IDs. Instead of relying on Message-IDs as a key, we now rely on the git blob object ID for exact content matching. This allows truly different messages to show up for NNTP|IMAP clients, while still those clients from seeing the message again. --- lib/PublicInbox/V2Writable.pm | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 459c7e86..54004fd7 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -888,12 +888,16 @@ sub index_oid { # cat_async callback } # {unindexed} is unlikely - if ((my $unindexed = $arg->{unindexed}) && scalar(@$mids) == 1) { - $num = delete($unindexed->{$mids->[0]}); + if (my $unindexed = $arg->{unindexed}) { + my $oidbin = pack('H*', $oid); + my $u = $unindexed->{$oidbin}; + ($num, $mid0) = splice(@$u, 0, 2) if $u; if (defined $num) { - $mid0 = $mids->[0]; $self->{mm}->mid_set($num, $mid0); - delete($arg->{unindexed}) if !keys(%$unindexed); + if (scalar(@$u) == 0) { # done with current OID + delete $unindexed->{$oidbin}; + delete($arg->{unindexed}) if !keys(%$unindexed); + } } } if (!defined($num)) { # reuse if reindexing (or duplicates) @@ -1160,10 +1164,13 @@ sub unindex_oid ($$;$) { # git->cat_async callback warn "BUG: multiple articles linked to $oid\n", join(',',sort keys %gone), "\n"; } - foreach my $num (keys %gone) { + # reuse (num => mid) mapping in ascending numeric order + for my $num (sort { $a <=> $b } keys %gone) { + $num += 0; if ($unindexed) { my $mid0 = $mm->mid_for($num); - $unindexed->{$mid0} = $num; + my $oidbin = pack('H*', $oid); + push @{$unindexed->{$oidbin}}, $num, $mid0; } $mm->num_delete($num); } @@ -1179,7 +1186,7 @@ sub git { $_[0]->{ibx}->git } sub unindex_todo ($$$) { my ($self, $sync, $unit) = @_; my $unindex_range = delete($unit->{unindex_range}) // return; - my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num + my $unindexed = $sync->{unindexed} //= {}; # $oidbin => [$num, $mid0] my $before = scalar keys %$unindexed; # order does not matter, here: my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color -- cgit v1.2.3-24-ge0c7