about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
authorEric Wong <e@yhbt.net>2020-07-24 05:55:47 +0000
committerEric Wong <e@yhbt.net>2020-07-25 20:48:18 +0000
commitb02c7a346bd36f1325518ca110a781c619082da1 (patch)
tree86b3a2868db2996e864bfa94056addc511ae5596 /lib
parent0ad076b1943bf7c65868d1db3cef4bd0ea1ef49a (diff)
downloadpublic-inbox-b02c7a346bd36f1325518ca110a781c619082da1.tar.gz
Older versions of public-inbox < 1.3.0 had subtly
different semantics around threading in some corner
cases.  This switch (when combined with --reindex)
allows us to fix them by regenerating associations.
Diffstat (limited to 'lib')
-rw-r--r--lib/PublicInbox/OverIdx.pm76
-rw-r--r--lib/PublicInbox/SearchIdx.pm7
-rw-r--r--lib/PublicInbox/V2Writable.pm4
3 files changed, 79 insertions, 8 deletions
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 5601e602..c57be724 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -17,6 +17,7 @@ use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::Smsg qw(subject_normalized);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
+use Carp qw(croak);
 
 sub dbh_new {
         my ($self) = @_;
@@ -37,6 +38,13 @@ sub dbh_new {
         $dbh;
 }
 
+sub new {
+        my ($class, $f) = @_;
+        my $self = $class->SUPER::new($f);
+        $self->{min_tid} = 0;
+        $self;
+}
+
 sub get_counter ($$) {
         my ($dbh, $key) = @_;
         my $sth = $dbh->prepare_cached(<<'', undef, 1);
@@ -164,8 +172,12 @@ sub _resolve_mid_to_tid {
         my $cur_tid = $smsg->{tid};
         if (defined $$tid) {
                 merge_threads($self, $$tid, $cur_tid);
-        } else {
+        } elsif ($cur_tid > $self->{min_tid}) {
                 $$tid = $cur_tid;
+        } else { # rethreading, queue up dead ghosts
+                $$tid = next_tid($self);
+                my $num = $smsg->{num};
+                push(@{$self->{-ghosts_to_delete}}, $num) if $num < 0;
         }
         1;
 }
@@ -175,7 +187,10 @@ sub resolve_mid_to_tid {
         my ($self, $mid) = @_;
         my $tid;
         each_by_mid($self, $mid, ['tid'], \&_resolve_mid_to_tid, \$tid);
-        defined $tid ? $tid : create_ghost($self, $mid);
+        if (my $del = delete $self->{-ghosts_to_delete}) {
+                delete_by_num($self, $_) for @$del;
+        }
+        $tid // create_ghost($self, $mid);
 }
 
 sub create_ghost {
@@ -221,7 +236,7 @@ sub link_refs {
                         merge_threads($self, $tid, $ptid);
                 }
         } else {
-                $tid = defined $old_tid ? $old_tid : next_tid($self);
+                $tid = $old_tid // next_tid($self);
         }
         $tid;
 }
@@ -278,10 +293,17 @@ sub _add_over {
         my $cur_tid = $smsg->{tid};
         my $n = $smsg->{num};
         die "num must not be zero for $mid" if !$n;
-        $$old_tid = $cur_tid unless defined $$old_tid;
+        my $cur_valid = $cur_tid > $self->{min_tid};
+
         if ($n > 0) { # regular mail
-                merge_threads($self, $$old_tid, $cur_tid);
+                if ($cur_valid) {
+                        $$old_tid //= $cur_tid;
+                        merge_threads($self, $$old_tid, $cur_tid);
+                } else {
+                        $$old_tid //= next_tid($self);
+                }
         } elsif ($n < 0) { # ghost
+                $$old_tid //= $cur_valid ? $cur_tid : next_tid($self);
                 link_refs($self, $refs, $$old_tid);
                 delete_by_num($self, $n);
                 $$v++;
@@ -297,6 +319,7 @@ sub add_over {
 
         begin_lazy($self);
         delete_by_num($self, $num, \$old_tid);
+        $old_tid = undef if ($old_tid // 0) <= $self->{min_tid};
         foreach my $mid (@$mids) {
                 my $v = 0;
                 each_by_mid($self, $mid, ['tid'], \&_add_over,
@@ -456,4 +479,47 @@ sub create {
         $self->disconnect;
 }
 
+sub rethread_prepare {
+        my ($self, $opt) = @_;
+        return unless $opt->{rethread};
+        begin_lazy($self);
+        my $min = $self->{min_tid} = get_counter($self->{dbh}, 'thread') // 0;
+        my $pr = $opt->{-progress};
+        $pr->("rethread min THREADID ".($min + 1)."\n") if $pr && $min;
+}
+
+sub rethread_done {
+        my ($self, $opt) = @_;
+        return unless $opt->{rethread} && $self->{txn};
+        defined(my $min = $self->{min_tid}) or croak('BUG: no min_tid');
+        my $dbh = $self->{dbh} or croak('BUG: no dbh');
+        my $rows = $dbh->selectall_arrayref(<<'', { Slice => {} }, $min);
+SELECT num,tid FROM over WHERE num < 0 AND tid < ?
+
+        my $show_id = $dbh->prepare('SELECT id FROM id2num WHERE num = ?');
+        my $show_mid = $dbh->prepare('SELECT mid FROM msgid WHERE id = ?');
+        my $pr = $opt->{-progress};
+        my $total = 0;
+        for my $r (@$rows) {
+                my $exp = 0;
+                $show_id->execute($r->{num});
+                while (defined(my $id = $show_id->fetchrow_array)) {
+                        ++$exp;
+                        $show_mid->execute($id);
+                        my $mid = $show_mid->fetchrow_array;
+                        if (!defined($mid)) {
+                                warn <<EOF;
+E: ghost NUM=$r->{num} ID=$id THREADID=$r->{tid} has no Message-ID
+EOF
+                                next;
+                        }
+                        $pr->(<<EOM) if $pr;
+I: ghost $r->{num} <$mid> THREADID=$r->{tid} culled
+EOM
+                }
+                delete_by_num($self, $r->{num});
+        }
+        $pr->("I: rethread culled $total ghosts\n") if $pr && $total;
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 83162509..e641ffd4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -723,6 +723,7 @@ sub _index_sync {
         my $pr = $opts->{-progress};
 
         my $xdb = $self->begin_txn_lazy;
+        $self->{over}->rethread_prepare($opts);
         my $mm = _msgmap_init($self);
         do {
                 $xlog = undef; # stop previous git-log via SIGPIPE
@@ -761,12 +762,14 @@ sub _index_sync {
                                 $xdb->set_metadata('last_commit', $newest);
                         }
                 }
+
+                $self->{over}->rethread_done($opts) if $newest; # all done
                 $self->commit_txn_lazy;
                 $git->cleanup;
                 $xdb = _xdb_release($self, $nr);
-                # let another process do some work... <
+                # let another process do some work...
                 $pr->("indexed $nr/$self->{ntodo}\n") if $pr && $nr;
-                if (!$newest) {
+                if (!$newest) { # more to come
                         $xdb = $self->begin_txn_lazy;
                         $dbh->begin_work if $dbh;
                 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0582dd5e..16556ddc 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1308,6 +1308,7 @@ sub index_sync {
         my $latest = git_dir_latest($self, \$epoch_max);
         return unless defined $latest;
         $self->idx_init($opt); # acquire lock
+        $self->{over}->rethread_prepare($opt);
         my $sync = {
                 D => {}, # "$mid\0$chash" => $oid
                 unindex_range => {}, # EPOCH => oid_old..oid_new
@@ -1370,12 +1371,13 @@ sub index_sync {
                 my $pr = $sync->{-opt}->{-progress};
                 $pr->('all.git '.sprintf($sync->{-regen_fmt}, $nr)) if $pr;
         }
+        $self->{over}->rethread_done($opt);
 
         # reindex does not pick up new changes, so we rerun w/o it:
         if ($opt->{reindex}) {
                 my %again = %$opt;
                 $sync = undef;
-                delete @again{qw(reindex -skip_lock)};
+                delete @again{qw(rethread reindex -skip_lock)};
                 index_sync($self, \%again);
         }
 }