about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2021-10-11 08:06:15 +0000
committer Eric Wong <e@80x24.org> 2021-10-12 03:40:28 +0000
commit c4a4e9809ddd10f2094e1b088728101ace89297f (patch)
tree d5c07be0090d45ed75137adee1067b3735d19fa7 /lib/PublicInbox
parent c9567f1e142931cf4c5f092ad1ec5904f7c5bdc1 (diff)
download public-inbox-c4a4e9809ddd10f2094e1b088728101ace89297f.tar.gz
This required some tweaking of xref3 indices in over.sqlite3,
but the end result is it brings no-op "--reindex --fast --all"
checks down to roughly 20 minutes (from 30-40 minutes) on
lore/all.

This is faster because a bunch of small SQLite queries are still
slower en masse than a bunch of Perl ops.  Despite the lack of IPC
overhead, crossing .so boundaries and repeating lookups over
btrees is still slower than doing the same with Perl hash tables.
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- lib/PublicInbox/ExtSearchIdx.pm 171
-rw-r--r-- lib/PublicInbox/Over.pm 4
-rw-r--r-- lib/PublicInbox/OverIdx.pm 10
3 files changed, 94 insertions, 91 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d589d2c0..8da98ba4 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -807,10 +807,53 @@ sub reindex_unseen ($$$$) {
         $self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req);
 }
 
-sub _reindex_check_unseen ($$$) {
+sub _unref_stale ($$$$$) {
+        my ($sync, $docid, $ibx, $xnum, $oidbin) = @_;
+        my $del = $sync->{self}->{oidx}->dbh->prepare_cached(<<'');
+DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
+
+        $del->bind_param(1, $ibx->{-ibx_id});
+        $del->bind_param(2, $xnum);
+        $del->bind_param(3, $oidbin, SQL_BLOB);
+        $del->execute;
+        my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
+        my $idx = $sync->{self}->idx_shard($docid);
+        if (scalar(@$xr3) == 0) { # all gone
+                $sync->{self}->{oidx}->delete_by_num($docid);
+                $sync->{self}->{oidx}->eidxq_del($docid);
+                $idx->ipc_do('xdb_remove', $docid);
+        } else { # enqueue for reindex of remaining messages
+                $idx->ipc_do('remove_eidx_info', $docid, $ibx->eidx_key);
+                $sync->{self}->{oidx}->eidxq_add($docid); # yes, add
+        }
+}
+
+sub _unref_stale_range ($$$) {
+        my ($sync, $ibx, $lt_or_gt) = @_;
+        my $r;
+        my $lim = 10000;
+        do {
+                $r = $sync->{self}->{oidx}->dbh->selectall_arrayref(
+                        <<EOS, undef, $ibx->{-ibx_id});
+SELECT docid,xnum,oidbin FROM xref3
+WHERE ibx_id = ? AND xnum $lt_or_gt LIMIT $lim
+EOS
+                return if $sync->{quit};
+                for (@$r) { # hopefully rare, not worth optimizing:
+                        my ($docid, $xnum, $oidbin) = @$_;
+                        my $hex = unpack('H*', $oidbin);
+                        warn("# $xnum:$hex (#$docid): stale\n");
+                        _unref_stale($sync, $docid, $ibx, $xnum, $oidbin);
+                }
+        } while (scalar(@$r) == $lim);
+        1;
+}
+
+sub _reindex_check_ibx ($$$) {
         my ($self, $sync, $ibx) = @_;
         my $ibx_id = $ibx->{-ibx_id};
-        my $slice = 1000;
+        my $slice = 10000;
+        my $opt = { limit => $slice };
         my ($beg, $end) = (1, $slice);
         my $err = sync_inbox($self, $sync, $ibx) and return;
         my $max = $ibx->over->max;
@@ -820,11 +863,12 @@ sub _reindex_check_unseen ($$$) {
         my $msgs;
         my $pr = $sync->{-opt}->{-progress};
         my $ekey = $ibx->eidx_key;
-        local $sync->{-regen_fmt} =
-                        "$ekey checking unseen %u/".$ibx->over->max."\n";
+        local $sync->{-regen_fmt} = "$ekey checking %u/$max\n";
         ${$sync->{nr}} = 0;
         my $fast = $sync->{-opt}->{fast};
-        while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) {
+        my $dsu; # _unref_stale_range (< $lo) called
+        my ($lo, $hi);
+        while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end, $opt)})) {
                 ${$sync->{nr}} = $beg;
                 $beg = $msgs->[-1]->{num} + 1;
                 $end = $beg + $slice;
@@ -832,92 +876,48 @@ sub _reindex_check_unseen ($$$) {
                 if (checkpoint_due($sync)) {
                         reindex_checkpoint($self, $sync); # release lock
                 }
-
-                my $inx3 = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
-SELECT DISTINCT(docid) FROM xref3 WHERE
-ibx_id = ? AND xnum = ? AND oidbin = ?
-
+                ($lo, $hi) = ($msgs->[0]->{num}, $msgs->[-1]->{num});
+                $dsu //= _unref_stale_range($sync, $ibx, "< $lo");
+                my $x3a = $self->{oidx}->dbh->selectall_arrayref(
+                        <<"", undef, $ibx_id, $lo, $hi);
+SELECT xnum,oidbin,docid FROM xref3 WHERE
+ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+                my %x3m;
+                for (@$x3a) {
+                        my $k = pack('J', $_->[0]) . $_->[1];
+                        push @{$x3m{$k}}, $_->[2];
+                }
+                undef $x3a;
                 for my $xsmsg (@$msgs) {
-                        my $oidbin = pack('H*', $xsmsg->{blob});
-                        $inx3->bind_param(1, $ibx_id);
-                        $inx3->bind_param(2, $xsmsg->{num});
-                        $inx3->bind_param(3, $oidbin, SQL_BLOB);
-                        $inx3->execute;
-                        my $docids = $inx3->fetchall_arrayref;
-                        # index messages which were totally missed
-                        # the first time around ASAP:
-                        if (scalar(@$docids) == 0) {
+                        my $k = pack('JH*', $xsmsg->{num}, $xsmsg->{blob});
+                        my $docids = delete($x3m{$k});
+                        if (!defined($docids)) {
                                 reindex_unseen($self, $sync, $ibx, $xsmsg);
-                        } elsif (!$fast) { # already seen, reindex later
-                                for my $r (@$docids) {
-                                        $self->{oidx}->eidxq_add($r->[0]);
+                        } elsif (!$fast) {
+                                for my $num (@$docids) {
+                                        $self->{oidx}->eidxq_add($num);
                                 }
+                                return if $sync->{quit};
                         }
-                        last if $sync->{quit};
-                }
-                last if $sync->{quit};
-        }
-}
-
-sub _reindex_check_stale ($$$) {
-        my ($self, $sync, $ibx) = @_;
-        my $min = 0;
-        my $pr = $sync->{-opt}->{-progress};
-        my $fetching;
-        my $ekey = $ibx->eidx_key;
-        local $sync->{-regen_fmt} =
-                        "$ekey checking stale/missing %u/".$ibx->over->max."\n";
-        ${$sync->{nr}} = 0;
-        do {
-                if (checkpoint_due($sync)) {
-                        reindex_checkpoint($self, $sync); # release lock
                 }
-                # now, check if there's stale xrefs
-                my $iter = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
-SELECT docid,xnum,oidbin FROM xref3 WHERE ibx_id = ? AND docid > ?
-ORDER BY docid,xnum ASC LIMIT 10000
-
-                $iter->execute($ibx->{-ibx_id}, $min);
-                $fetching = undef;
-
-                while (my ($docid, $xnum, $oidbin) = $iter->fetchrow_array) {
-                        return if $sync->{quit};
-                        ${$sync->{nr}} = $xnum;
-
-                        $fetching = $min = $docid;
-                        my $smsg = $ibx->over->get_art($xnum);
-                        my $err;
-                        if (!$smsg) {
-                                $err = 'stale';
-                        } elsif (pack('H*', $smsg->{blob}) ne $oidbin) {
-                                $err = "mismatch (!= $smsg->{blob})";
-                        } else {
-                                next; # likely, all good
-                        }
-                        # current_info already has eidx_key
-                        my $oidhex = unpack('H*', $oidbin);
-                        warn "$xnum:$oidhex (#$docid): $err\n";
-                        my $del = $self->{oidx}->dbh->prepare_cached(<<'');
-DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
-
-                        $del->bind_param(1, $ibx->{-ibx_id});
-                        $del->bind_param(2, $xnum);
-                        $del->bind_param(3, $oidbin, SQL_BLOB);
-                        $del->execute;
-
-                        # get_xref3 over-fetches, but this is a rare path:
-                        my $xr3 = $self->{oidx}->get_xref3($docid, 1);
-                        my $idx = $self->idx_shard($docid);
-                        if (scalar(@$xr3) == 0) { # all gone
-                                $self->{oidx}->delete_by_num($docid);
-                                $self->{oidx}->eidxq_del($docid);
-                                $idx->ipc_do('xdb_remove', $docid);
-                        } else { # enqueue for reindex of remaining messages
-                                $idx->ipc_do('remove_eidx_info', $docid, $ekey);
-                                $self->{oidx}->eidxq_add($docid); # yes, add
+                return if $sync->{quit};
+                next unless scalar keys %x3m;
+
+                # eliminate stale/mismatched entries
+                my %mismatch = map { $_->{num} => $_->{blob} } @$msgs;
+                while (my ($k, $docids) = each %x3m) {
+                        my ($xnum, $hex) = unpack('JH*', $k);
+                        my $bin = pack('H*', $hex);
+                        my $exp = $mismatch{$xnum};
+                        my $m = defined($exp) ? "mismatch (!= $exp)" : 'stale';
+                        warn("# $xnum:$hex (#@$docids): $m\n");
+                        for my $i (@$docids) {
+                                _unref_stale($sync, $i, $ibx, $xnum, $bin);
                         }
                 }
-        } while (defined $fetching);
+        }
+        _unref_stale_range($sync, $ibx, "> $hi") if defined($hi);
 }
 
 sub _reindex_inbox ($$$) {
@@ -927,8 +927,7 @@ sub _reindex_inbox ($$$) {
         if (defined(my $err = _ibx_index_reject($ibx))) {
                 warn "W: cannot reindex $ekey ($err)\n";
         } else {
-                _reindex_check_unseen($self, $sync, $ibx);
-                _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit};
+                _reindex_check_ibx($self, $sync, $ibx);
         }
         delete @$ibx{qw(over mm search git)}; # won't need these for a bit
 }
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 98de82c0..30ad949d 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -108,8 +108,8 @@ sub do_get {
 }
 
 sub query_xover {
-        my ($self, $beg, $end) = @_;
-        do_get($self, <<'', {}, $beg, $end);
+        my ($self, $beg, $end, $opt) = @_;
+        do_get($self, <<'', $opt, $beg, $end);
 SELECT num,ts,ds,ddd FROM over WHERE num >= ? AND num <= ?
 ORDER BY num ASC
 
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 985abbf4..46f7a066 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -543,9 +543,13 @@ CREATE TABLE IF NOT EXISTS xref3 (
         $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
 
         # performance critical, this is not UNIQUE since we may need to
-        # tolerate some old bugs from indexing mirrors
-        $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
-                'xref3 (oidbin,xnum,ibx_id)');
+        # tolerate some old bugs from indexing mirrors.  n.b. we used
+        # to index oidbin here, but leaving it out speeds up reindexing
+        # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin
+        $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '.
+                'xref3 (xnum,ibx_id)');
+
+        $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)');
 
                 $dbh->do(<<'');
 CREATE TABLE IF NOT EXISTS eidx_meta (