about summary refs log tree commit homepage
path: root/lib/PublicInbox/OverIdx.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/OverIdx.pm')
-rw-r--r-- lib/PublicInbox/OverIdx.pm 103
1 files changed, 41 insertions, 62 deletions
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index e0893337..4f8533f7 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI
@@ -17,6 +17,7 @@ use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::Smsg qw(subject_normalized);
 use Compress::Zlib qw(compress);
 use Carp qw(croak);
+use bytes (); # length
 
 sub dbh_new {
         my ($self) = @_;
@@ -158,7 +159,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1
 
                 foreach (@$nums) {
                         $sth->execute($_->[0]);
-                        my $smsg = $sth->fetchrow_hashref;
+                        # $cb may delete rows and invalidate nums
+                        my $smsg = $sth->fetchrow_hashref // next;
                         $smsg = PublicInbox::Over::load_from_row($smsg);
                         $cb->($self, $smsg, @arg) or return;
                 }
@@ -198,7 +200,7 @@ sub resolve_mid_to_tid {
         $tid // do { # create a new ghost
                 my $id = mid2id($self, $mid);
                 my $num = next_ghost_num($self);
-                $num < 0 or die "ghost num is non-negative: $num\n";
+                $num < 0 or croak "BUG: ghost num is non-negative: $num\n";
                 $tid = next_tid($self);
                 my $dbh = $self->{dbh};
                 $dbh->prepare_cached(<<'')->execute($num, $tid);
@@ -243,12 +245,13 @@ sub link_refs {
         $tid;
 }
 
-# normalize subjects so they are suitable as pathnames for URLs
-# XXX: consider for removal
+# normalize subjects somewhat, they used to be ASCII-only but now
+# we use \w for UTF-8 support.  We may still drop it entirely and
+# rely on Xapian for subject matches...
 sub subject_path ($) {
         my ($subj) = @_;
         $subj = subject_normalized($subj);
-        $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g;
+        $subj =~ s![^\w\.~/\-]+!_!g;
         lc($subj);
 }
 
@@ -261,7 +264,10 @@ sub ddd_for ($) {
 
 sub add_overview {
         my ($self, $eml, $smsg) = @_;
-        $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!;
+        my $raw = $eml->body_raw;
+        $smsg->{lines} = $raw =~ tr!\n!\n!;
+        $smsg->{bytes} //= bytes::length $raw;
+        undef $raw;
         my $mids = mids_for_index($eml);
         my $refs = $smsg->parse_references($eml, $mids);
         $mids->[0] //= do {
@@ -281,7 +287,7 @@ sub _add_over {
         my ($self, $smsg, $mid, $refs, $old_tid, $v) = @_;
         my $cur_tid = $smsg->{tid};
         my $n = $smsg->{num};
-        die "num must not be zero for $mid" if !$n;
+        croak "BUG: num must not be zero for $mid" if !$n;
         my $cur_valid = $cur_tid > $self->{min_tid};
 
         if ($n > 0) { # regular mail
@@ -432,6 +438,7 @@ sub commit_lazy {
         my ($self) = @_;
         delete $self->{txn} or return;
         $self->{dbh}->commit;
+        eval { $self->{dbh}->do('PRAGMA optimize') };
 }
 
 sub begin_lazy {
@@ -451,7 +458,7 @@ sub rollback_lazy {
 
 sub dbh_close {
         my ($self) = @_;
-        die "in transaction" if $self->{txn};
+        Carp::confess('BUG: in transaction') if $self->{txn};
         $self->SUPER::dbh_close;
 }
 
@@ -506,18 +513,18 @@ EOF
                                 next;
                         }
                         $pr->(<<EOM) if $pr;
-I: ghost $r->{num} <$mid> THREADID=$r->{tid} culled
+# ghost $r->{num} <$mid> THREADID=$r->{tid} culled
 EOM
                 }
                 delete_by_num($self, $r->{num});
         }
-        $pr->("I: rethread culled $total ghosts\n") if $pr && $total;
+        $pr->("# rethread culled $total ghosts\n") if $pr && $total;
 }
 
 # used for cross-inbox search
 sub eidx_prep ($) {
         my ($self) = @_;
-        $self->{-eidx_prep} //= do {
+        $self->{-eidx_prep} // do {
                 my $dbh = $self->dbh;
                 $dbh->do(<<'');
 INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid')
@@ -541,9 +548,13 @@ CREATE TABLE IF NOT EXISTS xref3 (
         $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
 
         # performance critical, this is not UNIQUE since we may need to
-        # tolerate some old bugs from indexing mirrors
-        $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
-                'xref3 (oidbin,xnum,ibx_id)');
+        # tolerate some old bugs from indexing mirrors.  n.b. we used
+        # to index oidbin here, but leaving it out speeds up reindexing
+        # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin
+        $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '.
+                'xref3 (xnum,ibx_id)');
+
+        $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)');
 
                 $dbh->do(<<'');
 CREATE TABLE IF NOT EXISTS eidx_meta (
@@ -558,7 +569,7 @@ CREATE TABLE IF NOT EXISTS eidx_meta (
                 $dbh->do(<<'');
 CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL)
 
-                1;
+                $self->{-eidx_prep} = 1;
         };
 }
 
@@ -600,50 +611,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
         $sth->execute;
 }
 
-# returns remaining reference count to $docid
-sub remove_xref3 {
-        my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
-        begin_lazy($self);
-        my $oidbin = pack('H*', $oidhex);
-        my ($sth, $ibx_id);
-        if (defined $eidx_key) {
-                $ibx_id = ibx_id($self, $eidx_key);
-                $sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
-
-                $sth->bind_param(1, $docid);
-                $sth->bind_param(2, $ibx_id);
-                $sth->bind_param(3, $oidbin, SQL_BLOB);
-        } else {
-                $sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
-
-                $sth->bind_param(1, $docid);
-                $sth->bind_param(2, $oidbin, SQL_BLOB);
-        }
-        $sth->execute;
-        $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ?
-
-        $sth->execute($docid);
-        my $nr = $sth->fetchrow_array;
-        if ($nr == 0) {
-                delete_by_num($self, $docid);
-        } elsif (defined($ibx_id) && $rm_eidx_info) {
-                # if deduplication rules in ContentHash change, it's
-                # possible a docid can have multiple rows with the
-                # same ibx_id.  This governs whether or not we call
-                # ->shard_remove_eidx_info in ExtSearchIdx.
-                $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
-
-                $sth->execute($docid, $ibx_id);
-                my $count = $sth->fetchrow_array;
-                $$rm_eidx_info = ($count == 0);
-        }
-        $nr;
-}
-
 # for when an xref3 goes missing, this does NOT update {ts}
 sub update_blob {
         my ($self, $smsg, $oidhex) = @_;
@@ -657,8 +624,7 @@ UPDATE over SET ddd = ? WHERE num = ?
 }
 
 sub merge_xref3 { # used for "-extindex --dedupe"
-        my ($self, $keep_docid, $drop_docid, $oidhex) = @_;
-        my $oidbin = pack('H*', $oidhex);
+        my ($self, $keep_docid, $drop_docid, $oidbin) = @_;
         my $sth = $self->{dbh}->prepare_cached(<<'');
 UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ?
 
@@ -708,4 +674,17 @@ sub vivify_xvmd {
         $smsg->{-vivify_xvmd} = \@vivify_xvmd;
 }
 
+sub fork_ok {
+        state $fork_ok = eval("v$DBD::SQLite::sqlite_version") ge v3.8.3;
+        return 1 if $fork_ok;
+        my ($opt) = @_;
+        my @j = split(/,/, $opt->{jobs} // '');
+        state $warned;
+        grep { $_ > 1 } @j and $warned //= warn(<<EOM);
+DBD::SQLite version is v$DBD::SQLite::sqlite_version, need >= v3.8.3 for --jobs > 1
+EOM
+        $opt->{jobs} = '1,1';
+        undef;
+}
+
 1;