diff options
Diffstat (limited to 'lib/PublicInbox/OverIdx.pm')
-rw-r--r-- | lib/PublicInbox/OverIdx.pm | 103 |
1 files changed, 41 insertions, 62 deletions
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index e0893337..4f8533f7 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -17,6 +17,7 @@ use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); use Carp qw(croak); +use bytes (); # length sub dbh_new { my ($self) = @_; @@ -158,7 +159,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1 foreach (@$nums) { $sth->execute($_->[0]); - my $smsg = $sth->fetchrow_hashref; + # $cb may delete rows and invalidate nums + my $smsg = $sth->fetchrow_hashref // next; $smsg = PublicInbox::Over::load_from_row($smsg); $cb->($self, $smsg, @arg) or return; } @@ -198,7 +200,7 @@ sub resolve_mid_to_tid { $tid // do { # create a new ghost my $id = mid2id($self, $mid); my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; + $num < 0 or croak "BUG: ghost num is non-negative: $num\n"; $tid = next_tid($self); my $dbh = $self->{dbh}; $dbh->prepare_cached(<<'')->execute($num, $tid); @@ -243,12 +245,13 @@ sub link_refs { $tid; } -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } @@ -261,7 +264,10 @@ sub ddd_for ($) { sub add_overview { my ($self, $eml, $smsg) = @_; - $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; + my $raw = $eml->body_raw; + $smsg->{lines} = $raw =~ tr!\n!\n!; + $smsg->{bytes} //= bytes::length $raw; + undef $raw; my $mids = mids_for_index($eml); my $refs = $smsg->parse_references($eml, $mids); $mids->[0] //= do { @@ -281,7 +287,7 @@ sub _add_over { my ($self, $smsg, $mid, $refs, $old_tid, $v) = @_; my $cur_tid = $smsg->{tid}; my $n = $smsg->{num}; - die "num must not be zero for $mid" if !$n; + croak "BUG: num must not be zero for $mid" if !$n; my $cur_valid = $cur_tid > $self->{min_tid}; if ($n > 0) { # regular mail @@ -432,6 +438,7 @@ sub commit_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{dbh}->commit; + eval { $self->{dbh}->do('PRAGMA optimize') }; } sub begin_lazy { @@ -451,7 +458,7 @@ sub rollback_lazy { sub dbh_close { my ($self) = @_; - die "in transaction" if $self->{txn}; + Carp::confess('BUG: in transaction') if $self->{txn}; $self->SUPER::dbh_close; } @@ -506,18 +513,18 @@ EOF next; } $pr->(<<EOM) if $pr; -I: ghost $r->{num} <$mid> THREADID=$r->{tid} culled +# ghost $r->{num} <$mid> THREADID=$r->{tid} culled EOM } delete_by_num($self, $r->{num}); } - $pr->("I: rethread culled $total ghosts\n") if $pr && $total; + $pr->("# rethread culled $total ghosts\n") if $pr && $total; } # used for cross-inbox search sub eidx_prep ($) { my ($self) = @_; - $self->{-eidx_prep} //= do { + $self->{-eidx_prep} // do { my $dbh = $self->dbh; $dbh->do(<<''); INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') @@ -541,9 +548,13 @@ CREATE TABLE IF NOT EXISTS xref3 ( $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); # performance critical, this is not UNIQUE since we may need to - # tolerate some old bugs from indexing mirrors - $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. - 'xref3 (oidbin,xnum,ibx_id)'); + # tolerate some old bugs from indexing mirrors. n.b. we used + # to index oidbin here, but leaving it out speeds up reindexing + # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin + $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '. + 'xref3 (xnum,ibx_id)'); + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)'); $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidx_meta ( @@ -558,7 +569,7 @@ CREATE TABLE IF NOT EXISTS eidx_meta ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL) - 1; + $self->{-eidx_prep} = 1; }; } @@ -600,50 +611,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } -# returns remaining reference count to $docid -sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; - begin_lazy($self); - my $oidbin = pack('H*', $oidhex); - my ($sth, $ibx_id); - if (defined $eidx_key) { - $ibx_id = ibx_id($self, $eidx_key); - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $ibx_id); - $sth->bind_param(3, $oidbin, SQL_BLOB); - } else { - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $oidbin, SQL_BLOB); - } - $sth->execute; - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? - - $sth->execute($docid); - my $nr = $sth->fetchrow_array; - if ($nr == 0) { - delete_by_num($self, $docid); - } elsif (defined($ibx_id) && $rm_eidx_info) { - # if deduplication rules in ContentHash change, it's - # possible a docid can have multiple rows with the - # same ibx_id. This governs whether or not we call - # ->shard_remove_eidx_info in ExtSearchIdx. - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? - - $sth->execute($docid, $ibx_id); - my $count = $sth->fetchrow_array; - $$rm_eidx_info = ($count == 0); - } - $nr; -} - # for when an xref3 goes missing, this does NOT update {ts} sub update_blob { my ($self, $smsg, $oidhex) = @_; @@ -657,8 +624,7 @@ UPDATE over SET ddd = ? WHERE num = ? } sub merge_xref3 { # used for "-extindex --dedupe" - my ($self, $keep_docid, $drop_docid, $oidhex) = @_; - my $oidbin = pack('H*', $oidhex); + my ($self, $keep_docid, $drop_docid, $oidbin) = @_; my $sth = $self->{dbh}->prepare_cached(<<''); UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ? @@ -708,4 +674,17 @@ sub vivify_xvmd { $smsg->{-vivify_xvmd} = \@vivify_xvmd; } +sub fork_ok { + state $fork_ok = eval("v$DBD::SQLite::sqlite_version") ge v3.8.3; + return 1 if $fork_ok; + my ($opt) = @_; + my @j = split(/,/, $opt->{jobs} // ''); + state $warned; + grep { $_ > 1 } @j and $warned //= warn(<<EOM); +DBD::SQLite version is v$DBD::SQLite::sqlite_version, need >= v3.8.3 for --jobs > 1 +EOM + $opt->{jobs} = '1,1'; + undef; +} + 1; |