From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net
X-Spam-Level:
X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00
	shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2
Received: from localhost (dcvr.yhbt.net [127.0.0.1])
	by dcvr.yhbt.net (Postfix) with ESMTP id 688FB1FA1A
	for ; Wed, 23 Dec 2020 08:38:54 +0000 (UTC)
From: Eric Wong
To: meta@public-inbox.org
Subject: [PATCH 05/10] miscsearch: index UIDVALIDITY, use as startup cache
Date: Wed, 23 Dec 2020 08:38:48 +0000
Message-Id: <20201223083853.30721-6-e@80x24.org>
In-Reply-To: <20201223083853.30721-1-e@80x24.org>
References: <20201223083853.30721-1-e@80x24.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
List-Id:

This brings -nntpd startup time down from ~35s to ~5s with 50K
inboxes.

Further improvements ought to be possible with deeper changes to
MiscIdx, since -mda having to load every inbox seems unreasonable;
but this general change is fairly unintrusive.
--- lib/PublicInbox/ExtSearchIdx.pm | 22 +++++++------ lib/PublicInbox/MiscIdx.pm | 26 ++++++++++----- lib/PublicInbox/MiscSearch.pm | 56 ++++++++++++++++++++++++++++++--- lib/PublicInbox/NNTPD.pm | 4 ++- lib/PublicInbox/Search.pm | 9 +++++- lib/PublicInbox/SearchIdx.pm | 7 ----- t/search.t | 4 +-- 7 files changed, 96 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index f04e0443..9d64ff5a 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -61,16 +61,20 @@ sub new { sub attach_inbox { my ($self, $ibx) = @_; - my $key = $ibx->eidx_key; - if (!$ibx->over || !$ibx->mm) { - warn "W: skipping $key (unindexed)\n"; - return; - } - if (!defined($ibx->uidvalidity)) { - warn "W: skipping $key (no UIDVALIDITY)\n"; - return; + my $ekey = $ibx->eidx_key; + my $misc = $self->{misc}; + if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed + } else { + if (!$ibx->over || !$ibx->mm) { + warn "W: skipping $ekey (unindexed)\n"; + return; + } + if (!defined($ibx->uidvalidity)) { + warn "W: skipping $ekey (no UIDVALIDITY)\n"; + return; + } } - $self->{ibx_map}->{$key} //= do { + $self->{ibx_map}->{$ekey} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 64591d05..a04dd1c5 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -21,6 +21,7 @@ use Carp qw(croak); use File::Path (); use PublicInbox::MiscSearch; use PublicInbox::Config; +my $json; sub new { my ($class, $eidx) = @_; @@ -30,6 +31,7 @@ sub new { nodatacow_dir($mi_dir); my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $json //= PublicInbox::Config::json(); bless { mi_dir => $mi_dir, flags => $flags, @@ -91,17 +93,27 @@ EOF $xdb->delete_document($_) for @drop; # just in case my $doc = $PublicInbox::Search::X{Document}->new; + 
term_generator($self)->set_document($doc); - # allow sorting by modified + # allow sorting by modified and uidvalidity (created at) add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); - $doc->add_boolean_term('Q'.$eidx_key); - $doc->add_boolean_term('T'.'inbox'); - term_generator($self)->set_document($doc); + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + } + + # force reread from disk, {description} could be loaded from {misc} + delete $ibx->{description}; + my $desc = $ibx->description; # description = S/Subject (or title) # address = A/Author - index_text($self, $ibx->description, 1, 'S'); + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); my %map = ( address => 'A', listid => 'XLISTID', @@ -113,10 +125,8 @@ EOF index_text($self, $v, 1, $pfx); } } - index_text($self, $ibx->{name}, 1, 'XNAME'); my $data = {}; if (defined(my $max = $ibx->max_git_epoch)) { # v2 - my $desc = $ibx->description; my $pfx = "/$ibx->{name}/git/"; for my $epoch (0..$max) { my $git = $ibx->git_epoch($epoch) or return; @@ -130,7 +140,7 @@ EOF $ent->{git_dir} = $ibx->{inboxdir}; $data->{"/$ibx->{name}"} = $ent; } - $doc->set_data(PublicInbox::Config::json()->encode($data)); + $doc->set_data($json->encode($data)); if (defined $docid) { $xdb->replace_document($docid, $doc); } else { diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index de587d35..c6ce255f 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -5,10 +5,12 @@ package PublicInbox::MiscSearch; use strict; use v5.10.1; -use PublicInbox::Search qw(retry_reopen); +use PublicInbox::Search qw(retry_reopen int_val); +my $json; # Xapian value columns: our $MODIFIED = 0; +our $UIDVALIDITY = 1; # 
(created time) # avoid conflicting with message Search::prob_prefix for UI/UX reasons my %PROB_PREFIX = ( @@ -24,6 +26,7 @@ my %PROB_PREFIX = ( sub new { my ($class, $dir) = @_; PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class; @@ -120,11 +123,13 @@ sub newsgroup_matches { sub ibx_data_once { my ($self, $ibx) = @_; my $xdb = $self->{xdb}; - my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private - my $head = $xdb->postlist_begin('Q'.$eidx_key); - my $tail = $xdb->postlist_end('Q'.$eidx_key); + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); if ($head != $tail) { my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); $doc->get_data; } else { undef; @@ -136,4 +141,47 @@ sub inbox_data { retry_reopen($self, \&ibx_data_once, $ibx); } +sub ibx_cache_load { + my ($doc, $cache) = @_; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + return if $cur == $end; + my $eidx_key = $cur->get_termname; + $eidx_key =~ s/\AQ// or return; # expired + my $ce = $cache->{$eidx_key} = {}; + $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY); + $ce->{-modified} = int_val($doc, $MODIFIED); + $ce->{description} = do { + # extract description from manifest.js.gz epoch description + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + $d; + } +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} 
+ +# returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 7f9a1d58..6907a03c 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -36,10 +36,12 @@ sub refresh_groups { my ($self, $sig) = @_; my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg}; my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; $pi_cfg->each_inbox(sub { my ($ibx) = @_; my $ngname = $ibx->{newsgroup} // return; - if ($ibx->nntp_usable) { + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { # only valid if msgmap and over works # preload to avoid fragmentation: $ibx->description; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b1d38fb9..05c679c9 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(retry_reopen); +our @EXPORT_OK = qw(retry_reopen int_val); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -91,6 +91,7 @@ sub load_xapian () { 1 : Search::Xapian::ENQ_ASCENDING(); *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. 
FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -436,4 +437,10 @@ sub help { \@ret; } +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cf2c2c55..d1b0c724 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -106,7 +106,6 @@ sub load_xapian_writable () { } eval 'require '.$X->{WritableDatabase} or die; *sortable_serialise = $xap.'::sortable_serialise'; - *sortable_unserialise = $xap.'::sortable_unserialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); my $ver = (eval($xap.'::major_version()') << 16) | @@ -501,12 +500,6 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub int_val ($$) { - my ($doc, $col) = @_; - my $val = $doc->get_value($col) or return; # undefined is '' in Xapian - sortable_unserialise($val) + 0; # PV => IV conversion -} - sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; diff --git a/t/search.t b/t/search.t index 11143204..3754717d 100644 --- a/t/search.t +++ b/t/search.t @@ -332,13 +332,13 @@ $ibx->with_umask(sub { like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears'); my $doc = $m->get_document; my $col = PublicInbox::Search::BYTES(); - my $bytes = PublicInbox::SearchIdx::int_val($doc, $col); + my $bytes = PublicInbox::Search::int_val($doc, $col); like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit'); ok($bytes > 0, '$bytes is > 0'); is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over'); $col = PublicInbox::Search::UID(); - my $uid = PublicInbox::SearchIdx::int_val($doc, $col); + my $uid = PublicInbox::Search::int_val($doc, $col); is($uid, $smsg->{num}, 'UID column matches 
{num}'); is($uid, $m->get_docid, 'UID column matches docid'); }