user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 05/10] miscsearch: index UIDVALIDITY, use as startup cache
  2020-12-23  8:38  7% [PATCH 00/10] start optimizing startup w/ ALL->misc Eric Wong
@ 2020-12-23  8:38  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-12-23  8:38 UTC (permalink / raw)
  To: meta

This brings -nntpd startup time down from ~35s to ~5s with 50K
inboxes.

Further improvements ought to be possible with deeper changes to
MiscIdx, since -mda having to load every inbox seems unreasonable;
but this general change is fairly unintrusive.
---
 lib/PublicInbox/ExtSearchIdx.pm | 22 +++++++------
 lib/PublicInbox/MiscIdx.pm      | 26 ++++++++++-----
 lib/PublicInbox/MiscSearch.pm   | 56 ++++++++++++++++++++++++++++++---
 lib/PublicInbox/NNTPD.pm        |  4 ++-
 lib/PublicInbox/Search.pm       |  9 +++++-
 lib/PublicInbox/SearchIdx.pm    |  7 -----
 t/search.t                      |  4 +--
 7 files changed, 96 insertions(+), 32 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index f04e0443..9d64ff5a 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -61,16 +61,20 @@ sub new {
 
 sub attach_inbox {
 	my ($self, $ibx) = @_;
-	my $key = $ibx->eidx_key;
-	if (!$ibx->over || !$ibx->mm) {
-		warn "W: skipping $key (unindexed)\n";
-		return;
-	}
-	if (!defined($ibx->uidvalidity)) {
-		warn "W: skipping $key (no UIDVALIDITY)\n";
-		return;
+	my $ekey = $ibx->eidx_key;
+	my $misc = $self->{misc};
+	if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed
+	} else {
+		if (!$ibx->over || !$ibx->mm) {
+			warn "W: skipping $ekey (unindexed)\n";
+			return;
+		}
+		if (!defined($ibx->uidvalidity)) {
+			warn "W: skipping $ekey (no UIDVALIDITY)\n";
+			return;
+		}
 	}
-	$self->{ibx_map}->{$key} //= do {
+	$self->{ibx_map}->{$ekey} //= do {
 		push @{$self->{ibx_list}}, $ibx;
 		$ibx;
 	}
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 64591d05..a04dd1c5 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -21,6 +21,7 @@ use Carp qw(croak);
 use File::Path ();
 use PublicInbox::MiscSearch;
 use PublicInbox::Config;
+my $json;
 
 sub new {
 	my ($class, $eidx) = @_;
@@ -30,6 +31,7 @@ sub new {
 	nodatacow_dir($mi_dir);
 	my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
 	$flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync};
+	$json //= PublicInbox::Config::json();
 	bless {
 		mi_dir => $mi_dir,
 		flags => $flags,
@@ -91,17 +93,27 @@ EOF
 	$xdb->delete_document($_) for @drop; # just in case
 
 	my $doc = $PublicInbox::Search::X{Document}->new;
+	term_generator($self)->set_document($doc);
 
-	# allow sorting by modified
+	# allow sorting by modified and uidvalidity (created at)
 	add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
+	add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);
 
-	$doc->add_boolean_term('Q'.$eidx_key);
-	$doc->add_boolean_term('T'.'inbox');
-	term_generator($self)->set_document($doc);
+	$doc->add_boolean_term('Q'.$eidx_key); # uniQue id
+	$doc->add_boolean_term('T'.'inbox'); # Type
+
+	if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
+		$doc->add_boolean_term('T'.'newsgroup'); # additional Type
+	}
+
+	# force reread from disk, {description} could be loaded from {misc}
+	delete $ibx->{description};
+	my $desc = $ibx->description;
 
 	# description = S/Subject (or title)
 	# address = A/Author
-	index_text($self, $ibx->description, 1, 'S');
+	index_text($self, $desc, 1, 'S');
+	index_text($self, $ibx->{name}, 1, 'XNAME');
 	my %map = (
 		address => 'A',
 		listid => 'XLISTID',
@@ -113,10 +125,8 @@ EOF
 			index_text($self, $v, 1, $pfx);
 		}
 	}
-	index_text($self, $ibx->{name}, 1, 'XNAME');
 	my $data = {};
 	if (defined(my $max = $ibx->max_git_epoch)) { # v2
-		my $desc = $ibx->description;
 		my $pfx = "/$ibx->{name}/git/";
 		for my $epoch (0..$max) {
 			my $git = $ibx->git_epoch($epoch) or return;
@@ -130,7 +140,7 @@ EOF
 		$ent->{git_dir} = $ibx->{inboxdir};
 		$data->{"/$ibx->{name}"} = $ent;
 	}
-	$doc->set_data(PublicInbox::Config::json()->encode($data));
+	$doc->set_data($json->encode($data));
 	if (defined $docid) {
 		$xdb->replace_document($docid, $doc);
 	} else {
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index de587d35..c6ce255f 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -5,10 +5,12 @@
 package PublicInbox::MiscSearch;
 use strict;
 use v5.10.1;
-use PublicInbox::Search qw(retry_reopen);
+use PublicInbox::Search qw(retry_reopen int_val);
+my $json;
 
 # Xapian value columns:
 our $MODIFIED = 0;
+our $UIDVALIDITY = 1; # (created time)
 
 # avoid conflicting with message Search::prob_prefix for UI/UX reasons
 my %PROB_PREFIX = (
@@ -24,6 +26,7 @@ my %PROB_PREFIX = (
 sub new {
 	my ($class, $dir) = @_;
 	PublicInbox::Search::load_xapian();
+	$json //= PublicInbox::Config::json();
 	bless {
 		xdb => $PublicInbox::Search::X{Database}->new($dir)
 	}, $class;
@@ -120,11 +123,13 @@ sub newsgroup_matches {
 sub ibx_data_once {
 	my ($self, $ibx) = @_;
 	my $xdb = $self->{xdb};
-	my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private
-	my $head = $xdb->postlist_begin('Q'.$eidx_key);
-	my $tail = $xdb->postlist_end('Q'.$eidx_key);
+	my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private
+	my $head = $xdb->postlist_begin($term);
+	my $tail = $xdb->postlist_end($term);
 	if ($head != $tail) {
 		my $doc = $xdb->get_document($head->get_docid);
+		$ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY);
+		$ibx->{-modified} = int_val($doc, $MODIFIED);
 		$doc->get_data;
 	} else {
 		undef;
@@ -136,4 +141,47 @@ sub inbox_data {
 	retry_reopen($self, \&ibx_data_once, $ibx);
 }
 
+sub ibx_cache_load {
+	my ($doc, $cache) = @_;
+	my $end = $doc->termlist_end;
+	my $cur = $doc->termlist_begin;
+	$cur->skip_to('Q');
+	return if $cur == $end;
+	my $eidx_key = $cur->get_termname;
+	$eidx_key =~ s/\AQ// or return; # expired
+	my $ce = $cache->{$eidx_key} = {};
+	$ce->{uidvalidity} = int_val($doc, $UIDVALIDITY);
+	$ce->{-modified} = int_val($doc, $MODIFIED);
+	$ce->{description} = do {
+		# extract description from manifest.js.gz epoch description
+		my $d;
+		my $data = $json->decode($doc->get_data);
+		for (values %$data) {
+			$d = $_->{description} // next;
+			$d =~ s/ \[epoch [0-9]+\]\z// or next;
+			last;
+		}
+		$d;
+	}
+}
+
+sub _nntpd_cache_load { # retry_reopen callback
+	my ($self) = @_;
+	my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 };
+	my $mset = mset($self, 'type:newsgroup type:inbox', $opt);
+	my $cache = {};
+	for my $it ($mset->items) {
+		ibx_cache_load($it->get_document, $cache);
+	}
+	$cache
+}
+
+# returns { newsgroup => $cache_entry } mapping, $cache_entry contains
+# anything which may trigger seeks at startup, currently: description,
+# -modified, and uidvalidity.
+sub nntpd_cache_load {
+	my ($self) = @_;
+	retry_reopen($self, \&_nntpd_cache_load);
+}
+
 1;
diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm
index 7f9a1d58..6907a03c 100644
--- a/lib/PublicInbox/NNTPD.pm
+++ b/lib/PublicInbox/NNTPD.pm
@@ -36,10 +36,12 @@ sub refresh_groups {
 	my ($self, $sig) = @_;
 	my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg};
 	my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox
+	my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {};
 	$pi_cfg->each_inbox(sub {
 		my ($ibx) = @_;
 		my $ngname = $ibx->{newsgroup} // return;
-		if ($ibx->nntp_usable) {
+		my $ce = $cache->{$ngname};
+		if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) {
 			# only valid if msgmap and over works
 			# preload to avoid fragmentation:
 			$ibx->description;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index b1d38fb9..05c679c9 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -6,7 +6,7 @@
 package PublicInbox::Search;
 use strict;
 use parent qw(Exporter);
-our @EXPORT_OK = qw(retry_reopen);
+our @EXPORT_OK = qw(retry_reopen int_val);
 use List::Util qw(max);
 
 # values for searching, changing the numeric value breaks
@@ -91,6 +91,7 @@ sub load_xapian () {
 				1 : Search::Xapian::ENQ_ASCENDING();
 
 		*sortable_serialise = $x.'::sortable_serialise';
+		*sortable_unserialise = $x.'::sortable_unserialise';
 		# n.b. FLAG_PURE_NOT is expensive not suitable for a public
 		# website as it could become a denial-of-service vector
 		# FLAG_PHRASE also seems to cause performance problems chert
@@ -436,4 +437,10 @@ sub help {
 	\@ret;
 }
 
+sub int_val ($$) {
+	my ($doc, $col) = @_;
+	my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
+	sortable_unserialise($val) + 0; # PV => IV conversion
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index cf2c2c55..d1b0c724 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -106,7 +106,6 @@ sub load_xapian_writable () {
 	}
 	eval 'require '.$X->{WritableDatabase} or die;
 	*sortable_serialise = $xap.'::sortable_serialise';
-	*sortable_unserialise = $xap.'::sortable_unserialise';
 	$DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()');
 	$DB_OPEN = eval($xap.'::DB_OPEN()');
 	my $ver = (eval($xap.'::major_version()') << 16) |
@@ -501,12 +500,6 @@ sub remove_eidx_info {
 	$self->{xdb}->replace_document($docid, $doc);
 }
 
-sub int_val ($$) {
-	my ($doc, $col) = @_;
-	my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
-	sortable_unserialise($val) + 0; # PV => IV conversion
-}
-
 sub smsg_from_doc ($) {
 	my ($doc) = @_;
 	my $data = $doc->get_data or return;
diff --git a/t/search.t b/t/search.t
index 11143204..3754717d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -332,13 +332,13 @@ $ibx->with_umask(sub {
 		like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears');
 		my $doc = $m->get_document;
 		my $col = PublicInbox::Search::BYTES();
-		my $bytes = PublicInbox::SearchIdx::int_val($doc, $col);
+		my $bytes = PublicInbox::Search::int_val($doc, $col);
 		like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit');
 		ok($bytes > 0, '$bytes is > 0');
 		is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over');
 
 		$col = PublicInbox::Search::UID();
-		my $uid = PublicInbox::SearchIdx::int_val($doc, $col);
+		my $uid = PublicInbox::Search::int_val($doc, $col);
 		is($uid, $smsg->{num}, 'UID column matches {num}');
 		is($uid, $m->get_docid, 'UID column matches docid');
 	}

^ permalink raw reply related	[relevance 5%]

* [PATCH 00/10] start optimizing startup w/ ALL->misc
@ 2020-12-23  8:38  7% Eric Wong
  2020-12-23  8:38  5% ` [PATCH 05/10] miscsearch: index UIDVALIDITY, use as startup cache Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-12-23  8:38 UTC (permalink / raw)
  To: meta

-nntpd [PATCH 5/10] is the single most significant improvement.

And some cleanups, and some general improvements independent of
indexing in patches 7-10 (patch 8 is already superseded by 10,
but kept separate for documentation purposes).
PublicInbox::Config->new is over twice as fast, now.

Eric Wong (10):
  miscsearch: load Xapian at initialization
  xt: add create-many-inboxes helper test
  inbox: git_epoch: correct false comment
  inboxwritable: _init_v1: set created_at ASAP
  miscsearch: index UIDVALIDITY, use as startup cache
  extsearchidx: close SQLite handles after attaching
  config: _fill: inbox name extraction optimization
  config: git_config_dump: pre-compile RE for split
  config: config_fh_parse: micro-optimize
  config: config_fh_parse: micro-optimize even harder

 MANIFEST                         |  1 +
 lib/PublicInbox/Config.pm        | 26 ++++-----
 lib/PublicInbox/ExtSearchIdx.pm  | 25 +++++---
 lib/PublicInbox/Inbox.pm         |  2 +-
 lib/PublicInbox/InboxWritable.pm |  3 +-
 lib/PublicInbox/MiscIdx.pm       | 26 ++++++---
 lib/PublicInbox/MiscSearch.pm    | 57 ++++++++++++++++--
 lib/PublicInbox/NNTPD.pm         |  4 +-
 lib/PublicInbox/Search.pm        |  9 ++-
 lib/PublicInbox/SearchIdx.pm     |  7 ---
 t/search.t                       |  4 +-
 xt/create-many-inboxes.t         | 99 ++++++++++++++++++++++++++++++++
 12 files changed, 213 insertions(+), 50 deletions(-)
 create mode 100644 xt/create-many-inboxes.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-12-23  8:38  7% [PATCH 00/10] start optimizing startup w/ ALL->misc Eric Wong
2020-12-23  8:38  5% ` [PATCH 05/10] miscsearch: index UIDVALIDITY, use as startup cache Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).