user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/12] manifest: support faster generation via [extindex "all"]
Date: Mon, 23 Nov 2020 07:06:01 +0000	[thread overview]
Message-ID: <20201123070602.9698-12-e@80x24.org> (raw)
In-Reply-To: <20201123070602.9698-1-e@80x24.org>

For a mirror of lore.kernel.org with >140 inboxes, this speeds
up manifest.js.gz generation from ~1s to 40ms on my HW.  This
is still unacceptable when dealing with thousands of inboxes,
but gets us closer to where we need to be.
---
 lib/PublicInbox/Config.pm        |  3 +++
 lib/PublicInbox/Inbox.pm         |  2 ++
 lib/PublicInbox/InboxWritable.pm |  2 --
 lib/PublicInbox/ManifestJsGz.pm  | 39 ++++++++++++++++++++++++++------
 lib/PublicInbox/MiscSearch.pm    | 19 ++++++++++++++++
 5 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 039eb445..251008a3 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -94,6 +94,9 @@ sub lookup_ei {
 	$self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name");
 }
 
+# special case for [extindex "all"]
+sub ALL { lookup_ei($_[0], 'all') }
+
 sub each_inbox {
 	my ($self, $cb, @arg) = @_;
 	# may auto-vivify if config file is non-existent:
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index a1a072ad..5a22e40d 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -429,4 +429,6 @@ sub on_unlock {
 
 sub uidvalidity  { $_[0]->{uidvalidity} //= $_[0]->mm->created_at }
 
+sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
+
 1;
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index d3c255c7..e97c7e2d 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -319,6 +319,4 @@ sub git_dir_latest {
 	$latest;
 }
 
-sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
-
 1;
diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm
index 3b436827..2c4a231d 100644
--- a/lib/PublicInbox/ManifestJsGz.pm
+++ b/lib/PublicInbox/ManifestJsGz.pm
@@ -21,6 +21,14 @@ sub url_regexp {
 	$ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain');
 }
 
+sub inject_entry ($$$;$) {
+	my ($ctx, $url_path, $ent, $git_dir) = @_;
+	$ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path;
+	my $modified = $ent->{modified};
+	$ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0);
+	$ctx->{manifest}->{$url_path} = $ent;
+}
+
 sub manifest_add ($$;$$) {
 	my ($ctx, $ibx, $epoch, $default_desc) = @_;
 	my $url_path = "/$ibx->{name}";
@@ -32,15 +40,10 @@ sub manifest_add ($$;$$) {
 		$git = $ibx->git;
 	}
 	my $ent = $git->manifest_entry($epoch, $default_desc) or return;
-	$ctx->{-abs2urlpath}->{$git->{git_dir}} = $url_path;
-	my $modified = $ent->{modified};
-	if ($modified > ($ctx->{-mtime} // 0)) {
-		$ctx->{-mtime} = $modified;
-	}
-	$ctx->{manifest}->{$url_path} = $ent;
+	inject_entry($ctx, $url_path, $ent, $git->{git_dir});
 }
 
-sub ibx_entry {
+sub slow_manifest_add ($$) {
 	my ($ctx, $ibx) = @_;
 	eval {
 		if (defined(my $max = $ibx->max_git_epoch)) {
@@ -52,6 +55,28 @@ sub ibx_entry {
 			manifest_add($ctx, $ibx);
 		}
 	};
+}
+
+sub eidx_manifest_add ($$$) {
+	my ($ctx, $ALL, $ibx) = @_;
+	if (my $data = $ALL->misc->inbox_data($ibx)) {
+		$data = $json->decode($data);
+		while (my ($url_path, $ent) = each %$data) {
+			inject_entry($ctx, $url_path, $ent);
+		}
+	} else {
+		warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n";
+	}
+}
+
+sub ibx_entry {
+	my ($ctx, $ibx) = @_;
+	my $ALL = $ctx->{www}->{pi_config}->ALL;
+	if ($ALL) {
+		eidx_manifest_add($ctx, $ALL, $ibx);
+	} else {
+		slow_manifest_add($ctx, $ibx);
+	}
 	warn "E: $@" if $@;
 }
 
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index 8beb8349..5a44d751 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -76,4 +76,23 @@ sub mset {
 	retry_reopen($self, \&misc_enquire_once, [ $self, $qr, $opt ]);
 }
 
+sub ibx_data_once {
+	my ($self, $ibx) = @{$_[0]};
+	my $xdb = $self->{xdb};
+	my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private
+	my $head = $xdb->postlist_begin('Q'.$eidx_key);
+	my $tail = $xdb->postlist_end('Q'.$eidx_key);
+	if ($head != $tail) {
+		my $doc = $xdb->get_document($head->get_docid);
+		$doc->get_data;
+	} else {
+		undef;
+	}
+}
+
+sub inbox_data {
+	my ($self, $ibx) = @_;
+	retry_reopen($self, \&ibx_data_once, [ $self, $ibx ]);
+}
+
 1;

  parent reply	other threads:[~2020-11-23  7:06 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-23  7:05 [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
2020-11-23  7:05 ` [PATCH 01/12] miscsearch: a new Xapian sub-DB for extindex Eric Wong
2020-11-23  7:05 ` [PATCH 02/12] move JSON module portability into PublicInbox::Config Eric Wong
2020-11-23  7:05 ` [PATCH 03/12] git: add manifest_entry method Eric Wong
2020-11-23  7:05 ` [PATCH 04/12] manifest: use ibx->git_epoch method for v2 Eric Wong
2020-11-23  7:05 ` [PATCH 05/12] inbox: git_epoch: remove ->version check Eric Wong
2020-11-23  7:05 ` [PATCH 06/12] miscidx: put grokmirror manifest entries in Xapian docdata Eric Wong
2020-11-23  7:05 ` [PATCH 07/12] extsearch: fix remaining "eindex" references Eric Wong
2020-11-23  7:05 ` [PATCH 08/12] miscidx: cleanup git processes after manifest indexing Eric Wong
2020-11-23  7:05 ` [PATCH 09/12] miscidx: store absolute git_dir of each epoch in docdata Eric Wong
2020-11-23  7:06 ` [PATCH 10/12] extsearchidx: do not short-circuit MiscIdx on no-op v2 prepare Eric Wong
2020-11-23  7:06 ` Eric Wong [this message]
2020-11-23  7:06 ` [PATCH 12/12] *search: simplify retry_reopen users Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201123070602.9698-12-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).