* [PATCH 11/12] manifest: support faster generation via [extindex "all"]
2020-11-23 7:05 7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
@ 2020-11-23 7:06 7% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-11-23 7:06 UTC (permalink / raw)
To: meta
For a mirror of lore.kernel.org with >140 inboxes, this speeds
up manifest.js.gz generation from ~1s to 40ms on my hardware.
This is still unacceptable when dealing with thousands of inboxes,
but gets us closer to where we need to be.
---
lib/PublicInbox/Config.pm | 3 +++
lib/PublicInbox/Inbox.pm | 2 ++
lib/PublicInbox/InboxWritable.pm | 2 --
lib/PublicInbox/ManifestJsGz.pm | 39 ++++++++++++++++++++++++++------
lib/PublicInbox/MiscSearch.pm | 19 ++++++++++++++++
5 files changed, 56 insertions(+), 9 deletions(-)
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 039eb445..251008a3 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -94,6 +94,9 @@ sub lookup_ei {
$self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name");
}
+# special case for [extindex "all"]
+sub ALL { lookup_ei($_[0], 'all') }
+
sub each_inbox {
my ($self, $cb, @arg) = @_;
# may auto-vivify if config file is non-existent:
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index a1a072ad..5a22e40d 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -429,4 +429,6 @@ sub on_unlock {
sub uidvalidity { $_[0]->{uidvalidity} //= $_[0]->mm->created_at }
+sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
+
1;
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index d3c255c7..e97c7e2d 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -319,6 +319,4 @@ sub git_dir_latest {
$latest;
}
-sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
-
1;
diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm
index 3b436827..2c4a231d 100644
--- a/lib/PublicInbox/ManifestJsGz.pm
+++ b/lib/PublicInbox/ManifestJsGz.pm
@@ -21,6 +21,14 @@ sub url_regexp {
$ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain');
}
+sub inject_entry ($$$;$) {
+ my ($ctx, $url_path, $ent, $git_dir) = @_;
+ $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path;
+ my $modified = $ent->{modified};
+ $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0);
+ $ctx->{manifest}->{$url_path} = $ent;
+}
+
sub manifest_add ($$;$$) {
my ($ctx, $ibx, $epoch, $default_desc) = @_;
my $url_path = "/$ibx->{name}";
@@ -32,15 +40,10 @@ sub manifest_add ($$;$$) {
$git = $ibx->git;
}
my $ent = $git->manifest_entry($epoch, $default_desc) or return;
- $ctx->{-abs2urlpath}->{$git->{git_dir}} = $url_path;
- my $modified = $ent->{modified};
- if ($modified > ($ctx->{-mtime} // 0)) {
- $ctx->{-mtime} = $modified;
- }
- $ctx->{manifest}->{$url_path} = $ent;
+ inject_entry($ctx, $url_path, $ent, $git->{git_dir});
}
-sub ibx_entry {
+sub slow_manifest_add ($$) {
my ($ctx, $ibx) = @_;
eval {
if (defined(my $max = $ibx->max_git_epoch)) {
@@ -52,6 +55,28 @@ sub ibx_entry {
manifest_add($ctx, $ibx);
}
};
+}
+
+sub eidx_manifest_add ($$$) {
+ my ($ctx, $ALL, $ibx) = @_;
+ if (my $data = $ALL->misc->inbox_data($ibx)) {
+ $data = $json->decode($data);
+ while (my ($url_path, $ent) = each %$data) {
+ inject_entry($ctx, $url_path, $ent);
+ }
+ } else {
+ warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n";
+ }
+}
+
+sub ibx_entry {
+ my ($ctx, $ibx) = @_;
+ my $ALL = $ctx->{www}->{pi_config}->ALL;
+ if ($ALL) {
+ eidx_manifest_add($ctx, $ALL, $ibx);
+ } else {
+ slow_manifest_add($ctx, $ibx);
+ }
warn "E: $@" if $@;
}
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index 8beb8349..5a44d751 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -76,4 +76,23 @@ sub mset {
retry_reopen($self, \&misc_enquire_once, [ $self, $qr, $opt ]);
}
+sub ibx_data_once {
+ my ($self, $ibx) = @{$_[0]};
+ my $xdb = $self->{xdb};
+ my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private
+ my $head = $xdb->postlist_begin('Q'.$eidx_key);
+ my $tail = $xdb->postlist_end('Q'.$eidx_key);
+ if ($head != $tail) {
+ my $doc = $xdb->get_document($head->get_docid);
+ $doc->get_data;
+ } else {
+ undef;
+ }
+}
+
+sub inbox_data {
+ my ($self, $ibx) = @_;
+ retry_reopen($self, \&ibx_data_once, [ $self, $ibx ]);
+}
+
1;
^ permalink raw reply related [relevance 7%]
* [PATCH 00/12] extindex: speed up manifest.js.gz generation
@ 2020-11-23 7:05 7% Eric Wong
2020-11-23 7:06 7% ` [PATCH 11/12] manifest: support faster generation via [extindex "all"] Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-11-23 7:05 UTC (permalink / raw)
To: meta
manifest.js.gz generation gets faster with this series
(~1000ms => ~40ms) on the current set of lore.kernel.org inboxes.
We may need to rely on Varnish to handle things up to 30-100K
inboxes, since manifest.js.gz generation won't monopolize the
public-inbox-httpd event loop.
WwwListing (HTML) output still needs to be updated and searching
for inboxes needs to be implemented along with pagination for
30-100K inboxes.
Eric Wong (12):
miscsearch: a new Xapian sub-DB for extindex
move JSON module portability into PublicInbox::Config
git: add manifest_entry method
manifest: use ibx->git_epoch method for v2
inbox: git_epoch: remove ->version check
miscidx: put grokmirror manifest entries in Xapian docdata
extsearch: fix remaining "eindex" references
miscidx: cleanup git processes after manifest indexing
miscidx: store absolute git_dir of each epoch in docdata
extsearchidx: do not short-circuit MiscIdx on no-op v2 prepare
manifest: support faster generation via [extindex "all"]
*search: simplify retry_reopen users
MANIFEST | 3 +
lib/PublicInbox/Config.pm | 15 ++++
lib/PublicInbox/ExtSearch.pm | 8 +-
lib/PublicInbox/ExtSearchIdx.pm | 18 ++++-
lib/PublicInbox/Git.pm | 53 +++++++++++++
lib/PublicInbox/Inbox.pm | 6 +-
lib/PublicInbox/InboxWritable.pm | 2 -
lib/PublicInbox/ManifestJsGz.pm | 108 +++++++++-----------------
lib/PublicInbox/MiscIdx.pm | 125 +++++++++++++++++++++++++++++++
lib/PublicInbox/MiscSearch.pm | 98 ++++++++++++++++++++++++
lib/PublicInbox/Search.pm | 18 ++---
lib/PublicInbox/SearchIdx.pm | 7 +-
lib/PublicInbox/V2Writable.pm | 5 ++
script/public-inbox-extindex | 1 +
t/extsearch.t | 14 +++-
t/miscsearch.t | 57 ++++++++++++++
t/www_listing.t | 5 +-
17 files changed, 446 insertions(+), 97 deletions(-)
create mode 100644 lib/PublicInbox/MiscIdx.pm
create mode 100644 lib/PublicInbox/MiscSearch.pm
create mode 100644 t/miscsearch.t
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-11-23 7:05 7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
2020-11-23 7:06 7% ` [PATCH 11/12] manifest: support faster generation via [extindex "all"] Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).