user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 11/12] manifest: support faster generation via [extindex "all"]
  2020-11-23  7:05  7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
@ 2020-11-23  7:06  7% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-11-23  7:06 UTC (permalink / raw)
  To: meta

For a mirror of lore.kernel.org with >140 inboxes, this speeds
up manifest.js.gz generation from ~1s to 40ms on my HW.  This
is still unacceptable when dealing with thousands of inboxes,
but gets us closer to where we need to be.
---
 lib/PublicInbox/Config.pm        |  3 +++
 lib/PublicInbox/Inbox.pm         |  2 ++
 lib/PublicInbox/InboxWritable.pm |  2 --
 lib/PublicInbox/ManifestJsGz.pm  | 39 ++++++++++++++++++++++++++------
 lib/PublicInbox/MiscSearch.pm    | 19 ++++++++++++++++
 5 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 039eb445..251008a3 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -94,6 +94,9 @@ sub lookup_ei {
 	$self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name");
 }
 
+# special case for [extindex "all"]
+sub ALL { lookup_ei($_[0], 'all') }
+
 sub each_inbox {
 	my ($self, $cb, @arg) = @_;
 	# may auto-vivify if config file is non-existent:
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index a1a072ad..5a22e40d 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -429,4 +429,6 @@ sub on_unlock {
 
 sub uidvalidity  { $_[0]->{uidvalidity} //= $_[0]->mm->created_at }
 
+sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
+
 1;
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index d3c255c7..e97c7e2d 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -319,6 +319,4 @@ sub git_dir_latest {
 	$latest;
 }
 
-sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
-
 1;
diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm
index 3b436827..2c4a231d 100644
--- a/lib/PublicInbox/ManifestJsGz.pm
+++ b/lib/PublicInbox/ManifestJsGz.pm
@@ -21,6 +21,14 @@ sub url_regexp {
 	$ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain');
 }
 
+sub inject_entry ($$$;$) {
+	my ($ctx, $url_path, $ent, $git_dir) = @_;
+	$ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path;
+	my $modified = $ent->{modified};
+	$ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0);
+	$ctx->{manifest}->{$url_path} = $ent;
+}
+
 sub manifest_add ($$;$$) {
 	my ($ctx, $ibx, $epoch, $default_desc) = @_;
 	my $url_path = "/$ibx->{name}";
@@ -32,15 +40,10 @@ sub manifest_add ($$;$$) {
 		$git = $ibx->git;
 	}
 	my $ent = $git->manifest_entry($epoch, $default_desc) or return;
-	$ctx->{-abs2urlpath}->{$git->{git_dir}} = $url_path;
-	my $modified = $ent->{modified};
-	if ($modified > ($ctx->{-mtime} // 0)) {
-		$ctx->{-mtime} = $modified;
-	}
-	$ctx->{manifest}->{$url_path} = $ent;
+	inject_entry($ctx, $url_path, $ent, $git->{git_dir});
 }
 
-sub ibx_entry {
+sub slow_manifest_add ($$) {
 	my ($ctx, $ibx) = @_;
 	eval {
 		if (defined(my $max = $ibx->max_git_epoch)) {
@@ -52,6 +55,28 @@ sub ibx_entry {
 			manifest_add($ctx, $ibx);
 		}
 	};
+}
+
+sub eidx_manifest_add ($$$) {
+	my ($ctx, $ALL, $ibx) = @_;
+	if (my $data = $ALL->misc->inbox_data($ibx)) {
+		$data = $json->decode($data);
+		while (my ($url_path, $ent) = each %$data) {
+			inject_entry($ctx, $url_path, $ent);
+		}
+	} else {
+		warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n";
+	}
+}
+
+sub ibx_entry {
+	my ($ctx, $ibx) = @_;
+	my $ALL = $ctx->{www}->{pi_config}->ALL;
+	if ($ALL) {
+		eidx_manifest_add($ctx, $ALL, $ibx);
+	} else {
+		slow_manifest_add($ctx, $ibx);
+	}
 	warn "E: $@" if $@;
 }
 
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index 8beb8349..5a44d751 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -76,4 +76,23 @@ sub mset {
 	retry_reopen($self, \&misc_enquire_once, [ $self, $qr, $opt ]);
 }
 
+sub ibx_data_once {
+	my ($self, $ibx) = @{$_[0]};
+	my $xdb = $self->{xdb};
+	my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private
+	my $head = $xdb->postlist_begin('Q'.$eidx_key);
+	my $tail = $xdb->postlist_end('Q'.$eidx_key);
+	if ($head != $tail) {
+		my $doc = $xdb->get_document($head->get_docid);
+		$doc->get_data;
+	} else {
+		undef;
+	}
+}
+
+sub inbox_data {
+	my ($self, $ibx) = @_;
+	retry_reopen($self, \&ibx_data_once, [ $self, $ibx ]);
+}
+
 1;

^ permalink raw reply related	[relevance 7%]

* [PATCH 00/12] extindex: speed up manifest.js.gz generation
@ 2020-11-23  7:05  7% Eric Wong
  2020-11-23  7:06  7% ` [PATCH 11/12] manifest: support faster generation via [extindex "all"] Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-11-23  7:05 UTC (permalink / raw)
  To: meta

manifest.js.gz generation gets faster with this series
(~1000ms => ~40ms) on the current set of lore.kernel.org inboxes

We may need to rely on varnish to handle things up to 30-100K
inboxes, so that manifest.js.gz generation won't monopolize the
-httpd event loop.

WwwListing (HTML) output still needs to be updated and searching
for inboxes needs to be implemented along with pagination for
30-100K inboxes.

Eric Wong (12):
  miscsearch: a new Xapian sub-DB for extindex
  move JSON module portability into PublicInbox::Config
  git: add manifest_entry method
  manifest: use ibx->git_epoch method for v2
  inbox: git_epoch: remove ->version check
  miscidx: put grokmirror manifest entries in Xapian docdata
  extsearch: fix remaining "eindex" references
  miscidx: cleanup git processes after manifest indexing
  miscidx: store absolute git_dir of each epoch in docdata
  extsearchidx: do not short-circuit MiscIdx on no-op v2 prepare
  manifest: support faster generation via [extindex "all"]
  *search: simplify retry_reopen users

 MANIFEST                         |   3 +
 lib/PublicInbox/Config.pm        |  15 ++++
 lib/PublicInbox/ExtSearch.pm     |   8 +-
 lib/PublicInbox/ExtSearchIdx.pm  |  18 ++++-
 lib/PublicInbox/Git.pm           |  53 +++++++++++++
 lib/PublicInbox/Inbox.pm         |   6 +-
 lib/PublicInbox/InboxWritable.pm |   2 -
 lib/PublicInbox/ManifestJsGz.pm  | 108 +++++++++-----------------
 lib/PublicInbox/MiscIdx.pm       | 125 +++++++++++++++++++++++++++++++
 lib/PublicInbox/MiscSearch.pm    |  98 ++++++++++++++++++++++++
 lib/PublicInbox/Search.pm        |  18 ++---
 lib/PublicInbox/SearchIdx.pm     |   7 +-
 lib/PublicInbox/V2Writable.pm    |   5 ++
 script/public-inbox-extindex     |   1 +
 t/extsearch.t                    |  14 +++-
 t/miscsearch.t                   |  57 ++++++++++++++
 t/www_listing.t                  |   5 +-
 17 files changed, 446 insertions(+), 97 deletions(-)
 create mode 100644 lib/PublicInbox/MiscIdx.pm
 create mode 100644 lib/PublicInbox/MiscSearch.pm
 create mode 100644 t/miscsearch.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-11-23  7:05  7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
2020-11-23  7:06  7% ` [PATCH 11/12] manifest: support faster generation via [extindex "all"] Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).