* [PATCH 06/12] miscidx: put grokmirror manifest entries in Xapian docdata
2020-11-23 7:05 7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
@ 2020-11-23 7:05 7% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-11-23 7:05 UTC (permalink / raw)
To: meta
This should make it possible for us to quickly generate
manifest.js.gz files with less random I/O and process
spawning in the WWW code.
---
lib/PublicInbox/MiscIdx.pm | 15 +++++++++++++++
script/public-inbox-extindex | 1 +
t/extsearch.t | 7 ++++++-
t/miscsearch.t | 3 +++
4 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index edc70f9b..9dcc96b7 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -20,6 +20,7 @@ use PublicInbox::Spawn qw(nodatacow_dir);
use Carp qw(croak);
use File::Path ();
use PublicInbox::MiscSearch;
+use PublicInbox::Config;
sub new {
my ($class, $eidx) = @_;
@@ -97,6 +98,20 @@ EOF
}
}
index_text($self, $ibx->{name}, 1, 'XNAME');
+ my $data = {};
+ if (defined(my $max = $ibx->max_git_epoch)) { # v2
+ my $desc = $ibx->description;
+ my $pfx = "/$ibx->{name}/git/";
+ for my $epoch (0..$max) {
+ my $git = $ibx->git_epoch($epoch) or return;
+ if (my $ent = $git->manifest_entry($epoch, $desc)) {
+ $data->{"$pfx$epoch.git"} = $ent;
+ }
+ }
+ } elsif (my $ent = $ibx->git->manifest_entry) { # v1
+ $data->{"/$ibx->{name}"} = $ent;
+ }
+ $doc->set_data(PublicInbox::Config::json()->encode($data));
if (defined $docid) {
$xdb->replace_document($docid, $doc);
} else {
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 78d6d9d9..20a0737c 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -38,6 +38,7 @@ require PublicInbox::Admin;
my $cfg = PublicInbox::Config->new;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
PublicInbox::Admin::progress_prepare($opt);
my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
local %ENV = (%ENV, %$env) if $env;
diff --git a/t/extsearch.t b/t/extsearch.t
index e28e2f71..dc825bf4 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -4,7 +4,9 @@
use strict;
use Test::More;
use PublicInbox::TestCommon;
+use PublicInbox::Config;
use Fcntl qw(:seek);
+my $json = PublicInbox::Config::json() or plan skip_all => 'JSON missing';
require_git(2.6);
require_mods(qw(DBD::SQLite Search::Xapian));
use_ok 'PublicInbox::ExtSearch';
@@ -73,6 +75,9 @@ my $es = PublicInbox::ExtSearch->new("$home/eindex");
}
my $misc = $es->misc;
-is(scalar($misc->mset('')->items), 2, 'two inboxes');
+my @it = $misc->mset('')->items;
+is(scalar(@it), 2, 'two inboxes');
+like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
+like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
done_testing;
diff --git a/t/miscsearch.t b/t/miscsearch.t
index 45a19da9..0ba79194 100644
--- a/t/miscsearch.t
+++ b/t/miscsearch.t
@@ -50,5 +50,8 @@ is(scalar($mset->items), 1, 'match partial address');
$mset = $ms->mset('hope');
is(scalar($mset->items), 1, 'match name');
+my $mi = ($mset->items)[0];
+my $doc = $mi->get_document;
+is($doc->get_data, '{}', 'stored empty data');
done_testing;
^ permalink raw reply related [relevance 7%]
* [PATCH 00/12] extindex: speed up manifest.js.gz generation
@ 2020-11-23 7:05 7% Eric Wong
2020-11-23 7:05 7% ` [PATCH 06/12] miscidx: put grokmirror manifest entries in Xapian docdata Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-11-23 7:05 UTC (permalink / raw)
To: meta
manifest.js.gz generation gets faster with this series
(~1000ms => ~40ms) on the current set of lore.kernel.org inboxes.
We may need to rely on varnish to handle things up to 30-100K
inboxes, since manifest.js.gz generation won't monopolize the
-httpd event loop.
WwwListing (HTML) output still needs to be updated and searching
for inboxes needs to be implemented along with pagination for
30-100K inboxes.
Eric Wong (12):
miscsearch: a new Xapian sub-DB for extindex
move JSON module portability into PublicInbox::Config
git: add manifest_entry method
manifest: use ibx->git_epoch method for v2
inbox: git_epoch: remove ->version check
miscidx: put grokmirror manifest entries in Xapian docdata
extsearch: fix remaining "eindex" references
miscidx: cleanup git processes after manifest indexing
miscidx: store absolute git_dir of each epoch in docdata
extsearchidx: do not short-circuit MiscIdx on no-op v2 prepare
manifest: support faster generation via [extindex "all"]
*search: simplify retry_reopen users
MANIFEST | 3 +
lib/PublicInbox/Config.pm | 15 ++++
lib/PublicInbox/ExtSearch.pm | 8 +-
lib/PublicInbox/ExtSearchIdx.pm | 18 ++++-
lib/PublicInbox/Git.pm | 53 +++++++++++++
lib/PublicInbox/Inbox.pm | 6 +-
lib/PublicInbox/InboxWritable.pm | 2 -
lib/PublicInbox/ManifestJsGz.pm | 108 +++++++++-----------------
lib/PublicInbox/MiscIdx.pm | 125 +++++++++++++++++++++++++++++++
lib/PublicInbox/MiscSearch.pm | 98 ++++++++++++++++++++++++
lib/PublicInbox/Search.pm | 18 ++---
lib/PublicInbox/SearchIdx.pm | 7 +-
lib/PublicInbox/V2Writable.pm | 5 ++
script/public-inbox-extindex | 1 +
t/extsearch.t | 14 +++-
t/miscsearch.t | 57 ++++++++++++++
t/www_listing.t | 5 +-
17 files changed, 446 insertions(+), 97 deletions(-)
create mode 100644 lib/PublicInbox/MiscIdx.pm
create mode 100644 lib/PublicInbox/MiscSearch.pm
create mode 100644 t/miscsearch.t
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-11-23 7:05 7% [PATCH 00/12] extindex: speed up manifest.js.gz generation Eric Wong
2020-11-23 7:05 7% ` [PATCH 06/12] miscidx: put grokmirror manifest entries in Xapian docdata Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).