From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/12] v2: support Xapian + SQLite indexing
Date: Thu, 22 Feb 2018 21:42:17 +0000 [thread overview]
Message-ID: <20180222214222.1086-8-e@80x24.org> (raw)
In-Reply-To: <20180222214222.1086-1-e@80x24.org>
This is currently too slow. Timings when working with only the 2017 LKML
archives:
git-only: ~1 minute
git + SQLite: ~12 minutes
git + Xapian + SQLite: ~45 minutes
So yes, it looks like we'll need to parallelize Xapian indexing,
at least.
---
lib/PublicInbox/Import.pm | 1 +
lib/PublicInbox/Inbox.pm | 4 +++-
lib/PublicInbox/Search.pm | 26 +++++++++++++++-----
lib/PublicInbox/SearchIdx.pm | 56 +++++++++++++++++++++++++++++++------------
lib/PublicInbox/V2Writable.pm | 45 ++++++++++++++++++++++++++++------
scripts/import_vger_from_mbox | 3 +++
6 files changed, 106 insertions(+), 29 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 364ab60..1a2698a 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -285,6 +285,7 @@ sub add {
# v2: we need this for Xapian
if ($self->{want_object_id}) {
chomp($self->{last_object_id} = $self->get_mark(":$blob"));
+ $self->{last_object_size} = $n;
}
my $ref = $self->{ref};
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 2ec2be6..e7856e3 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -79,7 +79,9 @@ sub new {
sub git {
my ($self) = @_;
$self->{git} ||= eval {
- my $g = PublicInbox::Git->new($self->{mainrepo});
+ my $git_dir = $self->{mainrepo};
+ $git_dir .= '/all.git' if (($self->{version} || 1) == 2);
+ my $g = PublicInbox::Git->new($git_dir);
$g->{-httpbackend_limiter} = $self->{-httpbackend_limiter};
_cleanup_later($self);
$g;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 33a1f2d..eac11bd 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -120,15 +120,29 @@ chomp @HELP;
my $mail_query = Search::Xapian::Query->new('T' . 'mail');
sub xdir {
- my (undef, $git_dir) = @_;
- "$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
+ my ($self) = @_;
+ if ($self->{version} == 1) {
+ "$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION;
+ } else {
+ "$self->{mainrepo}/xap" . SCHEMA_VERSION;
+ }
}
sub new {
- my ($class, $git_dir, $altid) = @_;
- my $dir = $class->xdir($git_dir);
- my $db = Search::Xapian::Database->new($dir);
- bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class;
+ my ($class, $mainrepo, $altid) = @_;
+ my $version = 1;
+ my $ibx = $mainrepo;
+ if (ref $ibx) {
+ $version = $ibx->{version} || 1;
+ $mainrepo = $ibx->{mainrepo};
+ }
+ my $self = bless {
+ mainrepo => $mainrepo,
+ altid => $altid,
+ version => $version,
+ }, $class;
+ $self->{xdb} = Search::Xapian::Database->new($self->xdir);
+ $self;
}
sub reopen { $_[0]->{xdb}->reopen }
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 265403a..c6c5bd2 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -51,26 +51,43 @@ sub git_unquote ($) {
}
sub new {
- my ($class, $inbox, $creat) = @_;
- my $git_dir = $inbox;
- my $altid;
- if (ref $inbox) {
- $git_dir = $inbox->{mainrepo};
- $altid = $inbox->{altid};
+ my ($class, $ibx, $creat) = @_;
+ my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config
+ my $git_dir = $mainrepo;
+ my ($altid, $git);
+ my $version = 1;
+ if (ref $ibx) {
+ $mainrepo = $ibx->{mainrepo};
+ $altid = $ibx->{altid};
+ $version = $ibx->{version} || 1;
if ($altid) {
require PublicInbox::AltId;
$altid = [ map {
- PublicInbox::AltId->new($inbox, $_);
+ PublicInbox::AltId->new($ibx, $_);
} @$altid ];
}
+ $git = $ibx->git;
+ } else {
+ $git = PublicInbox::Git->new($git_dir); # v1 only
}
require Search::Xapian::WritableDatabase;
- my $self = bless { git_dir => $git_dir, -altid => $altid }, $class;
+ my $self = bless {
+ mainrepo => $mainrepo,
+ git => $git,
+ -altid => $altid,
+ version => $version,
+ }, $class;
my $perm = $self->_git_config_perm;
my $umask = _umask_for($perm);
$self->{umask} = $umask;
- $self->{lock_path} = "$git_dir/ssoma.lock";
- $self->{git} = PublicInbox::Git->new($git_dir);
+ if ($version == 1) {
+ $self->{lock_path} = "$mainrepo/ssoma.lock";
+ } elsif ($version == 2) {
+ $self->{lock_path} = "$mainrepo/inbox.lock";
+ $self->{msgmap_path} = "$mainrepo/msgmap.sqlite3";
+ } else {
+ die "unsupported inbox version=$version\n";
+ }
$self->{creat} = ($creat || 0) == 1;
$self;
}
@@ -86,7 +103,7 @@ sub _xdb_release {
sub _xdb_acquire {
my ($self) = @_;
croak 'already acquired' if $self->{xdb};
- my $dir = PublicInbox::Search->xdir($self->{git_dir});
+ my $dir = $self->xdir;
my $flag = Search::Xapian::DB_OPEN;
if ($self->{creat}) {
require File::Path;
@@ -541,6 +558,7 @@ sub batch_adjust ($$$$) {
}
}
+# only for v1
sub rlog {
my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
my $hex = '[a-f0-9]';
@@ -573,9 +591,14 @@ sub rlog {
sub _msgmap_init {
my ($self) = @_;
- $self->{mm} = eval {
+ $self->{mm} ||= eval {
require PublicInbox::Msgmap;
- PublicInbox::Msgmap->new($self->{git_dir}, 1);
+ my $msgmap_path = $self->{msgmap_path};
+ if (defined $msgmap_path) { # v2
+ PublicInbox::Msgmap->new_file($msgmap_path, 1);
+ } else {
+ PublicInbox::Msgmap->new($self->{mainrepo}, 1);
+ }
};
}
@@ -712,8 +735,11 @@ sub merge_threads {
sub _read_git_config_perm {
my ($self) = @_;
- my @cmd = qw(config core.sharedRepository);
- my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd);
+ my @cmd = qw(config);
+ if ($self->{version} == 2) {
+ push @cmd, "--file=$self->{mainrepo}/inbox-config";
+ }
+ my $fh = $self->{git}->popen(@cmd, 'core.sharedRepository');
local $/ = "\n";
my $perm = <$fh>;
chomp $perm if defined $perm;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 9b68e9b..41bfb8d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -34,7 +34,7 @@ sub new {
xap_ro => undef,
# limit each repo to 1GB or so
- rotate_bytes => int((100 * 1024 * 1024) / $PACKING_FACTOR),
+ rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
};
bless $self, $class
}
@@ -55,11 +55,29 @@ sub add {
my $cmt = $im->add($mime, $check_cb) or return;
$cmt = $im->get_mark($cmt);
my $oid = $im->{last_object_id};
- $self->index_msg($mime, $existing, $cmt, $oid);
+ my $size = $im->{last_object_size};
+
+ my $idx = $self->search_idx;
+ $idx->index_both($mime, $size, $oid);
+ $idx->{xdb}->set_metadata('last_commit', $cmt);
+ my $n = $self->{transact_bytes} += $size;
+ if ($n > PublicInbox::SearchIdx::BATCH_BYTES) {
+ $self->checkpoint;
+ }
+
$mime;
}
-sub index_msg { # TODO
+sub search_idx {
+ my ($self) = @_;
+ $self->{idx} ||= eval {
+ my $idx = PublicInbox::SearchIdx->new($self->{-inbox}, 1);
+ my $mm = $idx->_msgmap_init;
+ $idx->_xdb_acquire->begin_transaction;
+ $self->{transact_bytes} = 0;
+ $mm->{dbh}->begin_work;
+ $idx
+ };
}
sub remove {
@@ -79,12 +97,25 @@ sub remove {
sub done {
my ($self) = @_;
- $self->{im}->done; # PublicInbox::Import::done
+ my $im = $self->{im};
+ $im->done if $im; # PublicInbox::Import::done
+ $self->searchidx_checkpoint;
}
sub checkpoint {
my ($self) = @_;
- $self->{im}->checkpoint; # PublicInbox::Import::checkpoint
+ my $im = $self->{im};
+ $im->checkpoint if $im; # PublicInbox::Import::checkpoint
+ $self->searchidx_checkpoint;
+}
+
+sub searchidx_checkpoint {
+ my ($self) = @_;
+ my $idx = delete $self->{idx} or return;
+
+ $idx->{mm}->{dbh}->commit;
+ $idx->{xdb}->commit_transaction;
+ $idx->_xdb_release;
}
sub git_init {
@@ -127,6 +158,7 @@ sub importer {
} else {
$self->{im} = undef;
$im->done;
+ $self->searchidx_checkpoint;
$im = undef;
my $git_dir = $self->git_init(++$self->{max_git});
my $git = PublicInbox::Git->new($git_dir);
@@ -156,8 +188,6 @@ sub importer {
$self->{max_git} = $max;
return $self->import_init($git, $packed_bytes);
}
- } else {
- warn "latest not found in $pfx\n";
}
$self->{max_git} = $new;
$latest = $self->git_init($new);
@@ -168,6 +198,7 @@ sub import_init {
my ($self, $git, $packed_bytes) = @_;
my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
+ $im->{want_object_id} = 1;
$im->{ssoma_lock} = 0;
$im->{path_type} = 'v2';
$self->{im} = $im;
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index c45dc4e..d30e8a3 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -7,6 +7,7 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
use Date::Parse qw/str2time/;
use Email::MIME;
$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect
+use PublicInbox::Inbox;
use PublicInbox::V2Writable;
my $usage = "usage: $0 NAME EMAIL DIR <MBOX\n";
my $dry_run;
@@ -18,8 +19,10 @@ my $mainrepo = shift or die $usage; # /path/to/v2/repo
my $v2ibx = {
mainrepo => $mainrepo,
name => $name,
+ version => 2,
-primary_address => $email,
};
+$v2ibx = PublicInbox::Inbox->new($v2ibx);
my $im = $dry_run ? undef : PublicInbox::V2Writable->new($v2ibx, 1);
binmode STDIN;
my $msg = '';
--
EW
next prev parent reply other threads:[~2018-02-22 21:42 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-02-22 21:42 [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 01/12] import: allow the epoch (0s) as a valid time Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 02/12] extmsg: fix broken Xapian MID lookup Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 03/12] search: stop assuming Message-ID is unique Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 04/12] www: stop assuming mainrepo == git_dir Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 05/12] v2writable: initial cut for repo-rotation Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 06/12] git: reload alternates file on missing blob Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-02-22 21:42 ` [PATCH 08/12] import_vger_from_inbox: allow "-V" option Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 09/12] import_vger_from_mbox: use PublicInbox::MIME and avoid clobbering Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 10/12] v2: parallelize Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 11/12] v2writable: round-robin to partitions based on article number Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 12/12] searchidxpart: increase pipe size for partitions Eric Wong (Contractor, The Linux Foundation)
2018-02-23 1:22 ` [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180222214222.1086-8-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).