user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/12] v2: support Xapian + SQLite indexing
Date: Thu, 22 Feb 2018 21:42:17 +0000	[thread overview]
Message-ID: <20180222214222.1086-8-e@80x24.org> (raw)
In-Reply-To: <20180222214222.1086-1-e@80x24.org>

This is too slow, currently.  Working with only 2017 LKML
archives:

         git-only: ~1 minute
     git + SQLite: ~12 minutes
git+Xapian+SQLite: ~45 minutes

So yes, it looks like we'll need to parallelize Xapian indexing,
at least.
---
 lib/PublicInbox/Import.pm     |  1 +
 lib/PublicInbox/Inbox.pm      |  4 +++-
 lib/PublicInbox/Search.pm     | 26 +++++++++++++++-----
 lib/PublicInbox/SearchIdx.pm  | 56 +++++++++++++++++++++++++++++++------------
 lib/PublicInbox/V2Writable.pm | 45 ++++++++++++++++++++++++++++------
 scripts/import_vger_from_mbox |  3 +++
 6 files changed, 106 insertions(+), 29 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 364ab60..1a2698a 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -285,6 +285,7 @@ sub add {
 	# v2: we need this for Xapian
 	if ($self->{want_object_id}) {
 		chomp($self->{last_object_id} = $self->get_mark(":$blob"));
+		$self->{last_object_size} = $n;
 	}
 
 	my $ref = $self->{ref};
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 2ec2be6..e7856e3 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -79,7 +79,9 @@ sub new {
 sub git {
 	my ($self) = @_;
 	$self->{git} ||= eval {
-		my $g = PublicInbox::Git->new($self->{mainrepo});
+		my $git_dir = $self->{mainrepo};
+		$git_dir .= '/all.git' if (($self->{version} || 1) == 2);
+		my $g = PublicInbox::Git->new($git_dir);
 		$g->{-httpbackend_limiter} = $self->{-httpbackend_limiter};
 		_cleanup_later($self);
 		$g;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 33a1f2d..eac11bd 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -120,15 +120,29 @@ chomp @HELP;
 my $mail_query = Search::Xapian::Query->new('T' . 'mail');
 
 sub xdir {
-	my (undef, $git_dir) = @_;
-	"$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
+	my ($self) = @_;
+	if ($self->{version} == 1) {
+		"$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION;
+	} else {
+		"$self->{mainrepo}/xap" . SCHEMA_VERSION;
+	}
 }
 
 sub new {
-	my ($class, $git_dir, $altid) = @_;
-	my $dir = $class->xdir($git_dir);
-	my $db = Search::Xapian::Database->new($dir);
-	bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class;
+	my ($class, $mainrepo, $altid) = @_;
+	my $version = 1;
+	my $ibx = $mainrepo;
+	if (ref $ibx) {
+		$version = $ibx->{version} || 1;
+		$mainrepo = $ibx->{mainrepo};
+	}
+	my $self = bless {
+		mainrepo => $mainrepo,
+		altid => $altid,
+		version => $version,
+	}, $class;
+	$self->{xdb} = Search::Xapian::Database->new($self->xdir);
+	$self;
 }
 
 sub reopen { $_[0]->{xdb}->reopen }
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 265403a..c6c5bd2 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -51,26 +51,43 @@ sub git_unquote ($) {
 }
 
 sub new {
-	my ($class, $inbox, $creat) = @_;
-	my $git_dir = $inbox;
-	my $altid;
-	if (ref $inbox) {
-		$git_dir = $inbox->{mainrepo};
-		$altid = $inbox->{altid};
+	my ($class, $ibx, $creat) = @_;
+	my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config
+	my $git_dir = $mainrepo;
+	my ($altid, $git);
+	my $version = 1;
+	if (ref $ibx) {
+		$mainrepo = $ibx->{mainrepo};
+		$altid = $ibx->{altid};
+		$version = $ibx->{version} || 1;
 		if ($altid) {
 			require PublicInbox::AltId;
 			$altid = [ map {
-				PublicInbox::AltId->new($inbox, $_);
+				PublicInbox::AltId->new($ibx, $_);
 			} @$altid ];
 		}
+		$git = $ibx->git;
+	} else {
+		$git = PublicInbox::Git->new($git_dir); # v1 only
 	}
 	require Search::Xapian::WritableDatabase;
-	my $self = bless { git_dir => $git_dir, -altid => $altid }, $class;
+	my $self = bless {
+		mainrepo => $mainrepo,
+		git => $git,
+		-altid => $altid,
+		version => $version,
+	}, $class;
 	my $perm = $self->_git_config_perm;
 	my $umask = _umask_for($perm);
 	$self->{umask} = $umask;
-	$self->{lock_path} = "$git_dir/ssoma.lock";
-	$self->{git} = PublicInbox::Git->new($git_dir);
+	if ($version == 1) {
+		$self->{lock_path} = "$mainrepo/ssoma.lock";
+	} elsif ($version == 2) {
+		$self->{lock_path} = "$mainrepo/inbox.lock";
+		$self->{msgmap_path} = "$mainrepo/msgmap.sqlite3";
+	} else {
+		die "unsupported inbox version=$version\n";
+	}
 	$self->{creat} = ($creat || 0) == 1;
 	$self;
 }
@@ -86,7 +103,7 @@ sub _xdb_release {
 sub _xdb_acquire {
 	my ($self) = @_;
 	croak 'already acquired' if $self->{xdb};
-	my $dir = PublicInbox::Search->xdir($self->{git_dir});
+	my $dir = $self->xdir;
 	my $flag = Search::Xapian::DB_OPEN;
 	if ($self->{creat}) {
 		require File::Path;
@@ -541,6 +558,7 @@ sub batch_adjust ($$$$) {
 	}
 }
 
+# only for v1
 sub rlog {
 	my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
 	my $hex = '[a-f0-9]';
@@ -573,9 +591,14 @@ sub rlog {
 
 sub _msgmap_init {
 	my ($self) = @_;
-	$self->{mm} = eval {
+	$self->{mm} ||= eval {
 		require PublicInbox::Msgmap;
-		PublicInbox::Msgmap->new($self->{git_dir}, 1);
+		my $msgmap_path = $self->{msgmap_path};
+		if (defined $msgmap_path) { # v2
+			PublicInbox::Msgmap->new_file($msgmap_path, 1);
+		} else {
+			PublicInbox::Msgmap->new($self->{mainrepo}, 1);
+		}
 	};
 }
 
@@ -712,8 +735,11 @@ sub merge_threads {
 
 sub _read_git_config_perm {
 	my ($self) = @_;
-	my @cmd = qw(config core.sharedRepository);
-	my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd);
+	my @cmd = qw(config);
+	if ($self->{version} == 2) {
+		push @cmd, "--file=$self->{mainrepo}/inbox-config";
+	}
+	my $fh = $self->{git}->popen(@cmd, 'core.sharedRepository');
 	local $/ = "\n";
 	my $perm = <$fh>;
 	chomp $perm if defined $perm;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 9b68e9b..41bfb8d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -34,7 +34,7 @@ sub new {
 		xap_ro => undef,
 
 		# limit each repo to 1GB or so
-		rotate_bytes => int((100 * 1024 * 1024) / $PACKING_FACTOR),
+		rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
 	};
 	bless $self, $class
 }
@@ -55,11 +55,29 @@ sub add {
 	my $cmt = $im->add($mime, $check_cb) or return;
 	$cmt = $im->get_mark($cmt);
 	my $oid = $im->{last_object_id};
-	$self->index_msg($mime, $existing, $cmt, $oid);
+	my $size = $im->{last_object_size};
+
+	my $idx = $self->search_idx;
+	$idx->index_both($mime, $size, $oid);
+	$idx->{xdb}->set_metadata('last_commit', $cmt);
+	my $n = $self->{transact_bytes} += $size;
+	if ($n > PublicInbox::SearchIdx::BATCH_BYTES) {
+		$self->checkpoint;
+	}
+
 	$mime;
 }
 
-sub index_msg {  # TODO
+sub search_idx {
+	my ($self) = @_;
+	$self->{idx} ||= eval {
+		my $idx = PublicInbox::SearchIdx->new($self->{-inbox}, 1);
+		my $mm = $idx->_msgmap_init;
+		$idx->_xdb_acquire->begin_transaction;
+		$self->{transact_bytes} = 0;
+		$mm->{dbh}->begin_work;
+		$idx
+	};
 }
 
 sub remove {
@@ -79,12 +97,25 @@ sub remove {
 
 sub done {
 	my ($self) = @_;
-	$self->{im}->done; # PublicInbox::Import::done
+	my $im = $self->{im};
+	$im->done if $im; # PublicInbox::Import::done
+	$self->searchidx_checkpoint;
 }
 
 sub checkpoint {
 	my ($self) = @_;
-	$self->{im}->checkpoint; # PublicInbox::Import::checkpoint
+	my $im = $self->{im};
+	$im->checkpoint if $im; # PublicInbox::Import::checkpoint
+	$self->searchidx_checkpoint;
+}
+
+sub searchidx_checkpoint {
+	my ($self) = @_;
+	my $idx = delete $self->{idx} or return;
+
+	$idx->{mm}->{dbh}->commit;
+	$idx->{xdb}->commit_transaction;
+	$idx->_xdb_release;
 }
 
 sub git_init {
@@ -127,6 +158,7 @@ sub importer {
 		} else {
 			$self->{im} = undef;
 			$im->done;
+			$self->searchidx_checkpoint;
 			$im = undef;
 			my $git_dir = $self->git_init(++$self->{max_git});
 			my $git = PublicInbox::Git->new($git_dir);
@@ -156,8 +188,6 @@ sub importer {
 			$self->{max_git} = $max;
 			return $self->import_init($git, $packed_bytes);
 		}
-	} else {
-		warn "latest not found in $pfx\n";
 	}
 	$self->{max_git} = $new;
 	$latest = $self->git_init($new);
@@ -168,6 +198,7 @@ sub import_init {
 	my ($self, $git, $packed_bytes) = @_;
 	my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
 	$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
+	$im->{want_object_id} = 1;
 	$im->{ssoma_lock} = 0;
 	$im->{path_type} = 'v2';
 	$self->{im} = $im;
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index c45dc4e..d30e8a3 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -7,6 +7,7 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
 use Date::Parse qw/str2time/;
 use Email::MIME;
 $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect
+use PublicInbox::Inbox;
 use PublicInbox::V2Writable;
 my $usage = "usage: $0 NAME EMAIL DIR <MBOX\n";
 my $dry_run;
@@ -18,8 +19,10 @@ my $mainrepo = shift or die $usage; # /path/to/v2/repo
 my $v2ibx = {
 	mainrepo => $mainrepo,
 	name => $name,
+	version => 2,
 	-primary_address => $email,
 };
+$v2ibx = PublicInbox::Inbox->new($v2ibx);
 my $im = $dry_run ? undef : PublicInbox::V2Writable->new($v2ibx, 1);
 binmode STDIN;
 my $msg = '';
-- 
EW


  parent reply	other threads:[~2018-02-22 21:42 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-22 21:42 [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 01/12] import: allow the epoch (0s) as a valid time Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 02/12] extmsg: fix broken Xapian MID lookup Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 03/12] search: stop assuming Message-ID is unique Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 04/12] www: stop assuming mainrepo == git_dir Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 05/12] v2writable: initial cut for repo-rotation Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 06/12] git: reload alternates file on missing blob Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-02-22 21:42 ` [PATCH 08/12] import_vger_from_inbox: allow "-V" option Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 09/12] import_vger_from_mbox: use PublicInbox::MIME and avoid clobbering Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 10/12] v2: parallelize Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 11/12] v2writable: round-robin to partitions based on article number Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 12/12] searchidxpart: increase pipe size for partitions Eric Wong (Contractor, The Linux Foundation)
2018-02-23  1:22 ` [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180222214222.1086-8-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).