From feabfb1809b911fc97538282234c8b1f087ddb6a Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Tue, 20 Feb 2018 21:00:21 +0000 Subject: v2: support Xapian + SQLite indexing This is too slow, currently. Working with only 2017 LKML archives: git-only: ~1 minute git + SQLite: ~12 minutes git+Xapian+SQLite: ~45 minutes So yes, it looks like we'll need to parallelize Xapian indexing, at least. --- lib/PublicInbox/Import.pm | 1 + lib/PublicInbox/Inbox.pm | 4 +++- lib/PublicInbox/Search.pm | 26 +++++++++++++++----- lib/PublicInbox/SearchIdx.pm | 56 +++++++++++++++++++++++++++++++------------ lib/PublicInbox/V2Writable.pm | 45 ++++++++++++++++++++++++++++------ scripts/import_vger_from_mbox | 3 +++ 6 files changed, 106 insertions(+), 29 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 364ab602..1a2698a7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -285,6 +285,7 @@ sub add { # v2: we need this for Xapian if ($self->{want_object_id}) { chomp($self->{last_object_id} = $self->get_mark(":$blob")); + $self->{last_object_size} = $n; } my $ref = $self->{ref}; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 2ec2be69..e7856e3c 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -79,7 +79,9 @@ sub new { sub git { my ($self) = @_; $self->{git} ||= eval { - my $g = PublicInbox::Git->new($self->{mainrepo}); + my $git_dir = $self->{mainrepo}; + $git_dir .= '/all.git' if (($self->{version} || 1) == 2); + my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; _cleanup_later($self); $g; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 33a1f2d3..eac11bd4 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -120,15 +120,29 @@ chomp @HELP; my $mail_query = Search::Xapian::Query->new('T' . 
'mail'); sub xdir { - my (undef, $git_dir) = @_; - "$git_dir/public-inbox/xapian" . SCHEMA_VERSION; + my ($self) = @_; + if ($self->{version} == 1) { + "$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION; + } else { + "$self->{mainrepo}/xap" . SCHEMA_VERSION; + } } sub new { - my ($class, $git_dir, $altid) = @_; - my $dir = $class->xdir($git_dir); - my $db = Search::Xapian::Database->new($dir); - bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class; + my ($class, $mainrepo, $altid) = @_; + my $version = 1; + my $ibx = $mainrepo; + if (ref $ibx) { + $version = $ibx->{version} || 1; + $mainrepo = $ibx->{mainrepo}; + } + my $self = bless { + mainrepo => $mainrepo, + altid => $altid, + version => $version, + }, $class; + $self->{xdb} = Search::Xapian::Database->new($self->xdir); + $self; } sub reopen { $_[0]->{xdb}->reopen } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 265403a3..c6c5bd25 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -51,26 +51,43 @@ sub git_unquote ($) { } sub new { - my ($class, $inbox, $creat) = @_; - my $git_dir = $inbox; - my $altid; - if (ref $inbox) { - $git_dir = $inbox->{mainrepo}; - $altid = $inbox->{altid}; + my ($class, $ibx, $creat) = @_; + my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config + my $git_dir = $mainrepo; + my ($altid, $git); + my $version = 1; + if (ref $ibx) { + $mainrepo = $ibx->{mainrepo}; + $altid = $ibx->{altid}; + $version = $ibx->{version} || 1; if ($altid) { require PublicInbox::AltId; $altid = [ map { - PublicInbox::AltId->new($inbox, $_); + PublicInbox::AltId->new($ibx, $_); } @$altid ]; } + $git = $ibx->git; + } else { + $git = PublicInbox::Git->new($git_dir); # v1 only } require Search::Xapian::WritableDatabase; - my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; + my $self = bless { + mainrepo => $mainrepo, + git => $git, + -altid => $altid, + version => $version, + }, $class; my $perm = 
$self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; - $self->{lock_path} = "$git_dir/ssoma.lock"; - $self->{git} = PublicInbox::Git->new($git_dir); + if ($version == 1) { + $self->{lock_path} = "$mainrepo/ssoma.lock"; + } elsif ($version == 2) { + $self->{lock_path} = "$mainrepo/inbox.lock"; + $self->{msgmap_path} = "$mainrepo/msgmap.sqlite3"; + } else { + die "unsupported inbox version=$version\n"; + } $self->{creat} = ($creat || 0) == 1; $self; } @@ -86,7 +103,7 @@ sub _xdb_release { sub _xdb_acquire { my ($self) = @_; croak 'already acquired' if $self->{xdb}; - my $dir = PublicInbox::Search->xdir($self->{git_dir}); + my $dir = $self->xdir; my $flag = Search::Xapian::DB_OPEN; if ($self->{creat}) { require File::Path; @@ -541,6 +558,7 @@ sub batch_adjust ($$$$) { } } +# only for v1 sub rlog { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; @@ -573,9 +591,14 @@ sub rlog { sub _msgmap_init { my ($self) = @_; - $self->{mm} = eval { + $self->{mm} ||= eval { require PublicInbox::Msgmap; - PublicInbox::Msgmap->new($self->{git_dir}, 1); + my $msgmap_path = $self->{msgmap_path}; + if (defined $msgmap_path) { # v2 + PublicInbox::Msgmap->new_file($msgmap_path, 1); + } else { + PublicInbox::Msgmap->new($self->{mainrepo}, 1); + } }; } @@ -712,8 +735,11 @@ sub merge_threads { sub _read_git_config_perm { my ($self) = @_; - my @cmd = qw(config core.sharedRepository); - my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd); + my @cmd = qw(config); + if ($self->{version} == 2) { + push @cmd, "--file=$self->{mainrepo}/inbox-config"; + } + my $fh = $self->{git}->popen(@cmd, 'core.sharedRepository'); local $/ = "\n"; my $perm = <$fh>; chomp $perm if defined $perm; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 9b68e9b1..41bfb8d1 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -34,7 +34,7 @@ sub new { xap_ro => undef, # limit each repo to 1GB or so - 
rotate_bytes => int((100 * 1024 * 1024) / $PACKING_FACTOR), + rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR), }; bless $self, $class } @@ -55,11 +55,29 @@ sub add { my $cmt = $im->add($mime, $check_cb) or return; $cmt = $im->get_mark($cmt); my $oid = $im->{last_object_id}; - $self->index_msg($mime, $existing, $cmt, $oid); + my $size = $im->{last_object_size}; + + my $idx = $self->search_idx; + $idx->index_both($mime, $size, $oid); + $idx->{xdb}->set_metadata('last_commit', $cmt); + my $n = $self->{transact_bytes} += $size; + if ($n > PublicInbox::SearchIdx::BATCH_BYTES) { + $self->checkpoint; + } + $mime; } -sub index_msg { # TODO +sub search_idx { + my ($self) = @_; + $self->{idx} ||= eval { + my $idx = PublicInbox::SearchIdx->new($self->{-inbox}, 1); + my $mm = $idx->_msgmap_init; + $idx->_xdb_acquire->begin_transaction; + $self->{transact_bytes} = 0; + $mm->{dbh}->begin_work; + $idx + }; } sub remove { @@ -79,12 +97,25 @@ sub remove { sub done { my ($self) = @_; - $self->{im}->done; # PublicInbox::Import::done + my $im = $self->{im}; + $im->done if $im; # PublicInbox::Import::done + $self->searchidx_checkpoint; } sub checkpoint { my ($self) = @_; - $self->{im}->checkpoint; # PublicInbox::Import::checkpoint + my $im = $self->{im}; + $im->checkpoint if $im; # PublicInbox::Import::checkpoint + $self->searchidx_checkpoint; +} + +sub searchidx_checkpoint { + my ($self) = @_; + my $idx = delete $self->{idx} or return; + + $idx->{mm}->{dbh}->commit; + $idx->{xdb}->commit_transaction; + $idx->_xdb_release; } sub git_init { @@ -127,6 +158,7 @@ sub importer { } else { $self->{im} = undef; $im->done; + $self->searchidx_checkpoint; $im = undef; my $git_dir = $self->git_init(++$self->{max_git}); my $git = PublicInbox::Git->new($git_dir); @@ -156,8 +188,6 @@ sub importer { $self->{max_git} = $max; return $self->import_init($git, $packed_bytes); } - } else { - warn "latest not found in $pfx\n"; } $self->{max_git} = $new; $latest = $self->git_init($new); @@ -168,6 
+198,7 @@ sub import_init { my ($self, $git, $packed_bytes) = @_; my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox}); $im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR); + $im->{want_object_id} = 1; $im->{ssoma_lock} = 0; $im->{path_type} = 'v2'; $self->{im} = $im; diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox index c45dc4ee..d30e8a30 100644 --- a/scripts/import_vger_from_mbox +++ b/scripts/import_vger_from_mbox @@ -7,6 +7,7 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use Date::Parse qw/str2time/; use Email::MIME; $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::Inbox; use PublicInbox::V2Writable; my $usage = "usage: $0 NAME EMAIL DIR $mainrepo, name => $name, + version => 2, -primary_address => $email, }; +$v2ibx = PublicInbox::Inbox->new($v2ibx); my $im = $dry_run ? undef : PublicInbox::V2Writable->new($v2ibx, 1); binmode STDIN; my $msg = ''; -- cgit v1.2.3-24-ge0c7