about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-20 21:00:21 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-20 21:00:21 +0000
commit feabfb1809b911fc97538282234c8b1f087ddb6a (patch)
tree 8fc484f46f1c2c191ccb0a72fcbebfcad38bc115
parent b42bbc915750cf2f0c559514041ba3f5d1a44a12 (diff)
download public-inbox-feabfb1809b911fc97538282234c8b1f087ddb6a.tar.gz
This is too slow, currently.  Working with only 2017 LKML
archives:

         git-only: ~1 minute
     git + SQLite: ~12 minutes
git+Xapian+SQLite: ~45 minutes

So yes, it looks like we'll need to parallelize Xapian indexing,
at least.
-rw-r--r-- lib/PublicInbox/Import.pm 1
-rw-r--r-- lib/PublicInbox/Inbox.pm 4
-rw-r--r-- lib/PublicInbox/Search.pm 26
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 56
-rw-r--r-- lib/PublicInbox/V2Writable.pm 45
-rw-r--r-- scripts/import_vger_from_mbox 3
6 files changed, 106 insertions, 29 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 364ab602..1a2698a7 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -285,6 +285,7 @@ sub add {
         # v2: we need this for Xapian
         if ($self->{want_object_id}) {
                 chomp($self->{last_object_id} = $self->get_mark(":$blob"));
+                $self->{last_object_size} = $n;
         }
 
         my $ref = $self->{ref};
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 2ec2be69..e7856e3c 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -79,7 +79,9 @@ sub new {
 sub git {
         my ($self) = @_;
         $self->{git} ||= eval {
-                my $g = PublicInbox::Git->new($self->{mainrepo});
+                my $git_dir = $self->{mainrepo};
+                $git_dir .= '/all.git' if (($self->{version} || 1) == 2);
+                my $g = PublicInbox::Git->new($git_dir);
                 $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter};
                 _cleanup_later($self);
                 $g;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 33a1f2d3..eac11bd4 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -120,15 +120,29 @@ chomp @HELP;
 my $mail_query = Search::Xapian::Query->new('T' . 'mail');
 
 sub xdir {
-        my (undef, $git_dir) = @_;
-        "$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
+        my ($self) = @_;
+        if ($self->{version} == 1) {
+                "$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION;
+        } else {
+                "$self->{mainrepo}/xap" . SCHEMA_VERSION;
+        }
 }
 
 sub new {
-        my ($class, $git_dir, $altid) = @_;
-        my $dir = $class->xdir($git_dir);
-        my $db = Search::Xapian::Database->new($dir);
-        bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class;
+        my ($class, $mainrepo, $altid) = @_;
+        my $version = 1;
+        my $ibx = $mainrepo;
+        if (ref $ibx) {
+                $version = $ibx->{version} || 1;
+                $mainrepo = $ibx->{mainrepo};
+        }
+        my $self = bless {
+                mainrepo => $mainrepo,
+                altid => $altid,
+                version => $version,
+        }, $class;
+        $self->{xdb} = Search::Xapian::Database->new($self->xdir);
+        $self;
 }
 
 sub reopen { $_[0]->{xdb}->reopen }
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 265403a3..c6c5bd25 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -51,26 +51,43 @@ sub git_unquote ($) {
 }
 
 sub new {
-        my ($class, $inbox, $creat) = @_;
-        my $git_dir = $inbox;
-        my $altid;
-        if (ref $inbox) {
-                $git_dir = $inbox->{mainrepo};
-                $altid = $inbox->{altid};
+        my ($class, $ibx, $creat) = @_;
+        my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config
+        my $git_dir = $mainrepo;
+        my ($altid, $git);
+        my $version = 1;
+        if (ref $ibx) {
+                $mainrepo = $ibx->{mainrepo};
+                $altid = $ibx->{altid};
+                $version = $ibx->{version} || 1;
                 if ($altid) {
                         require PublicInbox::AltId;
                         $altid = [ map {
-                                PublicInbox::AltId->new($inbox, $_);
+                                PublicInbox::AltId->new($ibx, $_);
                         } @$altid ];
                 }
+                $git = $ibx->git;
+        } else {
+                $git = PublicInbox::Git->new($git_dir); # v1 only
         }
         require Search::Xapian::WritableDatabase;
-        my $self = bless { git_dir => $git_dir, -altid => $altid }, $class;
+        my $self = bless {
+                mainrepo => $mainrepo,
+                git => $git,
+                -altid => $altid,
+                version => $version,
+        }, $class;
         my $perm = $self->_git_config_perm;
         my $umask = _umask_for($perm);
         $self->{umask} = $umask;
-        $self->{lock_path} = "$git_dir/ssoma.lock";
-        $self->{git} = PublicInbox::Git->new($git_dir);
+        if ($version == 1) {
+                $self->{lock_path} = "$mainrepo/ssoma.lock";
+        } elsif ($version == 2) {
+                $self->{lock_path} = "$mainrepo/inbox.lock";
+                $self->{msgmap_path} = "$mainrepo/msgmap.sqlite3";
+        } else {
+                die "unsupported inbox version=$version\n";
+        }
         $self->{creat} = ($creat || 0) == 1;
         $self;
 }
@@ -86,7 +103,7 @@ sub _xdb_release {
 sub _xdb_acquire {
         my ($self) = @_;
         croak 'already acquired' if $self->{xdb};
-        my $dir = PublicInbox::Search->xdir($self->{git_dir});
+        my $dir = $self->xdir;
         my $flag = Search::Xapian::DB_OPEN;
         if ($self->{creat}) {
                 require File::Path;
@@ -541,6 +558,7 @@ sub batch_adjust ($$$$) {
         }
 }
 
+# only for v1
 sub rlog {
         my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
         my $hex = '[a-f0-9]';
@@ -573,9 +591,14 @@ sub rlog {
 
 sub _msgmap_init {
         my ($self) = @_;
-        $self->{mm} = eval {
+        $self->{mm} ||= eval {
                 require PublicInbox::Msgmap;
-                PublicInbox::Msgmap->new($self->{git_dir}, 1);
+                my $msgmap_path = $self->{msgmap_path};
+                if (defined $msgmap_path) { # v2
+                        PublicInbox::Msgmap->new_file($msgmap_path, 1);
+                } else {
+                        PublicInbox::Msgmap->new($self->{mainrepo}, 1);
+                }
         };
 }
 
@@ -712,8 +735,11 @@ sub merge_threads {
 
 sub _read_git_config_perm {
         my ($self) = @_;
-        my @cmd = qw(config core.sharedRepository);
-        my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd);
+        my @cmd = qw(config);
+        if ($self->{version} == 2) {
+                push @cmd, "--file=$self->{mainrepo}/inbox-config";
+        }
+        my $fh = $self->{git}->popen(@cmd, 'core.sharedRepository');
         local $/ = "\n";
         my $perm = <$fh>;
         chomp $perm if defined $perm;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 9b68e9b1..41bfb8d1 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -34,7 +34,7 @@ sub new {
                 xap_ro => undef,
 
                 # limit each repo to 1GB or so
-                rotate_bytes => int((100 * 1024 * 1024) / $PACKING_FACTOR),
+                rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
         };
         bless $self, $class
 }
@@ -55,11 +55,29 @@ sub add {
         my $cmt = $im->add($mime, $check_cb) or return;
         $cmt = $im->get_mark($cmt);
         my $oid = $im->{last_object_id};
-        $self->index_msg($mime, $existing, $cmt, $oid);
+        my $size = $im->{last_object_size};
+
+        my $idx = $self->search_idx;
+        $idx->index_both($mime, $size, $oid);
+        $idx->{xdb}->set_metadata('last_commit', $cmt);
+        my $n = $self->{transact_bytes} += $size;
+        if ($n > PublicInbox::SearchIdx::BATCH_BYTES) {
+                $self->checkpoint;
+        }
+
         $mime;
 }
 
-sub index_msg {  # TODO
+sub search_idx {
+        my ($self) = @_;
+        $self->{idx} ||= eval {
+                my $idx = PublicInbox::SearchIdx->new($self->{-inbox}, 1);
+                my $mm = $idx->_msgmap_init;
+                $idx->_xdb_acquire->begin_transaction;
+                $self->{transact_bytes} = 0;
+                $mm->{dbh}->begin_work;
+                $idx
+        };
 }
 
 sub remove {
@@ -79,12 +97,25 @@ sub remove {
 
 sub done {
         my ($self) = @_;
-        $self->{im}->done; # PublicInbox::Import::done
+        my $im = $self->{im};
+        $im->done if $im; # PublicInbox::Import::done
+        $self->searchidx_checkpoint;
 }
 
 sub checkpoint {
         my ($self) = @_;
-        $self->{im}->checkpoint; # PublicInbox::Import::checkpoint
+        my $im = $self->{im};
+        $im->checkpoint if $im; # PublicInbox::Import::checkpoint
+        $self->searchidx_checkpoint;
+}
+
+sub searchidx_checkpoint {
+        my ($self) = @_;
+        my $idx = delete $self->{idx} or return;
+
+        $idx->{mm}->{dbh}->commit;
+        $idx->{xdb}->commit_transaction;
+        $idx->_xdb_release;
 }
 
 sub git_init {
@@ -127,6 +158,7 @@ sub importer {
                 } else {
                         $self->{im} = undef;
                         $im->done;
+                        $self->searchidx_checkpoint;
                         $im = undef;
                         my $git_dir = $self->git_init(++$self->{max_git});
                         my $git = PublicInbox::Git->new($git_dir);
@@ -156,8 +188,6 @@ sub importer {
                         $self->{max_git} = $max;
                         return $self->import_init($git, $packed_bytes);
                 }
-        } else {
-                warn "latest not found in $pfx\n";
         }
         $self->{max_git} = $new;
         $latest = $self->git_init($new);
@@ -168,6 +198,7 @@ sub import_init {
         my ($self, $git, $packed_bytes) = @_;
         my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
         $im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
+        $im->{want_object_id} = 1;
         $im->{ssoma_lock} = 0;
         $im->{path_type} = 'v2';
         $self->{im} = $im;
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index c45dc4ee..d30e8a30 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -7,6 +7,7 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
 use Date::Parse qw/str2time/;
 use Email::MIME;
 $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect
+use PublicInbox::Inbox;
 use PublicInbox::V2Writable;
 my $usage = "usage: $0 NAME EMAIL DIR <MBOX\n";
 my $dry_run;
@@ -18,8 +19,10 @@ my $mainrepo = shift or die $usage; # /path/to/v2/repo
 my $v2ibx = {
         mainrepo => $mainrepo,
         name => $name,
+        version => 2,
         -primary_address => $email,
 };
+$v2ibx = PublicInbox::Inbox->new($v2ibx);
 my $im = $dry_run ? undef : PublicInbox::V2Writable->new($v2ibx, 1);
 binmode STDIN;
 my $msg = '';