about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--Documentation/public-inbox-index.pod8
-rw-r--r--Documentation/public-inbox-init.pod10
-rw-r--r--lib/PublicInbox/Admin.pm12
-rw-r--r--lib/PublicInbox/SearchIdx.pm35
-rw-r--r--lib/PublicInbox/SearchIdxShard.pm2
-rwxr-xr-xscript/public-inbox-convert3
-rwxr-xr-xscript/public-inbox-index7
-rwxr-xr-xscript/public-inbox-init8
-rw-r--r--t/inbox_idle.t2
-rw-r--r--t/index-git-times.t11
-rw-r--r--t/init.t13
11 files changed, 91 insertions, 20 deletions
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 1ed9f5e7..46a53825 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -145,6 +145,14 @@ below.
 
 Available in public-inbox 1.6.0 (PENDING).
 
+=item --skip-docdata
+
+Stop storing document data in Xapian on an existing inbox.
+
+See L<public-inbox-init(1)/--skip-docdata> for description and caveats.
+
+Available in public-inbox 1.6.0 (PENDING).
+
 =back
 
 =head1 FILES
diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod
index 31c5f743..b25dd1e4 100644
--- a/Documentation/public-inbox-init.pod
+++ b/Documentation/public-inbox-init.pod
@@ -95,6 +95,16 @@ default due to contention in the top-level producer process.
 
 Default: the number of online CPUs, up to 4
 
+=item --skip-docdata
+
+Do not store document data in Xapian, reducing Xapian storage
+overhead by around 1.5%.
+
+Warning: this option prevents rollbacks to public-inbox 1.5.0
+and earlier.
+
+Available since public-inbox 1.6.0 (PENDING).
+
 =back
 
 =head1 ENVIRONMENT
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index f5427af7..b8ead6f7 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -48,13 +48,14 @@ sub resolve_repo_dir {
 sub detect_indexlevel ($) {
         my ($ibx) = @_;
 
-        # brand new or never before indexed inboxes default to full
-        return 'full' unless $ibx->over;
-        delete $ibx->{over}; # don't leave open FD lying around
+        my $over = $ibx->over;
+        my $srch = $ibx->search;
+        delete @$ibx{qw(over search)}; # don't leave open FDs lying around
 
+        # brand new or never before indexed inboxes default to full
+        return 'full' unless $over;
         my $l = 'basic';
-        my $srch = $ibx->search or return $l;
-        delete $ibx->{search}; # don't leave open FD lying around
+        return $l unless $srch;
         if (my $xdb = $srch->xdb) {
                 $l = 'full';
                 my $m = $xdb->get_metadata('indexlevel');
@@ -65,6 +66,7 @@ sub detect_indexlevel ($) {
 $ibx->{inboxdir} has unexpected indexlevel in Xapian: $m
 
                 }
+                $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata');
         }
         $l;
 }
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 5c39f3d6..be46b2b9 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -61,6 +61,10 @@ sub new {
         }, $class;
         $self->xpfx_init;
         $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium';
+        if ($ibx->{-skip_docdata}) {
+                $self->{-set_skip_docdata_once} = 1;
+                $self->{-skip_docdata} = 1;
+        }
         $ibx->umask_prepare;
         if ($version == 1) {
                 $self->{lock_path} = "$inboxdir/ssoma.lock";
@@ -359,10 +363,18 @@ sub add_xapian ($$$$) {
 
         msg_iter($eml, \&index_xapian, [ $self, $doc ]);
         index_ids($self, $doc, $eml, $mids);
-        $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
-        PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
-        my $data = $smsg->to_doc_data;
-        $doc->set_data($data);
+
+        # by default, we maintain compatibility with v1.5.0 and earlier
+        # by writing to docdata.glass; users who never expect to downgrade can
+        # use --skip-docdata
+        if (!$self->{-skip_docdata}) {
+                # WWW doesn't need {to} or {cc}, only NNTP
+                $smsg->{to} = $smsg->{cc} = '';
+                PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
+                my $data = $smsg->to_doc_data;
+                $doc->set_data($data);
+        }
+
         if (my $altid = $self->{-altid}) {
                 foreach my $alt (@$altid) {
                         my $pfx = $alt->{xprefix};
@@ -831,23 +843,28 @@ sub begin_txn_lazy {
 
 # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard)
 # This metadata is read by Admin::detect_indexlevel:
-sub set_indexlevel {
+sub set_metadata_once {
         my ($self) = @_;
 
-        if (!$self->{shard} && # undef or 0, not >0
-                        delete($self->{-set_indexlevel_once})) {
-                my $xdb = $self->{xdb};
+        return if $self->{shard}; # only continue if undef or 0, not >0
+        my $xdb = $self->{xdb};
+
+        if (delete($self->{-set_indexlevel_once})) {
                 my $level = $xdb->get_metadata('indexlevel');
                 if (!$level || $level ne 'medium') {
                         $xdb->set_metadata('indexlevel', 'medium');
                 }
         }
+        if (delete($self->{-set_skip_docdata_once})) {
+                $xdb->get_metadata('skip_docdata') or
+                        $xdb->set_metadata('skip_docdata', '1');
+        }
 }
 
 sub _commit_txn {
         my ($self) = @_;
         if (my $xdb = $self->{xdb}) {
-                set_indexlevel($self);
+                set_metadata_once($self);
                 $xdb->commit_transaction;
         }
         $self->{over}->commit_lazy if $self->{over};
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 59b36087..20077e08 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -16,7 +16,7 @@ sub new {
         my $self = $class->SUPER::new($ibx, 1, $shard);
         # create the DB before forking:
         $self->idx_acquire;
-        $self->set_indexlevel;
+        $self->set_metadata_once;
         $self->idx_release;
         $self->spawn_worker($v2w, $shard) if $v2w->{parallel};
         $self;
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index d655dcc6..4ff198d1 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -77,7 +77,8 @@ if ($old) {
 die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2;
 
 require PublicInbox::Admin;
-$old->{indexlevel} //= PublicInbox::Admin::detect_indexlevel($old);
+my $detected = PublicInbox::Admin::detect_indexlevel($old);
+$old->{indexlevel} //= $detected;
 my $env;
 if ($opt->{'index'}) {
         my $mods = {};
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 30d24838..9855c67d 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -39,7 +39,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
                 indexlevel|index-level|L=s max_size|max-size=s
                 batch_size|batch-size=s
                 sequential_shard|seq-shard|sequential-shard
-                all help|?))
+                skip-docdata all help|?))
         or die "bad command-line args\n$usage";
 if ($opt->{help}) { print $help; exit 0 };
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
@@ -58,9 +58,11 @@ unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
 
 my $mods = {};
 foreach my $ibx (@ibxs) {
+        # detect_indexlevel may also set $ibx->{-skip_docdata}
+        my $detected = PublicInbox::Admin::detect_indexlevel($ibx);
         # XXX: users can shoot themselves in the foot, with opt->{indexlevel}
         $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
-                        'full' : PublicInbox::Admin::detect_indexlevel($ibx));
+                        'full' : $detected);
         PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }
 
@@ -75,6 +77,7 @@ for my $ibx (@ibxs) {
                 PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt});
         }
         $ibx->{-no_fsync} = 1 if !$opt->{fsync};
+        $ibx->{-skip_docdata} //= $opt->{'skip-docdata'};
 
         my $ibx_opt = $opt;
         if (defined(my $s = $ibx->{lc('indexSequentialShard')})) {
diff --git a/script/public-inbox-init b/script/public-inbox-init
index b19c2321..037e8e56 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -34,6 +34,7 @@ require PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-base');
 
 my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs, $show_help);
+my $skip_docdata;
 my $ng = '';
 my %opts = (
         'V|version=i' => \$version,
@@ -42,6 +43,7 @@ my %opts = (
         'skip-artnum=i' => \$skip_artnum,
         'j|jobs=i' => \$jobs,
         'ng|newsgroup=s' => \$ng,
+        'skip-docdata' => \$skip_docdata,
         'help|?' => \$show_help,
 );
 my $usage_cb = sub {
@@ -177,6 +179,12 @@ if (defined $jobs) {
 
 require PublicInbox::InboxWritable;
 $ibx = PublicInbox::InboxWritable->new($ibx, $creat_opt);
+if ($skip_docdata) {
+        $ibx->{indexlevel} //= 'full'; # ensure init_inbox writes xdb
+        $ibx->{indexlevel} eq 'basic' and
+                die "--skip-docdata ignored with --indexlevel=basic\n";
+        $ibx->{-skip_docdata} = $skip_docdata;
+}
 $ibx->init_inbox(0, $skip_epoch, $skip_artnum);
 
 # needed for git prior to v2.1.0
diff --git a/t/inbox_idle.t b/t/inbox_idle.t
index 61287200..e16ee11b 100644
--- a/t/inbox_idle.t
+++ b/t/inbox_idle.t
@@ -29,7 +29,7 @@ for my $V (1, 2) {
         if ($V == 1) {
                 my $sidx = PublicInbox::SearchIdx->new($ibx, 1);
                 $sidx->idx_acquire;
-                $sidx->set_indexlevel;
+                $sidx->set_metadata_once;
                 $sidx->idx_release; # allow watching on lockfile
         }
         my $pi_config = PublicInbox::Config->new(\<<EOF);
diff --git a/t/index-git-times.t b/t/index-git-times.t
index 2e9e88e8..8f80c866 100644
--- a/t/index-git-times.t
+++ b/t/index-git-times.t
@@ -4,6 +4,7 @@ use Test::More;
 use PublicInbox::TestCommon;
 use PublicInbox::Import;
 use PublicInbox::Config;
+use PublicInbox::Admin;
 use File::Path qw(remove_tree);
 
 require_mods(qw(DBD::SQLite Search::Xapian));
@@ -47,11 +48,15 @@ EOF
         PublicInbox::Import::run_die($cmd, undef, { 0 => $r });
 }
 
-run_script(['-index', $v1dir]) or die 'v1 index failed';
+run_script(['-index', '--skip-docdata', $v1dir]) or die 'v1 index failed';
+
 my $smsg;
 {
         my $cfg = PublicInbox::Config->new;
         my $ibx = $cfg->lookup($addr);
+        my $lvl = PublicInbox::Admin::detect_indexlevel($ibx);
+        is($lvl, 'medium', 'indexlevel detected');
+        is($ibx->{-skip_docdata}, 1, '--skip-docdata flag set on -index');
         $smsg = $ibx->over->get_art(1);
         is($smsg->{ds}, 749520000, 'datestamp from git author time');
         is($smsg->{ts}, 1285977600, 'timestamp from git committer time');
@@ -70,6 +75,10 @@ SKIP: {
         my $check_v2 = sub {
                 my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir,
                                 address => $addr});
+                my $lvl = PublicInbox::Admin::detect_indexlevel($ibx);
+                is($lvl, 'medium', 'indexlevel detected after convert');
+                is($ibx->{-skip_docdata}, 1,
+                        '--skip-docdata preserved after convert');
                 my $v2smsg = $ibx->over->get_art(1);
                 is($v2smsg->{ds}, $smsg->{ds},
                         'v2 datestamp from git author time');
diff --git a/t/init.t b/t/init.t
index 4d2c5049..dad09435 100644
--- a/t/init.t
+++ b/t/init.t
@@ -95,6 +95,19 @@ SKIP: {
                 my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir });
                 is(PublicInbox::Admin::detect_indexlevel($ibx), $lvl,
                         'detected expected level w/o config');
+                ok(!$ibx->{-skip_docdata}, 'docdata written by default');
+        }
+        for my $v (1, 2) {
+                my $name = "v$v-skip-docdata";
+                my $dir = "$tmpdir/$name";
+                $cmd = [ '-init', $name, "-V$v", '--skip-docdata',
+                        $dir, "http://example.com/$name",
+                        "$name\@example.com" ];
+                ok(run_script($cmd), "-init -V$v --skip-docdata");
+                my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir });
+                is(PublicInbox::Admin::detect_indexlevel($ibx), 'full',
+                        "detected default indexlevel -V$v");
+                ok($ibx->{-skip_docdata}, "docdata skip set -V$v");
         }
 
         # loop for idempotency