about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author: Eric Wong <e@yhbt.net> 2020-08-20 20:24:56 +0000
committer: Eric Wong <e@yhbt.net> 2020-08-20 21:11:24 +0000
commit: f344d64066f85dd6737daeb42c94902e1bbfda78 (patch)
tree: 9a68a2a657a13ec245cfe360031b601a4d9d0c5c /lib
parent: f62ddb19552b19f398d56193d7cf20cf20b61a04 (diff)
download: public-inbox-f344d64066f85dd6737daeb42c94902e1bbfda78.tar.gz
Since we no longer read document data from Xapian, allow users
to opt out of storing it.

Opting out breaks compatibility with previous releases of
public-inbox, but gives us a ~1.5% space savings on Xapian
storage (and associated I/O and page cache pressure reduction).
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Admin.pm 12
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 35
-rw-r--r-- lib/PublicInbox/SearchIdxShard.pm 2
3 files changed, 34 insertions, 15 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index f5427af7..b8ead6f7 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -48,13 +48,14 @@ sub resolve_repo_dir {
 sub detect_indexlevel ($) {
         my ($ibx) = @_;
 
-        # brand new or never before indexed inboxes default to full
-        return 'full' unless $ibx->over;
-        delete $ibx->{over}; # don't leave open FD lying around
+        my $over = $ibx->over;
+        my $srch = $ibx->search;
+        delete @$ibx{qw(over search)}; # don't leave open FDs lying around
 
+        # brand new or never before indexed inboxes default to full
+        return 'full' unless $over;
         my $l = 'basic';
-        my $srch = $ibx->search or return $l;
-        delete $ibx->{search}; # don't leave open FD lying around
+        return $l unless $srch;
         if (my $xdb = $srch->xdb) {
                 $l = 'full';
                 my $m = $xdb->get_metadata('indexlevel');
@@ -65,6 +66,7 @@ sub detect_indexlevel ($) {
 $ibx->{inboxdir} has unexpected indexlevel in Xapian: $m
 
                 }
+                $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata');
         }
         $l;
 }
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 5c39f3d6..be46b2b9 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -61,6 +61,10 @@ sub new {
         }, $class;
         $self->xpfx_init;
         $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium';
+        if ($ibx->{-skip_docdata}) {
+                $self->{-set_skip_docdata_once} = 1;
+                $self->{-skip_docdata} = 1;
+        }
         $ibx->umask_prepare;
         if ($version == 1) {
                 $self->{lock_path} = "$inboxdir/ssoma.lock";
@@ -359,10 +363,18 @@ sub add_xapian ($$$$) {
 
         msg_iter($eml, \&index_xapian, [ $self, $doc ]);
         index_ids($self, $doc, $eml, $mids);
-        $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
-        PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
-        my $data = $smsg->to_doc_data;
-        $doc->set_data($data);
+
+        # by default, we maintain compatibility with v1.5.0 and earlier
+        # by writing to docdata.glass; users who never expect to downgrade
+        # can use --skip-docdata
+        if (!$self->{-skip_docdata}) {
+                # WWW doesn't need {to} or {cc}, only NNTP
+                $smsg->{to} = $smsg->{cc} = '';
+                PublicInbox::OverIdx::parse_references($smsg, $eml, $mids);
+                my $data = $smsg->to_doc_data;
+                $doc->set_data($data);
+        }
+
         if (my $altid = $self->{-altid}) {
                 foreach my $alt (@$altid) {
                         my $pfx = $alt->{xprefix};
@@ -831,23 +843,28 @@ sub begin_txn_lazy {
 
 # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard)
 # This metadata is read by Admin::detect_indexlevel:
-sub set_indexlevel {
+sub set_metadata_once {
         my ($self) = @_;
 
-        if (!$self->{shard} && # undef or 0, not >0
-                        delete($self->{-set_indexlevel_once})) {
-                my $xdb = $self->{xdb};
+        return if $self->{shard}; # only continue if undef or 0, not >0
+        my $xdb = $self->{xdb};
+
+        if (delete($self->{-set_indexlevel_once})) {
                 my $level = $xdb->get_metadata('indexlevel');
                 if (!$level || $level ne 'medium') {
                         $xdb->set_metadata('indexlevel', 'medium');
                 }
         }
+        if (delete($self->{-set_skip_docdata_once})) {
+                $xdb->get_metadata('skip_docdata') or
+                        $xdb->set_metadata('skip_docdata', '1');
+        }
 }
 
 sub _commit_txn {
         my ($self) = @_;
         if (my $xdb = $self->{xdb}) {
-                set_indexlevel($self);
+                set_metadata_once($self);
                 $xdb->commit_transaction;
         }
         $self->{over}->commit_lazy if $self->{over};
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 59b36087..20077e08 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -16,7 +16,7 @@ sub new {
         my $self = $class->SUPER::new($ibx, 1, $shard);
         # create the DB before forking:
         $self->idx_acquire;
-        $self->set_indexlevel;
+        $self->set_metadata_once;
         $self->idx_release;
         $self->spawn_worker($v2w, $shard) if $v2w->{parallel};
         $self;