From f344d64066f85dd6737daeb42c94902e1bbfda78 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 20 Aug 2020 20:24:56 +0000 Subject: init+index: support --skip-docdata for Xapian Since we no longer read document data from Xapian, allow users to opt-out of storing it. This breaks compatibility with previous releases of public-inbox, but gives us a ~1.5% space savings on Xapian storage (and associated I/O and page cache pressure reduction). --- lib/PublicInbox/Admin.pm | 12 +++++++----- lib/PublicInbox/SearchIdx.pm | 35 ++++++++++++++++++++++++++--------- lib/PublicInbox/SearchIdxShard.pm | 2 +- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index f5427af7..b8ead6f7 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -48,13 +48,14 @@ sub resolve_repo_dir { sub detect_indexlevel ($) { my ($ibx) = @_; - # brand new or never before indexed inboxes default to full - return 'full' unless $ibx->over; - delete $ibx->{over}; # don't leave open FD lying around + my $over = $ibx->over; + my $srch = $ibx->search; + delete @$ibx{qw(over search)}; # don't leave open FDs lying around + # brand new or never before indexed inboxes default to full + return 'full' unless $over; my $l = 'basic'; - my $srch = $ibx->search or return $l; - delete $ibx->{search}; # don't leave open FD lying around + return $l unless $srch; if (my $xdb = $srch->xdb) { $l = 'full'; my $m = $xdb->get_metadata('indexlevel'); @@ -65,6 +66,7 @@ sub detect_indexlevel ($) { $ibx->{inboxdir} has unexpected indexlevel in Xapian: $m } + $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); } $l; } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5c39f3d6..be46b2b9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -61,6 +61,10 @@ sub new { }, $class; $self->xpfx_init; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; + if 
($ibx->{-skip_docdata}) { + $self->{-set_skip_docdata_once} = 1; + $self->{-skip_docdata} = 1; + } $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; @@ -359,10 +363,18 @@ sub add_xapian ($$$$) { msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); - $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); - my $data = $smsg->to_doc_data; - $doc->set_data($data); + + # by default, we maintain compatibility with v1.5.0 and earlier + # by writing to docdata.glass, users who never expect to downgrade can + # use --skip-docdata + if (!$self->{-skip_docdata}) { + # WWW doesn't need {to} or {cc}, only NNTP + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + my $data = $smsg->to_doc_data; + $doc->set_data($data); + } + if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { my $pfx = $alt->{xprefix}; @@ -831,23 +843,28 @@ sub begin_txn_lazy { # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) # This metadata is read by Admin::detect_indexlevel: -sub set_indexlevel { +sub set_metadata_once { my ($self) = @_; - if (!$self->{shard} && # undef or 0, not >0 - delete($self->{-set_indexlevel_once})) { - my $xdb = $self->{xdb}; + return if $self->{shard}; # only continue if undef or 0, not >0 + my $xdb = $self->{xdb}; + + if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { $xdb->set_metadata('indexlevel', 'medium'); } } + if (delete($self->{-set_skip_docdata_once})) { + $xdb->get_metadata('skip_docdata') or + $xdb->set_metadata('skip_docdata', '1'); + } } sub _commit_txn { my ($self) = @_; if (my $xdb = $self->{xdb}) { - set_indexlevel($self); + set_metadata_once($self); $xdb->commit_transaction; } $self->{over}->commit_lazy if $self->{over}; diff --git a/lib/PublicInbox/SearchIdxShard.pm
b/lib/PublicInbox/SearchIdxShard.pm index 59b36087..20077e08 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -16,7 +16,7 @@ sub new { my $self = $class->SUPER::new($ibx, 1, $shard); # create the DB before forking: $self->idx_acquire; - $self->set_indexlevel; + $self->set_metadata_once; $self->idx_release; $self->spawn_worker($v2w, $shard) if $v2w->{parallel}; $self; -- cgit v1.2.3-24-ge0c7