diff options
Diffstat (limited to 'lib/PublicInbox/MiscIdx.pm')
-rw-r--r-- | lib/PublicInbox/MiscIdx.pm | 161 |
1 files changed, 161 insertions, 0 deletions
# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>

# like PublicInbox::SearchIdx, but for searching for non-mail messages.
# Things indexed include:
# * inboxes themselves
# * epoch information
# * (maybe) git code repository information (not commits)
# Expect ~100K-1M documents with no parallelism opportunities,
# so no sharding, here.
#
# See MiscSearch for read-only counterpart
package PublicInbox::MiscIdx;
use strict;
use v5.10.1;
use PublicInbox::InboxWritable;
use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
use PublicInbox::SearchIdx qw(index_text term_generator add_val);
use Carp qw(croak);
use File::Path ();
use PublicInbox::MiscSearch;
use PublicInbox::Config;
use PublicInbox::Syscall;
my $json; # shared encoder, initialized on the first ->new

# Constructor.
#   $eidx - hashref-like object; reads {xpfx} (index directory prefix) and
#           the optional {-no_fsync} / {-dangerous} switches.
# Creates "$eidx->{xpfx}/misc" if needed and returns a blessed hash holding
# the directory and the Xapian open flags.  The database itself is not
# opened here; that happens lazily in _begin_txn.
sub new {
	my ($class, $eidx) = @_;
	PublicInbox::SearchIdx::load_xapian_writable();
	my $mi_dir = "$eidx->{xpfx}/misc";
	File::Path::mkpath($mi_dir);
	# NOTE(review): presumably disables copy-on-write for the directory
	# on filesystems which support it -- confirm in PublicInbox::Syscall
	PublicInbox::Syscall::nodatacow_dir($mi_dir);
	my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
	$flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync};
	$flags |= $PublicInbox::SearchIdx::DB_DANGEROUS if $eidx->{-dangerous};
	$json //= PublicInbox::Config::json();
	bless {
		mi_dir => $mi_dir,
		flags => $flags,
		indexlevel => 'full', # small DB, no point in medium?
	}, $class;
}

# Opens (creating if necessary) the writable Xapian DB in {mi_dir} and
# starts a transaction on it.  Croaks if the DB cannot be opened.
# Returns the DB handle; callers are responsible for storing it in {xdb}.
sub _begin_txn ($) {
	my ($self) = @_;
	my $wdb = $PublicInbox::Search::X{WritableDatabase};
	my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) };
	croak "Failed opening $self->{mi_dir}: $@" if $@;
	$xdb->begin_transaction;
	$xdb;
}

# Commits the pending transaction (if any) and forgets the DB handle.
# A no-op when no transaction is open.
sub commit_txn {
	my ($self) = @_;
	my $xdb = delete $self->{xdb} or return;
	$xdb->commit_transaction;
}

# Ensures the on-disk DB exists by opening a transaction (unless one is
# already open) and committing it immediately.
sub create_xdb {
	my ($self) = @_;
	$self->{xdb} //= _begin_txn($self);
	commit_txn($self);
}

# Returns the list of docids carrying the unique term for $eidx_key
# (Q = uniQue in Xapian terminology), in postlist order.
# There should be at most one, unless we had bugs.
sub _docids_for_eidx_key {
	my ($xdb, $eidx_key) = @_;
	my $term = 'Q'.$eidx_key;
	my $cur = $xdb->postlist_begin($term);
	my $end = $xdb->postlist_end($term);
	my @docids;
	for (; $cur != $end; $cur++) {
		push @docids, $cur->get_docid;
	}
	@docids;
}

# Deletes every document keyed to $eidx_key, warning about each removal.
# Opens a transaction if none is active; the caller must commit_txn.
sub remove_eidx_key {
	my ($self, $eidx_key) = @_;
	my $xdb = $self->{xdb} //= _begin_txn($self);
	# collect first, delete afterwards so the postlist is not modified
	# while it is being walked
	for my $docid (_docids_for_eidx_key($xdb, $eidx_key)) {
		$xdb->delete_document($docid);
		warn "# remove inbox docid #$docid ($eidx_key)\n";
	}
}

# adds or updates according to $eidx_key
#   $ibx - inbox object; must provide eidx_key, modified, uidvalidity,
#          description, and either max_git_epoch/git_epoch (v2) or git (v1)
# Opens a transaction if none is active; the caller must commit_txn.
sub index_ibx {
	my ($self, $ibx) = @_;
	my $eidx_key = $ibx->eidx_key;
	my $xdb = $self->{xdb} //= _begin_txn($self);

	# the first docid found is replaced in-place below, any extras are
	# duplicates which should never exist
	my ($docid, @drop) = _docids_for_eidx_key($xdb, $eidx_key);
	for my $i (@drop) {
		warn <<EOF;
W: multiple inboxes keyed to `$eidx_key', deleting #$i
EOF
	}
	$xdb->delete_document($_) for @drop; # just in case

	my $doc = $PublicInbox::Search::X{Document}->new;
	term_generator($self)->set_document($doc);

	# allow sorting by modified and uidvalidity (created at)
	add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
	add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);

	$doc->add_boolean_term('Q'.$eidx_key); # uniQue id
	$doc->add_boolean_term('T'.'inbox'); # Type

	# force reread from disk, {description} could be loaded from {misc}
	delete @$ibx{qw(-art_min -art_max description)};
	if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
		$doc->add_boolean_term('T'.'newsgroup'); # additional Type
		my $n = $ibx->art_min;
		add_val($doc, $PublicInbox::MiscSearch::ART_MIN, $n) if $n;
		$n = $ibx->art_max;
		add_val($doc, $PublicInbox::MiscSearch::ART_MAX, $n) if $n;
	}

	my $desc = $ibx->description;

	# description = S/Subject (or title)
	# address = A/Author
	index_text($self, $desc, 1, 'S');
	index_text($self, $ibx->{name}, 1, 'XNAME');

	# multi-valued fields and their term prefixes; kept as an ordered
	# list (not a hash) so documents are built in the same order on
	# every run regardless of hash randomization
	my @fields = (
		[ address => 'A' ],
		[ listid => 'XLISTID' ],
		[ infourl => 'XINFOURL' ],
		[ url => 'XURL' ],
	);
	for my $pair (@fields) {
		my ($f, $pfx) = @$pair;
		for my $v (@{$ibx->{$f} // []}) {
			index_text($self, $v, 1, $pfx);
		}
	}

	# document data: JSON object mapping URL path => manifest entry
	# (plus the on-disk git_dir) for every git repository of the inbox
	my $data = {};
	if (defined(my $max = $ibx->max_git_epoch)) { # v2
		my $pfx = "/$ibx->{name}/git/";
		for my $epoch (0..$max) {
			# an epoch may be absent (e.g. partial clone);
			# skip just that epoch instead of silently
			# abandoning the whole inbox document
			my $git = $ibx->git_epoch($epoch) or next;
			if (my $ent = $git->manifest_entry($epoch, $desc)) {
				$data->{"$pfx$epoch.git"} = $ent;
				$ent->{git_dir} = $git->{git_dir};
			}
			$git->cleanup; # ->modified starts cat-file --batch
		}
	} elsif (my $ent = $ibx->git->manifest_entry) { # v1
		$ent->{git_dir} = $ibx->{inboxdir};
		$data->{"/$ibx->{name}"} = $ent;
	}
	$doc->set_data($json->encode($data));
	if (defined $docid) {
		$xdb->replace_document($docid, $doc);
	} else {
		$xdb->add_document($doc);
	}
}

1;