# Copyright (C) 2020-2021 all contributors
# License: AGPL-3.0+

# like PublicInbox::SearchIdx, but for searching for non-mail messages.
# Things indexed include:
# * inboxes themselves
# * epoch information
# * (maybe) git code repository information
# Expect ~100K-1M documents with no parallelism opportunities,
# so no sharding, here.
#
# See MiscSearch for read-only counterpart
package PublicInbox::MiscIdx;
use strict;
use v5.10.1;
use PublicInbox::InboxWritable;
use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
use PublicInbox::SearchIdx qw(index_text term_generator add_val);
use PublicInbox::Spawn qw(nodatacow_dir);
use Carp qw(croak);
use File::Path ();
use PublicInbox::MiscSearch;
use PublicInbox::Config;

# JSON encoder shared by all instances; set on the first call to new()
my $json;

# Constructor.
#   $eidx - hashref-like object; {xpfx} is the directory prefix under
#           which the "misc" Xapian directory is created, and a true
#           {-no_fsync} adds DB_NO_SYNC to the database open flags.
# Creates "$eidx->{xpfx}/misc" if needed and returns the new object.
# The database itself is not opened until begin_txn.
sub new {
	my ($class, $eidx) = @_;
	PublicInbox::SearchIdx::load_xapian_writable();
	my $mi_dir = "$eidx->{xpfx}/misc";
	File::Path::mkpath($mi_dir);
	nodatacow_dir($mi_dir);
	my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
	$flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync};
	$json //= PublicInbox::Config::json();
	bless {
		mi_dir => $mi_dir,
		flags => $flags,
		indexlevel => 'full', # small DB, no point in medium?
	}, $class;
}

# Opens the writable database in {mi_dir}, stores it in {xdb} and begins
# a transaction on it.  Croaks if {xdb} is already set (a transaction is
# in progress) or if the database cannot be opened.
sub begin_txn {
	my ($self) = @_;
	croak 'BUG: already in txn' if $self->{xdb}; # XXX make lazy?
	my $wdb = $PublicInbox::Search::X{WritableDatabase};
	my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) };
	croak "Failed opening $self->{mi_dir}: $@" if $@;
	$self->{xdb} = $xdb;
	$xdb->begin_transaction;
}

# Commits the transaction started by begin_txn and drops {xdb} from
# $self, so the next begin_txn reopens the database.
# Croaks if no transaction is in progress.
sub commit_txn {
	my ($self) = @_;
	croak 'BUG: not in txn' unless $self->{xdb}; # XXX make lazy?
	delete($self->{xdb})->commit_transaction;
}

# Deletes every document carrying the unique term "Q$eidx_key", warning
# once per deleted docid.  Expects {xdb} to be set (see begin_txn).
sub remove_eidx_key {
	my ($self, $eidx_key) = @_;
	my $xdb = $self->{xdb};
	my $head = $xdb->postlist_begin('Q'.$eidx_key);
	my $tail = $xdb->postlist_end('Q'.$eidx_key);
	my @docids; # only one, unless we had bugs
	# collect first, delete afterwards: do not modify the postlist
	# while it is being walked
	for (; $head != $tail; $head++) {
		push @docids, $head->get_docid;
	}
	for my $docid (@docids) {
		$xdb->delete_document($docid);
		warn "I: remove inbox docid #$docid ($eidx_key)\n";
	}
}

# adds or updates according to $eidx_key
#   $ibx - inbox object; its eidx_key, modified, uidvalidity,
#          description, {name}, {newsgroup}, address/listid/infourl/url
#          lists and git manifest entries are indexed.
# The first existing document keyed to the inbox is replaced in place;
# any further documents with the same key are deleted with a warning.
# Expects {xdb} to be set (see begin_txn).
sub index_ibx {
	my ($self, $ibx) = @_;
	my $eidx_key = $ibx->eidx_key;
	my $xdb = $self->{xdb};
	# Q = uniQue in Xapian terminology
	my $head = $xdb->postlist_begin('Q'.$eidx_key);
	my $tail = $xdb->postlist_end('Q'.$eidx_key);
	my ($docid, @drop);
	for (; $head != $tail; $head++) {
		if (defined $docid) {
			# a docid was already chosen, this one is a duplicate
			my $i = $head->get_docid;
			push @drop, $i;
			warn <<EOF;
W: multiple inboxes keyed to `$eidx_key', deleting #$i
EOF
		} else {
			# first match: this is the document to replace
			$docid = $head->get_docid;
		}
	}
	$xdb->delete_document($_) for @drop; # just in case

	my $doc = $PublicInbox::Search::X{Document}->new;
	term_generator($self)->set_document($doc);

	# allow sorting by modified and uidvalidity (created at)
	add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
	add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);

	$doc->add_boolean_term('Q'.$eidx_key); # uniQue id
	$doc->add_boolean_term('T'.'inbox'); # Type

	if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
		$doc->add_boolean_term('T'.'newsgroup'); # additional Type
	}

	# force reread from disk, {description} could be loaded from {misc}
	delete $ibx->{description};
	my $desc = $ibx->description;

	# description = S/Subject (or title)
	# address = A/Author
	index_text($self, $desc, 1, 'S');
	index_text($self, $ibx->{name}, 1, 'XNAME');
	my %map = (
		address => 'A',
		listid => 'XLISTID',
		infourl => 'XINFOURL',
		url => 'XURL'
	);
	while (my ($f, $pfx) = each %map) {
		for my $v (@{$ibx->{$f} // []}) {
			index_text($self, $v, 1, $pfx);
		}
	}

	# manifest entries keyed by path, stored as the document data
	my $data = {};
	if (defined(my $max = $ibx->max_git_epoch)) { # v2
		my $pfx = "/$ibx->{name}/git/";
		for my $epoch (0..$max) {
			# gives up, leaving the document unwritten, if any
			# epoch is unavailable
			my $git = $ibx->git_epoch($epoch) or return;
			if (my $ent = $git->manifest_entry($epoch, $desc)) {
				$data->{"$pfx$epoch.git"} = $ent;
				$ent->{git_dir} = $git->{git_dir};
			}
			$git->cleanup; # ->modified starts cat-file --batch
		}
	} elsif (my $ent = $ibx->git->manifest_entry) { # v1
		$ent->{git_dir} = $ibx->{inboxdir};
		$data->{"/$ibx->{name}"} = $ent;
	}
	$doc->set_data($json->encode($data));
	if (defined $docid) {
		$xdb->replace_document($docid, $doc);
	} else {
		$xdb->add_document($doc);
	}
}

1;