From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A0D971F47A for ; Sun, 22 Dec 2019 22:17:41 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/4] search: support SWIG-generated Xapian.pm Date: Sun, 22 Dec 2019 22:17:39 +0000 Message-Id: <20191222221740.17114-4-e@80x24.org> In-Reply-To: <20191222221740.17114-1-e@80x24.org> References: <20191222221740.17114-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Xapian upstream is slowly phasing out the XS-based Search::Xapian in favor of the SWIG-generated "Xapian" package. While Debian and both FreeBSD have Search::Xapian, OpenBSD only includes the "Xapian" binding. More information about the status of the "Xapian" Perl module here: https://trac.xapian.org/ticket/523 --- lib/PublicInbox/Admin.pm | 4 +++ lib/PublicInbox/Search.pm | 57 ++++++++++++++++++++++++----------- lib/PublicInbox/SearchIdx.pm | 32 ++++++++++++++------ lib/PublicInbox/SearchMsg.pm | 3 +- lib/PublicInbox/TestCommon.pm | 10 +++++- lib/PublicInbox/V2Writable.pm | 5 ++- lib/PublicInbox/Xapcmd.pm | 25 +++++++++------ t/indexlevels-mirror.t | 3 +- t/psgi_multipart_not.t | 2 +- t/psgi_search.t | 2 +- t/psgi_v2.t | 7 ++--- t/xcpdb-reshard.t | 4 ++- 12 files changed, 105 insertions(+), 49 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 3d0d80b9..32a9f65e 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -159,6 +159,10 @@ sub check_require { while (my $mod = shift @mods) { if (my $groups = $mod_groups{$mod}) { push @mods, @$groups; + } elsif ($mod eq 'Search::Xapian') { + require PublicInbox::Search; + PublicInbox::Search::load_xapian() or + $err->{'Search::Xapian || Xapian'} = $@; } else { eval "require $mod"; $err->{$mod} = $@ if $@; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 88c8dc70..7d83419d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -17,20 +17,42 @@ use PublicInbox::MIME; use PublicInbox::MID qw/id_compress/; use PublicInbox::Over; my $QP_FLAGS; +our %X = map { $_ => 0 } qw(BoolWeight Database Enquire + NumberValueRangeProcessor QueryParser Stem); +our $Xap; # 'Search::Xapian' or 'Xapian' +my $ENQ_ASCENDING; + sub load_xapian () { - $QP_FLAGS ||= eval { - require Search::Xapian; - Search::Xapian->import(qw(:standard)); + return 1 if defined $Xap; + for my $x (qw(Xapian Search::Xapian)) { + eval "require $x"; + next if $@; + + $x->import(qw(:standard)); + $Xap = $x; + $X{$_} = $Xap.'::'.$_ for (keys %X); + # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, + # so lets hope this part of the ABI is stable because it's + # just an integer: + $ENQ_ASCENDING = $x eq 'Xapian' ? + 1 : Search::Xapian::ENQ_ASCENDING(); + + # for SearchMsg: + *PublicInbox::SearchMsg::sortable_unserialise = + $Xap.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert # (and probably earlier Xapian DBs). glass seems fine... # TODO: make this an option, maybe? # or make indexlevel=medium as default - FLAG_PHRASE()|FLAG_BOOLEAN()|FLAG_LOVEHATE()|FLAG_WILDCARD(); - }; -}; + $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | + FLAG_WILDCARD(); + last; + } + undef; +} # This is English-only, everything else is non-standard and may be confused as # a prefix common in patch emails @@ -148,7 +170,7 @@ sub _xdb ($) { if ($self->{version} >= 2) { foreach my $shard (<$dir/*>) { -d $shard && $shard =~ m!/[0-9]+\z! or next; - my $sub = Search::Xapian::Database->new($shard); + my $sub = $X{Database}->new($shard); if ($xdb) { $xdb->add_database($sub); } else { @@ -158,7 +180,7 @@ sub _xdb ($) { } } else { $slow_phrase = -f "$dir/iamchert"; - $xdb = Search::Xapian::Database->new($dir); + $xdb = $X{Database}->new($dir); } $$qpf |= FLAG_PHRASE() unless $slow_phrase; $xdb; @@ -222,7 +244,7 @@ sub retry_reopen { } # Exception: The revision being read has been discarded - # you should call Xapian::Database::reopen() - if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { + if (ref($@) =~ /\bDatabaseModifiedError\b/) { warn "reopen try #$i on $@\n"; reopen($self); } else { @@ -243,13 +265,13 @@ sub _do_enquire { sub _enquire_once { my ($self, $query, $opts) = @_; my $xdb = xdb($self); - my $enquire = Search::Xapian::Enquire->new($xdb); + my $enquire = $X{Enquire}->new($xdb); $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; if (($opts->{mset} || 0) == 2) { - $enquire->set_docid_order(Search::Xapian::ENQ_ASCENDING()); - $enquire->set_weighting_scheme(Search::Xapian::BoolWeight->new); + $enquire->set_docid_order($ENQ_ASCENDING); + $enquire->set_weighting_scheme($X{BoolWeight}->new); } elsif ($opts->{relevance}) { $enquire->set_sort_by_relevance_then_value(TS, $desc); } else { @@ -268,7 +290,7 @@ sub _enquire_once { } # read-write -sub stemmer { Search::Xapian::Stem->new($LANG) } +sub stemmer { $X{Stem}->new($LANG) } # read-only sub qp { @@ -278,16 +300,15 @@ sub qp { return $qp if $qp; my $xdb = xdb($self); # new parser - $qp = Search::Xapian::QueryParser->new; + $qp = $X{QueryParser}->new; $qp->set_default_op(OP_AND()); $qp->set_database($xdb); $qp->set_stemmer($self->stemmer); $qp->set_stemming_strategy(STEM_SOME()); $qp->set_max_wildcard_expansion(100); - $qp->add_valuerangeprocessor( - Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); - $qp->add_valuerangeprocessor( - Search::Xapian::NumberValueRangeProcessor->new(DT, 'dt:')); + my $nvrp = $X{NumberValueRangeProcessor}; + $qp->add_valuerangeprocessor($nvrp->new(YYYYMMDD, 'd:')); + $qp->add_valuerangeprocessor($nvrp->new(DT, 'dt:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index b56fd0ee..21ab8119 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -19,7 +19,8 @@ use POSIX qw(strftime); use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn); use PublicInbox::Git qw(git_unquote); - +my $X = \%PublicInbox::Search::X; +my ($DB_CREATE_OR_OPEN, $DB_OPEN); use constant { BATCH_BYTES => defined($ENV{XAPIAN_FLUSH_THRESHOLD}) ? 0x7fffffff : 1_000_000, @@ -85,17 +86,28 @@ sub _xdb_release { undef; } +sub load_xapian_writable () { + return 1 if $X->{WritableDatabase}; + PublicInbox::Search::load_xapian() or return; + my $xap = $PublicInbox::Search::Xap; + for (qw(Document TermGenerator WritableDatabase)) { + $X->{$_} = $xap.'::'.$_; + } + eval 'require '.$X->{WritableDatabase} or die; + *sortable_serialise = $xap.'::sortable_serialise'; + $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); + $DB_OPEN = eval($xap.'::DB_OPEN()'); + 1; +} + sub _xdb_acquire { my ($self) = @_; my $flag; my $dir = $self->xdir; if (need_xapian($self)) { croak 'already acquired' if $self->{xdb}; - PublicInbox::Search::load_xapian(); - require Search::Xapian::WritableDatabase; - $flag = $self->{creat} ? - Search::Xapian::DB_CREATE_OR_OPEN() : - Search::Xapian::DB_OPEN(); + load_xapian_writable(); + $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } if ($self->{creat}) { require File::Path; @@ -108,7 +120,7 @@ sub _xdb_acquire { } } return unless defined $flag; - my $xdb = eval { Search::Xapian::WritableDatabase->new($dir, $flag) }; + my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; if ($@) { die "Failed opening $dir: ", $@; } @@ -117,7 +129,7 @@ sub _xdb_acquire { sub add_val ($$$) { my ($doc, $col, $num) = @_; - $num = Search::Xapian::sortable_serialise($num); + $num = sortable_serialise($num); $doc->add_value($col, $num); } @@ -274,7 +286,7 @@ sub index_body ($$$) { sub add_xapian ($$$$$) { my ($self, $mime, $num, $oid, $mids, $mid0) = @_; my $smsg = PublicInbox::SearchMsg->new($mime); - my $doc = Search::Xapian::Document->new; + my $doc = $X->{Document}->new; my $subj = $smsg->subject; add_val($doc, PublicInbox::Search::TS(), $smsg->ts); my @ds = gmtime($smsg->ds); @@ -458,7 +470,7 @@ sub term_generator { # write-only my $tg = $self->{term_generator}; return $tg if $tg; - $tg = Search::Xapian::TermGenerator->new; + $tg = $X->{TermGenerator}->new; $tg->set_stemmer($self->stemmer); $self->{term_generator} = $tg; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 7561e7f2..53882f73 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -27,7 +27,8 @@ sub wrap { sub get_val ($$) { my ($doc, $col) = @_; - Search::Xapian::sortable_unserialise($doc->get_value($col)); + # sortable_unserialise is defined by PublicInbox::Search::load_xapian() + sortable_unserialise($doc->get_value($col)); } sub to_doc_data { diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index 9680b94c..b0b1f4d9 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -64,7 +64,15 @@ sub require_mods { my $maybe = pop @mods if $mods[-1] =~ /\A[0-9]+\z/; my @need; for my $mod (@mods) { - eval "require $mod"; + if ($mod eq 'Search::Xapian') { + require PublicInbox::Search; + PublicInbox::Search::load_xapian() and next; + } elsif ($mod eq 'Search::Xapian::WritableDatabase') { + require PublicInbox::SearchIdx; + PublicInbox::SearchIdx::load_xapian_writable() and next; + } else { + eval "require $mod"; + } push @need, $mod if $@; } return unless @need; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ab80941a..77b3bde4 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -62,10 +62,13 @@ sub count_shards ($) { # Also, shard count may change while -watch is running # due to "xcpdb --reshard" if (-d $xpfx) { + require PublicInbox::Search; + PublicInbox::Search::load_xapian(); + my $XapianDatabase = $PublicInbox::Search::X{Database}; foreach my $shard (<$xpfx/*>) { -d $shard && $shard =~ m!/[0-9]+\z! or next; eval { - Search::Xapian::Database->new($shard)->close; + $XapianDatabase->new($shard)->close; $n++; }; } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 78b3a9ac..9f897dad 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -5,7 +5,7 @@ use strict; use warnings; use PublicInbox::Spawn qw(which spawn); use PublicInbox::Over; -use PublicInbox::Search; +use PublicInbox::SearchIdx; use File::Temp (); use File::Path qw(remove_tree); use File::Basename qw(dirname); @@ -99,7 +99,7 @@ sub prepare_reindex ($$$) { my ($ibx, $im, $reindex) = @_; if ($ibx->{version} == 1) { my $dir = $ibx->search->xdir(1); - my $xdb = Search::Xapian::Database->new($dir); + my $xdb = $PublicInbox::Search::X{Database}->new($dir); if (my $lc = $xdb->get_metadata('last_commit')) { $reindex->{from} = $lc; } @@ -164,7 +164,8 @@ sub run { if (!$opt->{-coarse_lock}) { $reindex = $opt->{reindex} = {}; $from = $reindex->{from} = []; - require Search::Xapian::WritableDatabase; + require PublicInbox::SearchIdx; + PublicInbox::SearchIdx::load_xapian_writable(); } $ibx->umask_prepare; @@ -251,7 +252,7 @@ sub run { sub cpdb_retryable ($$) { my ($src, $pfx) = @_; - if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { + if (ref($@) =~ /\bDatabaseModifiedError\b/) { warn "$pfx Xapian DB modified, reopening and retrying\n"; $src->reopen; return 1; @@ -361,6 +362,8 @@ sub cpdb ($$) { my $new = $newdir->dirname; my ($src, $cur_shard); my $reshard; + PublicInbox::SearchIdx::load_xapian_writable() or die; + my $XapianDatabase = $PublicInbox::Search::X{Database}; if (ref($old) eq 'ARRAY') { ($cur_shard) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); defined $cur_shard or @@ -371,14 +374,14 @@ sub cpdb ($$) { # resharding, M:N copy means have full read access foreach (@$old) { if ($src) { - my $sub = Search::Xapian::Database->new($_); + my $sub = $XapianDatabase->new($_); $src->add_database($sub); } else { - $src = Search::Xapian::Database->new($_); + $src = $XapianDatabase->new($_); } } } else { - $src = Search::Xapian::Database->new($old); + $src = $XapianDatabase->new($old); } my ($tmp, $ft); @@ -395,8 +398,10 @@ sub cpdb ($$) { # like copydatabase(1), be sure we don't overwrite anything in case # of other bugs: - my $creat = Search::Xapian::DB_CREATE(); - my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat); + my $creat = eval($PublicInbox::Search::Xap.'::DB_CREATE()'); + die if $@; + my $XapianWritableDatabase = $PublicInbox::Search::X{WritableDatabase}; + my $dst = $XapianWritableDatabase->new($tmp, $creat); my $pr = $opt->{-progress}; my $pfx = $opt->{-progress_pfx} = progress_pfx($new); my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; @@ -439,7 +444,7 @@ sub cpdb ($$) { # individually. $src = undef; foreach (@$old) { - my $old = Search::Xapian::Database->new($_); + my $old = $XapianDatabase->new($_); cpdb_loop($old, $dst, $pr_data, $cur_shard, $reshard); } } else { diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t index e4313faa..2244870b 100644 --- a/t/indexlevels-mirror.t +++ b/t/indexlevels-mirror.t @@ -165,7 +165,8 @@ import_index_incremental($PI_TEST_VERSION, 'basic', $mime); SKIP: { require PublicInbox::Search; - PublicInbox::Search::load_xapian() or skip 'Search::Xapian missing', 2; + PublicInbox::Search::load_xapian() or + skip('Xapian perl binding missing', 2); foreach my $l (qw(medium full)) { import_index_incremental($PI_TEST_VERSION, $l, $mime); } diff --git a/t/psgi_multipart_not.t b/t/psgi_multipart_not.t index fbc09f55..606151c4 100644 --- a/t/psgi_multipart_not.t +++ b/t/psgi_multipart_not.t @@ -10,7 +10,7 @@ use PublicInbox::TestCommon; my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test URI::Escape Plack::Builder Plack::Test); require_mods(@mods); -use_ok($_) for @mods; +use_ok($_) for (qw(HTTP::Request::Common Plack::Test)); use_ok 'PublicInbox::V2Writable'; my ($repo, $for_destroy) = tmpdir(); my $ibx = PublicInbox::Inbox->new({ diff --git a/t/psgi_search.t b/t/psgi_search.t index 95749c66..534063f8 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -13,7 +13,7 @@ use PublicInbox::TestCommon; my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test URI::Escape Plack::Builder); require_mods(@mods); -use_ok $_ foreach (@mods, qw(PublicInbox::SearchIdx)); +use_ok($_) for (qw(HTTP::Request::Common Plack::Test)); my ($tmpdir, $for_destroy) = tmpdir(); my $ibx = PublicInbox::Inbox->new({ diff --git a/t/psgi_v2.t b/t/psgi_v2.t index 9a81ee9a..a02b90fb 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -9,10 +9,9 @@ use PublicInbox::MIME; use PublicInbox::Config; use PublicInbox::WWW; use PublicInbox::MID qw(mids); -my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test - URI::Escape Plack::Builder); -require_mods(@mods); -use_ok($_) for @mods; +require_mods(qw(DBD::SQLite Search::Xapian HTTP::Request::Common Plack::Test + URI::Escape Plack::Builder)); +use_ok($_) for (qw(HTTP::Request::Common Plack::Test)); use_ok 'PublicInbox::V2Writable'; my ($inboxdir, $for_destroy) = tmpdir(); my $ibx = { diff --git a/t/xcpdb-reshard.t b/t/xcpdb-reshard.t index 1eae234d..2a0aeb45 100644 --- a/t/xcpdb-reshard.t +++ b/t/xcpdb-reshard.t @@ -8,6 +8,7 @@ require_mods(qw(DBD::SQLite Search::Xapian)); require_git('2.6'); use PublicInbox::MIME; use PublicInbox::InboxWritable; +use PublicInbox::Search; my $mime = PublicInbox::MIME->create( header => [ @@ -61,8 +62,9 @@ for my $R (qw(2 4 1 3 3)) { # ensure docids in Xapian match NNTP article numbers my $tot = 0; my %tmp = %nums; + my $XapianDatabase = $PublicInbox::Search::X{Database}; foreach my $d (@new_shards) { - my $xdb = Search::Xapian::Database->new($d); + my $xdb = $XapianDatabase->new($d); $tot += $xdb->get_doccount; my $it = $xdb->postlist_begin(''); my $end = $xdb->postlist_end('');