From 0b1de991a099b5e8b9a9e3e85b5eaaacc9362dbb Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 15 May 2019 01:18:08 +0000 Subject: lazy load Xapian and make it optional for v2 More tests work without Search::Xapian, now. Usability issues still need to be fixed --- lib/PublicInbox/Inbox.pm | 19 +++++----- lib/PublicInbox/Search.pm | 84 ++++++++++++++++++++++++++------------------ lib/PublicInbox/SearchIdx.pm | 50 +++++++++++++++++--------- lib/PublicInbox/WWW.pm | 7 ++-- 4 files changed, 99 insertions(+), 61 deletions(-) (limited to 'lib/PublicInbox') diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index dc186b73..813ed997 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -25,7 +25,7 @@ sub cleanup_task () { my $next = {}; for my $ibx (values %$CLEANUP) { my $again; - foreach my $f (qw(mm search)) { + foreach my $f (qw(mm search over)) { delete $ibx->{$f} if SvREFCNT($ibx->{$f}) == 1; } my $expire = time - 60; @@ -37,7 +37,7 @@ sub cleanup_task () { $again = 1 if $git->cleanup($expire); } } - $again ||= !!($ibx->{mm} || $ibx->{search}); + $again ||= !!($ibx->{over} || $ibx->{mm} || $ibx->{search}); $next->{"$ibx"} = $ibx if $again; } $CLEANUP = $next; @@ -175,14 +175,17 @@ sub search ($;$) { require PublicInbox::Search; PublicInbox::Search->new($self, $self->{altid}); }; - # TODO: lazily load Xapian - # return $srch if $over_only || eval { $srch->xdb }; - # undef; + ($over_only || eval { $srch->xdb }) ? $srch : undef; } sub over ($) { - my $srch = search($_[0], 1) or return; - $srch->{over_ro}; + my ($self) = @_; + my $srch = search($self, 1) or return; + $self->{over} ||= eval { + my $over = $srch->{over_ro}; + $over->dbh_new; # may fail + $over; + } } sub try_cat { @@ -290,7 +293,7 @@ sub nntp_url { sub nntp_usable { my ($self) = @_; my $ret = mm($self) && over($self); - $self->{mm} = $self->{search} = undef; + $self->{mm} = $self->{over} = $self->{search} = undef; $ret; } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index e79ec0f8..b1e62f4c 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -12,11 +12,21 @@ use constant TS => 0; # Received: header in Unix time use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI use constant DT => 2; # Date: YYYYMMDDHHMMSS -use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; use PublicInbox::MIME; use PublicInbox::MID qw/id_compress/; use PublicInbox::Over; +my $QP_FLAGS; +sub load_xapian () { + $QP_FLAGS ||= eval { + require Search::Xapian; + Search::Xapian->import(qw(:standard)); + + # n.b. FLAG_PURE_NOT is expensive not suitable for a public + # website as it could become a denial-of-service vector + FLAG_PHRASE()|FLAG_BOOLEAN()|FLAG_LOVEHATE()|FLAG_WILDCARD(); + }; +}; # This is English-only, everything else is non-standard and may be confused as # a prefix common in patch emails @@ -41,10 +51,6 @@ use constant { # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0) # 14 - fix ghost root vivification SCHEMA_VERSION => 15, - - # n.b. FLAG_PURE_NOT is expensive not suitable for a public website - # as it could become a denial-of-service vector - QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; my %bool_pfx_external = ( @@ -113,18 +119,43 @@ EOF ); chomp @HELP; -sub xdir { - my ($self) = @_; +sub xdir ($;$) { + my ($self, $rdonly) = @_; if ($self->{version} == 1) { "$self->{mainrepo}/public-inbox/xapian" . SCHEMA_VERSION; } else { my $dir = "$self->{mainrepo}/xap" . SCHEMA_VERSION; + return $dir if $rdonly; + my $part = $self->{partition}; defined $part or die "partition not given"; $dir .= "/$part"; } } +sub xdb ($) { + my ($self) = @_; + $self->{xdb} ||= do { + load_xapian(); + my $dir = xdir($self, 1); + if ($self->{version} >= 2) { + my $xdb; + foreach my $part (<$dir/*>) { + -d $part && $part =~ m!/\d+\z! or next; + my $sub = Search::Xapian::Database->new($part); + if ($xdb) { + $xdb->add_database($sub); + } else { + $xdb = $sub; + } + } + $xdb; + } else { + Search::Xapian::Database->new($dir); + } + }; +} + sub new { my ($class, $mainrepo, $altid) = @_; my $version = 1; @@ -138,33 +169,16 @@ sub new { altid => $altid, version => $version, }, $class; - my $dir; - if ($version >= 2) { - $dir = "$self->{mainrepo}/xap" . SCHEMA_VERSION; - my $xdb; - my $parts = 0; - foreach my $part (<$dir/*>) { - -d $part && $part =~ m!/\d+\z! or next; - $parts++; - my $sub = Search::Xapian::Database->new($part); - if ($xdb) { - $xdb->add_database($sub); - } else { - $xdb = $sub; - } - } - $self->{xdb} = $xdb; - } else { - $dir = $self->xdir; - $self->{xdb} = Search::Xapian::Database->new($dir); - } + my $dir = xdir($self, 1); $self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3"); $self; } sub reopen { my ($self) = @_; - $self->{xdb}->reopen; + if (my $xdb = $self->{xdb}) { + $xdb->reopen; + } $self; # make chaining easier } @@ -175,7 +189,8 @@ sub query { if ($query_string eq '' && !$opts->{mset}) { $self->{over_ro}->recent($opts); } else { - my $query = $self->qp->parse_query($query_string, QP_FLAGS); + my $qp = qp($self); + my $query = $qp->parse_query($query_string, $QP_FLAGS); $opts->{relevance} = 1 unless exists $opts->{relevance}; _do_enquire($self, $query, $opts); } @@ -213,7 +228,8 @@ sub _do_enquire { sub _enquire_once { my ($self, $query, $opts) = @_; - my $enquire = Search::Xapian::Enquire->new($self->{xdb}); + my $xdb = xdb($self); + my $enquire = Search::Xapian::Enquire->new($xdb); $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; @@ -246,13 +262,13 @@ sub qp { my $qp = $self->{query_parser}; return $qp if $qp; - + my $xdb = xdb($self); # new parser $qp = Search::Xapian::QueryParser->new; - $qp->set_default_op(OP_AND); - $qp->set_database($self->{xdb}); + $qp->set_default_op(OP_AND()); + $qp->set_database($xdb); $qp->set_stemmer($self->stemmer); - $qp->set_stemming_strategy(STEM_SOME); + $qp->set_stemming_strategy(STEM_SOME()); $qp->set_max_wildcard_expansion(100); $qp->add_valuerangeprocessor( Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1b86f727..135b5eb9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -54,11 +54,10 @@ sub new { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } - } else { # v1 + } else { # FIXME: old tests: old tests $ibx = { mainrepo => $git_dir, version => 1 }; } $ibx = PublicInbox::InboxWritable->new($ibx); - require Search::Xapian::WritableDatabase; my $self = bless { mainrepo => $mainrepo, -inbox => $ibx, @@ -84,25 +83,36 @@ sub new { $self; } +sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels } + sub _xdb_release { my ($self) = @_; - my $xdb = delete $self->{xdb} or croak 'not acquired'; - $xdb->close; + if (need_xapian($self)) { + my $xdb = delete $self->{xdb} or croak 'not acquired'; + $xdb->close; + } $self->lock_release if $self->{creat}; undef; } sub _xdb_acquire { my ($self) = @_; - croak 'already acquired' if $self->{xdb}; + my $flag; my $dir = $self->xdir; - my $flag = Search::Xapian::DB_OPEN; + if (need_xapian($self)) { + croak 'already acquired' if $self->{xdb}; + PublicInbox::Search::load_xapian(); + require Search::Xapian::WritableDatabase; + $flag = $self->{creat} ? + Search::Xapian::DB_CREATE_OR_OPEN() : + Search::Xapian::DB_OPEN(); + } if ($self->{creat}) { require File::Path; $self->lock_acquire; File::Path::mkpath($dir); - $flag = Search::Xapian::DB_CREATE_OR_OPEN; } + return unless defined $flag; $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); } @@ -342,7 +352,7 @@ sub add_message { $num = index_mm($self, $mime); } eval { - if ($self->{indexlevel} =~ $xapianlevels) { + if (need_xapian($self)) { $self->add_xapian($mime, $num, $oid, $mids, $mid0) } if (my $over = $self->{over}) { @@ -383,7 +393,6 @@ sub batch_do { # v1 only, where $mid is unique sub remove_message { my ($self, $mid) = @_; - my $db = $self->{xdb}; $mid = mid_clean($mid); if (my $over = $self->{over}) { @@ -394,7 +403,8 @@ sub remove_message { warn "<$mid> missing for removal from overview\n"; } } - return if $self->{indexlevel} !~ $xapianlevels; + return unless need_xapian($self); + my $db = $self->{xdb}; my $nr = 0; eval { batch_do($self, 'Q' . $mid, sub { @@ -413,10 +423,12 @@ sub remove_message { # MID is a hint in V2 sub remove_by_oid { my ($self, $oid, $mid) = @_; - my $db = $self->{xdb}; $self->{over}->remove_oid($oid, $mid) if $self->{over}; + return unless need_xapian($self); + my $db = $self->{xdb}; + # XXX careful, we cannot use batch_do here since we conditionally # delete documents based on other factors, so we cannot call # find_doc_ids twice. @@ -664,7 +676,7 @@ sub _last_x_commit { my ($self, $mm) = @_; my $lm = $mm->last_commit || ''; my $lx = ''; - if ($self->{indexlevel} =~ $xapianlevels) { + if (need_xapian($self)) { $lx = $self->{xdb}->get_metadata('last_commit') || ''; } else { $lx = $lm; @@ -695,7 +707,7 @@ sub _index_sync { $self->{over}->disconnect; $git->cleanup; delete $self->{txn}; - $xdb->cancel_transaction; + $xdb->cancel_transaction if $xdb; $xdb = _xdb_release($self); # ensure we leak no FDs to "git log" with Xapian <= 1.2 @@ -717,7 +729,7 @@ sub _index_sync { } $dbh->commit; } - if ($newest && $self->{indexlevel} =~ $xapianlevels) { + if ($newest && need_xapian($self)) { my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); @@ -785,7 +797,7 @@ sub begin_txn_lazy { $self->{-inbox}->with_umask(sub { my $xdb = $self->{xdb} || $self->_xdb_acquire; $self->{over}->begin_lazy if $self->{over}; - $xdb->begin_transaction; + $xdb->begin_transaction if $xdb; $self->{txn} = 1; $xdb; }); @@ -795,14 +807,18 @@ sub commit_txn_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{-inbox}->with_umask(sub { - $self->{xdb}->commit_transaction; + if (my $xdb = $self->{xdb}) { + $xdb->commit_transaction; + } $self->{over}->commit_lazy if $self->{over}; }); } sub worker_done { my ($self) = @_; - die "$$ $0 xdb not released\n" if $self->{xdb}; + if (need_xapian($self)) { + die "$$ $0 xdb not released\n" if $self->{xdb}; + } die "$$ $0 still in transaction\n" if $self->{txn}; } diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 0f963dcb..8e1b1afe 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -149,8 +149,11 @@ sub preload { require PublicInbox::MIME; require Digest::SHA; require POSIX; - - foreach (qw(PublicInbox::Search PublicInbox::SearchView + eval { + require PublicInbox::Search; + PublicInbox::Search::load_xapian(); + }; + foreach (qw(PublicInbox::SearchView PublicInbox::Mbox IO::Compress::Gzip PublicInbox::NewsWWW)) { eval "require $_;"; -- cgit v1.2.3-24-ge0c7