diff options
author | Eric Wong <e@80x24.org> | 2023-03-21 23:07:21 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2023-03-25 09:37:45 +0000 |
commit | 32fa6be4222d9af593c22a7dc101d8d5e8835511 (patch) | |
tree | 16bbac338b62675b1214bd1fceea4ca4ab2d40cd /lib/PublicInbox/CodeSearch.pm | |
parent | 72dfac803728571c30e7ab8caf005229bc1f39f8 (diff) | |
download | public-inbox-32fa6be4222d9af593c22a7dc101d8d5e8835511.tar.gz |
It seems relying on root commits is a reasonable way to deduplicate and handle repositories with common history. I initially wanted to shoehorn this into extindex, but decided a separate Xapian index layout capable of being EITHER external to handle many forks or internal (in $GIT_DIR/public-inbox-cindex) for small projects is the right way to go. Unlike most existing parts of public-inbox, this relies on absolute paths of $GIT_DIR stored in the Xapian DB and does not rely on the config file. We'll be relying on the config file to map absolute paths to public URL paths for WWW.
Diffstat (limited to 'lib/PublicInbox/CodeSearch.pm')
-rw-r--r-- | lib/PublicInbox/CodeSearch.pm | 121 |
1 files changed, 121 insertions, 0 deletions
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm new file mode 100644 index 00000000..1dfc124f --- /dev/null +++ b/lib/PublicInbox/CodeSearch.pm @@ -0,0 +1,121 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# read-only external index for coderepos +# currently, it only indexes commits and repository metadata +# (pathname, root commits); not blob contents +package PublicInbox::CodeSearch; +use v5.12; +use parent qw(PublicInbox::Search); +use PublicInbox::Search qw(retry_reopen int_val xap_terms); +use constant { + AT => 0, # author time (YYYYMMDDHHMMSS, dt: for mail) + CT => 1, # commit time (Unix time stamp, like TS/rt: in mail) + CIDX_SCHEMA_VER => 1, # brand new schema for code search + # for repos (`Tr'), CT(col=1) is used for the latest tip commit time + # in refs/{heads,tags}. AT(col=0) may be used to store disk usage + # in the future, but disk usage calculation is expensive w/ alternates +}; + +# note: the non-X term prefix allocations are shared with Xapian omega, +# see xapian-applications/omega/docs/termprefixes.rst +# bool_pfx_internal: +# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR +# tags are not indexed, only normal branches (refs/heads/*), not hidden +# 'P' # (pathname) GIT_DIR # uniq +# 'G' # (group) root commit (may have multiple roots) +my %bool_pfx_external = ( + oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256) + # type:repo - rel2abs_collapsed(GIT_DIR) + parent => 'XP', + %PublicInbox::Search::PATCH_BOOL_COMMON, +); + +my %prob_prefix = ( # copied from PublicInbox::Search + # do we care about committer? or partial commit OID via Xapian? + # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial + %PublicInbox::Search::PATCH_PROB_COMMON, + + # default: + '' => 'S A XQUOT XFN ' . 
$PublicInbox::Search::NON_QUOTED_BODY +); + +sub new { + my ($cls, $dir) = @_; + bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER }, $cls; +} + +sub cqparse_new ($) { + my ($self) = @_; + my $qp = $self->qp_init_common; + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'd:')); # mairix compat + $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'dt:')); # mail compat + $cb->($qp, $PublicInbox::Search::NVRP->new(CT, 'ct:')); + + while (my ($name, $pfx) = each %bool_pfx_external) { + $qp->add_boolean_prefix($name, $_) for split(/ /, $pfx); + } + while (my ($name, $pfx) = each %prob_prefix) { + $qp->add_prefix($name, $_) for split(/ /, $pfx); + } + $qp; +} + +# returns a Xapian::Query to filter by roots: an OR of a 'G' term for each root xap_terms() reports on the 'P'.$git_dir document; warns and returns nothing if $git_dir is not indexed or has no roots +sub roots_filter { # retry_reopen callback + my ($self, $git_dir) = @_; + my $xdb = $self->xdb; + my $P = 'P'.$git_dir; + my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P)); + if ($cur == $end) { + warn "W: $git_dir not indexed?\n"; + return; + } + my @roots = xap_terms('G', $xdb, $cur->get_docid); + if (!@roots) { + warn "W: $git_dir has no root commits?\n"; + return; + } + my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots)); + for my $r (@roots) { + $q = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_OR(), + $q, 'G'.$r); + } + $q; +} + +sub mset { + my ($self, $qry_str, $opt) = @_; + my $qp = $self->{qp} //= cqparse_new($self); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + + # limit to commits with shared roots + if (defined(my $git_dir = $opt->{git_dir})) { + my $rf = retry_reopen($self, \&roots_filter, $git_dir) + or return; + + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, $rf); + } + + # we only want commits (type term 'T' with value 'c'), not repo ('r') documents: + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, 'T'.'c'); + + my $enq = $PublicInbox::Search::X{Enquire}->new($self->xdb); + 
$enq->set_query($qry); + if ($opt->{relevance}) { + $enq->set_sort_by_relevance_then_value(CT, !$opt->{asc}); + } else { + $enq->set_sort_by_value_then_relevance(CT, !$opt->{asc}); + } + $self->retry_reopen($self->can('enquire_once'), $enq, + $opt->{offset} || 0, $opt->{limit} || 50); +} + +1; |