diff options
author | Eric Wong <e@80x24.org> | 2023-11-21 12:43:15 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2023-11-21 21:37:26 +0000 |
commit | fe3883762faf67fd6c4624ee721000e1f36bc59b (patch) | |
tree | 1fd745e8b42b6617b803ab2eac51b89e21a14768 /lib/PublicInbox/CodeSearch.pm | |
parent | 0f92d4d5fc65bdec2b3cef7679480422e818f352 (diff) | |
download | public-inbox-fe3883762faf67fd6c4624ee721000e1f36bc59b.tar.gz |
The association data is just stored as deflated JSON in Xapian metadata keys of shard[0] for now. It should be reasonably compact and fit in memory, since we assume sane, non-malicious git coderepo history. The new cindex-join.t test requires TEST_REMOTE_JOIN=1 to be set in the environment and tests the joins against the inboxes and coderepos of two small projects with a common history. Internally, we'll use `ibx_off' and `root_off' instead of `ibx_id' and `root_id', since `_id' may be mistaken for columns in an SQL database, which they are not.
Diffstat (limited to 'lib/PublicInbox/CodeSearch.pm')
-rw-r--r-- | lib/PublicInbox/CodeSearch.pm | 62 |
1 files changed, 60 insertions, 2 deletions
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm index 6234e259..9051d85f 100644 --- a/lib/PublicInbox/CodeSearch.pm +++ b/lib/PublicInbox/CodeSearch.pm @@ -7,7 +7,9 @@ package PublicInbox::CodeSearch; use v5.12; use parent qw(PublicInbox::Search); +use PublicInbox::Config; use PublicInbox::Search qw(retry_reopen int_val xap_terms); +use Compress::Zlib qw(uncompress); use constant { AT => 0, # author time YYYYMMDDHHMMSS, dt: for mail) CT => 1, # commit time (Unix time stamp, like TS/rt: in mail) @@ -47,8 +49,21 @@ my %prob_prefix = ( # copied from PublicInbox::Search ); sub new { - my ($cls, $dir) = @_; - bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER }, $cls; + my ($cls, $dir, $cfg) = @_; + # can't have a PublicInbox::Config here due to circular refs + bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER, + -cfg_f => $cfg->{-f} }, $cls; +} + +sub join_data_key ($) { "join:$_[0]->{-cfg_f}" } + +sub join_data { + my ($self) = @_; + my $key = join_data_key($self); + my $cur = $self->xdb->get_metadata($key) or return; + $cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) }; + warn "E: $@ (corrupt metadata in `$key' key?)" if $@; + $cur; } sub qparse_new ($) { @@ -151,4 +166,47 @@ sub mset { $self->do_enquire($qry, $opt, CT); } +sub roots2paths { # for diagnostics + my ($self) = @_; + my $cur = $self->xdb->allterms_begin('G'); + my $end = $self->{xdb}->allterms_end('G'); + my $qrepo = $PublicInbox::Search::X{Query}->new('T'.'r'); + my $enq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $enq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + $enq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + my %ret; + for (; $cur != $end; $cur++) { + my $G_oidhex = $cur->get_termname; + my $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qrepo, $G_oidhex); + $enq->set_query($qry); + my ($size, $off, $lim) = (0, 0, 100000); + my $dirs = $ret{substr($G_oidhex, 1)} = []; + do { + my $mset 
= $enq->get_mset($off += $size, $lim); + for my $x ($mset->items) { + my $tmp = xap_terms('P', $x->get_document); + push @$dirs, keys %$tmp; + } + $size = $mset->size; + } while ($size); + substr($_, 0, 1, '/') for @$dirs; # s!^P!/! + @$dirs = sort @$dirs; + } + \%ret; +} + +sub paths2roots { # for diagnostics + my ($self) = @_; + my %ret; + my $tmp = roots2paths($self); + for my $root_oidhex (keys %$tmp) { + my $paths = delete $tmp->{$root_oidhex}; + push @{$ret{$_}}, $root_oidhex for @$paths; + } + @$_ = sort(@$_) for values %ret; + \%ret; +} + 1; |