diff options
author | Eric Wong <e@80x24.org> | 2023-11-28 14:56:21 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2023-11-29 02:13:22 +0000 |
commit | 67ecd56a0aaf977fa1a22aa6e2378c9317c72549 (patch) | |
tree | edb6e7d87ab1bc1b05bde1e5eb07d9b1ccab6a21 /lib/PublicInbox/CodeSearch.pm | |
parent | 10b23966ff8fa35a112f7fe2b9386dd4967d818c (diff) | |
download | public-inbox-67ecd56a0aaf977fa1a22aa6e2378c9317c72549.tar.gz |
This is a major step in solving the problem of having to manually associate hundreds/thousands of coderepos with hundreds/thousands of public-inboxes to power solver (and more).
Diffstat (limited to 'lib/PublicInbox/CodeSearch.pm')
-rw-r--r-- | lib/PublicInbox/CodeSearch.pm | 153 |
1 files changed, 143 insertions, 10 deletions
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm index eb057525..7d7f6df6 100644 --- a/lib/PublicInbox/CodeSearch.pm +++ b/lib/PublicInbox/CodeSearch.pm @@ -21,7 +21,7 @@ use constant { our @CODE_NRP; our @CODE_VMAP = ( [ AT, 'd:' ], # mairix compat - [ AT, 'dt:' ], # mail compat + [ AT, 'dt:' ], # public-inbox mail compat [ CT, 'ct:' ], ); @@ -51,7 +51,7 @@ my %prob_prefix = ( # copied from PublicInbox::Search sub new { my ($cls, $dir, $cfg) = @_; # can't have a PublicInbox::Config here due to circular refs - bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER, + bless { topdir => $dir, xpfx => "$dir/cidx".CIDX_SCHEMA_VER, -cfg_f => $cfg->{-f} }, $cls; } @@ -63,7 +63,20 @@ sub join_data { my $cur = $self->xdb->get_metadata($key) or return; $cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) }; warn "E: $@ (corrupt metadata in `$key' key?)" if $@; - $cur; + my @m = grep { ref($cur->{$_}) ne 'ARRAY' } qw(ekeys roots ibx2root); + if (@m) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} missing: @m +EOM + undef; + } elsif (@{$cur->{ekeys}} != @{$cur->{ibx2root}}) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root +EOM + undef; + } else { + $cur; + } } sub qparse_new ($) { @@ -196,16 +209,136 @@ sub roots2paths { # for diagnostics \%ret; } -sub paths2roots { # for diagnostics - my ($self) = @_; +sub root_oids ($$) { + my ($self, $git_dir) = @_; + my @ids = $self->docids_by_postlist('P'.$git_dir); + @ids or warn <<""; +BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir} + + warn <<"" if @ids > 1; +BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir} + my %ret; - my $tmp = roots2paths($self); - for my $root_oidhex (keys %$tmp) { - my $paths = delete $tmp->{$root_oidhex}; - push @{$ret{$_}}, $root_oidhex for @$paths; + for my $docid (@ids) { + my @oids = xap_terms('G', $self->xdb, $docid); + @ret{@oids} = @oids; + } + sort keys %ret; +} + +sub paths2roots { + my ($self, $paths) = @_; + my %ret; + if ($paths) { + for my $p (keys %$paths) { @{$ret{$p}} = root_oids($self, $p) } + } else { + my $tmp = roots2paths($self); + for my $root_oidhex (keys %$tmp) { + my $paths = delete $tmp->{$root_oidhex}; + push @{$ret{$_}}, $root_oidhex for @$paths; + } + @$_ = sort(@$_) for values %ret; } - @$_ = sort(@$_) for values %ret; \%ret; } +sub load_commit_times { # each_cindex callback + my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...] + my (@pending, $rec, $dir, @ids, $doc); + while ($rec = shift @$todo) { + @ids = $self->docids_by_postlist('P'.$rec->[1]->{git_dir}); + if (@ids) { + warn <<EOM if @ids > 1; +W: $rec->[1]->{git_dir} indexed multiple times in $self->{topdir} +EOM + for (@ids) { + $doc = $self->get_doc($_) // next; + $rec->[0] = int_val($doc, CT); + last; + } + } else { # may be in another cindex: + push @pending, $rec; + } + } + @$todo = @pending; +} + +sub load_coderepos { # each_cindex callback + my ($self, $pi_cfg) = @_; + my $name = $self->{name}; + my $cfg_f = $pi_cfg->{-f}; + my $lpfx = $self->{localprefix} or return warn <<EOM; +W: cindex.$name.localprefix unset in $cfg_f, ignoring cindex.$name +EOM + my $lre = join('|', map { $_ .= '/'; tr!/!/!s; quotemeta } @$lpfx); + $lre = qr!\A(?:$lre)!; + my $coderepos = $pi_cfg->{-coderepos}; + my $nick_pfx = $name eq '' ? '' : "$name/"; + my %dir2cr; + for my $p ($self->all_terms('P')) { + my $nick = $p; + $nick =~ s!$lre!$nick_pfx!s or next; + $dir2cr{$p} = $coderepos->{$nick} //= do { + my $git = PublicInbox::Git->new($p); + $git->{nick} = $nick; # for git->pub_urls + $git; + }; + } + my $jd = join_data($self) or return warn <<EOM; +W: cindex.$name.topdir=$self->{topdir} has no usable join data for $cfg_f +EOM + my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)}; + my $roots2paths = roots2paths($self); + for my $root_offs (@$ibx2root) { + my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty'; + scalar(@$root_offs) or next; + my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do { + warn "W: `$ekey' gone from $cfg_f\n"; + next; + }; + my $gits = $ibx->{-repo_objs} //= []; + my $cr_score = $ibx->{-cr_score} //= {}; + my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits; + my $ibx2self; # cindex has an association w/ inbox? + for (@$root_offs) { # sorted by $nr descending + my ($nr, $root_off) = @$_; + my $root_oid = $roots->[$root_off] // do { + warn <<EOM; +BUG: root #$root_off invalid in join data for `$ekey' with $cfg_f +EOM + next; + }; + my $git_dirs = $roots2paths->{$root_oid}; + my @gits = map { $dir2cr{$_} // () } @$git_dirs; + $cr_score->{$_->{nick}} //= $nr for @gits; + @$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs; + # @$git_dirs or warn "W: no matches for $root_oid\n"; + for (@$git_dirs) { + if (my $git = $dir2cr{$_}) { + $ibx_p2g{$_} = $git; + $ibx2self = 1; + $ibx->{-hide}->{www} or + push @{$git->{ibx_score}}, + [ $nr, $ibx->{name} ]; + push @$gits, $git; + } else { + warn <<EOM; +W: no coderepo available for $_ (localprefix=@$lpfx) +EOM + } + } + } + if (@$gits) { + push @{$ibx->{-csrch}}, $self if $ibx2self; + } else { + delete $ibx->{-repo_objs}; + delete $ibx->{-cr_score}; + } + } + for my $git (values %dir2cr) { + my $s = $git->{ibx_score}; + @$s = sort { $b->[0] <=> $a->[0] } @$s if $s; + } +} + 1; |