# Copyright (C) 2017 all contributors
# License: AGPL-3.0+
#
# Qrefs/(tags|heads)/foo => 40-byte SHA1 hex of commit
# Q$SHA1HEX_OF_COMMIT
#
# Indexes any git repository with Xapian; intended for code;
# see PublicInbox::SearchIdx for a mail-specific indexer
package PublicInbox::RepoGitSearchIdx;
use strict;
use warnings;
use base qw(PublicInbox::RepoGitSearch); # base is read-only
use POSIX qw(strftime);
use PublicInbox::Git;
use PublicInbox::GitIdx;
# all-zero object ID; index_blob_id() skips blob IDs equal to this
use constant Z40 => '0' x 40;
# parser state used by each_log_line() while inside a gpgsig header or
# a PGP signature; mergetag states count down from -1 and must stay
# above this value
use constant STATE_GPGSIG => -0x80000000;
# true when the DEBUG environment variable holds a true value;
# enables warnings about unparsed "git log" lines
use constant DEBUG => !!$ENV{DEBUG};
# number of "git log" input bytes each_log_line() reads between
# Xapian flushes
use constant BATCH_BYTES => 1_000_000;
# Constructor.  Builds the base-class object for ($git_dir, $repo_dir)
# and then attaches the extra state used for indexing:
#   {git}          - PublicInbox::Git handle for $git_dir
#   {want_refs_re} - refs we track (refs/heads/* and refs/tags/*)
#   {umask}        - value of git_umask_for() for that repository,
#                    later handed to with_umask() when opening the DB
sub new {
    my ($class, $git_dir, $repo_dir) = @_;
    # loaded lazily, only this (writing) subclass needs it
    require Search::Xapian::WritableDatabase;
    my $self = $class->SUPER::new($git_dir, $repo_dir);
    my $git = PublicInbox::Git->new($git_dir);
    $self->{git} = $git;
    $self->{want_refs_re} = qr!^refs/(?:heads|tags)/!;
    $self->{'umask'} = git_umask_for($git);
    return $self;
}
# Returns the cached writable Xapian database, opening (and creating,
# including its directory) on first use.  The open happens inside
# with_umask() using the umask computed in new().
sub xdb ($) {
    my ($self) = @_;
    return $self->{xdb} if $self->{xdb};
    $self->{xdb} = with_umask($self->{'umask'}, sub {
        my $dir = $self->{xdir};
        if (!-d $dir) {
            require File::Path;
            File::Path::mkpath($dir);
        }
        Search::Xapian::WritableDatabase->new($dir,
            Search::Xapian::DB_CREATE_OR_OPEN);
    });
}
# Creates a fresh Xapian document carrying two terms: the type term
# ("T" followed by $type) and $unique_id exactly as given (callers
# pass it already prefixed, e.g. "Q...").
sub doc_new ($$) {
    my ($type, $unique_id) = @_;
    my $doc = Search::Xapian::Document->new;
    $doc->add_term($_) for ('T'.$type, $unique_id);
    return $doc;
}
# Stores the number $num in value slot $col of $doc, encoded with
# Search::Xapian::sortable_serialise().
sub add_val ($$$) {
    my ($doc, $col, $num) = @_;
    my $sortable = Search::Xapian::sortable_serialise($num);
    return $doc->add_value($col, $sortable);
}
# Walks the term list of $doc starting at the first term reachable via
# skip_to($pfx) and continuing to the end of the list.  For every term
# from which the regexp $re removes something, $cb is called with the
# remaining text.  Always returns undef.
sub each_term_val ($$$$) {
    my ($doc, $pfx, $re, $cb) = @_;
    my $end = $doc->termlist_end;
    my $it = $doc->termlist_begin;
    $it->skip_to($pfx);
    for (; $it != $end; $it->inc) {
        my $term = $it->get_termname;
        # terms not matching $re are skipped, not a stop condition
        next unless $term =~ s/$re//;
        $cb->($term);
    }
    return undef;
}
# Looks up the document whose unique term is "Q$oid".
# If it exists, it is loaded from $self->{xdb}; otherwise a new,
# not-yet-stored document of the given $type is created via doc_new().
# The Xapian docid (undef for a new document) is written to $$id_ref.
# Returns the document.
sub get_doc ($$$$) {
    my ($self, $id_ref, $type, $oid) = @_;
    my $term = 'Q'.$oid;
    my $doc_id = $self->find_unique_docid($term);
    my $doc = defined($doc_id)
        ? $self->{xdb}->get_document($doc_id)
        : doc_new($type, $term);
    $$id_ref = $doc_id;
    return $doc;
}
# Bumps the update generation counter kept in the 'last_update_id'
# metadata key of $self->{xdb} and returns the new value.  A missing
# or false stored value counts as 0, so the first call yields 1.
sub update_id ($) {
    my ($self) = @_;
    my $db = $self->{xdb};
    my $prev = $db->get_metadata('last_update_id') || 0;
    my $next = int($prev) + 1;
    $db->set_metadata('last_update_id', $next);
    return $next;
}
# Writes $doc to $db: replaces the document at $doc_id when one is
# given (any defined value, including 0), otherwise adds $doc as a new
# document.  Returns the docid the document is stored under.
sub replace_or_add ($$$) {
    my ($db, $doc_id, $doc) = @_;
    unless (defined $doc_id) {
        my $new_id = $db->add_document($doc);
        return $new_id;
    }
    # update our ref:
    $db->replace_document($doc_id, $doc);
    return $doc_id;
}
# Parses the "(...)" decoration of a "commit" line from
# "git log --decorate=full" and records, in $self->{-active_refs},
# every decorated ref matching $self->{want_refs_re} as pointing
# at $oid.  $doc is accepted but not used here.
sub decor_update {
    my ($self, $doc, $decor, $oid) = @_;
    my $want_re = $self->{want_refs_re};
    # keep only what is between the parentheses
    ($decor) = ($decor =~ m!\((.+)\)!);
    for my $entry (split(/, /, $decor)) {
        my $refname = $entry;
        if ($entry =~ /^(\S+) -> (\S+)\z/) {
            # "SYMREF -> REF": record the target
            # TODO: handle the symbolic name ($1)
            $refname = $2;
        } else {
            # XXX "tag: " entries are only stripped of the marker,
            # TODO: do something with tags
            $refname =~ s/^tag: //;
        }
        next unless $refname =~ $want_re;
        $self->{-active_refs}->{$refname} = $oid;
    }
}
# Returns the Xapian TermGenerator used for indexing, creating and
# caching it in $self->{term_generator} on first use.  The generator
# is configured with $self->stemmer.
#
# Construction is deliberately NOT wrapped in eval: a failure in the
# constructor or in the stemmer setup now propagates with its real
# message.  Previously the error was swallowed, undef was returned and
# the caller died later with an unrelated "Can't call method ... on an
# undefined value" (and the caller's $@ was clobbered on success).
# Nothing is cached unless setup succeeded completely.
sub term_generator ($) { # write-only
    my ($self) = @_;
    my $tg = $self->{term_generator};
    return $tg if $tg;
    $tg = Search::Xapian::TermGenerator->new;
    $tg->set_stemmer($self->stemmer);
    $self->{term_generator} = $tg;
}
# Indexes $text with term prefix $pfx through the TermGenerator $tg
# (second index_text() argument fixed at 1), then bumps the term
# position so the next indexed text is not adjacent to this one.
# Returns whatever increase_termpos() returns.
sub index_text_inc ($$$) {
    my ($termgen, $str, $prefix) = @_;
    $termgen->index_text($str, 1, $prefix);
    return $termgen->increase_termpos;
}
# Indexes the blob object ID $blob_id under prefix $pfx, unless it is
# the all-zero ID (Z40).
sub index_blob_id ($$$) {
    my ($tg, $blob_id, $pfx) = @_;
    if ($blob_id ne Z40) {
        index_text_inc($tg, $blob_id, $pfx);
    }
}
# Runs "git log --pretty=raw --raw -p" over $range and indexes each
# commit that is not yet in the Xapian DB, one input line at a time.
#
# Per commit it indexes: the commit OID (Q), parents (XP), author (A),
# committer (XC), author/commit dates as YYYYMMDD values, subject (S),
# body, mergetag text, pre/post-image blob IDs from --raw output
# (XPRE/XPOST), hunk header context (XDHH) and diff lines
# (XDFB: added, XDFA: removed, XDCTX: context).
#
# Ref decorations on "commit" lines are handed to decor_update().
# Reading stops at the first commit which already has a document.
# The DB is flushed roughly every BATCH_BYTES bytes of input.
#
# Expects $self->{xdb} to be open already (see xdb()).
# Returns the OID of the first commit seen in the output, or undef if
# the log contained no commit.
sub each_log_line ($$) {
    my ($self, $range) = @_;
    # bytes::length() only exists once bytes.pm has been loaded;
    # "require" loads it without enabling the lexical "bytes" pragma,
    # so we no longer depend on another module having loaded it
    require bytes;
    my $log = $self->{git}->popen(qw(log --decorate=full --pretty=raw
            --no-color --no-abbrev --no-notes
            -r --raw -p
            ), $range, '--');
    my $db = $self->{xdb};
    my ($doc, $doc_id);
    my $tg = term_generator($self);
    # >0: 1: subject, 2: body, 3: diff, 4: diff -c
    # <0: inside a mergetag (counts down per paragraph) or, at
    #     STATE_GPGSIG, inside a signature which is not indexed
    my $state = 0;
    my $tip;
    my $hex = '[a-f0-9]+';
    # per-commit regexps matching added/removed lines of a combined diff
    my ($cc_ins, $cc_del);
    my $batch = BATCH_BYTES;
    local $/ = "\n";
    while (defined(my $l = <$log>)) {
        $batch -= bytes::length($l);
        # prevent memory growth from Xapian
        if ($batch <= 0) {
            $db->flush;
            $batch = BATCH_BYTES;
        }
        if ($l =~ /^commit (\S+)(\s+\([^\)]+\))?/) {
            my ($oid, $decor) = ($1, $2);
            # store the previous commit before starting the next
            replace_or_add($db, $doc_id, $doc) if $doc;
            $tip ||= $oid;
            $state = 0;
            $cc_ins = $cc_del = undef;
            $doc = get_doc($self, \$doc_id, 'commit', $oid);
            decor_update($self, $doc, $decor, $oid) if $decor;
            # old commit
            last if defined $doc_id;
            # new commit:
            $tg->set_document($doc);
            $doc->set_data($oid);
            $doc->add_term('Q' . $oid);
            index_text_inc($tg, $oid, 'Q');
        } elsif ($l =~ /^parent (\S+)/) {
            my $parent = $1;
            index_text_inc($tg, $parent, 'XP');
        } elsif ($l =~ /^author ([^<]*?<[^>]+>) (\d+)/) {
            my ($au, $at) = ($1, $2);
            index_text_inc($tg, $au, 'A');
            add_val($doc, PublicInbox::RepoGitSearch::AD,
                strftime('%Y%m%d', gmtime($at)));
        } elsif ($l =~ /^committer ([^<]*?<[^>]+>) (\d+)/) {
            my ($cu, $ct) = ($1, $2);
            index_text_inc($tg, $cu, 'XC');
            add_val($doc, PublicInbox::RepoGitSearch::CD,
                strftime('%Y%m%d', gmtime($ct)));
        } elsif ($l =~ /^gpgsig /) {
            $state = STATE_GPGSIG;
        } elsif ($l =~ /^mergetag /) {
            $state = -1;
        } elsif ($state < 0) { # inside mergetag or gpgsig
            if ($l eq " \n") { # paragraph
                $state--;
                $tg->increase_termpos;
            } elsif ($l eq "-----BEGIN PGP SIGNATURE-----\n") {
                # no point in indexing a PGP signature
                $state = STATE_GPGSIG;
            } elsif ($state == -2) { # mergetag subject
                $tg->index_text($l, 1);
                $tg->increase_termpos;
            } elsif ($state < -2 && $state > STATE_GPGSIG) {
                $tg->index_text($l); # mergetag body
            } elsif ($l eq "\n") {
                # end of mergetag, onto normal commit message
                $tg->increase_termpos;
                $state = 0;
            } elsif ($l =~ /^ (?:tag|tagger|type) /) {
                # ignored
            } elsif (DEBUG) {
                if ($state <= STATE_GPGSIG) {
                    # skip
                } else {
                    warn "unhandled mergetag: $l";
                }
            }
        } elsif ($state < 3 && $l =~ s/^ //) { # subject and body
            if ($state > 0) {
                $l =~ /\S/ ? $tg->index_text($l, 1)
                    : $tg->increase_termpos;
                $state = 2;
            } else {
                $state = 1;
                $tg->index_text($l, 1, 'S') if $l ne "\n";
            }
        } elsif ($l =~ /^:\d{6} \d{6} ($hex) ($hex) (\S+)\t+(.+)/o) {
            # --raw output (regular)
            my ($pre, $post) = ($1, $2);
            index_blob_id($tg, $pre, 'XPRE');
            index_blob_id($tg, $post, 'XPOST');
        } elsif ($l =~ /^(::+)(?:\d{6} )+($hex .+)? (\S+)\t+(.+)/o) {
            # --raw output (combined), e.g. for two parents:
            #   ::MODE MODE MODE SHA SHA SHA STATUS<TAB>PATH
            # every mode already carries its trailing space, so no
            # extra space may be required before the first SHA (the
            # old pattern demanded one and so never matched)
            my ($colons, $blobs) = ($1, $2);
            my @blobs = split(/ /, $blobs);
            my $post = pop @blobs;
            my $n = length($colons);
            # one preimage blob per parent (= per colon) expected
            if (scalar(@blobs) != $n) {
                die "combined raw parsed wrong:\n$l\n//\n";
            }
            index_blob_id($tg, $_, 'XPRE') foreach @blobs;
            index_blob_id($tg, $post, 'XPOST');
            unless ($cc_ins) {
                # a +/- marker may sit in any of the $n leading
                # columns, i.e. after up to $n - 1 spaces
                $n--;
                $cc_ins = qr/^ {0,$n}[\+]\s*(.*)/;
                $cc_del = qr/^ {0,$n}[\-]\s*(.*)/;
            }
        } elsif ($l =~ m!^diff --git (?:"?a/.+?) (?:"?b/.+)!) {
            # regular diff, filenames handled by --raw
            $state = 3;
        } elsif ($l =~ /^diff --(?:cc|combined) (?:.+)/) {
            # combined diff, filenames handled by --raw
            $state = 4;
        } elsif ($l =~ /^@@ (?:\S+) (?:\S+) @@(.*)/) {
            my $hunk_hdr = $1;
            # regular hunk header context
            $hunk_hdr =~ /\S/ and
                index_text_inc($tg, $hunk_hdr, 'XDHH');
        # not currently handled:
        } elsif ($l =~ /^index (?:$hex)\.\.(?:$hex)/o) {
        } elsif ($l =~ /^index (?:$hex,[^\.]+)\.\.(?:$hex)(.*)$/o) {
            #--cc
        } elsif ($l =~ /^(?:@@@+) (?:\S+.*\S+) @@@+\z/) { # --cc
        } elsif ($l =~ /^(?:old|new) mode/) {
        } elsif ($l =~ /^(?:deleted|new) file mode/) {
        } elsif ($l =~ /^tree (?:\S+)/) {
        } elsif ($l =~ /^(?:copy|rename) (?:from|to) /) {
        } elsif ($l =~ /^(?:dis)?similarity index /) {
        } elsif ($l =~ /^\\ No newline at end of file/) {
        } elsif ($l =~ /^Binary files .* differ/) {
        } elsif ($l =~ /^--- /) { # preimage filename
        } elsif ($l =~ /^\+\+\+ /) { # postimage filename
        } elsif ($state == 3) { # diff --git
            if ($l =~ s/^\+//) {
                index_text_inc($tg, $l, 'XDFB');
            } elsif ($l =~ s/^\-//) {
                index_text_inc($tg, $l, 'XDFA');
            } elsif ($l =~ s/^ //) {
                index_text_inc($tg, $l, 'XDCTX');
            } elsif (DEBUG) {
                if ($l eq "\n") {
                } else {
                    warn "unhandled diff -u $l";
                }
            }
        } elsif ($state == 4) { # diff --cc/combined
            if ($l =~ $cc_ins) {
                index_text_inc($tg, $1, 'XDFB');
            } elsif ($l =~ $cc_del) {
                index_text_inc($tg, $1, 'XDFA');
            } elsif ($l =~ s/^ //) {
                index_text_inc($tg, $l, 'XDCTX');
            } elsif (DEBUG) {
                if ($l eq "\n") {
                } else {
                    warn "unhandled diff --cc $l";
                }
            }
        } elsif (DEBUG) {
            warn "wtf $state $l\n" if $l ne "\n";
        }
    }
    # store the last commit we were working on (for an "old commit"
    # this rewrites the document we just loaded)
    replace_or_add($db, $doc_id, $doc) if $doc;
    $tip;
}
# Indexes everything reachable from $end (the object ID $refname
# currently points at) that has not been indexed for $refname yet.
#
# The 'ref' document for $refname stores the tip indexed last time;
# when present, only "$begin^0..$end^0" is walked, unless
# "git merge-base" shows the old tip is no longer an ancestor (forced
# update), in which case everything from $end is walked again.
# Afterwards the ref document is updated to the new tip, and so is the
# ref document of every other wanted ref seen as a decoration during
# the walk (collected in $self->{-active_refs} by decor_update()).
sub index_top_ref ($$$) {
    my ($self, $refname, $end) = @_;
    my $db = xdb($self);
    my $doc_id;
    my $ref_doc = get_doc($self, \$doc_id, 'ref', $refname);
    my $begin = '';
    $begin = $ref_doc->get_data if defined $doc_id;
    my $active = { $refname => undef };
    $self->{-active_refs} = $active;
    my $git = $self->{git};

    # check for discontiguous branches (from "push --force")
    if ($begin ne '') {
        my $base = $git->qx(qw(merge-base), $begin, $end);
        chomp $base;
        if ($base ne $begin) {
            warn "$refname updated with force\n";
            # TODO: cleanup_forced_update($self, $refname);
            $begin = '';
        }
    }

    my $range = $end;
    $range = "$begin^0..$end^0" if $begin ne '';
    my $tip = each_log_line($self, $range);
    my $progress = $self->{progress};
    if (defined $tip) {
        $ref_doc->set_data($tip);
        if ($progress) {
            print $progress "$refname => $tip\n";
        }
        replace_or_add($db, $doc_id, $ref_doc);
    }
    $db->flush;

    # update all decorated refs which got snowballed into this one
    delete $active->{$refname};
    my $pending = 0;
    foreach my $ref (keys %$active) {
        # flush on every 100th ref
        if (++$pending >= 100) {
            $db->flush;
            $pending = 0;
        }
        $ref_doc = get_doc($self, \$doc_id, 'ref', $ref);
        $ref_doc->set_data($active->{$ref});
        print $progress "$ref => $active->{$ref} ($refname)\n"
            if $progress;
        replace_or_add($db, $doc_id, $ref_doc);
    }
    $db->flush;
}
# main entry sub:
# Brings the index up to date with the repository.  $opts->{progress},
# if set, is a file handle progress lines are printed to.  Opens the
# writable DB, bumps the update generation counter, then calls
# index_top_ref() for every branch/tag listed by "git for-each-ref"
# whose object type is "commit" or "tag".
sub index_sync {
    my ($self, $opts) = @_;
    $self->{progress} = $opts->{progress};
    # must be open before update_id() reads $self->{xdb}
    xdb($self);
    $self->{-update_id} = update_id($self);

    # go for most recent refs, first, since that reduces the amount
    # of work we have to do.
    my $refs = $self->{git}->popen(qw(for-each-ref --sort=-creatordate));
    local $/ = "\n";
    while (defined(my $line = <$refs>)) {
        chomp $line;
        my ($oid, $type, $refname) = split(/\s+/, $line);
        next if $refname !~ $self->{want_refs_re};
        next if $type ne 'commit' && $type ne 'tag';
        index_top_ref($self, $refname, $oid);
    }
}
1;