* Re: Print last time and committer a file was touched by for a whole repo
2010-07-03 9:13 ` Jakub Narebski
@ 2010-07-05 9:53 ` Jakub Narebski
0 siblings, 0 replies; 9+ messages in thread
From: Jakub Narebski @ 2010-07-05 9:53 UTC (permalink / raw)
To: Tim Visher; +Cc: Git Mailing List
[-- Attachment #1: Type: text/plain, Size: 1472 bytes --]
On Sat, 3 July 2010, Jakub Narebski wrote:
> On Fri, 2 July 2010, Tim Visher wrote:
>> On Thu, Jul 1, 2010 at 4:12 PM, Jakub Narebski <jnareb@gmail.com> wrote:
>>> Tim Visher <tim.visher@gmail.com> writes:
>>>
>>>> I need to get a listing of the entire contents of my current repo (as
>>>> in, I don't need deleted files or anything like that, just the current
>>>> snapshot) with the time the file was committed and who committed it.
>>>>
>>>> Thoughts on how to do that?
>>>
>>> There does not exist a single git command that would do what you want.
>>> You would need to use 'git log -1 --follow' for each file in current
>>> snapshot ('git ls-tree -r HEAD'). IIRC there is some example how to
>>> do that in GitFaq or GitTips on git wiki (http://git.wiki.kernel.org).
>>>
>>> Perhaps in the future 'git blame <directory>' would provide such
>>> output, or its equivalent (tree blame).
>>
>> That'd be cool.
>
> I am currently working on prototype in Perl, using 'git cat-file --batch'
> and 'git diff-tree --stdin', as I don't know git C code/API enought to
> write it in C; it is planned to be converted to C after proof of concept
> works.
And it even works[1]... but only for a top directory, because of bug in
--relative=<path> implementation for --raw / git-diff-tree output, see
http://permalink.gmane.org/gmane.comp.version-control.git/150248
[1] But I have not tested it very extensively.
[2] It is also missing some features.
--
Jakub Narebski
Poland
[-- Attachment #2: "git blame <directory>" proof of concept in Perl --]
[-- Type: text/plain, Size: 17223 bytes --]
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use Encode;
use Fcntl ':mode';
use List::Compare::Functional qw(:originals);
use List::MoreUtils qw(uniq pairwise);
use constant DEBUG => 0;
# ----------------------------------------------------------------------
{
package Git::Repo;
use strict;
use warnings;
use IPC::Open2 qw(open2);
use IO::Handle;
use base qw(Exporter);
our @EXPORT = qw();
our @EXPORT_OK = qw();
# Auxiliary subroutines
sub _assert_opts {
die "must have an even number of arguments for named options"
unless $#_ % 2;
}
sub _assert_sha1 {
my $sha1 = shift;
die "'$sha1' is not a SHA1 (need to use get_sha1?)"
unless $sha1 && $sha1 =~ /^[a-f0-9]{40}$/;
}
sub new {
my $class = shift;
_assert_opts @_;
my $self = {@_};
bless $self, $class;
die 'no repo_dir given' unless $self->{repo_dir};
return $self;
}
# Return the first items of the git command line, for instance
# qw(/usr/bin/git --git-dir=/path/to/repo.git).
sub _git_cmd {
my $self = shift;
return ($self->{git_binary} || 'git', '--git-dir=' . $self->{repo_dir});
}
sub get_sha1 {
my ($self, $object_id) = @_;
die 'no object identifier given' unless $object_id;
die 'object identifier must not contain newlines' if $object_id =~ /\n/;
unless ($self->{sha1_stdout}) {
# Open bidi pipe the first time get_sha1 is called.
# open2 raises an exception on error, no need to 'or die'.
$self->{sha1_pid} =
open2($self->{sha1_stdout}, $self->{sha1_stdin},
$self->_git_cmd, 'cat-file', '--batch-check');
}
$self->{sha1_stdin}->printflush("$object_id\n")
or die "cannot write to pipe: $!";
my $output = $self->{sha1_stdout}->getline()
or die "cannot read from pipe: $!";
chomp $output;
return if $output =~ /missing$/;
my ($sha1, $type, $size) =
($output =~ /^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$/)
or die "invalid response: $output";
return wantarray ? ($sha1, $type, $size) : $sha1;
}
sub get_object {
my ($self, $object_id) = @_;
unless ($self->{object_stdout}) {
# Open bidi pipe the first time get_object is called.
# open2 raises an exception on error, no need to 'or die'.
$self->{object_pid} =
open2($self->{object_stdout}, $self->{object_stdin},
$self->_git_cmd, 'cat-file', '--batch');
}
$self->{object_stdin}->printflush("$object_id\n")
or die "get_object: cannot write to pipe: $!";
my ($sha1, $type, $size) =
split ' ', $self->{object_stdout}->getline()
or die "get_object: cannot read from pipe: $!";
die "'$object_id' not found in repository"
if $type eq 'missing';
$self->{object_stdout}->read(my $content, $size);
$self->{object_stdout}->getline(); # eat trailing newline
return wantarray ? ($sha1, $type, $size, $content) : $content;
}
sub get_commit_difftree {
my ($self, $commit_id, $parent_id, $path) = @_;
unless ($self->{difftree_stdout}) {
# Open bidi pipe the first time get_object is called.
# open2 raises an exception on error, no need to 'or die'.
$self->{difftree_pid} =
open2($self->{difftree_stdout}, $self->{difftree_stdin},
$self->_git_cmd, 'diff-tree', '--stdin', '--raw', '--no-commit-id',
'--root', '--no-renames',
# defined $path ? ('--', $path) : ());
defined $path ? ("--relative=$path") : ());
}
# the additional LF ("\n") is to able to detect end of difftree
$self->{difftree_stdin}->printflush("$commit_id $parent_id\n\n")
or die "get_commit_difftree: cannot write to pipe: $!";
my @difftree_lines;
while (my $line = $self->{difftree_stdout}->getline()) {
chomp $line;
last unless $line;
push @difftree_lines, $line;
}
return wantarray ? @difftree_lines : \@difftree_lines;
}
sub DESTROY {
my $self = shift;
if (exists $self->{object_stdout}) {
close $self->{object_stdout}
or die "Closing stdout of git-cat-file --batch failed: $!";
}
if (exists $self->{object_stdin}) {
close $self->{object_stdin}
or die "Closing stdin of git-cat-file --batch failed: $!";
}
if (exists $self->{object_pid}) {
waitpid $self->{object_pid}, 0
or die "Waiting for pid=$self->{object_pid} failed: $!";
}
if (exists $self->{sha1_stdout}) {
close $self->{sha1_stdout}
or die "Closing stdout of git-cat-file --batch-check failed: $!";
}
if (exists $self->{sha1_stdin}) {
close $self->{sha1_stdin}
or die "Closing stdin of git-cat-file --batch-check failed: $!";
}
if (exists $self->{sha1_pid}) {
waitpid $self->{sha1_pid}, 0
or die "Waiting for pid=$self->{sha1_pid} failed: $!";
}
if (exists $self->{difftree_stdout}) {
close $self->{difftree_stdout}
or die "Closing stdout of git-cat-file --batch-check failed: $!";
}
if (exists $self->{difftree_stdin}) {
close $self->{difftree_stdin}
or die "Closing stdin of git-cat-file --batch-check failed: $!";
}
if (exists $self->{difftree_pid}) {
waitpid $self->{difftree_pid}, 0
or die "Waiting for pid=$self->{difftree_pid} failed: $!";
}
}
} # end package Git::Repo;
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
sub set_signals {
my $pid = shift;
$SIG{'PIPE'} = 'IGNORE';
$SIG{'CHLD'} = 'IGNORE';
$SIG{'CHLD'} = sub {
print "REAPER: status $? on $pid\n"
if waitpid($pid, 0) > 0;
};
}
sub cat_file_gitrepo {
my ($repo, $object_name) = @_;
my ($ret_sha1, $type, $size, $content) =
$repo->get_object($object_name);
return wantarray ? ("$ret_sha1 $type $size", $content) : $content;
}
# ----------------------------------------------------------------------
my $fallback_encoding = 'latin1';
# decode sequences of octets in utf8 into Perl's internal form,
# which is utf-8 with utf8 flag set if needed. gitweb writes out
# in utf-8 thanks to "binmode STDOUT, ':utf8'" at beginning
sub to_utf8 {
my $str = shift;
if (utf8::valid($str)) {
utf8::decode($str);
return $str;
} else {
return decode($fallback_encoding, $str, Encode::FB_DEFAULT);
}
}
sub parse_commit {
my ($commit_text) = @_;
my @commit_lines = split '\n', $commit_text;
my %co;
if (! @commit_lines) {
return;
}
my @parents;
HEADER:
while (my $line = shift @commit_lines) {
last if $line eq "\n";
if ($line =~ m/^tree ([0-9a-fA-F]{40})$/) {
$co{'tree'} = $1;
} elsif ($line =~ m/^parent ([0-9a-fA-F]{40})$/) {
push @parents, $1;
} elsif ($line =~ m/^author (.*) ([0-9]+) (.*)$/) {
$co{'author'} = to_utf8($1);
$co{'author_epoch'} = $2;
$co{'author_tz'} = $3;
if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) {
$co{'author_name'} = $1;
$co{'author_email'} = $2;
} else {
$co{'author_name'} = $co{'author'};
}
} elsif ($line =~ m/^committer (.*) ([0-9]+) (.*)$/) {
$co{'committer'} = to_utf8($1);
$co{'committer_epoch'} = $2;
$co{'committer_tz'} = $3;
if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) {
$co{'committer_name'} = $1;
$co{'committer_email'} = $2;
} else {
$co{'committer_name'} = $co{'committer'};
}
}
}
if (!defined $co{'tree'}) {
return;
};
$co{'parents'} = \@parents;
$co{'parent'} = $parents[0];
$co{'comment'} = \@commit_lines;
return wantarray ? %co : \%co;
}
sub unquote { return $_[0] }
# parse line of git-ls-tree output
sub parse_ls_tree_line {
my $line = shift;
my %opts = @_;
my %res;
#'100644 blob 0fa3f3a66fb6a137f6ec2c19351ed4d807070ffa panic.c'
$line =~ m/^([0-9]+) (.+) ([0-9a-fA-F]{40})\t(.+)$/s;
$res{'mode'} = $1;
$res{'type'} = $2;
$res{'hash'} = $3;
if ($opts{'-z'}) {
$res{'name'} = $4;
} else {
$res{'name'} = unquote($4);
}
return wantarray ? %res : \%res;
}
# parse line of git-diff-tree "raw" output
sub parse_difftree_raw_line {
my $line = shift;
my %res;
# ':100644 100644 03b218260e99b78c6df0ed378e59ed9205ccc96d 3b93d5e7cc7f7dd4ebed13a5cc1a4ad976fc94d8 M ls-files.c'
# ':100644 100644 7f9281985086971d3877aca27704f2aaf9c448ce bc190ebc71bbd923f2b728e505408f5e54bd073a M rev-tree.c'
if ($line =~ m/^:([0-7]{6}) ([0-7]{6}) ([0-9a-fA-F]{40}) ([0-9a-fA-F]{40}) (.)([0-9]{0,3})\t(.*)$/) {
$res{'from_mode'} = $1;
$res{'to_mode'} = $2;
$res{'from_id'} = $3;
$res{'to_id'} = $4;
$res{'status'} = $5;
$res{'similarity'} = $6;
if ($res{'status'} eq 'R' || $res{'status'} eq 'C') { # renamed or copied
($res{'from_file'}, $res{'to_file'}) = map { unquote($_) } split("\t", $7);
} else {
$res{'from_file'} = $res{'to_file'} = $res{'file'} = unquote($7);
}
}
# '::100755 100755 100755 60e79ca1b01bc8b057abe17ddab484699a7f5fdb 94067cc5f73388f33722d52ae02f44692bc07490 94067cc5f73388f33722d52ae02f44692bc07490 MR git-gui/git-gui.sh'
# combined diff (for merge commit)
elsif ($line =~ s/^(::+)((?:[0-7]{6} )+)((?:[0-9a-fA-F]{40} )+)([a-zA-Z]+)\t(.*)$//) {
$res{'nparents'} = length($1);
$res{'from_mode'} = [ split(' ', $2) ];
$res{'to_mode'} = pop @{$res{'from_mode'}};
$res{'from_id'} = [ split(' ', $3) ];
$res{'to_id'} = pop @{$res{'from_id'}};
$res{'status'} = [ split('', $4) ];
$res{'to_file'} = unquote($5);
}
# 'c512b523472485aef4fff9e57b229d9d243c967f'
elsif ($line =~ m/^([0-9a-fA-F]{40})$/) {
$res{'commit'} = $1;
}
return wantarray ? %res : \%res;
}
# ......................................................................
# parse one entry of raw 'tree' object output (from 'git cat-file --batch')
sub decode_tree_entry {
my $buf = shift;
#use bytes;
$buf =~ s/^([0-7]+) //;
my ($mode_str) = $1;
my ($filename, $sha1_str) = unpack('Z*H[40]', $buf);
return ($mode_str, $filename, $sha1_str);
}
sub tree_entry_len {
my ($mode_str, $filename) = @_;
#use bytes;
# length of mode string + separator + 20 bytes of SHA-1
# + length of filename (in bytes) + terminating NUL ('\0')
length($mode_str)+1 + length($filename)+1 + 20;
}
use constant {
S_IFINVALID => 0030000,
S_IFGITLINK => 0160000,
};
# submodule/subproject, a commit object reference
sub S_ISGITLINK {
my $mode = shift;
return (($mode & S_IFMT) == S_IFGITLINK)
}
sub type_from_mode {
my $mode = oct shift;
if (S_ISGITLINK($mode)) {
return "commit";
} elsif (S_ISDIR($mode & S_IFMT)) {
return "tree";
}
return "blob";
}
# ......................................................................
sub decode_tree {
my $contents = shift;
#use bytes;
my @result;
while (my @entry = decode_tree_entry($contents)) {
#print join(' ', @entry)."\n";
#printf("%06d %s\t%s\n", $entry[0], $entry[2], $entry[1]);
push @result,
{ 'mode' => $entry[0],
'type' => type_from_mode($entry[0]),
'name' => $entry[1],
'hash' => $entry[2] };
my $len = tree_entry_len(@entry);
#print substr($contents, 0, $len)."\n";
$contents = substr($contents, $len);
last unless $contents;
}
return wantarray ? @result : \@result;
}
sub tree_entry_eq {
my ($a, $b) = @_;
return
$a->{'mode'} == $b->{'mode'} &&
$a->{'type'} eq $b->{'type'} &&
$a->{'name'} eq $b->{'name'} &&
$a->{'hash'} eq $b->{'hash'};
}
# ......................................................................
sub print_parsed_tree {
my $tree = shift;
foreach my $tree_entry (@$tree) {
print format_tree_entry($tree_entry)."\n";
}
}
sub print_tree_blame {
my ($tree, $tree_blame) = @_;
foreach my $tree_entry (@$tree) {
#print format_tree_blame_entry($tree_blame->{$tree_entry->{'hash'}})."\n";
print format_tree_blame_entry($tree_entry)."\n";
}
}
sub format_tree_entry {
my $tree_entry = shift;
return sprintf("%06d %s %s\t%s",
$tree_entry->{'mode'}, $tree_entry->{'type'},
$tree_entry->{'hash'}, $tree_entry->{'name'});
}
sub format_tree_blame_entry {
my $tree_entry = shift;
#my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday)
# = gmtime($tree_entry->{'author_epoch'});
#return sprintf("%06d %s %.8s %.8s %.15s %02d-%02d-%04d\t%s",
# $tree_entry->{'mode'}, $tree_entry->{'type'},
# $tree_entry->{'hash'}, $tree_entry->{'commit'},
# $tree_entry->{'author_name'}, $mday, $mon, 1900 + $year,
# $tree_entry->{'name'});
return sprintf("%06d %s %.8s %.8s\t%.10s\t%s",
$tree_entry->{'mode'}, $tree_entry->{'type'},
$tree_entry->{'hash'}, $tree_entry->{'commit'},
$tree_entry->{'name'}, $tree_entry->{'summary'});
}
# ======================================================================
sub tree_blame_commit {
my ($repo, $commit_id, $tree_blame, $path) = @_;
my ($commit_sha1, undef, undef, $commit_text) =
$repo->get_object($commit_id);
my %commit = parse_commit($commit_text);
my $nunblamed = scalar grep { !exists $_->{'commit'} } @$tree_blame;
printf("processing %6s (%1d parents, %d unblamed): %s\n",
substr($commit_sha1,0,6), scalar @{$commit{'parents'}},
$nunblamed, $commit{'comment'}[0])
if DEBUG >= 1;
print Dumper($tree_blame)
if DEBUG >= 2;
return unless $nunblamed > 0;
foreach my $parent (@{$commit{'parents'}}) {
my @difftree = $repo->get_commit_difftree($commit_sha1, $parent, $path);
@difftree = map { { parse_difftree_raw_line($_) } } @difftree;
printf("processing %6s: parent %6s has %d in difftree\n",
substr($commit_sha1,0,6), substr($parent,0,6),
scalar @difftree)
if DEBUG >= 1;
no warnings 'recursion';
# mark entries from @difftree
#mark_changed($tree_blame, \@difftree, [ $commit_sha1 ]);
mark_changed($tree_blame, \@difftree, \$commit_sha1);
# pass blame to parent
tree_blame_commit($repo, $parent, $tree_blame, $path);
# remove marks
#remove_marks($tree_blame, [ $commit_sha1 ]);
remove_marks($tree_blame, \$commit_sha1);
}
my $nblames_this = 0;
foreach my $tree_entry (@$tree_blame) {
if (!exists $tree_entry->{'commit'}) {
$tree_entry->{'commit'} = $commit_sha1;
$tree_entry->{'summary'} = $commit{'comment'}[0];
$tree_entry->{'author_name'} = $commit{'author_name'};
if ($commit{'author_email'} &&
$commit{'author_email'} =~ /^([^@]+)@/) {
$tree_entry->{'author_user'} = $1;
}
$tree_entry->{'author_epoch'} = $commit{'author_epoch'};
my @difftree = $repo->get_commit_difftree($commit_sha1, '-m', $path);
@difftree = map { { parse_difftree_raw_line($_) } } @difftree;
$tree_entry->{'difftree'} =
[ grep { $_->{'to_id'} eq $tree_entry->{'hash'} } @difftree ];
$nblames_this++;
}
}
printf("done %6s (got blamed by %1d, %d unblamed left)\n",
substr($commit_sha1,0,6), $nblames_this,
scalar grep { !exists $_->{'commit'} } @$tree_blame)
if DEBUG >= 1;
}
sub mark_changed {
my ($tree_blame, $difftree, $value) = @_;
my @blame_sha1 = map { $_->{'hash'} } @$tree_blame;
my @difftree_sha1 = map { $_->{'to_id'} } @$difftree;
# not optimized: both @$tree_blame and @$difftree are sorted by filename
my @common_sha1 = get_intersection('--unsorted', [ \@blame_sha1, \@difftree_sha1 ]);
my %tree_blame = map { $_->{'hash'} => $_ } @$tree_blame;
map { $_->{'commit'} = $value unless exists $_->{'commit'} }
@tree_blame{@common_sha1};
@$tree_blame = values %tree_blame;
return @$tree_blame;
}
sub remove_marks {
my ($tree_blame, $value) = @_;
map {
delete($_->{'commit'})
#if (ref($_->{'commit'}) && ($_->{'commit'}->[0] eq $value->[0]))
if (ref($_->{'commit'}) && (${$_->{'commit'}} eq $$value))
} @$tree_blame;
return @$tree_blame;
}
# ======================================================================
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ----------------------------------------------------------------------
# MAIN
my $git_dir = "/home/jnareb/git/gitweb/test/.git";
#my $tree_path = "sub";
my $tree_path = "";
my $start_commit = "HEAD";
#my $git_dir = "/home/jnareb/git/.git";
#my $tree_path = "contrib";
#my $start_commit = "HEAD";
#my $tree_path = '';
#my $start_commit = "todo";
my $repo = Git::Repo->new(repo_dir=>$git_dir);
my ($sha1, $type, $size, $obj_data) =
$repo->get_object("$start_commit:$tree_path");
my @tree_blame = decode_tree($obj_data);
tree_blame_commit($repo, $start_commit, \@tree_blame, $tree_path);
print Dumper(\@tree_blame)
if DEBUG >= 2;
print_tree_blame(\@tree_blame)
if DEBUG < 2;
#print Dumper(\@tree_blame);
#print Dumper($tree_blame[0]);
__END__
# ----------------------------------------------------------------------
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ======================================================================
#http://git.or.cz/gitwiki/ExampleScripts#Findingwhichcommitslasttouchedthefiles
#http://gist.github.com/247395
my %attributions = ();
my @files = ();
open IN, "git ls-tree -r --full-name HEAD |" or die;
while (<IN>) {
if (/^\S+\s+blob \S+\s+(\S+)$/) {
$files[$#files + 1] = $1;
$attributions{$1} = -1;
}
}
close IN;
my $remaining = $#files + 1;
open IN, "git log -r --root --raw --no-abbrev --pretty=format:%h~%an~%ad~ |" or die;
while (<IN>) {
if (/^([^:~]+)~(.*)~([^~]+)~$/) {
($commit, $author, $date) = ($1, $2, $3);
} elsif (/^:\S+\s+1\S+\s+\S+\s+\S+\s+\S\s+(.*)$/) {
if ($attributions{$1} == -1) {
$attributions{$1} = "$author, $date ($commit)";
$remaining--;
if ($remaining <= 0) {
break;
}
}
}
}
close IN;
for $f (@files) {
print "$f $attributions{$f}\n";
}
^ permalink raw reply [flat|nested] 9+ messages in thread