From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 008451F9F3 for ; Sat, 2 Oct 2021 11:18:36 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/4] lei mail-diff: diagnostic command to diff mail contents Date: Sat, 2 Oct 2021 11:18:33 +0000 Message-Id: <20211002111835.19220-3-e@80x24.org> In-Reply-To: <20211002111835.19220-1-e@80x24.org> References: <20211002111835.19220-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This is useful in finding the cause of deduplication bugs, and possibly the cause of missing threads reported by Konstantin in <20211001130527.z7eivotlgqbgetzz@meerkat.local> usage: u=https://yhbt.net/lore/all/87czop5j33.fsf@tynnyri.adurom.net/raw lei mail-diff $u --- MANIFEST | 1 + lib/PublicInbox/ContentHash.pm | 6 +- lib/PublicInbox/LEI.pm | 5 ++ lib/PublicInbox/LeiInput.pm | 6 ++ lib/PublicInbox/LeiMailDiff.pm | 111 +++++++++++++++++++++++++++++++++ lib/PublicInbox/LeiRediff.pm | 63 ++++++++++--------- 6 files changed, 160 insertions(+), 32 deletions(-) create mode 100644 lib/PublicInbox/LeiMailDiff.pm diff --git a/MANIFEST b/MANIFEST index 74b28d2d54ab..22b7df9bb89d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -237,6 +237,7 @@ lib/PublicInbox/LeiLsMailSource.pm lib/PublicInbox/LeiLsMailSync.pm lib/PublicInbox/LeiLsSearch.pm lib/PublicInbox/LeiLsWatch.pm +lib/PublicInbox/LeiMailDiff.pm lib/PublicInbox/LeiMailSync.pm lib/PublicInbox/LeiMirror.pm lib/PublicInbox/LeiNoteEvent.pm diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index cc4a54c922d0..f6ae9011c1bf 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -52,9 +52,9 @@ sub content_dig_i { $dig->add($s); } -sub content_digest ($) { - my ($eml) = @_; - my $dig = Digest::SHA->new(256); +sub content_digest ($;$) { + my ($eml, $dig) = @_; + $dig //= Digest::SHA->new(256); # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index fd59235846ae..51b0e95e1728 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -203,6 +203,11 @@ our %CMD = ( # sorted in order of importance/use: qw(git-dir=s@ cwd! verbose|v+ color:s no-color drq:1 dequote-only:1), @diff_opt, @lxs_opt, @net_opt, @c_opt ], +'mail-diff' => [ '--stdin|LOCATION...', 'diff the contents of emails', + 'stdin|', # /|\z/ must be first for lone dash + qw(verbose|v+ color:s no-color raw-header), + @diff_opt, @net_opt, @c_opt ], + 'add-external' => [ 'LOCATION', 'add/set priority of a publicinbox|extindex for extra matches', qw(boost=i mirror=s inbox-version=i epoch=s verbose|v+), diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index 22bedba6e6c3..83479221000b 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -57,6 +57,12 @@ sub check_input_format ($;$) { 1; } +sub input_mbox_cb { # base MboxReader callback + my ($eml, $self) = @_; + $eml->header_set($_) for (qw(Status X-Status)); + $self->input_eml_cb($eml); +} + # import a single file handle of $name # Subclass must define ->input_eml_cb and ->input_mbox_cb sub input_fh { diff --git a/lib/PublicInbox/LeiMailDiff.pm b/lib/PublicInbox/LeiMailDiff.pm new file mode 100644 index 000000000000..a29ae2259f71 --- /dev/null +++ b/lib/PublicInbox/LeiMailDiff.pm @@ -0,0 +1,111 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# The "lei mail-diff" sub-command, diffs input contents against +# the first message of input +package PublicInbox::LeiMailDiff; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use File::Temp 0.19 (); # 0.19 for ->newdir +use PublicInbox::Spawn qw(spawn which); +use PublicInbox::MsgIter qw(msg_part_text); +use File::Path qw(remove_tree); +use PublicInbox::ContentHash qw(content_digest); +require PublicInbox::LeiRediff; +use Data::Dumper (); + +sub write_part { # Eml->each_part callback + my ($ary, $self) = @_; + my ($part, $depth, $idx) = @$ary; + if ($idx ne '1' || $self->{lei}->{opt}->{'raw-header'}) { + open my $fh, '>', "$self->{curdir}/$idx.hdr" or die "open: $!"; + print $fh ${$part->{hdr}} or die "print $!"; + close $fh or die "close $!"; + } + my $ct = $part->content_type || 'text/plain'; + my ($s, $err) = msg_part_text($part, $ct); + my $sfx = defined($s) ? 'txt' : 'bin'; + open my $fh, '>', "$self->{curdir}/$idx.$sfx" or die "open: $!"; + print $fh ($s // $part->body) or die "print $!"; + close $fh or die "close $!"; +} + +sub dump_eml ($$$) { + my ($self, $dir, $eml) = @_; + local $self->{curdir} = $dir; + mkdir $dir or die "mkdir($dir): $!"; + $eml->each_part(\&write_part, $self); + + open my $fh, '>', "$dir/content_digest" or die "open: $!"; + my $dig = PublicInbox::ContentDigestDbg->new($fh); + local $Data::Dumper::Useqq = 1; + local $Data::Dumper::Terse = 1; + content_digest($eml, $dig); + print $fh "\n", $dig->hexdigest, "\n" or die "print $!"; + close $fh or die "close: $!"; +} + +sub prep_a ($$) { + my ($self, $eml) = @_; + $self->{tmp} = File::Temp->newdir('lei-mail-diff-XXXX', TMPDIR => 1); + dump_eml($self, "$self->{tmp}/a", $eml); +} + +sub diff_a ($$) { + my ($self, $eml) = @_; + ++$self->{nr}; + my $dir = "$self->{tmp}/N$self->{nr}"; + dump_eml($self, $dir, $eml); + my $cmd = [ qw(git diff --no-index) ]; + my $lei = $self->{lei}; + PublicInbox::LeiRediff::_lei_diff_prepare($lei, $cmd); + push @$cmd, qw(-- a), "N$self->{nr}"; + my $rdr = { -C => "$self->{tmp}" }; + @$rdr{1, 2} = @$lei{1, 2}; + my $pid = spawn($cmd, $lei->{env}, $rdr); + waitpid($pid, 0); + $lei->child_error($?) if $?; # for git diff --exit-code + File::Path::remove_tree($self->{curdir}); +} + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + $self->{tmp} ? diff_a($self, $eml) : prep_a($self, $eml); +} + +sub lei_mail_diff { + my ($lei, @argv) = @_; + $lei->{opt}->{'in-format'} //= 'eml'; + my $self = bless {}, __PACKAGE__; + $self->prepare_inputs($lei, \@argv) or return; + my $isatty = -t $lei->{1}; + $lei->{opt}->{color} //= $isatty; + $lei->start_pager if $isatty; + my $ops = {}; + $lei->{auth}->op_merge($ops, $self) if $lei->{auth}; + (my $op_c, $ops) = $lei->workers_start($self, 1, $ops); + $lei->{wq1} = $self; + $lei->{-err_type} = 'non-fatal'; + net_merge_all_done($self) unless $lei->{auth}; + $lei->wait_wq_events($op_c, $ops); +} + +no warnings 'once'; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +package PublicInbox::ContentDigestDbg; +use strict; +use v5.10.1; +use Data::Dumper; + +sub new { bless { dig => Digest::SHA->new(256), fh => $_[1] }, __PACKAGE__ } + +sub add { + $_[0]->{dig}->add($_[1]); + print { $_[0]->{fh} } Dumper($_[1]) or die "print $!"; +} + +sub hexdigest { $_[0]->{dig}->hexdigest; } + +1; diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm index 1e95e55ac1cc..decb721b7a4c 100644 --- a/lib/PublicInbox/LeiRediff.pm +++ b/lib/PublicInbox/LeiRediff.pm @@ -56,6 +56,34 @@ sub solve_1 ($$$) { $self->{blob}->{$oid_want}; # full OID } +sub _lei_diff_prepare ($$) { + my ($lei, $cmd) = @_; + my $opt = $lei->{opt}; + push @$cmd, '--'.($opt->{color} && !$opt->{'no-color'} ? '' : 'no-'). + 'color'; + for my $o (@PublicInbox::LEI::diff_opt) { + my $c = ''; + # remove single char short option + $o =~ s/\|([a-z0-9])\b//i and $c = $1; + if ($o =~ s/=[is]@\z//) { + my $v = $opt->{$o} or next; + push @$cmd, map { $c ? "-$c$_" : "--$o=$_" } @$v; + } elsif ($o =~ s/=[is]\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $c ? "-$c$v" : "--$o=$v"; + } elsif ($o =~ s/:[is]\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $c ? "-$c$v" : + ($v eq '' ? "--$o" : "--$o=$v"); + } elsif ($o =~ s/!\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $v ? "--$o" : "--no-$o"; + } elsif ($opt->{$o}) { + push @$cmd, $c ? "-$c" : "--$o"; + } + } +} + sub diff_ctxq ($$) { my ($self, $ctxq) = @_; return unless $ctxq; @@ -103,35 +131,12 @@ EOM waitpid($pid, 0); die "fast-import failed: \$?=$?" if $?; - my @cmd = qw(diff); - my $opt = $lei->{opt}; - push @cmd, '--'.($opt->{color} && !$opt->{'no-color'} ? '' : 'no-'). - 'color'; - for my $o (@PublicInbox::LEI::diff_opt) { - my $c = ''; - # remove single char short option - $o =~ s/\|([a-z0-9])\b//i and $c = $1; - if ($o =~ s/=[is]@\z//) { - my $v = $opt->{$o} or next; - push @cmd, map { $c ? "-$c$_" : "--$o=$_" } @$v; - } elsif ($o =~ s/=[is]\z//) { - my $v = $opt->{$o} // next; - push @cmd, $c ? "-$c$v" : "--$o=$v"; - } elsif ($o =~ s/:[is]\z//) { - my $v = $opt->{$o} // next; - push @cmd, $c ? "-$c$v" : - ($v eq '' ? "--$o" : "--$o=$v"); - } elsif ($o =~ s/!\z//) { - my $v = $opt->{$o} // next; - push @cmd, $v ? "--$o" : "--no-$o"; - } elsif ($opt->{$o}) { - push @cmd, $c ? "-$c" : "--$o"; - } - } - $lei->qerr("# git @cmd"); - push @cmd, qw(A B); - unshift @cmd, 'git', "--git-dir=$rw->{git_dir}"; - $pid = spawn(\@cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }); + my $cmd = [ 'diff' ]; + _lei_diff_prepare($lei, $cmd); + $lei->qerr("# git @$cmd"); + push @$cmd, qw(A B); + unshift @$cmd, 'git', "--git-dir=$rw->{git_dir}"; + $pid = spawn($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }); waitpid($pid, 0); $lei->child_error($?) if $?; # for git diff --exit-code undef;