about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-10-02 11:18:33 +0000
committerEric Wong <e@80x24.org>2021-10-02 20:09:31 +0000
commit64f6a4c97b05a709de60aea9c3a5f51d7a37f226 (patch)
treeb0be5f5383b06540f2f48a98db9c8ab4931423c3 /lib/PublicInbox
parent3e096b699243ea7bd9d12fec8f3048104e462802 (diff)
downloadpublic-inbox-64f6a4c97b05a709de60aea9c3a5f51d7a37f226.tar.gz
This is useful in finding the cause of deduplication bugs,
and possibly the cause of missing threads reported by
Konstantin in <20211001130527.z7eivotlgqbgetzz@meerkat.local>

usage:

  u=https://yhbt.net/lore/all/87czop5j33.fsf@tynnyri.adurom.net/raw
  lei mail-diff $u
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r--lib/PublicInbox/ContentHash.pm6
-rw-r--r--lib/PublicInbox/LEI.pm5
-rw-r--r--lib/PublicInbox/LeiInput.pm6
-rw-r--r--lib/PublicInbox/LeiMailDiff.pm111
-rw-r--r--lib/PublicInbox/LeiRediff.pm63
5 files changed, 159 insertions, 32 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm
index cc4a54c9..f6ae9011 100644
--- a/lib/PublicInbox/ContentHash.pm
+++ b/lib/PublicInbox/ContentHash.pm
@@ -52,9 +52,9 @@ sub content_dig_i {
         $dig->add($s);
 }
 
-sub content_digest ($) {
-        my ($eml) = @_;
-        my $dig = Digest::SHA->new(256);
+sub content_digest ($;$) {
+        my ($eml, $dig) = @_;
+        $dig //= Digest::SHA->new(256);
 
         # References: and In-Reply-To: get used interchangeably
         # in some "duplicates" in LKML.  We treat them the same
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index fd592358..51b0e95e 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -203,6 +203,11 @@ our %CMD = ( # sorted in order of importance/use:
         qw(git-dir=s@ cwd! verbose|v+ color:s no-color drq:1 dequote-only:1),
         @diff_opt, @lxs_opt, @net_opt, @c_opt ],
 
+'mail-diff' => [ '--stdin|LOCATION...', 'diff the contents of emails',
+        'stdin|', # /|\z/ must be first for lone dash
+        qw(verbose|v+ color:s no-color raw-header),
+        @diff_opt, @net_opt, @c_opt ],
+
 'add-external' => [ 'LOCATION',
         'add/set priority of a publicinbox|extindex for extra matches',
         qw(boost=i mirror=s inbox-version=i epoch=s verbose|v+),
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 22bedba6..83479221 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -57,6 +57,12 @@ sub check_input_format ($;$) {
         1;
 }
 
+sub input_mbox_cb { # base MboxReader callback
+        my ($eml, $self) = @_;
+        $eml->header_set($_) for (qw(Status X-Status));
+        $self->input_eml_cb($eml);
+}
+
 # import a single file handle of $name
 # Subclass must define ->input_eml_cb and ->input_mbox_cb
 sub input_fh {
diff --git a/lib/PublicInbox/LeiMailDiff.pm b/lib/PublicInbox/LeiMailDiff.pm
new file mode 100644
index 00000000..a29ae225
--- /dev/null
+++ b/lib/PublicInbox/LeiMailDiff.pm
@@ -0,0 +1,111 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# The "lei mail-diff" sub-command, diffs input contents against
+# the first message of input
+package PublicInbox::LeiMailDiff;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC PublicInbox::LeiInput);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+use PublicInbox::Spawn qw(spawn which);
+use PublicInbox::MsgIter qw(msg_part_text);
+use File::Path qw(remove_tree);
+use PublicInbox::ContentHash qw(content_digest);
+require PublicInbox::LeiRediff;
+use Data::Dumper ();
+
+sub write_part { # Eml->each_part callback
+        my ($ary, $self) = @_;
+        my ($part, $depth, $idx) = @$ary;
+        if ($idx ne '1' || $self->{lei}->{opt}->{'raw-header'}) {
+                open my $fh, '>', "$self->{curdir}/$idx.hdr" or die "open: $!";
+                print $fh ${$part->{hdr}} or die "print $!";
+                close $fh or die "close $!";
+        }
+        my $ct = $part->content_type || 'text/plain';
+        my ($s, $err) = msg_part_text($part, $ct);
+        my $sfx = defined($s) ? 'txt' : 'bin';
+        open my $fh, '>', "$self->{curdir}/$idx.$sfx" or die "open: $!";
+        print $fh ($s // $part->body) or die "print $!";
+        close $fh or die "close $!";
+}
+
+sub dump_eml ($$$) {
+        my ($self, $dir, $eml) = @_;
+        local $self->{curdir} = $dir;
+        mkdir $dir or die "mkdir($dir): $!";
+        $eml->each_part(\&write_part, $self);
+
+        open my $fh, '>', "$dir/content_digest" or die "open: $!";
+        my $dig = PublicInbox::ContentDigestDbg->new($fh);
+        local $Data::Dumper::Useqq = 1;
+        local $Data::Dumper::Terse = 1;
+        content_digest($eml, $dig);
+        print $fh "\n", $dig->hexdigest, "\n" or die "print $!";
+        close $fh or die "close: $!";
+}
+
+sub prep_a ($$) {
+        my ($self, $eml) = @_;
+        $self->{tmp} = File::Temp->newdir('lei-mail-diff-XXXX', TMPDIR => 1);
+        dump_eml($self, "$self->{tmp}/a", $eml);
+}
+
+sub diff_a ($$) {
+        my ($self, $eml) = @_;
+        ++$self->{nr};
+        my $dir = "$self->{tmp}/N$self->{nr}";
+        dump_eml($self, $dir, $eml);
+        my $cmd = [ qw(git diff --no-index) ];
+        my $lei = $self->{lei};
+        PublicInbox::LeiRediff::_lei_diff_prepare($lei, $cmd);
+        push @$cmd, qw(-- a), "N$self->{nr}";
+        my $rdr = { -C => "$self->{tmp}" };
+        @$rdr{1, 2} = @$lei{1, 2};
+        my $pid = spawn($cmd, $lei->{env}, $rdr);
+        waitpid($pid, 0);
+        $lei->child_error($?) if $?; # for git diff --exit-code
+        File::Path::remove_tree($self->{curdir});
+}
+
+sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh
+        my ($self, $eml) = @_;
+        $self->{tmp} ? diff_a($self, $eml) : prep_a($self, $eml);
+}
+
+sub lei_mail_diff {
+        my ($lei, @argv) = @_;
+        $lei->{opt}->{'in-format'} //= 'eml';
+        my $self = bless {}, __PACKAGE__;
+        $self->prepare_inputs($lei, \@argv) or return;
+        my $isatty = -t $lei->{1};
+        $lei->{opt}->{color} //= $isatty;
+        $lei->start_pager if $isatty;
+        my $ops = {};
+        $lei->{auth}->op_merge($ops, $self) if $lei->{auth};
+        (my $op_c, $ops) = $lei->workers_start($self, 1, $ops);
+        $lei->{wq1} = $self;
+        $lei->{-err_type} = 'non-fatal';
+        net_merge_all_done($self) unless $lei->{auth};
+        $lei->wait_wq_events($op_c, $ops);
+}
+
+no warnings 'once';
+*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done;
+
+package PublicInbox::ContentDigestDbg;
+use strict;
+use v5.10.1;
+use Data::Dumper;
+
+sub new { bless { dig => Digest::SHA->new(256), fh => $_[1] }, __PACKAGE__ }
+
+sub add {
+        $_[0]->{dig}->add($_[1]);
+        print { $_[0]->{fh} } Dumper($_[1]) or die "print $!";
+}
+
+sub hexdigest { $_[0]->{dig}->hexdigest; }
+
+1;
diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm
index 1e95e55a..decb721b 100644
--- a/lib/PublicInbox/LeiRediff.pm
+++ b/lib/PublicInbox/LeiRediff.pm
@@ -56,6 +56,34 @@ sub solve_1 ($$$) {
         $self->{blob}->{$oid_want}; # full OID
 }
 
+sub _lei_diff_prepare ($$) {
+        my ($lei, $cmd) = @_;
+        my $opt = $lei->{opt};
+        push @$cmd, '--'.($opt->{color} && !$opt->{'no-color'} ? '' : 'no-').
+                        'color';
+        for my $o (@PublicInbox::LEI::diff_opt) {
+                my $c = '';
+                # remove single char short option
+                $o =~ s/\|([a-z0-9])\b//i and $c = $1;
+                if ($o =~ s/=[is]@\z//) {
+                        my $v = $opt->{$o} or next;
+                        push @$cmd, map { $c ? "-$c$_" : "--$o=$_" } @$v;
+                } elsif ($o =~ s/=[is]\z//) {
+                        my $v = $opt->{$o} // next;
+                        push @$cmd, $c ? "-$c$v" : "--$o=$v";
+                } elsif ($o =~ s/:[is]\z//) {
+                        my $v = $opt->{$o} // next;
+                        push @$cmd, $c ? "-$c$v" :
+                                        ($v eq '' ? "--$o" : "--$o=$v");
+                } elsif ($o =~ s/!\z//) {
+                        my $v = $opt->{$o} // next;
+                        push @$cmd, $v ? "--$o" : "--no-$o";
+                } elsif ($opt->{$o}) {
+                        push @$cmd, $c ? "-$c" : "--$o";
+                }
+        }
+}
+
 sub diff_ctxq ($$) {
         my ($self, $ctxq) = @_;
         return unless $ctxq;
@@ -103,35 +131,12 @@ EOM
         waitpid($pid, 0);
         die "fast-import failed: \$?=$?" if $?;
 
-        my @cmd = qw(diff);
-        my $opt = $lei->{opt};
-        push @cmd, '--'.($opt->{color} && !$opt->{'no-color'} ? '' : 'no-').
-                        'color';
-        for my $o (@PublicInbox::LEI::diff_opt) {
-                my $c = '';
-                # remove single char short option
-                $o =~ s/\|([a-z0-9])\b//i and $c = $1;
-                if ($o =~ s/=[is]@\z//) {
-                        my $v = $opt->{$o} or next;
-                        push @cmd, map { $c ? "-$c$_" : "--$o=$_" } @$v;
-                } elsif ($o =~ s/=[is]\z//) {
-                        my $v = $opt->{$o} // next;
-                        push @cmd, $c ? "-$c$v" : "--$o=$v";
-                } elsif ($o =~ s/:[is]\z//) {
-                        my $v = $opt->{$o} // next;
-                        push @cmd, $c ? "-$c$v" :
-                                        ($v eq '' ? "--$o" : "--$o=$v");
-                } elsif ($o =~ s/!\z//) {
-                        my $v = $opt->{$o} // next;
-                        push @cmd, $v ? "--$o" : "--no-$o";
-                } elsif ($opt->{$o}) {
-                        push @cmd, $c ? "-$c" : "--$o";
-                }
-        }
-        $lei->qerr("# git @cmd");
-        push @cmd, qw(A B);
-        unshift @cmd, 'git', "--git-dir=$rw->{git_dir}";
-        $pid = spawn(\@cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} });
+        my $cmd = [ 'diff' ];
+        _lei_diff_prepare($lei, $cmd);
+        $lei->qerr("# git @$cmd");
+        push @$cmd, qw(A B);
+        unshift @$cmd, 'git', "--git-dir=$rw->{git_dir}";
+        $pid = spawn($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} });
         waitpid($pid, 0);
         $lei->child_error($?) if $?; # for git diff --exit-code
         undef;