about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-10 05:13:19 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-12 17:25:13 +0000
commit 5b491c0b308e576897cfcc270647ba4e35c6cc8a (patch)
tree 37ae103a0931c3fb99456f5e375f69e3716d7ee5
parent d8bc86742a146f7665f0548cf855c2b2153617e0 (diff)
download public-inbox-5b491c0b308e576897cfcc270647ba4e35c6cc8a.tar.gz
Call order will need to change a bit since this is going to be
tied to Xapian
-rw-r--r-- MANIFEST 1
-rw-r--r-- lib/PublicInbox/ContentId.pm 30
-rw-r--r-- lib/PublicInbox/Import.pm 74
3 files changed, 91 insertions, 14 deletions
diff --git a/MANIFEST b/MANIFEST
index 5074d8dc..85e85031 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -46,6 +46,7 @@ examples/varnish-4.vcl
 lib/PublicInbox/Address.pm
 lib/PublicInbox/AltId.pm
 lib/PublicInbox/Config.pm
+lib/PublicInbox/ContentId.pm
 lib/PublicInbox/Daemon.pm
 lib/PublicInbox/Emergency.pm
 lib/PublicInbox/EvCleanup.pm
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
new file mode 100644
index 00000000..65d5a76c
--- /dev/null
+++ b/lib/PublicInbox/ContentId.pm
@@ -0,0 +1,30 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+package PublicInbox::ContentId;
+use strict;
+use warnings;
+use base qw/Exporter/;
+our @EXPORT_OK = qw/content_id/;
+
+# not sure if less-widely supported hash families are worth bothering with
+use Digest::SHA;
+
+# Content-* headers are often no-ops, so maybe we don't need them
+my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To);
+
+# Returns "SHA-<alg>:<hex>" over the identifying headers and raw body of $mime.
+sub content_id ($;$) {
+        my ($mime, $alg) = @_; # $alg: Digest::SHA algorithm, defaults to 256
+        my $dig = Digest::SHA->new($alg || 256);
+        my $hdr = $mime->header_obj;
+
+        # name and NUL-terminate each value so adjacent fields cannot run together
+        foreach my $h (@ID_HEADERS) {
+                $dig->add("$h\0$_\0") foreach $hdr->header_raw($h);
+        }
+        $dig->add($mime->body_raw);
+        'SHA-' . $dig->algorithm . ':' . $dig->hexdigest;
+}
+
+1;
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 56633a84..b8e9dd03 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -11,6 +11,7 @@ use Fcntl qw(:flock :DEFAULT);
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::MID qw(mid_mime mid2path);
 use PublicInbox::Address;
+use PublicInbox::ContentId qw(content_id);
 
 sub new {
         my ($class, $git, $name, $email, $ibx) = @_;
@@ -26,6 +27,7 @@ sub new {
                 mark => 1,
                 ref => $ref,
                 inbox => $ibx,
+                path_type => '2/38', # or 'v2'
                 ssoma_lock => 1, # disable for v2
         }, $class
 }
@@ -88,6 +90,7 @@ sub norm_body ($) {
         $b
 }
 
+# only used for v1 (ssoma) inboxes
 sub _check_path ($$$$) {
         my ($r, $w, $tip, $path) = @_;
         return if $tip eq '';
@@ -97,17 +100,9 @@ sub _check_path ($$$$) {
         $info =~ /\Amissing / ? undef : $info;
 }
 
-# returns undef on non-existent
-# ('MISMATCH', msg) on mismatch
-# (:MARK, msg) on success
-sub remove {
-        my ($self, $mime, $msg) = @_; # mime = Email::MIME
-
-        my $mid = mid_mime($mime);
-        my $path = mid2path($mid);
+sub check_remove_v1 {
+        my ($r, $w, $tip, $path, $mime) = @_;
 
-        my ($r, $w) = $self->gfi_start;
-        my $tip = $self->{tip};
         my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef);
         $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info";
         my $blob = $1;
@@ -140,6 +135,34 @@ sub remove {
         if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) {
                 return ('MISMATCH', $cur);
         }
+        (undef, $cur);
+}
+
+# returns ('MISSING', undef) on non-existent
+# ('MISMATCH', msg) on mismatch
+# (:MARK, msg) on success
+#
+# For v2 inboxes, the content_id is returned instead of the msg
+# v2 callers should check with Xapian before calling this as
+# it is not idempotent.
+sub remove {
+        my ($self, $mime, $msg) = @_; # mime = Email::MIME
+
+        my $path_type = $self->{path_type};
+        my ($path, $err, $cur, $blob);
+
+        my ($r, $w) = $self->gfi_start;
+        my $tip = $self->{tip};
+        if ($path_type eq '2/38') {
+                $path = mid2path(mid_mime($mime));
+                ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime);
+                return ($err, $cur) if $err;
+        } else {
+                $cur = content_id($mime);
+                my $len = length($cur);
+                $blob = $self->{mark}++;
+                print $w "blob\nmark :$blob\ndata $len\n$cur\n" or wfail;
+        }
 
         my $ref = $self->{ref};
         my $commit = $self->{mark}++;
@@ -156,7 +179,11 @@ sub remove {
                 "committer $ident $now\n",
                 "data $len\n$msg\n\n",
                 'from ', ($parent ? $parent : $tip), "\n" or wfail;
-        print $w "D $path\n\n" or wfail;
+        if (defined $path) {
+                print $w "D $path\n\n" or wfail;
+        } else {
+                print $w "M 100644 :$blob d\n\n" or wfail;
+        }
         $self->{nchg}++;
         (($self->{tip} = ":$commit"), $cur);
 }
@@ -177,15 +204,25 @@ sub add {
         my $date = $mime->header('Date');
         my $subject = $mime->header('Subject');
         $subject = '(no subject)' unless defined $subject;
-        my $mid = mid_mime($mime);
-        my $path = mid2path($mid);
+        my $path_type = $self->{path_type};
+
+        my $path;
+        if ($path_type eq '2/38') {
+                $path = mid2path(mid_mime($mime));
+        } else { # v2 layout, one file:
+                $path = 'm';
+        }
 
         my ($r, $w) = $self->gfi_start;
         my $tip = $self->{tip};
-        _check_path($r, $w, $tip, $path) and return;
+        if ($path_type eq '2/38') {
+                _check_path($r, $w, $tip, $path) and return;
+        }
 
         # kill potentially confusing/misleading headers
         $mime->header_set($_) for qw(bytes lines content-length status);
+
+        # spam check:
         if ($check_cb) {
                 $mime = $check_cb->($mime) or return;
         }
@@ -194,6 +231,15 @@ sub add {
         my $blob = $self->{mark}++;
         print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail;
         print $w $mime, "\n" or wfail;
+
+        # v2: we need this for Xapian
+        if ($self->{want_object_id}) {
+                print $w "get-mark :$blob\n" or wfail;
+                defined(my $object_id = <$r>) or
+                                die "get-mark failed, need git 2.6.0+\n";
+                chomp($self->{last_object_id} = $object_id);
+        }
+
         my $ref = $self->{ref};
         my $commit = $self->{mark}++;
         my $parent = $tip =~ /\A:/ ? $tip : undef;