From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5AECD1FAE8 for ; Thu, 15 Feb 2018 11:08:45 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [WIP 06/17] import: initial handling for v2 Date: Thu, 15 Feb 2018 11:08:29 +0000 Message-Id: <20180215110840.30413-7-e@80x24.org> In-Reply-To: <20180215110840.30413-1-e@80x24.org> References: <20180215105509.GA22409@dcvr> <20180215110840.30413-1-e@80x24.org> List-Id: Call order will need to change a bit since this is going to be tied to Xapian --- MANIFEST | 1 + lib/PublicInbox/ContentId.pm | 30 ++++++++++++++++++ lib/PublicInbox/Import.pm | 74 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 lib/PublicInbox/ContentId.pm diff --git a/MANIFEST b/MANIFEST index 5074d8d..85e8503 100644 --- a/MANIFEST +++ b/MANIFEST @@ -46,6 +46,7 @@ examples/varnish-4.vcl lib/PublicInbox/Address.pm lib/PublicInbox/AltId.pm lib/PublicInbox/Config.pm +lib/PublicInbox/ContentId.pm lib/PublicInbox/Daemon.pm lib/PublicInbox/Emergency.pm lib/PublicInbox/EvCleanup.pm diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm new file mode 100644 index 0000000..65d5a76 --- /dev/null +++ b/lib/PublicInbox/ContentId.pm @@ -0,0 +1,30 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +package PublicInbox::ContentId; +use strict; +use warnings; +use base qw/Exporter/; +our @EXPORT_OK = qw/content_id/; + +# not sure if less-widely supported hash families are worth bothering with +use Digest::SHA; + +# Content-* headers are often no-ops, so maybe we don't need them +my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To); + +sub content_id ($;$) { + my ($mime, $alg) = @_; + $alg ||= 256; + my $dig = Digest::SHA->new($alg); + my $hdr = $mime->header_obj; + + foreach my $h (@ID_HEADERS) { + my @v = $hdr->header_raw($h); + $dig->add($_) foreach @v; + } + $dig->add($mime->body_raw); + 'SHA-' . $dig->algorithm . ':' . $dig->hexdigest; +} + +1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 56633a8..b8e9dd0 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -11,6 +11,7 @@ use Fcntl qw(:flock :DEFAULT); use PublicInbox::Spawn qw(spawn); use PublicInbox::MID qw(mid_mime mid2path); use PublicInbox::Address; +use PublicInbox::ContentId qw(content_id); sub new { my ($class, $git, $name, $email, $ibx) = @_; @@ -26,6 +27,7 @@ sub new { mark => 1, ref => $ref, inbox => $ibx, + path_type => '2/38', # or 'v2' ssoma_lock => 1, # disable for v2 }, $class } @@ -88,6 +90,7 @@ sub norm_body ($) { $b } +# only used for v1 (ssoma) inboxes sub _check_path ($$$$) { my ($r, $w, $tip, $path) = @_; return if $tip eq ''; @@ -97,17 +100,9 @@ sub _check_path ($$$$) { $info =~ /\Amissing / ? undef : $info; } -# returns undef on non-existent -# ('MISMATCH', msg) on mismatch -# (:MARK, msg) on success -sub remove { - my ($self, $mime, $msg) = @_; # mime = Email::MIME - - my $mid = mid_mime($mime); - my $path = mid2path($mid); +sub check_remove_v1 { + my ($r, $w, $tip, $path, $mime) = @_; - my ($r, $w) = $self->gfi_start; - my $tip = $self->{tip}; my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info"; my $blob = $1; @@ -140,6 +135,34 @@ sub remove { if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { return ('MISMATCH', $cur); } + (undef, $cur); +} + +# returns undef on non-existent +# ('MISMATCH', msg) on mismatch +# (:MARK, msg) on success +# +# For v2 inboxes, the content_id is returned instead of the msg +# v2 callers should check with Xapian before calling this as +# it is not idempotent. +sub remove { + my ($self, $mime, $msg) = @_; # mime = Email::MIME + + my $path_type = $self->{path_type}; + my ($path, $err, $cur, $blob); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + if ($path_type eq '2/38') { + $path = mid2path(mid_mime($mime)); + ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); + return ($err, $cur) if $err; + } else { + $cur = content_id($mime); + my $len = length($cur); + $blob = $self->{mark}++; + print $w "blob\nmark :$blob\ndata $len\n$cur\n" or wfail; + } my $ref = $self->{ref}; my $commit = $self->{mark}++; @@ -156,7 +179,11 @@ sub remove { "committer $ident $now\n", "data $len\n$msg\n\n", 'from ', ($parent ? $parent : $tip), "\n" or wfail; - print $w "D $path\n\n" or wfail; + if (defined $path) { + print $w "D $path\n\n" or wfail; + } else { + print $w "M 100644 :$blob d\n\n" or wfail; + } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); } @@ -177,15 +204,25 @@ sub add { my $date = $mime->header('Date'); my $subject = $mime->header('Subject'); $subject = '(no subject)' unless defined $subject; - my $mid = mid_mime($mime); - my $path = mid2path($mid); + my $path_type = $self->{path_type}; + + my $path; + if ($path_type eq '2/38') { + $path = mid2path(mid_mime($mime)); + } else { # v2 layout, one file: + $path = 'm'; + } my ($r, $w) = $self->gfi_start; my $tip = $self->{tip}; - _check_path($r, $w, $tip, $path) and return; + if ($path_type eq '2/38') { + _check_path($r, $w, $tip, $path) and return; + } # kill potentially confusing/misleading headers $mime->header_set($_) for qw(bytes lines content-length status); + + # spam check: if ($check_cb) { $mime = $check_cb->($mime) or return; } @@ -194,6 +231,15 @@ sub add { my $blob = $self->{mark}++; print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail; print $w $mime, "\n" or wfail; + + # v2: we need this for Xapian + if ($self->{want_object_id}) { + print $w "get-mark :$blob\n" or wfail; + defined(my $object_id = <$r>) or + die "get-mark failed, need git 2.6.0+\n"; + chomp($self->{last_object_id} = $object_id); + } + my $ref = $self->{ref}; my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; -- EW