user/dev discussion of public-inbox itself
 help / color / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [WIP 06/17] import: initial handling for v2
Date: Thu, 15 Feb 2018 11:08:29 +0000
Message-ID: <20180215110840.30413-7-e@80x24.org> (raw)
In-Reply-To: <20180215110840.30413-1-e@80x24.org>

Call order will need to change a bit since this is going to be
tied to Xapian.
---
 MANIFEST                     |  1 +
 lib/PublicInbox/ContentId.pm | 30 ++++++++++++++++++
 lib/PublicInbox/Import.pm    | 74 +++++++++++++++++++++++++++++++++++---------
 3 files changed, 91 insertions(+), 14 deletions(-)
 create mode 100644 lib/PublicInbox/ContentId.pm

diff --git a/MANIFEST b/MANIFEST
index 5074d8d..85e8503 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -46,6 +46,7 @@ examples/varnish-4.vcl
 lib/PublicInbox/Address.pm
 lib/PublicInbox/AltId.pm
 lib/PublicInbox/Config.pm
+lib/PublicInbox/ContentId.pm
 lib/PublicInbox/Daemon.pm
 lib/PublicInbox/Emergency.pm
 lib/PublicInbox/EvCleanup.pm
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
new file mode 100644
index 0000000..65d5a76
--- /dev/null
+++ b/lib/PublicInbox/ContentId.pm
@@ -0,0 +1,30 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+package PublicInbox::ContentId;
+use strict;
+use warnings;
+use base qw/Exporter/;
+our @EXPORT_OK = qw/content_id/;
+
+# not sure if less-widely supported hash families are worth bothering with
+use Digest::SHA;
+
+# Content-* headers are often no-ops, so maybe we don't need them
+my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To);
+
+sub content_id ($;$) {
+	my ($mime, $alg) = @_;
+	$alg ||= 256;
+	my $dig = Digest::SHA->new($alg);
+	my $hdr = $mime->header_obj;
+
+	foreach my $h (@ID_HEADERS) {
+		my @v = $hdr->header_raw($h);
+		$dig->add($_) foreach @v;
+	}
+	$dig->add($mime->body_raw);
+	'SHA-' . $dig->algorithm . ':' . $dig->hexdigest;
+}
+
+1;
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 56633a8..b8e9dd0 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -11,6 +11,7 @@ use Fcntl qw(:flock :DEFAULT);
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::MID qw(mid_mime mid2path);
 use PublicInbox::Address;
+use PublicInbox::ContentId qw(content_id);
 
 sub new {
 	my ($class, $git, $name, $email, $ibx) = @_;
@@ -26,6 +27,7 @@ sub new {
 		mark => 1,
 		ref => $ref,
 		inbox => $ibx,
+		path_type => '2/38', # or 'v2'
 		ssoma_lock => 1, # disable for v2
 	}, $class
 }
@@ -88,6 +90,7 @@ sub norm_body ($) {
 	$b
 }
 
+# only used for v1 (ssoma) inboxes
 sub _check_path ($$$$) {
 	my ($r, $w, $tip, $path) = @_;
 	return if $tip eq '';
@@ -97,17 +100,9 @@ sub _check_path ($$$$) {
 	$info =~ /\Amissing / ? undef : $info;
 }
 
-# returns undef on non-existent
-# ('MISMATCH', msg) on mismatch
-# (:MARK, msg) on success
-sub remove {
-	my ($self, $mime, $msg) = @_; # mime = Email::MIME
-
-	my $mid = mid_mime($mime);
-	my $path = mid2path($mid);
+sub check_remove_v1 {
+	my ($r, $w, $tip, $path, $mime) = @_;
 
-	my ($r, $w) = $self->gfi_start;
-	my $tip = $self->{tip};
 	my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef);
 	$info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info";
 	my $blob = $1;
@@ -140,6 +135,34 @@ sub remove {
 	if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) {
 		return ('MISMATCH', $cur);
 	}
+	(undef, $cur);
+}
+
+# returns undef on non-existent
+# ('MISMATCH', msg) on mismatch
+# (:MARK, msg) on success
+#
+# For v2 inboxes, the content_id is returned instead of the msg
+# v2 callers should check with Xapian before calling this as
+# it is not idempotent.
+sub remove {
+	my ($self, $mime, $msg) = @_; # mime = Email::MIME
+
+	my $path_type = $self->{path_type};
+	my ($path, $err, $cur, $blob);
+
+	my ($r, $w) = $self->gfi_start;
+	my $tip = $self->{tip};
+	if ($path_type eq '2/38') {
+		$path = mid2path(mid_mime($mime));
+		($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime);
+		return ($err, $cur) if $err;
+	} else {
+		$cur = content_id($mime);
+		my $len = length($cur);
+		$blob = $self->{mark}++;
+		print $w "blob\nmark :$blob\ndata $len\n$cur\n" or wfail;
+	}
 
 	my $ref = $self->{ref};
 	my $commit = $self->{mark}++;
@@ -156,7 +179,11 @@ sub remove {
 		"committer $ident $now\n",
 		"data $len\n$msg\n\n",
 		'from ', ($parent ? $parent : $tip), "\n" or wfail;
-	print $w "D $path\n\n" or wfail;
+	if (defined $path) {
+		print $w "D $path\n\n" or wfail;
+	} else {
+		print $w "M 100644 :$blob d\n\n" or wfail;
+	}
 	$self->{nchg}++;
 	(($self->{tip} = ":$commit"), $cur);
 }
@@ -177,15 +204,25 @@ sub add {
 	my $date = $mime->header('Date');
 	my $subject = $mime->header('Subject');
 	$subject = '(no subject)' unless defined $subject;
-	my $mid = mid_mime($mime);
-	my $path = mid2path($mid);
+	my $path_type = $self->{path_type};
+
+	my $path;
+	if ($path_type eq '2/38') {
+		$path = mid2path(mid_mime($mime));
+	} else { # v2 layout, one file:
+		$path = 'm';
+	}
 
 	my ($r, $w) = $self->gfi_start;
 	my $tip = $self->{tip};
-	_check_path($r, $w, $tip, $path) and return;
+	if ($path_type eq '2/38') {
+		_check_path($r, $w, $tip, $path) and return;
+	}
 
 	# kill potentially confusing/misleading headers
 	$mime->header_set($_) for qw(bytes lines content-length status);
+
+	# spam check:
 	if ($check_cb) {
 		$mime = $check_cb->($mime) or return;
 	}
@@ -194,6 +231,15 @@ sub add {
 	my $blob = $self->{mark}++;
 	print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail;
 	print $w $mime, "\n" or wfail;
+
+	# v2: we need this for Xapian
+	if ($self->{want_object_id}) {
+		print $w "get-mark :$blob\n" or wfail;
+		defined(my $object_id = <$r>) or
+				die "get-mark failed, need git 2.6.0+\n";
+		chomp($self->{last_object_id} = $object_id);
+	}
+
 	my $ref = $self->{ref};
 	my $commit = $self->{mark}++;
 	my $parent = $tip =~ /\A:/ ? $tip : undef;
-- 
EW


  parent reply index

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-09 20:51 [v2] one file to rule them all? Eric Wong
2018-02-15 10:55 ` Eric Wong
2018-02-15 11:08   ` [WIP 0/17] initial v2 work based on one-file tree Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 01/17] AUTHORS: add The Linux Foundation Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 02/17] watch_maildir: allow '-' in mail filename Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 03/17] scripts/import_vger_from_mbox: relax From_ line match slightly Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 04/17] import: stop writing legacy ssoma.index by default Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 05/17] import: begin supporting this without ssoma.lock Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-02-15 11:08     ` [WIP 07/17] t/import: test for last_object_id insertion Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 08/17] content_id: add test case Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 09/17] searchmsg: add mid_mime import for _extract_mid Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 10/17] scripts/import_vger_from_mbox: support --dry-run option Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 11/17] import: APIs to support v2 use Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 12/17] search: free up 'Q' prefix for a real unique identifier Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:08       ` Eric Wong
2018-02-15 11:08     ` [WIP 13/17] searchidx: fix comment around next_thread_id Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 14/17] address: extract more characters from email addresses Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 15/17] import: pass "raw" dates to git-fast-import(1) Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 16/17] scripts/import_vger_from_mbox: use v2 layout for import Eric Wong (Contractor, The Linux Foundation)
2018-02-15 11:08     ` [WIP 17/17] import: quiet down warnings from bogus From: lines Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180215110840.30413-7-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox