user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 6/8] lei export-kw: new command to export keywords to Maildirs
Date: Fri, 21 May 2021 10:28:30 +0000	[thread overview]
Message-ID: <20210521102832.10784-7-e@80x24.org> (raw)
In-Reply-To: <20210521102832.10784-1-e@80x24.org>

IMAP will eventually be supported.
---
 MANIFEST                       |   2 +
 lib/PublicInbox/LEI.pm         |   4 +
 lib/PublicInbox/LeiExportKw.pm | 180 +++++++++++++++++++++++++++++++++
 lib/PublicInbox/LeiMailSync.pm |  10 ++
 lib/PublicInbox/LeiSearch.pm   |  14 +++
 lib/PublicInbox/LeiToMail.pm   |   8 +-
 lib/PublicInbox/MdirReader.pm  |  14 +++
 t/lei-export-kw.t              |  35 +++++++
 t/mdir_reader.t                |   5 +
 9 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 lib/PublicInbox/LeiExportKw.pm
 create mode 100644 t/lei-export-kw.t

diff --git a/MANIFEST b/MANIFEST
index 684128aa..2d1ad5c3 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -202,6 +202,7 @@ lib/PublicInbox/LeiConvert.pm
 lib/PublicInbox/LeiCurl.pm
 lib/PublicInbox/LeiDedupe.pm
 lib/PublicInbox/LeiEditSearch.pm
+lib/PublicInbox/LeiExportKw.pm
 lib/PublicInbox/LeiExternal.pm
 lib/PublicInbox/LeiForgetSearch.pm
 lib/PublicInbox/LeiHelp.pm
@@ -408,6 +409,7 @@ t/iso-2202-jp.eml
 t/kqnotify.t
 t/lei-convert.t
 t/lei-daemon.t
+t/lei-export-kw.t
 t/lei-externals.t
 t/lei-import-http.t
 t/lei-import-imap.t
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 15680fe3..628908b5 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -243,6 +243,10 @@ our %CMD = ( # sorted in order of importance/use:
 	qw(stdin| offset=i recursive|r exclude=s include|I=s
 	lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync!),
 	qw(no-torsocks torsocks=s), PublicInbox::LeiQuery::curl_opt(), @c_opt ],
+
+'export-kw' => [ 'LOCATION...|--all',
+	'one-time export of keywords of sync sources',
+	qw(all:s mode=s), @c_opt ],
 'convert' => [ 'LOCATION...|--stdin',
 	'one-time conversion from URL or filesystem to another format',
 	qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw!),
diff --git a/lib/PublicInbox/LeiExportKw.pm b/lib/PublicInbox/LeiExportKw.pm
new file mode 100644
index 00000000..db4f7441
--- /dev/null
+++ b/lib/PublicInbox/LeiExportKw.pm
@@ -0,0 +1,180 @@
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# front-end for the "lei export-kw" sub-command
+package PublicInbox::LeiExportKw;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC PublicInbox::LeiInput);
+use Errno qw(EEXIST ENOENT);
+
+# moves one Maildir file so its ":2," flags match the keywords in lei/store
+sub export_kw_md { # LeiMailSync->each_src callback
+	my ($oidbin, $id, $self, $mdir) = @_;
+	my $oidhex = unpack('H*', $oidbin);
+	my $sto_kw = $self->{lse}->oid_keywords($oidhex) or return;
+	my $bn = $$id;
+	my ($md_kw, $unknown, @try);
+	if ($bn =~ s/:2,([a-zA-Z]*)\z//) {
+		($md_kw, $unknown) = PublicInbox::MdirReader::flags2kw($1);
+		@try = qw(cur new);
+	} else {
+		$unknown = [];
+		@try = qw(new cur);
+	}
+	if ($self->{-merge_kw} && $md_kw) { # merging keywords is the default
+		@$sto_kw{keys %$md_kw} = values(%$md_kw);
+	}
+	$bn .= ':2,'.
+		PublicInbox::LeiToMail::kw2suffix([keys %$sto_kw], @$unknown);
+	my $dst = "$mdir/cur/$bn";
+	my (@fail, $e);
+	for my $d (@try) {
+		my $src = "$mdir/$d/$$id";
+		return if $src eq $dst && -e $dst; # already named correctly
+		next if $src eq $dst;
+		# we use link(2) + unlink(2) since rename(2) may
+		# inadvertently clobber if the "uniquefilename" part wasn't
+		# actually unique.
+		if (link($src, $dst)) { # success
+			# unlink(2) may ENOENT from parallel invocation,
+			# ignore it, but not other serious errors
+			if (!unlink($src) and $! != ENOENT) {
+				$self->{lei}->child_error(1,
+							"E: unlink($src): $!");
+			}
+			$self->{lms}->mv_src("maildir:$mdir",
+						$oidbin, $id, $bn) or die;
+			return; # success anyways if link(2) worked
+		}
+		next if $! == EEXIST || ($! == ENOENT && !-e $src);
+		push @fail, $src;
+		$e = "$!"; # remember the errno of this failed link(2)
+	}
+	if (@fail) { # a real error, report with the errno saved above
+		my $orig = '['.join('|', @fail).']';
+		$self->{lei}->child_error(1, "link($orig, $dst) ($oidhex): $e");
+	} elsif (!grep { -e "$mdir/$_/$$id" } @try) { # gone from cur and new
+		$self->{lms}->clear_src("maildir:$mdir", $id);
+	}
+}
+
+# overrides PublicInbox::LeiInput::input_path_url; exports one folder
+sub input_path_url {
+	my ($self, $input, @args) = @_;
+	my $lms = $self->{lms} //= $self->{lse}->lms; # cached in $self
+	$lms->lms_begin;
+	if ($input =~ s/\Amaildir://i) { # $input is now the bare Maildir path
+		require PublicInbox::LeiToMail; # kw2suffix
+		$lms->each_src("maildir:$input", \&export_kw_md, $self, $input);
+	} # other input types are not handled here
+	$lms->lms_commit;
+}
+
+sub lei_export_kw { # "lei export-kw" entry: pick folders, start workers
+	my ($lei, @folders) = @_;
+	my $sto = $lei->_lei_store or return $lei->fail(<<EOM);
+lei/store uninitialized, see lei-import(1)
+EOM
+	my $lse = $sto->search;
+	my $lms = $lse->lms or return $lei->fail(<<EOM);
+lei mail_sync uninitialized, see lei-import(1)
+EOM
+	my $opt = $lei->{opt};
+	my $all = $opt->{all};
+	my @all = $lms->folders; # folders listed by mail_sync
+	if (defined $all) { # --all=<local|remote>
+		my %x = map { $_ => $_ } split(/,/, $all);
+		my @ok = grep(defined, delete(@x{qw(local remote), ''}));
+		push(@ok, '') if $all eq ''; # split() yields no fields for ''
+		my @no = keys %x; # anything left over is not a valid choice
+		if (@no) {
+			@no = (join(',', @no));
+			return $lei->fail(<<EOM);
+--all=@no not accepted (must be `local' and/or `remote')
+EOM
+		}
+		my (%seen, @inc);
+		for my $ok (@ok) {
+			if ($ok eq 'local') { # no URL scheme
+				@inc = grep(!m!\A[a-z0-9\+]+://!i, @all);
+			} elsif ($ok eq 'remote') { # has a URL scheme
+				@inc = grep(m!\A[a-z0-9\+]+://!i, @all);
+			} elsif ($ok ne '') {
+				return $lei->fail("--all=$all not understood");
+			} else { # bare --all
+				@inc = @all;
+			}
+			for (@inc) {
+				push(@folders, $_) unless $seen{$_}++;
+			}
+		}
+		return $lei->fail(<<EOM) if !@folders;
+no --mail-sync folders known to lei
+EOM
+	} else {
+		my %all = map { $_ => 1 } @all;
+		my @no;
+		for (@folders) {
+			next if $all{$_}; # ok
+			if (-d "$_/new" && -d "$_/cur") { # looks like a Maildir
+				my $d = 'maildir:'.$lei->rel2abs($_);
+				push(@no, $_) unless $all{$d};
+				$_ = $d; # rewrites the @folders element in place
+			} else {
+				push @no, $_;
+			}
+		}
+		my $no = join("\n\t", @no);
+		return $lei->fail(<<EOF) if @no;
+No sync information for: $no
+Run `lei ls-mail-sync' to display valid choices
+EOF
+	}
+	my $self = bless { lse => $lse }, __PACKAGE__;
+	$lei->{opt}->{'mail-sync'} = 1; # for prepare_inputs
+	$self->prepare_inputs($lei, \@folders) or return;
+	my $j = $opt->{jobs} // scalar(@{$self->{inputs}}) || 1;
+	if (my @ro = grep(!/\A(?:maildir|imaps?):/, @folders)) {
+		return $lei->fail("cannot export to read-only folders: @ro");
+	}
+	if (my $net = $lei->{net}) {
+		require PublicInbox::NetWriter;
+		bless $net, 'PublicInbox::NetWriter';
+	}
+	undef $lms; # NOTE(review): presumably released before workers start
+	my $m = $opt->{mode} // 'merge';
+	if ($m eq 'merge') { # default
+		$self->{-merge_kw} = 1;
+	} elsif ($m ne 'set') { # `set' needs no extra state
+		return $lei->fail(<<EOM);
+--mode=$m not supported (`set' or `merge')
+EOM
+	}
+	my $ops = {};
+	$lei->{auth}->op_merge($ops, $self) if $lei->{auth};
+	$self->{-wq_nr_workers} = $j // 1; # locked
+	(my $op_c, $ops) = $lei->workers_start($self, $j, $ops);
+	$lei->{wq1} = $self;
+	$lei->{-err_type} = 'non-fatal';
+	net_merge_all_done($self) unless $lei->{auth};
+	$op_c->op_wait_event($ops); # calls net_merge_all_done if $lei->{auth}
+}
+
+sub _complete_export_kw { # completion candidates from known sync folders
+	my ($lei, @argv) = @_;
+	my $store = $lei->_lei_store or return;
+	my $sync = $store->search->lms or return;
+	my $filter = $lei->complete_url_prepare(\@argv);
+	return map { $filter->($_) } $sync->folders;
+}
+
+no warnings 'once';
+
+*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child;
+*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done;
+
+# the following works even when LeiAuth is lazy-loaded
+*net_merge_all = \&PublicInbox::LeiAuth::net_merge_all;
+
+1;
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index 3bada42d..32e17c65 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -138,6 +138,16 @@ DELETE FROM blob2num WHERE fid = ? AND uid = ?
 	$sth->execute($fid, $id);
 }
 
+# Maildir-only: points the blob2name row for ($folder, $oidbin, $$id) at $newbn
+sub mv_src {
+	my ($self, $folder, $oidbin, $id, $newbn) = @_;
+	my $fid = $self->{fmap}->{$folder} //= _fid_for($self, $folder, 1);
+	my $sth = $self->{dbh}->prepare_cached(<<'');
+UPDATE blob2name SET name = ? WHERE fid = ? AND oidbin = ? AND name = ?
+
+	$sth->execute($newbn, $fid, $oidbin, $$id); # result is the return value
+}
+
 # read-only, iterates every oidbin + UID or name for a given folder
 sub each_src {
 	my ($self, $folder, $cb, @args) = @_;
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index fb19229f..9297d060 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -27,6 +27,20 @@ sub msg_keywords {
 	wantarray ? sort(keys(%$kw)) : $kw;
 }
 
+# returns a hashref of the `K' terms found for $oidhex;
+# returns undef (empty list in list context) if the blob is unknown
+sub oid_keywords {
+	my ($self, $oidhex) = @_;
+	my @num = $self->over->blob_exists($oidhex) or return;
+	my $xdb = $self->xdb; # set {nshard};
+	my %kw;
+	for my $n (@num) { # there should only be one...
+		my $doc = $xdb->get_document(num2docid($self, $n));
+		%kw = (%kw, %{xap_terms('K', $doc)});
+	}
+	\%kw;
+}
+
 # lookup keywords+labels for external messages
 sub xsmsg_vmd {
 	my ($self, $smsg, $want_label) = @_;
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 0cbdff8b..96a1f881 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -243,10 +243,14 @@ sub _rand () {
 	sprintf('%x,%x,%x,%x', rand(0xffffffff), time, $$, ++$seq);
 }
 
+sub kw2suffix ($;@) { # (\@keywords, @raw_flag_chars) => sorted flag string
+	my $kw = shift; # keyword names; remaining @_ are already flag chars
+	join('', sort((map { $kw2char{$_} // () } @$kw), @_));
+}
+
 sub _buf2maildir {
 	my ($dst, $buf, $smsg) = @_;
 	my $kw = $smsg->{kw} // [];
-	my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw));
 	my $rand = ''; # chosen by die roll :P
 	my ($tmp, $fh, $base, $ok);
 	my $common = $smsg->{blob} // _rand;
@@ -263,7 +267,7 @@ sub _buf2maildir {
 		$dst .= 'cur/';
 		$rand = '';
 		do {
-			$base = $rand.$common.':2,'.$sfx
+			$base = $rand.$common.':2,'.kw2suffix($kw);
 		} while (!($ok = link($tmp, $dst.$base)) && $!{EEXIST} &&
 			($rand = _rand.','));
 		die "link($tmp, $dst$base): $!" unless $ok;
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index 7a0641fb..304be63d 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -86,4 +86,18 @@ sub maildir_each_eml {
 
 sub new { bless {}, __PACKAGE__ }
 
+# looks up each character of a Maildir flag string in %c2kw;
+# returns (\%kw, \@unknown), @unknown holding the characters without a keyword
+sub flags2kw ($) {
+	my (%kw, @unknown);
+	for my $c (split(//, $_[0])) {
+		if (defined(my $k = $c2kw{$c})) {
+			$kw{$k} = 1;
+		} else {
+			push @unknown, $c;
+		}
+	}
+	(\%kw, \@unknown);
+}
+
 1;
diff --git a/t/lei-export-kw.t b/t/lei-export-kw.t
new file mode 100644
index 00000000..9531949a
--- /dev/null
+++ b/t/lei-export-kw.t
@@ -0,0 +1,35 @@
+#!perl -w
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict; use v5.10.1; use PublicInbox::TestCommon;
+use File::Copy qw(cp);
+use File::Path qw(make_path);
+require_mods(qw(lei -imapd Mail::IMAPClient));
+my ($tmpdir, $for_destroy) = tmpdir;
+my ($ro_home, $cfg_path) = setup_public_inboxes;
+my $expect = eml_load('t/data/0001.patch'); # every export must keep this
+test_lei({ tmpdir => $tmpdir }, sub {
+	my $home = $ENV{HOME};
+	my $md = "$home/md";
+	make_path("$md/new", "$md/cur", "$md/tmp"); # bare Maildir layout
+	cp('t/data/0001.patch', "$md/new/y") or xbail "cp $md $!";
+	cp('t/data/message_embed.eml', "$md/cur/x:2,S") or xbail "cp $md $!";
+	lei_ok qw(index -q), $md;
+	lei_ok qw(tag t/data/0001.patch +kw:seen);
+	lei_ok qw(export-kw --all=local); # default --mode=merge
+	ok(!-e "$md/new/y", 'original gone');
+	is_deeply(eml_load("$md/cur/y:2,S"), $expect,
+		"`seen' kw exported");
+
+	lei_ok qw(tag t/data/0001.patch +kw:answered);
+	lei_ok qw(export-kw --all=local); # a second keyword renames again
+	ok(!-e "$md/cur/y:2,S", 'seen-only file gone');
+	is_deeply(eml_load("$md/cur/y:2,RS"), $expect, "`R' added");
+
+	lei_ok qw(tag t/data/0001.patch -kw:answered -kw:seen);
+	lei_ok qw(export-kw --mode=set --all=local); # `set' drops old flags
+	ok(!-e "$md/cur/y:2,RS", 'seen+answered file gone');
+	is_deeply(eml_load("$md/cur/y:2,"), $expect, 'no keywords left');
+});
+
+done_testing;
diff --git a/t/mdir_reader.t b/t/mdir_reader.t
index 51b38af4..c927e1a7 100644
--- a/t/mdir_reader.t
+++ b/t/mdir_reader.t
@@ -19,4 +19,9 @@ is(maildir_path_flags('/path/to/foo:2,'), '', 'no flags in path');
 use_ok 'PublicInbox::InboxWritable', qw(eml_from_path);
 is(eml_from_path('.'), undef, 'eml_from_path fails on directory');
 
+is_deeply([PublicInbox::MdirReader::flags2kw('S')], [{ 'seen' => 1 }, []],
+	"`seen' kw set from flag");
+is_deeply([PublicInbox::MdirReader::flags2kw('Su')], [{ 'seen' => 1 }, ['u']],
+	'unknown flag ignored');
+
 done_testing;

  parent reply	other threads:[~2021-05-21 10:28 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-21 10:28 [PATCH 0/8] lei: export-kw, IMAP import incompatibility Eric Wong
2021-05-21 10:28 ` [PATCH 1/8] treewide: favor open(..., '+<&=', $fd) Eric Wong
2021-05-21 10:28 ` [PATCH 2/8] lei: drop EOFpipe in favor of PktOp Eric Wong
2021-05-21 10:28 ` [PATCH 3/8] lei tag: support tagging index-only messages Eric Wong
2021-05-21 10:28 ` [PATCH 4/8] lei_input: fix canonicalization of Maildirs for sync Eric Wong
2021-05-21 10:28 ` [PATCH 5/8] lei index: support command-line options Eric Wong
2021-05-21 10:28 ` Eric Wong [this message]
2021-05-21 10:28 ` [PATCH 7/8] uri_imap: support uid/auth/user as full accessors Eric Wong
2021-05-21 10:28 ` [PATCH 8/8] lei import: store IMAP user+auth in mail_sync folder URI Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210521102832.10784-7-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    --subject='Re: [PATCH 6/8] lei export-kw: new command to export keywords to Maildirs' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git