user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/12] lei lcat: support NNTP URLs
Date: Tue, 21 Sep 2021 07:41:54 +0000	[thread overview]
Message-ID: <20210921074159.20052-8-e@80x24.org> (raw)
In-Reply-To: <20210921074159.20052-1-e@80x24.org>

NNTP URLs are probably more prevalent in public message archives
than IMAP URLs.
---
 lib/PublicInbox/LeiLcat.pm     | 66 +++++++++++++++++++++-------------
 lib/PublicInbox/LeiMailSync.pm | 14 +++++---
 t/lei-import-nntp.t            | 23 ++++++++++++
 3 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm
index 1a4a988e..0902c213 100644
--- a/lib/PublicInbox/LeiLcat.pm
+++ b/lib/PublicInbox/LeiLcat.pm
@@ -11,47 +11,64 @@ use PublicInbox::LeiViewText;
 use URI::Escape qw(uri_unescape);
 use PublicInbox::MID qw($MID_EXTRACT);
 
-sub lcat_folder ($$$) {
-	my ($lei, $lms, $folder) = @_;
-	$lms //= $lei->lms or return;
-	my $folders = [ $folder];
+sub lcat_folder ($$;$$) {
+	my ($lei, $folder, $beg, $end) = @_;
+	my $lms = $lei->{-lms_ro} //= $lei->lms // return;
+	my $folders = [ $folder ];
 	eval { $lms->arg2folder($lei, $folders) };
-	if ($@) {
-		$lei->child_error(0, "# unknown folder: $folder");
-	} else {
-		for my $f (@$folders) {
-			my $fid = $lms->fid_for($f);
-			push @{$lei->{lcat_todo}}, { fid => $fid };
-		}
+	return $lei->child_error(0, "# unknown folder: $folder") if $@;
+	my %range;
+	if (defined($beg)) { # NNTP article range
+		$range{min} = $beg;
+		$range{max} = $end // $beg;
+	}
+	for my $f (@$folders) {
+		my $fid = $lms->fid_for($f);
+		push @{$lei->{lcat_todo}}, { fid => $fid, %range };
 	}
 }
 
 sub lcat_imap_uri ($$) {
 	my ($lei, $uri) = @_;
-	my $lms = $lei->lms or return;
-	# cf. LeiXsearch->lcat_dump
+	# cf. LeiXSearch->lcat_dump
+	my $lms = $lei->{-lms_ro} //= $lei->lms // return;
 	if (defined $uri->uid) {
 		push @{$lei->{lcat_todo}}, $lms->imap_oidhex($lei, $uri);
 	} elsif (defined(my $fid = $lms->fid_for($$uri))) {
 		push @{$lei->{lcat_todo}}, { fid => $fid };
 	} else {
-		lcat_folder($lei, $lms, $$uri);
+		lcat_folder($lei, $$uri);
 	}
 }
 
+sub lcat_nntp_uri ($$) {
+	my ($lei, $uri) = @_;
+	my $mid = $uri->message; # already unescaped by URI::news
+	return "mid:$mid" if defined($mid);
+	my $lms = $lei->{-lms_ro} //= $lei->lms // return;
+	my ($ng, $beg, $end) = $uri->group;
+	$uri->group($ng);
+	lcat_folder($lei, $$uri, $beg, $end);
+	'""';
+}
+
 sub extract_1 ($$) {
 	my ($lei, $x) = @_;
-	if ($x =~ m!\b(imaps?://[^>]+)!i) {
-		my $u = $1;
-		require PublicInbox::URIimap;
-		lcat_imap_uri($lei, PublicInbox::URIimap->new($u));
-		'""'; # blank query, using {lcat_todo}
-	} elsif ($x =~ m!\b(maildir:.+)!i) {
-		lcat_folder($lei, undef, $1);
+	if ($x =~ m!\b(maildir:.+)!i) {
+		lcat_folder($lei, $1);
 		'""'; # blank query, using {lcat_todo}
-	} elsif ($x =~ m!\b([a-z]+?://\S+)!i) {
-		my $u = $1;
+	} elsif ($x =~ m!\b(([a-z]+)://\S+)!i) {
+		my ($u, $scheme) = ($1, $2);
 		$u =~ s/[\>\]\)\,\.\;]+\z//;
+		if ($scheme =~ m!\A(imaps?)\z!i) {
+			require PublicInbox::URIimap;
+			lcat_imap_uri($lei, PublicInbox::URIimap->new($u));
+			return '""'; # blank query, using {lcat_todo}
+		} elsif ($scheme =~ m!\A(?:nntps?|s?news)\z!i) {
+			require PublicInbox::URInntps;
+			$u = PublicInbox::URInntps->new($u);
+			return lcat_nntp_uri($lei, $u);
+		} # http, or something else:
 		require URI;
 		$u = URI->new($u);
 		my $p = $u->path;
@@ -93,7 +110,7 @@ sub extract_all {
 	my $strict = !$lei->{opt}->{stdin};
 	my @q;
 	for my $x (@argv) {
-		if (my $term = extract_1($lei,$x)) {
+		if (my $term = extract_1($lei, $x)) {
 			push @q, $term;
 		} elsif ($strict) {
 			return $lei->fail(<<"");
@@ -101,6 +118,7 @@ could not extract Message-ID from $x
 
 		}
 	}
+	delete $lei->{-lms_ro};
 	@q ? join(' OR ', @q) : $lei->fail("no Message-ID in: @argv");
 }
 
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index f83c7de2..522a5ebc 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -197,9 +197,12 @@ INSERT OR IGNORE INTO blob2name (oidbin, fid, name) VALUES (?, ?, ?)
 sub each_src {
 	my ($self, $folder, $cb, @args) = @_;
 	my $dbh = $self->{dbh} //= dbh_new($self);
-	my $fid;
+	my ($fid, @rng);
+	my $and_ge_le = '';
 	if (ref($folder) eq 'HASH') {
 		$fid = $folder->{fid} // die "BUG: no `fid'";
+		@rng = grep(defined, @$folder{qw(min max)});
+		$and_ge_le = 'AND uid >= ? AND uid <= ?' if @rng;
 	} else {
 		$fid = $self->{fmap}->{$folder} //=
 			fid_for($self, $folder) // return;
@@ -208,16 +211,17 @@ sub each_src {
 	# minimize implicit txn time to avoid blocking writers by
 	# batching SELECTs.  This looks wonky but is necessary since
 	# $cb-> may access the DB on its own.
-	my $ary = $dbh->selectall_arrayref(<<'', undef, $fid);
-SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ?
+	my $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng);
+SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? $and_ge_le
 ORDER BY _rowid_ ASC LIMIT 1000
 
 	my $min = @$ary ? $ary->[-1]->[0] : undef;
 	while (defined $min) {
 		for my $row (@$ary) { $cb->($row->[1], $row->[2], @args) }
 
-		$ary = $dbh->selectall_arrayref(<<'', undef, $fid, $min);
-SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? AND _rowid_ > ?
+		$ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng, $min);
+SELECT _rowid_,oidbin,uid FROM blob2num
+WHERE fid = ? $and_ge_le AND _rowid_ > ?
 ORDER BY _rowid_ ASC LIMIT 1000
 
 		$min = @$ary ? $ary->[-1]->[0] : undef;
diff --git a/t/lei-import-nntp.t b/t/lei-import-nntp.t
index 0b080781..eb1ae312 100644
--- a/t/lei-import-nntp.t
+++ b/t/lei-import-nntp.t
@@ -25,6 +25,11 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	is(ref(json_utf8->decode($lei_out)), 'ARRAY', 'ls-mail-source JSON');
 
 	lei_ok('import', $url);
+	lei_ok "lcat", "nntp://$host_port/testmessage\@example.com";
+	my $local = $lei_out;
+	lei_ok "lcat", "nntp://example.com/testmessage\@example.com";
+	my $remote = $lei_out;
+	is($local, $remote, 'Message-ID used even from unknown host');
 	lei_ok(qw(q z:1..));
 	$out = json_utf8->decode($lei_out);
 	ok(scalar(@$out) > 1, 'got imported messages');
@@ -57,6 +62,11 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	lei_ok('inspect', "$url/$high");
 	my $x = json_utf8->decode($lei_out);
 	like($x->{$url}->{$high}, qr/\A[a-f0-9]{40,}\z/, 'inspect shows blob');
+	lei_ok qw(lcat -f json), "$url/$high";
+	my $lcat = json_utf8->decode($lei_out);
+	is($lcat->[1], undef, 'only one result for lcat');
+	is($lcat->[0]->{blob}, $x->{$url}->{$high},
+		'lcat showed correct blob');
 
 	lei_ok 'ls-mail-sync';
 	is($lei_out, "$url\n", 'article number not stored as folder');
@@ -78,6 +88,19 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	is(scalar(grep(/\A[a-f0-9]{40,}\z/, values %{$x->{$url}})),
 		$end - $low + 1, 'all values are git blobs');
 
+	lei_ok qw(lcat -f json), "$url/$low";
+	$lcat = json_utf8->decode($lei_out);
+	is($lcat->[1], undef, 'only one result for lcat');
+	is($lcat->[0]->{blob}, $x->{$url}->{$low},
+		'lcat showed correct blob');
+	lei_ok qw(lcat -f json), "$url/$low-$end";
+	$lcat = json_utf8->decode($lei_out);
+	pop @$lcat;
+	for ($low..$end) {
+		my $tip = shift @$lcat;
+		is($x->{$url}->{$_}, $tip->{blob}, "blob matches art #$_");
+	}
+
 	lei_ok 'ls-mail-sync';
 	is($lei_out, "$url\n", 'article range not stored as folder');
 	lei_ok qw(q z:0..); my $start = json_utf8->decode($lei_out);

  parent reply	other threads:[~2021-09-21  7:41 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-21  7:41 [PATCH 00/12] lei: fix various annoyances Eric Wong
2021-09-21  7:41 ` [PATCH 01/12] lei inspect: convert to WQ worker Eric Wong
2021-09-21  7:41 ` [PATCH 02/12] lei inspect: support NNTP URLs Eric Wong
2021-09-21  7:41 ` [PATCH 03/12] lei_mail_sync: account for non-unique cases Eric Wong
2021-09-21  7:41 ` [PATCH 04/12] lei: simplify internal arg2folder usage Eric Wong
2021-09-21  7:41 ` [PATCH 05/12] lei lcat: use single queue for ordering Eric Wong
2021-09-21  7:41 ` [PATCH 06/12] doc: lei-security: section for WIP auth methods Eric Wong
2021-09-21  7:41 ` Eric Wong [this message]
2021-09-21  7:41 ` [PATCH 08/12] lei: various completion improvements Eric Wong
2021-09-21  7:41 ` [PATCH 09/12] lei q: show progress on >1s preparation phase Eric Wong
2021-09-21  7:41 ` [PATCH 10/12] search: drop reopen retry message Eric Wong
2021-09-21  7:41 ` [PATCH 11/12] lei q: update messages to reflect --save default Eric Wong
2021-09-21  7:41 ` [PATCH 12/12] lei q: improve --limit behavior and progress Eric Wong
2021-09-21  9:29 ` [PATCH 0/3] lei: a few more annoyances fixed Eric Wong
2021-09-21  9:29   ` [PATCH 1/3] t/lei-up: use '-q' to silence non-redirected test Eric Wong
2021-09-21  9:29   ` [PATCH 2/3] script/lei: handle SIGTSTP and SIGCONT Eric Wong
2021-09-21  9:29   ` [PATCH 3/3] lei: umask(077) before opening errors.log Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210921074159.20052-8-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).