From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/12] lei lcat: support NNTP URLs
Date: Tue, 21 Sep 2021 07:41:54 +0000 [thread overview]
Message-ID: <20210921074159.20052-8-e@80x24.org> (raw)
In-Reply-To: <20210921074159.20052-1-e@80x24.org>
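NNTP URLs are probably more prevalent in public message archives
than IMAP URLs, so teach "lei lcat" to accept nntp://, nntps://,
news:// and snews:// URLs.  A URL containing a Message-ID is turned
into a "mid:" query (regardless of the host in the URL), while a URL
naming an article number or an article range ($LOW-$HIGH) within a
newsgroup is resolved against the articles tracked by LeiMailSync.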
---
lib/PublicInbox/LeiLcat.pm | 66 +++++++++++++++++++++-------------
lib/PublicInbox/LeiMailSync.pm | 14 +++++---
t/lei-import-nntp.t | 23 ++++++++++++
3 files changed, 74 insertions(+), 29 deletions(-)
diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm
index 1a4a988e..0902c213 100644
--- a/lib/PublicInbox/LeiLcat.pm
+++ b/lib/PublicInbox/LeiLcat.pm
@@ -11,47 +11,64 @@ use PublicInbox::LeiViewText;
use URI::Escape qw(uri_unescape);
use PublicInbox::MID qw($MID_EXTRACT);
-sub lcat_folder ($$$) {
- my ($lei, $lms, $folder) = @_;
- $lms //= $lei->lms or return;
- my $folders = [ $folder];
+sub lcat_folder ($$;$$) {
+ my ($lei, $folder, $beg, $end) = @_;
+ my $lms = $lei->{-lms_ro} //= $lei->lms // return;
+ my $folders = [ $folder ];
eval { $lms->arg2folder($lei, $folders) };
- if ($@) {
- $lei->child_error(0, "# unknown folder: $folder");
- } else {
- for my $f (@$folders) {
- my $fid = $lms->fid_for($f);
- push @{$lei->{lcat_todo}}, { fid => $fid };
- }
+ return $lei->child_error(0, "# unknown folder: $folder") if $@;
+ my %range;
+ if (defined($beg)) { # NNTP article range
+ $range{min} = $beg;
+ $range{max} = $end // $beg;
+ }
+ for my $f (@$folders) {
+ my $fid = $lms->fid_for($f);
+ push @{$lei->{lcat_todo}}, { fid => $fid, %range };
}
}
sub lcat_imap_uri ($$) {
my ($lei, $uri) = @_;
- my $lms = $lei->lms or return;
- # cf. LeiXsearch->lcat_dump
+ # cf. LeiXSearch->lcat_dump
+ my $lms = $lei->{-lms_ro} //= $lei->lms // return;
if (defined $uri->uid) {
push @{$lei->{lcat_todo}}, $lms->imap_oidhex($lei, $uri);
} elsif (defined(my $fid = $lms->fid_for($$uri))) {
push @{$lei->{lcat_todo}}, { fid => $fid };
} else {
- lcat_folder($lei, $lms, $$uri);
+ lcat_folder($lei, $$uri);
}
}
+sub lcat_nntp_uri ($$) {
+ my ($lei, $uri) = @_;
+ my $mid = $uri->message; # already unescaped by URI::news
+ return "mid:$mid" if defined($mid);
+ my $lms = $lei->{-lms_ro} //= $lei->lms // return;
+ my ($ng, $beg, $end) = $uri->group;
+ $uri->group($ng);
+ lcat_folder($lei, $$uri, $beg, $end);
+ '""';
+}
+
sub extract_1 ($$) {
my ($lei, $x) = @_;
- if ($x =~ m!\b(imaps?://[^>]+)!i) {
- my $u = $1;
- require PublicInbox::URIimap;
- lcat_imap_uri($lei, PublicInbox::URIimap->new($u));
- '""'; # blank query, using {lcat_todo}
- } elsif ($x =~ m!\b(maildir:.+)!i) {
- lcat_folder($lei, undef, $1);
+ if ($x =~ m!\b(maildir:.+)!i) {
+ lcat_folder($lei, $1);
'""'; # blank query, using {lcat_todo}
- } elsif ($x =~ m!\b([a-z]+?://\S+)!i) {
- my $u = $1;
+ } elsif ($x =~ m!\b(([a-z]+)://\S+)!i) {
+ my ($u, $scheme) = ($1, $2);
$u =~ s/[\>\]\)\,\.\;]+\z//;
+ if ($scheme =~ m!\A(imaps?)\z!i) {
+ require PublicInbox::URIimap;
+ lcat_imap_uri($lei, PublicInbox::URIimap->new($u));
+ return '""'; # blank query, using {lcat_todo}
+ } elsif ($scheme =~ m!\A(?:nntps?|s?news)\z!i) {
+ require PublicInbox::URInntps;
+ $u = PublicInbox::URInntps->new($u);
+ return lcat_nntp_uri($lei, $u);
+ } # http, or something else:
require URI;
$u = URI->new($u);
my $p = $u->path;
@@ -93,7 +110,7 @@ sub extract_all {
my $strict = !$lei->{opt}->{stdin};
my @q;
for my $x (@argv) {
- if (my $term = extract_1($lei,$x)) {
+ if (my $term = extract_1($lei, $x)) {
push @q, $term;
} elsif ($strict) {
return $lei->fail(<<"");
@@ -101,6 +118,7 @@ could not extract Message-ID from $x
}
}
+ delete $lei->{-lms_ro};
@q ? join(' OR ', @q) : $lei->fail("no Message-ID in: @argv");
}
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index f83c7de2..522a5ebc 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -197,9 +197,12 @@ INSERT OR IGNORE INTO blob2name (oidbin, fid, name) VALUES (?, ?, ?)
sub each_src {
my ($self, $folder, $cb, @args) = @_;
my $dbh = $self->{dbh} //= dbh_new($self);
- my $fid;
+ my ($fid, @rng);
+ my $and_ge_le = '';
if (ref($folder) eq 'HASH') {
$fid = $folder->{fid} // die "BUG: no `fid'";
+ @rng = grep(defined, @$folder{qw(min max)});
+ $and_ge_le = 'AND uid >= ? AND uid <= ?' if @rng;
} else {
$fid = $self->{fmap}->{$folder} //=
fid_for($self, $folder) // return;
@@ -208,16 +211,17 @@ sub each_src {
# minimize implicit txn time to avoid blocking writers by
# batching SELECTs. This looks wonky but is necessary since
# $cb-> may access the DB on its own.
- my $ary = $dbh->selectall_arrayref(<<'', undef, $fid);
-SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ?
+ my $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng);
+SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? $and_ge_le
ORDER BY _rowid_ ASC LIMIT 1000
my $min = @$ary ? $ary->[-1]->[0] : undef;
while (defined $min) {
for my $row (@$ary) { $cb->($row->[1], $row->[2], @args) }
- $ary = $dbh->selectall_arrayref(<<'', undef, $fid, $min);
-SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? AND _rowid_ > ?
+ $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng, $min);
+SELECT _rowid_,oidbin,uid FROM blob2num
+WHERE fid = ? $and_ge_le AND _rowid_ > ?
ORDER BY _rowid_ ASC LIMIT 1000
$min = @$ary ? $ary->[-1]->[0] : undef;
diff --git a/t/lei-import-nntp.t b/t/lei-import-nntp.t
index 0b080781..eb1ae312 100644
--- a/t/lei-import-nntp.t
+++ b/t/lei-import-nntp.t
@@ -25,6 +25,11 @@ test_lei({ tmpdir => $tmpdir }, sub {
is(ref(json_utf8->decode($lei_out)), 'ARRAY', 'ls-mail-source JSON');
lei_ok('import', $url);
+ lei_ok "lcat", "nntp://$host_port/testmessage\@example.com";
+ my $local = $lei_out;
+ lei_ok "lcat", "nntp://example.com/testmessage\@example.com";
+ my $remote = $lei_out;
+ is($local, $remote, 'Message-ID used even from unknown host');
lei_ok(qw(q z:1..));
$out = json_utf8->decode($lei_out);
ok(scalar(@$out) > 1, 'got imported messages');
@@ -57,6 +62,11 @@ test_lei({ tmpdir => $tmpdir }, sub {
lei_ok('inspect', "$url/$high");
my $x = json_utf8->decode($lei_out);
like($x->{$url}->{$high}, qr/\A[a-f0-9]{40,}\z/, 'inspect shows blob');
+ lei_ok qw(lcat -f json), "$url/$high";
+ my $lcat = json_utf8->decode($lei_out);
+ is($lcat->[1], undef, 'only one result for lcat');
+ is($lcat->[0]->{blob}, $x->{$url}->{$high},
+ 'lcat showed correct blob');
lei_ok 'ls-mail-sync';
is($lei_out, "$url\n", 'article number not stored as folder');
@@ -78,6 +88,19 @@ test_lei({ tmpdir => $tmpdir }, sub {
is(scalar(grep(/\A[a-f0-9]{40,}\z/, values %{$x->{$url}})),
$end - $low + 1, 'all values are git blobs');
+ lei_ok qw(lcat -f json), "$url/$low";
+ $lcat = json_utf8->decode($lei_out);
+ is($lcat->[1], undef, 'only one result for lcat');
+ is($lcat->[0]->{blob}, $x->{$url}->{$low},
+ 'lcat showed correct blob');
+ lei_ok qw(lcat -f json), "$url/$low-$end";
+ $lcat = json_utf8->decode($lei_out);
+ pop @$lcat;
+ for ($low..$end) {
+ my $tip = shift @$lcat;
+ is($x->{$url}->{$_}, $tip->{blob}, "blob matches art #$_");
+ }
+
lei_ok 'ls-mail-sync';
is($lei_out, "$url\n", 'article range not stored as folder');
lei_ok qw(q z:0..); my $start = json_utf8->decode($lei_out);
next prev parent reply other threads:[~2021-09-21 7:41 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-21 7:41 [PATCH 00/12] lei: fix various annoyances Eric Wong
2021-09-21 7:41 ` [PATCH 01/12] lei inspect: convert to WQ worker Eric Wong
2021-09-21 7:41 ` [PATCH 02/12] lei inspect: support NNTP URLs Eric Wong
2021-09-21 7:41 ` [PATCH 03/12] lei_mail_sync: account for non-unique cases Eric Wong
2021-09-21 7:41 ` [PATCH 04/12] lei: simplify internal arg2folder usage Eric Wong
2021-09-21 7:41 ` [PATCH 05/12] lei lcat: use single queue for ordering Eric Wong
2021-09-21 7:41 ` [PATCH 06/12] doc: lei-security: section for WIP auth methods Eric Wong
2021-09-21 7:41 ` Eric Wong [this message]
2021-09-21 7:41 ` [PATCH 08/12] lei: various completion improvements Eric Wong
2021-09-21 7:41 ` [PATCH 09/12] lei q: show progress on >1s preparation phase Eric Wong
2021-09-21 7:41 ` [PATCH 10/12] search: drop reopen retry message Eric Wong
2021-09-21 7:41 ` [PATCH 11/12] lei q: update messages to reflect --save default Eric Wong
2021-09-21 7:41 ` [PATCH 12/12] lei q: improve --limit behavior and progress Eric Wong
2021-09-21 9:29 ` [PATCH 0/3] lei: a few more annoyances fixed Eric Wong
2021-09-21 9:29 ` [PATCH 1/3] t/lei-up: use '-q' to silence non-redirected test Eric Wong
2021-09-21 9:29 ` [PATCH 2/3] script/lei: handle SIGTSTP and SIGCONT Eric Wong
2021-09-21 9:29 ` [PATCH 3/3] lei: umask(077) before opening errors.log Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210921074159.20052-8-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).