user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/5] lei: more fixes and usability enhancement
@ 2021-01-25  1:18 Eric Wong
  2021-01-25  1:18 ` [PATCH 1/5] lei: reinstate JSON smsg output deduplication Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

cccuuurrrlll wwwiiillllll nnnooo lllooonnngggeeerrr
ooouuutttpppuuuttt llliiikkkeee ttthhhiiisss

Eric Wong (5):
  lei: reinstate JSON smsg output deduplication
  lei q: drop "oid" output format
  lei q: demangle and quiet curl output
  lei q: reject remotes early if curl(1) is missing
  lei q: continue remote search if torsocks(1) is missing

 lib/PublicInbox/LEI.pm         |  3 +-
 lib/PublicInbox/LeiOverview.pm | 11 +++---
 lib/PublicInbox/LeiXSearch.pm  | 70 ++++++++++++++++++++++++++--------
 t/lei.t                        | 21 ++++++++--
 4 files changed, 79 insertions(+), 26 deletions(-)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/5] lei: reinstate JSON smsg output deduplication
  2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
@ 2021-01-25  1:18 ` Eric Wong
  2021-01-25  1:18 ` [PATCH 2/5] lei q: drop "oid" output format Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

This was accidentally clobbered completely in the commit
titled "lei q: fix JSON overview with remote externals".
There are now more tests to prevent future regressions.
---
 lib/PublicInbox/LeiOverview.pm |  7 ++++++-
 t/lei.t                        | 19 ++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm
index 928d66cb..880c7acc 100644
--- a/lib/PublicInbox/LeiOverview.pm
+++ b/lib/PublicInbox/LeiOverview.pm
@@ -203,12 +203,14 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 	my ($self, $lei, $ibxish) = @_;
 	my $json;
 	$lei->{1}->autoflush(1);
+	my $dedupe = $lei->{dedupe} // die 'BUG: {dedupe} missing';
 	if (my $pkg = $self->{json}) {
 		$json = $pkg->new;
 		$json->utf8->canonical;
 		$json->ascii(1) if $lei->{opt}->{ascii};
+		$lei->{ovv_buf} = \(my $buf = '');
 	}
-	my $l2m = $lei->{l2m};
+	my $l2m = $lei->{l2m} or $dedupe->prepare_dedupe;
 	if ($l2m && !$ibxish) { # remote https?:// mboxrd
 		delete $l2m->{-wq_s1};
 		my $g2m = $l2m->can('git_to_mail');
@@ -241,6 +243,7 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 		my $git = $ibxish->git; # (LeiXSearch|Inbox|ExtSearch)->git
 		$self->{git} = $git; # for ovv_atexit_child
 		my $g2m = $l2m->can('git_to_mail');
+		$dedupe->prepare_dedupe;
 		sub {
 			my ($smsg, $mitem) = @_;
 			$smsg->{pct} = get_pct($mitem) if $mitem;
@@ -251,6 +254,7 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 		$lei->{ovv_buf} = \(my $buf = '');
 		sub { # DIY prettiness :P
 			my ($smsg, $mitem) = @_;
+			return if $dedupe->is_smsg_dup($smsg);
 			$smsg = _unbless_smsg($smsg, $mitem);
 			$buf .= "{\n";
 			$buf .= join(",\n", map {
@@ -274,6 +278,7 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 		$lei->{ovv_buf} = \(my $buf = '');
 		sub {
 			my ($smsg, $mitem) = @_;
+			return if $dedupe->is_smsg_dup($smsg);
 			$buf .= $json->encode(_unbless_smsg(@_)) . $ORS;
 			if (length($buf) > 65536) {
 				my $lk = $self->lock_for_scope;
diff --git a/t/lei.t b/t/lei.t
index 3fd1d1fe..f826a966 100644
--- a/t/lei.t
+++ b/t/lei.t
@@ -17,6 +17,7 @@ my $err_filter;
 my @onions = qw(http://hjrcffqmbrq6wope.onion/meta/
 	http://czquwvybam4bgbro.onion/meta/
 	http://ou63pmih66umazou.onion/meta/);
+my $json = ref(PublicInbox::Config->json)->new->utf8->canonical;
 my $lei = sub {
 	my ($cmd, $env, $xopt) = @_;
 	$out = $err = '';
@@ -142,8 +143,7 @@ my $setup_publicinboxes = sub {
 		my ($ibx) = @_;
 		my $im = PublicInbox::InboxWritable->new($ibx)->importer(0);
 		my $V = $ibx->version;
-		my @eml = glob('t/*.eml');
-		push(@eml, 't/data/0001.patch') if $V == 2;
+		my @eml = (glob('t/*.eml'), 't/data/0001.patch');
 		for (@eml) {
 			next if $_ eq 't/psgi_v2-old.eml'; # dup mid
 			$im->add(eml_load($_)) or BAIL_OUT "v$V add $_";
@@ -176,7 +176,7 @@ SKIP: {
 	my $mid = '20140421094015.GA8962@dcvr.yhbt.net';
 	ok($lei->('q', "m:$mid"), "query $url");
 	is($err, '', "no errors on $url");
-	my $res = PublicInbox::Config->json->decode($out);
+	my $res = $json->decode($out);
 	is($res->[0]->{'m'}, "<$mid>", "got expected mid from $url");
 	ok($lei->('q', "m:$mid", 'd:..20101002'), 'no results, no error');
 	like($err, qr/404/, 'noted 404');
@@ -246,6 +246,19 @@ my $test_external = sub {
 	# No double-quoting should be imposed on users on the CLI
 	$lei->('q', 's:use boolean prefix');
 	like($out, qr/search: use boolean prefix/, 'phrase search got result');
+	my $res = $json->decode($out);
+	is(scalar(@$res), 2, 'only 2 element array (1 result)');
+	is($res->[1], undef, 'final element is undef'); # XXX should this be?
+	is(ref($res->[0]), 'HASH', 'first element is hashref');
+	$lei->('q', '--pretty', 's:use boolean prefix');
+	my $pretty = $json->decode($out);
+	is_deeply($res, $pretty, '--pretty is identical after decode');
+
+	for my $fmt (qw(ldjson ndjson jsonl)) {
+		$lei->('q', '-f', $fmt, 's:use boolean prefix');
+		is($out, $json->encode($pretty->[0])."\n", "-f $fmt");
+	}
+
 	require IO::Uncompress::Gunzip;
 	for my $sfx ('', '.gz') {
 		my $f = "$home/mbox$sfx";

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 2/5] lei q: drop "oid" output format
  2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
  2021-01-25  1:18 ` [PATCH 1/5] lei: reinstate JSON smsg output deduplication Eric Wong
@ 2021-01-25  1:18 ` Eric Wong
  2021-01-25  1:18 ` [PATCH 3/5] lei q: demangle and quiet curl output Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

The default deduplication command-line arguments would be
nonsensical for such an option and probably confusing.  It
doesn't seem worth the code to support OID-only output when it's
easy enough to use one of the JSON formats to extract the same
info.

We also don't have OIDs if using remotes, and the
to-be-implemented memoization will be optional.
---
 lib/PublicInbox/LEI.pm         | 3 ++-
 lib/PublicInbox/LeiOverview.pm | 4 ----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 378113e8..09eac58c 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -200,7 +200,8 @@ my %OPTDESC = (
 			'message/object output format' ],
 'mark	format|f=s' => $stdin_formats,
 'forget	format|f=s' => $stdin_formats,
-'q	format|f=s' => [ 'OUT|maildir|mboxrd|mboxcl2|mboxcl|html|oid|json',
+'q	format|f=s' => [
+	'OUT|maildir|mboxrd|mboxcl2|mboxcl|html|json|jsonl|concatjson',
 		'specify output format, default depends on --output'],
 'ls-query	format|f=s' => $ls_format,
 'ls-external	format|f=s' => $ls_format,
diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm
index 880c7acc..ea35871c 100644
--- a/lib/PublicInbox/LeiOverview.pm
+++ b/lib/PublicInbox/LeiOverview.pm
@@ -286,10 +286,6 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 				$buf = '';
 			}
 		}
-	} elsif ($self->{fmt} eq 'oid') {
-		sub {
-			my ($smsg, $mitem) = @_;
-		}
 	} # else { ...
 }
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 3/5] lei q: demangle and quiet curl output
  2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
  2021-01-25  1:18 ` [PATCH 1/5] lei: reinstate JSON smsg output deduplication Eric Wong
  2021-01-25  1:18 ` [PATCH 2/5] lei q: drop "oid" output format Eric Wong
@ 2021-01-25  1:18 ` Eric Wong
  2021-01-25  1:18 ` [PATCH 4/5] lei q: reject remotes early if curl(1) is missing Eric Wong
  2021-01-25  1:18 ` [PATCH 5/5] lei q: continue remote search if torsocks(1) " Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

curl(1) writes to stderr one byte-at-a-time (presumably for the
progress bar).  This ends up being unreadable on my terminal
when parallel processes are trying to write error messages.

So instead, we'll capture the output to a file and run
'tail -f' on it if --verbose is enabled.

Since HTTP 404s from non-existent results are a common response,
we'll ignore them and stay silent, matching behavior of local
searches.
---
 lib/PublicInbox/LeiXSearch.pm | 45 ++++++++++++++++++++++++++---------
 t/lei.t                       |  2 +-
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index fb608d00..68be8ada 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -14,8 +14,9 @@ use PublicInbox::Import;
 use File::Temp 0.19 (); # 0.19 for ->newdir
 use File::Spec ();
 use PublicInbox::Search qw(xap_terms);
-use PublicInbox::Spawn qw(popen_rd);
+use PublicInbox::Spawn qw(popen_rd spawn);
 use PublicInbox::MID qw(mids);
+use Fcntl qw(SEEK_SET F_SETFL O_APPEND O_RDWR);
 
 sub new {
 	my ($class) = @_;
@@ -176,6 +177,13 @@ sub each_eml { # callback for MboxReader->mboxrd
 	$each_smsg->($smsg, undef, $eml);
 }
 
+# PublicInbox::OnDestroy callback
+sub kill_reap {
+	my ($pid) = @_;
+	kill('KILL', $pid); # spawn() blocks other signals
+	waitpid($pid, 0);
+}
+
 sub query_remote_mboxrd {
 	my ($self, $lei, $uris) = @_;
 	local $0 = "$0 query_remote_mboxrd";
@@ -186,7 +194,20 @@ sub query_remote_mboxrd {
 	push(@qform, t => 1) if $opt->{thread};
 	my @cmd = (qw(curl -sSf -d), '');
 	my $verbose = $opt->{verbose};
-	push @cmd, '-v' if $verbose;
+	my $reap;
+	my $cerr = File::Temp->new(TEMPLATE => 'curl.err-XXXX', TMPDIR => 1);
+	fcntl($cerr, F_SETFL, O_APPEND|O_RDWR) or warn "set O_APPEND: $!";
+	my $rdr = { 2 => $cerr };
+	my $coff = 0;
+	if ($verbose) {
+		# spawn a process to force line-buffering, otherwise curl
+		# will write 1 character at-a-time and parallel outputs
+		# mmmaaayyy llloookkk llliiikkkeee ttthhhiiisss
+		push @cmd, '-v';
+		my $o = { 1 => $lei->{2}, 2 => $lei->{2} };
+		my $pid = spawn(['tail', '-f', $cerr->filename], undef, $o);
+		$reap = PublicInbox::OnDestroy->new(\&kill_reap, $pid);
+	}
 	for my $o ($lei->curl_opt) {
 		$o =~ s/\|[a-z0-9]\b//i; # remove single char short option
 		if ($o =~ s/=[is]@\z//) {
@@ -213,21 +234,23 @@ sub query_remote_mboxrd {
 		}
 		$lei->err("# @$cmd") if $verbose;
 		$? = 0;
-		my $fh = popen_rd($cmd, $env, { 2 => $lei->{2} });
+		my $fh = popen_rd($cmd, $env, $rdr);
 		$fh = IO::Uncompress::Gunzip->new($fh);
 		eval {
 			PublicInbox::MboxReader->mboxrd($fh, \&each_eml, $self,
 							$lei, $each_smsg);
 		};
 		return $lei->fail("E: @$cmd: $@") if $@;
-		if (($? >> 8) == 22) { # HTTP 404 from curl(1)
-			$uri->query_form(q => $lei->{mset_opt}->{qstr});
-			$lei->err('# no results from '.$uri->as_string);
-		} elsif ($?) {
-			$uri->query_form(q => $lei->{mset_opt}->{qstr});
-			$lei->err('E: '.$uri->as_string);
-			$lei->child_error($?);
-		}
+		next unless $?;
+		seek($cerr, $coff, SEEK_SET) or warn "seek(curl stderr): $!\n";
+		my $e = do { local $/; <$cerr> } //
+				die "read(curl stderr): $!\n";
+		$coff += length($e);
+		next if (($? >> 8) == 22 && $e =~ /\b404\b/);
+		$lei->child_error($?);
+		$uri->query_form(q => $lei->{mset_opt}->{qstr});
+		# --verbose already showed the error via tail(1)
+		$lei->err("E: $uri \$?=$?\n", $verbose ? () : $e);
 	}
 	undef $each_smsg;
 	$lei->{ovv}->ovv_atexit_child($lei);
diff --git a/t/lei.t b/t/lei.t
index f826a966..69338257 100644
--- a/t/lei.t
+++ b/t/lei.t
@@ -179,7 +179,7 @@ SKIP: {
 	my $res = $json->decode($out);
 	is($res->[0]->{'m'}, "<$mid>", "got expected mid from $url");
 	ok($lei->('q', "m:$mid", 'd:..20101002'), 'no results, no error');
-	like($err, qr/404/, 'noted 404');
+	is($err, '', 'no output on 404, matching local FS behavior');
 	is($out, "[null]\n", 'got null results');
 	$lei->('forget-external', $url);
 } # /SKIP

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 4/5] lei q: reject remotes early if curl(1) is missing
  2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
                   ` (2 preceding siblings ...)
  2021-01-25  1:18 ` [PATCH 3/5] lei q: demangle and quiet curl output Eric Wong
@ 2021-01-25  1:18 ` Eric Wong
  2021-01-25  1:18 ` [PATCH 5/5] lei q: continue remote search if torsocks(1) " Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

This ought to provide a better experience for users
who attempt to use remote externals but don't have
curl installed.

We can avoid repeating PATH search in every worker here, too.
---
 lib/PublicInbox/LeiXSearch.pm | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index 68be8ada..369f6f89 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -14,7 +14,7 @@ use PublicInbox::Import;
 use File::Temp 0.19 (); # 0.19 for ->newdir
 use File::Spec ();
 use PublicInbox::Search qw(xap_terms);
-use PublicInbox::Spawn qw(popen_rd spawn);
+use PublicInbox::Spawn qw(popen_rd spawn which);
 use PublicInbox::MID qw(mids);
 use Fcntl qw(SEEK_SET F_SETFL O_APPEND O_RDWR);
 
@@ -192,7 +192,7 @@ sub query_remote_mboxrd {
 	my ($opt, $env) = @$lei{qw(opt env)};
 	my @qform = (q => $lei->{mset_opt}->{qstr}, x => 'm');
 	push(@qform, t => 1) if $opt->{thread};
-	my @cmd = (qw(curl -sSf -d), '');
+	my @cmd = ($self->{curl}, qw(-sSf -d), '');
 	my $verbose = $opt->{verbose};
 	my $reap;
 	my $cerr = File::Temp->new(TEMPLATE => 'curl.err-XXXX', TMPDIR => 1);
@@ -411,13 +411,22 @@ sub ipc_atfork_prepare {
 	$self->SUPER::ipc_atfork_prepare; # PublicInbox::IPC
 }
 
+sub add_uri {
+	my ($self, $uri) = @_;
+	if (my $curl = $self->{curl} //= which('curl') // 0) {
+		push @{$self->{remotes}}, $uri;
+	} else {
+		warn "curl missing, ignoring $uri\n";
+	}
+}
+
 sub prepare_external {
 	my ($self, $loc, $boost) = @_; # n.b. already ordered by boost
 	if (ref $loc) { # already a URI, or PublicInbox::Inbox-like object
-		return push(@{$self->{remotes}}, $loc) if $loc->can('scheme');
+		return add_uri($self, $loc) if $loc->can('scheme');
 	} elsif ($loc =~ m!\Ahttps?://!) {
 		require URI;
-		return push(@{$self->{remotes}}, URI->new($loc));
+		return add_uri($self, URI->new($loc));
 	} elsif (-f "$loc/ei.lock") {
 		require PublicInbox::ExtSearch;
 		$loc = PublicInbox::ExtSearch->new($loc);

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 5/5] lei q: continue remote search if torsocks(1) is missing
  2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
                   ` (3 preceding siblings ...)
  2021-01-25  1:18 ` [PATCH 4/5] lei q: reject remotes early if curl(1) is missing Eric Wong
@ 2021-01-25  1:18 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2021-01-25  1:18 UTC (permalink / raw)
  To: meta

torsocks is just one of many ways to get curl to use Tor,
so we'll continue if we can't find torsocks in our PATH
and assume the user has a proxy configured via curlrc,
the command-line, environment variable, or even firewall
rules.
---
 lib/PublicInbox/LeiXSearch.pm | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index 369f6f89..b470c113 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -228,10 +228,16 @@ sub query_remote_mboxrd {
 		my $cmd = [ @cmd, $uri->as_string ];
 		if ($tor eq 'auto' && substr($uri->host, -6) eq '.onion' &&
 				(($env->{LD_PRELOAD}//'') !~ /torsocks/)) {
-			unshift @$cmd, 'torsocks';
+			unshift @$cmd, which('torsocks');
 		} elsif (PublicInbox::Config::git_bool($tor)) {
-			unshift @$cmd, 'torsocks';
+			unshift @$cmd, which('torsocks');
 		}
+
+		# continue anyways if torsocks is missing; a proxy may be
+		# specified via CLI, curlrc, environment variable, or even
+		# firewall rule
+		shift(@$cmd) if !$cmd->[0];
+
 		$lei->err("# @$cmd") if $verbose;
 		$? = 0;
 		my $fh = popen_rd($cmd, $env, $rdr);

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-01-25  1:18 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-25  1:18 [PATCH 0/5] lei: more fixes and usability enhancement Eric Wong
2021-01-25  1:18 ` [PATCH 1/5] lei: reinstate JSON smsg output deduplication Eric Wong
2021-01-25  1:18 ` [PATCH 2/5] lei q: drop "oid" output format Eric Wong
2021-01-25  1:18 ` [PATCH 3/5] lei q: demangle and quiet curl output Eric Wong
2021-01-25  1:18 ` [PATCH 4/5] lei q: reject remotes early if curl(1) is missing Eric Wong
2021-01-25  1:18 ` [PATCH 5/5] lei q: continue remote search if torsocks(1) " Eric Wong

Code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).