From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 2/3] www_listing: start updating for pagination + search
Date: Wed, 23 Jun 2021 07:14:21 -0400 [thread overview]
Message-ID: <20210623111422.30182-3-e@80x24.org> (raw)
In-Reply-To: <20210623111422.30182-1-e@80x24.org>
When dealing with thousands of inboxes, displaying all of
them on a single page isn't going to work. So steal some
pagination and search results code from the message search
to generate some basic HTML output that looks good in w3m.
---
lib/PublicInbox/Config.pm | 5 ++
lib/PublicInbox/ManifestJsGz.pm | 2 +-
lib/PublicInbox/MiscSearch.pm | 34 +++----
lib/PublicInbox/SearchQuery.pm | 13 +--
lib/PublicInbox/SearchView.pm | 2 +-
lib/PublicInbox/WwwListing.pm | 155 +++++++++++++++++++++++++++-----
6 files changed, 164 insertions(+), 47 deletions(-)
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 3f0f5a01..36f2fafb 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -97,6 +97,11 @@ sub lookup_ei {
$self->{-ei_by_name}->{$name} //= _fill_ei($self, $name);
}
+sub lookup_eidx_key {
+ my ($self, $eidx_key) = @_;
+ _lookup_fill($self, '-by_eidx_key', $eidx_key);
+}
+
# special case for [extindex "all"]
sub ALL { lookup_ei($_[0], 'all') }
diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm
index 31cf15dc..e7bb0e86 100644
--- a/lib/PublicInbox/ManifestJsGz.pm
+++ b/lib/PublicInbox/ManifestJsGz.pm
@@ -81,7 +81,7 @@ sub ibx_entry {
warn "E: $@" if $@;
}
-sub hide_key { 'manifest' }
+sub hide_key { 'manifest' } # for WwwListing->list_match_i
# overrides WwwListing->psgi_triple
sub psgi_triple {
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index 4e010453..6b575b0d 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -59,7 +59,7 @@ sub misc_enquire_once { # retry_reopen callback
$eq->set_query($qr);
my $desc = !$opt->{asc};
my $rel = $opt->{relevance} // 0;
- if ($rel == -1) { # ORDER BY docid/UID
+ if ($rel == -1) { # ORDER BY docid
$eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
$eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new);
} elsif ($rel) {
@@ -132,6 +132,23 @@ sub ibx_data_once {
}
}
+sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc)
+ my ($doc) = $_[-1];
+ my $d;
+ my $data = $json->decode($doc->get_data);
+ for (values %$data) {
+ $d = $_->{description} // next;
+ $d =~ s/ \[epoch [0-9]+\]\z// or next;
+ last;
+ }
+ {
+ uidvalidity => int_val($doc, $UIDVALIDITY),
+ -modified => int_val($doc, $MODIFIED),
+ # extract description from manifest.js.gz epoch description
+ description => $d
+ };
+}
+
sub inbox_data {
my ($self, $ibx) = @_;
retry_reopen($self, \&ibx_data_once, $ibx);
@@ -141,20 +158,7 @@ sub ibx_cache_load {
my ($doc, $cache) = @_;
my ($eidx_key) = xap_terms('Q', $doc);
return unless defined($eidx_key); # expired
- my $ce = $cache->{$eidx_key} = {};
- $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY);
- $ce->{-modified} = int_val($doc, $MODIFIED);
- $ce->{description} = do {
- # extract description from manifest.js.gz epoch description
- my $d;
- my $data = $json->decode($doc->get_data);
- for (values %$data) {
- $d = $_->{description} // next;
- $d =~ s/ \[epoch [0-9]+\]\z// or next;
- last;
- }
- $d;
- }
+ $cache->{$eidx_key} = doc2ibx_cache_ent($doc);
}
sub _nntpd_cache_load { # retry_reopen callback
diff --git a/lib/PublicInbox/SearchQuery.pm b/lib/PublicInbox/SearchQuery.pm
index 0f360500..a6b7d843 100644
--- a/lib/PublicInbox/SearchQuery.pm
+++ b/lib/PublicInbox/SearchQuery.pm
@@ -1,7 +1,7 @@
# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# used by PublicInbox::SearchView
+# used by PublicInbox::SearchView and PublicInbox::WwwListing
package PublicInbox::SearchQuery;
use strict;
use v5.10.1;
@@ -32,11 +32,12 @@ sub qs_html {
if (scalar(keys(%override))) {
$self = bless { (%$self, %override) }, ref($self);
}
-
- my $q = uri_escape($self->{'q'}, MID_ESC);
- $q =~ s/%20/+/g; # improve URL readability
- my $qs = "q=$q";
-
+ my $qs = '';
+ if (defined(my $q = $self->{'q'})) {
+ $q = uri_escape($q, MID_ESC);
+ $q =~ s/%20/+/g; # improve URL readability
+ $qs .= "q=$q";
+ }
if (my $o = $self->{o}) { # ignore o == 0
$qs .= "&o=$o";
}
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index e13359d5..c0c801b3 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -219,7 +219,7 @@ sub search_nav_top {
$rv .= qq{</pre></form><pre>};
}
-sub search_nav_bot {
+sub search_nav_bot { # also used by WwwListing for searching extindex miscidx
my ($mset, $q) = @_;
my $total = $mset->get_matches_estimated;
my $l = $q->{l};
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index f28eddf1..eb015742 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -5,7 +5,7 @@
# Used by PublicInbox::WWW
package PublicInbox::WwwListing;
use strict;
-use PublicInbox::Hval qw(prurl fmt_ts);
+use PublicInbox::Hval qw(prurl fmt_ts ascii_html);
use PublicInbox::Linkify;
use PublicInbox::GzipFilter qw(gzf_maybe);
use PublicInbox::ConfigIter;
@@ -13,18 +13,19 @@ use PublicInbox::WwwStream;
use bytes (); # bytes::length
sub ibx_entry {
- my ($ctx, $ibx) = @_;
- my $mtime = $ibx->modified;
- my $ts = fmt_ts($mtime);
+ my ($ctx, $ibx, $ce) = @_;
+ $ce->{description} //= $ibx->description;
+ my $ts = fmt_ts($ce->{-modified} //= $ibx->modified);
my $url = prurl($ctx->{env}, $ibx->{url});
my $tmp = <<"";
* $ts - $url
- ${\$ibx->description}
+ $ce->{description}
if (defined(my $info_url = $ibx->{infourl})) {
$tmp .= ' ' . prurl($ctx->{env}, $info_url) . "\n";
}
- push @{$ctx->{-list}}, [ $mtime, $tmp ];
+ push(@{$ctx->{-list}}, (scalar(@_) == 3 ? # $misc in use, already sorted
+ $tmp : [ $ce->{-modified}, $tmp ] ));
}
sub list_match_i { # ConfigIter callback
@@ -41,7 +42,7 @@ sub list_match_i { # ConfigIter callback
}
}
-sub url_regexp {
+sub url_filter {
my ($ctx, $key, $default) = @_;
$key //= 'publicInbox.wwwListing';
$default //= '404';
@@ -50,9 +51,9 @@ again:
if ($v eq 'match=domain') {
my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME};
$h =~ s/:[0-9]+\z//;
- qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i;
+ (qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i, "url:$h");
} elsif ($v eq 'all') {
- qr/./;
+ (qr/./, undef);
} elsif ($v eq '404') {
undef;
} else {
@@ -67,22 +68,122 @@ EOF
sub hide_key { 'www' }
+sub add_misc_ibx { # MiscSearch->retry_reopen callback
+ my ($misc, $ctx, $re, $qs) = @_;
+ require PublicInbox::SearchQuery;
+ my $q = $ctx->{-sq} = PublicInbox::SearchQuery->new($ctx->{qp});
+ my $o = $q->{o};
+ my ($asc, $min, $max);
+ if ($o < 0) {
+ $asc = 1;
+ $o = -($o + 1); # so [-1] is the last element, like Perl lists
+ }
+ my $r = $q->{r};
+ my $opt = {
+ offset => $o,
+ asc => $asc,
+ relevance => $r,
+ limit => $q->{l}
+ };
+ $qs .= ' type:inbox';
+ if (my $user_query = $q->{'q'}) {
+ $qs = "( $qs ) AND ( $user_query )";
+ }
+ my $mset = $misc->mset($qs, $opt); # sorts by $MODIFIED (mtime)
+ $ctx->{-list} = [];
+ my $pi_cfg = $ctx->{www}->{pi_cfg};
+ for my $mi ($mset->items) {
+ my $doc = $mi->get_document;
+ my ($eidx_key) = PublicInbox::Search::xap_terms('Q', $doc);
+ $eidx_key // next;
+ my $ibx = $pi_cfg->lookup_eidx_key($eidx_key) // next;
+ next if $ibx->{-hide}->{$ctx->hide_key};
+ grep(/$re/, @{$ibx->{url}}) or next;
+ $ctx->ibx_entry($ibx, $misc->doc2ibx_cache_ent($doc));
+ if ($r) { # for descriptions in search_nav_bot
+ my $pct = PublicInbox::Search::get_pct($mi);
+ # only when sorting by relevance, ->items is always
+ # ordered descending:
+ $max //= $pct;
+ $min = $pct;
+ }
+ }
+ if ($r) { # for descriptions in search_nav_bot
+ $q->{-min_pct} = $min;
+ $q->{-max_pct} = $max;
+ }
+ $ctx->{-mset} = $mset;
+ psgi_triple($ctx);
+}
+
sub response {
my ($class, $ctx) = @_;
bless $ctx, $class;
- if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) {
- $ALL->misc->reopen;
- }
- my $re = $ctx->url_regexp or return $ctx->psgi_triple;
- my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg},
+ my ($re, $qs) = $ctx->url_filter;
+ $re // return $ctx->psgi_triple;
+ if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { # fast path
+ $ALL->misc->reopen->retry_reopen(\&add_misc_ibx,
+ $ctx, $re, $qs);
+ } else { # slow path, no [extindex "all"] configured
+ my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg},
\&list_match_i, $re, $ctx);
- sub {
- $ctx->{-wcb} = $_[0]; # HTTP server callback
- $ctx->{env}->{'pi-httpd.async'} ?
- $iter->event_step : $iter->each_section;
+ sub {
+ $ctx->{-wcb} = $_[0]; # HTTP server callback
+ $ctx->{env}->{'pi-httpd.async'} ?
+ $iter->event_step : $iter->each_section;
+ }
}
}
+sub mset_footer ($$) {
+ my ($ctx, $mset) = @_;
+ # no footer if too few matches
+ return '' if $mset->get_matches_estimated == $mset->size;
+ require PublicInbox::SearchView;
+ PublicInbox::SearchView::search_nav_bot($mset, $ctx->{-sq});
+}
+
+sub mset_nav_top {
+ my ($ctx, $mset) = @_;
+ my $q = $ctx->{-sq};
+ my $qh = $q->{'q'} // '';
+ utf8::decode($qh);
+ $qh = ascii_html($qh);
+ $qh = qq[\nvalue="$qh"] if $qh ne '';
+ my $rv = <<EOM;
+<form
+action="./"><pre><input
+name=q
+type=text$qh /><input
+type=submit
+value="locate inbox" /></pre></form><pre>
+EOM
+ chomp $rv;
+ if (defined($q->{'q'})) {
+ my $initial_q = $ctx->{-uxs_retried};
+ if (defined $initial_q) {
+ my $rewritten = $q->{'q'};
+ utf8::decode($initial_q);
+ utf8::decode($rewritten);
+ $initial_q = ascii_html($initial_q);
+ $rewritten = ascii_html($rewritten);
+ $rv .= " Warning: Initial query:\n <b>$initial_q</b>\n";
+ $rv .= " returned no results, used:\n";
+ $rv .= " <b>$rewritten</b>\n instead\n\n";
+ }
+ $rv .= 'Search results ordered by [';
+ if ($q->{r}) {
+ my $d = $q->qs_html(r => 0);
+ $rv .= qq{<a\nhref="?$d">updated</a>|<b>relevance</b>};
+ } else {
+ my $d = $q->qs_html(r => 1);
+ $rv .= qq{<b>updated</b>|<a\nhref="?$d">relevance</a>};
+ }
+ $rv .= ']';
+ }
+ $rv .= qq{</pre>};
+}
+
sub psgi_triple {
my ($ctx) = @_;
my $h = [ 'Content-Type', 'text/html; charset=UTF-8',
@@ -90,17 +191,23 @@ sub psgi_triple {
my $gzf = gzf_maybe($h, $ctx->{env});
$gzf->zmore('<html><head><title>' .
'public-inbox listing</title>' .
- '</head><body><pre>');
+ '</head><body>');
my $code = 404;
- if (my $list = $ctx->{-list}) {
+ if (my $list = delete $ctx->{-list}) {
+ my $mset = delete $ctx->{-mset};
$code = 200;
- # sort by ->modified
- @$list = map { $_->[1] } sort { $b->[0] <=> $a->[0] } @$list;
+ if ($mset) { # already sorted, so search bar:
+ $gzf->zmore(mset_nav_top($ctx, $mset));
+ } else { # sort config dump by ->modified
+ @$list = map { $_->[1] }
+ sort { $b->[0] <=> $a->[0] } @$list;
+ }
$list = join("\n", @$list);
my $l = PublicInbox::Linkify->new;
- $gzf->zmore($l->to_html($list));
+ $gzf->zmore('<pre>'.$l->to_html($list));
+ $gzf->zmore(mset_footer($ctx, $mset)) if $mset;
} else {
- $gzf->zmore('no inboxes, yet');
+ $gzf->zmore('<pre>no inboxes, yet');
}
my $out = $gzf->zflush('</pre><hr><pre>'.
PublicInbox::WwwStream::code_footer($ctx->{env}) .
next prev parent reply other threads:[~2021-06-23 11:14 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-06-23 11:14 [PATCH 0/3] some WWW search things Eric Wong
2021-06-23 11:14 ` [PATCH 1/3] search: make xap_terms easier-to-use and use it more Eric Wong
2021-06-23 11:14 ` Eric Wong [this message]
2021-06-23 11:14 ` [PATCH 3/3] www: do not warn on blank query parameters Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210623111422.30182-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).