From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 2/4] miscidx: index inbox min/max article numbers
Date: Wed, 3 Aug 2022 20:03:55 +0000 [thread overview]
Message-ID: <20220803200357.1322670-3-e@80x24.org> (raw)
In-Reply-To: <20220803200357.1322670-1-e@80x24.org>
This will be used to speed up NNTP group listings and IMAP startup
with thousands of inboxes.
---
lib/PublicInbox/Inbox.pm | 10 ++++++++++
lib/PublicInbox/MiscIdx.pm | 8 ++++++--
lib/PublicInbox/MiscSearch.pm | 21 ++++++++++++---------
lib/PublicInbox/Msgmap.pm | 12 ++++++++----
lib/PublicInbox/Search.pm | 3 ++-
5 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 0ad68810..3f70e69d 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -409,6 +409,16 @@ sub uidvalidity { $_[0]->{uidvalidity} //= eval { $_[0]->mm->created_at } }
sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} }
+# only used by NNTP, so we need ->mm anyway
+sub art_min { $_[0]->{-art_min} //= eval { $_[0]->mm(1)->min } }
+
+# used by IMAP, too, which tries to avoid ->mm (but ->{mm} is preferred
+# when it is already loaded, since it is smaller and likely faster)
+sub art_max {
+ $_[0]->{-art_max} //= eval { $_[0]->{mm}->max } //
+ eval { $_[0]->over(1)->max };
+}
+
sub mailboxid { # rfc 8474, 8620, 8621
my ($self, $imap_slice) = @_;
my $pfx = defined($imap_slice) ? $self->{newsgroup} : $self->{name};
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 5faf5c66..76b33b16 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -108,12 +108,16 @@ EOF
$doc->add_boolean_term('Q'.$eidx_key); # uniQue id
$doc->add_boolean_term('T'.'inbox'); # Type
+ # force reread from disk, {description} could be loaded from {misc}
+ delete @$ibx{qw(-art_min -art_max description)};
if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) {
$doc->add_boolean_term('T'.'newsgroup'); # additional Type
+ my $n = $ibx->art_min;
+ add_val($doc, $PublicInbox::MiscSearch::ART_MIN, $n) if $n;
+ $n = $ibx->art_max;
+ add_val($doc, $PublicInbox::MiscSearch::ART_MAX, $n) if $n;
}
- # force reread from disk, {description} could be loaded from {misc}
- delete $ibx->{description};
my $desc = $ibx->description;
# description = S/Subject (or title)
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index c6d2a062..5fb47d03 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# read-only counterpart to MiscIdx
@@ -11,6 +11,8 @@ my $json;
# Xapian value columns:
our $MODIFIED = 0;
our $UIDVALIDITY = 1; # (created time)
+our $ART_MIN = 2; # NNTP article number
+our $ART_MAX = 3; # NNTP article number
# avoid conflicting with message Search::prob_prefix for UI/UX reasons
my %PROB_PREFIX = (
@@ -87,14 +89,13 @@ sub ibx_data_once {
my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private
my $head = $xdb->postlist_begin($term);
my $tail = $xdb->postlist_end($term);
- if ($head != $tail) {
- my $doc = $xdb->get_document($head->get_docid);
- $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY);
- $ibx->{-modified} = int_val($doc, $MODIFIED);
- $doc->get_data;
- } else {
- undef;
- }
+ return if $head == $tail;
+ my $doc = $xdb->get_document($head->get_docid);
+ $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY);
+ $ibx->{-modified} = int_val($doc, $MODIFIED);
+ $ibx->{-art_min} = int_val($doc, $ART_MIN);
+ $ibx->{-art_max} = int_val($doc, $ART_MAX);
+ $doc->get_data;
}
sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc)
@@ -109,6 +110,8 @@ sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc)
{
uidvalidity => int_val($doc, $UIDVALIDITY),
-modified => int_val($doc, $MODIFIED),
+ -art_min => int_val($doc, $ART_MIN), # may be undef
+ -art_max => int_val($doc, $ART_MAX), # may be undef
# extract description from manifest.js.gz epoch description
description => $d
};
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 1041cd17..cb4bb295 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -144,13 +144,17 @@ sub max {
$sth->fetchrow_array // 0;
}
-sub minmax {
- # breaking MIN and MAX into separate queries speeds up from 250ms
- # to around 700us with 2.7million messages.
+sub min {
my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap',
undef, 1);
$sth->execute;
- ($sth->fetchrow_array // 0, max($_[0]));
+ $sth->fetchrow_array // 0;
+}
+
+sub minmax {
+ # breaking MIN and MAX into separate queries speeds up from 250ms
+ # to around 700us with 2.7million messages.
+ (min($_[0]), max($_[0]));
}
sub mid_delete {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index b6141f68..2feb3e13 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -543,9 +543,10 @@ sub help {
\@ret;
}
+# always returns a scalar value
sub int_val ($$) {
my ($doc, $col) = @_;
- my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
+ my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian
sortable_unserialise($val) + 0; # PV => IV conversion
}
next prev parent reply other threads:[~2022-08-03 20:03 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-03 20:03 [PATCH 0/4] use ALL to speedup -nntpd and -imapd Eric Wong
2022-08-03 20:03 ` [PATCH 1/4] nntpd: do not delete newsgroup name from inbox object Eric Wong
2022-08-03 20:03 ` Eric Wong [this message]
2022-08-03 20:03 ` [PATCH 3/4] nntp: speed up group listings via ->ALL->misc Eric Wong
2022-08-03 20:03 ` [PATCH 4/4] imapd: use nntpd_cache to speed up startup/reload time Eric Wong
2022-08-03 23:50 ` [PATCH 0/4] use ALL to speedup -nntpd and -imapd Kyle Meyer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220803200357.1322670-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).