From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A81181F601 for ; Wed, 3 Aug 2022 20:03:57 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1659557037; bh=kumPd8NNLV2mRgtrLc2oJI7hpXbLZLrVY7XS+erUyIQ=; h=From:To:Subject:Date:In-Reply-To:References:From; b=HdWi/H6r3skQchz1unHKjX3YcsG9XB5xRb/8UDhYjzXpMl/MQq34zhjvgX9doUVPu bEzrSrnTs40g0rezBu5Xp/DUhVJXMGai6BctbqG+dXO0eA/1tuMRBr9h9R5kd+wnqz j0uY7vF+pDgRLUyf9wIan+1C5OJJFY5/ZFxdupro= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/4] miscidx: index inbox min/max article numbers Date: Wed, 3 Aug 2022 20:03:55 +0000 Message-Id: <20220803200357.1322670-3-e@80x24.org> In-Reply-To: <20220803200357.1322670-1-e@80x24.org> References: <20220803200357.1322670-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This will be used to speed up NNTP group listings and IMAP startup with thousands of inboxes. --- lib/PublicInbox/Inbox.pm | 10 ++++++++++ lib/PublicInbox/MiscIdx.pm | 8 ++++++-- lib/PublicInbox/MiscSearch.pm | 21 ++++++++++++--------- lib/PublicInbox/Msgmap.pm | 12 ++++++++---- lib/PublicInbox/Search.pm | 3 ++- 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 0ad68810..3f70e69d 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -409,6 +409,16 @@ sub uidvalidity { $_[0]->{uidvalidity} //= eval { $_[0]->mm->created_at } } sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } +# only used by NNTP, so we need ->mm anyways +sub art_min { $_[0]->{-art_min} //= eval { $_[0]->mm(1)->min } } + +# used by IMAP, too, which tries to avoid ->mm (but ->{mm} is likely +# faster since it's smaller iff available) +sub art_max { + $_[0]->{-art_max} //= eval { $_[0]->{mm}->max } // + eval { $_[0]->over(1)->max }; +} + sub mailboxid { # rfc 8474, 8620, 8621 my ($self, $imap_slice) = @_; my $pfx = defined($imap_slice) ? $self->{newsgroup} : $self->{name}; diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 5faf5c66..76b33b16 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -108,12 +108,16 @@ EOF $doc->add_boolean_term('Q'.$eidx_key); # uniQue id $doc->add_boolean_term('T'.'inbox'); # Type + # force reread from disk, {description} could be loaded from {misc} + delete @$ibx{qw(-art_min -art_max description)}; if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { $doc->add_boolean_term('T'.'newsgroup'); # additional Type + my $n = $ibx->art_min; + add_val($doc, $PublicInbox::MiscSearch::ART_MIN, $n) if $n; + $n = $ibx->art_max; + add_val($doc, $PublicInbox::MiscSearch::ART_MAX, $n) if $n; } - # force reread from disk, {description} could be loaded from {misc} - delete $ibx->{description}; my $desc = $ibx->description; # description = S/Subject (or title) diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index c6d2a062..5fb47d03 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # read-only counterpart to MiscIdx @@ -11,6 +11,8 @@ my $json; # Xapian value columns: our $MODIFIED = 0; our $UIDVALIDITY = 1; # (created time) +our $ART_MIN = 2; # NNTP article number +our $ART_MAX = 3; # NNTP article number # avoid conflicting with message Search::prob_prefix for UI/UX reasons my %PROB_PREFIX = ( @@ -87,14 +89,13 @@ sub ibx_data_once { my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private my $head = $xdb->postlist_begin($term); my $tail = $xdb->postlist_end($term); - if ($head != $tail) { - my $doc = $xdb->get_document($head->get_docid); - $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); - $ibx->{-modified} = int_val($doc, $MODIFIED); - $doc->get_data; - } else { - undef; - } + return if $head == $tail; + my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); + $ibx->{-art_min} = int_val($doc, $ART_MIN); + $ibx->{-art_max} = int_val($doc, $ART_MAX); + $doc->get_data; } sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc) @@ -109,6 +110,8 @@ sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc) { uidvalidity => int_val($doc, $UIDVALIDITY), -modified => int_val($doc, $MODIFIED), + -art_min => int_val($doc, $ART_MIN), # may be undef + -art_max => int_val($doc, $ART_MAX), # may be undef # extract description from manifest.js.gz epoch description description => $d }; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index 1041cd17..cb4bb295 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -144,13 +144,17 @@ sub max { $sth->fetchrow_array // 0; } -sub minmax { - # breaking MIN and MAX into separate queries speeds up from 250ms - # to around 700us with 2.7million messages. +sub min { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1); $sth->execute; - ($sth->fetchrow_array // 0, max($_[0])); + $sth->fetchrow_array // 0; +} + +sub minmax { + # breaking MIN and MAX into separate queries speeds up from 250ms + # to around 700us with 2.7million messages. + (min($_[0]), max($_[0])); } sub mid_delete { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b6141f68..2feb3e13 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -543,9 +543,10 @@ sub help { \@ret; } +# always returns a scalar value sub int_val ($$) { my ($doc, $col) = @_; - my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian sortable_unserialise($val) + 0; # PV => IV conversion }