From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5A7D41F922 for ; Sat, 27 Jun 2020 10:04:04 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 22/34] watch: support imap.fetchBatchSize parameter Date: Sat, 27 Jun 2020 10:03:48 +0000 Message-Id: <20200627100400.9871-23-e@yhbt.net> In-Reply-To: <20200627100400.9871-1-e@yhbt.net> References: <20200627100400.9871-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: IMAP allows retrieving multiple messages with a single command, and Mail::IMAPClient supports that. Unfortunately, fetching in batches means we slurp multiple messages into memory at once. This option lets users trade higher memory usage for fewer network round-trips. Ideally, we'd support pipelining, but AFAIK no widely installed Perl IMAP library supports it. 
--- lib/PublicInbox/WatchMaildir.pm | 47 ++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index d492e5d65b7..05aa6594147 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -274,6 +274,15 @@ sub imap_common_init ($) { $self->{imap_opt}->{$sec}->{poll_intvl} = $to if $to; $to = cfg_intvl($cfg, 'imap', 'IdleInterval', $sec, $url); $self->{imap_opt}->{$sec}->{idle_intvl} = $to if $to; + + my $key = lc("imap.$sec.fetchBatchSize"); + my $bs = $cfg->{lc($key)} // + $cfg->urlmatch('imap.fetchBatchSize', $url) // next; + if ($bs =~ /\A([0-9]+)\z/) { + $self->{imap_opt}->{$sec}->{batch_size} = $bs; + } else { + warn "W: $key=$bs is not an integer\n"; + } } $mic_args; } @@ -389,25 +398,31 @@ sub imap_fetch_all ($$$) { warn "I: $url fetching UID $l_uid:$r_uid\n"; $mic->Uid(1); # the default, we hope - my $uids; + my $bs = $self->{imap_opt}->{$sec}->{batch_size} // 1; my $req = $mic->imap4rev1 ? 'BODY.PEEK[]' : 'RFC822.PEEK'; + + # TODO: FLAGS may be useful for personal use my $key = $req; $key =~ s/\.PEEK//; - my $uid; + my ($uids, $batch); my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; local $SIG{__WARN__} = sub { - $uid //= -1; - $warn_cb->("$url UID:$uid\n"); + $batch //= '?'; + $warn_cb->("$url UID:$batch\n"); $warn_cb->(@_); }; my $err; do { + # I wish "UID FETCH $START:*" could work, but: + # 1) servers do not need to return results in any order + # 2) Mail::IMAPClient doesn't offer a streaming API $uids = $mic->search("UID $l_uid:*") or return "E: $url UID SEARCH $l_uid:* error: $!"; return if scalar(@$uids) == 0; # RFC 3501 doesn't seem to indicate order of UID SEARCH - # responses, so sort it ourselves + # responses, so sort it ourselves. Order matters so + # IMAPTracker can store the newest UID. @$uids = sort { $a <=> $b } @$uids; # Did we actually get new messages? 
@@ -416,17 +431,23 @@ sub imap_fetch_all ($$$) { $l_uid = $uids->[-1] + 1; # for next search my $last_uid; - while (defined(($uid = shift(@$uids)))) { - local $0 = "UID:$uid $mbx $sec"; - my $r = $mic->fetch_hash($uid, $req); + while (scalar @$uids) { + my @batch = splice(@$uids, 0, $bs); + $batch = join(',', @batch); + local $0 = "UID:$batch $mbx $sec"; + my $r = $mic->fetch_hash($batch, $req); unless ($r) { # network error? - $err = "E: $url UID FETCH $uid error: $!"; + $err = "E: $url UID FETCH $batch error: $!"; last; } - # messages get deleted, so holes appear - defined(my $raw = delete $r->{$uid}->{$key}) or next; - imap_import_msg($self, $url, $uid, \$raw); - $last_uid = $uid; + for my $uid (@batch) { + # messages get deleted, so holes appear + my $per_uid = delete $r->{$uid} // next; + my $raw = delete($per_uid->{$key}) // next; + imap_import_msg($self, $url, $uid, \$raw); + $last_uid = $uid; + last if $self->{quit}; + } last if $self->{quit}; } _done_for_now($self);