* [PATCH 65/82] index: account for CRLF conversion when storing bytes
2020-06-10 7:03 6% [PATCH 00/82] public-inbox-imapd: read-only IMAP server Eric Wong
@ 2020-06-10 7:05 7% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-06-10 7:05 UTC (permalink / raw)
To: meta
NNTP and IMAP both require CRLF conversions on the wire.
They're also the only components which care about
$smsg->{bytes}, so store the CRLF-adjusted value in over.sqlite3
and Xapian DBs.
This will allow us to optimize the RFC822.SIZE fetch item in IMAP
without triggering the size mismatch errors that some clients raise
in their default configurations (e.g. Mail::IMAPClient; most other
clients do not).
It could also fix hypothetical problems with NNTP clients that
report discrepancies between overview and article data.
---
lib/PublicInbox/Import.pm | 2 +-
lib/PublicInbox/SearchIdx.pm | 12 ++++++++++++
lib/PublicInbox/SearchIdxShard.pm | 11 ++++++-----
lib/PublicInbox/V2Writable.pm | 10 ++++++----
t/import.t | 5 +++--
t/nntpd.t | 5 ++++-
t/search.t | 8 ++++++++
7 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index ab75aa00dc2..af35905be49 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -400,7 +400,7 @@ sub add {
# v2: we need this for Xapian
if ($smsg) {
$smsg->{blob} = $self->get_mark(":$blob");
- $smsg->{bytes} = $n;
+ $smsg->{raw_bytes} = $n;
$smsg->{-raw_email} = \$raw_email;
}
my $ref = $self->{ref};
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index a790ac4076a..85821ea706a 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -549,11 +549,23 @@ sub unindex_mm {
$self->{mm}->mid_delete(mid_mime($mime));
}
+# returns the number of bytes to add if given a non-CRLF arg
+sub crlf_adjust ($) {
+ if (index($_[0], "\r\n") < 0) {
+ # common case is LF-only, every \n needs an \r;
+ # so favor a cheap tr// over an expensive m//g
+ $_[0] =~ tr/\n/\n/;
+ } else { # count number of '\n' w/o '\r', expensive:
+ scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
+ }
+}
+
sub index_both { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
+ $size += crlf_adjust($$bref);
my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
my $self = $sync->{sidx};
my $eml = PublicInbox::Eml->new($bref);
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index c1f52d8b884..f7ba293ff5b 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -71,11 +71,11 @@ sub shard_worker_loop ($$$$$) {
} else {
chomp $line;
# n.b. $mid may contain spaces(!)
- my ($bytes, $num, $blob, $ds, $ts, $mid) =
- split(/ /, $line, 6);
+ my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) =
+ split(/ /, $line, 7);
$self->begin_txn_lazy;
- my $n = read($r, my $msg, $bytes) or die "read: $!\n";
- $n == $bytes or die "short read: $n != $bytes\n";
+ my $n = read($r, my $msg, $to_read) or die "read: $!\n";
+ $n == $to_read or die "short read: $n != $to_read\n";
my $mime = PublicInbox::Eml->new(\$msg);
my $smsg = bless {
bytes => $bytes,
@@ -96,7 +96,8 @@ sub index_raw {
my ($self, $msgref, $mime, $smsg) = @_;
if (my $w = $self->{w}) {
# mid must be last, it can contain spaces (but not LF)
- print $w join(' ', @$smsg{qw(bytes num blob ds ts mid)}),
+ print $w join(' ', @$smsg{qw(raw_bytes bytes
+ num blob ds ts mid)}),
"\n", $$msgref or die "failed to write shard $!\n";
} else {
$$msgref = undef;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 79bee7f9f3d..91379431633 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -155,10 +155,12 @@ sub add {
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
+ $smsg->{bytes} = $smsg->{raw_bytes} +
+ PublicInbox::SearchIdx::crlf_adjust($$msgref);
$self->{over}->add_overview($mime, $smsg);
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
- my $n = $self->{transact_bytes} += $smsg->{bytes};
+ my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
$n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
}
@@ -568,7 +570,7 @@ W: $list
for my $smsg (@$need_reindex) {
my $new_smsg = bless {
blob => $blob,
- bytes => $bytes,
+ raw_bytes => $bytes,
num => $smsg->{num},
mid => $smsg->{mid},
}, 'PublicInbox::Smsg';
@@ -962,7 +964,7 @@ sub reindex_oid_m ($$$$;$) {
}
$sync->{nr}++;
my $smsg = bless {
- bytes => $len,
+ raw_bytes => $len,
num => $num,
blob => $oid,
mid => $mid0,
@@ -1054,7 +1056,7 @@ sub reindex_oid ($$$$) {
die "failed to delete <$mid0> for article #$num\n";
$sync->{nr}++;
my $smsg = bless {
- bytes => $len,
+ raw_bytes => $len,
num => $num,
blob => $oid,
mid => $mid0,
diff --git a/t/import.t b/t/import.t
index f987b1141f7..abbc8229d0e 100644
--- a/t/import.t
+++ b/t/import.t
@@ -32,8 +32,9 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
if ($v2) {
like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
- is($mime->as_string, ${$smsg->{-raw_email}}, 'string matches');
- is($smsg->{bytes}, length(${$smsg->{-raw_email}}), 'length matches');
+ my $raw_email = $smsg->{-raw_email};
+ is($mime->as_string, $$raw_email, 'string matches');
+ is($smsg->{raw_bytes}, length($$raw_email), 'length matches');
my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
my $in = tempfile();
print $in $mime->as_string or die "write failed: $!";
diff --git a/t/nntpd.t b/t/nntpd.t
index eee67ea65bb..d2f31323115 100644
--- a/t/nntpd.t
+++ b/t/nntpd.t
@@ -73,7 +73,10 @@ EOF
my $list_id = $addr;
$list_id =~ s/@/./;
$mime->header_set('List-Id', "<$list_id>");
- $len = length($mime->as_string);
+ my $str = $mime->as_string;
+ $str =~ s/(?<!\r)\n/\r\n/sg;
+ $len = length($str);
+ undef $str;
$im->add($mime);
$im->done;
if ($version == 1) {
diff --git a/t/search.t b/t/search.t
index d4ca28c794f..82caf9e41c3 100644
--- a/t/search.t
+++ b/t/search.t
@@ -59,6 +59,14 @@ sub oct_is ($$$) {
}
}
+{
+ my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust;
+ is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed');
+ is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR');
+ is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');
+ is($crlf_adjust->("hi\nworld\r\n"), 1, 'CRLF/LF-mix 2 counts 1 CR');
+}
+
$ibx->with_umask(sub {
my $root = PublicInbox::Eml->new(<<'EOF');
Date: Fri, 02 Oct 1993 00:00:00 +0000
^ permalink raw reply related [relevance 7%]
* [PATCH 00/82] public-inbox-imapd: read-only IMAP server
@ 2020-06-10 7:03 6% Eric Wong
2020-06-10 7:05 7% ` [PATCH 65/82] index: account for CRLF conversion when storing bytes Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-06-10 7:03 UTC (permalink / raw)
To: meta
So I finally wrote my first IMAP server! And I'm actually
fairly satisfied with how it's turning out to support a bunch
of other performance + scalability work I've wanted to do.
Some previous notes here:
https://public-inbox.org/meta/20200609113442.GA16856@dcvr/
I finally seem to have gotten it to play nicely with mutt header
caching, so it's fit for public consumption :)
imaps://news.public-inbox.org/INBOX.comp.mail.public-inbox.meta.0
You can use any username+password, and AUTH=ANONYMOUS also
works if your client does that.
It doesn't support UTF-7 (mailbox names) or advertise UTF-8
in CAPABILITIES, yet; I still have RFCs to read :P
And there's a bunch of new things which could use some
testing from non-mutt/mbsync/offlineimap users.
Maybe you'll find some client-side bugs like I did :P
v1 reindexing also gets a little bit of parallelism :)
Anyways, I'll probably be porting some of the scalability
and slow-storage work to older parts of the code before
fiddling with more IMAP extensions.
Eric Wong (82):
doc: add some IMAP standards
nntpd: restrict allowed newsgroup names
preliminary imap server implementation
inboxidle: new class to detect inbox changes
imap: support IDLE
msgmap: split ->max into its own method
imap: delay InboxIdle start, support refresh
imap: implement STATUS command
imap: use Text::ParseWords::parse_line to handle quoted words
imap: support LIST command
t/imapd: support FakeInotify and KQNotify
imap: support fetch for BODYSTRUCTURE and BODY
eml: each_part: single part $idx is 1
imap: allow fetch of partial of BODY[...] and headers
imap: always include `resp-text' in responses
imap: split out unit tests and benchmarks
imap: fix multi-message partial header fetches
imap: simplify partial fetch structure
imap: support sequence number FETCH
imap: do not include ".PEEK" in responses
imap: support the CLOSE command
imap: speed up HEADER.FIELDS[.NOT] range fetches
git: async: flatten the inflight array
git: do our own read buffering for cat-file
imap: use git-cat-file asynchronously
git: idle rbuf for async
imap: support LSUB command
imap: FETCH: support comma-delimited ranges
add imapd compression test
testcommon: tcp_(server|connect): BAIL_OUT on failure
*deflate: drop invalid comment about rbuf
imap: fix pipelining with async git
git: cat_async: provide requested OID + "missing" on missing blobs
git: move async_cat reference to PublicInbox::Git
git: async: automatic retry on alternates change
imapclient: wrapper for Mail::IMAPClient
xt: add imapd-validate and imapd-mbsync-oimap
imap: support out-of-bounds ranges
xt/perf-imap-list: time refresh_inboxlist
imap: case-insensitive mailbox name comparisons
imap: break giant inboxes into sub-inboxes of 50K messages
imap: start introducing iterative config reloading
imap: require ".$UID_MIN-$UID_END" suffix
imapd: ensure LIST is sorted alphabetically, for now
imap: omit $UID_END from mailbox name, use index
t/config.t: always compare against git bool behavior
xt/*: show some tunable parameters
imap: STATUS and LIST are case-insensitive, too
imap: EXAMINE/STATUS: return correct counts
imap: avoid uninitialized warnings on incomplete commands
imap: start parsing out queries for SQLite and Xapian
imap: SEARCH: clamp results to the 50K UID range
imap: allow UID range search on timestamps
over: get_art: use dbh->prepare_cached
search: index byte size of a message for IMAP search
search: index UID for IMAP search, too
imap: remove dummies from sequence number FETCH
imap: compile UID FETCH to opcodes
imap: UID FETCH: optimize for smsg-only case
imap: UID FETCH: optimize (UID FLAGS) harder
imap: IDLE: avoid extraneous wakeups, keep-alive
imap: 30 minute auto-logout timer
imap: split ->logged_in attribute into a separate class
searchidx: v1 (re)-index uses git asynchronously
index: account for CRLF conversion when storing bytes
imap: rely on smsg->{bytes} for RFC822.SIZE
imap: UID FETCH requires at least one data item
imap: LIST shows "INBOX" in all caps
imap: support 8000 octet lines
imap: reinstate some message sequence number support
imap: cleanup ->{uid_base} usage
imap: FETCH: more granular CRLF conversion
imap: further speed up HEADER.FIELDS FETCH requests
imap: FETCH: try to make fake MSNs sequentially
imap: STATUS/EXAMINE: rely on SQLite overview
imap: UID SEARCH: support multiple ranges
imap: wire up Xapian search, msn SEARCH and multiple ranges
imap: misc cleanups and notes
imapd: don't bother sorting LIST output
imap: drop non-UID SEARCH for now
over: uid_range: remove LIMIT
imap: FETCH: proper MSN => UID mapping for requests
Documentation/public-inbox-imapd.pod | 91 ++
Documentation/standards.perl | 10 +
MANIFEST | 18 +
lib/PublicInbox/Config.pm | 18 +
lib/PublicInbox/Daemon.pm | 24 +-
lib/PublicInbox/DummyInbox.pm | 22 +
lib/PublicInbox/Eml.pm | 9 +-
lib/PublicInbox/FakeInotify.pm | 59 ++
lib/PublicInbox/Git.pm | 163 +--
lib/PublicInbox/GitAsyncCat.pm | 51 +
lib/PublicInbox/IMAP.pm | 1397 ++++++++++++++++++++++++++
lib/PublicInbox/IMAPClient.pm | 119 +++
lib/PublicInbox/IMAPD.pm | 114 +++
lib/PublicInbox/IMAPdeflate.pm | 126 +++
lib/PublicInbox/Import.pm | 2 +-
lib/PublicInbox/In2Tie.pm | 17 +
lib/PublicInbox/Inbox.pm | 33 +-
lib/PublicInbox/InboxIdle.pm | 79 ++
lib/PublicInbox/KQNotify.pm | 66 ++
lib/PublicInbox/Lock.pm | 7 +
lib/PublicInbox/MsgIter.pm | 2 +-
lib/PublicInbox/Msgmap.pm | 20 +-
lib/PublicInbox/NNTPD.pm | 12 +-
lib/PublicInbox/NNTPdeflate.pm | 1 -
lib/PublicInbox/Over.pm | 50 +-
lib/PublicInbox/Search.pm | 32 +-
lib/PublicInbox/SearchIdx.pm | 89 +-
lib/PublicInbox/SearchIdxShard.pm | 11 +-
lib/PublicInbox/Smsg.pm | 8 +-
lib/PublicInbox/TestCommon.pm | 7 +-
lib/PublicInbox/V2Writable.pm | 10 +-
script/public-inbox-imapd | 14 +
t/config.t | 15 +-
t/eml.t | 2 +-
t/git.t | 40 +-
t/imap.t | 133 +++
t/imapd-tls.t | 204 ++++
t/imapd.t | 398 ++++++++
t/import.t | 5 +-
t/inbox_idle.t | 72 ++
t/nntpd.t | 5 +-
t/over.t | 3 +
t/search.t | 19 +
xt/cmp-msgstr.t | 1 -
xt/cmp-msgview.t | 1 -
xt/eml_check_limits.t | 6 +-
xt/git_async_cmp.t | 2 +-
xt/imapd-mbsync-oimap.t | 132 +++
xt/imapd-validate.t | 177 ++++
xt/mem-msgview.t | 1 +
xt/msgtime_cmp.t | 1 -
xt/perf-msgview.t | 1 -
52 files changed, 3718 insertions(+), 181 deletions(-)
create mode 100644 Documentation/public-inbox-imapd.pod
create mode 100644 lib/PublicInbox/DummyInbox.pm
create mode 100644 lib/PublicInbox/FakeInotify.pm
create mode 100644 lib/PublicInbox/GitAsyncCat.pm
create mode 100644 lib/PublicInbox/IMAP.pm
create mode 100644 lib/PublicInbox/IMAPClient.pm
create mode 100644 lib/PublicInbox/IMAPD.pm
create mode 100644 lib/PublicInbox/IMAPdeflate.pm
create mode 100644 lib/PublicInbox/In2Tie.pm
create mode 100644 lib/PublicInbox/InboxIdle.pm
create mode 100644 lib/PublicInbox/KQNotify.pm
create mode 100644 script/public-inbox-imapd
create mode 100644 t/imap.t
create mode 100644 t/imapd-tls.t
create mode 100644 t/imapd.t
create mode 100644 t/inbox_idle.t
create mode 100644 xt/imapd-mbsync-oimap.t
create mode 100644 xt/imapd-validate.t
^ permalink raw reply [relevance 6%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-06-10 7:03 6% [PATCH 00/82] public-inbox-imapd: read-only IMAP server Eric Wong
2020-06-10 7:05 7% ` [PATCH 65/82] index: account for CRLF conversion when storing bytes Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).