From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 893D71FBFE for ; Wed, 10 Jun 2020 07:07:31 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 65/82] index: account for CRLF conversion when storing bytes Date: Wed, 10 Jun 2020 07:05:02 +0000 Message-Id: <20200610070519.18252-66-e@yhbt.net> In-Reply-To: <20200610070519.18252-1-e@yhbt.net> References: <20200610070519.18252-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: NNTP and IMAP both require CRLF conversions on the wire. They're also the only components which care about $smsg->{bytes}, so store the CRLF-adjusted value in over.sqlite3 and Xapian DBs.. This will allow us to optimize RFC822.SIZE fetch item in IMAP without triggering size mismatch errors in some clients' default configurations (e.g. Mail::IMAPClient), but not most others. It could also fix hypothetical problems with NNTP clients that report discrepancies between overview and article data. --- lib/PublicInbox/Import.pm | 2 +- lib/PublicInbox/SearchIdx.pm | 12 ++++++++++++ lib/PublicInbox/SearchIdxShard.pm | 11 ++++++----- lib/PublicInbox/V2Writable.pm | 10 ++++++---- t/import.t | 5 +++-- t/nntpd.t | 5 ++++- t/search.t | 8 ++++++++ 7 files changed, 40 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index ab75aa00dc2..af35905be49 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -400,7 +400,7 @@ sub add { # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{bytes} = $n; + $smsg->{raw_bytes} = $n; $smsg->{-raw_email} = \$raw_email; } my $ref = $self->{ref}; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index a790ac4076a..85821ea706a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -549,11 +549,23 @@ sub unindex_mm { $self->{mm}->mid_delete(mid_mime($mime)); } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; + $size += crlf_adjust($$bref); my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; my $self = $sync->{sidx}; my $eml = PublicInbox::Eml->new($bref); diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index c1f52d8b884..f7ba293ff5b 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -71,11 +71,11 @@ sub shard_worker_loop ($$$$$) { } else { chomp $line; # n.b. $mid may contain spaces(!) - my ($bytes, $num, $blob, $ds, $ts, $mid) = - split(/ /, $line, 6); + my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) = + split(/ /, $line, 7); $self->begin_txn_lazy; - my $n = read($r, my $msg, $bytes) or die "read: $!\n"; - $n == $bytes or die "short read: $n != $bytes\n"; + my $n = read($r, my $msg, $to_read) or die "read: $!\n"; + $n == $to_read or die "short read: $n != $to_read\n"; my $mime = PublicInbox::Eml->new(\$msg); my $smsg = bless { bytes => $bytes, @@ -96,7 +96,8 @@ sub index_raw { my ($self, $msgref, $mime, $smsg) = @_; if (my $w = $self->{w}) { # mid must be last, it can contain spaces (but not LF) - print $w join(' ', @$smsg{qw(bytes num blob ds ts mid)}), + print $w join(' ', @$smsg{qw(raw_bytes bytes + num blob ds ts mid)}), "\n", $$msgref or die "failed to write shard $!\n"; } else { $$msgref = undef; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 79bee7f9f3d..91379431633 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -155,10 +155,12 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; + $smsg->{bytes} = $smsg->{raw_bytes} + + PublicInbox::SearchIdx::crlf_adjust($$msgref); $self->{over}->add_overview($mime, $smsg); my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); $idx->index_raw($msgref, $mime, $smsg); - my $n = $self->{transact_bytes} += $smsg->{bytes}; + my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; $n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } @@ -568,7 +570,7 @@ W: $list for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - bytes => $bytes, + raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; @@ -962,7 +964,7 @@ sub reindex_oid_m ($$$$;$) { } $sync->{nr}++; my $smsg = bless { - bytes => $len, + raw_bytes => $len, num => $num, blob => $oid, mid => $mid0, @@ -1054,7 +1056,7 @@ sub reindex_oid ($$$$) { die "failed to delete <$mid0> for article #$num\n"; $sync->{nr}++; my $smsg = bless { - bytes => $len, + raw_bytes => $len, num => $num, blob => $oid, mid => $mid0, diff --git a/t/import.t b/t/import.t index f987b1141f7..abbc8229d0e 100644 --- a/t/import.t +++ b/t/import.t @@ -32,8 +32,9 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) { like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id'); - is($mime->as_string, ${$smsg->{-raw_email}}, 'string matches'); - is($smsg->{bytes}, length(${$smsg->{-raw_email}}), 'length matches'); + my $raw_email = $smsg->{-raw_email}; + is($mime->as_string, $$raw_email, 'string matches'); + is($smsg->{raw_bytes}, length($$raw_email), 'length matches'); my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); my $in = tempfile(); print $in $mime->as_string or die "write failed: $!"; diff --git a/t/nntpd.t b/t/nntpd.t index eee67ea65bb..d2f31323115 100644 --- a/t/nntpd.t +++ b/t/nntpd.t @@ -73,7 +73,10 @@ EOF my $list_id = $addr; $list_id =~ s/@/./; $mime->header_set('List-Id', "<$list_id>"); - $len = length($mime->as_string); + my $str = $mime->as_string; + $str =~ s/(?add($mime); $im->done; if ($version == 1) { diff --git a/t/search.t b/t/search.t index d4ca28c794f..82caf9e41c3 100644 --- a/t/search.t +++ b/t/search.t @@ -59,6 +59,14 @@ sub oct_is ($$$) { } } +{ + my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust; + is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed'); + is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR'); + is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR'); + is($crlf_adjust->("hi\nworld\r\n"), 1, 'CRLF/LF-mix 2 counts 1 CR'); +} + $ibx->with_umask(sub { my $root = PublicInbox::Eml->new(<<'EOF'); Date: Fri, 02 Oct 1993 00:00:00 +0000