about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong <e@yhbt.net> 2020-06-10 07:05:02 +0000
committer Eric Wong <e@yhbt.net> 2020-06-13 07:55:45 +0000
commit cd389aac52936c82f3416b3ceefe21e1250b8a3e (patch)
tree e68b068c3ec0d7aba3df80304fcb6cb398c1d589 /lib
parent f77b21173e730a3daa8f5eed6d73835a682b3f04 (diff)
download public-inbox-cd389aac52936c82f3416b3ceefe21e1250b8a3e.tar.gz
NNTP and IMAP both require CRLF conversions on the wire.
They're also the only components which care about
$smsg->{bytes}, so store the CRLF-adjusted value in over.sqlite3
and Xapian DBs.

This will allow us to optimize the RFC822.SIZE fetch item in IMAP
without triggering the size mismatch errors that some clients
(e.g. Mail::IMAPClient) raise in their default configuration;
most other clients are unaffected.

It could also fix hypothetical problems with NNTP clients that
report discrepancies between overview and article data.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Import.pm 2
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 12
-rw-r--r-- lib/PublicInbox/SearchIdxShard.pm 11
-rw-r--r-- lib/PublicInbox/V2Writable.pm 10
4 files changed, 25 insertions, 10 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index ab75aa00..af35905b 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -400,7 +400,7 @@ sub add {
         # v2: we need this for Xapian
         if ($smsg) {
                 $smsg->{blob} = $self->get_mark(":$blob");
-                $smsg->{bytes} = $n;
+                $smsg->{raw_bytes} = $n;
                 $smsg->{-raw_email} = \$raw_email;
         }
         my $ref = $self->{ref};
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index a790ac40..85821ea7 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -549,11 +549,23 @@ sub unindex_mm {
         $self->{mm}->mid_delete(mid_mime($mime));
 }
 
+# returns the count of LFs lacking a preceding CR (bytes a CRLF conversion adds)
+sub crlf_adjust ($) {
+        if (index($_[0], "\r\n") < 0) {
+                # no CRLF anywhere (common, LF-only): every \n needs an \r, so
+                # count all \n with a cheap tr// instead of an expensive m//g
+                $_[0] =~ tr/\n/\n/;
+        } else { # some CRLF present: count only '\n' w/o '\r', expensive:
+                scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
+        }
+}
+
 sub index_both { # git->cat_async callback
         my ($bref, $oid, $type, $size, $sync) = @_;
         my ($nr, $max) = @$sync{qw(nr max)};
         ++$$nr;
         $$max -= $size;
+        $size += crlf_adjust($$bref);
         my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
         my $self = $sync->{sidx};
         my $eml = PublicInbox::Eml->new($bref);
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index c1f52d8b..f7ba293f 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -71,11 +71,11 @@ sub shard_worker_loop ($$$$$) {
                 } else {
                         chomp $line;
                         # n.b. $mid may contain spaces(!)
-                        my ($bytes, $num, $blob, $ds, $ts, $mid) =
-                                                        split(/ /, $line, 6);
+                        my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) =
+                                                        split(/ /, $line, 7);
                         $self->begin_txn_lazy;
-                        my $n = read($r, my $msg, $bytes) or die "read: $!\n";
-                        $n == $bytes or die "short read: $n != $bytes\n";
+                        my $n = read($r, my $msg, $to_read) or die "read: $!\n";
+                        $n == $to_read or die "short read: $n != $to_read\n";
                         my $mime = PublicInbox::Eml->new(\$msg);
                         my $smsg = bless {
                                 bytes => $bytes,
@@ -96,7 +96,8 @@ sub index_raw {
         my ($self, $msgref, $mime, $smsg) = @_;
         if (my $w = $self->{w}) {
                 # mid must be last, it can contain spaces (but not LF)
-                print $w join(' ', @$smsg{qw(bytes num blob ds ts mid)}),
+                print $w join(' ', @$smsg{qw(raw_bytes bytes
+                                                num blob ds ts mid)}),
                         "\n", $$msgref or die "failed to write shard $!\n";
         } else {
                 $$msgref = undef;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 79bee7f9..91379431 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -155,10 +155,12 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
         my ($self, $msgref, $mime, $smsg) = @_;
+        $smsg->{bytes} = $smsg->{raw_bytes} +
+                        PublicInbox::SearchIdx::crlf_adjust($$msgref);
         $self->{over}->add_overview($mime, $smsg);
         my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
         $idx->index_raw($msgref, $mime, $smsg);
-        my $n = $self->{transact_bytes} += $smsg->{bytes};
+        my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
         $n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards});
 }
 
@@ -568,7 +570,7 @@ W: $list
         for my $smsg (@$need_reindex) {
                 my $new_smsg = bless {
                         blob => $blob,
-                        bytes => $bytes,
+                        raw_bytes => $bytes,
                         num => $smsg->{num},
                         mid => $smsg->{mid},
                 }, 'PublicInbox::Smsg';
@@ -962,7 +964,7 @@ sub reindex_oid_m ($$$$;$) {
         }
         $sync->{nr}++;
         my $smsg = bless {
-                bytes => $len,
+                raw_bytes => $len,
                 num => $num,
                 blob => $oid,
                 mid => $mid0,
@@ -1054,7 +1056,7 @@ sub reindex_oid ($$$$) {
                 die "failed to delete <$mid0> for article #$num\n";
         $sync->{nr}++;
         my $smsg = bless {
-                bytes => $len,
+                raw_bytes => $len,
                 num => $num,
                 blob => $oid,
                 mid => $mid0,