From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id BE5EE1F487; Tue, 31 Mar 2020 08:49:36 +0000 (UTC) Date: Tue, 31 Mar 2020 08:49:36 +0000 From: Eric Wong To: meta@public-inbox.org Subject: [WIP 1/?] v2writable: index Message-IDs w/ spaces properly Message-ID: <20200331084936.GA26977@dcvr> References: <20200331083250.GA27164@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20200331083250.GA27164@dcvr> List-Id: Message-IDs can apparently contain spaces and other weird characters. Ensure we pass those properly to shard subprocesses when importing messages in parallel mode. Our NNTP parser does not deal with spaces in the Message-ID, yet, and I don't expect most NNTP clients to, either. --- lib/PublicInbox/SearchIdxShard.pm | 8 +++++--- t/v2writable.t | 11 ++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 1ea01095..06bcd403 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -69,8 +69,9 @@ sub shard_worker_loop ($$$$$) { $self->remove_by_oid($oid, $mid); } else { chomp $line; - my ($bytes, $num, $blob, $mid, $ds, $ts) = - split(/ /, $line); + # n.b. $mid may contain spaces(!) + my ($bytes, $num, $blob, $ds, $ts, $mid) = + split(/ /, $line, 6); $self->begin_txn_lazy; my $n = read($r, my $msg, $bytes) or die "read: $!\n"; $n == $bytes or die "short read: $n != $bytes\n"; @@ -93,7 +94,8 @@ sub shard_worker_loop ($$$$$) { sub index_raw { my ($self, $msgref, $mime, $smsg) = @_; if (my $w = $self->{w}) { - print $w join(' ', @$smsg{qw(bytes num blob mid ds ts)}), + # mid must be last, it can contain spaces (but not LF) + print $w join(' ', @$smsg{qw(bytes num blob ds ts mid)}), "\n", $$msgref or die "failed to write shard $!\n"; } else { $$msgref = undef; diff --git a/t/v2writable.t b/t/v2writable.t index cdcfe4d0..8167e4de 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -109,6 +109,11 @@ if ('ensure git configs are correct') { @mids = $mime->header_obj->header_raw('Message-Id'); like($mids[0], $sane_mid, 'mid was generated'); is(scalar(@mids), 1, 'new generated'); + + @warn = (); + $mime->header_set('Message-Id', ''); + ok($im->add($mime), 'message added with space in Message-Id'); + is_deeply([], \@warn); } { @@ -175,8 +180,12 @@ EOF is($uniq{$mid}++, 0, "MID for $num is unique in XOVER"); is_deeply($n->xhdr('Message-ID', $num), { $num => $mid }, "XHDR lookup OK on num $num"); + + # FIXME NNTP.pm doesn't handle spaces in Message-ID + next if $mid =~ / /; + is_deeply($n->xhdr('Message-ID', $mid), - { $mid => $mid }, "XHDR lookup OK on MID $num"); + { $mid => $mid }, "XHDR lookup OK on MID $mid ($num)"); } my %nn; foreach my $mid (@{$n->newnews(0, $group)}) {