From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 678031FFA6 for ; Fri, 18 Dec 2020 12:09:52 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 18/26] lei_store: keyword extraction from mbox and Maildir Date: Fri, 18 Dec 2020 12:09:42 +0000 Message-Id: <20201218120950.23272-19-e@80x24.org> In-Reply-To: <20201218120950.23272-1-e@80x24.org> References: <20201218120950.23272-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Dovecot, mutt, and likely much other software support mbox Status/X-Status headers. Ensure we have a way to extract these headers as JMAP-compatible keywords before removing them for git storage. ->add_eml now accepts setting keywords at import time, and will probably be called like this: $lst->add_eml($eml, $lst->mbox_keywords($eml)); $lst->add_eml($eml, $lst->maildir_keywords($fn)); --- lib/PublicInbox/LeiStore.pm | 23 ++++++++++++++++++++++- t/lei_store.t | 14 ++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index c95df785..553adbc8 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -162,8 +162,27 @@ sub remove_eml_keywords { \@docids; } +# cf: https://doc.dovecot.org/configuration_manual/mail_location/mbox/ +my %status2kw = (F => 'flagged', A => 'answered', R => 'seen', T => 'draft'); +# O (old/non-recent), and D (deleted) aren't in JMAP, +# so probably won't be supported by us. +sub mbox_keywords { + my $eml = $_[-1]; + my $s = "@{[$eml->header_raw('X-Status'),$eml->header_raw('Status')]}"; + my %kw; + $s =~ s/([FART])/$kw{$status2kw{$1}} = 1/sge; + sort(keys %kw); +} + +# cf: https://cr.yp.to/proto/maildir.html +my %c2kw = ('D' => 'draft', F => 'flagged', R => 'answered', S => 'seen'); +sub maildir_keywords { + $_[-1] =~ /:2,([A-Z]+)\z/i ? + sort(map { $c2kw{$_} // () } split(//, $1)) : (); +} + sub add_eml { - my ($self, $eml) = @_; + my ($self, $eml, @kw) = @_; my $eidx = eidx_init($self); my $oidx = $eidx->{oidx}; my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; @@ -178,6 +197,7 @@ sub add_eml { my $idx = $eidx->idx_shard($docid); $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); $idx->shard_add_eidx_info($docid, '.', $eml); # List-Id + $idx->shard_add_keywords($docid, @kw) if @kw; } } else { $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); @@ -185,6 +205,7 @@ sub add_eml { $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.'); my $idx = $eidx->idx_shard($smsg->{num}); $idx->index_raw($msgref, $eml, $smsg); + $idx->shard_add_keywords($smsg->{num}, @kw) if @kw; } $smsg->{blob} } diff --git a/t/lei_store.t b/t/lei_store.t index c18a9620..03ab5af6 100644 --- a/t/lei_store.t +++ b/t/lei_store.t @@ -19,6 +19,20 @@ like($oid, qr/\A[0-9a-f]+\z/, 'add returned OID'); my $eml = eml_load('t/data/0001.patch'); is($lst->add_eml($eml), undef, 'idempotent'); $lst->done; +is_deeply([$lst->mbox_keywords($eml)], [], 'no keywords'); +$eml->header_set('Status', 'RO'); +is_deeply([$lst->mbox_keywords($eml)], ['seen'], 'seen extracted'); +$eml->header_set('X-Status', 'A'); +is_deeply([$lst->mbox_keywords($eml)], [qw(answered seen)], + 'seen+answered extracted'); +$eml->header_set($_) for qw(Status X-Status); + +is_deeply([$lst->maildir_keywords('/foo:2,')], [], 'Maildir no keywords'); +is_deeply([$lst->maildir_keywords('/foo:2,S')], ['seen'], 'Maildir seen'); +is_deeply([$lst->maildir_keywords('/foo:2,RS')], ['answered', 'seen'], + 'Maildir answered + seen'); +is_deeply([$lst->maildir_keywords('/foo:2,RSZ')], ['answered', 'seen'], + 'Maildir answered + seen w/o Z'); { my $es = $lst->search; my $msgs = $es->over->query_xover(0, 1000);