From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 8929B1FA1A for ; Thu, 14 Jan 2021 07:06:28 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 07/14] lei_dedupe+shared_kv: ensure round-tripping serialization Date: Wed, 13 Jan 2021 19:06:20 -1200 Message-Id: <20210114070627.18195-8-e@80x24.org> In-Reply-To: <20210114070627.18195-1-e@80x24.org> References: <20210114070627.18195-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We'll be passing these objects via PublicInbox::IPC which uses Storable (or Sereal), so ensure they're safe to use after serialization. --- lib/PublicInbox/LeiDedupe.pm | 29 ++++++++++++++++------------- lib/PublicInbox/SharedKV.pm | 12 +++++++++--- t/lei_dedupe.t | 13 +++++++++++++ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index 58eee533..81754361 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -43,9 +43,9 @@ sub smsg_hash ($) { } # the paranoid option -sub dedupe_oid () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process +sub dedupe_oid ($) { + my ($skv) = @_; + (sub { # may be called in a child process my ($eml, $oid) = @_; $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); }, sub { @@ -55,9 +55,9 @@ sub dedupe_oid () { } # dangerous if there's duplicate messages with different Message-IDs -sub dedupe_mid () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process +sub dedupe_mid ($) { + my ($skv) = @_; + (sub { # may be called in a child process my ($eml, $oid) = @_; # TODO: lei will support non-public messages w/o Message-ID my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // @@ -73,9 +73,9 @@ sub dedupe_mid () { } # our default deduplication strategy (used by v2, also) -sub dedupe_content () { - my $skv = PublicInbox::SharedKV->new; - ($skv, sub { # may be called in a child process +sub dedupe_content ($) { + my ($skv) = @_; + (sub { # may be called in a child process my ($eml) = @_; # oid = $_[1], ignored $skv->set_maybe(content_hash($eml), ''); }, sub { @@ -86,7 +86,7 @@ sub dedupe_content () { # no deduplication at all sub true { 1 } -sub dedupe_none () { (undef, \&true, \&true) } +sub dedupe_none ($) { (\&true, \&true) } sub new { my ($cls, $lei, $dst) = @_; @@ -94,10 +94,12 @@ sub new { # allow "none" to bypass Eml->new if writing to directory: return if ($dd eq 'none' && substr($dst // '', -1) eq '/'); + my $m = "dedupe_$dd"; + $cls->can($m) or die "unsupported dedupe strategy: $dd\n"; + my $skv = $dd eq 'none' ? undef : PublicInbox::SharedKV->new; - my $dd_new = $cls->can("dedupe_$dd") // - die "unsupported dedupe strategy: $dd\n"; - bless [ $dd_new->() ], $cls; # [ $skv, $cb ] + # [ $skv, $eml_cb, $smsg_cb, "dedupe_$dd" ] + bless [ $skv, undef, undef, $m ], $cls; } # returns true on unseen messages according to the deduplication strategy, @@ -115,6 +117,7 @@ sub is_smsg_dup { sub prepare_dedupe { my ($self) = @_; my $skv = $self->[0]; + $self->[1] or @$self[1,2] = $self->can($self->[3])->($skv); $skv ? $skv->dbh : undef; } diff --git a/lib/PublicInbox/SharedKV.pm b/lib/PublicInbox/SharedKV.pm index d75d8998..072c94ca 100644 --- a/lib/PublicInbox/SharedKV.pm +++ b/lib/PublicInbox/SharedKV.pm @@ -8,9 +8,10 @@ package PublicInbox::SharedKV; use strict; use v5.10.1; use parent qw(PublicInbox::Lock); -use File::Temp 0.19 (); # 0.19 for ->newdir +use File::Temp qw(tempdir); use DBI (); use PublicInbox::Spawn; +use File::Path qw(rmtree); sub dbh { my ($self, $lock) = @_; @@ -44,8 +45,8 @@ sub new { my ($cls, $dir, $base, $opt) = @_; my $self = bless { opt => $opt }, $cls; unless (defined $dir) { - $self->{tmp} = File::Temp->newdir('kv-XXXXXX', TMPDIR => 1); - $dir = $self->{tmp}->dirname; + $self->{tmpdir} = $dir = tempdir('skv-XXXXXX', TMPDIR => 1); + $self->{tmpid} = "$$.$self"; } -d $dir or mkdir($dir) or die "mkdir($dir): $!"; $base //= ''; @@ -145,4 +146,9 @@ SELECT COUNT(k) FROM kv $sth->fetchrow_array; } +sub DESTROY { + my ($self) = @_; + rmtree($self->{tmpdir}) if ($self->{tmpid} // '') eq "$$.$self"; +} + 1; diff --git a/t/lei_dedupe.t b/t/lei_dedupe.t index 6e971b9b..bcb06a0a 100644 --- a/t/lei_dedupe.t +++ b/t/lei_dedupe.t @@ -17,8 +17,18 @@ my $smsg = bless { ds => time }, 'PublicInbox::Smsg'; $smsg->populate($eml); $smsg->{$_} //= '' for (qw(to cc references)) ; +my $check_storable = sub { + my ($x) = @_; + SKIP: { + require_mods('Storable', 1); + my $dup = Storable::thaw(Storable::freeze($x)); + is_deeply($dup, $x, "$x->[3] round-trips through storable"); + } +}; + my $lei = { opt => { dedupe => 'none' } }; my $dd = PublicInbox::LeiDedupe->new($lei); +$check_storable->($dd); $dd->prepare_dedupe; ok(!$dd->is_dup($eml), '1st is_dup w/o dedupe'); ok(!$dd->is_dup($eml), '2nd is_dup w/o dedupe'); @@ -29,6 +39,7 @@ ok(!$dd->is_smsg_dup($smsg), 'smsg dedupe none 2'); for my $strat (undef, 'content') { $lei->{opt}->{dedupe} = $strat; $dd = PublicInbox::LeiDedupe->new($lei); + $check_storable->($dd); $dd->prepare_dedupe; my $desc = $strat // 'default'; ok(!$dd->is_dup($eml), "1st is_dup with $desc dedupe"); @@ -43,6 +54,7 @@ like($@, qr/unsupported.*bogus/, 'died on bogus strategy'); $lei->{opt}->{dedupe} = 'mid'; $dd = PublicInbox::LeiDedupe->new($lei); +$check_storable->($dd); $dd->prepare_dedupe; ok(!$dd->is_dup($eml), '1st is_dup with mid dedupe'); ok($dd->is_dup($eml), '2nd seen with mid dedupe'); @@ -52,6 +64,7 @@ ok($dd->is_smsg_dup($smsg), 'smsg mid dedupe reject'); $lei->{opt}->{dedupe} = 'oid'; $dd = PublicInbox::LeiDedupe->new($lei); +$check_storable->($dd); $dd->prepare_dedupe; # --augment won't have OIDs: