From d2a7dcb58ffb9604b2023159431fcdc4871f368f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 31 Dec 2020 13:51:28 +0000 Subject: lei: implement various deduplication strategies For writing mboxes and Maildirs, users may wish to use stricter or looser deduplication strategies. This gives them more control. --- lib/PublicInbox/LeiDedupe.pm | 96 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 lib/PublicInbox/LeiDedupe.pm (limited to 'lib/PublicInbox/LeiDedupe.pm') diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm new file mode 100644 index 00000000..c6eb7196 --- /dev/null +++ b/lib/PublicInbox/LeiDedupe.pm @@ -0,0 +1,96 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +package PublicInbox::LeiDedupe; +use strict; +use v5.10.1; +use PublicInbox::SharedKV; +use PublicInbox::ContentHash qw(content_hash); + +# n.b. mutt sets most of these headers not sure about Bytes +our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); + +# best-effort regeneration of OID when augmenting existing results +sub _regen_oid ($) { + my ($eml) = @_; + my @stash; # stash away headers we shouldn't have in git + for my $k (@OID_IGNORE) { + my @v = $eml->header_raw($k) or next; + push @stash, [ $k, \@v ]; + $eml->header_set($k); # restore below + } + my $dig = Digest::SHA->new(1); # XXX SHA256 later + my $buf = $eml->as_string; + $dig->add('blob '.length($buf)."\0"); + $dig->add($buf); + undef $buf; + + for my $kv (@stash) { # restore stashed headers + my ($k, @v) = @$kv; + $eml->header_set($k, @v); + } + $dig->digest; +} + +sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } + +# the paranoid option +sub dedupe_oid () { + my $skv = PublicInbox::SharedKV->new; + ($skv, sub { # may be called in a child process + my ($eml, $oid) = @_; + $skv->set_maybe(_oidbin($oid) // _regen_oid($eml), ''); + }); +} + +# dangerous if there's duplicate messages with different Message-IDs +sub dedupe_mid () { + my $skv = PublicInbox::SharedKV->new; + ($skv, sub { # may be called in a child process + my ($eml, $oid) = @_; + # TODO: lei will support non-public messages w/o Message-ID + my $mid = $eml->header_raw('Message-ID') // _oidbin($oid) // + content_hash($eml); + $skv->set_maybe($mid, ''); + }); +} + +# our default deduplication strategy (used by v2, also) +sub dedupe_content () { + my $skv = PublicInbox::SharedKV->new; + ($skv, sub { # may be called in a child process + my ($eml) = @_; # oid = $_[1], ignored + $skv->set_maybe(content_hash($eml), ''); + }); +} + +# no deduplication at all +sub dedupe_none () { (undef, sub { 1 }) } + +sub new { + my ($cls, $lei) = @_; + my $dd = $lei->{opt}->{dedupe} // 'content'; + my $dd_new = $cls->can("dedupe_$dd") // + die "unsupported dedupe strategy: $dd\n"; + bless [ $dd_new->() ], $cls; # [ $skv, $cb ] +} + +# returns true on unseen messages according to the deduplication strategy, +# returns false if seen +sub is_dup { + my ($self, $eml, $oid) = @_; + !$self->[1]->($eml, $oid); +} + +sub prepare_dedupe { + my ($self) = @_; + my $skv = $self->[0]; + $skv ? $skv->dbh : undef; +} + +sub pause_dedupe { + my ($self) = @_; + my $skv = $self->[0]; + delete($skv->{dbh}) if $skv; +} + +1; -- cgit v1.2.3-24-ge0c7