From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A84F21FA09 for ; Sun, 10 May 2020 22:37:15 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/5] rename "ContentId" to "ContentHash" Date: Sun, 10 May 2020 22:37:12 +0000 Message-Id: <20200510223715.19254-3-e@yhbt.net> In-Reply-To: <20200510223715.19254-1-e@yhbt.net> References: <20200510223715.19254-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: The old name may be confused with "Content-ID" as described in RFC 2392, so use an alternate name to avoid confusing future readers. --- Documentation/public-inbox-v2-format.pod | 12 ++--- MANIFEST | 4 +- .../{ContentId.pm => ContentHash.pm} | 8 ++-- lib/PublicInbox/Import.pm | 2 +- lib/PublicInbox/V2Writable.pm | 48 +++++++++---------- script/public-inbox-edit | 16 +++---- t/{content_id.t => content_hash.t} | 14 +++--- t/v1reindex.t | 2 +- t/v2reindex.t | 2 +- t/v2writable.t | 4 +- 10 files changed, 56 insertions(+), 56 deletions(-) rename lib/PublicInbox/{ContentId.pm => ContentHash.pm} (93%) rename t/{content_id.t => content_hash.t} (64%) diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod index d87a717d40b..9e284a75431 100644 --- a/Documentation/public-inbox-v2-format.pod +++ b/Documentation/public-inbox-v2-format.pod @@ -159,7 +159,7 @@ top-level of the directory. =head1 OBJECT IDENTIFIERS -There are three distinct type of identifiers. content_id is the +There are three distinct type of identifiers. content_hash is the new one for v2 and should make message removal and deduplication easier. object_id and Message-ID are already known. @@ -179,11 +179,11 @@ The email header; duplicates allowed for archival purposes. This remains a searchable field in Xapian. Note: it's possible for emails to have multiple Message-ID headers (and L had that bug for a bit); so we take all of them into account. -In case of conflicts detected by content_id below, we generate a new -Message-ID based on content_id; if the generated Message-ID still +In case of conflicts detected by content_hash below, we generate a new +Message-ID based on content_hash; if the generated Message-ID still conflicts, a random one is generated. -=item content_id +=item content_hash A hash of relevant headers and raw body content for purging of unwanted content. This is not stored anywhere, @@ -193,7 +193,7 @@ For now, the relevant headers are: Subject, From, Date, References, In-Reply-To, To, Cc -Received, List-Id, and similar headers are NOT part of content_id as +Received, List-Id, and similar headers are NOT part of content_hash as they differ across lists and we will want removal to be able to cross lists. @@ -203,7 +203,7 @@ raw body risks being broken by list signatures; but we can use filters (e.g. PublicInbox::Filter::Vger) to clean the body for imports. -content_id is SHA-256 for now; but can be changed at any time +content_hash is SHA-256 for now; but can be changed at any time without making DB changes. =back diff --git a/MANIFEST b/MANIFEST index b1512c7a919..7997bc9906c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -99,7 +99,7 @@ lib/PublicInbox/AdminEdit.pm lib/PublicInbox/AltId.pm lib/PublicInbox/Cgit.pm lib/PublicInbox/Config.pm -lib/PublicInbox/ContentId.pm +lib/PublicInbox/ContentHash.pm lib/PublicInbox/DS.pm lib/PublicInbox/DSKQXS.pm lib/PublicInbox/DSPoll.pm @@ -223,7 +223,7 @@ t/cgi.t t/check-www-inbox.perl t/config.t t/config_limiter.t -t/content_id.t +t/content_hash.t t/convert-compact.t t/data/0001.patch t/ds-kqxs.t diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentHash.pm similarity index 93% rename from lib/PublicInbox/ContentId.pm rename to lib/PublicInbox/ContentHash.pm index 8d77934f20a..420dc5e7c92 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -6,11 +6,11 @@ # This is not stored in any database anywhere and may change # as changes in duplicate detection are needed. # See L manpage for more details. -package PublicInbox::ContentId; +package PublicInbox::ContentHash; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/content_id content_digest/; +our @EXPORT_OK = qw/content_hash content_digest/; use PublicInbox::MID qw(mids references); use PublicInbox::MsgIter; @@ -60,7 +60,7 @@ sub content_digest ($) { # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same # in SearchIdx, so treat them the same for this: - # do NOT consider the Message-ID as part of the content_id + # do NOT consider the Message-ID as part of the content_hash # if we got here, we've already got Message-ID reuse my %seen = map { $_ => 1 } @{mids($hdr)}; foreach my $mid (@{references($hdr)}) { @@ -92,7 +92,7 @@ sub content_digest ($) { $dig; } -sub content_id ($) { +sub content_hash ($) { content_digest($_[0])->digest; } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 07d18599200..fc61d06207c 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -13,7 +13,7 @@ use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -use PublicInbox::ContentId qw(content_digest); +use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MDA; use PublicInbox::Eml; use POSIX qw(strftime); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f599e0a03d8..bf5a0df947a 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -13,7 +13,7 @@ use PublicInbox::Eml; use PublicInbox::Git; use PublicInbox::Import; use PublicInbox::MID qw(mids references); -use PublicInbox::ContentId qw(content_id content_digest); +use PublicInbox::ContentHash qw(content_hash content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; use PublicInbox::Msgmap; @@ -353,23 +353,23 @@ sub _replace_oids ($$$) { $rewrites; } -sub content_ids ($) { +sub content_hashes ($) { my ($mime) = @_; - my @cids = ( content_id($mime) ); + my @chashes = ( content_hash($mime) ); # We still support Email::MIME, here, and # Email::MIME->as_string doesn't always round-trip, so we may - # use a second content_id - my $rt = content_id(PublicInbox::Eml->new(\($mime->as_string))); - push @cids, $rt if $cids[0] ne $rt; - \@cids; + # use a second content_hash + my $rt = content_hash(PublicInbox::Eml->new(\($mime->as_string))); + push @chashes, $rt if $chashes[0] ne $rt; + \@chashes; } sub content_matches ($$) { - my ($cids, $existing) = @_; - my $cid = content_id($existing); - foreach (@$cids) { - return 1 if $_ eq $cid + my ($chashes, $existing) = @_; + my $chash = content_hash($existing); + foreach (@$chashes) { + return 1 if $_ eq $chash } 0 } @@ -386,13 +386,13 @@ sub rewrite_internal ($$;$$$) { $im = $self->importer; } my $over = $self->{over}; - my $cids = content_ids($old_mime); + my $chashes = content_hashes($old_mime); my @removed; my $mids = mids($old_mime->header_obj); # We avoid introducing new blobs into git since the raw content # can be slightly different, so we do not need the user-supplied - # message now that we have the mids and content_id + # message now that we have the mids and content_hash $old_mime = undef; my $mark; @@ -407,7 +407,7 @@ sub rewrite_internal ($$;$$$) { } my $orig = $$msg; my $cur = PublicInbox::Eml->new($msg); - if (content_matches($cids, $cur)) { + if (content_matches($chashes, $cur)) { $gone{$smsg->{num}} = [ $smsg, $cur, \$orig ]; } } @@ -835,7 +835,7 @@ sub get_blob ($$) { sub content_exists ($$$) { my ($self, $mime, $mid) = @_; my $over = $self->{over}; - my $cids = content_ids($mime); + my $chashes = content_hashes($mime); my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -844,7 +844,7 @@ sub content_exists ($$$) { next; } my $cur = PublicInbox::Eml->new($msg); - return 1 if content_matches($cids, $cur); + return 1 if content_matches($chashes, $cur); # XXX DEBUG_DIFF is experimental and may be removed diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; @@ -873,9 +873,9 @@ sub mark_deleted ($$$$) { my $msgref = $git->cat_file($oid); my $mime = PublicInbox::Eml->new($$msgref); my $mids = mids($mime->header_obj); - my $cid = content_id($mime); + my $chash = content_hash($mime); foreach my $mid (@$mids) { - $sync->{D}->{"$mid\0$cid"} = $oid; + $sync->{D}->{"$mid\0$chash"} = $oid; } } @@ -904,11 +904,11 @@ sub reindex_oid_m ($$$$;$) { my $msgref = $git->cat_file($oid, \$len); my $mime = PublicInbox::Eml->new($$msgref); my $mids = mids($mime->header_obj); - my $cid = content_id($mime); + my $chash = content_hash($mime); die "BUG: reindex_oid_m called for <=1 mids" if scalar(@$mids) <= 1; for my $mid (reverse @$mids) { - delete($sync->{D}->{"$mid\0$cid"}) and + delete($sync->{D}->{"$mid\0$chash"}) and die "BUG: reindex_oid should handle <$mid> delete"; } my $over = $self->{over}; @@ -1002,7 +1002,7 @@ sub reindex_oid ($$$$) { return if $len == 0; # purged my $mime = PublicInbox::Eml->new($$msgref); my $mids = mids($mime->header_obj); - my $cid = content_id($mime); + my $chash = content_hash($mime); if (scalar(@$mids) == 0) { warn "E: $oid has no Message-ID, skipping\n"; @@ -1011,7 +1011,7 @@ sub reindex_oid ($$$$) { my $mid = $mids->[0]; # was the file previously marked as deleted?, skip if so - if (delete($sync->{D}->{"$mid\0$cid"})) { + if (delete($sync->{D}->{"$mid\0$chash"})) { if (!$sync->{reindex}) { $num = $sync->{regen}--; $self->{mm}->num_highwater($num); @@ -1036,7 +1036,7 @@ sub reindex_oid ($$$$) { } else { # multiple MIDs are a weird case: my $del = 0; for (@$mids) { - $del += delete($sync->{D}->{"$_\0$cid"}) // 0; + $del += delete($sync->{D}->{"$_\0$chash"}) // 0; } if ($del) { unindex_oid_remote($self, $oid, $_) for @$mids; @@ -1309,7 +1309,7 @@ sub index_sync { return unless defined $latest; $self->idx_init($opt); # acquire lock my $sync = { - D => {}, # "$mid\0$cid" => $oid + D => {}, # "$mid\0$chash" => $oid unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, -opt => $opt diff --git a/script/public-inbox-edit b/script/public-inbox-edit index e895a228386..d8e511b2ee4 100755 --- a/script/public-inbox-edit +++ b/script/public-inbox-edit @@ -9,7 +9,7 @@ use warnings; use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); use PublicInbox::AdminEdit; use File::Temp 0.19 (); # 0.19 for TMPDIR -use PublicInbox::ContentId qw(content_id); +use PublicInbox::ContentHash qw(content_hash); use PublicInbox::MID qw(mid_clean mids); PublicInbox::Admin::check_require('-index'); use PublicInbox::Eml; @@ -43,7 +43,7 @@ if (defined $mid && defined $file) { my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); PublicInbox::AdminEdit::check_editable(\@ibxs); -my $found = {}; # cid => [ [ibx, smsg] [, [ibx, smsg] ] ] +my $found = {}; # chash => [ [ibx, smsg] [, [ibx, smsg] ] ] sub find_mid ($$$) { my ($found, $mid, $ibxs) = @_; @@ -53,9 +53,9 @@ sub find_mid ($$$) { while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $ref = $ibx->msg_by_smsg($smsg); my $mime = PublicInbox::Eml->new($ref); - my $cid = content_id($mime); + my $chash = content_hash($mime); my $tuple = [ $ibx, $smsg ]; - push @{$found->{$cid} ||= []}, $tuple + push @{$found->{$chash} ||= []}, $tuple } PublicInbox::InboxWritable::cleanup($ibx); } @@ -96,8 +96,8 @@ Multiple messages with different content found matching die "open($file) failed: $!"; my $mids = mids($mime->header_obj); find_mid($found, $_, \@ibxs) for (@$mids); # populates $found - my $cid = content_id($mime); - my $to_edit = $found->{$cid}; + my $chash = content_hash($mime); + my $to_edit = $found->{$chash}; unless ($to_edit) { my $nr = scalar(keys %$found); if ($nr > 0) { @@ -115,7 +115,7 @@ $mids } exit 1; } - $found = { $cid => $to_edit }; + $found = { $chash => $to_edit }; } my %tmpopt = ( @@ -218,7 +218,7 @@ W: possible message boundary splitting error my $nhdr = $new_mime->header_obj; my $ohdr = $old_mime->header_obj; if (($nhdr->as_string eq $ohdr->as_string) && - (content_id($new_mime) eq content_id($old_mime))) { + (content_hash($new_mime) eq content_hash($old_mime))) { warn "No change detected to:\n", show_cmd($ibx, $smsg); next unless $opt->{verbose}; diff --git a/t/content_id.t b/t/content_hash.t similarity index 64% rename from t/content_id.t rename to t/content_hash.t index 9df81aa8293..646aab07c9a 100644 --- a/t/content_id.t +++ b/t/content_hash.t @@ -3,7 +3,7 @@ use strict; use warnings; use Test::More; -use PublicInbox::ContentId qw(content_id); +use PublicInbox::ContentHash qw(content_hash); use PublicInbox::Eml; my $mime = PublicInbox::Eml->new(<<'EOF'); @@ -16,17 +16,17 @@ Date: Fri, 02 Oct 1993 00:00:00 +0000 hello world EOF -my $orig = content_id($mime); -my $reload = content_id(PublicInbox::Eml->new($mime->as_string)); -is($orig, $reload, 'content_id matches after serialization'); +my $orig = content_hash($mime); +my $reload = content_hash(PublicInbox::Eml->new($mime->as_string)); +is($orig, $reload, 'content_hash matches after serialization'); foreach my $h (qw(From To Cc)) { my $n = q("Quoted N'Ame" ); $mime->header_set($h, "$n"); - my $q = content_id($mime); - is($mime->header($h), $n, "content_id does not mutate $h:"); + my $q = content_hash($mime); + is($mime->header($h), $n, "content_hash does not mutate $h:"); $mime->header_set($h, 'Quoted N\'Ame '); - my $nq = content_id($mime); + my $nq = content_hash($mime); is($nq, $q, "quotes ignored in $h:"); } diff --git a/t/v1reindex.t b/t/v1reindex.t index 13605f8bd6c..9f23ef01e56 100644 --- a/t/v1reindex.t +++ b/t/v1reindex.t @@ -3,7 +3,7 @@ use strict; use warnings; use Test::More; -use PublicInbox::ContentId qw(content_digest); +use PublicInbox::ContentHash qw(content_digest); use File::Path qw(remove_tree); use PublicInbox::TestCommon; use PublicInbox::Eml; diff --git a/t/v2reindex.t b/t/v2reindex.t index f16a0b0d81c..b99106d0fe7 100644 --- a/t/v2reindex.t +++ b/t/v2reindex.t @@ -4,7 +4,7 @@ use strict; use warnings; use Test::More; use PublicInbox::Eml; -use PublicInbox::ContentId qw(content_digest); +use PublicInbox::ContentHash qw(content_digest); use File::Path qw(remove_tree); use PublicInbox::TestCommon; require_git(2.6); diff --git a/t/v2writable.t b/t/v2writable.t index e5a565cea23..fa5c786e151 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -4,7 +4,7 @@ use strict; use warnings; use Test::More; use PublicInbox::Eml; -use PublicInbox::ContentId qw(content_digest content_id); +use PublicInbox::ContentHash qw(content_digest content_hash); use PublicInbox::TestCommon; use Cwd qw(abs_path); require_git(2.6); @@ -215,7 +215,7 @@ EOF $im = PublicInbox::V2Writable->new($ibx, {nproc => 2}); is($im->{shards}, 1, 'detected single shard from previous'); my ($mark, $rm_mime, $smsg) = $im->remove($mime, 'test removal'); - is(content_id($rm_mime), content_id($mime), + is(content_hash($rm_mime), content_hash($mime), 'removed object returned matches'); ok(defined($mark), 'mark set'); $im->done;