public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob dddde092f2581ea1d54b1749804642dbccf89a48 3307 bytes (raw)
$ git show v1.4.0:lib/PublicInbox/MID.pm	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
 
# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
#
# Various Message-ID-related functions.
package PublicInbox::MID;
use strict;
use warnings;
use base qw/Exporter/;
our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_mime mid_escape MID_ESC
	mids references mids_for_index $MID_EXTRACT);
use URI::Escape qw(uri_escape_utf8);
use Digest::SHA qw/sha1_hex/;
require PublicInbox::Address;
use constant {
	MID_MAX => 40, # SHA-1 hex length # TODO: get rid of this
	MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q')
};

our $MID_EXTRACT = qr/<([^>]+)>/s;

sub mid_clean {
	my ($mid) = @_;
	defined($mid) or die "no Message-ID";
	# MDA->precheck did more checking for us
	if ($mid =~ $MID_EXTRACT) {
		$mid = $1;
	}
	$mid;
}

# this is idempotent, used for HTML anchor/ids and such
sub id_compress {
	my ($id, $force) = @_;

	if ($force || $id =~ /[^a-zA-Z0-9_\-]/ || length($id) > MID_MAX) {
		utf8::encode($id);
		return sha1_hex($id);
	}
	$id;
}

sub mid2path {
	my ($mid) = @_;
	my ($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);

	unless (defined $x38) {
		# compatibility with old links (or short Message-IDs :)
		$mid = mid_clean($mid);
		utf8::encode($mid);
		$mid = sha1_hex($mid);
		($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);
	}
	"$x2/$x38";
}

# Only for v1 code paths:
sub mid_mime ($) { mids($_[0]->header_obj)->[0] }

# only intended for Message-ID and X-Alt-Message-ID
sub extract_mids {
	my @mids;
	for my $v (@_) {
		my @cur = ($v =~ /$MID_EXTRACT/g);
		if (@cur) {
			push(@mids, @cur);
		} else {
			push(@mids, $v);
		}
	}
	\@mids;
}

sub mids ($) {
	my ($hdr) = @_;
	my @mids = $hdr->header_raw('Message-Id');
	uniq_mids(extract_mids(@mids));
}

# we allow searching on X-Alt-Message-ID since PublicInbox::NNTP uses them
# to placate some clients, and we want to ensure NNTP-only clients can
# import and index without relying on HTTP endpoints
sub mids_for_index ($) {
	my ($hdr) = @_;
	my @mids = $hdr->header_raw('Message-Id');
	my @alts = $hdr->header_raw('X-Alt-Message-ID');
	uniq_mids(extract_mids(@mids, @alts));
}

# last References should be IRT, but some mail clients do things
# out of order, so trust IRT over References iff IRT exists
sub references ($) {
	my ($hdr) = @_;
	my @mids;
	foreach my $f (qw(References In-Reply-To)) {
		my @v = $hdr->header_raw($f);
		foreach my $v (@v) {
			push(@mids, ($v =~ /$MID_EXTRACT/g));
		}
	}

	# old versions of git-send-email would prompt users for
	# In-Reply-To and users' muscle memory would use 'y' or 'n'
	# as responses:
	my %addr = ( y => 1, n => 1 );

	foreach my $f (qw(To From Cc)) {
		my @v = $hdr->header_raw($f);
		foreach my $v (@v) {
			$addr{$_} = 1 for (PublicInbox::Address::emails($v));
		}
	}
	uniq_mids(\@mids, \%addr);
}

sub uniq_mids ($;$) {
	my ($mids, $seen) = @_;
	my @ret;
	$seen ||= {};
	foreach my $mid (@$mids) {
		$mid =~ tr/\n\t\r//d;
		if (length($mid) > MAX_MID_SIZE) {
			warn "Message-ID: <$mid> too long, truncating\n";
			$mid = substr($mid, 0, MAX_MID_SIZE);
		}
		push(@ret, $mid) unless $seen->{$mid}++;
	}
	\@ret;
}

# RFC3986, section 3.3:
sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' }
sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) }

1;

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git