From e98c3f01267c810ee214be87d0ee1bd575b23b88 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 11 Apr 2021 05:32:55 +0000 Subject: www: do not obfuscate addresses in URLs As they are likely Message-IDs. If an email address ends up in a URL, then it's likely public, so there's even less reason to obfuscate that particular address. [km: add xt/perf-obfuscate.t] [ew: modernize perf test (5.10.1), use diag instead of print] This version of the patch avoids the massive slowdown noted by Kyle in . Performance remains roughly the same, if not slightly faster (which may be due to me testing this on a busy server). Results from xt/perf-obfuscate.t against 6078 messages on a local mirror of : before: 6.67 usr + 0.04 sys = 6.71 CPU after: 6.64 usr + 0.04 sys = 6.68 CPU Reported-by: Kyle Meyer Helped-by: Kyle Meyer Link: https://public-inbox.org/meta/87a6q8p5qa.fsf@kyleam.com/ --- MANIFEST | 1 + lib/PublicInbox/Hval.pm | 21 ++++++++++------ t/hval.t | 4 ++++ xt/perf-obfuscate.t | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 xt/perf-obfuscate.t diff --git a/MANIFEST b/MANIFEST index b663c2a2..12247ad2 100644 --- a/MANIFEST +++ b/MANIFEST @@ -504,6 +504,7 @@ xt/net_writer-imap.t xt/nntpd-validate.t xt/perf-msgview.t xt/perf-nntpd.t +xt/perf-obfuscate.t xt/perf-threading.t xt/solver.t xt/stress-sharedkv.t diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index d20f70ae..eab4738e 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -82,15 +82,22 @@ sub obfuscate_addrs ($$;$) { my $repl = $_[2] // '•'; my $re = $ibx->{-no_obfuscate_re}; # regex of domains my $addrs = $ibx->{-no_obfuscate}; # { $address => 1 } - $_[1] =~ s/(([\w\.\+=\-]+)\@([\w\-]+\.[\w\.\-]+))/ - my ($addr, $user, $domain) = ($1, $2, $3); - if ($addrs->{$addr} || ((defined $re && $domain =~ $re))) { - $addr; + $_[1] =~ s#(\S+)\@([\w\-]+\.[\w\.\-]+)# + my ($pfx, $domain) = ($1, $2); + if (index($pfx, '://') > 0 || $pfx !~ s/([\w\.\+=\-]+)\z//) { + "$pfx\@$domain"; } else { - $domain =~ s!([^\.]+)\.!$1$repl!; - $user . '@' . $domain + my $user = $1; + my $addr = "$user\@$domain"; + if ($addrs->{$addr} || ((defined($re) && + $domain =~ $re))) { + $pfx.$addr; + } else { + $domain =~ s!([^\.]+)\.!$1$repl!; + $pfx . $user . '@' . $domain + } } - /sge; + #sge; } # like format_sanitized_subject in git.git pretty.c with '%f' format string diff --git a/t/hval.t b/t/hval.t index 9d0dab7a..5afc2052 100644 --- a/t/hval.t +++ b/t/hval.t @@ -47,6 +47,10 @@ EOF is($html, $exp, 'only obfuscated relevant addresses'); +$exp = 'https://example.net/foo@example.net'; +PublicInbox::Hval::obfuscate_addrs($ibx, my $res = $exp); +is($res, $exp, 'does not obfuscate URL with Message-ID'); + is(PublicInbox::Hval::to_filename('foo bar '), 'foo-bar', 'to_filename has no trailing -'); diff --git a/xt/perf-obfuscate.t b/xt/perf-obfuscate.t new file mode 100644 index 00000000..d4e7fb99 --- /dev/null +++ b/xt/perf-obfuscate.t @@ -0,0 +1,64 @@ +#!perl -w +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ +use strict; +use v5.10.1; +use PublicInbox::TestCommon; +use Benchmark qw(:all); +use PublicInbox::Inbox; +use PublicInbox::View; + +my $inboxdir = $ENV{GIANT_INBOX_DIR}; +plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir; + +my $obfuscate = $ENV{PI_OBFUSCATE} ? 1 : 0; +diag "obfuscate=$obfuscate\n"; + +my @cat = qw(cat-file --buffer --batch-check --batch-all-objects); +if (require_git(2.19, 1)) { + push @cat, '--unordered'; +} else { + warn +"git <2.19, cat-file lacks --unordered, locality suffers\n"; +} +require_mods qw(Plack::Util); +use_ok 'Plack::Util'; +my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'name' , + obfuscate => $obfuscate}); +my $git = $ibx->git; +my $fh = $git->popen(@cat); +my $vec = ''; +vec($vec, fileno($fh), 1) = 1; +select($vec, undef, undef, 60) or die "timed out waiting for --batch-check"; + +my $ctx = { + env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' }, + ibx => $ibx, + www => Plack::Util::inline_object(style => sub {''}), +}; +my ($mime, $res, $oid, $type); +my $n = 0; +my $obuf = ''; +my $m = 0; + +my $cb = sub { + $mime = PublicInbox::Eml->new(shift); + PublicInbox::View::multipart_text_as_html($mime, $ctx); + ++$m; + $obuf = ''; +}; + +my $t = timeit(1, sub { + $ctx->{obuf} = \$obuf; + $ctx->{mhref} = '../'; + while (<$fh>) { + ($oid, $type) = split / /; + next if $type ne 'blob'; + ++$n; + $git->cat_async($oid, $cb); + } + $git->cat_async_wait; +}); +diag 'multipart_text_as_html took '.timestr($t)." for $n <=> $m messages"; +is($m, $n, 'rendered all messages'); +done_testing(); -- cgit v1.2.3-24-ge0c7