From 71461c67fee940b05309baa8c67bac10c8c51ac6 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 3 Jan 2021 02:06:15 +0000 Subject: use Eml (or MIME) objects for all indexing paths We don't need to be keeping the raw message around after it hits git. Shard work now relies on Storable (or Sereal) and all of the indexing code relies on the Email::MIME-like API of Eml to access interesting parts of the message. Similarly, smsg->{raw_bytes} is no longer carried around and we do the CRLF adjustment when setting smsg->{bytes}. There's also a small simplification to t/import.t while we're in the area to use xqx instead of spawn/popen_rd. --- lib/PublicInbox/ExtSearchIdx.pm | 11 ++++------- lib/PublicInbox/Import.pm | 4 +--- lib/PublicInbox/LeiStore.pm | 4 ---- lib/PublicInbox/SearchIdx.pm | 17 +++-------------- lib/PublicInbox/Smsg.pm | 13 +++++++++++++ lib/PublicInbox/V2Writable.pm | 23 ++++++++++------------- t/import.t | 12 ++---------- t/search.t | 2 +- 8 files changed, 34 insertions(+), 52 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index d55d3db9..e6c21866 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -21,8 +21,7 @@ use Carp qw(croak carp); use Sys::Hostname qw(hostname); use POSIX qw(strftime); use PublicInbox::Search; -use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor - is_bad_blob); +use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); use PublicInbox::OverIdx; use PublicInbox::MiscIdx; use PublicInbox::MID qw(mids); @@ -82,8 +81,6 @@ sub check_batch_limit ($) { my ($req) = @_; my $self = $req->{self}; my $new_smsg = $req->{new_smsg}; - - # {raw_bytes} may be unset, so just use {bytes} my $n = $self->{transact_bytes} += $new_smsg->{bytes}; # set flag for PublicInbox::V2Writable::index_todo: @@ -239,7 +236,7 @@ sub index_oid { # git->cat_async callback for 'm' my $new_smsg = $req->{new_smsg} = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return; ++${$req->{nr}}; do_step($req); @@ -496,7 +493,7 @@ sub _reindex_oid { # git->cat_async callback my $ci = $self->{current_info}; local $self->{current_info} = "$ci #$docid $oid"; my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; - $re_smsg->{bytes} = $size + crlf_adjust($$bref); + $re_smsg->set_bytes($$bref, $size); my $eml = PublicInbox::Eml->new($bref); $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, cotime => $orig_smsg->{ts} }); @@ -676,7 +673,7 @@ sub _reindex_unseen { # git->cat_async callback my $self = $req->{self} // die 'BUG: {self} unset'; local $self->{current_info} = "$self->{current_info} $oid"; my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); my $eml = $req->{eml} = PublicInbox::Eml->new($bref); $req->{new_smsg} = $new_smsg; $req->{chash} = content_hash($eml); diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 052b145b..8a06a661 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -412,12 +412,10 @@ sub add { # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{raw_bytes} = $n; + $smsg->set_bytes($raw_email, $n); if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore return if $oidx->blob_exists($smsg->{blob}); } - # XXX do we need this? it's in git at this point - $smsg->{-raw_email} = \$raw_email; } my $ref = $self->{ref}; my $commit = $self->{mark}++; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 4f77e8fa..7cda7e44 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -10,7 +10,6 @@ package PublicInbox::LeiStore; use strict; use v5.10.1; use parent qw(PublicInbox::Lock PublicInbox::IPC); -use PublicInbox::SearchIdx qw(crlf_adjust); use PublicInbox::ExtSearchIdx; use PublicInbox::Import; use PublicInbox::InboxWritable; @@ -197,9 +196,6 @@ sub add_eml { my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; my $im = $self->importer; $im->add($eml, undef, $smsg) or return; # duplicate returns undef - my $msgref = delete $smsg->{-raw_email}; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); - undef $msgref; local $self->{current_info} = $smsg->{blob}; if (my @docids = _docids_for($self, $eml)) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index da3ac2e3..a7005051 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,7 +22,7 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack +our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; our ($DB_CREATE_OR_OPEN, $DB_OPEN); @@ -613,17 +613,6 @@ sub index_mm { } } -# returns the number of bytes to add if given a non-CRLF arg -sub crlf_adjust ($) { - if (index($_[0], "\r\n") < 0) { - # common case is LF-only, every \n needs an \r; - # so favor a cheap tr// over an expensive m//g - $_[0] =~ tr/\n/\n/; - } else { # count number of '\n' w/o '\r', expensive: - scalar(my @n = ($_[0] =~ m/(?cat_async callback my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; - $size += crlf_adjust($$bref); - my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $smsg->set_bytes($$bref, $size); my $self = $sync->{sidx}; local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 571cbb6f..c6ff7f52 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -135,4 +135,17 @@ sub subject_normalized ($) { $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?{bytes} = $_[2] + crlf_adjust($_[1]) } + 1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 7b6b93a0..c4efbdd2 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -17,8 +17,7 @@ use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd run_die); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size - is_bad_blob); +use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush use File::Temp (); @@ -139,13 +138,12 @@ sub idx_shard ($$) { } # indexes a message, returns true if checkpointing is needed -sub do_idx ($$$$) { - my ($self, $msgref, $eml, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); +sub do_idx ($$$) { + my ($self, $eml, $smsg) = @_; $self->{oidx}->add_overview($eml, $smsg); my $idx = idx_shard($self, $smsg->{num}); $idx->index_eml($eml, $smsg); - my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; + my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= $self->{batch_bytes}; } @@ -173,7 +171,7 @@ sub _add { $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) { + if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } @@ -536,13 +534,13 @@ W: $list for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; $new_smsg->populate($new_mime, $sync); - do_idx($self, \$raw, $new_mime, $new_smsg); + $new_smsg->set_bytes($raw, $bytes); + do_idx($self, $new_mime, $new_smsg); } $rewritten->{rewrites}; } @@ -937,13 +935,13 @@ sub index_oid { # cat_async callback } ++${$arg->{nr}}; my $smsg = bless { - raw_bytes => $size, num => $num, blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; $smsg->populate($eml, $arg); - if (do_idx($self, $bref, $eml, $smsg)) { + $smsg->set_bytes($$bref, $size); + if (do_idx($self, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } index_finalize($arg, 1); @@ -1217,9 +1215,8 @@ sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{self}; my $idx = idx_shard($self, $smsg->{num}); - $smsg->{raw_bytes} = $size; $idx->index_eml(PublicInbox::Eml->new($bref), $smsg); - $self->{transact_bytes} += $size; + $self->{transact_bytes} += $smsg->{bytes}; } sub index_xap_step ($$$;$) { diff --git a/t/import.t b/t/import.t index 855b5d7c..ae76858b 100644 --- a/t/import.t +++ b/t/import.t @@ -7,7 +7,6 @@ use PublicInbox::Eml; use PublicInbox::Smsg; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::Spawn qw(spawn); use Fcntl qw(:DEFAULT SEEK_SET); use PublicInbox::TestCommon; use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2 @@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) { like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id'); - my $raw_email = $smsg->{-raw_email}; - is($mime->as_string, $$raw_email, 'string matches'); - is($smsg->{raw_bytes}, length($$raw_email), 'length matches'); my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); open my $in, '+<', undef or BAIL_OUT "open(+<): $!"; print $in $mime->as_string or die "write failed: $!"; $in->flush or die "flush failed: $!"; - seek($in, 0, SEEK_SET); - open my $out, '+<', undef or BAIL_OUT "open(+<): $!"; - my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out }); - is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object'); + seek($in, 0, SEEK_SET) or die "seek: $!"; + chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in })); is($?, 0, 'hash-object'); - seek($out, 0, SEEK_SET); - chomp(my $hashed_obj = <$out>); is($hashed_obj, $smsg->{blob}, "blob object_id matches exp"); } diff --git a/t/search.t b/t/search.t index 7495233e..b2958c00 100644 --- a/t/search.t +++ b/t/search.t @@ -60,7 +60,7 @@ sub oct_is ($$$) { } { - my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust; + my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust; is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed'); is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR'); is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR'); -- cgit v1.2.3-24-ge0c7