user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 5/7] use Eml (or MIME) objects for all indexing paths
  2021-01-03  2:06  7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
@ 2021-01-03  2:06  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2021-01-03  2:06 UTC (permalink / raw)
  To: meta

We don't need to be keeping the raw message around after it hits
git.  Shard work now relies on Storable (or Sereal) and all of
the indexing code relies on the Email::MIME-like API of Eml to
access interesting parts of the message.

Similarly, smsg->{raw_bytes} is no longer carried around and we
do the CRLF adjustment when setting smsg->{bytes}.

There's also a small simplification to t/import.t while
we're in the area to use xqx instead of spawn/popen_rd.
---
 lib/PublicInbox/ExtSearchIdx.pm | 11 ++++-------
 lib/PublicInbox/Import.pm       |  4 +---
 lib/PublicInbox/LeiStore.pm     |  4 ----
 lib/PublicInbox/SearchIdx.pm    | 17 +++--------------
 lib/PublicInbox/Smsg.pm         | 13 +++++++++++++
 lib/PublicInbox/V2Writable.pm   | 23 ++++++++++-------------
 t/import.t                      | 12 ++----------
 t/search.t                      |  2 +-
 8 files changed, 34 insertions(+), 52 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d55d3db9..e6c21866 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,8 +21,7 @@ use Carp qw(croak carp);
 use Sys::Hostname qw(hostname);
 use POSIX qw(strftime);
 use PublicInbox::Search;
-use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor
-	is_bad_blob);
+use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob);
 use PublicInbox::OverIdx;
 use PublicInbox::MiscIdx;
 use PublicInbox::MID qw(mids);
@@ -82,8 +81,6 @@ sub check_batch_limit ($) {
 	my ($req) = @_;
 	my $self = $req->{self};
 	my $new_smsg = $req->{new_smsg};
-
-	# {raw_bytes} may be unset, so just use {bytes}
 	my $n = $self->{transact_bytes} += $new_smsg->{bytes};
 
 	# set flag for PublicInbox::V2Writable::index_todo:
@@ -239,7 +236,7 @@ sub index_oid { # git->cat_async callback for 'm'
 	my $new_smsg = $req->{new_smsg} = bless {
 		blob => $oid,
 	}, 'PublicInbox::Smsg';
-	$new_smsg->{bytes} = $size + crlf_adjust($$bref);
+	$new_smsg->set_bytes($$bref, $size);
 	defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return;
 	++${$req->{nr}};
 	do_step($req);
@@ -496,7 +493,7 @@ sub _reindex_oid { # git->cat_async callback
 	my $ci = $self->{current_info};
 	local $self->{current_info} = "$ci #$docid $oid";
 	my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
-	$re_smsg->{bytes} = $size + crlf_adjust($$bref);
+	$re_smsg->set_bytes($$bref, $size);
 	my $eml = PublicInbox::Eml->new($bref);
 	$re_smsg->populate($eml, { autime => $orig_smsg->{ds},
 				cotime => $orig_smsg->{ts} });
@@ -676,7 +673,7 @@ sub _reindex_unseen { # git->cat_async callback
 	my $self = $req->{self} // die 'BUG: {self} unset';
 	local $self->{current_info} = "$self->{current_info} $oid";
 	my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg';
-	$new_smsg->{bytes} = $size + crlf_adjust($$bref);
+	$new_smsg->set_bytes($$bref, $size);
 	my $eml = $req->{eml} = PublicInbox::Eml->new($bref);
 	$req->{new_smsg} = $new_smsg;
 	$req->{chash} = content_hash($eml);
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 052b145b..8a06a661 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -412,12 +412,10 @@ sub add {
 	# v2: we need this for Xapian
 	if ($smsg) {
 		$smsg->{blob} = $self->get_mark(":$blob");
-		$smsg->{raw_bytes} = $n;
+		$smsg->set_bytes($raw_email, $n);
 		if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore
 			return if $oidx->blob_exists($smsg->{blob});
 		}
-		# XXX do we need this? it's in git at this point
-		$smsg->{-raw_email} = \$raw_email;
 	}
 	my $ref = $self->{ref};
 	my $commit = $self->{mark}++;
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index 4f77e8fa..7cda7e44 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -10,7 +10,6 @@ package PublicInbox::LeiStore;
 use strict;
 use v5.10.1;
 use parent qw(PublicInbox::Lock PublicInbox::IPC);
-use PublicInbox::SearchIdx qw(crlf_adjust);
 use PublicInbox::ExtSearchIdx;
 use PublicInbox::Import;
 use PublicInbox::InboxWritable;
@@ -197,9 +196,6 @@ sub add_eml {
 	my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg';
 	my $im = $self->importer;
 	$im->add($eml, undef, $smsg) or return; # duplicate returns undef
-	my $msgref = delete $smsg->{-raw_email};
-	$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
-	undef $msgref;
 
 	local $self->{current_info} = $smsg->{blob};
 	if (my @docids = _docids_for($self, $eml)) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index da3ac2e3..a7005051 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,7 +22,7 @@ use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn nodatacow_dir);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
+our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
 	index_text term_generator add_val is_bad_blob);
 my $X = \%PublicInbox::Search::X;
 our ($DB_CREATE_OR_OPEN, $DB_OPEN);
@@ -613,17 +613,6 @@ sub index_mm {
 	}
 }
 
-# returns the number of bytes to add if given a non-CRLF arg
-sub crlf_adjust ($) {
-	if (index($_[0], "\r\n") < 0) {
-		# common case is LF-only, every \n needs an \r;
-		# so favor a cheap tr// over an expensive m//g
-		$_[0] =~ tr/\n/\n/;
-	} else { # count number of '\n' w/o '\r', expensive:
-		scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
-	}
-}
-
 sub is_bad_blob ($$$$) {
 	my ($oid, $type, $size, $expect_oid) = @_;
 	if ($type ne 'blob') {
@@ -640,8 +629,8 @@ sub index_both { # git->cat_async callback
 	my ($nr, $max) = @$sync{qw(nr max)};
 	++$$nr;
 	$$max -= $size;
-	$size += crlf_adjust($$bref);
-	my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
+	my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+	$smsg->set_bytes($$bref, $size);
 	my $self = $sync->{sidx};
 	local $self->{current_info} = "$self->{current_info}: $oid";
 	my $eml = PublicInbox::Eml->new($bref);
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 571cbb6f..c6ff7f52 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -135,4 +135,17 @@ sub subject_normalized ($) {
 	$subj;
 }
 
+# returns the number of bytes to add if given a non-CRLF arg
+sub crlf_adjust ($) {
+	if (index($_[0], "\r\n") < 0) {
+		# common case is LF-only, every \n needs an \r;
+		# so favor a cheap tr// over an expensive m//g
+		$_[0] =~ tr/\n/\n/;
+	} else { # count number of '\n' w/o '\r', expensive:
+		scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
+	}
+}
+
+sub set_bytes { $_[0]->{bytes} = $_[2] + crlf_adjust($_[1]) }
+
 1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7b6b93a0..c4efbdd2 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -17,8 +17,7 @@ use PublicInbox::InboxWritable;
 use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd run_die);
-use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size
-	is_bad_blob);
+use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob);
 use IO::Handle; # ->autoflush
 use File::Temp ();
 
@@ -139,13 +138,12 @@ sub idx_shard ($$) {
 }
 
 # indexes a message, returns true if checkpointing is needed
-sub do_idx ($$$$) {
-	my ($self, $msgref, $eml, $smsg) = @_;
-	$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
+sub do_idx ($$$) {
+	my ($self, $eml, $smsg) = @_;
 	$self->{oidx}->add_overview($eml, $smsg);
 	my $idx = idx_shard($self, $smsg->{num});
 	$idx->index_eml($eml, $smsg);
-	my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
+	my $n = $self->{transact_bytes} += $smsg->{bytes};
 	$n >= $self->{batch_bytes};
 }
 
@@ -173,7 +171,7 @@ sub _add {
 	$cmt = $im->get_mark($cmt);
 	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
 
-	if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) {
+	if (do_idx($self, $mime, $smsg)) {
 		$self->checkpoint;
 	}
 
@@ -536,13 +534,13 @@ W: $list
 	for my $smsg (@$need_reindex) {
 		my $new_smsg = bless {
 			blob => $blob,
-			raw_bytes => $bytes,
 			num => $smsg->{num},
 			mid => $smsg->{mid},
 		}, 'PublicInbox::Smsg';
 		my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} };
 		$new_smsg->populate($new_mime, $sync);
-		do_idx($self, \$raw, $new_mime, $new_smsg);
+		$new_smsg->set_bytes($raw, $bytes);
+		do_idx($self, $new_mime, $new_smsg);
 	}
 	$rewritten->{rewrites};
 }
@@ -937,13 +935,13 @@ sub index_oid { # cat_async callback
 	}
 	++${$arg->{nr}};
 	my $smsg = bless {
-		raw_bytes => $size,
 		num => $num,
 		blob => $oid,
 		mid => $mid0,
 	}, 'PublicInbox::Smsg';
 	$smsg->populate($eml, $arg);
-	if (do_idx($self, $bref, $eml, $smsg)) {
+	$smsg->set_bytes($$bref, $size);
+	if (do_idx($self, $eml, $smsg)) {
 		${$arg->{need_checkpoint}} = 1;
 	}
 	index_finalize($arg, 1);
@@ -1217,9 +1215,8 @@ sub index_xap_only { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $smsg) = @_;
 	my $self = $smsg->{self};
 	my $idx = idx_shard($self, $smsg->{num});
-	$smsg->{raw_bytes} = $size;
 	$idx->index_eml(PublicInbox::Eml->new($bref), $smsg);
-	$self->{transact_bytes} += $size;
+	$self->{transact_bytes} += $smsg->{bytes};
 }
 
 sub index_xap_step ($$$;$) {
diff --git a/t/import.t b/t/import.t
index 855b5d7c..ae76858b 100644
--- a/t/import.t
+++ b/t/import.t
@@ -7,7 +7,6 @@ use PublicInbox::Eml;
 use PublicInbox::Smsg;
 use PublicInbox::Git;
 use PublicInbox::Import;
-use PublicInbox::Spawn qw(spawn);
 use Fcntl qw(:DEFAULT SEEK_SET);
 use PublicInbox::TestCommon;
 use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2
@@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
 
 if ($v2) {
 	like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
-	my $raw_email = $smsg->{-raw_email};
-	is($mime->as_string, $$raw_email, 'string matches');
-	is($smsg->{raw_bytes}, length($$raw_email), 'length matches');
 	my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
 	open my $in, '+<', undef or BAIL_OUT "open(+<): $!";
 	print $in $mime->as_string or die "write failed: $!";
 	$in->flush or die "flush failed: $!";
-	seek($in, 0, SEEK_SET);
-	open my $out, '+<', undef or BAIL_OUT "open(+<): $!";
-	my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out });
-	is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object');
+	seek($in, 0, SEEK_SET) or die "seek: $!";
+	chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in }));
 	is($?, 0, 'hash-object');
-	seek($out, 0, SEEK_SET);
-	chomp(my $hashed_obj = <$out>);
 	is($hashed_obj, $smsg->{blob}, "blob object_id matches exp");
 }
 
diff --git a/t/search.t b/t/search.t
index 7495233e..b2958c00 100644
--- a/t/search.t
+++ b/t/search.t
@@ -60,7 +60,7 @@ sub oct_is ($$$) {
 }
 
 {
-	my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust;
+	my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust;
 	is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed');
 	is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR');
 	is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');

^ permalink raw reply related	[relevance 5%]

* [PATCH 0/7] v2: swap in new IPC package
@ 2021-01-03  2:06  7% Eric Wong
  2021-01-03  2:06  5% ` [PATCH 5/7] use Eml (or MIME) objects for all indexing paths Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2021-01-03  2:06 UTC (permalink / raw)
  To: meta

SearchIdxShard was too big and adding the new extindex
stuff made things worse.  Since I intend to use IPC
in more places, I figured it'd be good to prove it with
works well by dropping it into the old v2 mix.

The below diffstat is nice

Eric Wong (7):
  ipc: some documentation comments
  searchidxshard: use PublicInbox::IPC to kill lots of code
  searchidxshard: IPC conversion, part 2
  searchidxshard: replace index_raw with index_eml
  use Eml (or MIME) objects for all indexing paths
  ipc: switch to one-way pipes
  searchidxshard: use add_xapian directly for v2

 lib/PublicInbox/ExtSearchIdx.pm   |  38 +++--
 lib/PublicInbox/IPC.pm            | 127 +++++++++------
 lib/PublicInbox/Import.pm         |   4 +-
 lib/PublicInbox/LeiStore.pm       |  18 +--
 lib/PublicInbox/SearchIdx.pm      |  35 +---
 lib/PublicInbox/SearchIdxShard.pm | 257 +++++-------------------------
 lib/PublicInbox/Smsg.pm           |  13 ++
 lib/PublicInbox/V2Writable.pm     | 102 +++++-------
 t/import.t                        |  12 +-
 t/search.t                        |   2 +-
 10 files changed, 206 insertions(+), 402 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | sort options + mbox downloads above
-- links below jump to the message on this page --
2021-01-03  2:06  7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
2021-01-03  2:06  5% ` [PATCH 5/7] use Eml (or MIME) objects for all indexing paths Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).