user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 4/7] searchidxshard: replace index_raw with index_eml
  2021-01-03  2:06  7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
@ 2021-01-03  2:06  6% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2021-01-03  2:06 UTC (permalink / raw)
  To: meta

Since Storable and Sereal are designed for lossless
serialization, we'll just pass $eml objects to whatever process
is running SearchIdx.
---
 lib/PublicInbox/ExtSearchIdx.pm   |  4 ++--
 lib/PublicInbox/LeiStore.pm       |  3 ++-
 lib/PublicInbox/SearchIdxShard.pm |  9 ++-------
 lib/PublicInbox/V2Writable.pm     | 11 +++++------
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 064d9939..d55d3db9 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -135,7 +135,7 @@ sub index_unseen ($) {
 	my $oid = $new_smsg->{blob};
 	my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset';
 	$self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key);
-	$idx->index_raw(undef, $eml, $new_smsg, $ibx->eidx_key);
+	$idx->index_eml($eml, $new_smsg, $ibx->eidx_key);
 	check_batch_limit($req);
 }
 
@@ -437,7 +437,7 @@ sub _reindex_finalize ($$$) {
 	my $top_smsg = pop @$stable;
 	$top_smsg == $smsg or die 'BUG: top_smsg != smsg';
 	my $ibx = _ibx_for($self, $sync, $smsg);
-	$idx->index_raw(undef, $eml, $smsg, $ibx->eidx_key);
+	$idx->index_eml($eml, $smsg, $ibx->eidx_key);
 	for my $x (reverse @$stable) {
 		$ibx = _ibx_for($self, $sync, $x);
 		my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}';
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index d686e95a..4f77e8fa 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -199,6 +199,7 @@ sub add_eml {
 	$im->add($eml, undef, $smsg) or return; # duplicate returns undef
 	my $msgref = delete $smsg->{-raw_email};
 	$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
+	undef $msgref;
 
 	local $self->{current_info} = $smsg->{blob};
 	if (my @docids = _docids_for($self, $eml)) {
@@ -215,7 +216,7 @@ sub add_eml {
 		$oidx->add_overview($eml, $smsg);
 		$oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.');
 		my $idx = $eidx->idx_shard($smsg->{num});
-		$idx->index_raw($msgref, $eml, $smsg);
+		$idx->index_eml($eml, $smsg);
 		$idx->ipc_do('add_keywords', $smsg->{num}, @kw) if @kw;
 		$smsg;
 	}
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 43dad959..83cbbb25 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -43,13 +43,8 @@ sub ipc_atfork_child { # called automatically before ipc_worker_loop
 	PublicInbox::OnDestroy->new($$, \&_worker_done, $self);
 }
 
-sub index_raw {
-	my ($self, $msgref, $eml, $smsg, $eidx_key) = @_;
-	if ($eml) {
-		undef($$msgref) if $msgref;
-	} else { # --xapian-only + --sequential-shard:
-		$eml = PublicInbox::Eml->new($msgref);
-	}
+sub index_eml {
+	my ($self, $eml, $smsg, $eidx_key) = @_;
 	$smsg->{eidx_key} = $eidx_key if defined $eidx_key;
 	$self->ipc_do('add_message', $eml, $smsg);
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 885edbe9..7b6b93a0 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -140,11 +140,11 @@ sub idx_shard ($$) {
 
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
-	my ($self, $msgref, $mime, $smsg) = @_;
+	my ($self, $msgref, $eml, $smsg) = @_;
 	$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
-	$self->{oidx}->add_overview($mime, $smsg);
+	$self->{oidx}->add_overview($eml, $smsg);
 	my $idx = idx_shard($self, $smsg->{num});
-	$idx->index_raw($msgref, $mime, $smsg);
+	$idx->index_eml($eml, $smsg);
 	my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
 	$n >= $self->{batch_bytes};
 }
@@ -173,8 +173,7 @@ sub _add {
 	$cmt = $im->get_mark($cmt);
 	$self->{last_commit}->[$self->{epoch_max}] = $cmt;
 
-	my $msgref = delete $smsg->{-raw_email};
-	if (do_idx($self, $msgref, $mime, $smsg)) {
+	if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) {
 		$self->checkpoint;
 	}
 
@@ -1219,7 +1218,7 @@ sub index_xap_only { # git->cat_async callback
 	my $self = $smsg->{self};
 	my $idx = idx_shard($self, $smsg->{num});
 	$smsg->{raw_bytes} = $size;
-	$idx->index_raw($bref, undef, $smsg);
+	$idx->index_eml(PublicInbox::Eml->new($bref), $smsg);
 	$self->{transact_bytes} += $size;
 }
 

^ permalink raw reply related	[relevance 6%]

* [PATCH 0/7] v2: swap in new IPC package
@ 2021-01-03  2:06  7% Eric Wong
  2021-01-03  2:06  6% ` [PATCH 4/7] searchidxshard: replace index_raw with index_eml Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2021-01-03  2:06 UTC (permalink / raw)
  To: meta

SearchIdxShard was too big and adding the new extindex
stuff made things worse.  Since I intend to use IPC
in more places, I figured it'd be good to prove it
works well by dropping it into the old v2 mix.

The diffstat below is nice:

Eric Wong (7):
  ipc: some documentation comments
  searchidxshard: use PublicInbox::IPC to kill lots of code
  searchidxshard: IPC conversion, part 2
  searchidxshard: replace index_raw with index_eml
  use Eml (or MIME) objects for all indexing paths
  ipc: switch to one-way pipes
  searchidxshard: use add_xapian directly for v2

 lib/PublicInbox/ExtSearchIdx.pm   |  38 +++--
 lib/PublicInbox/IPC.pm            | 127 +++++++++------
 lib/PublicInbox/Import.pm         |   4 +-
 lib/PublicInbox/LeiStore.pm       |  18 +--
 lib/PublicInbox/SearchIdx.pm      |  35 +---
 lib/PublicInbox/SearchIdxShard.pm | 257 +++++-------------------------
 lib/PublicInbox/Smsg.pm           |  13 ++
 lib/PublicInbox/V2Writable.pm     | 102 +++++-------
 t/import.t                        |  12 +-
 t/search.t                        |   2 +-
 10 files changed, 206 insertions(+), 402 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2021-01-03  2:06  7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
2021-01-03  2:06  6% ` [PATCH 4/7] searchidxshard: replace index_raw with index_eml Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).