* [PATCH 4/7] searchidxshard: replace index_raw with index_eml
2021-01-03 2:06 7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
@ 2021-01-03 2:06 6% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2021-01-03 2:06 UTC (permalink / raw)
To: meta
Since Storable and Sereal are designed for lossless
serialization, we'll just pass $eml objects to whatever process
is running SearchIdx.
---
lib/PublicInbox/ExtSearchIdx.pm | 4 ++--
lib/PublicInbox/LeiStore.pm | 3 ++-
lib/PublicInbox/SearchIdxShard.pm | 9 ++-------
lib/PublicInbox/V2Writable.pm | 11 +++++------
4 files changed, 11 insertions(+), 16 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 064d9939..d55d3db9 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -135,7 +135,7 @@ sub index_unseen ($) {
my $oid = $new_smsg->{blob};
my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset';
$self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key);
- $idx->index_raw(undef, $eml, $new_smsg, $ibx->eidx_key);
+ $idx->index_eml($eml, $new_smsg, $ibx->eidx_key);
check_batch_limit($req);
}
@@ -437,7 +437,7 @@ sub _reindex_finalize ($$$) {
my $top_smsg = pop @$stable;
$top_smsg == $smsg or die 'BUG: top_smsg != smsg';
my $ibx = _ibx_for($self, $sync, $smsg);
- $idx->index_raw(undef, $eml, $smsg, $ibx->eidx_key);
+ $idx->index_eml($eml, $smsg, $ibx->eidx_key);
for my $x (reverse @$stable) {
$ibx = _ibx_for($self, $sync, $x);
my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}';
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index d686e95a..4f77e8fa 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -199,6 +199,7 @@ sub add_eml {
$im->add($eml, undef, $smsg) or return; # duplicate returns undef
my $msgref = delete $smsg->{-raw_email};
$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
+ undef $msgref;
local $self->{current_info} = $smsg->{blob};
if (my @docids = _docids_for($self, $eml)) {
@@ -215,7 +216,7 @@ sub add_eml {
$oidx->add_overview($eml, $smsg);
$oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.');
my $idx = $eidx->idx_shard($smsg->{num});
- $idx->index_raw($msgref, $eml, $smsg);
+ $idx->index_eml($eml, $smsg);
$idx->ipc_do('add_keywords', $smsg->{num}, @kw) if @kw;
$smsg;
}
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 43dad959..83cbbb25 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -43,13 +43,8 @@ sub ipc_atfork_child { # called automatically before ipc_worker_loop
PublicInbox::OnDestroy->new($$, \&_worker_done, $self);
}
-sub index_raw {
- my ($self, $msgref, $eml, $smsg, $eidx_key) = @_;
- if ($eml) {
- undef($$msgref) if $msgref;
- } else { # --xapian-only + --sequential-shard:
- $eml = PublicInbox::Eml->new($msgref);
- }
+sub index_eml {
+ my ($self, $eml, $smsg, $eidx_key) = @_;
$smsg->{eidx_key} = $eidx_key if defined $eidx_key;
$self->ipc_do('add_message', $eml, $smsg);
}
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 885edbe9..7b6b93a0 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -140,11 +140,11 @@ sub idx_shard ($$) {
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
- my ($self, $msgref, $mime, $smsg) = @_;
+ my ($self, $msgref, $eml, $smsg) = @_;
$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
- $self->{oidx}->add_overview($mime, $smsg);
+ $self->{oidx}->add_overview($eml, $smsg);
my $idx = idx_shard($self, $smsg->{num});
- $idx->index_raw($msgref, $mime, $smsg);
+ $idx->index_eml($eml, $smsg);
my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
$n >= $self->{batch_bytes};
}
@@ -173,8 +173,7 @@ sub _add {
$cmt = $im->get_mark($cmt);
$self->{last_commit}->[$self->{epoch_max}] = $cmt;
- my $msgref = delete $smsg->{-raw_email};
- if (do_idx($self, $msgref, $mime, $smsg)) {
+ if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) {
$self->checkpoint;
}
@@ -1219,7 +1218,7 @@ sub index_xap_only { # git->cat_async callback
my $self = $smsg->{self};
my $idx = idx_shard($self, $smsg->{num});
$smsg->{raw_bytes} = $size;
- $idx->index_raw($bref, undef, $smsg);
+ $idx->index_eml(PublicInbox::Eml->new($bref), $smsg);
$self->{transact_bytes} += $size;
}
^ permalink raw reply related [relevance 6%]
* [PATCH 0/7] v2: swap in new IPC package
@ 2021-01-03 2:06 7% Eric Wong
2021-01-03 2:06 6% ` [PATCH 4/7] searchidxshard: replace index_raw with index_eml Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2021-01-03 2:06 UTC (permalink / raw)
To: meta
SearchIdxShard was too big and adding the new extindex
stuff made things worse. Since I intend to use IPC
in more places, I figured it'd be good to prove it
works well by dropping it into the old v2 mix.
The diffstat below is nice.
Eric Wong (7):
ipc: some documentation comments
searchidxshard: use PublicInbox::IPC to kill lots of code
searchidxshard: IPC conversion, part 2
searchidxshard: replace index_raw with index_eml
use Eml (or MIME) objects for all indexing paths
ipc: switch to one-way pipes
searchidxshard: use add_xapian directly for v2
lib/PublicInbox/ExtSearchIdx.pm | 38 +++--
lib/PublicInbox/IPC.pm | 127 +++++++++------
lib/PublicInbox/Import.pm | 4 +-
lib/PublicInbox/LeiStore.pm | 18 +--
lib/PublicInbox/SearchIdx.pm | 35 +---
lib/PublicInbox/SearchIdxShard.pm | 257 +++++-------------------------
lib/PublicInbox/Smsg.pm | 13 ++
lib/PublicInbox/V2Writable.pm | 102 +++++-------
t/import.t | 12 +-
t/search.t | 2 +-
10 files changed, 206 insertions(+), 402 deletions(-)
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2021-01-03 2:06 7% [PATCH 0/7] v2: swap in new IPC package Eric Wong
2021-01-03 2:06 6% ` [PATCH 4/7] searchidxshard: replace index_raw with index_eml Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).