* [PATCH 0/6] cindex join stuff
@ 2023-12-08 3:54 7% Eric Wong
2023-12-08 3:54 5% ` [PATCH 1/6] *search: simplify handling of Xapian term iterators Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-12-08 3:54 UTC (permalink / raw)
To: meta
Patches 1-2 are small speedups, 3-4 are dev improvements, and 5-6
ought to actually improve and future-proof join accuracy.
Eric Wong (6):
*search: simplify handling of Xapian term iterators
*search: favor wantarray form of xap_terms
xap_helper_cxx: drop chdir usage in build
makefile: add `check-build' target
xap_helper: support term length limit
cindex: switch --join to use dfpost7 by default
Makefile.PL | 13 +++++++
lib/PublicInbox/CodeSearch.pm | 15 ++++----
lib/PublicInbox/CodeSearchIdx.pm | 18 +++++-----
lib/PublicInbox/LeiInspect.pm | 1 -
lib/PublicInbox/LeiSearch.pm | 17 ++++-----
lib/PublicInbox/LeiStore.pm | 13 +++----
lib/PublicInbox/Search.pm | 19 +++++-----
lib/PublicInbox/SearchIdx.pm | 13 ++++---
lib/PublicInbox/XapHelper.pm | 24 ++++++++++---
lib/PublicInbox/XapHelperCxx.pm | 19 ++++------
lib/PublicInbox/xap_helper.h | 11 +++++-
lib/PublicInbox/xh_cidx.h | 61 ++++++++++++++++++++++++--------
lib/PublicInbox/xh_mset.h | 2 +-
t/xap_helper.t | 33 +++++++++++++++++
14 files changed, 177 insertions(+), 82 deletions(-)
^ permalink raw reply [relevance 7%]
* [PATCH 1/6] *search: simplify handling of Xapian term iterators
2023-12-08 3:54 7% [PATCH 0/6] cindex join stuff Eric Wong
@ 2023-12-08 3:54 5% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-12-08 3:54 UTC (permalink / raw)
To: meta
Xapian has always sorted termlist iterators, so we now:
1) break out of the iterator loop early on non-matches
2) avoid doing sorting ourselves
As a result, we'll also favor the wantarray forms of xap_terms
and all_terms to preserve sort order in most cases.
Confirmed by the Xapian maintainer: <20231201184844.GO4059@survex.com>
Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html
---
lib/PublicInbox/LeiInspect.pm | 1 -
lib/PublicInbox/Search.pm | 19 ++++++++++---------
lib/PublicInbox/SearchIdx.pm | 13 ++++++-------
lib/PublicInbox/xh_cidx.h | 15 +++++----------
lib/PublicInbox/xh_mset.h | 2 +-
5 files changed, 22 insertions(+), 28 deletions(-)
diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index d4ad03eb..88d7949c 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -97,7 +97,6 @@ sub _inspect_doc ($$) {
my $term = ($1 // '');
push @{$ent->{terms}->{$term}}, $tn;
}
- @$_ = sort(@$_) for values %{$ent->{terms} // {}};
$cur = $doc->values_begin;
$end = $doc->values_end;
for (; $cur != $end; $cur++) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8ef17d58..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -614,16 +614,16 @@ sub get_pct ($) { # mset item
sub xap_terms ($$;@) {
my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
- my %ret;
my $end = $xdb_or_doc->termlist_end(@docid);
my $cur = $xdb_or_doc->termlist_begin(@docid);
+ $cur->skip_to($pfx);
+ my (@ret, $tn);
+ my $pfxlen = length($pfx);
for (; $cur != $end; $cur++) {
- $cur->skip_to($pfx);
- last if $cur == $end;
- my $tn = $cur->get_termname;
- $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+ $tn = $cur->get_termname;
+ index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
}
- wantarray ? sort(keys(%ret)) : \%ret;
+ wantarray ? @ret : +{ map { $_ => undef } @ret };
}
# get combined docid from over.num:
@@ -638,11 +638,12 @@ sub all_terms {
my ($self, $pfx) = @_;
my $cur = xdb($self)->allterms_begin($pfx);
my $end = $self->{xdb}->allterms_end($pfx);
- my %ret;
+ my $pfxlen = length($pfx);
+ my @ret;
for (; $cur != $end; $cur++) {
- $ret{substr($cur->get_termname, length($pfx))} = undef;
+ push @ret, substr($cur->get_termname, $pfxlen);
}
- wantarray ? (sort keys %ret) : \%ret;
+ wantarray ? @ret : +{ map { $_ => undef } @ret };
}
sub xh_args { # prep getopt args to feed to xap_helper.h socket
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1bf471fc..1ac8e33e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
-my @VMD_MAP = (kw => 'K', L => 'L');
+my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters
our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
sub new {
@@ -608,17 +608,16 @@ sub set_vmd {
my ($self, $docid, $vmd) = @_;
begin_txn_lazy($self);
my $doc = _get_doc($self, $docid) or return;
- my ($end, @rm, @add);
+ my ($v, @rm, @add);
my @x = @VMD_MAP;
+ my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end);
while (my ($field, $pfx) = splice(@x, 0, 2)) {
my $set = $vmd->{$field} // next;
my %keep = map { $_ => 1 } @$set;
my %add = %keep;
- $end //= $doc->termlist_end;
- for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
- $cur->skip_to($pfx);
- last if $cur == $end;
- my $v = $cur->get_termname;
+ $cur->skip_to($pfx); # works due to @VMD_MAP order
+ for (; $cur != $end; $cur++) {
+ $v = $cur->get_termname;
$v =~ s/\A$pfx//s or next;
$keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v);
}
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 1980f9f6..2803b3a4 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx,
for (cur.skip_to(pfx); cur != end; cur++) {
std::string tn = *cur;
-
- if (starts_with(&tn, pfx, pfx_len)) {
- fprintf(req->fp[0], "%s %s\n",
- tn.c_str() + pfx_len, ibx_id);
- ++req->nr_out;
- }
+ if (!starts_with(&tn, pfx, pfx_len)) break;
+ fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
+ ++req->nr_out;
}
}
@@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
fbuf_init(root_offs);
for (cur.skip_to("G"); cur != end; cur++) {
std::string tn = *cur;
- if (!starts_with(&tn, "G", 1))
- continue;
+ if (!starts_with(&tn, "G", 1)) break;
union { const char *in; char *out; } u;
u.in = tn.c_str() + 1;
e.key = u.out;
@@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx,
for (cur.skip_to(pfx); cur != end; cur++) {
std::string tn = *cur;
- if (!starts_with(&tn, pfx, pfx_len))
- continue;
+ if (!starts_with(&tn, pfx, pfx_len)) break;
fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
++req->nr_out;
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
index 056fe22b..4e97a284 100644
--- a/lib/PublicInbox/xh_mset.h
+++ b/lib/PublicInbox/xh_mset.h
@@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
for (cur.skip_to(pfx); cur != end; cur++) {
std::string tn = *cur;
- if (!starts_with(&tn, pfx, pfx_len)) continue;
+ if (!starts_with(&tn, pfx, pfx_len)) break;
fputc(0, fp);
fwrite(tn.data(), tn.size(), 1, fp);
}
^ permalink raw reply related [relevance 5%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-12-08 3:54 7% [PATCH 0/6] cindex join stuff Eric Wong
2023-12-08 3:54 5% ` [PATCH 1/6] *search: simplify handling of Xapian term iterators Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).