user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 0/6] cindex join stuff
@ 2023-12-08  3:54  7% Eric Wong
  2023-12-08  3:54  5% ` [PATCH 1/6] *search: simplify handling of Xapian term iterators Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-12-08  3:54 UTC (permalink / raw)
  To: meta

Patches 1-2 are small speedups, 3-4 are developer-facing improvements,
and 5-6 ought to actually improve and future-proof join accuracy.

Eric Wong (6):
  *search: simplify handling of Xapian term iterators
  *search: favor wantarray form of xap_terms
  xap_helper_cxx: drop chdir usage in build
  makefile: add `check-build' target
  xap_helper: support term length limit
  cindex: switch --join to use dfpost7 by default

 Makefile.PL                      | 13 +++++++
 lib/PublicInbox/CodeSearch.pm    | 15 ++++----
 lib/PublicInbox/CodeSearchIdx.pm | 18 +++++-----
 lib/PublicInbox/LeiInspect.pm    |  1 -
 lib/PublicInbox/LeiSearch.pm     | 17 ++++-----
 lib/PublicInbox/LeiStore.pm      | 13 +++----
 lib/PublicInbox/Search.pm        | 19 +++++-----
 lib/PublicInbox/SearchIdx.pm     | 13 ++++---
 lib/PublicInbox/XapHelper.pm     | 24 ++++++++++---
 lib/PublicInbox/XapHelperCxx.pm  | 19 ++++------
 lib/PublicInbox/xap_helper.h     | 11 +++++-
 lib/PublicInbox/xh_cidx.h        | 61 ++++++++++++++++++++++++--------
 lib/PublicInbox/xh_mset.h        |  2 +-
 t/xap_helper.t                   | 33 +++++++++++++++++
 14 files changed, 177 insertions(+), 82 deletions(-)

^ permalink raw reply	[relevance 7%]

* [PATCH 1/6] *search: simplify handling of Xapian term iterators
  2023-12-08  3:54  7% [PATCH 0/6] cindex join stuff Eric Wong
@ 2023-12-08  3:54  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-12-08  3:54 UTC (permalink / raw)
  To: meta

Xapian termlist iterators have always returned terms in sorted
order, so we now:

1) break out of the iterator loop early on the first non-match
2) avoid sorting the results ourselves

As a result, we'll also favor the wantarray (list context) forms of
xap_terms and all_terms in most cases, since only the list form
preserves sort order (the hashref form cannot).

Confirmed by the Xapian maintainer: <20231201184844.GO4059@survex.com>

Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html
---
 lib/PublicInbox/LeiInspect.pm |  1 -
 lib/PublicInbox/Search.pm     | 19 ++++++++++---------
 lib/PublicInbox/SearchIdx.pm  | 13 ++++++-------
 lib/PublicInbox/xh_cidx.h     | 15 +++++----------
 lib/PublicInbox/xh_mset.h     |  2 +-
 5 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index d4ad03eb..88d7949c 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -97,7 +97,6 @@ sub _inspect_doc ($$) {
 		my $term = ($1 // '');
 		push @{$ent->{terms}->{$term}}, $tn;
 	}
-	@$_ = sort(@$_) for values %{$ent->{terms} // {}};
 	$cur = $doc->values_begin;
 	$end = $doc->values_end;
 	for (; $cur != $end; $cur++) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8ef17d58..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -614,16 +614,16 @@ sub get_pct ($) { # mset item
 
 sub xap_terms ($$;@) {
 	my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
-	my %ret;
 	my $end = $xdb_or_doc->termlist_end(@docid);
 	my $cur = $xdb_or_doc->termlist_begin(@docid);
+	$cur->skip_to($pfx);
+	my (@ret, $tn);
+	my $pfxlen = length($pfx);
 	for (; $cur != $end; $cur++) {
-		$cur->skip_to($pfx);
-		last if $cur == $end;
-		my $tn = $cur->get_termname;
-		$ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+		$tn = $cur->get_termname;
+		index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
 	}
-	wantarray ? sort(keys(%ret)) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 # get combined docid from over.num:
@@ -638,11 +638,12 @@ sub all_terms {
 	my ($self, $pfx) = @_;
 	my $cur = xdb($self)->allterms_begin($pfx);
 	my $end = $self->{xdb}->allterms_end($pfx);
-	my %ret;
+	my $pfxlen = length($pfx);
+	my @ret;
 	for (; $cur != $end; $cur++) {
-		$ret{substr($cur->get_termname, length($pfx))} = undef;
+		push @ret, substr($cur->get_termname, $pfxlen);
 	}
-	wantarray ? (sort keys %ret) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 sub xh_args { # prep getopt args to feed to xap_helper.h socket
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1bf471fc..1ac8e33e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
-my @VMD_MAP = (kw => 'K', L => 'L');
+my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -608,17 +608,16 @@ sub set_vmd {
 	my ($self, $docid, $vmd) = @_;
 	begin_txn_lazy($self);
 	my $doc = _get_doc($self, $docid) or return;
-	my ($end, @rm, @add);
+	my ($v, @rm, @add);
 	my @x = @VMD_MAP;
+	my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end);
 	while (my ($field, $pfx) = splice(@x, 0, 2)) {
 		my $set = $vmd->{$field} // next;
 		my %keep = map { $_ => 1 } @$set;
 		my %add = %keep;
-		$end //= $doc->termlist_end;
-		for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
-			$cur->skip_to($pfx);
-			last if $cur == $end;
-			my $v = $cur->get_termname;
+		$cur->skip_to($pfx); # works due to @VMD_MAP order
+		for (; $cur != $end; $cur++) {
+			$v = $cur->get_termname;
 			$v =~ s/\A$pfx//s or next;
 			$keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v);
 		}
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 1980f9f6..2803b3a4 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-
-		if (starts_with(&tn, pfx, pfx_len)) {
-			fprintf(req->fp[0], "%s %s\n",
-				tn.c_str() + pfx_len, ibx_id);
-			++req->nr_out;
-		}
+		if (!starts_with(&tn, pfx, pfx_len)) break;
+		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
+		++req->nr_out;
 	}
 }
 
@@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 	fbuf_init(root_offs);
 	for (cur.skip_to("G"); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, "G", 1))
-			continue;
+		if (!starts_with(&tn, "G", 1)) break;
 		union { const char *in; char *out; } u;
 		u.in = tn.c_str() + 1;
 		e.key = u.out;
@@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len))
-			continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
 		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
 		++req->nr_out;
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
index 056fe22b..4e97a284 100644
--- a/lib/PublicInbox/xh_mset.h
+++ b/lib/PublicInbox/xh_mset.h
@@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len)) continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputc(0, fp);
 		fwrite(tn.data(), tn.size(), 1, fp);
 	}

^ permalink raw reply related	[relevance 5%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-12-08  3:54  7% [PATCH 0/6] cindex join stuff Eric Wong
2023-12-08  3:54  5% ` [PATCH 1/6] *search: simplify handling of Xapian term iterators Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).