user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 5/6] xap_helper: support term length limit
  2023-12-08  3:54  7% [PATCH 0/6] cindex join stuff Eric Wong
@ 2023-12-08  3:54  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-12-08  3:54 UTC (permalink / raw)
  To: meta

This will allow us to use p2q-compatible specifications such as
"dfpost7" to only capture blob OIDs which are exactly 7 characters
in length (the indexer will always index down to 7 characters).
---
 lib/PublicInbox/XapHelper.pm | 24 +++++++++++++++---
 lib/PublicInbox/xap_helper.h | 11 ++++++++-
 lib/PublicInbox/xh_cidx.h    | 48 ++++++++++++++++++++++++++++++++----
 t/xap_helper.t               | 33 +++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm
index b21e70a2..ed11a2f8 100644
--- a/lib/PublicInbox/XapHelper.pm
+++ b/lib/PublicInbox/XapHelper.pm
@@ -39,13 +39,24 @@ sub iter_retry_check ($) {
 	}
 }
 
+sub term_length_extract ($) {
+	my ($req) = @_;
+	@{$req->{A_len}} = map {
+		my $len = s/([0-9]+)\z// ? ($1 + 0) : undef;
+		[ $_, $len ];
+	} @{$req->{A}};
+}
+
 sub dump_ibx_iter ($$$) {
 	my ($req, $ibx_id, $it) = @_;
 	my $out = $req->{0};
 	eval {
 		my $doc = $it->get_document;
-		for my $p (@{$req->{A}}) {
-			for (xap_terms($p, $doc)) {
+		for my $pair (@{$req->{A_len}}) {
+			my ($pfx, $len) = @$pair;
+			my @t = xap_terms($pfx, $doc);
+			@t = grep { length == $len } @t if defined($len);
+			for (@t) {
 				print $out "$_ $ibx_id\n" or die "print: $!";
 				++$req->{nr_out};
 			}
@@ -64,6 +75,7 @@ sub cmd_dump_ibx {
 	my ($req, $ibx_id, $qry_str) = @_;
 	$qry_str // die 'usage: dump_ibx [OPTIONS] IBX_ID QRY_STR';
 	$req->{A} or die 'dump_ibx requires -A PREFIX';
+	term_length_extract $req;
 	my $max = $req->{'m'} // $req->{srch}->{xdb}->get_doccount;
 	my $opt = { relevance => -1, limit => $max, offset => $req->{o} // 0 };
 	$opt->{eidx_key} = $req->{O} if defined $req->{O};
@@ -82,8 +94,11 @@ sub dump_roots_iter ($$$) {
 	eval {
 		my $doc = $it->get_document;
 		my $G = join(' ', map { $root2off->{$_} } xap_terms('G', $doc));
-		for my $p (@{$req->{A}}) {
-			for (xap_terms($p, $doc)) {
+		for my $pair (@{$req->{A_len}}) {
+			my ($pfx, $len) = @$pair;
+			my @t = xap_terms($pfx, $doc);
+			@t = grep { length == $len } @t if defined($len);
+			for (@t) {
 				$req->{wbuf} .= "$_ $G\n";
 				++$req->{nr_out};
 			}
@@ -106,6 +121,7 @@ sub cmd_dump_roots {
 	my ($req, $root2off_file, $qry_str) = @_;
 	$qry_str // die 'usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR';
 	$req->{A} or die 'dump_roots requires -A PREFIX';
+	term_length_extract $req;
 	open my $fh, '<', $root2off_file;
 	my $root2off; # record format: $OIDHEX "\0" uint32_t
 	my @x = split(/\0/, read_all $fh);
diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h
index 1f8c426b..3456910b 100644
--- a/lib/PublicInbox/xap_helper.h
+++ b/lib/PublicInbox/xap_helper.h
@@ -123,6 +123,7 @@ typedef bool (*cmd)(struct req *);
 struct req { // argv and pfxv point into global rbuf
 	char *argv[MY_ARG_MAX];
 	char *pfxv[MY_ARG_MAX]; // -A <prefix>
+	size_t *lenv; // -A <prefix>LENGTH
 	struct srch *srch;
 	char *Pgit_dir;
 	char *Oeidx_key;
@@ -727,6 +728,13 @@ static void sigw(int sig) // SIGTERM handler for worker
 	sock_fd = -1; // break out of recv_loop
 }
 
+#define CLEANUP_REQ __attribute__((__cleanup__(req_cleanup)))
+static void req_cleanup(void *ptr)
+{
+	struct req *req = (struct req *)ptr;
+	free(req->lenv);
+}
+
 static void recv_loop(void) // worker process loop
 {
 	static char rbuf[4096 * 33]; // per-process
@@ -737,7 +745,8 @@ static void recv_loop(void) // worker process loop
 
 	while (sock_fd == 0) {
 		size_t len = sizeof(rbuf);
-		struct req req = {};
+		CLEANUP_REQ struct req req = {};
+
 		if (!recv_req(&req, rbuf, &len))
 			continue;
 		if (req.fp[1])
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 2803b3a4..311ca05f 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -3,16 +3,49 @@
 // This file is only intended to be included by xap_helper.h
 // it implements pieces used by CodeSearchIdx.pm
 
-static void dump_ibx_term(struct req *req, const char *pfx,
+static void term_length_extract(struct req *req)
+{
+	req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t));
+	if (!req->lenv)
+		EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t));
+	for (int i = 0; i < req->pfxc; i++) {
+		char *pfx = req->pfxv[i];
+		// extract trailing digits as length:
+		// $len = s/([0-9]+)\z// ? ($1+0) : 0
+		for (size_t j = 0; pfx[j]; j++) {
+			if (pfx[j] < '0' || pfx[j] > '9')
+				continue;
+			if (j == 0) {
+				warnx("W: `%s' not a valid prefix", pfx);
+				continue;
+			}
+			char *end;
+			unsigned long long tmp = strtoull(pfx + j, &end, 10);
+			if (*end || tmp >= (unsigned long long)SIZE_MAX) {
+				warnx("W: `%s' not recognized", pfx);
+			} else {
+				req->lenv[i] = (size_t)tmp;
+				pfx[j] = 0;
+				break;
+			}
+		}
+	}
+}
+
+static void dump_ibx_term(struct req *req, int p,
 			Xapian::Document *doc, const char *ibx_id)
 {
 	Xapian::TermIterator cur = doc->termlist_begin();
 	Xapian::TermIterator end = doc->termlist_end();
+	const char *pfx = req->pfxv[p];
 	size_t pfx_len = strlen(pfx);
+	size_t term_len = req->lenv[p];
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
 		if (!starts_with(&tn, pfx, pfx_len)) break;
+		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
+			continue;
 		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
 		++req->nr_out;
 	}
@@ -24,7 +57,7 @@ static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
 	try {
 		Xapian::Document doc = i->get_document();
 		for (int p = 0; p < req->pfxc; p++)
-			dump_ibx_term(req, req->pfxv[p], &doc, ibx_id);
+			dump_ibx_term(req, p, &doc, ibx_id);
 	} catch (const Xapian::DatabaseModifiedError & e) {
 		req->srch->db->reopen();
 		return ITER_RETRY;
@@ -46,6 +79,7 @@ static bool cmd_dump_ibx(struct req *req)
 		EABORT("setlinebuf(fp[0])"); // WTF?
 	req->asc = true;
 	req->sort_col = -1;
+	term_length_extract(req);
 	Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);
 
 	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
@@ -110,18 +144,22 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 
 // writes term values matching @pfx for a given @doc, ending the line
 // with the contents of @root_offs
-static void dump_roots_term(struct req *req, const char *pfx,
+static void dump_roots_term(struct req *req, int p,
 				struct dump_roots_tmp *drt,
 				struct fbuf *root_offs,
 				Xapian::Document *doc)
 {
 	Xapian::TermIterator cur = doc->termlist_begin();
 	Xapian::TermIterator end = doc->termlist_end();
+	const char *pfx = req->pfxv[p];
 	size_t pfx_len = strlen(pfx);
+	size_t term_len = req->lenv[p];
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
 		if (!starts_with(&tn, pfx, pfx_len)) break;
+		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
+			continue;
 		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
 		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
 		++req->nr_out;
@@ -163,8 +201,7 @@ static enum exc_iter dump_roots_iter(struct req *req,
 		if (!root2offs_str(&root_offs, &doc))
 			return ITER_ABORT; // bad request, abort
 		for (int p = 0; p < req->pfxc; p++)
-			dump_roots_term(req, req->pfxv[p], drt,
-					&root_offs, &doc);
+			dump_roots_term(req, p, drt, &root_offs, &doc);
 	} catch (const Xapian::DatabaseModifiedError & e) {
 		req->srch->db->reopen();
 		return ITER_RETRY;
@@ -217,6 +254,7 @@ static bool cmd_dump_roots(struct req *req)
 	req->asc = true;
 	req->sort_col = -1;
 	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
+	term_length_extract(req);
 
 	fbuf_init(&drt.wbuf);
 
diff --git a/t/xap_helper.t b/t/xap_helper.t
index ec78998c..be010c75 100644
--- a/t/xap_helper.t
+++ b/t/xap_helper.t
@@ -241,6 +241,39 @@ for my $n (@NO_CXX) {
 				"#$docid $pfx as expected ($xhc->{impl})";
 		}
 	}
+	my $nr;
+	for my $i (7, 8, 39, 40) {
+		pipe($err_r, $err_w);
+		$r = $xhc->mkreq([ undef, $err_w ], qw(dump_roots -c -A),
+				"XDFPOST$i", (map { ('-d', $_) } @int),
+				$root2id_file, 'dt:19700101'.'000000..');
+		close $err_w;
+		@res = <$r>;
+		my @err = <$err_r>;
+		if (defined $nr) {
+			is scalar(@res), $nr,
+				"got expected results ($xhc->{impl})";
+		} else {
+			$nr //= scalar @res;
+			ok $nr, "got initial results ($xhc->{impl})";
+		}
+		my @oids = (join('', @res) =~ /^([a-f0-9]+) /gms);
+		is_deeply [grep { length == $i } @oids], \@oids,
+			"all OIDs match expected length ($xhc->{impl})";
+		my ($nr_out) = ("@err" =~ /nr_out=(\d+)/);
+		is $nr_out, scalar(@oids), "output count matches $xhc->{impl}"
+			or diag explain(\@res, \@err);
+	}
+	pipe($err_r, $err_w);
+	$r = $xhc->mkreq([ undef, $err_w ], qw(dump_ibx -A XDFPOST7),
+			@ibx_shard_args, qw(13 rt:0..));
+	close $err_w;
+	@res = <$r>;
+	my @err = <$err_r>;
+	my ($nr_out) = ("@err" =~ /nr_out=(\d+)/);
+	my @oids = (join('', @res) =~ /^([a-f0-9]{7}) /gms);
+	is $nr_out, scalar(@oids), "output count matches $xhc->{impl}" or
+		diag explain(\@res, \@err);
 }
 
 done_testing;

^ permalink raw reply related	[relevance 4%]

* [PATCH 0/6] cindex join stuff
@ 2023-12-08  3:54  7% Eric Wong
  2023-12-08  3:54  4% ` [PATCH 5/6] xap_helper: support term length limit Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-12-08  3:54 UTC (permalink / raw)
  To: meta

Patches 1-2 are small speedups, 3-4 are dev improvements, and 5-6
ought to actually improve and future-proof join accuracy.

Eric Wong (6):
  *search: simplify handling of Xapian term iterators
  *search: favor wantarray form of xap_terms
  xap_helper_cxx: drop chdir usage in build
  makefile: add `check-build' target
  xap_helper: support term length limit
  cindex: switch --join to use dfpost7 by default

 Makefile.PL                      | 13 +++++++
 lib/PublicInbox/CodeSearch.pm    | 15 ++++----
 lib/PublicInbox/CodeSearchIdx.pm | 18 +++++-----
 lib/PublicInbox/LeiInspect.pm    |  1 -
 lib/PublicInbox/LeiSearch.pm     | 17 ++++-----
 lib/PublicInbox/LeiStore.pm      | 13 +++----
 lib/PublicInbox/Search.pm        | 19 +++++-----
 lib/PublicInbox/SearchIdx.pm     | 13 ++++---
 lib/PublicInbox/XapHelper.pm     | 24 ++++++++++---
 lib/PublicInbox/XapHelperCxx.pm  | 19 ++++------
 lib/PublicInbox/xap_helper.h     | 11 +++++-
 lib/PublicInbox/xh_cidx.h        | 61 ++++++++++++++++++++++++--------
 lib/PublicInbox/xh_mset.h        |  2 +-
 t/xap_helper.t                   | 33 +++++++++++++++++
 14 files changed, 177 insertions(+), 82 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-12-08  3:54  7% [PATCH 0/6] cindex join stuff Eric Wong
2023-12-08  3:54  4% ` [PATCH 5/6] xap_helper: support term length limit Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).