From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id CFB781F518 for ; Tue, 28 Nov 2023 14:56:28 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1701183388; bh=BFnLi+4OQP2FqvpOQPhfEmF2XFmkld5AN65jnJu0Qlk=; h=From:To:Subject:Date:In-Reply-To:References:From; b=sPlTpY0EPJpfMJedXZWwWprmuzze0nfofNmrg6/nBjEIz1CBD+ac3KbSGQDMCLIqq 4OEoaD1UQzhElhXdD2lz5q+NUFtBVHKnrCkPST4k2yBkkLFEtjmAQiU/clSUqjJCe1 +9KqkfrrxsXymw0ZVR8ebDXzbDDgZJGLnBrCZEzM= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 05/14] xap_helper.h: move cindex endpoints to separate file Date: Tue, 28 Nov 2023 14:56:18 +0000 Message-ID: <20231128145628.1455176-6-e@80x24.org> In-Reply-To: <20231128145628.1455176-1-e@80x24.org> References: <20231128145628.1455176-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It ought to help a bit with organization since xap_helper.h is getting somewhat large and we'll need new endpoints to support WWW, lei, and whatever else needs to come. 
--- MANIFEST | 1 + lib/PublicInbox/XapHelperCxx.pm | 10 +- lib/PublicInbox/xap_helper.h | 269 +------------------------------- lib/PublicInbox/xh_cidx.h | 259 ++++++++++++++++++++++++++++++ 4 files changed, 272 insertions(+), 267 deletions(-) create mode 100644 lib/PublicInbox/xh_cidx.h diff --git a/MANIFEST b/MANIFEST index 85811133..bbbe0b91 100644 --- a/MANIFEST +++ b/MANIFEST @@ -378,6 +378,7 @@ lib/PublicInbox/XapHelperCxx.pm lib/PublicInbox/Xapcmd.pm lib/PublicInbox/gcf2_libgit2.h lib/PublicInbox/xap_helper.h +lib/PublicInbox/xh_cidx.h sa_config/Makefile sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm index f421c7bc..8a66fdcd 100644 --- a/lib/PublicInbox/XapHelperCxx.pm +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -20,7 +20,7 @@ $ENV{PERL_INLINE_DIRECTORY} // die('BUG: PERL_INLINE_DIRECTORY unset'); substr($dir, 0, 0) = "$ENV{PERL_INLINE_DIRECTORY}/"; my $bin = "$dir/xap_helper"; my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!); -my @srcs = map { $srcpfx.$_ } qw(xap_helper.h); +my @srcs = map { $srcpfx.$_ } qw(xap_helper.h xh_cidx.h); my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm); my $ldflags = '-Wl,-O1'; $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; @@ -61,11 +61,9 @@ sub build () { require PublicInbox::OnDestroy; my ($prog) = ($bin =~ m!/([^/]+)\z!); my $lk = PublicInbox::Lock->new("$dir/$prog.lock")->lock_for_scope; - open my $fh, '>', "$dir/$prog.cpp"; - say $fh qq(# include "$_") for @srcs; - print $fh PublicInbox::Search::generate_cxx(); - print $fh PublicInbox::CodeSearch::generate_cxx(); - close $fh; + write_file '>', "$dir/$prog.cpp", qq{#include "xap_helper.h"\n}, + PublicInbox::Search::generate_cxx(), + PublicInbox::CodeSearch::generate_cxx(); opendir my $dh, '.'; my $restore = PublicInbox::OnDestroy->new(\&chdir, $dh); diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 5816c24c..89d151d9 
100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -146,6 +146,12 @@ struct worker { unsigned nr; }; +struct fbuf { + FILE *fp; + char *ptr; + size_t len; +}; + #define SPLIT2ARGV(dst,buf,len) split2argv(dst,buf,len,MY_ARRAY_SIZE(dst)) static size_t split2argv(char **dst, char *buf, size_t len, size_t limit) { @@ -253,87 +259,11 @@ static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len) return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len); } -static void dump_ibx_term(struct req *req, const char *pfx, - Xapian::Document *doc, const char *ibx_id) -{ - Xapian::TermIterator cur = doc->termlist_begin(); - Xapian::TermIterator end = doc->termlist_end(); - size_t pfx_len = strlen(pfx); - - for (cur.skip_to(pfx); cur != end; cur++) { - std::string tn = *cur; - - if (starts_with(&tn, pfx, pfx_len)) { - fprintf(req->fp[0], "%s %s\n", - tn.c_str() + pfx_len, ibx_id); - ++req->nr_out; - } - } -} - static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors { return setvbuf(fp, NULL, _IOLBF, 0); } -static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, - Xapian::MSetIterator *i) -{ - try { - Xapian::Document doc = i->get_document(); - for (int p = 0; p < req->pfxc; p++) - dump_ibx_term(req, req->pfxv[p], &doc, ibx_id); - } catch (const Xapian::DatabaseModifiedError & e) { - req->srch->db->reopen(); - return ITER_RETRY; - } catch (const Xapian::DocNotFoundError & e) { // oh well... - warnx("doc not found: %s", e.get_description().c_str()); - } - return ITER_OK; -} - -static bool cmd_dump_ibx(struct req *req) -{ - if ((optind + 1) >= req->argc) - ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR"); - if (!req->pfxc) - ABORT("dump_ibx requires -A PREFIX"); - - const char *ibx_id = req->argv[optind]; - if (my_setlinebuf(req->fp[0])) // for sort(1) pipe - EABORT("setlinebuf(fp[0])"); // WTF? 
- req->asc = true; - req->sort_col = -1; - Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); - - // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine - // in case we need to retry on DB reopens - for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { - for (int t = 10; t > 0; --t) - switch (dump_ibx_iter(req, ibx_id, &i)) { - case ITER_OK: t = 0; break; // leave inner loop - case ITER_RETRY: break; // continue for-loop - case ITER_ABORT: return false; // error - } - } - emit_mset_stats(req, &mset); - return true; -} - -struct fbuf { - FILE *fp; - char *ptr; - size_t len; -}; - -struct dump_roots_tmp { - struct stat sb; - void *mm_ptr; - char **entries; - struct fbuf wbuf; - int root2off_fd; -}; - // n.b. __cleanup__ works fine with C++ exceptions, but not longjmp // Only clang and g++ are supported, as AFAIK there's no other // relevant Free(-as-in-speech) C++ compilers. @@ -367,127 +297,6 @@ static size_t off2size(off_t n) return (size_t)n; } -#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure))) -static void dump_roots_ensure(void *ptr) -{ - struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; - if (drt->root2off_fd >= 0) - xclose(drt->root2off_fd); - hdestroy(); // idempotent - size_t size = off2size(drt->sb.st_size); - if (drt->mm_ptr && munmap(drt->mm_ptr, size)) - EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); - free(drt->entries); - fbuf_ensure(&drt->wbuf); -} - -static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) -{ - Xapian::TermIterator cur = doc->termlist_begin(); - Xapian::TermIterator end = doc->termlist_end(); - ENTRY e, *ep; - fbuf_init(root_offs); - for (cur.skip_to("G"); cur != end; cur++) { - std::string tn = *cur; - if (!starts_with(&tn, "G", 1)) - continue; - union { const char *in; char *out; } u; - u.in = tn.c_str() + 1; - e.key = u.out; - ep = hsearch(e, FIND); - if (!ep) ABORT("hsearch miss `%s'", e.key); - // ep->data is a NUL-terminated string matching 
/[0-9]+/ - fputc(' ', root_offs->fp); - fputs((const char *)ep->data, root_offs->fp); - } - fputc('\n', root_offs->fp); - if (ferror(root_offs->fp) | fclose(root_offs->fp)) - err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM - root_offs->fp = NULL; - return true; -} - -// writes term values matching @pfx for a given @doc, ending the line -// with the contents of @root_offs -static void dump_roots_term(struct req *req, const char *pfx, - struct dump_roots_tmp *drt, - struct fbuf *root_offs, - Xapian::Document *doc) -{ - Xapian::TermIterator cur = doc->termlist_begin(); - Xapian::TermIterator end = doc->termlist_end(); - size_t pfx_len = strlen(pfx); - - for (cur.skip_to(pfx); cur != end; cur++) { - std::string tn = *cur; - if (!starts_with(&tn, pfx, pfx_len)) - continue; - fputs(tn.c_str() + pfx_len, drt->wbuf.fp); - fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); - ++req->nr_out; - } -} - -// we may have lines which exceed PIPE_BUF, so we do our own -// buffering and rely on flock(2), here -static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) -{ - char *p; - int fd = fileno(req->fp[0]); - bool ok = true; - - if (!drt->wbuf.fp) return true; - if (fd < 0) EABORT("BUG: fileno"); - if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM? - err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)"); - drt->wbuf.fp = NULL; - if (!drt->wbuf.len) goto done_free; - while (flock(drt->root2off_fd, LOCK_EX)) { - if (errno == EINTR) continue; - err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? - } - p = drt->wbuf.ptr; - do { // write to client FD - ssize_t n = write(fd, p, drt->wbuf.len); - if (n > 0) { - drt->wbuf.len -= n; - p += n; - } else { - perror(n ? "write" : "write (zero bytes)"); - return false; - } - } while (drt->wbuf.len); - while (flock(drt->root2off_fd, LOCK_UN)) { - if (errno == EINTR) continue; - err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? 
- } -done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure - free(drt->wbuf.ptr); - drt->wbuf.ptr = NULL; - return ok; -} - -static enum exc_iter dump_roots_iter(struct req *req, - struct dump_roots_tmp *drt, - Xapian::MSetIterator *i) -{ - CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" - try { - Xapian::Document doc = i->get_document(); - if (!root2offs_str(&root_offs, &doc)) - return ITER_ABORT; // bad request, abort - for (int p = 0; p < req->pfxc; p++) - dump_roots_term(req, req->pfxv[p], drt, - &root_offs, &doc); - } catch (const Xapian::DatabaseModifiedError & e) { - req->srch->db->reopen(); - return ITER_RETRY; - } catch (const Xapian::DocNotFoundError & e) { // oh well... - warnx("doc not found: %s", e.get_description().c_str()); - } - return ITER_OK; -} - static char *hsearch_enter_key(char *s) { #if defined(__OpenBSD__) || defined(__DragonFly__) @@ -507,70 +316,6 @@ static char *hsearch_enter_key(char *s) return s; } -static bool cmd_dump_roots(struct req *req) -{ - CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {}; - drt.root2off_fd = -1; - if ((optind + 1) >= req->argc) - ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR"); - if (!req->pfxc) - ABORT("dump_roots requires -A PREFIX"); - const char *root2off_file = req->argv[optind]; - drt.root2off_fd = open(root2off_file, O_RDONLY); - if (drt.root2off_fd < 0) - EABORT("open(%s)", root2off_file); - if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? 
- err(EXIT_FAILURE, "fstat(%s)", root2off_file); - // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), - // so /32 overestimates the number of expected entries by - // ~%25 (as recommended by Linux hcreate(3) manpage) - size_t size = off2size(drt.sb.st_size); - size_t est = (size / 32) + 1; //+1 for "\0" termination - drt.mm_ptr = mmap(NULL, size, PROT_READ, - MAP_PRIVATE, drt.root2off_fd, 0); - if (drt.mm_ptr == MAP_FAILED) - err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); - size_t asize = est * 2; - if (asize < est) ABORT("too many entries: %zu", est); - drt.entries = (char **)calloc(asize, sizeof(char *)); - if (!drt.entries) - err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); - size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); - if (tot <= 0) return false; // split2argv already warned on error - if (!hcreate(est)) - err(EXIT_FAILURE, "hcreate(%zu)", est); - for (size_t i = 0; i < tot; ) { - ENTRY e; - e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM - e.data = drt.entries[i++]; - if (!hsearch(e, ENTER)) - err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, - (const char *)e.data); - } - req->asc = true; - req->sort_col = -1; - Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); - - // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine - // in case we need to retry on DB reopens - for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { - if (!drt.wbuf.fp) - fbuf_init(&drt.wbuf); - for (int t = 10; t > 0; --t) - switch (dump_roots_iter(req, &drt, &i)) { - case ITER_OK: t = 0; break; // leave inner loop - case ITER_RETRY: break; // continue for-loop - case ITER_ABORT: return false; // error - } - if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt)) - return false; - } - if (!dump_roots_flush(req, &drt)) - return false; - emit_mset_stats(req, &mset); - return true; -} - // for test usage only, we need to ensure the compiler supports // __cleanup__ when exceptions 
are thrown struct inspect { struct req *req; }; @@ -594,6 +339,8 @@ static bool cmd_test_inspect(struct req *req) return false; } +#include "xh_cidx.h" // CodeSearchIdx.pm stuff + #define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n } static const struct cmd_entry { size_t fn_len; diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h new file mode 100644 index 00000000..c2d94162 --- /dev/null +++ b/lib/PublicInbox/xh_cidx.h @@ -0,0 +1,259 @@ +// Copyright (C) all contributors +// License: GPL-2.0+ +// This file is only intended to be included by xap_helper.h +// it implements pieces used by CodeSearchIdx.pm + +static void dump_ibx_term(struct req *req, const char *pfx, + Xapian::Document *doc, const char *ibx_id) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + size_t pfx_len = strlen(pfx); + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + + if (starts_with(&tn, pfx, pfx_len)) { + fprintf(req->fp[0], "%s %s\n", + tn.c_str() + pfx_len, ibx_id); + ++req->nr_out; + } + } +} + +static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, + Xapian::MSetIterator *i) +{ + try { + Xapian::Document doc = i->get_document(); + for (int p = 0; p < req->pfxc; p++) + dump_ibx_term(req, req->pfxv[p], &doc, ibx_id); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_ibx(struct req *req) +{ + if ((optind + 1) >= req->argc) + ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR"); + if (!req->pfxc) + ABORT("dump_ibx requires -A PREFIX"); + + const char *ibx_id = req->argv[optind]; + if (my_setlinebuf(req->fp[0])) // for sort(1) pipe + EABORT("setlinebuf(fp[0])"); // WTF? 
+ req->asc = true; + req->sort_col = -1; + Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + for (int t = 10; t > 0; --t) + switch (dump_ibx_iter(req, ibx_id, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + } + emit_mset_stats(req, &mset); + return true; +} + +struct dump_roots_tmp { + struct stat sb; + void *mm_ptr; + char **entries; + struct fbuf wbuf; + int root2off_fd; +}; + +#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure))) +static void dump_roots_ensure(void *ptr) +{ + struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; + if (drt->root2off_fd >= 0) + xclose(drt->root2off_fd); + hdestroy(); // idempotent + size_t size = off2size(drt->sb.st_size); + if (drt->mm_ptr && munmap(drt->mm_ptr, size)) + EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); + free(drt->entries); + fbuf_ensure(&drt->wbuf); +} + +static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + ENTRY e, *ep; + fbuf_init(root_offs); + for (cur.skip_to("G"); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) + continue; + union { const char *in; char *out; } u; + u.in = tn.c_str() + 1; + e.key = u.out; + ep = hsearch(e, FIND); + if (!ep) ABORT("hsearch miss `%s'", e.key); + // ep->data is a NUL-terminated string matching /[0-9]+/ + fputc(' ', root_offs->fp); + fputs((const char *)ep->data, root_offs->fp); + } + fputc('\n', root_offs->fp); + if (ferror(root_offs->fp) | fclose(root_offs->fp)) + err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM + root_offs->fp = NULL; + return true; +} + +// writes term values matching @pfx 
for a given @doc, ending the line +// with the contents of @root_offs +static void dump_roots_term(struct req *req, const char *pfx, + struct dump_roots_tmp *drt, + struct fbuf *root_offs, + Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + size_t pfx_len = strlen(pfx); + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) + continue; + fputs(tn.c_str() + pfx_len, drt->wbuf.fp); + fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); + ++req->nr_out; + } +} + +// we may have lines which exceed PIPE_BUF, so we do our own +// buffering and rely on flock(2), here +static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) +{ + char *p; + int fd = fileno(req->fp[0]); + bool ok = true; + + if (!drt->wbuf.fp) return true; + if (fd < 0) EABORT("BUG: fileno"); + if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM? + err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)"); + drt->wbuf.fp = NULL; + if (!drt->wbuf.len) goto done_free; + while (flock(drt->root2off_fd, LOCK_EX)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? + } + p = drt->wbuf.ptr; + do { // write to client FD + ssize_t n = write(fd, p, drt->wbuf.len); + if (n > 0) { + drt->wbuf.len -= n; + p += n; + } else { + perror(n ? "write" : "write (zero bytes)"); + return false; + } + } while (drt->wbuf.len); + while (flock(drt->root2off_fd, LOCK_UN)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? 
+ } +done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure + free(drt->wbuf.ptr); + drt->wbuf.ptr = NULL; + return ok; +} + +static enum exc_iter dump_roots_iter(struct req *req, + struct dump_roots_tmp *drt, + Xapian::MSetIterator *i) +{ + CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" + try { + Xapian::Document doc = i->get_document(); + if (!root2offs_str(&root_offs, &doc)) + return ITER_ABORT; // bad request, abort + for (int p = 0; p < req->pfxc; p++) + dump_roots_term(req, req->pfxv[p], drt, + &root_offs, &doc); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_roots(struct req *req) +{ + CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {}; + drt.root2off_fd = -1; + if ((optind + 1) >= req->argc) + ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR"); + if (!req->pfxc) + ABORT("dump_roots requires -A PREFIX"); + const char *root2off_file = req->argv[optind]; + drt.root2off_fd = open(root2off_file, O_RDONLY); + if (drt.root2off_fd < 0) + EABORT("open(%s)", root2off_file); + if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? 
+ err(EXIT_FAILURE, "fstat(%s)", root2off_file); + // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), + // so /32 overestimates the number of expected entries by + // ~%25 (as recommended by Linux hcreate(3) manpage) + size_t size = off2size(drt.sb.st_size); + size_t est = (size / 32) + 1; //+1 for "\0" termination + drt.mm_ptr = mmap(NULL, size, PROT_READ, + MAP_PRIVATE, drt.root2off_fd, 0); + if (drt.mm_ptr == MAP_FAILED) + err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); + size_t asize = est * 2; + if (asize < est) ABORT("too many entries: %zu", est); + drt.entries = (char **)calloc(asize, sizeof(char *)); + if (!drt.entries) + err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); + size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); + if (tot <= 0) return false; // split2argv already warned on error + if (!hcreate(est)) + err(EXIT_FAILURE, "hcreate(%zu)", est); + for (size_t i = 0; i < tot; ) { + ENTRY e; + e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM + e.data = drt.entries[i++]; + if (!hsearch(e, ENTER)) + err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, + (const char *)e.data); + } + req->asc = true; + req->sort_col = -1; + Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + if (!drt.wbuf.fp) + fbuf_init(&drt.wbuf); + for (int t = 10; t > 0; --t) + switch (dump_roots_iter(req, &drt, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt)) + return false; + } + if (!dump_roots_flush(req, &drt)) + return false; + emit_mset_stats(req, &mset); + return true; +}