about summary refs log tree commit homepage
path: root/lib/PublicInbox/xh_cidx.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/xh_cidx.h')
-rw-r--r--lib/PublicInbox/xh_cidx.h259
1 files changed, 259 insertions, 0 deletions
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
new file mode 100644
index 00000000..c2d94162
--- /dev/null
+++ b/lib/PublicInbox/xh_cidx.h
@@ -0,0 +1,259 @@
+// Copyright (C) all contributors <meta@public-inbox.org>
+// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+// This file is only intended to be included by xap_helper.h
+// it implements pieces used by CodeSearchIdx.pm
+
+static void dump_ibx_term(struct req *req, const char *pfx,
+                        Xapian::Document *doc, const char *ibx_id)
+{
+        Xapian::TermIterator cur = doc->termlist_begin();
+        Xapian::TermIterator end = doc->termlist_end();
+        size_t pfx_len = strlen(pfx);
+
+        for (cur.skip_to(pfx); cur != end; cur++) {
+                std::string tn = *cur;
+
+                if (starts_with(&tn, pfx, pfx_len)) {
+                        fprintf(req->fp[0], "%s %s\n",
+                                tn.c_str() + pfx_len, ibx_id);
+                        ++req->nr_out;
+                }
+        }
+}
+
+static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
+                                Xapian::MSetIterator *i)
+{
+        try {
+                Xapian::Document doc = i->get_document();
+                for (int p = 0; p < req->pfxc; p++)
+                        dump_ibx_term(req, req->pfxv[p], &doc, ibx_id);
+        } catch (const Xapian::DatabaseModifiedError & e) {
+                req->srch->db->reopen();
+                return ITER_RETRY;
+        } catch (const Xapian::DocNotFoundError & e) { // oh well...
+                warnx("doc not found: %s", e.get_description().c_str());
+        }
+        return ITER_OK;
+}
+
+static bool cmd_dump_ibx(struct req *req)
+{
+        if ((optind + 1) >= req->argc)
+                ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR");
+        if (!req->pfxc)
+                ABORT("dump_ibx requires -A PREFIX");
+
+        const char *ibx_id = req->argv[optind];
+        if (my_setlinebuf(req->fp[0])) // for sort(1) pipe
+                EABORT("setlinebuf(fp[0])"); // WTF?
+        req->asc = true;
+        req->sort_col = -1;
+        Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);
+
+        // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
+        // in case we need to retry on DB reopens
+        for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
+                for (int t = 10; t > 0; --t)
+                        switch (dump_ibx_iter(req, ibx_id, &i)) {
+                        case ITER_OK: t = 0; break; // leave inner loop
+                        case ITER_RETRY: break; // continue for-loop
+                        case ITER_ABORT: return false; // error
+                        }
+        }
+        emit_mset_stats(req, &mset);
+        return true;
+}
+
+struct dump_roots_tmp {
+        struct stat sb;
+        void *mm_ptr;
+        char **entries;
+        struct fbuf wbuf;
+        int root2off_fd;
+};
+
+#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure)))
+static void dump_roots_ensure(void *ptr)
+{
+        struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr;
+        if (drt->root2off_fd >= 0)
+                xclose(drt->root2off_fd);
+        hdestroy(); // idempotent
+        size_t size = off2size(drt->sb.st_size);
+        if (drt->mm_ptr && munmap(drt->mm_ptr, size))
+                EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size);
+        free(drt->entries);
+        fbuf_ensure(&drt->wbuf);
+}
+
+static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
+{
+        Xapian::TermIterator cur = doc->termlist_begin();
+        Xapian::TermIterator end = doc->termlist_end();
+        ENTRY e, *ep;
+        fbuf_init(root_offs);
+        for (cur.skip_to("G"); cur != end; cur++) {
+                std::string tn = *cur;
+                if (!starts_with(&tn, "G", 1))
+                        continue;
+                union { const char *in; char *out; } u;
+                u.in = tn.c_str() + 1;
+                e.key = u.out;
+                ep = hsearch(e, FIND);
+                if (!ep) ABORT("hsearch miss `%s'", e.key);
+                // ep->data is a NUL-terminated string matching /[0-9]+/
+                fputc(' ', root_offs->fp);
+                fputs((const char *)ep->data, root_offs->fp);
+        }
+        fputc('\n', root_offs->fp);
+        if (ferror(root_offs->fp) | fclose(root_offs->fp))
+                err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM
+        root_offs->fp = NULL;
+        return true;
+}
+
+// writes term values matching @pfx for a given @doc, ending the line
+// with the contents of @root_offs
+static void dump_roots_term(struct req *req, const char *pfx,
+                                struct dump_roots_tmp *drt,
+                                struct fbuf *root_offs,
+                                Xapian::Document *doc)
+{
+        Xapian::TermIterator cur = doc->termlist_begin();
+        Xapian::TermIterator end = doc->termlist_end();
+        size_t pfx_len = strlen(pfx);
+
+        for (cur.skip_to(pfx); cur != end; cur++) {
+                std::string tn = *cur;
+                if (!starts_with(&tn, pfx, pfx_len))
+                        continue;
+                fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
+                fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
+                ++req->nr_out;
+        }
+}
+
+// we may have lines which exceed PIPE_BUF, so we do our own
+// buffering and rely on flock(2), here
+static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt)
+{
+        char *p;
+        int fd = fileno(req->fp[0]);
+        bool ok = true;
+
+        if (!drt->wbuf.fp) return true;
+        if (fd < 0) EABORT("BUG: fileno");
+        if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM?
+                err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)");
+        drt->wbuf.fp = NULL;
+        if (!drt->wbuf.len) goto done_free;
+        while (flock(drt->root2off_fd, LOCK_EX)) {
+                if (errno == EINTR) continue;
+                err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK?
+        }
+        p = drt->wbuf.ptr;
+        do { // write to client FD
+                ssize_t n = write(fd, p, drt->wbuf.len);
+                if (n > 0) {
+                        drt->wbuf.len -= n;
+                        p += n;
+                } else {
+                        perror(n ? "write" : "write (zero bytes)");
+                        return false;
+                }
+        } while (drt->wbuf.len);
+        while (flock(drt->root2off_fd, LOCK_UN)) {
+                if (errno == EINTR) continue;
+                err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK?
+        }
+done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure
+        free(drt->wbuf.ptr);
+        drt->wbuf.ptr = NULL;
+        return ok;
+}
+
+static enum exc_iter dump_roots_iter(struct req *req,
+                                struct dump_roots_tmp *drt,
+                                Xapian::MSetIterator *i)
+{
+        CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n"
+        try {
+                Xapian::Document doc = i->get_document();
+                if (!root2offs_str(&root_offs, &doc))
+                        return ITER_ABORT; // bad request, abort
+                for (int p = 0; p < req->pfxc; p++)
+                        dump_roots_term(req, req->pfxv[p], drt,
+                                        &root_offs, &doc);
+        } catch (const Xapian::DatabaseModifiedError & e) {
+                req->srch->db->reopen();
+                return ITER_RETRY;
+        } catch (const Xapian::DocNotFoundError & e) { // oh well...
+                warnx("doc not found: %s", e.get_description().c_str());
+        }
+        return ITER_OK;
+}
+
+static bool cmd_dump_roots(struct req *req)
+{
+        CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {};
+        drt.root2off_fd = -1;
+        if ((optind + 1) >= req->argc)
+                ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR");
+        if (!req->pfxc)
+                ABORT("dump_roots requires -A PREFIX");
+        const char *root2off_file = req->argv[optind];
+        drt.root2off_fd = open(root2off_file, O_RDONLY);
+        if (drt.root2off_fd < 0)
+                EABORT("open(%s)", root2off_file);
+        if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM?
+                err(EXIT_FAILURE, "fstat(%s)", root2off_file);
+        // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0),
+        // so /32 overestimates the number of expected entries by
+        // ~%25 (as recommended by Linux hcreate(3) manpage)
+        size_t size = off2size(drt.sb.st_size);
+        size_t est = (size / 32) + 1; //+1 for "\0" termination
+        drt.mm_ptr = mmap(NULL, size, PROT_READ,
+                                MAP_PRIVATE, drt.root2off_fd, 0);
+        if (drt.mm_ptr == MAP_FAILED)
+                err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file);
+        size_t asize = est * 2;
+        if (asize < est) ABORT("too many entries: %zu", est);
+        drt.entries = (char **)calloc(asize, sizeof(char *));
+        if (!drt.entries)
+                err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *));
+        size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize);
+        if (tot <= 0) return false; // split2argv already warned on error
+        if (!hcreate(est))
+                err(EXIT_FAILURE, "hcreate(%zu)", est);
+        for (size_t i = 0; i < tot; ) {
+                ENTRY e;
+                e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM
+                e.data = drt.entries[i++];
+                if (!hsearch(e, ENTER))
+                        err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key,
+                                        (const char *)e.data);
+        }
+        req->asc = true;
+        req->sort_col = -1;
+        Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
+
+        // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
+        // in case we need to retry on DB reopens
+        for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
+                if (!drt.wbuf.fp)
+                        fbuf_init(&drt.wbuf);
+                for (int t = 10; t > 0; --t)
+                        switch (dump_roots_iter(req, &drt, &i)) {
+                        case ITER_OK: t = 0; break; // leave inner loop
+                        case ITER_RETRY: break; // continue for-loop
+                        case ITER_ABORT: return false; // error
+                        }
+                if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt))
+                        return false;
+        }
+        if (!dump_roots_flush(req, &drt))
+                return false;
+        emit_mset_stats(req, &mset);
+        return true;
+}