user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
ca190a2e32d03fed0bb4200903a771cedb645bd7 blob 5715 bytes (raw)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
 
#!perl -w
# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Basic tool to create a Xapian search index for a public-inbox.
# Usage with libeatmydata <https://www.flamingspork.com/projects/libeatmydata/>
# highly recommended: eatmydata public-inbox-index INBOX_DIR

use strict;
use v5.10.1;
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
# Usage text: printed to stdout for --help, and used as the error text
# when option parsing fails or no inbox can be resolved.  It is meant to
# fit an 80x24 terminal without scrolling, so less common switches are
# left to the man page.
my $help = <<EOF;
usage: public-inbox-index [options] INBOX_DIR

  Create and update per-inbox search indices

options:

  --no-fsync          speed up indexing, risk corruption on power outage
  -L LEVEL            `basic', `medium', or `full' (default: full)
  -E EXTINDEX         update extindex (default: `all')
  --all               index all configured inboxes
  --compact | -c      run public-inbox-compact(1) after indexing
  --sequential-shard  index Xapian shards sequentially for slow storage
  --jobs=NUM          set or disable parallelization (NUM=0)
  --batch-size=BYTES  flush changes to OS after a given number of bytes
  --max-size=BYTES    do not index messages larger than the given size
  --reindex           index previously indexed data (if upgrading)
  --rethread          regenerate thread IDs (if upgrading, use sparingly)
  --prune             prune git storage on discontiguous history
  --verbose | -v      increase verbosity (may be repeated)

BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
See public-inbox-index(1) man page for full documentation.
EOF

# Defaults; GetOptions() below stores every parsed switch in this hash.
my $opt = {
	quiet => -1,
	compact => 0,
	max_size => undef,
	fsync => 1,
	# -E/--update-extindex is declared `=s@': values collect here
	'update-extindex' => [],
};

# Getopt::Long specifications, in the order they were originally listed.
my @getopt_spec = (
	'verbose|v+', 'reindex', 'rethread', 'compact|c+', 'jobs|j=i',
	'prune',
	'fsync|sync!', 'xapian_only|xapian-only',
	'indexlevel|index-level|L=s', 'max_size|max-size=s',
	'batch_size|batch-size=s',
	'sequential-shard|seq-shard',
	'no-update-extindex', 'update-extindex|E=s@',
	'fast-noop|F', 'skip-docdata', 'all', 'C=s@', 'help|h',
);
unless (GetOptions($opt, @getopt_spec)) {
	die $help;
}
if ($opt->{help}) {
	print $help;
	exit 0;
}

# sanity checks on combinations of switches
if (defined($opt->{jobs}) && $opt->{jobs} < 0) {
	die "--jobs must be >= 0\n";
}
die "--xapian-only requires --reindex\n"
	if $opt->{xapian_only} && !$opt->{reindex};
# --fast-noop is dropped from $opt (with a warning) when reindexing
warn "--fast-noop ignored with --reindex\n"
	if $opt->{reindex} && delete($opt->{'fast-noop'});

# require lazily to speed up --help
require PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
# the collected -C values are removed from $opt and handed to do_chdir
PublicInbox::Admin::do_chdir(delete $opt->{C});

my $cfg = PublicInbox::Config->new; # Config is loaded by Admin
# NOTE(review): presumably allows resolve_inboxes to fall back to the
# current directory when no INBOX_DIR is given -- confirm in Admin.pm
$opt->{-use_cwd} = 1;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
# The `-index' prerequisites were already checked right after loading
# PublicInbox::Admin above, so that call is not repeated here.
unless (@ibxs) { print STDERR $help; exit 1 } # nothing to index

# Work out which external indices (extindex) are updated along with the
# inboxes and open each of them once.  @eidx holds the opened objects,
# %eidx_seen remembers the topdirs that were already opened.
my (@eidx, %eidx_seen);
my $update_extindex = $opt->{'update-extindex'};
unless (@$update_extindex) {
	# no -E given: default to `all' when $cfg->ALL yields an extindex
	my $all = $cfg->ALL;
	# extindex and normal inboxes may have different owners
	push(@$update_extindex, 'all') if $all && -w $all->{topdir};
}
# --no-update-extindex overrides both -E and the implicit default
if ($opt->{'no-update-extindex'}) {
	@$update_extindex = ();
}
if (@$update_extindex) {
	PublicInbox::Admin::require_or_die('-search');
	require PublicInbox::ExtSearchIdx;
}
for my $ei_name (@$update_extindex) {
	my $es = $cfg->lookup_ei($ei_name);
	my $topdir;
	if ($es) { # known to the config
		$topdir = $es->{topdir};
	} elsif (-d $ei_name) { # otherwise a directory name is accepted
		$topdir = $ei_name;
	} else {
		die "extindex `$ei_name' not configured or found\n";
	}
	# every extindex gets its own shallow copy of the options; an
	# explicit `basic' indexlevel is not passed on to it
	my %ei_opt = %$opt;
	delete $ei_opt{indexlevel} if ($ei_opt{indexlevel} // '') eq 'basic';
	# push() returns the new (non-zero) size of @eidx, so a topdir
	# that was already seen is never opened a second time
	$eidx_seen{$topdir} //=
		push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, \%ei_opt));
}
# Settle the index level of every inbox and collect, in $mods, the
# modules those inboxes need.  Inboxes flagged {-unconfigured} are
# remembered so the user can be told they will not be part of any
# extindex update.
my $mods = {};
my @eidx_unconfigured;
foreach my $ibx (@ibxs) {
	# detect_indexlevel may also set $ibx->{-skip_docdata}
	my $detected = PublicInbox::Admin::detect_indexlevel($ibx);
	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
	# precedence: value already on the inbox, then -L/--index-level,
	# then `full' for --xapian-only, else the detected level
	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
			'full' : $detected);
	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
	if (@eidx && $ibx->{-unconfigured}) {
		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
	}
}
if (@eidx_unconfigured) {
	# Join explicitly: interpolating the array itself would put the
	# list separator ($", a space) between the newline-terminated
	# entries and indent every entry after the first by one extra
	# column.
	my $dirs = join('', @eidx_unconfigured);
	warn <<EOF;
The following inboxes are unconfigured and will not be updated in
@$update_extindex:
$dirs
EOF
}

# "Search::Xapian" includes SWIG "Xapian", too:
$opt->{compact} = 0 if !$mods->{'Search::Xapian'}; # never compact w/o it

# Final preparation before any inbox is touched.  The order matters:
# the modules collected in $mods are loaded first, then index_prepare
# gets a chance to adjust $opt and to hand back environment overrides.
PublicInbox::Admin::require_or_die(keys %$mods);
# NOTE(review): $opt->{compact_opt} is read further down but never set
# in this script; presumably index_prepare fills it in -- confirm.
my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
# File-scope `local': the overlay (entries of %$env win over existing
# ones) stays in effect for the remainder of the script.  Do not wrap
# this in a block, that would undo it at the closing brace.
local %ENV = (%ENV, %$env) if $env;
require PublicInbox::InboxWritable;
PublicInbox::Xapcmd::check_compact() if $opt->{compact};
# NOTE(review): presumably installs $opt->{-progress}, which the
# extindex loop at the end of the script reads -- confirm.
PublicInbox::Admin::progress_prepare($opt);
# Main loop: index (and optionally compact) every inbox, then attach
# each configured inbox for which index_inbox returned a true value to
# the extindices in @eidx.  The loop ends early once a {quit} flag shows
# up on the options hash that was used for the current inbox.
for my $ibx (@ibxs) {
	$ibx = PublicInbox::InboxWritable->new($ibx);
	# a compact count of two or more (e.g. `-cc') also compacts
	# before indexing
	if ($opt->{compact} >= 2) {
		PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt});
	}
	$ibx->{-no_fsync} = 1 if !$opt->{fsync};
	$ibx->{-skip_docdata} //= $opt->{'skip-docdata'};

	# A per-inbox indexSequentialShard setting overrides the
	# command-line switch; a shallow copy keeps $opt itself unchanged
	# for the remaining inboxes.
	my $ibx_opt = $opt;
	if (defined(my $s = $ibx->{lc('indexSequentialShard')})) {
		defined(my $v = $cfg->git_bool($s)) or die <<EOL;
publicInbox.$ibx->{name}.indexSequentialShard not boolean
EOL
		$ibx_opt = { %$opt, 'sequential-shard' => $v };
	}
	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
	# $ibx_opt may be a copy of $opt, so mirror a quit request into
	# $opt: the extindex loop after this one only looks at
	# $opt->{quit}.  Without an override both names refer to the same
	# hash and the assignment changes nothing.
	if ($ibx_opt->{quit}) {
		$opt->{quit} = $ibx_opt->{quit};
		last;
	}
	if (my $copt = $opt->{compact_opt}) {
		# sequential shards: no parallel jobs for this compaction
		local $copt->{jobs} = 0 if $ibx_opt->{'sequential-shard'};
		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
	}
	if ($ibx_opt->{quit}) {
		$opt->{quit} = $ibx_opt->{quit};
		last;
	}
	next if $ibx->{-unconfigured} || !$nidx;
	for my $eidx (@eidx) {
		$eidx->attach_inbox($ibx);
	}
}
# Last step: sync every extindex opened earlier, announcing each one
# through the progress callback when there is one.  A {quit} flag on
# $opt after a sync ends the loop.
my $pr = $opt->{-progress};
foreach my $ei (@eidx) {
	if ($pr) {
		$pr->("indexing $ei->{topdir} ...\n");
	}
	$ei->eidx_sync($opt);
	if ($opt->{quit}) {
		last;
	}
}
debug log:

solving ca190a2e32d0 ...
found ca190a2e32d0 in https://80x24.org/public-inbox.git

Code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).