public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob 74232ebfb26636dcd405bde747dfa02ba8f9e561 5873 bytes (raw)
$ git show HEAD:script/public-inbox-index	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
 
#!perl -w
# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Basic tool to create a Xapian search index for a public-inbox.
# Usage with libeatmydata <https://www.flamingspork.com/projects/libeatmydata/>
# highly recommended: eatmydata public-inbox-index INBOX_DIR

use strict;
use v5.10.1;
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
# Abbreviated usage text: printed on stdout for --help, and reused as the
# error text when option parsing fails or when no inbox can be resolved.
# Only the most common switches are listed; GetOptions accepts more.
my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
usage: public-inbox-index [options] INBOX_DIR

  Create and update per-inbox search indices

options:

  --no-fsync          speed up indexing, risk corruption on power outage
  -L LEVEL            `basic', `medium', or `full' (default: full)
  -E EXTINDEX         update extindex (default: `all')
  --all               index all configured inboxes
  --compact | -c      run public-inbox-compact(1) after indexing
  --sequential-shard  index Xapian shards sequentially for slow storage
  --jobs=NUM          set or disable parallelization (NUM=0)
  --batch-size=BYTES  flush changes to OS after a given number of bytes
  --max-size=BYTES    do not index messages larger than the given size
  --reindex           index previously indexed data (if upgrading)
  --since=DATE        limit --reindex to changes after DATE
  --until=DATE        limit --reindex to changes before DATE
  --rethread          regenerate thread IDs (if upgrading, use sparingly)
  --prune             prune git storage on discontiguous history
  --verbose | -v      increase verbosity (may be repeated)

BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
See public-inbox-index(1) man page for full documentation.
EOF
# Defaults for the option hash; GetOptions stores every parsed switch
# under its primary name in this same hash.
my $opt = {
	quiet => -1,
	compact => 0,
	max_size => undef,
	fsync => 1,
	# ":s@" optional arg sets '' if no arg given
	'update-extindex' => [],
};
unless (GetOptions($opt, qw(
		verbose|v+
		reindex
		rethread
		compact|c+
		jobs|j=i
		prune
		fsync|sync!
		xapian_only|xapian-only
		dangerous
		indexlevel|index-level|L=s
		max_size|max-size=s
		batch_size|batch-size=s
		since|after=s
		until|before=s
		sequential-shard|seq-shard
		no-update-extindex
		update-extindex|E=s@
		fast-noop|F
		skip-docdata
		all
		C=s@
		help|h
	))) {
	# the usage text ends in a newline, so die adds no "at ... line N"
	die $help;
}

# --help is a successful run: usage goes to stdout
if ($opt->{help}) {
	print $help;
	exit(0);
}

# sanity checks on combinations Getopt::Long cannot express
if (defined($opt->{jobs}) && $opt->{jobs} < 0) {
	die "--jobs must be >= 0\n";
}
die "--xapian-only requires --reindex\n"
	if $opt->{xapian_only} && !$opt->{reindex};
if ($opt->{reindex}) {
	# fast-noop is only removed from $opt when --reindex is in effect
	warn "--fast-noop ignored with --reindex\n"
		if delete $opt->{'fast-noop'};
}

# The PublicInbox modules are loaded only after option parsing so that
# --help and usage errors stay fast.
require PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
PublicInbox::Admin::do_chdir(delete($opt->{'C'}));

# Config is loaded by Admin
my $cfg = PublicInbox::Config->new();
$opt->{'-use_cwd'} = 1;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
# NOTE(review): same call as the one made right after loading Admin;
# it looks redundant but require_or_die is not visible here -- confirm
PublicInbox::Admin::require_or_die('-index');
if (!@ibxs) {
	# nothing to index: usage on stderr and a failing exit status
	print STDERR $help;
	exit(1);
}
require PublicInbox::InboxWritable;

# @eidx collects one ExtSearchIdx object per distinct top-level directory;
# %eidx_seen maps directories that were already added to a defined value.
my (@eidx, %eidx_seen);
# NB: this is the very array stored in $opt, it is edited in place below
my $update_extindex = $opt->{'update-extindex'};
unless (@$update_extindex) {
	# no -E given: default to `all' when the config has an ALL object
	# whose topdir we may write to
	my $ALL = $cfg->ALL;
	# extindex and normal inboxes may have different owners
	push(@$update_extindex, 'all') if $ALL && -w $ALL->{topdir};
}
if ($opt->{'no-update-extindex'}) {
	@$update_extindex = ();
}
if (@$update_extindex) {
	# extindex support is only loaded when at least one is requested
	PublicInbox::Admin::require_or_die('-search');
	require PublicInbox::ExtSearchIdx;
}
foreach my $ei_name (@$update_extindex) {
	my $es = $cfg->lookup_ei($ei_name);
	my $topdir;
	# allow dirname or config section name; a configured name wins,
	# a plain directory is only considered when the lookup fails
	if ($es) {
		$topdir = $es->{topdir};
	} elsif (-d $ei_name) {
		$topdir = $ei_name;
	} else {
		die "extindex `$ei_name' not configured or found\n";
	}
	# every extindex gets its own shallow copy of the options
	my $o = { %$opt };
	my $level = $o->{indexlevel};
	# NOTE(review): presumably `basic' is not a valid level for an
	# extindex, hence it is not passed on -- confirm in ExtSearchIdx
	if (defined($level) && $level eq 'basic') {
		delete $o->{indexlevel};
	}
	# push returns the new element count (always defined), which also
	# serves as the "already seen" marker for this directory
	unless (defined $eidx_seen{$topdir}) {
		$eidx_seen{$topdir} = push(@eidx,
			PublicInbox::ExtSearchIdx->new($topdir, $o));
	}
}
# $mods is filled by scan_ibx_modules() for every inbox; its keys are
# later handed to require_or_die().
my $mods = {};
# pre-formatted "  INBOX_DIR\n" lines for the warning after the loop
my @eidx_unconfigured;
foreach my $ibx (@ibxs) {
	# $ibx aliases the @ibxs element, so this replaces the entry in
	# @ibxs itself and later loops see the InboxWritable object
	$ibx = PublicInbox::InboxWritable->new($ibx);
	# detect_indexlevel may also set $ibx->{-skip_docdata}
	my $detected = $ibx->detect_indexlevel;
	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
	# precedence: value already on the inbox, then -L, then `full' for
	# --xapian-only, then whatever was detected on disk
	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
			'full' : $detected);
	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
	if (@eidx && $ibx->{-unconfigured}) {
		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
	}
}
if (@eidx_unconfigured) {
	# Join explicitly instead of interpolating the arrays: interpolation
	# inserts $" (a space) between elements, which indented every inbox
	# after the first by one extra column.
	my $ei_names = join(' ', @$update_extindex);
	my $dirs = join('', @eidx_unconfigured);
	warn <<EOF;
The following inboxes are unconfigured and will not be updated in
${ei_names}:
$dirs
EOF
}

# Ignore --compact unless $mods has a `Xapian' entry (presumably added by
# scan_ibx_modules when an inbox uses Xapian -- confirm in Admin.pm).
$opt->{compact} = 0 if !$mods->{'Xapian'}; # (or old Search::Xapian)

# load every module recorded in $mods, dying if one is unavailable
PublicInbox::Admin::require_or_die(keys %$mods);
# NOTE(review): index_prepare presumably also fills $opt->{compact_opt},
# which the indexing loop reads -- confirm in Admin.pm
my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
# Merge the returned variables into the environment.  This must stay a
# file-scope statement with a modifier: inside an if-block the `local'
# would end (and %ENV be restored) at the closing brace.
local %ENV = (%ENV, %$env) if $env;
PublicInbox::Xapcmd::check_compact() if $opt->{compact};
# NOTE(review): presumably installs $opt->{-progress}, read further down
PublicInbox::Admin::progress_prepare($opt);
# Main pass: index each inbox, compact it when requested, then attach
# configured inboxes with a true index_inbox() result to every extindex.
foreach my $ibx (@ibxs) {
	# -c is a counter; from the second -c on, compact before indexing too
	PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt})
		if $opt->{compact} >= 2;

	# carry the relevant command-line switches over to the inbox object
	$ibx->{-no_fsync} = 1 unless $opt->{fsync};
	if ($opt->{dangerous}) {
		$ibx->{-dangerous} = 1;
	}
	# a value already set on the inbox wins over --skip-docdata
	unless (defined $ibx->{-skip_docdata}) {
		$ibx->{-skip_docdata} = $opt->{'skip-docdata'};
	}

	# A per-inbox indexSequentialShard setting overrides the command
	# line; it goes into a shallow copy so $opt itself stays untouched
	# for the remaining inboxes.
	my $ibx_opt = $opt;
	my $s = $ibx->{lc('indexSequentialShard')};
	if (defined $s) {
		my $v = $cfg->git_bool($s);
		die <<EOL unless defined $v;
publicInbox.$ibx->{name}.indexSequentialShard not boolean
EOL
		$ibx_opt = { %$opt, 'sequential-shard' => $v };
	}

	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
	last if $ibx_opt->{quit};

	my $copt = $opt->{compact_opt};
	if ($copt) {
		# jobs is forced to 0 for sequentially indexed shards;
		# `local' puts the old value back when this block ends
		local $copt->{jobs} = 0 if $ibx_opt->{'sequential-shard'};
		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
	}
	last if $ibx_opt->{quit};

	# unconfigured inboxes, and those where index_inbox returned a
	# false value, are not handed to any extindex
	if ($nidx && !$ibx->{-unconfigured}) {
		foreach my $ei (@eidx) {
			$ei->attach_inbox($ibx);
		}
	}
}
# Finally sync every requested extindex.  $pr is the optional progress
# callback stored in $opt; it is called with one preformatted line.
my $pr = $opt->{-progress};
foreach my $eidx (@eidx) {
	if ($pr) {
		$pr->("indexing $eidx->{topdir} ...\n");
	}
	$eidx->eidx_sync($opt);
	# stop with the remaining extindices once a quit was flagged
	if ($opt->{quit}) {
		last;
	}
}

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git