diff options
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- | lib/PublicInbox/Admin.pm | 75 | ||||
-rw-r--r-- | lib/PublicInbox/AdminEdit.pm | 67 | ||||
-rw-r--r-- | lib/PublicInbox/Git.pm | 45 | ||||
-rw-r--r-- | lib/PublicInbox/Import.pm | 101 | ||||
-rw-r--r-- | lib/PublicInbox/Inbox.pm | 2 | ||||
-rw-r--r-- | lib/PublicInbox/NNTP.pm | 27 | ||||
-rw-r--r-- | lib/PublicInbox/Search.pm | 15 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 41 | ||||
-rw-r--r-- | lib/PublicInbox/SearchMsg.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 221 | ||||
-rw-r--r-- | lib/PublicInbox/WWW.pm | 17 | ||||
-rw-r--r-- | lib/PublicInbox/WwwListing.pm | 174 | ||||
-rw-r--r-- | lib/PublicInbox/Xapcmd.pm | 222 |
13 files changed, 757 insertions, 256 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 4a862c6d..8a2f2043 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -9,6 +9,8 @@ use warnings; use Cwd 'abs_path'; use base qw(Exporter); our @EXPORT_OK = qw(resolve_repo_dir); +my $CFG; # all the admin stuff is a singleton +require PublicInbox::Config; sub resolve_repo_dir { my ($cd, $ver) = @_; @@ -66,36 +68,65 @@ $ibx->{mainrepo} has unexpected indexlevel in Xapian: $m $l; } -sub resolve_inboxes { - my ($argv, $warn_on_unconfigured) = @_; - require PublicInbox::Config; +sub unconfigured_ibx ($$) { + my ($dir, $i) = @_; + my $name = "unconfigured-$i"; + PublicInbox::Inbox->new({ + name => $name, + address => [ "$name\@example.com" ], + mainrepo => $dir, + # TODO: consumers may want to warn on this: + #-unconfigured => 1, + }); +} + +sub config () { $CFG //= eval { PublicInbox::Config->new } } + +sub resolve_inboxes ($;$) { + my ($argv, $opt) = @_; require PublicInbox::Inbox; + $opt ||= {}; - my @ibxs = map { resolve_repo_dir($_) } @$argv; - push(@ibxs, resolve_repo_dir()) unless @ibxs; + my $cfg = config(); + if ($opt->{all}) { + my $cfgfile = PublicInbox::Config::default_file(); + $cfg or die "--all specified, but $cfgfile not readable\n"; + @$argv and die "--all specified, but directories specified\n"; + } + my $min_ver = $opt->{-min_inbox_version} || 0; + my (@old, @ibxs); my %dir2ibx; - if (my $config = eval { PublicInbox::Config->new }) { - $config->each_inbox(sub { + if ($cfg) { + $cfg->each_inbox(sub { my ($ibx) = @_; + $ibx->{version} ||= 1; $dir2ibx{abs_path($ibx->{mainrepo})} = $ibx; }); - } elsif ($warn_on_unconfigured) { - # do we really care about this? It's annoying... - warn $warn_on_unconfigured, "\n"; } - for my $i (0..$#ibxs) { - my $dir = $ibxs[$i]; - $ibxs[$i] = $dir2ibx{$dir} ||= do { - my $name = "unconfigured-$i"; - PublicInbox::Inbox->new({ - name => $name, - address => [ "$name\@example.com" ], - mainrepo => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, - }); - }; + if ($opt->{all}) { + my @all = values %dir2ibx; + @all = grep { $_->{version} >= $min_ver } @all; + push @ibxs, @all; + } else { # directories specified on the command-line + my $i = 0; + my @dirs = @$argv; + push @dirs, '.' unless @dirs; + foreach (@dirs) { + my $v; + my $dir = resolve_repo_dir($_, \$v); + if ($v < $min_ver) { + push @old, $dir; + next; + } + my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); + $i++; + push @ibxs, $ibx; + } + } + if (@old) { + die "inboxes $min_ver inboxes not supported by $0\n\t", + join("\n\t", @old), "\n"; } @ibxs; } diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm new file mode 100644 index 00000000..169feba0 --- /dev/null +++ b/lib/PublicInbox/AdminEdit.pm @@ -0,0 +1,67 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common stuff between -edit, -purge (and maybe -learn in the future) +package PublicInbox::AdminEdit; +use strict; +use warnings; +use PublicInbox::Admin; +our @OPT = qw(all force|f verbose|v!); + +sub check_editable ($) { + my ($ibxs) = @_; + + foreach my $ibx (@$ibxs) { + my $lvl = $ibx->{indexlevel}; + if (defined $lvl) { + PublicInbox::Admin::indexlevel_ok_or_die($lvl); + next; + } + + # Undefined indexlevel, so `full'... + # Search::Xapian exists and the DB can be read, at least, fine + $ibx->search and next; + + # it's possible for a Xapian directory to exist, + # but Search::Xapian to go missing/broken. + # Make sure it's purged in that case: + $ibx->over or die "no over.sqlite3 in $ibx->{mainrepo}\n"; + + # $ibx->{search} is populated by $ibx->over call + my $xdir_ro = $ibx->{search}->xdir(1); + my $npart = 0; + foreach my $part (<$xdir_ro/*>) { + if (-d $part && $part =~ m!/[0-9]+\z!) { + my $bytes = 0; + $bytes += -s $_ foreach glob("$part/*"); + $npart++ if $bytes; + } + } + if ($npart) { + PublicInbox::Admin::require_or_die('-search'); + } else { + # somebody could "rm -r" all the Xapian directories; + # let them purge the overview, at least + $ibx->{indexlevel} ||= 'basic'; + } + } +} + +# takes the output of V2Writable::purge and V2Writable::replace +# $rewrites = [ array commits keyed by epoch ] +sub show_rewrites ($$$) { + my ($fh, $ibx, $rewrites) = @_; + print $fh "$ibx->{mainrepo}:"; + if (scalar @$rewrites) { + my $epoch = -1; + my @out = map {; + ++$epoch; + "$epoch.git: ".(defined($_) ? $_ : '(unchanged)') + } @$rewrites; + print $fh join("\n\t", '', @out), "\n"; + } else { + print $fh " NONE\n"; + } +} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 68445b3c..f5c7a95c 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -145,41 +145,24 @@ again: fail($self, "Unexpected result from git cat-file: $head"); my $size = $1; - my $ref_type = $ref ? ref($ref) : ''; - my $rv; my $left = $size; - $$ref = $size if ($ref_type eq 'SCALAR'); - my $cb_err; - - if ($ref_type eq 'CODE') { - $rv = eval { $ref->($in, \$left) }; - $cb_err = $@; - # drain the rest - my $max = 8192; - while ($left > 0) { - my $r = read($in, my $x, $left > $max ? $max : $left); - defined($r) or fail($self, "read failed: $!"); - $r == 0 and fail($self, 'exited unexpectedly'); - $left -= $r; - } - } else { - my $offset = 0; - my $buf = ''; - while ($left > 0) { - my $r = read($in, $buf, $left, $offset); - defined($r) or fail($self, "read failed: $!"); - $r == 0 and fail($self, 'exited unexpectedly'); - $left -= $r; - $offset += $r; - } - $rv = \$buf; + $$ref = $size if $ref; + + my $offset = 0; + my $buf = ''; + while ($left > 0) { + my $r = read($in, $buf, $left, $offset); + defined($r) or fail($self, "read failed: $!"); + $r == 0 and fail($self, 'exited unexpectedly'); + $left -= $r; + $offset += $r; } + $rv = \$buf; - my $r = read($in, my $buf, 1); + my $r = read($in, my $lf, 1); defined($r) or fail($self, "read failed: $!"); - fail($self, 'newline missing after blob') if ($r != 1 || $buf ne "\n"); - die $cb_err if $cb_err; + fail($self, 'newline missing after blob') if ($r != 1 || $lf ne "\n"); $rv; } @@ -320,7 +303,7 @@ sub modified ($) { chomp $oid; my $buf = cat_file($self, $oid) or next; $$buf =~ /^committer .*?> ([0-9]+) [\+\-]?[0-9]+/sm or next; - my $cmt_time = $1; + my $cmt_time = $1 + 0; $modified = $cmt_time if $cmt_time > $modified; } $modified || time; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2c4bad92..137b2b78 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -277,7 +277,7 @@ sub git_timestamp { "$ts $zone"; } -sub extract_author_info ($) { +sub extract_cmt_info ($) { my ($mime) = @_; my $sender = ''; @@ -314,7 +314,17 @@ sub extract_author_info ($) { $name = ''; warn "no name in From: $from or Sender: $sender\n"; } - ($name, $email); + + my $hdr = $mime->header_obj; + + my $subject = $hdr->header('Subject'); + $subject = '(no subject)' unless defined $subject; + # Mime decoding can create nulls replace them with spaces to protect git + $subject =~ tr/\0/ /; + utf8::encode($subject); + my $at = git_timestamp(my @at = msg_datestamp($hdr)); + my $ct = git_timestamp(my @ct = msg_timestamp($hdr)); + ($name, $email, $at, $ct, $subject); } # kill potentially confusing/misleading headers @@ -361,19 +371,7 @@ sub clean_tree_v2 ($$$) { sub add { my ($self, $mime, $check_cb) = @_; # mime = Email::MIME - my ($name, $email) = extract_author_info($mime); - my $hdr = $mime->header_obj; - my @at = msg_datestamp($hdr); - my @ct = msg_timestamp($hdr); - my $author_time_raw = git_timestamp(@at); - my $commit_time_raw = git_timestamp(@ct); - - my $subject = $mime->header('Subject'); - $subject = '(no subject)' unless defined $subject; - # Mime decoding can create nulls replace them with spaces to protect git - $subject =~ tr/\0/ /; - utf8::encode($subject); - + my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); my $path_type = $self->{path_type}; my $path; if ($path_type eq '2/38') { @@ -416,8 +414,8 @@ sub add { } print $w "commit $ref\nmark :$commit\n", - "author $name <$email> $author_time_raw\n", - "committer $self->{ident} $commit_time_raw\n" or wfail; + "author $name <$email> $at\n", + "committer $self->{ident} $ct\n" or wfail; print $w "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { @@ -486,33 +484,45 @@ sub digest2mid ($$) { "$dt.$b64" . '@z'; } -sub clean_purge_buffer { - my ($oids, $buf) = @_; - my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; +sub rewrite_commit ($$$$) { + my ($self, $oids, $buf, $mime) = @_; + my ($name, $email, $at, $ct, $subject); + if ($mime) { + ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); + } else { + $name = $email = ''; + $subject = 'purged '.join(' ', @$oids); + } @$oids = (); - + $subject .= "\n"; foreach my $i (0..$#$buf) { my $l = $buf->[$i]; if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) { - $buf->[$i] = "author <> $1\n"; + $at //= $1; + $buf->[$i] = "author $name <$email> $at\n"; + } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) { + $ct //= $1; + $buf->[$i] = "committer $self->{ident} $ct\n"; } elsif ($l =~ /^data ([0-9]+)/) { - $buf->[$i++] = "data " . length($cmt_msg) . "\n"; - $buf->[$i] = $cmt_msg; + $buf->[$i++] = "data " . length($subject) . "\n"; + $buf->[$i] = $subject; last; } } } -sub purge_oids { - my ($self, $purge) = @_; - my $tmp = "refs/heads/purge-".((keys %$purge)[0]); +# returns the new commit OID if a replacement was done +# returns undef if nothing was done +sub replace_oids { + my ($self, $mime, $replace_map) = @_; # oid => raw string + my $tmp = "refs/heads/replace-".((keys %$replace_map)[0]); my $old = $self->{'ref'}; my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); my $rd = $git->popen(@export); my ($r, $w) = $self->gfi_start; my @buf; - my $npurge = 0; + my $nreplace = 0; my @oids; my ($done, $mark); my $tree = $self->{-tree}; @@ -535,10 +545,13 @@ sub purge_oids { } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; - if ($purge->{$oid}) { + my $sref = $replace_map->{$oid}; + if (defined $sref) { push @oids, $oid; - my $cmd = "M 100644 inline $path\ndata 0\n\n"; - push @buf, $cmd; + my $n = length($$sref); + push @buf, "M 100644 inline $path\ndata $n\n"; + push @buf, $$sref; # hope CoW works... + push @buf, "\n"; } else { push @buf, $_; } @@ -547,11 +560,13 @@ sub purge_oids { push @buf, $_ if $tree->{$path}; } elsif ($_ eq "\n") { if (@oids) { - my $out = join('', @buf); - $out =~ s/^/# /sgm; - warn "purge rewriting\n", $out, "\n"; - clean_purge_buffer(\@oids, \@buf); - $npurge++; + if (!$mime) { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + } + rewrite_commit($self, \@oids, \@buf, $mime); + $nreplace++; } $w->print(@buf, "\n") or wfail; @buf = (); @@ -569,28 +584,30 @@ sub purge_oids { $w->print(@buf) or wfail; } die 'done\n not seen from fast-export' unless $done; - chomp(my $cmt = $self->get_mark(":$mark")) if $npurge; + chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; $self->{nchg} = 0; # prevent _update_git_info until update-ref: $self->done; my @git = ('git', "--git-dir=$git->{git_dir}"); - run_die([@git, qw(update-ref), $old, $tmp]) if $npurge; + run_die([@git, qw(update-ref), $old, $tmp]) if $nreplace; run_die([@git, qw(update-ref -d), $tmp]); - return if $npurge == 0; + return if $nreplace == 0; run_die([@git, qw(-c gc.reflogExpire=now gc --prune=all)]); + + # check that old OIDs are gone my $err = 0; - foreach my $oid (keys %$purge) { + foreach my $oid (keys %$replace_map) { my @info = $git->check($oid); if (@info) { - warn "$oid not purged\n"; + warn "$oid not replaced\n"; $err++; } } _update_git_info($self, 0); - die "Failed to purge $err object(s)\n" if $err; + die "Failed to replace $err object(s)\n" if $err; $cmt; } diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index c9330332..10f716ca 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -191,7 +191,7 @@ sub search ($;$) { my $srch = $self->{search} ||= eval { _cleanup_later($self); require PublicInbox::Search; - PublicInbox::Search->new($self, $self->{altid}); + PublicInbox::Search->new($self); }; ($over_only || eval { $srch->xdb }) ? $srch : undef; } diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 85778c44..fa412f8c 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -435,6 +435,26 @@ sub xref ($$$$) { sub set_nntp_headers ($$$$$) { my ($self, $hdr, $ng, $n, $mid) = @_; + # why? leafnode requires a Path: header for some inexplicable + # reason. We'll fake the shortest one possible. + $hdr->header_set('Path', 'y'); + + # leafnode (and maybe other NNTP clients) have trouble dealing + # with v2 messages which have multiple Message-IDs (either due + # to our own content-based dedupe or buggy git-send-email versions). + my @mids = $hdr->header('Message-ID'); + if (scalar(@mids) > 1) { + my $mid0 = "<$mid>"; + $hdr->header_set('Message-ID', $mid0); + my @alt = $hdr->header('X-Alt-Message-ID'); + my %seen = map { $_ => 1 } (@alt, $mid0); + foreach my $m (@mids) { + next if $seen{$m}++; + push @alt, $m; + } + $hdr->header_set('X-Alt-Message-ID', @alt); + } + # clobber some my $xref = xref($self, $ng, $n, $mid); $hdr->header_set('Xref', $xref); @@ -516,6 +536,13 @@ sub _header ($) { my $hdr = $_[0]->header_obj->as_string; utf8::encode($hdr); $hdr =~ s/(?<!\r)\n/\r\n/sg; + + # for leafnode compatibility, we need to ensure Message-ID headers + # are only a single line. We can't subclass Email::Simple::Header + # and override _default_fold_at in here, either; since that won't + # affect messages already in the archive. + $hdr =~ s/^(Message-ID:)[ \t]*\r\n[ \t]+([^\r]+)\r\n/$1 $2\r\n/igsm; + $hdr } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 9903f427..098c97cd 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -170,17 +170,12 @@ sub xdb ($) { } sub new { - my ($class, $mainrepo, $altid) = @_; - my $version = 1; - my $ibx = $mainrepo; - if (ref $ibx) { - $version = $ibx->{version} || 1; - $mainrepo = $ibx->{mainrepo}; - } + my ($class, $ibx) = @_; + ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; my $self = bless { - mainrepo => $mainrepo, - altid => $altid, - version => $version, + mainrepo => $ibx->{mainrepo}, + altid => $ibx->{altid}, + version => $ibx->{version} // 1, }, $class; my $dir = xdir($self, 1); $self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3"); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 99856286..a088ce75 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -30,31 +30,22 @@ my $xapianlevels = qr/\A(?:full|medium)\z/; sub new { my ($class, $ibx, $creat, $part) = @_; + ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; my $levels = qr/\A(?:full|medium|basic)\z/; - my $mainrepo = $ibx; # for "public-inbox-index" w/o entry in config - my $git_dir = $mainrepo; - my ($altid, $git); - my $version = 1; + my $mainrepo = $ibx->{mainrepo}; + my $version = $ibx->{version} || 1; my $indexlevel = 'full'; - if (ref $ibx) { - $mainrepo = $ibx->{mainrepo}; - $altid = $ibx->{altid}; - $version = $ibx->{version} || 1; - if ($altid) { - require PublicInbox::AltId; - $altid = [ map { - PublicInbox::AltId->new($ibx, $_); - } @$altid ]; - } - if ($ibx->{indexlevel}) { - if ($ibx->{indexlevel} =~ $levels) { - $indexlevel = $ibx->{indexlevel}; - } else { - die("Invalid indexlevel $ibx->{indexlevel}\n"); - } + my $altid = $ibx->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; + } + if ($ibx->{indexlevel}) { + if ($ibx->{indexlevel} =~ $levels) { + $indexlevel = $ibx->{indexlevel}; + } else { + die("Invalid indexlevel $ibx->{indexlevel}\n"); } - } else { # FIXME: old tests: old tests - $ibx = { mainrepo => $git_dir, version => 1 }; } $ibx = PublicInbox::InboxWritable->new($ibx); my $self = bless { @@ -117,7 +108,11 @@ sub _xdb_acquire { } } return unless defined $flag; - $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); + my $xdb = eval { Search::Xapian::WritableDatabase->new($dir, $flag) }; + if ($@) { + die "Failed opening $dir: ", $@; + } + $self->{xdb} = $xdb; } sub add_val ($$$) { diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 5f3c8af8..96a26b15 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -25,12 +25,6 @@ sub wrap { bless { mid => $mid }, $class; } -sub get { - my ($class, $head, $db, $mid) = @_; - my $doc_id = $head->get_docid; - load_expand(wrap($class, $mid), $db->get_document($doc_id)); -} - sub get_val ($$) { my ($doc, $col) = @_; Search::Xapian::sortable_unserialise($doc->get_value($col)); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index a8c33ef4..76e61e86 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -11,7 +11,7 @@ use PublicInbox::SearchIdxPart; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::ContentId qw(content_id content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; @@ -23,7 +23,14 @@ use IO::Handle; # an estimate of the post-packed size to the raw uncompressed size my $PACKING_FACTOR = 0.4; -# assume 2 cores if GNU nproc(1) is not available +# SATA storage lags behind what CPUs are capable of, so relying on +# nproc(1) can be misleading and having extra Xapian partions is a +# waste of FDs and space. It can also lead to excessive IO latency +# and slow things down. Users on NVME or other fast storage can +# use the NPROC env or switches in our script/public-inbox-* programs +# to increase Xapian partitions. +our $NPROC_MAX_DEFAULT = 4; + sub nproc_parts ($) { my ($creat_opt) = @_; if (ref($creat_opt) eq 'HASH') { @@ -32,7 +39,14 @@ sub nproc_parts ($) { } } - my $n = int($ENV{NPROC} || `nproc 2>/dev/null` || 2); + my $n = $ENV{NPROC}; + if (!$n) { + chomp($n = `nproc 2>/dev/null`); + # assume 2 cores if GNU nproc(1) is not available + $n = 2 if !$n; + $n = $NPROC_MAX_DEFAULT if $n > $NPROC_MAX_DEFAULT; + } + # subtract for the main process and git-fast-import $n -= 1; $n < 1 ? 1 : $n; @@ -116,6 +130,18 @@ sub add { }); } +# indexes a message, returns true if checkpointing is needed +sub do_idx ($$$$$$$) { + my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; + $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); + my $npart = $self->{partitions}; + my $part = $num % $npart; + my $idx = idx_part($self, $part); + $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); + my $n = $self->{transact_bytes} += $len; + $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $npart); +} + sub _add { my ($self, $mime, $check_cb) = @_; @@ -141,13 +167,7 @@ sub _add { $self->{last_commit}->[$self->{epoch_max}] = $cmt; my ($oid, $len, $msgref) = @{$im->{last_object}}; - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $nparts = $self->{partitions}; - my $part = $num % $nparts; - my $idx = $self->idx_part($part); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); - my $n = $self->{transact_bytes} += $len; - if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) { + if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) { $self->checkpoint; } @@ -291,26 +311,30 @@ sub idx_init { }); } -sub purge_oids ($$) { - my ($self, $purge) = @_; # $purge = { $object_id => 1, ... } +# returns an array mapping [ epoch => latest_commit ] +# latest_commit may be undef if nothing was done to that epoch +# $replace_map = { $object_id => $strref, ... } +sub _replace_oids ($$$) { + my ($self, $mime, $replace_map) = @_; $self->done; my $pfx = "$self->{-inbox}->{mainrepo}/git"; - my $purges = []; + my $rewrites = []; # epoch => commit my $max = $self->{epoch_max}; unless (defined($max)) { defined(my $latest = git_dir_latest($self, \$max)) or return; $self->{epoch_max} = $max; } + foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; -d $git_dir or next; my $git = PublicInbox::Git->new($git_dir); my $im = $self->import_init($git, 0, 1); - $purges->[$i] = $im->purge_oids($purge); + $rewrites->[$i] = $im->replace_oids($mime, $replace_map); $im->done; } - $purges; + $rewrites; } sub content_ids ($) { @@ -333,25 +357,31 @@ sub content_matches ($$) { 0 } -sub remove_internal ($$$$) { - my ($self, $mime, $cmt_msg, $purge) = @_; +# used for removing or replacing (purging) +sub rewrite_internal ($$;$$$) { + my ($self, $old_mime, $cmt_msg, $new_mime, $sref) = @_; $self->idx_init; - my $im = $self->importer unless $purge; + my ($im, $need_reindex, $replace_map); + if ($sref) { + $replace_map = {}; # oid => sref + $need_reindex = [] if $new_mime; + } else { + $im = $self->importer; + } my $over = $self->{over}; - my $cids = content_ids($mime); + my $cids = content_ids($old_mime); my $parts = $self->{idx_parts}; - my $mm = $self->{mm}; my $removed; - my $mids = mids($mime->header_obj); + my $mids = mids($old_mime->header_obj); # We avoid introducing new blobs into git since the raw content # can be slightly different, so we do not need the user-supplied # message now that we have the mids and content_id - $mime = undef; + $old_mime = undef; my $mark; foreach my $mid (@$mids) { - my %gone; + my %gone; # num => [ smsg, raw ] my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -374,17 +404,21 @@ sub remove_internal ($$$$) { } foreach my $num (keys %gone) { my ($smsg, $orig) = @{$gone{$num}}; - $mm->num_delete($num); # $removed should only be set once assuming # no bugs in our deduplication code: $removed = $smsg; my $oid = $smsg->{blob}; - if ($purge) { - $purge->{$oid} = 1; + if ($replace_map) { + $replace_map->{$oid} = $sref; } else { ($mark, undef) = $im->remove($orig, $cmt_msg); } $orig = undef; + if ($need_reindex) { # ->replace + push @$need_reindex, $smsg; + } else { # ->purge or ->remove + $self->{mm}->num_delete($num); + } unindex_oid_remote($self, $oid, $mid); } } @@ -393,8 +427,9 @@ sub remove_internal ($$$$) { my $cmt = $im->get_mark($mark); $self->{last_commit}->[$self->{epoch_max}] = $cmt; } - if ($purge && scalar keys %$purge) { - return purge_oids($self, $purge); + if ($replace_map && scalar keys %$replace_map) { + my $rewrites = _replace_oids($self, $new_mime, $replace_map); + return { rewrites => $rewrites, need_reindex => $need_reindex }; } $removed; } @@ -403,22 +438,125 @@ sub remove_internal ($$$$) { sub remove { my ($self, $mime, $cmt_msg) = @_; $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, $cmt_msg, undef); + rewrite_internal($self, $mime, $cmt_msg); }); } +sub _replace ($$;$$) { + my ($self, $old_mime, $new_mime, $sref) = @_; + my $rewritten = $self->{-inbox}->with_umask(sub { + rewrite_internal($self, $old_mime, undef, $new_mime, $sref); + }) or return; + + my $rewrites = $rewritten->{rewrites}; + # ->done is called if there are rewrites since we gc+prune from git + $self->idx_init if @$rewrites; + + for my $i (0..$#$rewrites) { + defined(my $cmt = $rewrites->[$i]) or next; + $self->{last_commit}->[$i] = $cmt; + } + $rewritten; +} + # public sub purge { my ($self, $mime) = @_; - my $purges = $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, undef, {}); - }) or return; - $self->idx_init if @$purges; # ->done is called on purges - for my $i (0..$#$purges) { - defined(my $cmt = $purges->[$i]) or next; - $self->{last_commit}->[$i] = $cmt; + my $rewritten = _replace($self, $mime, undef, \'') or return; + $rewritten->{rewrites} +} + +# returns the git object_id of $fh, does not write the object to FS +sub git_hash_raw ($$) { + my ($self, $raw) = @_; + # grab the expected OID we have to reindex: + open my $tmp_fh, '+>', undef or die "failed to open tmp: $!"; + $tmp_fh->autoflush(1); + print $tmp_fh $$raw or die "print \$tmp_fh: $!"; + sysseek($tmp_fh, 0, 0) or die "seek failed: $!"; + + my ($r, $w); + pipe($r, $w) or die "failed to create pipe: $!"; + my $rdr = { 0 => fileno($tmp_fh), 1 => fileno($w) }; + my $git_dir = $self->{-inbox}->git->{git_dir}; + my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; + my $pid = spawn($cmd, undef, $rdr); + close $w; + local $/ = "\n"; + chomp(my $oid = <$r>); + waitpid($pid, 0) == $pid or die "git hash-object did not finish"; + die "git hash-object failed: $?" if $?; + $oid =~ /\A[a-f0-9]{40}\z/ or die "OID not expected: $oid"; + $oid; +} + +sub _check_mids_match ($$$) { + my ($old_list, $new_list, $hdrs) = @_; + my %old_mids = map { $_ => 1 } @$old_list; + my %new_mids = map { $_ => 1 } @$new_list; + my @old = keys %old_mids; + my @new = keys %new_mids; + my $err = "$hdrs may not be changed when replacing\n"; + die $err if scalar(@old) != scalar(@new); + delete @new_mids{@old}; + delete @old_mids{@new}; + die $err if (scalar(keys %old_mids) || scalar(keys %new_mids)); +} + +# Changing Message-IDs or References with ->replace isn't supported. +# The rules for dealing with messages with multiple or conflicting +# Message-IDs are pretty complex and rethreading hasn't been fully +# implemented, yet. +sub check_mids_match ($$) { + my ($old_mime, $new_mime) = @_; + my $old = $old_mime->header_obj; + my $new = $new_mime->header_obj; + _check_mids_match(mids($old), mids($new), 'Message-ID(s)'); + _check_mids_match(references($old), references($new), + 'References/In-Reply-To'); +} + +# public +sub replace ($$$) { + my ($self, $old_mime, $new_mime) = @_; + + check_mids_match($old_mime, $new_mime); + + # mutt will always add Content-Length:, Status:, Lines: when editing + PublicInbox::Import::drop_unwanted_headers($new_mime); + + my $raw = $new_mime->as_string; + my $expect_oid = git_hash_raw($self, \$raw); + my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return; + my $need_reindex = $rewritten->{need_reindex}; + + # just in case we have bugs in deduplication code: + my $n = scalar(@$need_reindex); + if ($n > 1) { + my $list = join(', ', map { + "$_->{num}: <$_->{mid}>" + } @$need_reindex); + warn <<""; +W: rewritten $n messages matching content of original message (expected: 1). +W: possible bug in public-inbox, NNTP article IDs and Message-IDs follow: +W: $list + + } + + # make sure we really got the OID: + my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid); + $oid eq $expect_oid or die "BUG: $expect_oid not found after replace"; + + # don't leak FDs to Xapian: + $self->{-inbox}->git->cleanup; + + # reindex modified messages: + for my $smsg (@$need_reindex) { + my $num = $smsg->{num}; + my $mid0 = $smsg->{mid}; + do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0); } - $purges; + $rewritten->{rewrites}; } sub last_commit_part ($$;$) { @@ -772,15 +910,8 @@ sub reindex_oid ($$$$) { } $sync->{mm_tmp}->mid_delete($mid0) or die "failed to delete <$mid0> for article #$num\n"; - - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $nparts = $self->{partitions}; - my $part = $num % $nparts; - my $idx = $self->idx_part($part); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); - my $n = $self->{transact_bytes} += $len; $sync->{nr}++; - if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) { + if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) { $git->cleanup; $sync->{mm_tmp}->atfork_prepare; $self->done; # release lock diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 7ea98204..e4682636 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -74,7 +74,7 @@ sub call { my $method = $env->{REQUEST_METHOD}; if ($method eq 'POST') { - if ($path_info =~ m!$INBOX_RE/(?:([0-9]+)/)? + if ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? (git-upload-pack)\z!x) { my ($part, $path) = ($2, $3); return invalid_inbox($ctx, $1) || @@ -88,7 +88,7 @@ sub call { } # top-level indices and feeds - if ($path_info eq '/') { + if ($path_info eq '/' || $path_info eq '/manifest.js.gz') { www_listing($self)->call($env); } elsif ($path_info =~ m!$INBOX_RE\z!o) { invalid_inbox($ctx, $1) || r301($ctx, $1); @@ -98,7 +98,7 @@ sub call { invalid_inbox($ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { invalid_inbox($ctx, $1) || get_new($ctx); - } elsif ($path_info =~ m!$INBOX_RE/(?:([0-9]+)/)? + } elsif ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my ($part, $path) = ($2, $3); invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); @@ -126,6 +126,8 @@ sub call { get_text($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/([a-zA-Z0-9_\-\.]+)\.css\z!o) { get_css($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/manifest\.js\.gz\z!o) { + get_inbox_manifest($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/\z!o) { get_vcs_object($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/ @@ -490,6 +492,15 @@ sub www_listing { } } +# GET $INBOX/manifest.js.gz +sub get_inbox_manifest ($$$) { + my ($ctx, $inbox, $key) = @_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + require PublicInbox::WwwListing; + PublicInbox::WwwListing::js($ctx->{env}, [$ctx->{-inbox}]); +} + sub get_attach { my ($ctx, $idx, $fn) = @_; require PublicInbox::WwwAttach; diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index e1473b3d..e2724cc4 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -9,26 +9,33 @@ use warnings; use PublicInbox::Hval qw(ascii_html); use PublicInbox::Linkify; use PublicInbox::View; +use bytes (); +use HTTP::Date qw(time2str); +require Digest::SHA; +require File::Spec; +{ no warnings 'once'; *try_cat = *PublicInbox::Inbox::try_cat }; -sub list_all ($$) { - my ($self, undef) = @_; +sub list_all ($$$) { + my ($self, $env, $hide_key) = @_; my @list; $self->{pi_config}->each_inbox(sub { my ($ibx) = @_; - push @list, $ibx unless $ibx->{-hide}->{www}; + push @list, $ibx unless $ibx->{-hide}->{$hide_key}; }); \@list; } -sub list_match_domain ($$) { - my ($self, $env) = @_; +sub list_match_domain ($$$) { + my ($self, $env, $hide_key) = @_; my @list; my $host = $env->{HTTP_HOST} // $env->{SERVER_NAME}; $host =~ s/:[0-9]+\z//; my $re = qr!\A(?:https?:)?//\Q$host\E(?::[0-9]+)?/!i; $self->{pi_config}->each_inbox(sub { my ($ibx) = @_; - push @list, $ibx if !$ibx->{-hide}->{www} && $ibx->{url} =~ $re; + if (!$ibx->{-hide}->{$hide_key} && $ibx->{url} =~ $re) { + push @list, $ibx; + } }); \@list; } @@ -42,21 +49,27 @@ my %VALID = ( 404 => *list_404, ); +sub set_cb ($$$) { + my ($pi_config, $k, $default) = @_; + my $v = $pi_config->{lc $k} // $default; + $VALID{$v} || do { + warn <<""; +`$v' is not a valid value for `$k' +$k be one of `all', `match=domain', or `404' + + $VALID{$default}; + }; +} + sub new { my ($class, $www) = @_; - my $k = 'publicinbox.wwwListing'; my $pi_config = $www->{pi_config}; - my $v = $pi_config->{lc($k)} // 404; bless { pi_config => $pi_config, style => $www->style("\0"), - list_cb => $VALID{$v} || do { - warn <<""; -`$v' is not a valid value for `$k' -$k be one of `all', `match=domain', or `404' - - *list_404; - }, + www_cb => set_cb($pi_config, 'publicInbox.wwwListing', 404), + manifest_cb => set_cb($pi_config, 'publicInbox.grokManifest', + 'match=domain'), }, $class; } @@ -74,22 +87,20 @@ sub ibx_entry { $tmp; } -# not really a stand-alone PSGI app, but maybe it could be... -sub call { - my ($self, $env) = @_; - my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ]; - my $list = $self->{list_cb}->($self, $env); - my $code = 404; +sub html ($$) { + my ($env, $list) = @_; my $title = 'public-inbox'; my $out = ''; + my $code = 404; if (@$list) { + $title .= ' - listing'; + $code = 200; + # Swartzian transform since ->modified is expensive @$list = sort { $b->[0] <=> $a->[0] } map { [ $_->modified, $_ ] } @$list; - $code = 200; - $title .= ' - listing'; my $tmp = join("\n", map { ibx_entry(@$_, $env) } @$list); my $l = PublicInbox::Linkify->new; $l->linkify_1($tmp); @@ -98,7 +109,122 @@ sub call { $out = "<html><head><title>$title</title></head><body>" . $out; $out .= '<pre>'. PublicInbox::WwwStream::code_footer($env) . '</pre></body></html>'; - [ $code, $h, [ $out ] ] + + my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ]; + [ $code, $h, [ $out ] ]; +} + +my $json; +sub _json () { + for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { + eval "require $mod" or next; + # ->ascii encodes non-ASCII to "\uXXXX" + return $mod->new->ascii(1); + } + die; +} + +sub fingerprint ($) { + my ($git) = @_; + my $fh = $git->popen('show-ref') or + die "popen($git->{git_dir} show-ref) failed: $!"; + + my $dig = Digest::SHA->new(1); + while (read($fh, my $buf, 65536)) { + $dig->add($buf); + } + close $fh; + return if $?; # empty, uninitialized git repo + $dig->hexdigest; +} + +sub manifest_add ($$;$) { + my ($manifest, $ibx, $epoch) = @_; + my $url_path = "/$ibx->{name}"; + my $git_dir = $ibx->{mainrepo}; + if (defined $epoch) { + $git_dir .= "/git/$epoch.git"; + $url_path .= "/git/$epoch.git"; + } + return unless -d $git_dir; + my $git = PublicInbox::Git->new($git_dir); + my $fingerprint = fingerprint($git) or return; # no empty repos + + chomp(my $owner = $git->qx('config', 'gitweb.owner')); + chomp(my $desc = try_cat("$git_dir/description")); + $owner = undef if $owner eq ''; + $desc = 'Unnamed repository' if $desc eq ''; + + my $reference; + chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); + if ($alt) { + # n.b.: GitPython doesn't seem to handle comments or C-quoted + # strings like native git does; and we don't for now, either. + my @alt = split(/\n+/, $alt); + + # grokmirror only supports 1 alternate for "reference", + if (scalar(@alt) == 1) { + my $objdir = "$git_dir/objects"; + $reference = File::Spec->rel2abs($alt[0], $objdir); + $reference =~ s!/[^/]+/?\z!!; # basename + } + } + $manifest->{-abs2urlpath}->{$git_dir} = $url_path; + my $modified = $git->modified; + if ($modified > $manifest->{-mtime}) { + $manifest->{-mtime} = $modified; + } + $manifest->{$url_path} = { + owner => $owner, + reference => $reference, + description => $desc, + modified => $modified, + fingerprint => $fingerprint, + }; +} + +# manifest.js.gz +sub js ($$) { + my ($env, $list) = @_; + eval { require IO::Compress::Gzip } or return [ 404, [], [] ]; + + my $manifest = { -abs2urlpath => {}, -mtime => 0 }; + for my $ibx (@$list) { + if (defined(my $max = $ibx->max_git_part)) { + for my $epoch (0..$max) { + manifest_add($manifest, $ibx, $epoch); + } + } else { + manifest_add($manifest, $ibx); + } + } + my $abs2urlpath = delete $manifest->{-abs2urlpath}; + my $mtime = delete $manifest->{-mtime}; + while (my ($url_path, $repo) = each %$manifest) { + defined(my $abs = $repo->{reference}) or next; + $repo->{reference} = $abs2urlpath->{$abs}; + } + my $out; + IO::Compress::Gzip::gzip(\(($json ||= _json())->encode($manifest)) => + \$out); + $manifest = undef; + [ 200, [ qw(Content-Type application/gzip), + 'Last-Modified', time2str($mtime), + 'Content-Length', bytes::length($out) ], [ $out ] ]; +} + +# not really a stand-alone PSGI app, but maybe it could be... +sub call { + my ($self, $env) = @_; + + if ($env->{PATH_INFO} eq '/manifest.js.gz') { + # grokmirror uses relative paths, so it's domain-dependent + my $list = $self->{manifest_cb}->($self, $env, 'manifest'); + js($env, $list); + } else { # / + my $list = $self->{www_cb}->($self, $env, 'www'); + html($env, $list); + } } 1; diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index dad080c8..e1c6fe3a 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -17,34 +17,66 @@ our @COMPACT_OPT = qw(jobs|j=i quiet|q blocksize|b=s no-full|n fuller|F); sub commit_changes ($$$) { my ($ibx, $tmp, $opt) = @_; - + my $new_parts = $opt->{reshard}; my $reindex = $opt->{reindex}; my $im = $ibx->importer(0); $im->lock_acquire if !$opt->{-coarse_lock}; $SIG{INT} or die 'BUG: $SIG{INT} not handled'; + my @old_part; while (my ($old, $new) = each %$tmp) { - my @st = stat($old) or die "failed to stat($old): $!\n"; + my @st = stat($old); + if (!@st && !defined($opt->{reshard})) { + die "failed to stat($old): $!"; + } my $over = "$old/over.sqlite3"; if (-f $over) { # only for v1, v2 over is untouched + defined $new or die "BUG: $over exists when culling v2"; $over = PublicInbox::Over->new($over); my $tmp_over = "$new/over.sqlite3"; $over->connect->sqlite_backup_to_file($tmp_over); $over = undef; } - chmod($st[2] & 07777, $new) or die "chmod $old: $!\n"; + if (!defined($new)) { # culled partition + push @old_part, $old; + next; + } + + if (@st) { + chmod($st[2] & 07777, $new) or die "chmod $old: $!\n"; + rename($old, "$new/old") or + die "rename $old => $new/old: $!\n"; + } # Xtmpdir->DESTROY won't remove $new after this: - rename($old, "$new/old") or die "rename $old => $new/old: $!\n"; rename($new, $old) or die "rename $new => $old: $!\n"; - my $prev = "$old/old"; - remove_tree($prev) or die "failed to remove $prev: $!\n"; + if (@st) { + my $prev = "$old/old"; + remove_tree($prev) or + die "failed to remove $prev: $!\n"; + } } + remove_tree(@old_part); $tmp->done; if (!$opt->{-coarse_lock}) { $opt->{-skip_lock} = 1; + + if ($im->can('count_partitions')) { + my $pr = $opt->{-progress}; + my $n = $im->count_partitions; + if (defined $new_parts && $n != $new_parts) { + die +"BUG: counted $n partitions after repartioning to $new_parts"; + } + my $prev = $im->{partitions}; + if ($pr && $prev != $n) { + $pr->("partition count changed: $prev => $n\n"); + $im->{partitions} = $n; + } + } + PublicInbox::Admin::index_inbox($ibx, $opt); # implicit lock_release } else { @@ -139,31 +171,59 @@ sub run { my $tmp = PublicInbox::Xtmpdirs->new; my $v = $ibx->{version} ||= 1; my @q; + my $new_parts = $opt->{reshard}; + if (defined $new_parts && $new_parts <= 0) { + die "--reshard must be a positive number\n"; + } # we want temporary directories to be as deep as possible, # so v2 partitions can keep "xap$SCHEMA_VERSION" on a separate FS. if ($v == 1) { + if (defined $new_parts) { + warn +"--reshard=$new_parts ignored for v1 $ibx->{mainrepo}\n"; + } my $old_parent = dirname($old); same_fs_or_die($old_parent, $old); - $tmp->{$old} = tempdir('xapcmd-XXXXXXXX', DIR => $old_parent); - push @q, [ $old, $tmp->{$old} ]; + my $v = PublicInbox::Search::SCHEMA_VERSION(); + my $wip = tempdir("xapian$v-XXXXXXXX", DIR => $old_parent); + $tmp->{$old} = $wip; + push @q, [ $old, $wip ]; } else { opendir my $dh, $old or die "Failed to opendir $old: $!\n"; + my @old_parts; while (defined(my $dn = readdir($dh))) { if ($dn =~ /\A[0-9]+\z/) { - my $tmpl = "$dn-XXXXXXXX"; - my $dst = tempdir($tmpl, DIR => $old); - same_fs_or_die($old, $dst); - my $cur = "$old/$dn"; - push @q, [ $cur, $dst ]; - $tmp->{$cur} = $dst; + push @old_parts, $dn; } elsif ($dn eq '.' || $dn eq '..') { } elsif ($dn =~ /\Aover\.sqlite3/) { } else { warn "W: skipping unknown dir: $old/$dn\n" } } - die "No Xapian parts found in $old\n" unless @q; + die "No Xapian parts found in $old\n" unless @old_parts; + + my ($src, $max_part); + if (!defined($new_parts) || $new_parts == scalar(@old_parts)) { + # 1:1 copy + $max_part = scalar(@old_parts) - 1; + } else { + # M:N copy + $max_part = $new_parts - 1; + $src = [ map { "$old/$_" } @old_parts ]; + } + foreach my $dn (0..$max_part) { + my $tmpl = "$dn-XXXXXXXX"; + my $wip = tempdir($tmpl, DIR => $old); + same_fs_or_die($old, $wip); + my $cur = "$old/$dn"; + push @q, [ $src // $cur , $wip ]; + $tmp->{$cur} = $wip; + } + # mark old parts to be unlinked + if ($src) { + $tmp->{$_} ||= undef for @$src; + } } my $im = $ibx->importer(0); my $max = $opt->{jobs} || scalar(@q); @@ -197,10 +257,11 @@ sub cpdb_retryable ($$) { } sub progress_pfx ($) { - my @p = split('/', $_[0]); + my ($wip) = @_; # tempdir v2: ([0-9])+-XXXXXXXX + my @p = split('/', $wip); # return "xap15/0" for v2, or "xapian15" for v1: - ($p[-1] =~ /\A[0-9]+\z/) ? "$p[-2]/$p[-1]" : $p[-1]; + ($p[-1] =~ /\A([0-9]+)/) ? "$p[-2]/$1" : $p[-1]; } # xapian-compact wrapper @@ -243,12 +304,74 @@ sub compact ($$) { } } +sub cpdb_loop ($$$;$$) { + my ($src, $dst, $pr_data, $cur_part, $new_parts) = @_; + my ($pr, $fmt, $nr, $pfx); + if ($pr_data) { + $pr = $pr_data->{pr}; + $fmt = $pr_data->{fmt}; + $nr = \($pr_data->{nr}); + $pfx = $pr_data->{pfx}; + } + + my ($it, $end); + do { + eval { + $it = $src->postlist_begin(''); + $end = $src->postlist_end(''); + }; + } while (cpdb_retryable($src, $pfx)); + + do { + eval { + for (; $it != $end; $it++) { + my $docid = $it->get_docid; + if (defined $new_parts) { + my $dst_part = $docid % $new_parts; + next if $dst_part != $cur_part; + } + my $doc = $src->get_document($docid); + $dst->replace_document($docid, $doc); + if ($pr_data && !(++$$nr & 1023)) { + $pr->(sprintf($fmt, $$nr)); + } + } + + # unlike copydatabase(1), we don't copy spelling + # and synonym data (or other user metadata) since + # the Perl APIs don't expose iterators for them + # (and public-inbox does not use those features) + }; + } while (cpdb_retryable($src, $pfx)); +} + # Like copydatabase(1), this is horribly slow; and it doesn't seem due # to the overhead of Perl. sub cpdb ($$) { my ($args, $opt) = @_; my ($old, $new) = @$args; - my $src = Search::Xapian::Database->new($old); + my ($src, $cur_part); + my $new_parts; + if (ref($old) eq 'ARRAY') { + ($cur_part) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); + defined $cur_part or + die "BUG: could not extract partition # from $new"; + $new_parts = $opt->{reshard}; + defined $new_parts or die 'BUG: got array src w/o --partition'; + + # repartitioning, M:N copy means have full read access + foreach (@$old) { + if ($src) { + my $sub = Search::Xapian::Database->new($_); + $src->add_database($sub); + } else { + $src = Search::Xapian::Database->new($_); + } + } + } else { + $src = Search::Xapian::Database->new($old); + } + my ($xtmp, $tmp); if ($opt->{compact}) { my $newdir = dirname($new); @@ -264,10 +387,9 @@ sub cpdb ($$) { # of other bugs: my $creat = Search::Xapian::DB_CREATE(); my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat); - my ($it, $end); - my ($nr, $tot, $fmt); # progress output my $pr = $opt->{-progress}; - my $pfx = $opt->{-progress_pfx} = progress_pfx($old); + my $pfx = $opt->{-progress_pfx} = progress_pfx($new); + my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; do { eval { @@ -276,44 +398,45 @@ sub cpdb ($$) { $dst->set_metadata('last_commit', $lc) if $lc; # only the first xapian partition (0) gets 'indexlevel' - if ($old =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\z!) { + if ($new =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\b!) { my $l = $src->get_metadata('indexlevel'); if ($l eq 'medium') { $dst->set_metadata('indexlevel', $l); } } - - $it = $src->postlist_begin(''); - $end = $src->postlist_end(''); - if ($pr) { - $nr = 0; - $tot = $src->get_doccount; - $fmt = "$pfx % ".length($tot)."u/$tot\n"; - $pr->("$pfx copying $tot documents\n"); - } - }; - } while (cpdb_retryable($src, $pfx)); - - do { - eval { - while ($it != $end) { - my $docid = $it->get_docid; - my $doc = $src->get_document($docid); - $dst->replace_document($docid, $doc); - $it->inc; - if ($pr && !(++$nr & 1023)) { - $pr->(sprintf($fmt, $nr)); + if ($pr_data) { + my $tot = $src->get_doccount; + + # we can only estimate when repartitioning, + # because removed spam causes slight imbalance + my $est = ''; + if (defined $cur_part && $new_parts > 1) { + $tot = int($tot/$new_parts); + $est = 'around '; } + my $fmt = "$pfx % ".length($tot)."u/$tot\n"; + $pr->("$pfx copying $est$tot documents\n"); + $pr_data->{fmt} = $fmt; + $pr_data->{total} = $tot; } - - # unlike copydatabase(1), we don't copy spelling - # and synonym data (or other user metadata) since - # the Perl APIs don't expose iterators for them - # (and public-inbox does not use those features) }; } while (cpdb_retryable($src, $pfx)); - $pr->(sprintf($fmt, $nr)) if $pr; + if (defined $new_parts) { + # we rely on document IDs matching NNTP article number, + # so we can't have the combined DB support rewriting + # document IDs. Thus we iterate through each shard + # individually. + $src = undef; + foreach (@$old) { + my $old = Search::Xapian::Database->new($_); + cpdb_loop($old, $dst, $pr_data, $cur_part, $new_parts); + } + } else { + cpdb_loop($src, $dst, $pr_data); + } + + $pr->(sprintf($pr_data->{fmt}, $pr_data->{nr})) if $pr; return unless $xtmp; $src = $dst = undef; # flushes and closes @@ -358,6 +481,7 @@ sub DESTROY { my $owner_pid = delete $owner{"$self"} or return; return if $owner_pid != $$; foreach my $new (values %$self) { + defined $new or next; # may be undef if repartitioning remove_tree($new) unless -d "$new/old"; } done($self); |