From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 622271F5B7 for ; Sun, 21 Jun 2020 00:21:34 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] init: add --skip-artnum parameter Date: Sun, 21 Jun 2020 00:21:33 +0000 Message-Id: <20200621002133.9090-4-e@yhbt.net> In-Reply-To: <20200621002133.9090-1-e@yhbt.net> References: <20200621002133.9090-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: For archivists with only newer mail archives, this option allows reserving NNTP article numbers for yet-to-be-archived old messages. Indexers will need to be updated to support this feature in future commits. -V1 inboxes will now be initialized with SQLite and Xapian support if this option is used, or if --indexlevel= is specified. --- Documentation/public-inbox-init.pod | 14 ++++++++++++++ lib/PublicInbox/InboxWritable.pm | 13 ++++++++++++- lib/PublicInbox/Msgmap.pm | 26 ++++++++++++++++++++++++++ lib/PublicInbox/SearchIdx.pm | 1 + lib/PublicInbox/V2Writable.pm | 3 ++- script/public-inbox-init | 9 ++++----- t/init.t | 28 +++++++++++++++++++++++++++- 7 files changed, 86 insertions(+), 8 deletions(-) diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod index 495a258f..5714828d 100644 --- a/Documentation/public-inbox-init.pod +++ b/Documentation/public-inbox-init.pod @@ -39,6 +39,20 @@ See L for more information. Default: C +=item -N, --skip-artnum + +This option allows archivists to publish incomplete archives +with only new mail while allowing NNTP article numbers +to be reserved for yet-to-be-archived old mail. 
+ +This is mainly intended for users of C<--skip-epoch> (documented below) +but may be of use to L users. + +There is no automatic way to use reserved NNTP article numbers +when old mail is found, yet. + +Default: unset, no NNTP article numbers are skipped + =item -S, --skip-epoch For C<-V2> (L) inboxes only, this option diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index c54be046..f9e28502 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -39,10 +39,21 @@ sub assert_usable_dir { sub init_inbox { my ($self, $shards, $skip_epoch, $skip_artnum) = @_; - # TODO: honor skip_artnum if ($self->version == 1) { my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); + if (defined($self->{indexlevel}) || defined($skip_artnum)) { + require PublicInbox::SearchIdx; + my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create + $sidx->begin_txn_lazy; + $self->with_umask(sub { + my $mm = PublicInbox::Msgmap->new($dir, 1); + $mm->{dbh}->begin_work; + $mm->skip_artnum($skip_artnum); + $mm->{dbh}->commit; + }) if defined($skip_artnum); + $sidx->commit_txn_lazy; + } } else { my $v2w = importer($self); $v2w->init_inbox($shards, $skip_epoch, $skip_artnum); diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index d115cbce..aa07e344 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -270,4 +270,30 @@ sub atfork_prepare { %$self = (tmp_name => $f, pid => $$); } +sub skip_artnum { + my ($self, $skip_artnum) = @_; + return meta_accessor($self, 'skip_artnum') if !defined($skip_artnum); + + my $cur = num_highwater($self) // 0; + if ($skip_artnum < $cur) { + die "E: current article number $cur ", + "exceeds --skip-artnum=$skip_artnum\n"; + } else { + my $ok; + for (1..10) { + my $mid = 'skip'.rand.'@'.rand.'.example.com'; + $ok = mid_set($self, $skip_artnum, $mid); + if ($ok) { + mid_delete($self, $mid); + last; + } + } + $ok or die '--skip-artnum failed'; + + 
# in the future, the indexer may use this value for + # new messages in old epochs + meta_accessor($self, 'skip_artnum', $skip_artnum); + } +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 85821ea7..00e63938 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -694,6 +694,7 @@ sub _git_log { } else { # normal regen is for for fresh data $self->{regen_down} = $fcount; + $self->{regen_down} += $high unless $opts->{reindex}; } } else { # Give oldest messages the smallest numbers diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 91379431..a0f041dd 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -128,12 +128,13 @@ sub new { # public (for now?) sub init_inbox { - my ($self, $shards, $skip_epoch) = @_; + my ($self, $shards, $skip_epoch, $skip_artnum) = @_; if (defined $shards) { $self->{parallel} = 0 if $shards == 0; $self->{shards} = $shards if $shards > 0; } $self->idx_init; + $self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum; my $epoch_max = -1; git_dir_latest($self, \$epoch_max); if (defined $skip_epoch && $epoch_max == -1) { diff --git a/script/public-inbox-init b/script/public-inbox-init index e8dcf4fc..c7f3da6f 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -24,14 +24,12 @@ use File::Path qw/mkpath/; use Fcntl qw(:DEFAULT); use Cwd qw/abs_path/; -my $version = undef; -my $indexlevel = undef; -my $skip_epoch; -my $jobs; +my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs); my %opts = ( 'V|version=i' => \$version, 'L|indexlevel=s' => \$indexlevel, 'S|skip|skip-epoch=i' => \$skip_epoch, + 'N|skip-artnum=i' => \$skip_artnum, 'j|jobs=i' => \$jobs, ); GetOptions(%opts) or usage(); @@ -152,7 +150,8 @@ if (defined $jobs) { $creat_opt->{nproc} = $jobs; } -PublicInbox::InboxWritable->new($ibx, $creat_opt)->init_inbox(0, $skip_epoch); +$ibx = PublicInbox::InboxWritable->new($ibx, $creat_opt); 
+$ibx->init_inbox(0, $skip_epoch, $skip_artnum); # needed for git prior to v2.1.0 umask(0077) if defined $perm; diff --git a/t/init.t b/t/init.t index 94c6184e..e3e8a229 100644 --- a/t/init.t +++ b/t/init.t @@ -93,12 +93,38 @@ SKIP: { is_deeply($gits, ["$tmpdir/skip1/git/1.git"], 'skip OK'); } - $cmd = [ '-init', '-V2', '--skip-epoch=2', 'skip2', "$tmpdir/skip2", qw(http://example.com/skip2 skip2@example.com) ]; ok(run_script($cmd), "--skip-epoch 2"); my $gits = [ glob("$tmpdir/skip2/git/*.git") ]; is_deeply($gits, ["$tmpdir/skip2/git/2.git"], 'skipping 2 works, too'); + + xsys(qw(git config), "--file=$ENV{PI_DIR}/config", + 'publicinboxmda.spamcheck', 'none') == 0 or + BAIL_OUT "git config $?"; + my $addr = 'skip3@example.com'; + $cmd = [ qw(-init -V2 -Lbasic -N12 skip3), "$tmpdir/skip3", + qw(http://example.com/skip3), $addr ]; + ok(run_script($cmd), '--skip-artnum -V2'); + my $env = { ORIGINAL_RECIPIENT => $addr }; + my $mid = 'skip-artnum@example.com'; + my $msg = "Message-ID: <$mid>\n\n"; + my $rdr = { 0 => \$msg, 2 => \(my $err = '') }; + ok(run_script([qw(-mda --no-precheck)], $env, $rdr), 'deliver V1'); + my $mm = PublicInbox::Msgmap->new_file("$tmpdir/skip3/msgmap.sqlite3"); + my $n = $mm->num_for($mid); + is($n, 13, 'V2 NNTP article numbers skipped via --skip-artnum'); + + $addr = 'skip4@example.com'; + $env = { ORIGINAL_RECIPIENT => $addr }; + $cmd = [ qw(-init -V1 -N12 -Lmedium skip4), "$tmpdir/skip4", + qw(http://example.com/skip4), $addr ]; + ok(run_script($cmd), '--skip-artnum -V1'); + ok(run_script([qw(-mda --no-precheck)], $env, $rdr), 'deliver V1'); + $mm = PublicInbox::Msgmap->new("$tmpdir/skip4"); + system "find $tmpdir/skip4 >&2"; + $n = $mm->num_for($mid); + is($n, 13, 'V1 NNTP article numbers skipped via --skip-artnum'); } done_testing();