user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 08/20] use consistent {ibx} field for writable code paths
  2020-07-24  5:55  7% [PATCH 00/20] indexing changes and new features Eric Wong
@ 2020-07-24  5:55  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-07-24  5:55 UTC (permalink / raw)
  To: meta

This is a step which makes our use of abbreviations more
consistent when referring to PublicInbox::Inbox objects.
We'll also be reducing the number of redundant fields
in SearchIdx and V2Writable code paths to make the
object graph easier-to-follow.
---
 lib/PublicInbox/Import.pm         |  6 ++--
 lib/PublicInbox/SearchIdx.pm      | 31 ++++++++++----------
 lib/PublicInbox/SearchIdxShard.pm |  6 ++--
 lib/PublicInbox/V2Writable.pm     | 47 +++++++++++++++----------------
 4 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index d565b0a03..b50c662c7 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -35,7 +35,7 @@ sub new {
 		ident => "$name <$email>",
 		mark => 1,
 		ref => $ref,
-		-inbox => $ibx,
+		ibx => $ibx,
 		path_type => '2/38', # or 'v2'
 		lock_path => "$git->{git_dir}/ssoma.lock", # v2 changes this
 		bytes_added => 0,
@@ -176,7 +176,7 @@ sub _update_git_info ($$) {
 		run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env);
 	}
 	run_die([@cmd, 'update-server-info']);
-	my $ibx = $self->{-inbox};
+	my $ibx = $self->{ibx};
 	($ibx && $self->{path_type} eq '2/38') and eval {
 		require PublicInbox::SearchIdx;
 		my $s = PublicInbox::SearchIdx->new($ibx);
@@ -385,7 +385,7 @@ sub add {
 
 	# spam check:
 	if ($check_cb) {
-		$mime = $check_cb->($mime, $self->{-inbox}) or return;
+		$mime = $check_cb->($mime, $self->{ibx}) or return;
 	}
 
 	my $blob = $self->{mark}++;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index e641ffd43..4b1b1736e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -50,8 +50,7 @@ sub new {
 	$ibx = PublicInbox::InboxWritable->new($ibx);
 	my $self = bless {
 		inboxdir => $inboxdir,
-		-inbox => $ibx,
-		git => $ibx->git,
+		ibx => $ibx,
 		-altid => $altid,
 		ibx_ver => $version,
 		indexlevel => $indexlevel,
@@ -548,14 +547,14 @@ sub unindex_both { # git->cat_async callback
 sub index_sync {
 	my ($self, $opts) = @_;
 	delete $self->{lock_path} if $opts->{-skip_lock};
-	$self->{-inbox}->with_umask(\&_index_sync, $self, $opts);
+	$self->{ibx}->with_umask(\&_index_sync, $self, $opts);
 }
 
-sub too_big ($$$) {
-	my ($self, $git, $oid) = @_;
+sub too_big ($$) {
+	my ($self, $oid) = @_;
 	my $max_size = $self->{index_max_size} or return;
-	my (undef, undef, $size) = $git->check($oid);
-	die "E: bad $oid in $git->{git_dir}\n" if !defined($size);
+	my (undef, undef, $size) = $self->{ibx}->git->check($oid);
+	die "E: bad $oid in $self->{ibx}->{inboxdir}\n" if !defined($size);
 	return if $size <= $max_size;
 	warn "W: skipping $oid ($size > $max_size)\n";
 	1;
@@ -568,7 +567,7 @@ sub read_log {
 	my $h40 = $hex .'{40}';
 	my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
 	my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
-	my $git = $self->{git};
+	my $git = $self->{ibx}->git;
 	my $latest;
 	my $max = $BATCH_BYTES;
 	local $/ = "\n";
@@ -591,7 +590,7 @@ sub read_log {
 				}
 				next;
 			}
-			next if too_big($self, $git, $blob);
+			next if too_big($self, $blob);
 			$git->cat_async($blob, \&index_both, { %$sync });
 			if ($max <= 0) {
 				$git->cat_async_wait;
@@ -600,7 +599,7 @@ sub read_log {
 			}
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
-			$D{$blob} = 1 unless too_big($self, $git, $blob);
+			$D{$blob} = 1 unless too_big($self, $blob);
 		} elsif ($line =~ /^commit ($h40)/o) {
 			$latest = $1;
 			$newest ||= $latest;
@@ -621,7 +620,7 @@ sub read_log {
 
 sub _git_log {
 	my ($self, $opts, $range) = @_;
-	my $git = $self->{git};
+	my $git = $self->{ibx}->git;
 
 	if (index($range, '..') < 0) {
 		# don't show annoying git errors to users who run -index
@@ -681,7 +680,7 @@ sub is_ancestor ($$$) {
 
 sub need_update ($$$) {
 	my ($self, $cur, $new) = @_;
-	my $git = $self->{git};
+	my $git = $self->{ibx}->git;
 	return 1 if $cur && !is_ancestor($git, $cur, $new);
 	my $range = $cur eq '' ? $new : "$cur..$new";
 	chomp(my $n = $git->qx(qw(rev-list --count), $range));
@@ -701,7 +700,7 @@ sub _last_x_commit {
 		$lx = $lm;
 	}
 	# Use last_commit from msgmap if it is older or unset
-	if (!$lm || ($lx && $lm && is_ancestor($self->{git}, $lm, $lx))) {
+	if (!$lm || ($lx && $lm && is_ancestor($self->{ibx}->git, $lm, $lx))) {
 		$lx = $lm;
 	}
 	$lx;
@@ -718,7 +717,7 @@ sub _index_sync {
 	my ($self, $opts) = @_;
 	my $tip = $opts->{ref} || 'HEAD';
 	my ($last_commit, $lx, $xlog);
-	my $git = $self->{git};
+	my $git = $self->{ibx}->git;
 	$git->batch_prepare;
 	my $pr = $opts->{-progress};
 
@@ -830,7 +829,7 @@ sub _begin_txn {
 
 sub begin_txn_lazy {
 	my ($self) = @_;
-	$self->{-inbox}->with_umask(\&_begin_txn, $self) if !$self->{txn};
+	$self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn};
 }
 
 # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard)
@@ -860,7 +859,7 @@ sub _commit_txn {
 sub commit_txn_lazy {
 	my ($self) = @_;
 	delete($self->{txn}) and
-		$self->{-inbox}->with_umask(\&_commit_txn, $self);
+		$self->{ibx}->with_umask(\&_commit_txn, $self);
 }
 
 sub worker_done {
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 544268819..fd34e487b 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -11,14 +11,14 @@ use IO::Handle (); # autoflush
 use PublicInbox::Eml;
 
 sub new {
-	my ($class, $v2writable, $shard) = @_;
-	my $ibx = $v2writable->{-inbox};
+	my ($class, $v2w, $shard) = @_;
+	my $ibx = $v2w->{ibx};
 	my $self = $class->SUPER::new($ibx, 1, $shard);
 	# create the DB before forking:
 	$self->_xdb_acquire;
 	$self->set_indexlevel;
 	$self->_xdb_release;
-	$self->spawn_worker($v2writable, $shard) if $v2writable->{parallel};
+	$self->spawn_worker($v2w, $shard) if $v2w->{parallel};
 	$self;
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 2ff2fc259..a1986a469 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -109,7 +109,7 @@ sub new {
 
 	my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION;
 	my $self = {
-		-inbox => $v2ibx,
+		ibx => $v2ibx,
 		im => undef, #  PublicInbox::Import
 		parallel => 1,
 		transact_bytes => 0,
@@ -149,7 +149,7 @@ sub init_inbox {
 # mimics Import::add and wraps it for v2
 sub add {
 	my ($self, $eml, $check_cb) = @_;
-	$self->{-inbox}->with_umask(\&_add, $self, $eml, $check_cb);
+	$self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb);
 }
 
 # indexes a message, returns true if checkpointing is needed
@@ -169,7 +169,7 @@ sub _add {
 
 	# spam check:
 	if ($check_cb) {
-		$mime = $check_cb->($mime, $self->{-inbox}) or return;
+		$mime = $check_cb->($mime, $self->{ibx}) or return;
 	}
 
 	# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
@@ -218,7 +218,7 @@ sub v2_num_for {
 		# AltId may pre-populate article numbers (e.g. X-Mail-Count
 		# or NNTP article number), use that article number if it's
 		# not in Over.
-		my $altid = $self->{-inbox}->{altid};
+		my $altid = $self->{ibx}->{altid};
 		if ($altid && grep(/:file=msgmap\.sqlite3\z/, @$altid)) {
 			my $num = $self->{mm}->num_for($mid);
 
@@ -293,7 +293,7 @@ sub _idx_init { # with_umask callback
 	# Now that all subprocesses are up, we can open the FDs
 	# for SQLite:
 	my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
-		"$self->{-inbox}->{inboxdir}/msgmap.sqlite3", 1);
+		"$self->{ibx}->{inboxdir}/msgmap.sqlite3", 1);
 	$mm->{dbh}->begin_work;
 }
 
@@ -301,7 +301,7 @@ sub _idx_init { # with_umask callback
 sub idx_init {
 	my ($self, $opt) = @_;
 	return if $self->{idx_shards};
-	my $ibx = $self->{-inbox};
+	my $ibx = $self->{ibx};
 
 	# do not leak read-only FDs to child processes, we only have these
 	# FDs for duplicate detection so they should not be
@@ -329,7 +329,7 @@ sub idx_init {
 sub _replace_oids ($$$) {
 	my ($self, $mime, $replace_map) = @_;
 	$self->done;
-	my $pfx = "$self->{-inbox}->{inboxdir}/git";
+	my $pfx = "$self->{ibx}->{inboxdir}/git";
 	my $rewrites = []; # epoch => commit
 	my $max = $self->{epoch_max};
 
@@ -450,7 +450,7 @@ sub rewrite_internal ($$;$$$) {
 # (retval[2]) is not part of the stable API shared with Import->remove
 sub remove {
 	my ($self, $eml, $cmt_msg) = @_;
-	my $r = $self->{-inbox}->with_umask(\&rewrite_internal,
+	my $r = $self->{ibx}->with_umask(\&rewrite_internal,
 						$self, $eml, $cmt_msg);
 	defined($r) && defined($r->[0]) ? @$r: undef;
 }
@@ -458,7 +458,7 @@ sub remove {
 sub _replace ($$;$$) {
 	my ($self, $old_eml, $new_eml, $sref) = @_;
 	my $arg = [ $self, $old_eml, undef, $new_eml, $sref ];
-	my $rewritten = $self->{-inbox}->with_umask(\&rewrite_internal,
+	my $rewritten = $self->{ibx}->with_umask(\&rewrite_internal,
 			$self, $old_eml, undef, $new_eml, $sref) or return;
 
 	my $rewrites = $rewritten->{rewrites};
@@ -484,7 +484,7 @@ sub git_hash_raw ($$) {
 	my ($self, $raw) = @_;
 	# grab the expected OID we have to reindex:
 	pipe(my($in, $w)) or die "pipe: $!";
-	my $git_dir = $self->{-inbox}->git->{git_dir};
+	my $git_dir = $self->{ibx}->git->{git_dir};
 	my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)];
 	my $r = popen_rd($cmd, undef, { 0 => $in });
 	print $w $$raw or die "print \$w: $!";
@@ -550,11 +550,11 @@ W: $list
 	}
 
 	# make sure we really got the OID:
-	my ($blob, $type, $bytes) = $self->{-inbox}->git->check($expect_oid);
+	my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid);
 	$blob eq $expect_oid or die "BUG: $expect_oid not found after replace";
 
 	# don't leak FDs to Xapian:
-	$self->{-inbox}->git->cleanup;
+	$self->{ibx}->git->cleanup;
 
 	# reindex modified messages:
 	for my $smsg (@$need_reindex) {
@@ -674,14 +674,14 @@ sub done {
 	my $nbytes = $self->{total_bytes};
 	$self->{total_bytes} = 0;
 	$self->lock_release(!!$nbytes) if $shards;
-	$self->{-inbox}->git->cleanup;
+	$self->{ibx}->git->cleanup;
 }
 
 sub fill_alternates ($$) {
 	my ($self, $epoch) = @_;
 
-	my $pfx = "$self->{-inbox}->{inboxdir}/git";
-	my $all = "$self->{-inbox}->{inboxdir}/all.git";
+	my $pfx = "$self->{ibx}->{inboxdir}/git";
+	my $all = "$self->{ibx}->{inboxdir}/all.git";
 	PublicInbox::Import::init_bare($all) unless -d $all;
 	my $info_dir = "$all/objects/info";
 	my $alt = "$info_dir/alternates";
@@ -726,7 +726,7 @@ sub fill_alternates ($$) {
 
 sub git_init {
 	my ($self, $epoch) = @_;
-	my $git_dir = "$self->{-inbox}->{inboxdir}/git/$epoch.git";
+	my $git_dir = "$self->{ibx}->{inboxdir}/git/$epoch.git";
 	PublicInbox::Import::init_bare($git_dir);
 	my @cmd = (qw/git config/, "--file=$git_dir/config",
 			'include.path', '../../all.git/config');
@@ -738,7 +738,7 @@ sub git_init {
 sub git_dir_latest {
 	my ($self, $max) = @_;
 	$$max = -1;
-	my $pfx = "$self->{-inbox}->{inboxdir}/git";
+	my $pfx = "$self->{ibx}->{inboxdir}/git";
 	return unless -d $pfx;
 	my $latest;
 	opendir my $dh, $pfx or die "opendir $pfx: $!\n";
@@ -790,7 +790,7 @@ sub importer {
 
 sub import_init {
 	my ($self, $git, $packed_bytes, $tmp) = @_;
-	my $im = PublicInbox::Import->new($git, undef, undef, $self->{-inbox});
+	my $im = PublicInbox::Import->new($git, undef, undef, $self->{ibx});
 	$im->{bytes_added} = int($packed_bytes / $PACKING_FACTOR);
 	$im->{lock_path} = undef;
 	$im->{path_type} = 'v2';
@@ -823,8 +823,7 @@ sub get_blob ($$) {
 		return $msg if $msg;
 	}
 	# older message, should be in alternates
-	my $ibx = $self->{-inbox};
-	$ibx->msg_by_smsg($smsg);
+	$self->{ibx}->msg_by_smsg($smsg);
 }
 
 sub content_exists ($$$) {
@@ -881,7 +880,7 @@ sub reindex_checkpoint ($$$) {
 
 sub reindex_oid ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
-	return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
+	return if PublicInbox::SearchIdx::too_big($self, $oid);
 	my ($num, $mid0, $len);
 	my $msgref = $git->cat_file($oid, \$len);
 	return if $len == 0; # purged
@@ -968,7 +967,7 @@ sub update_last_commit ($$$$) {
 	last_epoch_commit($self, $i, $cmt);
 }
 
-sub git_dir_n ($$) { "$_[0]->{-inbox}->{inboxdir}/git/$_[1].git" }
+sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" }
 
 sub last_commits ($$) {
 	my ($self, $epoch_max) = @_;
@@ -1077,7 +1076,7 @@ sub sync_prepare ($$$) {
 	my ($self, $sync, $epoch_max) = @_;
 	my $pr = $sync->{-opt}->{-progress};
 	my $regen_max = 0;
-	my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
+	my $head = $self->{ibx}->{ref_head} || 'refs/heads/master';
 
 	# reindex stops at the current heads and we later rerun index_sync
 	# without {reindex}
@@ -1108,7 +1107,7 @@ sub sync_prepare ($$$) {
 	# our code and blindly injects "d" file history into git repos
 	if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
 		warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
-		my $git = $self->{-inbox}->git;
+		my $git = $self->{ibx}->git;
 		for my $oid (@leftovers) {
 			$oid = unpack('H*', $oid);
 			$self->{current_info} = "leftover $oid";

^ permalink raw reply related	[relevance 4%]

* [PATCH 00/20] indexing changes and new features
@ 2020-07-24  5:55  7% Eric Wong
  2020-07-24  5:55  4% ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-07-24  5:55 UTC (permalink / raw)
  To: meta

--rethread and --no-sync options are now supported in
public-inbox-index.  --no-sync should be nice for users
of FSes with poor fsync(2) performance.

Now: I also wonder if --no-sync is a bad name since we
also use "sync" to mean synchronising indices.  Perhaps
--no-fsync would be a better name, though technically
SQLite and Xapian use fdatasync(2), nowadays.

Some of this is prep work for exposing THREADID via IMAP (and
JMAP) to aid in searching.

Since THREADID (`over.tid') will be exposed in a user-visible
way, I'm finally giving up on using the default (reverse
chronological) log order for indexing to ensure THREADID
ascends for newer threads.

This also simplifies the indexing code significantly.
To avoid pinning huge amounts of RAM, the working space is held
in an IdxStack temporary file.  This further simplifies our code
since we no longer have to worry about old Xapian versions that
did not use FD_CLOEXEC.

There's still more work on the horizon, here...

Eric Wong (20):
  index: support --rethread switch to fix old indices
  v2: index forwards (via `git log --reverse')
  v2writable: introduce idx_stack
  v2writable: index_sync: reduce fill_alternates calls
  v2writable: move {autime} and {cotime} into $sync state
  v2writable: allow >= 40 byte git object IDs
  v2writable: drop "EPOCH.git indexing $RANGE" progress message
  use consistent {ibx} field for writable code paths
  search: avoid copying {inboxdir}
  v2writable: use read-only PublicInbox::Git for cat_file
  v2writable: get rid of {reindex_pipe} field
  v2writable: clarify "epoch" for {last_commits}
  xapcmd: set {from} properly for v1 inboxes
  searchidx: rename _xdb_{acquire,release} => idx_
  searchidx: make v1 indexing closer to v2
  index+xcpdb: support --no-sync flag
  v2writable: share log2stack code with v1
  searchidx: support async git check
  searchidx: $batch_cb => v1_checkpoint
  v2writable: {unindexed} belongs in $sync state

 Documentation/public-inbox-index.pod |  30 +-
 Documentation/public-inbox-xcpdb.pod |   6 +
 MANIFEST                             |   3 +-
 lib/PublicInbox/Git.pm               |  72 ++++-
 lib/PublicInbox/IdxStack.pm          |  52 ++++
 lib/PublicInbox/Import.pm            |   6 +-
 lib/PublicInbox/Msgmap.pm            |  21 +-
 lib/PublicInbox/MultiMidQueue.pm     |  62 ----
 lib/PublicInbox/Over.pm              |   1 +
 lib/PublicInbox/OverIdx.pm           |  78 ++++-
 lib/PublicInbox/Search.pm            |  25 +-
 lib/PublicInbox/SearchIdx.pm         | 384 ++++++++++++------------
 lib/PublicInbox/SearchIdxShard.pm    |  12 +-
 lib/PublicInbox/Smsg.pm              |   8 +-
 lib/PublicInbox/V2Writable.pm        | 427 +++++++++------------------
 lib/PublicInbox/Xapcmd.pm            |  10 +-
 script/public-inbox-index            |   5 +-
 script/public-inbox-xcpdb            |   4 +-
 t/idx_stack.t                        |  56 ++++
 t/inbox_idle.t                       |   4 +-
 t/search.t                           |   4 +-
 t/v1reindex.t                        |  36 ++-
 t/v2reindex.t                        |  45 +++
 23 files changed, 744 insertions(+), 607 deletions(-)
 create mode 100644 lib/PublicInbox/IdxStack.pm
 delete mode 100644 lib/PublicInbox/MultiMidQueue.pm
 create mode 100644 t/idx_stack.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-07-24  5:55  7% [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24  5:55  4% ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).