user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 16/20] index+xcpdb: support --no-sync flag
  2020-07-24  5:55  7% [PATCH 00/20] indexing changes and new features Eric Wong
@ 2020-07-24  5:56  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-07-24  5:56 UTC (permalink / raw)
  To: meta

This allows us to speed up indexing operations to SQLite
and Xapian.

Unfortunately, it doesn't affect operations using
`xapian-compact' and the compactor API, since that doesn't seem
to support Xapian::DB_NO_SYNC, yet.
---
 Documentation/public-inbox-index.pod |  7 +++++++
 Documentation/public-inbox-xcpdb.pod |  6 ++++++
 lib/PublicInbox/Msgmap.pm            | 21 ++++++++++++---------
 lib/PublicInbox/Over.pm              |  1 +
 lib/PublicInbox/OverIdx.pm           |  2 +-
 lib/PublicInbox/SearchIdx.pm         |  9 ++++++++-
 lib/PublicInbox/V2Writable.pm        |  6 ++++--
 lib/PublicInbox/Xapcmd.pm            |  5 +++--
 script/public-inbox-index            |  5 +++--
 script/public-inbox-xcpdb            |  4 ++--
 10 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 08f2fbf45..aeb1b3a39 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -113,6 +113,13 @@ below.
 
 Available in public-inbox 1.6.0 (PENDING).
 
+=item --no-sync
+
+Disables L<fsync(2)> and L<fdatasync(2)> operations on SQLite
+and Xapian.  This is only effective with Xapian 1.4+.
+
+Available in public-inbox 1.6.0 (PENDING).
+
 =back
 
 =head1 FILES
diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod
index 149c8f78c..7fe1e5fe2 100644
--- a/Documentation/public-inbox-xcpdb.pod
+++ b/Documentation/public-inbox-xcpdb.pod
@@ -45,6 +45,12 @@ too many shards given the capabilities of the current hardware.
 These options are passed directly to L<xapian-compact(1)> when
 used with C<--compact>.
 
+=item --no-sync
+
+Disable L<fsync(2)> and L<fdatasync(2)>.
+
+Available in public-inbox 1.6.0 (PENDING).
+
 =back
 
 =head1 ENVIRONMENT
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 9d2ef0dc5..839ddf7ca 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -32,12 +32,11 @@ sub new_file {
 	my $self = bless { filename => $f }, $class;
 	my $dbh = $self->{dbh} = PublicInbox::Over::dbh_new($self, $rw);
 	if ($rw) {
-		create_tables($dbh);
-
 		# TRUNCATE reduces I/O compared to the default (DELETE)
 		$dbh->do('PRAGMA journal_mode = TRUNCATE');
 
 		$dbh->begin_work;
+		create_tables($dbh);
 		$self->created_at(time) unless $self->created_at;
 
 		my $max = $self->max // 0;
@@ -51,12 +50,17 @@ sub new_file {
 sub tmp_clone {
 	my ($self) = @_;
 	my ($fh, $fn) = tempfile('msgmap-XXXXXXXX', EXLOCK => 0, TMPDIR => 1);
-	$self->{dbh}->sqlite_backup_to_file($fn);
-	my $tmp = ref($self)->new_file($fn, 1);
-	$tmp->{dbh}->do('PRAGMA synchronous = OFF');
-	$tmp->{dbh}->do('PRAGMA journal_mode = MEMORY');
+	my $tmp;
+	if ($self->{dbh}->can('sqlite_backup_to_dbh')) {
+		$tmp = ref($self)->new_file($fn, 2);
+		$tmp->{dbh}->do('PRAGMA journal_mode = MEMORY');
+		$self->{dbh}->sqlite_backup_to_dbh($tmp->{dbh});
+	} else { # DBD::SQLite <= 1.61_01
+		$self->{dbh}->sqlite_backup_to_file($fn);
+		$tmp = ref($self)->new_file($fn, 2);
+		$tmp->{dbh}->do('PRAGMA journal_mode = MEMORY');
+	}
 	$tmp->{pid} = $$;
-	close $fh or die "failed to close $fn: $!";
 	$tmp;
 }
 
@@ -241,8 +245,7 @@ sub atfork_parent {
 	$self->{pid} or die 'BUG: not a temporary clone';
 	$self->{dbh} and die 'BUG: tmp_clone dbh not prepared for parent';
 	defined($self->{filename}) or die 'BUG: {filename} not defined';
-	my $dbh = $self->{dbh} = PublicInbox::Over::dbh_new($self, 1);
-	$dbh->do('PRAGMA synchronous = OFF');
+	$self->{dbh} = PublicInbox::Over::dbh_new($self, 2);
 }
 
 sub atfork_prepare {
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index e3f264564..f32743c05 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -40,6 +40,7 @@ sub dbh_new {
 		$st = pack('dd', $st[0], $st[1]);
 	} while ($st ne $self->{st} && $tries++ < 3);
 	warn "W: $f: .st_dev, .st_ino unstable\n" if $st ne $self->{st};
+	$dbh->do('PRAGMA synchronous = OFF') if ($rw // 0) > 1;
 	$dbh;
 }
 
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index c57be7243..fcb450794 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -21,7 +21,7 @@ use Carp qw(croak);
 
 sub dbh_new {
 	my ($self) = @_;
-	my $dbh = $self->SUPER::dbh_new(1);
+	my $dbh = $self->SUPER::dbh_new($self->{-no_sync} ? 2 : 1);
 
 	# TRUNCATE reduces I/O compared to the default (DELETE)
 	# We do not use WAL since we're optimized for read-only ops,
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c57a7e164..764257432 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -23,6 +23,7 @@ use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 my $X = \%PublicInbox::Search::X;
 my ($DB_CREATE_OR_OPEN, $DB_OPEN);
+our $DB_NO_SYNC = 0;
 our $BATCH_BYTES = defined($ENV{XAPIAN_FLUSH_THRESHOLD}) ?
 			0x7fffffff : 1_000_000;
 use constant DEBUG => !!$ENV{DEBUG};
@@ -67,6 +68,7 @@ sub new {
 		$self->{lock_path} = "$inboxdir/ssoma.lock";
 		my $dir = $self->xdir;
 		$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
+		$self->{over}->{-no_sync} = 1 if $ibx->{-no_sync};
 		$self->{index_max_size} = $ibx->{index_max_size};
 	} elsif ($version == 2) {
 		defined $shard or die "shard is required for v2\n";
@@ -103,6 +105,9 @@ sub load_xapian_writable () {
 	*sortable_serialise = $xap.'::sortable_serialise';
 	$DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()');
 	$DB_OPEN = eval($xap.'::DB_OPEN()');
+	my $ver = (eval($xap.'::major_version()') << 16) |
+		(eval($xap.'::minor_version()') << 8);
+	$DB_NO_SYNC = 0x4 if $ver >= 0x10400;
 	1;
 }
 
@@ -126,6 +131,7 @@ sub idx_acquire {
 		}
 	}
 	return unless defined $flag;
+	$flag |= $DB_NO_SYNC if $self->{ibx}->{-no_sync};
 	my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
 	if ($@) {
 		die "Failed opening $dir: ", $@;
@@ -377,7 +383,8 @@ sub _msgmap_init ($) {
 	die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1;
 	$self->{mm} //= eval {
 		require PublicInbox::Msgmap;
-		PublicInbox::Msgmap->new($self->{ibx}->{inboxdir}, 1);
+		my $rw = $self->{ibx}->{-no_sync} ? 2 : 1;
+		PublicInbox::Msgmap->new($self->{ibx}->{inboxdir}, $rw);
 	};
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 13c1ad6f8..3dc200956 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -116,12 +116,13 @@ sub new {
 		total_bytes => 0,
 		current_info => '',
 		xpfx => $xpfx,
-		over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3", 1),
+		over => PublicInbox::OverIdx->new("$xpfx/over.sqlite3"),
 		lock_path => "$dir/inbox.lock",
 		# limit each git repo (epoch) to 1GB or so
 		rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
 		last_commit => [], # git epoch -> commit
 	};
+	$self->{over}->{-no_sync} = 1 if $v2ibx->{-no_sync};
 	$self->{shards} = count_shards($self) || nproc_shards($creat);
 	$self->{index_max_size} = $v2ibx->{index_max_size};
 	bless $self, $class;
@@ -293,7 +294,8 @@ sub _idx_init { # with_umask callback
 	# Now that all subprocesses are up, we can open the FDs
 	# for SQLite:
 	my $mm = $self->{mm} = PublicInbox::Msgmap->new_file(
-		"$self->{ibx}->{inboxdir}/msgmap.sqlite3", 1);
+				"$self->{ibx}->{inboxdir}/msgmap.sqlite3",
+				$self->{ibx}->{-no_sync} ? 2 : 1);
 	$mm->{dbh}->begin_work;
 }
 
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 4ee3fc791..d6c069d75 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -412,10 +412,11 @@ sub cpdb ($$) {
 
 	# like copydatabase(1), be sure we don't overwrite anything in case
 	# of other bugs:
-	my $creat = eval($PublicInbox::Search::Xap.'::DB_CREATE()');
+	my $flag = eval($PublicInbox::Search::Xap.'::DB_CREATE()');
 	die if $@;
 	my $XapianWritableDatabase = $PublicInbox::Search::X{WritableDatabase};
-	my $dst = $XapianWritableDatabase->new($tmp, $creat);
+	$flag |= $PublicInbox::SearchIdx::DB_NO_SYNC if !$opt->{sync};
+	my $dst = $XapianWritableDatabase->new($tmp, $flag);
 	my $pr = $opt->{-progress};
 	my $pfx = $opt->{-progress_pfx} = progress_pfx($new);
 	my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr;
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 2e1934b08..d5c7cae2b 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -14,8 +14,8 @@ PublicInbox::Admin::require_or_die('-index');
 use PublicInbox::Xapcmd;
 
 my $compact_opt;
-my $opt = { quiet => -1, compact => 0, maxsize => undef };
-GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
+my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
 		indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s))
 	or die "bad command-line args\n$usage";
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
@@ -59,6 +59,7 @@ for my $ibx (@ibxs) {
 	if ($opt->{compact} >= 2) {
 		PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
 	}
+	$ibx->{-no_sync} = 1 if !$opt->{sync};
 	PublicInbox::Admin::index_inbox($ibx, undef, $opt);
 	PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt;
 }
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index 2b9f032c5..fcd961488 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -8,8 +8,8 @@ use PublicInbox::Xapcmd;
 use PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-search');
 my $usage = "Usage: public-inbox-xcpdb [--compact] INBOX_DIR\n";
-my $opt = {};
-my @opt = (qw(compact reshard|R=i), @PublicInbox::Xapcmd::COMPACT_OPT);
+my $opt = { sync => 1 };
+my @opt = (qw(sync! compact reshard|R=i), @PublicInbox::Xapcmd::COMPACT_OPT);
 GetOptions($opt, @opt) or die "bad command-line args\n$usage";
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
 foreach (@ibxs) {

^ permalink raw reply related	[relevance 4%]

* [PATCH 00/20] indexing changes and new features
@ 2020-07-24  5:55  7% Eric Wong
  2020-07-24  5:56  4% ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-07-24  5:55 UTC (permalink / raw)
  To: meta

--rethread and --no-sync options are now supported in
public-inbox-index.  --no-sync should be nice for users
of FSes with poor fsync(2) performance.

Now: I also wonder if --no-sync is a bad name since we
also use "sync" to mean synchronising indices.  Perhaps
--no-fsync would be a better name, though technically
SQLite and Xapian use fdatasync(2), nowadays.

Some of this is prep work for exposing THREADID via IMAP (and
JMAP) to aid in searching.

Since THREADID (`over.tid') will be exposed in a user-visible
way, I'm finally giving up on using the default (reverse
chronological) log order for indexing to ensure THREADID
ascends for newer threads.

This also simplifies the indexing code significantly.
To avoid pinning huge amounts of RAM, the working space is held
in an IdxStack temporary file.  This further simplifies our code
since we no longer have to worry about old Xapian versions that
did not use FD_CLOEXEC.

There's still more work on the horizon, here...

Eric Wong (20):
  index: support --rethread switch to fix old indices
  v2: index forwards (via `git log --reverse')
  v2writable: introduce idx_stack
  v2writable: index_sync: reduce fill_alternates calls
  v2writable: move {autime} and {cotime} into $sync state
  v2writable: allow >= 40 byte git object IDs
  v2writable: drop "EPOCH.git indexing $RANGE" progress message
  use consistent {ibx} field for writable code paths
  search: avoid copying {inboxdir}
  v2writable: use read-only PublicInbox::Git for cat_file
  v2writable: get rid of {reindex_pipe} field
  v2writable: clarify "epoch" for {last_commits}
  xapcmd: set {from} properly for v1 inboxes
  searchidx: rename _xdb_{acquire,release} => idx_
  searchidx: make v1 indexing closer to v2
  index+xcpdb: support --no-sync flag
  v2writable: share log2stack code with v1
  searchidx: support async git check
  searchidx: $batch_cb => v1_checkpoint
  v2writable: {unindexed} belongs in $sync state

 Documentation/public-inbox-index.pod |  30 +-
 Documentation/public-inbox-xcpdb.pod |   6 +
 MANIFEST                             |   3 +-
 lib/PublicInbox/Git.pm               |  72 ++++-
 lib/PublicInbox/IdxStack.pm          |  52 ++++
 lib/PublicInbox/Import.pm            |   6 +-
 lib/PublicInbox/Msgmap.pm            |  21 +-
 lib/PublicInbox/MultiMidQueue.pm     |  62 ----
 lib/PublicInbox/Over.pm              |   1 +
 lib/PublicInbox/OverIdx.pm           |  78 ++++-
 lib/PublicInbox/Search.pm            |  25 +-
 lib/PublicInbox/SearchIdx.pm         | 384 ++++++++++++------------
 lib/PublicInbox/SearchIdxShard.pm    |  12 +-
 lib/PublicInbox/Smsg.pm              |   8 +-
 lib/PublicInbox/V2Writable.pm        | 427 +++++++++------------------
 lib/PublicInbox/Xapcmd.pm            |  10 +-
 script/public-inbox-index            |   5 +-
 script/public-inbox-xcpdb            |   4 +-
 t/idx_stack.t                        |  56 ++++
 t/inbox_idle.t                       |   4 +-
 t/search.t                           |   4 +-
 t/v1reindex.t                        |  36 ++-
 t/v2reindex.t                        |  45 +++
 23 files changed, 744 insertions(+), 607 deletions(-)
 create mode 100644 lib/PublicInbox/IdxStack.pm
 delete mode 100644 lib/PublicInbox/MultiMidQueue.pm
 create mode 100644 t/idx_stack.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-07-24  5:55  7% [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24  5:56  4% ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).