user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 17/20] v2writable: share log2stack code with v1
Date: Fri, 24 Jul 2020 05:56:03 +0000	[thread overview]
Message-ID: <20200724055606.27332-18-e@yhbt.net> (raw)
In-Reply-To: <20200724055606.27332-1-e@yhbt.net>

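Another step in making v1 and v2 more similar: parsing of git-log(1)
output moves into SearchIdx::log2stack, which picks the v1 or v2 path
regexps based on the inbox version and replaces
V2Writable::prepare_range_stack.  SearchIdx now exports too_big,
crlf_adjust, log2stack and is_ancestor so V2Writable can import them.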
---
 lib/PublicInbox/SearchIdx.pm  | 44 ++++++++++++++++++---------
 lib/PublicInbox/V2Writable.pm | 57 ++++++-----------------------------
 2 files changed, 38 insertions(+), 63 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 764257432..4d2e0da92 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -9,7 +9,7 @@
 package PublicInbox::SearchIdx;
 use strict;
 use v5.10.1;
-use parent qw(PublicInbox::Search PublicInbox::Lock);
+use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
 use PublicInbox::Eml;
 use PublicInbox::InboxWritable;
 use PublicInbox::MID qw(mid_mime mids_for_index mids);
@@ -21,6 +21,7 @@ use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+our @EXPORT_OK = qw(too_big crlf_adjust log2stack is_ancestor);
 my $X = \%PublicInbox::Search::X;
 my ($DB_CREATE_OR_OPEN, $DB_OPEN);
 our $DB_NO_SYNC = 0;
@@ -31,8 +32,6 @@ use constant DEBUG => !!$ENV{DEBUG};
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
-my $addmsg = qr!^:000000 100644 \S+ ($OID) A\t${hex}{2}/${hex}{38}$!;
-my $delmsg = qr!^:100644 000000 ($OID) \S+ D\t${hex}{2}/${hex}{38}$!;
 
 sub new {
 	my ($class, $ibx, $creat, $shard) = @_;
@@ -600,17 +599,18 @@ sub process_stack {
 	$batch_cb->($nr, $stk);
 }
 
-sub prepare_stack ($$$) {
-	my ($self, $sync, $range) = @_;
-	my $git = $self->{ibx}->git;
-
-	if (index($range, '..') < 0) {
-		# don't show annoying git errors to users who run -index
-		# on empty inboxes
-		$git->qx(qw(rev-parse -q --verify), "$range^0");
-		return PublicInbox::IdxStack->new->read_prepare if $?;
+sub log2stack ($$$$) {
+	my ($sync, $git, $range, $ibx) = @_;
+	my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise)
+	my ($add, $del);
+	if ($ibx->version == 1) {
+		my $path = $hex.'{2}/'.$hex.'{38}';
+		$add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!;
+		$del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!;
+	} else {
+		$del = qr!\A:\d{6} 100644 $OID ($OID) [AM]\td$!;
+		$add = qr!\A:\d{6} 100644 $OID ($OID) [AM]\tm$!;
 	}
-	my $D = $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
 
 	# Count the new files so they can be added newest to oldest
 	# and still have numbers increasing from oldest to newest
@@ -622,14 +622,14 @@ sub prepare_stack ($$$) {
 		if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
 			($at, $ct) = ($1 + 0, $2 + 0);
 			$stk //= PublicInbox::IdxStack->new($3);
-		} elsif (/$delmsg/) {
+		} elsif (/$del/) {
 			my $oid = $1;
 			if ($D) { # reindex case
 				$D->{pack('H*', $oid)}++;
 			} else { # non-reindex case:
 				$stk->push_rec('d', $at, $ct, $oid);
 			}
-		} elsif (/$addmsg/) {
+		} elsif (/$add/) {
 			my $oid = $1;
 			if ($D) {
 				my $oid_bin = pack('H*', $oid);
@@ -648,6 +648,20 @@ sub prepare_stack ($$$) {
 	$stk->read_prepare;
 }
 
+sub prepare_stack ($$$) {
+	my ($self, $sync, $range) = @_;
+	my $git = $self->{ibx}->git;
+
+	if (index($range, '..') < 0) {
+		# don't show annoying git errors to users who run -index
+		# on empty inboxes
+		$git->qx(qw(rev-parse -q --verify), "$range^0");
+		return PublicInbox::IdxStack->new->read_prepare if $?;
+	}
+	$sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR
+	log2stack($sync, $git, $range, $self->{ibx});
+}
+
 # --is-ancestor requires git 1.8.0+
 sub is_ancestor ($$$) {
 	my ($git, $cur, $tip) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 3dc200956..9a58a7a94 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -18,7 +18,7 @@ use PublicInbox::InboxWritable;
 use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
-use PublicInbox::SearchIdx;
+use PublicInbox::SearchIdx qw(too_big log2stack crlf_adjust is_ancestor);
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -156,8 +156,7 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
 	my ($self, $msgref, $mime, $smsg) = @_;
-	$smsg->{bytes} = $smsg->{raw_bytes} +
-			PublicInbox::SearchIdx::crlf_adjust($$msgref);
+	$smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
 	$self->{over}->add_overview($mime, $smsg);
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
 	$idx->index_raw($msgref, $mime, $smsg);
@@ -878,7 +877,7 @@ sub reindex_checkpoint ($$) {
 
 sub reindex_oid ($$$) {
 	my ($self, $sync, $oid) = @_;
-	return if PublicInbox::SearchIdx::too_big($self, $oid);
+	return if too_big($self, $oid);
 	my ($num, $mid0, $len);
 	my $msgref = $self->{ibx}->git->cat_file($oid, \$len);
 	return if $len == 0; # purged
@@ -976,8 +975,6 @@ sub last_commits ($$) {
 	$heads;
 }
 
-*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
-
 # returns a revision range for git-log(1)
 sub log_range ($$$$$) {
 	my ($self, $sync, $git, $i, $tip) = @_;
@@ -1029,47 +1026,6 @@ $range
 	$range;
 }
 
-sub prepare_range_stack {
-	my ($git, $sync, $range) = @_;
-	# Don't bump num_highwater on --reindex by using {D}.
-	# We intentionally do NOT use {D} in the non-reindex case because
-	# we want NNTP article number gaps from unindexed messages to
-	# show up in mirrors, too.
-	my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
-
-	my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
-				--no-notes --no-color --no-renames --no-abbrev),
-				$range);
-	my ($at, $ct, $stk);
-	while (<$fh>) {
-		if (/\A([0-9]+)-([0-9]+)-($OID)$/o) {
-			($at, $ct) = ($1 + 0, $2 + 0);
-			$stk //= PublicInbox::IdxStack->new($3);
-		} elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\td$/o) {
-			my $oid = $1;
-			if ($D) { # reindex case
-				$D->{pack('H*', $oid)}++;
-			} else { # non-reindex case:
-				$stk->push_rec('d', $at, $ct, $oid);
-			}
-		} elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o) {
-			my $oid = $1;
-			if ($D) {
-				my $oid_bin = pack('H*', $oid);
-				my $nr = --$D->{$oid_bin};
-				delete($D->{$oid_bin}) if $nr <= 0;
-
-				# nr < 0 (-1) means it never existed
-				$stk->push_rec('m', $at, $ct, $oid) if $nr < 0;
-			} else {
-				$stk->push_rec('m', $at, $ct, $oid);
-			}
-		}
-	}
-	close $fh or die "git log failed: \$?=$?";
-	$stk ? $stk->read_prepare : undef;
-}
-
 sub sync_prepare ($$$) {
 	my ($self, $sync, $epoch_max) = @_;
 	my $pr = $sync->{-opt}->{-progress};
@@ -1093,7 +1049,12 @@ sub sync_prepare ($$$) {
 		my $range = log_range($self, $sync, $git, $i, $tip) or next;
 		# can't use 'rev-list --count' if we use --diff-filter
 		$pr->("$i.git counting $range ... ") if $pr;
-		my $stk = prepare_range_stack($git, $sync, $range);
+		# Don't bump num_highwater on --reindex by using {D}.
+		# We intentionally do NOT use {D} in the non-reindex case
+		# because we want NNTP article number gaps from unindexed
+		# messages to show up in mirrors, too.
+		$sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
+		my $stk = log2stack($sync, $git, $range, $self->{ibx});
 		my $nr = $stk ? $stk->num_records : 0;
 		$pr->("$nr\n") if $pr;
 		$sync->{stacks}->[$i] = $stk if $stk;

  parent reply	other threads:[~2020-07-24  5:56 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-24  5:55 [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24  5:55 ` [PATCH 01/20] index: support --rethread switch to fix old indices Eric Wong
2020-07-24  5:55 ` [PATCH 02/20] v2: index forwards (via `git log --reverse') Eric Wong
2020-07-24  5:55 ` [PATCH 03/20] v2writable: introduce idx_stack Eric Wong
2020-07-24  5:55 ` [PATCH 04/20] v2writable: index_sync: reduce fill_alternates calls Eric Wong
2020-07-24  5:55 ` [PATCH 05/20] v2writable: move {autime} and {cotime} into $sync state Eric Wong
2020-07-24  5:55 ` [PATCH 06/20] v2writable: allow >= 40 byte git object IDs Eric Wong
2020-07-24  5:55 ` [PATCH 07/20] v2writable: drop "EPOCH.git indexing $RANGE" progress Eric Wong
2020-07-24  5:55 ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong
2020-07-24  5:55 ` [PATCH 09/20] search: avoid copying {inboxdir} Eric Wong
2020-07-24  5:55 ` [PATCH 10/20] v2writable: use read-only PublicInbox::Git for cat_file Eric Wong
2020-07-24  5:55 ` [PATCH 11/20] v2writable: get rid of {reindex_pipe} field Eric Wong
2020-07-24  5:55 ` [PATCH 12/20] v2writable: clarify "epoch" comment Eric Wong
2020-07-24  5:55 ` [PATCH 13/20] xapcmd: set {from} properly for v1 inboxes Eric Wong
2020-07-24  5:56 ` [PATCH 14/20] searchidx: rename _xdb_{acquire,release} => idx_ Eric Wong
2020-07-24  5:56 ` [PATCH 15/20] searchidx: make v1 indexing closer to v2 Eric Wong
2020-07-24  5:56 ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong
2020-07-24  5:56 ` Eric Wong [this message]
2020-07-24  5:56 ` [PATCH 18/20] searchidx: support async git check Eric Wong
2020-07-24  5:56 ` [PATCH 19/20] searchidx: $batch_cb => v1_checkpoint Eric Wong
2020-07-24  5:56 ` [PATCH 20/20] v2writable: {unindexed} belongs in $sync state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200724055606.27332-18-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    --subject='Re: [PATCH 17/20] v2writable: share log2stack code with v1' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git