user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/3] lei: internal bug fixups
@ 2021-06-17 22:00 Eric Wong
  2021-06-17 22:00 ` [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
  To: meta

Still chasing some oddness in day-to-day usage; but I think
3/3 is safe (1/3 helped me inspect things)

Eric Wong (3):
  lei inspect: learn "num:" and "docid:" prefixes
  lei_input: prefix bare Maildir paths w/ "maildir:"
  lei/store: cull redundant docids based on blob OID

 lib/PublicInbox/LeiInput.pm   |  3 +-
 lib/PublicInbox/LeiInspect.pm | 73 +++++++++++++++++++++++++++++++++++
 lib/PublicInbox/LeiStore.pm   | 54 +++++++++++++++++---------
 lib/PublicInbox/SearchIdx.pm  |  2 +-
 4 files changed, 111 insertions(+), 21 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes
  2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
@ 2021-06-17 22:00 ` Eric Wong
  2021-06-17 22:00 ` [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:" Eric Wong
  2021-06-17 22:00 ` [PATCH 3/3] lei/store: cull redundant docids based on blob OID Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
  To: meta

"num:" is useful for inspecting Inbox-ish directories, while
"docid:" can be used for any Xapian DB (not just stuff managed
by our code).
---
 lib/PublicInbox/LeiInspect.pm | 73 +++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index eb2634b4..30714764 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -57,6 +57,75 @@ sub inspect_sync_folder ($$) {
 	$ent
 }
 
+sub inspect_docid ($$;$) { # ($lei, $docid, [$ent]): describe one Xapian doc in $ent
+	my ($lei, $docid, $ent) = @_;
+	require PublicInbox::Search;
+	$ent //= {}; # inspect_num hands in a pre-filled hash, "docid:" does not
+	my $xdb;
+	if ($xdb = delete $ent->{xdb}) { # from inspect_num
+	} elsif (defined(my $dir = $lei->{opt}->{dir})) {
+		no warnings 'once';
+		$xdb = $PublicInbox::Search::X{Database}->new($dir);
+	} else {
+		$xdb = $lei->{lse}->xdb;
+	}
+	$xdb or return $lei->fail('no Xapian DB');
+	my $doc = $xdb->get_document($docid); # raises
+	my $data = $doc->get_data;
+	$ent->{docid} = $docid;
+	$ent->{data_length} = length($data);
+	$ent->{description} = $doc->get_description; # "=>" here would store nothing
+	$ent->{$_} = $doc->$_ for (qw(termlist_count values_count));
+	my $cur = $doc->termlist_begin;
+	my $end = $doc->termlist_end;
+	for (; $cur != $end; $cur++) { # group terms by their leading [A-Z]+ prefix
+		my $tn = $cur->get_termname;
+		my $term = ($tn =~ s/\A([A-Z]+)//) ? $1 : ''; # $1 is stale on failure
+		$term eq '' and warn "$tn no prefix! (???)";
+		push @{$ent->{terms}->{$term}}, $tn;
+	}
+	@$_ = sort(@$_) for values %{$ent->{terms} // {}};
+	$cur = $doc->values_begin;
+	$end = $doc->values_end;
+	for (; $cur != $end; $cur++) { # value slots, keyed by slot number
+		my $n = $cur->get_valueno;
+		my $v = $cur->get_value;
+		my $iv = PublicInbox::Search::sortable_unserialise($v);
+		$v = $iv + 0 if defined $iv; # show the decoded number when there is one
+		# not using ->[$n] since we may have large gaps in $n
+		$ent->{'values'}->{$n} = $v;
+	}
+	$ent; # returned to the caller (inspect1 / inspect_num)
+}
+
+sub inspect_num ($$) { # ($lei, $num): map article $num to a docid, then inspect_docid
+	my ($lei, $num) = @_;
+	my ($docid, $ibx);
+	my $ent = { num => $num };
+	if (defined(my $dir = $lei->{opt}->{dir})) { # --dir: inspect that directory
+		my $num2docid = $lei->{lse}->can('num2docid'); # code ref, called on $ibx
+		if (-f "$dir/ei.lock") {
+			require PublicInbox::ExtSearch;
+			$ibx = PublicInbox::ExtSearch->new($dir);
+		} elsif (-f "$dir/inbox.lock" || -d "$dir/public-inbox") {
+			require PublicInbox::Inbox; # v2, v1
+			$ibx = bless { inboxdir => $dir }, 'PublicInbox::Inbox';
+		} # otherwise $ibx stays undef; the guarded lookup below then fails cleanly
+		$ent->{xdb} = ($ibx ? $ibx->xdb : undef) //
+			return $lei->fail("no Xapian DB for $dir");
+		$docid = $num2docid->($ibx, $num);
+	} else {
+		$ibx = $lei->{lse};
+		$lei->{lse}->xdb; # set {nshard} for num2docid
+		$docid = $lei->{lse}->num2docid($num);
+	}
+	if ($ibx && $ibx->over) { # attach the over row for $num when one exists
+		my $smsg = $ibx->over->get_art($num);
+		$ent->{smsg} = { %$smsg } if $smsg; # shallow, unblessed copy
+	}
+	inspect_docid($lei, $docid, $ent); # consumes $ent->{xdb} if we set it
+}
+
 sub inspect1 ($$$) {
 	my ($lei, $item, $more) = @_;
 	my $ent;
@@ -72,6 +141,10 @@ sub inspect1 ($$$) {
 		}
 	} elsif ($item =~ m!\A(?:maildir|mh):!i || -d $item) {
 		$ent = inspect_sync_folder($lei, $item);
+	} elsif ($item =~ m!\Adocid:([0-9]+)\z!) {
+		$ent = inspect_docid($lei, $1 + 0);
+	} elsif ($item =~ m!\Anum:([0-9]+)\z!) {
+		$ent = inspect_num($lei, $1 + 0);
 	} else { # TODO: more things
 		return $lei->fail("$item not understood");
 	}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:"
  2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
  2021-06-17 22:00 ` [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes Eric Wong
@ 2021-06-17 22:00 ` Eric Wong
  2021-06-17 22:00 ` [PATCH 3/3] lei/store: cull redundant docids based on blob OID Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
  To: meta

This will simplify upcoming code for watches.
---
 lib/PublicInbox/LeiInput.pm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 38d3d36d..de2a8ff1 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -300,7 +300,8 @@ $input is `eml', not --in-format=$in_fmt
 				push @f, $input;
 			} elsif (-d "$input/new" && -d "$input/cur") {
 				if ($sync) {
-					$input = $lei->abs_path($input);
+					$input = 'maildir:'.
+						$lei->abs_path($input);
 					push @{$sync->{ok}}, $input;
 				}
 				push @md, $input;

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 3/3] lei/store: cull redundant docids based on blob OID
  2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
  2021-06-17 22:00 ` [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes Eric Wong
  2021-06-17 22:00 ` [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:" Eric Wong
@ 2021-06-17 22:00 ` Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
  To: meta

I'm not sure how this happened (only once for me in March), but
it should not happen...  In any case, we'll operate on the
lowest numbered docid and cull redundant index entries when
lei/store is open for read-write.

This also fixes the normal lei/store removal path to clean up
the xref3 table (since it's not done automatically for
public-facing -eidx due to the multi-list nature of it).
---
 lib/PublicInbox/LeiStore.pm  | 54 +++++++++++++++++++++++-------------
 lib/PublicInbox/SearchIdx.pm |  2 +-
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index f978288a..4ba1e647 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -226,6 +226,18 @@ sub _remove_if_local { # git->cat_async arg
 	$self->{im}->remove($bref) if $bref;
 }
 
+sub remove_docids ($;@) { # ($self, @docids): drop each docid from shard, over, xref3
+	my ($self, @docids) = @_;
+	my $eidx = eidx_init($self);
+	for my $docid (@docids) {
+		$eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+		$eidx->{oidx}->delete_by_num($docid); # same oidx oid2docid looks up in
+		$eidx->{oidx}->{dbh}->do(<<EOF, undef, $docid);
+DELETE FROM xref3 WHERE docid = ?
+EOF
+	}
+}
+
 # remove the entire message from the index, does not touch mail_sync.sqlite3
 sub remove_eml {
 	my ($self, $eml) = @_;
@@ -241,13 +253,25 @@ sub remove_eml {
 			my $oidhex = unpack('H*', $oidbin);
 			$git->cat_async($oidhex, \&_remove_if_local, $self);
 		}
-		$eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid);
-		$oidx->delete_by_num($docid);
 	}
 	$git->cat_async_wait;
+	remove_docids($self, @docids);
 	\@docids;
 }
 
+sub oid2docid ($$) { # ($self, $oid): first docid for blob $oid, culling any extras
+	my ($self, $oid) = @_;
+	my $eidx = eidx_init($self);
+	my ($docid, @cull) = $eidx->{oidx}->blob_exists($oid); # $docid undef if none
+	if (@cull) { # fixup old bugs...
+		warn <<EOF;
+W: $oid indexed as multiple docids: $docid @cull, culling to fixup old bugs
+EOF
+		remove_docids($self, @cull);
+	}
+	wantarray ? (defined($docid) ? ($docid) : ()) : $docid; # list ctx: () if none
+}
+
 sub add_eml {
 	my ($self, $eml, $vmd, $xoids) = @_;
 	my $im = $self->{-fake_im} // $self->importer; # may create new epoch
@@ -268,7 +292,7 @@ sub add_eml {
 		if (scalar keys %$xoids) {
 			my %docids = map { $_ => 1 } @$vivify_xvmd;
 			for my $oid (keys %$xoids) {
-				my @id = $oidx->blob_exists($oid);
+				my @id = oid2docid($self, $oid);
 				@docids{@id} = @id;
 			}
 			@$vivify_xvmd = sort { $a <=> $b } keys(%docids);
@@ -356,15 +380,11 @@ sub update_xvmd {
 	my $oidx = $eidx->{oidx};
 	my %seen;
 	for my $oid (keys %$xoids) {
-		my @docids = $oidx->blob_exists($oid) or next;
-		scalar(@docids) > 1 and
-			warn "W: $oid indexed as multiple docids: @docids\n";
-		for my $docid (@docids) {
-			next if $seen{$docid}++;
-			my $idx = $eidx->idx_shard($docid);
-			$idx->ipc_do('update_vmd', $docid, $vmd_mod);
-		}
+		my $docid = oid2docid($self, $oid) // next;
 		delete $xoids->{$oid};
+		next if $seen{$docid}++;
+		my $idx = $eidx->idx_shard($docid);
+		$idx->ipc_do('update_vmd', $docid, $vmd_mod);
 	}
 	return unless scalar(keys(%$xoids));
 
@@ -395,15 +415,11 @@ sub set_xvmd {
 
 	# see if we can just update existing docs
 	for my $oid (keys %$xoids) {
-		my @docids = $oidx->blob_exists($oid) or next;
-		scalar(@docids) > 1 and
-			warn "W: $oid indexed as multiple docids: @docids\n";
-		for my $docid (@docids) {
-			next if $seen{$docid}++;
-			my $idx = $eidx->idx_shard($docid);
-			$idx->ipc_do('set_vmd', $docid, $vmd);
-		}
+		my $docid = oid2docid($self, $oid) // next;
 		delete $xoids->{$oid}; # all done with this oid
+		next if $seen{$docid}++;
+		my $idx = $eidx->idx_shard($docid);
+		$idx->ipc_do('set_vmd', $docid, $vmd);
 	}
 	return unless scalar(keys(%$xoids));
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f066cc92..f553eda6 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -572,7 +572,7 @@ sub apply_vmd_mod ($$) {
 	my $updated = 0;
 	my @x = @VMD_MAP;
 	while (my ($field, $pfx) = splice(@x, 0, 2)) {
-		# field: "label" or "kw"
+		# field: "L" or "kw"
 		for my $val (@{$vmd_mod->{"-$field"} // []}) {
 			eval {
 				$doc->remove_term($pfx . $val);

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-06-17 22:00 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
2021-06-17 22:00 ` [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes Eric Wong
2021-06-17 22:00 ` [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:" Eric Wong
2021-06-17 22:00 ` [PATCH 3/3] lei/store: cull redundant docids based on blob OID Eric Wong

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git