user/dev discussion of public-inbox itself
 help / color / Atom feed
* [PATCH 0/2] v2writable: reduce smsg->{mime} impact
@ 2020-02-24  8:08 Eric Wong
  2020-02-24  8:08 ` [PATCH 1/2] v2writable: make remove return-compatible w/ Import::remove Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2020-02-24  8:08 UTC (permalink / raw)
  To: meta

Stuffing a full MIME object into $smsg is probably a bad idea
as witnessed by the memory bloat fixed with:
https://public-inbox.org/meta/20190108004606.23760-1-e@80x24.org/
("view: stop storing all MIME objects on large threads")

So slowly start getting rid of $smsg->{mime} and improve some
v2writable behaviors while we're at it.

Eric Wong (2):
  v2writable: make remove return-compatible w/ Import::remove
  v2writable: lookup_content => content_exists

 lib/PublicInbox/V2Writable.pm | 34 ++++++++++++++++------------------
 t/v2writable.t                |  7 +++++--
 2 files changed, 21 insertions(+), 20 deletions(-)


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/2] v2writable: make remove return-compatible w/ Import::remove
  2020-02-24  8:08 [PATCH 0/2] v2writable: reduce smsg->{mime} impact Eric Wong
@ 2020-02-24  8:08 ` Eric Wong
  2020-02-24  8:08 ` [PATCH 2/2] v2writable: lookup_content => content_exists Eric Wong
  2020-02-24  8:08 ` [PATCH] " Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-02-24  8:08 UTC (permalink / raw)
  To: meta

Import::remove is a documented interface, and the return
value of the V2Writable work-alike should try to be compatible
with what Import implements.
---
 lib/PublicInbox/V2Writable.pm | 23 +++++++++++++----------
 t/v2writable.t                |  7 +++++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index fc2f33f9..573a92aa 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -381,7 +381,7 @@ sub rewrite_internal ($$;$$$) {
 	}
 	my $over = $self->{over};
 	my $cids = content_ids($old_mime);
-	my $removed;
+	my @removed;
 	my $mids = mids($old_mime->header_obj);
 
 	# We avoid introducing new blobs into git since the raw content
@@ -391,7 +391,7 @@ sub rewrite_internal ($$;$$$) {
 	my $mark;
 
 	foreach my $mid (@$mids) {
-		my %gone; # num => [ smsg, raw ]
+		my %gone; # num => [ smsg, $mime, raw ]
 		my ($id, $prev);
 		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
 			my $msg = get_blob($self, $smsg);
@@ -402,8 +402,7 @@ sub rewrite_internal ($$;$$$) {
 			my $orig = $$msg;
 			my $cur = PublicInbox::MIME->new($msg);
 			if (content_matches($cids, $cur)) {
-				$smsg->{mime} = $cur;
-				$gone{$smsg->{num}} = [ $smsg, \$orig ];
+				$gone{$smsg->{num}} = [ $smsg, $cur, \$orig ];
 			}
 		}
 		my $n = scalar keys %gone;
@@ -413,15 +412,16 @@ sub rewrite_internal ($$;$$$) {
 				join(',', sort keys %gone), "\n";
 		}
 		foreach my $num (keys %gone) {
-			my ($smsg, $orig) = @{$gone{$num}};
-			# $removed should only be set once assuming
+			my ($smsg, $mime, $orig) = @{$gone{$num}};
+			# @removed should only be set once assuming
 			# no bugs in our deduplication code:
-			$removed = $smsg;
+			@removed = (undef, $mime, $smsg);
 			my $oid = $smsg->{blob};
 			if ($replace_map) {
 				$replace_map->{$oid} = $sref;
 			} else {
 				($mark, undef) = $im->remove($orig, $cmt_msg);
+				$removed[0] = $mark;
 			}
 			$orig = undef;
 			if ($need_reindex) { # ->replace
@@ -441,15 +441,18 @@ sub rewrite_internal ($$;$$$) {
 		my $rewrites = _replace_oids($self, $new_mime, $replace_map);
 		return { rewrites => $rewrites, need_reindex => $need_reindex };
 	}
-	$removed;
+	defined($mark) ? @removed : undef;
 }
 
-# public
+# public (see PublicInbox::Import->remove), but note the 3rd element
+# (retval[2]) is not part of the stable API shared with Import->remove
 sub remove {
 	my ($self, $mime, $cmt_msg) = @_;
+	my @ret;
 	$self->{-inbox}->with_umask(sub {
-		rewrite_internal($self, $mime, $cmt_msg);
+		@ret = rewrite_internal($self, $mime, $cmt_msg);
 	});
+	defined($ret[0]) ? @ret : undef;
 }
 
 sub _replace ($$;$$) {
diff --git a/t/v2writable.t b/t/v2writable.t
index 77bd68d4..cdcfe4d0 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -4,7 +4,7 @@ use strict;
 use warnings;
 use Test::More;
 use PublicInbox::MIME;
-use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::ContentId qw(content_digest content_id);
 use PublicInbox::TestCommon;
 use Cwd qw(abs_path);
 require_git(2.6);
@@ -206,7 +206,10 @@ EOF
 	my $before = $git0->qx(@log, qw(--pretty=raw --raw -r));
 	$im = PublicInbox::V2Writable->new($ibx, {nproc => 2});
 	is($im->{shards}, 1, 'detected single shard from previous');
-	my $smsg = $im->remove($mime, 'test removal');
+	my ($mark, $rm_mime, $smsg) = $im->remove($mime, 'test removal');
+	is(content_id($rm_mime), content_id($mime),
+			'removed object returned matches');
+	ok(defined($mark), 'mark set');
 	$im->done;
 	my @after = $git0->qx(@log, qw(--pretty=oneline));
 	my $tip = shift @after;

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 2/2] v2writable: lookup_content => content_exists
  2020-02-24  8:08 [PATCH 0/2] v2writable: reduce smsg->{mime} impact Eric Wong
  2020-02-24  8:08 ` [PATCH 1/2] v2writable: make remove return-compatible w/ Import::remove Eric Wong
@ 2020-02-24  8:08 ` Eric Wong
  2020-02-24  8:08 ` [PATCH] " Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-02-24  8:08 UTC (permalink / raw)
  To: meta

It only needs to return a boolean, since none of the current
callers use the return value for anything beyond a truth test.
Thus avoid a hash table assignment and use of `$smsg->{mime}', here.
---
 lib/PublicInbox/V2Writable.pm | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 573a92aa..b42e6a13 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -201,11 +201,10 @@ sub v2_num_for {
 		# crap, Message-ID is already known, hope somebody just resent:
 		foreach my $m (@$mids) {
 			# read-only lookup now safe to do after above barrier
-			my $existing = lookup_content($self, $mime, $m);
 			# easy, don't store duplicates
 			# note: do not add more diagnostic info here since
 			# it gets noisy on public-inbox-watch restarts
-			return () if $existing;
+			return () if content_exists($self, $mime, $m);
 		}
 
 		# AltId may pre-populate article numbers (e.g. X-Mail-Count
@@ -824,7 +823,7 @@ sub get_blob ($$) {
 	$ibx->msg_by_smsg($smsg);
 }
 
-sub lookup_content ($$$) {
+sub content_exists ($$$) {
 	my ($self, $mime, $mid) = @_;
 	my $over = $self->{over};
 	my $cids = content_ids($mime);
@@ -836,11 +835,7 @@ sub lookup_content ($$$) {
 			next;
 		}
 		my $cur = PublicInbox::MIME->new($msg);
-		if (content_matches($cids, $cur)) {
-			$smsg->{mime} = $cur;
-			return $smsg;
-		}
-
+		return 1 if content_matches($cids, $cur);
 
 		# XXX DEBUG_DIFF is experimental and may be removed
 		diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] v2writable: lookup_content => content_exists
  2020-02-24  8:08 [PATCH 0/2] v2writable: reduce smsg->{mime} impact Eric Wong
  2020-02-24  8:08 ` [PATCH 1/2] v2writable: make remove return-compatible w/ Import::remove Eric Wong
  2020-02-24  8:08 ` [PATCH 2/2] v2writable: lookup_content => content_exists Eric Wong
@ 2020-02-24  8:08 ` " Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-02-24  8:08 UTC (permalink / raw)
  To: meta

It only needs to return a boolean, since none of the current
callers use the return value for anything beyond a truth test,
so avoid a hash assignment and use of `$smsg->{mime}', here.
---
 lib/PublicInbox/V2Writable.pm | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 573a92aa..b42e6a13 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -201,11 +201,10 @@ sub v2_num_for {
 		# crap, Message-ID is already known, hope somebody just resent:
 		foreach my $m (@$mids) {
 			# read-only lookup now safe to do after above barrier
-			my $existing = lookup_content($self, $mime, $m);
 			# easy, don't store duplicates
 			# note: do not add more diagnostic info here since
 			# it gets noisy on public-inbox-watch restarts
-			return () if $existing;
+			return () if content_exists($self, $mime, $m);
 		}
 
 		# AltId may pre-populate article numbers (e.g. X-Mail-Count
@@ -824,7 +823,7 @@ sub get_blob ($$) {
 	$ibx->msg_by_smsg($smsg);
 }
 
-sub lookup_content ($$$) {
+sub content_exists ($$$) {
 	my ($self, $mime, $mid) = @_;
 	my $over = $self->{over};
 	my $cids = content_ids($mime);
@@ -836,11 +835,7 @@ sub lookup_content ($$$) {
 			next;
 		}
 		my $cur = PublicInbox::MIME->new($msg);
-		if (content_matches($cids, $cur)) {
-			$smsg->{mime} = $cur;
-			return $smsg;
-		}
-
+		return 1 if content_matches($cids, $cur);
 
 		# XXX DEBUG_DIFF is experimental and may be removed
 		diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF};

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, back to index

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-24  8:08 [PATCH 0/2] v2writable: reduce smsg->{mime} impact Eric Wong
2020-02-24  8:08 ` [PATCH 1/2] v2writable: make remove return-compatible w/ Import::remove Eric Wong
2020-02-24  8:08 ` [PATCH 2/2] v2writable: lookup_content => content_exists Eric Wong
2020-02-24  8:08 ` [PATCH] " Eric Wong

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git