user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 6/9] *idx: pass $smsg in more places instead of many args
Date: Fri, 20 Mar 2020 08:18:18 +0000
Message-ID: <20200320081821.21715-7-e@yhbt.net> (raw)
In-Reply-To: <20200320081821.21715-1-e@yhbt.net>

We can pass blessed PublicInbox::Smsg objects to internal
indexing APIs instead of having long parameter lists in some
places.  The end goal is to avoid parsing redundant information
each step of the way and hopefully make things more
understandable.
---
 lib/PublicInbox/OverIdx.pm        | 14 +++-------
 lib/PublicInbox/SearchIdx.pm      | 45 ++++++++++++++++++-------------
 lib/PublicInbox/SearchIdxShard.pm | 16 +++++++++--
 lib/PublicInbox/V2Writable.pm     |  8 +++++-
 t/search-thr-index.t              | 17 ++++++++++--
 5 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index f49dfa00..2d71956d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -245,15 +245,9 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_;
-	my $lines = $mime->body_raw =~ tr!\n!\n!;
-	my $smsg = bless {
-		mime => $mime,
-		mid => $mid0,
-		bytes => $bytes,
-		lines => $lines,
-		blob => $oid,
-	}, 'PublicInbox::Smsg';
+	my ($self, $mime, $smsg, $times) = @_;
+	$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+	$smsg->{mime} = $mime; # XXX temporary?
 	my $hdr = $mime->header_obj;
 	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $hdr, $mids);
@@ -268,7 +262,7 @@ sub add_overview {
 	$dd = compress($dd);
 	my $ds = msg_timestamp($hdr, $times->{autime});
 	my $ts = msg_datestamp($hdr, $times->{cotime});
-	my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ];
+	my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ];
 	add_over($self, $values);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 32be9c3f..5ca819c3 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -306,9 +306,9 @@ sub index_xapian { # msg_iter callback
 	index_body($self, $_, /\A>/ ? 0 : $doc) for @sections;
 }
 
-sub add_xapian ($$$$$$) {
-	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
-	my $smsg = PublicInbox::Smsg->new($mime);
+sub add_xapian ($$$$) {
+	my ($self, $mime, $smsg, $mids) = @_;
+	$smsg->{mime} = $mime; # XXX dangerous
 	my $hdr = $mime->header_obj;
 	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
 	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
@@ -338,9 +338,7 @@ sub add_xapian ($$$$$$) {
 			index_text($self, join(' ', @long), 1, 'XM');
 		}
 	}
-	$smsg->{to} = $smsg->{cc} = '';
-	$smsg->{blob} = $oid;
-	$smsg->{mid} = $mid0;
+	$smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
 	PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
 	my $data = $smsg->to_doc_data;
 	$doc->set_data($data);
@@ -355,7 +353,7 @@ sub add_xapian ($$$$$$) {
 		}
 	}
 	$doc->add_boolean_term('Q' . $_) foreach @$mids;
-	$self->{xdb}->replace_document($num, $doc);
+	$self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
 sub _msgmap_init ($) {
@@ -369,20 +367,25 @@ sub _msgmap_init ($) {
 
 sub add_message {
 	# mime = Email::MIME object
-	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $mids = mids_for_index($mime->header_obj);
-	$mid0 //= $mids->[0]; # v1 compatibility
-	$num //= do { # v1
+	$smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat
+	$smsg->{mid} //= $mids->[0]; # v1 compatibility
+	$smsg->{num} //= do { # v1
 		_msgmap_init($self);
 		index_mm($self, $mime);
 	};
 	eval {
-		if (need_xapian($self)) {
-			add_xapian($self, $mime, $num, $oid, $mids, $mid0);
+		# order matters, overview stores every possible piece of
+		# data in doc_data (deflated).  Xapian only stores a subset
+		# of the fields which exist in over.sqlite3.  We may stop
+		# storing doc_data in Xapian sometime after we get multi-inbox
+		# search working.
+		if (my $over = $self->{over}) { # v1 only
+			$over->add_overview($mime, $smsg, $self);
 		}
-		if (my $over = $self->{over}) {
-			$over->add_overview($mime, $bytes, $num, $oid, $mid0,
-						$self);
+		if (need_xapian($self)) {
+			add_xapian($self, $mime, $smsg, $mids);
 		}
 	};
 
@@ -390,7 +393,7 @@ sub add_message {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$num;
+	$smsg->{num};
 }
 
 # returns begin and end PostingIterator
@@ -530,9 +533,10 @@ sub unindex_mm {
 }
 
 sub index_both {
-	my ($self, $mime, $bytes, $blob) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $num = index_mm($self, $mime);
-	add_message($self, $mime, $bytes, $num, $blob);
+	$smsg->{num} = $num;
+	add_message($self, $mime, $smsg);
 }
 
 sub unindex_both {
@@ -595,8 +599,11 @@ sub read_log {
 				next;
 			}
 			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+			my $smsg = bless {}, 'PublicInbox::Smsg';
 			batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
-			$add_cb->($self, $mime, $bytes, $blob);
+			$smsg->{blob} = $blob;
+			$smsg->{bytes} = $bytes;
+			$add_cb->($self, $mime, $smsg);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
 			$D{$blob} = 1;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 74c624a4..d29e6090 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -76,7 +76,13 @@ sub shard_worker_loop ($$$$$) {
 			$artnum = int($artnum);
 			$self->{autime} = $autime;
 			$self->{cotime} = $cotime;
-			$self->add_message($mime, $n, $artnum, $oid, $mid0);
+			my $smsg = bless {
+				bytes => $len,
+				num => $artnum,
+				blob => $oid,
+				mid => $mid0,
+			}, 'PublicInbox::Smsg';
+			$self->add_message($mime, $smsg);
 		}
 	}
 	$self->worker_done;
@@ -95,7 +101,13 @@ sub index_raw {
 		$self->begin_txn_lazy;
 		$self->{autime} = $at;
 		$self->{cotime} = $ct;
-		$self->add_message($mime, $bytes, $artnum, $oid, $mid0);
+		my $smsg = bless {
+			bytes => $bytes,
+			num => $artnum,
+			blob => $oid,
+			mid => $mid0,
+		}, 'PublicInbox::Smsg';
+		$self->add_message($mime, $smsg);
 	}
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index d39a6f89..34dd139b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -150,7 +150,13 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$$$$) {
 	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self);
+	my $smsg = bless {
+		bytes => $len,
+		num => $num,
+		blob => $oid,
+		mid => $mid0,
+	}, 'PublicInbox::Smsg';
+	$self->{over}->add_overview($mime, $smsg, $self);
 	my $idx = idx_shard($self, $num % $self->{shards});
 	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
 	my $n = $self->{transact_bytes} += $len;
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
index 6a5fd919..f073304a 100644
--- a/t/search-thr-index.t
+++ b/t/search-thr-index.t
@@ -9,6 +9,7 @@ use PublicInbox::MID qw(mids);
 use Email::MIME;
 require_mods(qw(DBD::SQLite Search::Xapian));
 require PublicInbox::SearchIdx;
+require PublicInbox::Smsg;
 require PublicInbox::Inbox;
 my ($tmpdir, $for_destroy) = tmpdir();
 my $git_dir = "$tmpdir/a.git";
@@ -45,7 +46,13 @@ foreach (reverse split(/\n\n/, $data)) {
 	$mime->header_set('To' => 'git@vger.kernel.org');
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => ++$num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	push @mids, $mid;
 	ok($doc_id, 'message added: '. $mid);
 }
@@ -86,7 +93,13 @@ SELECT tid FROM over WHERE num = ? LIMIT 1
 
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, $num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => $num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	ok($doc_id, 'message reindexed'. $mid);
 	is($doc_id, $num, "article number unchanged: $num");
 

  parent reply index

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-05  3:23 [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-05  5:13 ` Eric Wong
2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
2020-03-20  8:18     ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
2020-03-20  8:18     ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
2020-03-20  8:18     ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
2020-03-20  8:18     ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
2020-03-20  8:18     ` Eric Wong [this message]
2020-03-20  8:18     ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
2020-03-20  8:18     ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
2020-03-20  8:18     ` [PATCH 9/9] v2: SDBM-based multi Message-ID queue Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200320081821.21715-7-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git