user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 7/8] store less data in the Xapian document
Date: Sat,  7 Apr 2018 03:41:53 +0000	[thread overview]
Message-ID: <20180407034154.2309-8-e@80x24.org> (raw)
In-Reply-To: <20180407034154.2309-1-e@80x24.org>

Since we only query the SQLite over DB for OVER/XOVER, we do not
need to waste space storing the To/Cc/:bytes/:lines fields or the
XNUM term.  We only use From/Subject/References/Message-ID/:blob
in various places of the PSGI code.

For reindexing, we will take advantage of docid stability
in "xapian-compact --no-renumber" to ensure duplicates do not
show up in search results.  Since the PSGI interface is the
only consumer of Xapian at the moment, it has no need to
search based on NNTP article number.
---
 lib/PublicInbox/NNTP.pm       |  2 +-
 lib/PublicInbox/OverIdx.pm    |  6 +++---
 lib/PublicInbox/SearchIdx.pm  | 37 ++++---------------------------------
 lib/PublicInbox/SearchMsg.pm  |  6 ++----
 lib/PublicInbox/V2Writable.pm |  2 +-
 script/public-inbox-compact   |  6 +++---
 t/search.t                    | 24 +++++++++++++-----------
 t/v2writable.t                |  7 ++++---
 8 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index fa890cb..ace56e7 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -725,7 +725,7 @@ sub hdr_searchmsg ($$$$) {
 			my $nr = scalar @$msgs or return;
 			my $tmp = '';
 			foreach my $s (@$msgs) {
-				$tmp .= $s->num . ' ' . $s->$field . "\r\n";
+				$tmp .= $s->{num} . ' ' . $s->$field . "\r\n";
 			}
 			utf8::encode($tmp);
 			do_more($self, $tmp);
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 08f8744..62fec0d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -207,8 +207,8 @@ sub link_refs {
 	$tid;
 }
 
-sub parse_references ($$$$) {
-	my ($self, $smsg, $mid0, $mids) = @_;
+sub parse_references ($$$) {
+	my ($smsg, $mid0, $mids) = @_;
 	my $mime = $smsg->{mime};
 	my $hdr = $mime->header_obj;
 	my $refs = references($hdr);
@@ -241,7 +241,7 @@ sub add_overview {
 		blob => $oid,
 	}, 'PublicInbox::SearchMsg';
 	my $mids = mids($mime->header_obj);
-	my $refs = $self->parse_references($smsg, $mid0, $mids);
+	my $refs = parse_references($smsg, $mid0, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
 	if ($subj ne '') {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 7cfa745..f9b40b0 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -273,18 +273,12 @@ sub add_message {
 		my $smsg = PublicInbox::SearchMsg->new($mime);
 		my $doc = $smsg->{doc};
 		my $subj = $smsg->subject;
-
-		$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
-		defined $bytes or $bytes = length($mime->as_string);
-		$smsg->{bytes} = $bytes;
-
 		add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
 		my @ds = gmtime($smsg->ds);
 		my $yyyymmdd = strftime('%Y%m%d', @ds);
 		add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
 		my $dt = strftime('%Y%m%d%H%M%S', @ds);
 		add_val($doc, PublicInbox::Search::DT(), $dt);
-		my @vals = ($smsg->{ts}, $smsg->{ds});
 
 		my $tg = $self->term_generator;
 
@@ -333,11 +327,11 @@ sub add_message {
 			index_body($tg, \@orig, $doc) if @orig;
 		});
 
-		# populates smsg->references for smsg->to_doc_data
-		my $data = $smsg->to_doc_data($oid, $mid0);
 		foreach my $mid (@$mids) {
 			$tg->index_text($mid, 1, 'XM');
 		}
+		$smsg->{to} = $smsg->{cc} = '';
+		my $data = $smsg->to_doc_data($oid, $mid0);
 		$doc->set_data($data);
 		if (my $altid = $self->{-altid}) {
 			foreach my $alt (@$altid) {
@@ -350,24 +344,11 @@ sub add_message {
 			}
 		}
 
-		$self->delete_article($num) if defined $num; # for reindexing
-
 		if (my $over = $self->{over}) {
-			utf8::encode($data);
-			$data = compress($data);
-			my $refs = $over->parse_references($smsg, $mid0, $mids);
-			my $xpath;
-			if ($subj ne '') {
-				$xpath = $self->subject_path($subj);
-				$xpath = id_compress($xpath);
-			}
-
-			push @vals, $num, $mids, $refs, $xpath, $data;
-			$over->add_over(\@vals);
+			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
 		}
 		$doc->add_boolean_term('Q' . $_) foreach @$mids;
-		$doc->add_boolean_term('XNUM' . $num) if defined $num;
-		$doc_id = $self->{xdb}->add_document($doc);
+		$self->{xdb}->replace_document($doc_id = $num, $doc);
 	};
 
 	if ($@) {
@@ -419,16 +400,6 @@ sub remove_message {
 	}
 }
 
-sub delete_article {
-	my ($self, $num) = @_;
-	my $ndel = 0;
-	batch_do($self, 'XNUM' . $num, sub {
-		my ($ids) = @_;
-		$ndel += scalar @$ids;
-		$self->{xdb}->delete_document($_) for @$ids;
-	});
-}
-
 # MID is a hint in V2
 sub remove_by_oid {
 	my ($self, $oid, $mid) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 3278802..ab971e0 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -45,12 +45,11 @@ sub to_doc_data {
 		$self->cc,
 		$oid,
 		$mid0,
-		$self->{bytes},
-		$self->{lines}
+		$self->{bytes} || '',
+		$self->{lines} || ''
 	);
 }
 
-
 sub load_from_data ($$) {
 	my ($self) = $_[0]; # data = $_[1]
 	(
@@ -92,7 +91,6 @@ sub load_doc {
 # :bytes and :lines metadata in RFC 3977
 sub bytes ($) { $_[0]->{bytes} }
 sub lines ($) { $_[0]->{lines} }
-sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
 
 sub __hdr ($$) {
 	my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 53fdb73..1cc4b00 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -800,7 +800,7 @@ sub unindex_oid {
 		my %gone;
 		my ($id, $prev);
 		while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
-			$gone{$smsg->num} = 1 if $oid eq $smsg->{blob};
+			$gone{$smsg->{num}} = 1 if $oid eq $smsg->{blob};
 			1; # continue
 		}
 		my $n = scalar keys %gone;
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index d855b9e..9f33265 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -48,7 +48,7 @@ sub commit_changes ($$$) {
 	$im->lock_release;
 	remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
 }
-
+my @compact = qw(xapian-compact --no-renumber);
 if ($v == 2) {
 	require PublicInbox::V2Writable;
 	my $v2w = PublicInbox::V2Writable->new($ibx);
@@ -70,7 +70,7 @@ if ($v == 2) {
 		}
 		close $dh;
 		die "No Xapian parts found in $old\n" unless @parts;
-		my $cmd = ['xapian-compact', @parts, "$new/0" ];
+		my $cmd = [@compact, @parts, "$new/0" ];
 		PublicInbox::Import::run_die($cmd);
 		commit_changes($v2w, $old, $new);
 	});
@@ -84,7 +84,7 @@ if ($v == 2) {
 	my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $v1_root);
 	$ibx->with_umask(sub {
 		$im->lock_acquire;
-		PublicInbox::Import::run_die(['xapian-compact', $old, $new]);
+		PublicInbox::Import::run_die([@compact, $old, $new]);
 		commit_changes($im, $old, $new);
 	});
 } else {
diff --git a/t/search.t b/t/search.t
index fda32d3..516f567 100644
--- a/t/search.t
+++ b/t/search.t
@@ -306,31 +306,33 @@ sub filter_mids {
 
 # names and addresses
 {
-	my $res = $ro->query('t:list@example.com');
-	is(scalar @$res, 6, 'searched To: successfully');
-	foreach my $smsg (@$res) {
+	my $mset = $ro->query('t:list@example.com', {mset => 1});
+	is($mset->size, 6, 'searched To: successfully');
+	foreach my $m ($mset->items) {
+		my $smsg = $ro->lookup_article($m->get_docid);
 		like($smsg->to, qr/\blist\@example\.com\b/, 'to appears');
 	}
 
-	$res = $ro->query('tc:list@example.com');
-	is(scalar @$res, 6, 'searched To+Cc: successfully');
-	foreach my $smsg (@$res) {
+	$mset = $ro->query('tc:list@example.com', {mset => 1});
+	is($mset->size, 6, 'searched To+Cc: successfully');
+	foreach my $m ($mset->items) {
+		my $smsg = $ro->lookup_article($m->get_docid);
 		my $tocc = join("\n", $smsg->to, $smsg->cc);
 		like($tocc, qr/\blist\@example\.com\b/, 'tocc appears');
 	}
 
 	foreach my $pfx ('tcf:', 'c:') {
-		$res = $ro->query($pfx . 'foo@example.com');
-		is(scalar @$res, 1,
-			"searched $pfx successfully for Cc:");
-		foreach my $smsg (@$res) {
+		my $mset = $ro->query($pfx . 'foo@example.com', { mset => 1 });
+		is($mset->items, 1, "searched $pfx successfully for Cc:");
+		foreach my $m ($mset->items) {
+			my $smsg = $ro->lookup_article($m->get_docid);
 			like($smsg->cc, qr/\bfoo\@example\.com\b/,
 				'cc appears');
 		}
 	}
 
 	foreach my $pfx ('', 'tcf:', 'f:') {
-		$res = $ro->query($pfx . 'Laggy');
+		my $res = $ro->query($pfx . 'Laggy');
 		is(scalar(@$res), 1,
 			"searched $pfx successfully for From:");
 		foreach my $smsg (@$res) {
diff --git a/t/v2writable.t b/t/v2writable.t
index b543c53..85fb6a6 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -220,13 +220,14 @@ EOF
 		'commit message propagated to git');
 	is_deeply(\@after, \@before, 'only one commit written to git');
 	is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
-	like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
-	is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+	my $num = $smsg->{num};
+	like($num, qr/\A\d+\z/, 'numeric number in return message');
+	is($ibx->mm->mid_for($num), undef, 'no longer in Msgmap by num');
 	my $srch = $ibx->search->reopen;
 	my $mset = $srch->query('m:'.$smsg->mid, { mset => 1});
 	is($mset->size, 0, 'no longer found in Xapian');
 	my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames);
-	is($srch->{over_ro}->get_art($smsg->num), undef,
+	is($srch->{over_ro}->get_art($num), undef,
 		'removal propagated to Over DB');
 
 	my $after = $git0->qx(@log1);
-- 
EW


  parent reply	other threads:[~2018-04-07  3:41 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-07  3:41 [PATCH 0/8] less code, less memory, more speed Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 1/8] psgi: ensure /$INBOX/$MESSAGE_ID/T/ endpoint is chronological Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 2/8] over: avoid excessive SELECT Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 3/8] over: remove forked subprocess Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 4/8] v2writable: reduce barriers Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 5/8] index: allow specifying --jobs=0 to disable multiprocess Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` [PATCH 6/8] convert: support converting with altid defined Eric Wong (Contractor, The Linux Foundation)
2018-04-07  3:41 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-04-07  3:41 ` [PATCH 8/8] msgmap: speed up minmax with separate queries Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180407034154.2309-8-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).