user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] search: reduce columns stored in Xapian
@ 2018-04-01 23:23 Eric Wong (Contractor, The Linux Foundation)
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-04-01 23:23 UTC (permalink / raw)
  To: meta

We can store :bytes and :lines in doc_data since we never
sort or search by them.  We don't have much use for the Date:
stamp at the moment, either.
---
 lib/PublicInbox/Search.pm            |  9 ++----
 lib/PublicInbox/SearchIdx.pm         | 29 ++++++------------
 lib/PublicInbox/SearchIdxSkeleton.pm | 10 ++-----
 lib/PublicInbox/SearchMsg.pm         | 58 +++++++++++++++++++++---------------
 4 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index de296e1..ca389e3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,12 +8,9 @@ use strict;
 use warnings;
 
 # values for searching
-use constant DS => 0; # Date: header in Unix time
-use constant NUM => 1; # NNTP article number
-use constant BYTES => 2; # :bytes as defined in RFC 3977
-use constant LINES => 3; # :lines as defined in RFC 3977
-use constant TS => 4;  # Received: header in Unix time
-use constant YYYYMMDD => 5; # for searching in the WWW UI
+use constant TS => 0;  # Received: header in Unix time
+use constant YYYYMMDD => 1; # for searching in the WWW UI
+use constant NUM => 2; # NNTP article number
 
 use Search::Xapian qw/:standard/;
 use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 36f97b3..2e0b9a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -114,25 +114,12 @@ sub add_val ($$$) {
 	$doc->add_value($col, $num);
 }
 
-sub add_values ($$) {
-	my ($doc, $values) = @_;
-
-	my $ts = $values->[PublicInbox::Search::TS];
+sub add_values {
+	my ($doc, $ts, $ds, $num) = @_;
 	add_val($doc, PublicInbox::Search::TS, $ts);
-
-	my $num = $values->[PublicInbox::Search::NUM];
-	defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
-
-	my $bytes = $values->[PublicInbox::Search::BYTES];
-	defined($bytes) and add_val($doc, PublicInbox::Search::BYTES, $bytes);
-
-	my $lines = $values->[PublicInbox::Search::LINES];
-	add_val($doc, PublicInbox::Search::LINES, $lines);
-
-	my $ds = $values->[PublicInbox::Search::DS];
-	add_val($doc, PublicInbox::Search::DS, $ds);
 	my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
 	add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+	defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
 }
 
 sub index_users ($$) {
@@ -295,8 +282,10 @@ sub add_message {
 		}
 
 		my $lines = $mime->body_raw =~ tr!\n!\n!;
-		my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
-		add_values($doc, \@values);
+		$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+		defined $bytes or $bytes = length($mime->as_string);
+		$smsg->{bytes} = $bytes;
+		add_values($doc, $smsg->ts, $smsg->ds, $num);
 
 		my $tg = $self->term_generator;
 
@@ -366,8 +355,8 @@ sub add_message {
 
 		$self->delete_article($num) if defined $num; # for reindexing
 		if ($skel) {
-			push @values, $mids, $xpath, $data;
-			$skel->index_skeleton(\@values);
+			my @vals = ($smsg->ts, $num, $mids, $xpath, $data);
+			$skel->index_skeleton(\@vals);
 			$doc->add_boolean_term('Q' . $_) foreach @$mids;
 			$doc->add_boolean_term('XNUM' . $num) if defined $num;
 			$doc_id = $self->{xdb}->add_document($doc);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4f15816..2be6496 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,14 @@ sub remote_remove {
 	die $err if $err;
 }
 
-# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
 sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
-	my $doc_data = pop @$values;
-	my $xpath = pop @$values;
-	my $mids = pop @$values;
+	my ($ts, $num, $mids, $xpath, $doc_data) = @$values;
 	my $smsg = PublicInbox::SearchMsg->new(undef);
+	$smsg->load_from_data($doc_data);
 	my $doc = $smsg->{doc};
-	PublicInbox::SearchIdx::add_values($doc, $values);
 	$doc->set_data($doc_data);
-	$smsg->load_from_data($doc_data);
-	my $num = $values->[PublicInbox::Search::NUM];
+	PublicInbox::SearchIdx::add_values($doc, $ts, $smsg->ds, $num);
 	my @refs = ($smsg->references =~ /<([^>]+)>/g);
 	$self->delete_article($num) if defined $num; # for reindexing
 	$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index e55d401..f5510b8 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -35,20 +35,41 @@ sub get_val ($$) {
 	Search::Xapian::sortable_unserialise($doc->get_value($col));
 }
 
+sub to_doc_data {
+	my ($self, $oid, $mid0) = @_;
+	$oid = '' unless defined $oid;
+	join("\n",
+		$self->subject,
+		$self->from,
+		$self->references,
+		$self->to,
+		$self->cc,
+		$oid,
+		$mid0,
+		$self->ds,
+		$self->{bytes},
+		$self->{lines}
+	);
+}
+
 sub load_from_data ($$) {
 	my ($self) = $_[0]; # data = $_[1]
-	my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]);
-	$self->{subject} = $subj;
-	$self->{from} = $from;
-	$self->{references} = $refs;
+	(
+		$self->{subject},
+		$self->{from},
+		$self->{references},
 
-	# To: and Cc: are stored to optimize HDR/XHDR in NNTP since
-	# some NNTP clients will use that for message displays.
-	$self->{to} = $to;
-	$self->{cc} = $cc;
+		# To: and Cc: are stored to optimize HDR/XHDR in NNTP since
+		# some NNTP clients will use that for message displays.
+		$self->{to},
+		$self->{cc},
 
-	$self->{blob} = $blob;
-	$self->{mid} = $mid0;
+		$self->{blob},
+		$self->{mid},
+		$self->{ds},
+		$self->{bytes},
+		$self->{lines}
+	) = split(/\n/, $_[1]);
 }
 
 sub load_expand {
@@ -56,7 +77,6 @@ sub load_expand {
 	my $doc = $self->{doc};
 	my $data = $doc->get_data or return;
 	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
-	$self->{ds} = get_val($doc, &PublicInbox::Search::DS);
 	utf8::decode($data);
 	load_from_data($self, $data);
 	$self;
@@ -69,11 +89,9 @@ sub load_doc {
 }
 
 # :bytes and :lines metadata in RFC 3977
-sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
-sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) {
-	$_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM())
-}
+sub bytes ($) { $_[0]->{bytes} }
+sub lines ($) { $_[0]->{lines} }
+sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
 
 sub __hdr ($$) {
 	my ($self, $field) = @_;
@@ -134,14 +152,6 @@ sub ds {
 	$self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
 }
 
-sub to_doc_data {
-	my ($self, $oid, $mid0) = @_;
-	my @rows = ($self->subject, $self->from, $self->references,
-			$self->to, $self->cc);
-	$oid = '' unless defined $oid;
-	join("\n", @rows, $oid, $mid0);
-}
-
 sub references {
 	my ($self) = @_;
 	my $x = $self->{references};
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH] search: reduce columns stored in Xapian
@ 2018-04-01 23:31 Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2018-04-01 23:31 UTC (permalink / raw)
  To: meta

We can store :bytes and :lines in doc_data since we never
sort or search by them.  We don't have much use for the Date:
stamp at the moment, either.
---
 Publishing for documentation purposes; this will be obsoleted
 next, as the v2 changes will bump SCHEMA_VERSION.

 lib/PublicInbox/Search.pm            |  9 ++----
 lib/PublicInbox/SearchIdx.pm         | 29 ++++++------------
 lib/PublicInbox/SearchIdxSkeleton.pm | 10 ++-----
 lib/PublicInbox/SearchMsg.pm         | 58 +++++++++++++++++++++---------------
 4 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index de296e1..ca389e3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,12 +8,9 @@ use strict;
 use warnings;
 
 # values for searching
-use constant DS => 0; # Date: header in Unix time
-use constant NUM => 1; # NNTP article number
-use constant BYTES => 2; # :bytes as defined in RFC 3977
-use constant LINES => 3; # :lines as defined in RFC 3977
-use constant TS => 4;  # Received: header in Unix time
-use constant YYYYMMDD => 5; # for searching in the WWW UI
+use constant TS => 0;  # Received: header in Unix time
+use constant YYYYMMDD => 1; # for searching in the WWW UI
+use constant NUM => 2; # NNTP article number
 
 use Search::Xapian qw/:standard/;
 use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 36f97b3..2e0b9a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -114,25 +114,12 @@ sub add_val ($$$) {
 	$doc->add_value($col, $num);
 }
 
-sub add_values ($$) {
-	my ($doc, $values) = @_;
-
-	my $ts = $values->[PublicInbox::Search::TS];
+sub add_values {
+	my ($doc, $ts, $ds, $num) = @_;
 	add_val($doc, PublicInbox::Search::TS, $ts);
-
-	my $num = $values->[PublicInbox::Search::NUM];
-	defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
-
-	my $bytes = $values->[PublicInbox::Search::BYTES];
-	defined($bytes) and add_val($doc, PublicInbox::Search::BYTES, $bytes);
-
-	my $lines = $values->[PublicInbox::Search::LINES];
-	add_val($doc, PublicInbox::Search::LINES, $lines);
-
-	my $ds = $values->[PublicInbox::Search::DS];
-	add_val($doc, PublicInbox::Search::DS, $ds);
 	my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
 	add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+	defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
 }
 
 sub index_users ($$) {
@@ -295,8 +282,10 @@ sub add_message {
 		}
 
 		my $lines = $mime->body_raw =~ tr!\n!\n!;
-		my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
-		add_values($doc, \@values);
+		$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+		defined $bytes or $bytes = length($mime->as_string);
+		$smsg->{bytes} = $bytes;
+		add_values($doc, $smsg->ts, $smsg->ds, $num);
 
 		my $tg = $self->term_generator;
 
@@ -366,8 +355,8 @@ sub add_message {
 
 		$self->delete_article($num) if defined $num; # for reindexing
 		if ($skel) {
-			push @values, $mids, $xpath, $data;
-			$skel->index_skeleton(\@values);
+			my @vals = ($smsg->ts, $num, $mids, $xpath, $data);
+			$skel->index_skeleton(\@vals);
 			$doc->add_boolean_term('Q' . $_) foreach @$mids;
 			$doc->add_boolean_term('XNUM' . $num) if defined $num;
 			$doc_id = $self->{xdb}->add_document($doc);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4f15816..2be6496 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,14 @@ sub remote_remove {
 	die $err if $err;
 }
 
-# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
 sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
-	my $doc_data = pop @$values;
-	my $xpath = pop @$values;
-	my $mids = pop @$values;
+	my ($ts, $num, $mids, $xpath, $doc_data) = @$values;
 	my $smsg = PublicInbox::SearchMsg->new(undef);
+	$smsg->load_from_data($doc_data);
 	my $doc = $smsg->{doc};
-	PublicInbox::SearchIdx::add_values($doc, $values);
 	$doc->set_data($doc_data);
-	$smsg->load_from_data($doc_data);
-	my $num = $values->[PublicInbox::Search::NUM];
+	PublicInbox::SearchIdx::add_values($doc, $ts, $smsg->ds, $num);
 	my @refs = ($smsg->references =~ /<([^>]+)>/g);
 	$self->delete_article($num) if defined $num; # for reindexing
 	$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index e55d401..f5510b8 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -35,20 +35,41 @@ sub get_val ($$) {
 	Search::Xapian::sortable_unserialise($doc->get_value($col));
 }
 
+sub to_doc_data {
+	my ($self, $oid, $mid0) = @_;
+	$oid = '' unless defined $oid;
+	join("\n",
+		$self->subject,
+		$self->from,
+		$self->references,
+		$self->to,
+		$self->cc,
+		$oid,
+		$mid0,
+		$self->ds,
+		$self->{bytes},
+		$self->{lines}
+	);
+}
+
 sub load_from_data ($$) {
 	my ($self) = $_[0]; # data = $_[1]
-	my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]);
-	$self->{subject} = $subj;
-	$self->{from} = $from;
-	$self->{references} = $refs;
+	(
+		$self->{subject},
+		$self->{from},
+		$self->{references},
 
-	# To: and Cc: are stored to optimize HDR/XHDR in NNTP since
-	# some NNTP clients will use that for message displays.
-	$self->{to} = $to;
-	$self->{cc} = $cc;
+		# To: and Cc: are stored to optimize HDR/XHDR in NNTP since
+		# some NNTP clients will use that for message displays.
+		$self->{to},
+		$self->{cc},
 
-	$self->{blob} = $blob;
-	$self->{mid} = $mid0;
+		$self->{blob},
+		$self->{mid},
+		$self->{ds},
+		$self->{bytes},
+		$self->{lines}
+	) = split(/\n/, $_[1]);
 }
 
 sub load_expand {
@@ -56,7 +77,6 @@ sub load_expand {
 	my $doc = $self->{doc};
 	my $data = $doc->get_data or return;
 	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
-	$self->{ds} = get_val($doc, &PublicInbox::Search::DS);
 	utf8::decode($data);
 	load_from_data($self, $data);
 	$self;
@@ -69,11 +89,9 @@ sub load_doc {
 }
 
 # :bytes and :lines metadata in RFC 3977
-sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
-sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) {
-	$_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM())
-}
+sub bytes ($) { $_[0]->{bytes} }
+sub lines ($) { $_[0]->{lines} }
+sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
 
 sub __hdr ($$) {
 	my ($self, $field) = @_;
@@ -134,14 +152,6 @@ sub ds {
 	$self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
 }
 
-sub to_doc_data {
-	my ($self, $oid, $mid0) = @_;
-	my @rows = ($self->subject, $self->from, $self->references,
-			$self->to, $self->cc);
-	$oid = '' unless defined $oid;
-	join("\n", @rows, $oid, $mid0);
-}
-
 sub references {
 	my ($self) = @_;
 	my $x = $self->{references};
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-04-01 23:31 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-01 23:23 [PATCH] search: reduce columns stored in Xapian Eric Wong (Contractor, The Linux Foundation)
  -- strict thread matches above, loose matches on Subject: below --
2018-04-01 23:31 Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).