user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 01/11] search: reduce redundant doc data
Date: Tue,  1 Sep 2015 08:55:18 +0000	[thread overview]
Message-ID: <1441097728-31950-1-git-send-email-e@80x24.org> (raw)

Redundant document data increases our database size.  Pull the
smsg->mid off the unique term and the smsg->ts off the value, and
only generate the formatted display date off smsg->ts.
---
 lib/PublicInbox/Search.pm    |  7 ++++---
 lib/PublicInbox/SearchIdx.pm |  2 --
 lib/PublicInbox/SearchMsg.pm | 42 ++++++++++++++++++++++--------------------
 lib/PublicInbox/View.pm      | 15 ++++++++-------
 4 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index d3faaeb..b7b215f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -4,8 +4,9 @@
 package PublicInbox::Search;
 use strict;
 use warnings;
-use PublicInbox::SearchMsg;
+use constant TS => 0;
 use Search::Xapian qw/:standard/;
+use PublicInbox::SearchMsg;
 use Email::MIME;
 use PublicInbox::MID qw/mid_clean mid_compress/;
 
@@ -15,7 +16,6 @@ our $REPLY_RE = qr/^re:\s+/i;
 our $LANG = 'english';
 
 use constant {
-	TS => 0,
 	# SCHEMA_VERSION history
 	# 0 - initial
 	# 1 - subject_path is lower-cased
@@ -25,7 +25,8 @@ use constant {
 	# 5 - subject_path drops trailing '.'
 	# 6 - preserve References: order in document data
 	# 7 - remove references and inreplyto terms
-	SCHEMA_VERSION => 7,
+	# 8 - remove redundant/unneeded document data
+	SCHEMA_VERSION => 8,
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index dec3333..32e0714 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -81,8 +81,6 @@ sub add_message {
 			$doc->add_term(xpfx('path') . mid_compress($path));
 		}
 
-		my $from = $smsg->from_name;
-		my $date = $smsg->date;
 		my $ts = Search::Xapian::sortable_serialise($smsg->ts);
 		$doc->add_value(PublicInbox::Search::TS, $ts);
 
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 4ad8a0c..1821b07 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -13,6 +13,7 @@ use PublicInbox::MID qw/mid_clean mid_compress/;
 use Encode qw/find_encoding/;
 my $enc_utf8 = find_encoding('UTF-8');
 our $PFX2TERM_RE = undef;
+use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000';
 
 sub new {
 	my ($class, $mime) = @_;
@@ -30,13 +31,17 @@ sub wrap {
 sub load_doc {
 	my ($class, $doc) = @_;
 	my $data = $doc->get_data;
+	my $ts = eval {
+		no strict 'subs';
+		$doc->get_value(PublicInbox::Search::TS);
+	};
+	$ts = Search::Xapian::sortable_unserialise($ts);
 	$data = $enc_utf8->decode($data);
-	my ($mid, $subj, $from, $date, $refs) = split(/\n/, $data);
+	my ($subj, $from, $refs) = split(/\n/, $data);
 	bless {
 		doc => $doc,
-		mid => $mid,
 		subject => $subj,
-		date => $date,
+		ts => $ts,
 		from_name => $from,
 		references_sorted => $refs,
 	}, $class;
@@ -77,27 +82,13 @@ sub from_name {
 
 sub ts {
 	my ($self) = @_;
-	my $ts = $self->{ts};
-	return $ts if $ts;
-	$self->{ts} = eval {
-		str2time($self->date || $self->mime->header('Date'))
-	} || 0;
-}
-
-sub date {
-	my ($self) = @_;
-	my $date = $self->{date};
-	return $date if $date;
-	my $ts = eval { str2time($self->mime->header('Date')) };
-	$self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
+	$self->{ts} ||= eval { str2time($self->mime->header('Date')) } || 0;
 }
 
 sub to_doc_data {
 	my ($self) = @_;
-	$self->mid . "\n" .
 	PublicInbox::Search::subject_summary($self->subject) . "\n" .
 	$self->from_name . "\n".
-	$self->date . "\n" .
 	$self->references_sorted;
 }
 
@@ -139,14 +130,23 @@ sub mini_mime {
 	my @h = (
 		Subject => $self->subject,
 		'X-PI-From' => $self->from_name,
-		'X-PI-Date' => $self->date,
 		'X-PI-TS' => $self->ts,
 		'Message-ID' => "<$self->{mid}>",
+
+		# prevent Email::Simple::Creator from running,
+		# this header is useless for threading as we use X-PI-TS
+		# for sorting and display:
+		'Date' => EPOCH_822,
 	);
 
 	my $refs = $self->{references_sorted};
 	my $mime = Email::MIME->create(header_str => \@h);
-	$mime->header_set('References', $refs) if (defined $refs);
+	my $h = $mime->header_obj;
+	$h->header_set('References', $refs) if (defined $refs);
+
+	# drop useless headers Email::MIME set for us
+	$h->header_set('Date');
+	$h->header_set('MIME-Version');
 	$mime;
 }
 
@@ -155,6 +155,8 @@ sub mid {
 
 	if (defined $mid) {
 		$self->{mid} = $mid;
+	} elsif (my $rv = $self->{mid}) {
+		$rv;
 	} else {
 		$self->ensure_metadata; # needed for ghosts
 		$self->{mid} ||= $self->_extract_mid;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 584a2d7..477c4b6 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -86,12 +86,7 @@ sub index_entry {
 		$subj = "<u\nid=\"u\">$subj</u>";
 	}
 
-	my $ts = $mime->header('X-PI-TS');
-	unless (defined $ts) {
-		$ts = msg_timestamp($mime);
-	}
-	$ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
-
+	my $ts = _msg_date($mime);
 	my $rv = "<table\nsummary=l$level><tr>";
 	if ($level) {
 		$rv .= '<td><pre>' . ('  ' x $level) . '</pre></td>';
@@ -561,6 +556,12 @@ sub missing_thread {
 EOF
 }
 
+sub _msg_date {
+	my ($mime) = @_;
+	my $ts = $mime->header('X-PI-TS') || msg_timestamp($mime);
+	POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
+}
+
 sub _inline_header {
 	my ($dst, $state, $mime, $level) = @_;
 	my $pfx = '  ' x $level;
@@ -568,7 +569,7 @@ sub _inline_header {
 	my $cur = $state->{cur};
 	my $mid = $mime->header('Message-ID');
 	my $f = $mime->header('X-PI-From');
-	my $d = $mime->header('X-PI-Date');
+	my $d = _msg_date($mime);
 	$f = PublicInbox::Hval->new($f);
 	$d = PublicInbox::Hval->new($d);
 	$f = $f->as_html;
-- 
EW


             reply	other threads:[~2015-09-01  8:55 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-01  8:55 Eric Wong [this message]
2015-09-01  8:55 ` [PATCH 02/11] search: allow querying all mail with '' Eric Wong
2015-09-01  8:55 ` [PATCH 03/11] search: show newest results first Eric Wong
2015-09-01  8:55 ` [PATCH 04/11] feed: use updated date based on git commit date Eric Wong
2015-09-01  8:55 ` [PATCH 05/11] feed: extract atom header generation Eric Wong
2015-09-01  8:55 ` [PATCH 06/11] implement per-thread Atom feeds Eric Wong
2015-09-01  9:30   ` [13/11 PATCH] feed: fix <updated> tag in Atom feed Eric Wong
2015-09-01  8:55 ` [PATCH 07/11] www: compile mbox regexp only once Eric Wong
2015-09-01  8:55 ` [PATCH 08/11] www: root atom feed is "new.atom" and not "atom.xml" Eric Wong
2015-09-01  8:55 ` [PATCH 09/11] completely revamp URL structure to shorten permalinks Eric Wong
2015-09-01  8:55 ` [PATCH 10/11] view: drop extra '</a>' tag Eric Wong
2015-09-01  8:55 ` [PATCH 11/11] view: more robust link generation Eric Wong
2015-09-01  9:08   ` [PATCH 12/11] view: add missing space Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1441097728-31950-1-git-send-email-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank
  line before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).