From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.1 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=unavailable version=3.3.2 X-Original-To: meta@public-inbox.org Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C5DFC1FAAF for ; Tue, 1 Sep 2015 08:55:28 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 01/11] search: reduce redundant doc data Date: Tue, 1 Sep 2015 08:55:18 +0000 Message-Id: <1441097728-31950-1-git-send-email-e@80x24.org> List-Id: Redundant document data increases our database size, pull the smsg->mid off the unique term, the smsg->ts off the value, and only generate the formatted display date off smsg->ts. --- lib/PublicInbox/Search.pm | 7 ++++--- lib/PublicInbox/SearchIdx.pm | 2 -- lib/PublicInbox/SearchMsg.pm | 42 ++++++++++++++++++++++-------------------- lib/PublicInbox/View.pm | 15 ++++++++------- 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index d3faaeb..b7b215f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -4,8 +4,9 @@ package PublicInbox::Search; use strict; use warnings; -use PublicInbox::SearchMsg; +use constant TS => 0; use Search::Xapian qw/:standard/; +use PublicInbox::SearchMsg; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compress/; @@ -15,7 +16,6 @@ our $REPLY_RE = qr/^re:\s+/i; our $LANG = 'english'; use constant { - TS => 0, # SCHEMA_VERSION history # 0 - initial # 1 - subject_path is lower-cased @@ -25,7 +25,8 @@ use constant { # 5 - subject_path drops trailing '.' # 6 - preserve References: order in document data # 7 - remove references and inreplyto terms - SCHEMA_VERSION => 7, + # 8 - remove redundant/unneeded document data + SCHEMA_VERSION => 8, QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index dec3333..32e0714 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -81,8 +81,6 @@ sub add_message { $doc->add_term(xpfx('path') . mid_compress($path)); } - my $from = $smsg->from_name; - my $date = $smsg->date; my $ts = Search::Xapian::sortable_serialise($smsg->ts); $doc->add_value(PublicInbox::Search::TS, $ts); diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 4ad8a0c..1821b07 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -13,6 +13,7 @@ use PublicInbox::MID qw/mid_clean mid_compress/; use Encode qw/find_encoding/; my $enc_utf8 = find_encoding('UTF-8'); our $PFX2TERM_RE = undef; +use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000'; sub new { my ($class, $mime) = @_; @@ -30,13 +31,17 @@ sub wrap { sub load_doc { my ($class, $doc) = @_; my $data = $doc->get_data; + my $ts = eval { + no strict 'subs'; + $doc->get_value(PublicInbox::Search::TS); + }; + $ts = Search::Xapian::sortable_unserialise($ts); $data = $enc_utf8->decode($data); - my ($mid, $subj, $from, $date, $refs) = split(/\n/, $data); + my ($subj, $from, $refs) = split(/\n/, $data); bless { doc => $doc, - mid => $mid, subject => $subj, - date => $date, + ts => $ts, from_name => $from, references_sorted => $refs, }, $class; @@ -77,27 +82,13 @@ sub from_name { sub ts { my ($self) = @_; - my $ts = $self->{ts}; - return $ts if $ts; - $self->{ts} = eval { - str2time($self->date || $self->mime->header('Date')) - } || 0; -} - -sub date { - my ($self) = @_; - my $date = $self->{date}; - return $date if $date; - my $ts = eval { str2time($self->mime->header('Date')) }; - $self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); + $self->{ts} ||= eval { str2time($self->mime->header('Date')) } || 0; } sub to_doc_data { my ($self) = @_; - $self->mid . "\n" . PublicInbox::Search::subject_summary($self->subject) . "\n" . $self->from_name . "\n". - $self->date . "\n" . $self->references_sorted; } @@ -139,14 +130,23 @@ sub mini_mime { my @h = ( Subject => $self->subject, 'X-PI-From' => $self->from_name, - 'X-PI-Date' => $self->date, 'X-PI-TS' => $self->ts, 'Message-ID' => "<$self->{mid}>", + + # prevent Email::Simple::Creator from running, + # this header is useless for threading as we use X-PI-TS + # for sorting and display: + 'Date' => EPOCH_822, ); my $refs = $self->{references_sorted}; my $mime = Email::MIME->create(header_str => \@h); - $mime->header_set('References', $refs) if (defined $refs); + my $h = $mime->header_obj; + $h->header_set('References', $refs) if (defined $refs); + + # drop useless headers Email::MIME set for us + $h->header_set('Date'); + $h->header_set('MIME-Version'); $mime; } @@ -155,6 +155,8 @@ sub mid { if (defined $mid) { $self->{mid} = $mid; + } elsif (my $rv = $self->{mid}) { + $rv; } else { $self->ensure_metadata; # needed for ghosts $self->{mid} ||= $self->_extract_mid; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 584a2d7..477c4b6 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -86,12 +86,7 @@ sub index_entry { $subj = "$subj"; } - my $ts = $mime->header('X-PI-TS'); - unless (defined $ts) { - $ts = msg_timestamp($mime); - } - $ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); - + my $ts = _msg_date($mime); my $rv = ""; if ($level) { $rv .= '
' . ('  ' x $level) . '
'; @@ -561,6 +556,12 @@ sub missing_thread { EOF } +sub _msg_date { + my ($mime) = @_; + my $ts = $mime->header('X-PI-TS') || msg_timestamp($mime); + POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); +} + sub _inline_header { my ($dst, $state, $mime, $level) = @_; my $pfx = ' ' x $level; @@ -568,7 +569,7 @@ sub _inline_header { my $cur = $state->{cur}; my $mid = $mime->header('Message-ID'); my $f = $mime->header('X-PI-From'); - my $d = $mime->header('X-PI-Date'); + my $d = _msg_date($mime); $f = PublicInbox::Hval->new($f); $d = PublicInbox::Hval->new($d); $f = $f->as_html; -- EW