about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-07 03:41:53 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-07 03:42:29 +0000
commit 3348ad4b3b1a0865ee58a902953165ea0f4aa4bd (patch)
tree fd17dd1b4434cad0dc211c5e890e8c0d5a0d07ce
parent 42c485400522c7c255f6da11391526cb1bc5931b (diff)
download public-inbox-3348ad4b3b1a0865ee58a902953165ea0f4aa4bd.tar.gz
Since we only query the SQLite over DB for OVER/XOVER, we do not
need to waste space storing the To/Cc/:bytes/:lines fields or the
XNUM term in Xapian.  We only use From/Subject/References/Message-ID/:blob
in various places of the PSGI code.

For reindexing, we will take advantage of docid stability
in "xapian-compact --no-renumber" to ensure duplicates do not
show up in search results.  Since the PSGI interface is the
only consumer of Xapian at the moment, it has no need to
search based on NNTP article number.
-rw-r--r-- lib/PublicInbox/NNTP.pm | 2
-rw-r--r-- lib/PublicInbox/OverIdx.pm | 6
-rw-r--r-- lib/PublicInbox/SearchIdx.pm | 37
-rw-r--r-- lib/PublicInbox/SearchMsg.pm | 6
-rw-r--r-- lib/PublicInbox/V2Writable.pm | 2
-rwxr-xr-x script/public-inbox-compact | 6
-rw-r--r-- t/search.t | 24
-rw-r--r-- t/v2writable.t | 7
8 files changed, 31 insertions, 59 deletions
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index fa890cb2..ace56e7a 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -725,7 +725,7 @@ sub hdr_searchmsg ($$$$) {
                         my $nr = scalar @$msgs or return;
                         my $tmp = '';
                         foreach my $s (@$msgs) {
-                                $tmp .= $s->num . ' ' . $s->$field . "\r\n";
+                                $tmp .= $s->{num} . ' ' . $s->$field . "\r\n";
                         }
                         utf8::encode($tmp);
                         do_more($self, $tmp);
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 08f87447..62fec0da 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -207,8 +207,8 @@ sub link_refs {
         $tid;
 }
 
-sub parse_references ($$$$) {
-        my ($self, $smsg, $mid0, $mids) = @_;
+sub parse_references ($$$) {
+        my ($smsg, $mid0, $mids) = @_;
         my $mime = $smsg->{mime};
         my $hdr = $mime->header_obj;
         my $refs = references($hdr);
@@ -241,7 +241,7 @@ sub add_overview {
                 blob => $oid,
         }, 'PublicInbox::SearchMsg';
         my $mids = mids($mime->header_obj);
-        my $refs = $self->parse_references($smsg, $mid0, $mids);
+        my $refs = parse_references($smsg, $mid0, $mids);
         my $subj = $smsg->subject;
         my $xpath;
         if ($subj ne '') {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 7cfa7452..f9b40b0d 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -273,18 +273,12 @@ sub add_message {
                 my $smsg = PublicInbox::SearchMsg->new($mime);
                 my $doc = $smsg->{doc};
                 my $subj = $smsg->subject;
-
-                $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
-                defined $bytes or $bytes = length($mime->as_string);
-                $smsg->{bytes} = $bytes;
-
                 add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
                 my @ds = gmtime($smsg->ds);
                 my $yyyymmdd = strftime('%Y%m%d', @ds);
                 add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
                 my $dt = strftime('%Y%m%d%H%M%S', @ds);
                 add_val($doc, PublicInbox::Search::DT(), $dt);
-                my @vals = ($smsg->{ts}, $smsg->{ds});
 
                 my $tg = $self->term_generator;
 
@@ -333,11 +327,11 @@ sub add_message {
                         index_body($tg, \@orig, $doc) if @orig;
                 });
 
-                # populates smsg->references for smsg->to_doc_data
-                my $data = $smsg->to_doc_data($oid, $mid0);
                 foreach my $mid (@$mids) {
                         $tg->index_text($mid, 1, 'XM');
                 }
+                $smsg->{to} = $smsg->{cc} = '';
+                my $data = $smsg->to_doc_data($oid, $mid0);
                 $doc->set_data($data);
                 if (my $altid = $self->{-altid}) {
                         foreach my $alt (@$altid) {
@@ -350,24 +344,11 @@ sub add_message {
                         }
                 }
 
-                $self->delete_article($num) if defined $num; # for reindexing
-
                 if (my $over = $self->{over}) {
-                        utf8::encode($data);
-                        $data = compress($data);
-                        my $refs = $over->parse_references($smsg, $mid0, $mids);
-                        my $xpath;
-                        if ($subj ne '') {
-                                $xpath = $self->subject_path($subj);
-                                $xpath = id_compress($xpath);
-                        }
-
-                        push @vals, $num, $mids, $refs, $xpath, $data;
-                        $over->add_over(\@vals);
+                        $over->add_overview($mime, $bytes, $num, $oid, $mid0);
                 }
                 $doc->add_boolean_term('Q' . $_) foreach @$mids;
-                $doc->add_boolean_term('XNUM' . $num) if defined $num;
-                $doc_id = $self->{xdb}->add_document($doc);
+                $self->{xdb}->replace_document($doc_id = $num, $doc);
         };
 
         if ($@) {
@@ -419,16 +400,6 @@ sub remove_message {
         }
 }
 
-sub delete_article {
-        my ($self, $num) = @_;
-        my $ndel = 0;
-        batch_do($self, 'XNUM' . $num, sub {
-                my ($ids) = @_;
-                $ndel += scalar @$ids;
-                $self->{xdb}->delete_document($_) for @$ids;
-        });
-}
-
 # MID is a hint in V2
 sub remove_by_oid {
         my ($self, $oid, $mid) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 3278802b..ab971e00 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -45,12 +45,11 @@ sub to_doc_data {
                 $self->cc,
                 $oid,
                 $mid0,
-                $self->{bytes},
-                $self->{lines}
+                $self->{bytes} || '',
+                $self->{lines} || ''
         );
 }
 
-
 sub load_from_data ($$) {
         my ($self) = $_[0]; # data = $_[1]
         (
@@ -92,7 +91,6 @@ sub load_doc {
 # :bytes and :lines metadata in RFC 3977
 sub bytes ($) { $_[0]->{bytes} }
 sub lines ($) { $_[0]->{lines} }
-sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
 
 sub __hdr ($$) {
         my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 53fdb738..1cc4b005 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -800,7 +800,7 @@ sub unindex_oid {
                 my %gone;
                 my ($id, $prev);
                 while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
-                        $gone{$smsg->num} = 1 if $oid eq $smsg->{blob};
+                        $gone{$smsg->{num}} = 1 if $oid eq $smsg->{blob};
                         1; # continue
                 }
                 my $n = scalar keys %gone;
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index d855b9e1..9f332657 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -48,7 +48,7 @@ sub commit_changes ($$$) {
         $im->lock_release;
         remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
 }
-
+my @compact = qw(xapian-compact --no-renumber);
 if ($v == 2) {
         require PublicInbox::V2Writable;
         my $v2w = PublicInbox::V2Writable->new($ibx);
@@ -70,7 +70,7 @@ if ($v == 2) {
                 }
                 close $dh;
                 die "No Xapian parts found in $old\n" unless @parts;
-                my $cmd = ['xapian-compact', @parts, "$new/0" ];
+                my $cmd = [@compact, @parts, "$new/0" ];
                 PublicInbox::Import::run_die($cmd);
                 commit_changes($v2w, $old, $new);
         });
@@ -84,7 +84,7 @@ if ($v == 2) {
         my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $v1_root);
         $ibx->with_umask(sub {
                 $im->lock_acquire;
-                PublicInbox::Import::run_die(['xapian-compact', $old, $new]);
+                PublicInbox::Import::run_die([@compact, $old, $new]);
                 commit_changes($im, $old, $new);
         });
 } else {
diff --git a/t/search.t b/t/search.t
index fda32d36..516f5670 100644
--- a/t/search.t
+++ b/t/search.t
@@ -306,31 +306,33 @@ sub filter_mids {
 
 # names and addresses
 {
-        my $res = $ro->query('t:list@example.com');
-        is(scalar @$res, 6, 'searched To: successfully');
-        foreach my $smsg (@$res) {
+        my $mset = $ro->query('t:list@example.com', {mset => 1});
+        is($mset->size, 6, 'searched To: successfully');
+        foreach my $m ($mset->items) {
+                my $smsg = $ro->lookup_article($m->get_docid);
                 like($smsg->to, qr/\blist\@example\.com\b/, 'to appears');
         }
 
-        $res = $ro->query('tc:list@example.com');
-        is(scalar @$res, 6, 'searched To+Cc: successfully');
-        foreach my $smsg (@$res) {
+        $mset = $ro->query('tc:list@example.com', {mset => 1});
+        is($mset->size, 6, 'searched To+Cc: successfully');
+        foreach my $m ($mset->items) {
+                my $smsg = $ro->lookup_article($m->get_docid);
                 my $tocc = join("\n", $smsg->to, $smsg->cc);
                 like($tocc, qr/\blist\@example\.com\b/, 'tocc appears');
         }
 
         foreach my $pfx ('tcf:', 'c:') {
-                $res = $ro->query($pfx . 'foo@example.com');
-                is(scalar @$res, 1,
-                        "searched $pfx successfully for Cc:");
-                foreach my $smsg (@$res) {
+                my $mset = $ro->query($pfx . 'foo@example.com', { mset => 1 });
+                is($mset->items, 1, "searched $pfx successfully for Cc:");
+                foreach my $m ($mset->items) {
+                        my $smsg = $ro->lookup_article($m->get_docid);
                         like($smsg->cc, qr/\bfoo\@example\.com\b/,
                                 'cc appears');
                 }
         }
 
         foreach my $pfx ('', 'tcf:', 'f:') {
-                $res = $ro->query($pfx . 'Laggy');
+                my $res = $ro->query($pfx . 'Laggy');
                 is(scalar(@$res), 1,
                         "searched $pfx successfully for From:");
                 foreach my $smsg (@$res) {
diff --git a/t/v2writable.t b/t/v2writable.t
index b543c53f..85fb6a6d 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -220,13 +220,14 @@ EOF
                 'commit message propagated to git');
         is_deeply(\@after, \@before, 'only one commit written to git');
         is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
-        like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
-        is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+        my $num = $smsg->{num};
+        like($num, qr/\A\d+\z/, 'numeric number in return message');
+        is($ibx->mm->mid_for($num), undef, 'no longer in Msgmap by num');
         my $srch = $ibx->search->reopen;
         my $mset = $srch->query('m:'.$smsg->mid, { mset => 1});
         is($mset->size, 0, 'no longer found in Xapian');
         my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames);
-        is($srch->{over_ro}->get_art($smsg->num), undef,
+        is($srch->{over_ro}->get_art($num), undef,
                 'removal propagated to Over DB');
 
         my $after = $git0->qx(@log1);