about summary refs log tree commit homepage
path: root/lib/PublicInbox/OverIdx.pm
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-07 03:41:49 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-07 03:42:26 +0000
commit 06cd576bb9b73eb177728fead06de8c3fac8d7a3 (patch)
tree 87433009229de29532053f25b9a7fd4a35314321 /lib/PublicInbox/OverIdx.pm
parent 59b5b84b8843d54024b5bb182f02b9b487a1ed29 (diff)
download public-inbox-06cd576bb9b73eb177728fead06de8c3fac8d7a3.tar.gz
Since the overview index is a synchronization point anyway,
move it into the main V2Writable process, which allows us to
drop a bunch of code.  This is another step towards making
Xapian optional for v2.

In other words, the fan-out point is moved and the Xapian
partitions no longer need to synchronize against each other:

Before:
                     /-------->\
                    /---------->\
     v2writable -->+----parts----> over
                    \---------->/
                     \-------->/

After:

                          /---------->
                         /----------->
  v2writable --> over-->+----parts--->
                         \----------->
                          \---------->

Since the overview/threading logic needs to run on the same core
that feeds git-fast-import, it's slower for small repos, but the
slowdown is not noticeable in large imports where I/O wait in the
partitions dominates.
Diffstat (limited to 'lib/PublicInbox/OverIdx.pm')
-rw-r--r-- lib/PublicInbox/OverIdx.pm 57
1 files changed, 56 insertions, 1 deletions
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 28e4aa9c..08f87447 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -2,14 +2,21 @@
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI
-# Unlike Msgmap, this is an _UNSTABLE_ database which can be
+# Unlike Msgmap, this is an _UNSTABLE_ cache which can be
 # tweaked/updated over time and rebuilt.
+#
+# Ghost messages (messages which are only referenced in References/In-Reply-To)
+# are denoted by a negative NNTP article number.
 package PublicInbox::OverIdx;
 use strict;
 use warnings;
 use base qw(PublicInbox::Over);
 use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
+use PublicInbox::MID qw/id_compress mids references/;
+use PublicInbox::SearchMsg;
+use Compress::Zlib qw(compress);
+use PublicInbox::Search;
 
 sub dbh_new {
         my ($self) = @_;
@@ -200,6 +207,54 @@ sub link_refs {
         $tid;
 }
 
+sub parse_references ($$$$) { # returns an arrayref of unique refs; the prototype is ignored on method calls
+        my ($self, $smsg, $mid0, $mids) = @_; # $self is not used in this sub
+        my $mime = $smsg->{mime};
+        my $hdr = $mime->header_obj;
+        my $refs = references($hdr); # imported from PublicInbox::MID; used as an arrayref below
+        push(@$refs, @$mids) if scalar(@$mids) > 1; # with more than one Message-ID, add them all as references
+        return $refs if scalar(@$refs) == 0; # nothing to filter; $smsg->{references} is left untouched
+
+        # prevent circular references here:
+        my %seen = ( $mid0 => 1 ); # pre-seeding $mid0 makes the loop drop a reference to the message itself
+        my @keep;
+        foreach my $ref (@$refs) {
+                if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) {
+                        warn "References: <$ref> too long, ignoring\n";
+                        next;
+                }
+                next if $seen{$ref}++; # skip duplicates; first occurrence wins, order is preserved
+                push @keep, $ref;
+        }
+        $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; # "<a> <b> ..." form; only set when non-empty
+        \@keep; # implicit return value
+}
+
+sub add_overview { # builds the overview values for one message and passes them to add_over
+        my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+        my $lines = $mime->body_raw =~ tr!\n!\n!; # tr returns the number of newlines in the raw body
+        my $smsg = bless {
+                mime => $mime,
+                mid => $mid0,
+                bytes => $bytes,
+                lines => $lines,
+                blob => $oid,
+        }, 'PublicInbox::SearchMsg';
+        my $mids = mids($mime->header_obj); # imported from PublicInbox::MID
+        my $refs = $self->parse_references($smsg, $mid0, $mids); # may also set $smsg->{references}
+        my $subj = $smsg->subject;
+        my $xpath; # stays undef when the subject is the empty string
+        if ($subj ne '') {
+                $xpath = PublicInbox::Search::subject_path($subj);
+                $xpath = id_compress($xpath);
+        }
+        my $dd = $smsg->to_doc_data($oid, $mid0);
+        utf8::encode($dd); # in-place conversion to UTF-8 octets before compressing
+        $dd = compress($dd); # compress() is imported from Compress::Zlib
+        my $values = [ $smsg->ts, $smsg->ds, $num, $mids, $refs, $xpath, $dd ]; # same order add_over unpacks
+        add_over($self, $values);
+}
+
 sub add_over {
         my ($self, $values) = @_;
         my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values;