From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH] searchthread: reduce indirection by removing container
Date: Fri, 17 Apr 2020 09:28:49 +0000 [thread overview]
Message-ID: <20200417092849.6271-1-e@yhbt.net> (raw)
We can rid ourselves of a layer of indirection by subclassing
PublicInbox::Smsg instead of using a container object to hold
each $smsg. Furthermore, the `{id}' vs. `{mid}' field name
confusion is eliminated.
This reduces the size of the $rootset passed to walk_thread by
around 15%, that is over 50K memory when rendering a /$INBOX/
landing page.
---
Documentation/technical/data_structures.txt | 8 +--
lib/PublicInbox/SearchThread.pm | 67 ++++++++++++---------
lib/PublicInbox/SearchView.pm | 4 +-
lib/PublicInbox/View.pm | 55 +++++++++--------
t/thread-cycle.t | 5 +-
5 files changed, 75 insertions(+), 64 deletions(-)
diff --git a/Documentation/technical/data_structures.txt b/Documentation/technical/data_structures.txt
index 08dfc7ac..46d5acff 100644
--- a/Documentation/technical/data_structures.txt
+++ b/Documentation/technical/data_structures.txt
@@ -61,15 +61,13 @@ Per-message classes
There may be hundreds or thousands of these objects in memory
at-a-time, so fields are pruned if unneeded.
-* PublicInbox::SearchThread::Msg - container for message threading
+* PublicInbox::SearchThread::Msg - subclass of Smsg
Common abbreviation: $cont or $node
Used by: PublicInbox::WWW
- The container we use for a non-recursive[1] variant of
+ The structure we use for a non-recursive[1] variant of
JWZ's algorithm: <https://www.jwz.org/doc/threading.html>.
- This holds a $smsg and is only used for message threading.
- This wrapper class may go away in the future and handled
- directly by PublicInbox::Smsg to save memory.
+ Nowadays, this is a re-blessed $smsg with additional fields.
As with $smsg objects, there may be hundreds or thousands
of these objects in memory at-a-time.
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
index 38d1aa6e..60f692b2 100644
--- a/lib/PublicInbox/SearchThread.pm
+++ b/lib/PublicInbox/SearchThread.pm
@@ -24,7 +24,16 @@ use PublicInbox::MID qw($MID_EXTRACT);
sub thread {
my ($msgs, $ordersub, $ctx) = @_;
- my $id_table = {};
+
+ # A. put all current $msgs (non-ghosts) into %id_table
+ my %id_table = map {;
+ # this delete saves around 4K across 1K messages
+ # TODO: move this to a more appropriate place, breaks tests
+ # if we do it during psgi_cull
+ delete $_->{num};
+
+ $_->{mid} => PublicInbox::SearchThread::Msg::cast($_);
+ } @$msgs;
# Sadly, we sort here anyways since the fill-in-the-blanks References:
# can be shakier if somebody used In-Reply-To with multiple, disparate
@@ -32,36 +41,21 @@ sub thread {
# always determine ordering when somebody uses multiple In-Reply-To.
# We'll trust the client Date: header here instead of the Received:
# time since this is for display (and not retrieval)
- _add_message($id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs;
+ _set_parent(\%id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs;
my $ibx = $ctx->{-inbox};
my $rootset = [ grep {
!delete($_->{parent}) && $_->visible($ibx)
- } values %$id_table ];
- $id_table = undef;
+ } values %id_table ];
$rootset = $ordersub->($rootset);
$_->order_children($ordersub, $ctx) for @$rootset;
$rootset;
}
-sub _get_cont_for_id ($$) {
- my ($id_table, $mid) = @_;
- $id_table->{$mid} ||= PublicInbox::SearchThread::Msg->new($mid);
-}
-
-sub _add_message ($$) {
- my ($id_table, $smsg) = @_;
-
- # A. if id_table...
- my $this = _get_cont_for_id($id_table, $smsg->{mid});
- $this->{smsg} = $smsg;
-
- # saves around 4K across 1K messages
- # TODO: move this to a more appropriate place, breaks tests
- # if we do it during psgi_cull
- delete $smsg->{num};
+sub _set_parent ($$) {
+ my ($id_table, $this) = @_;
# B. For each element in the message's References field:
- defined(my $refs = $smsg->{references}) or return;
+ defined(my $refs = $this->{references}) or return;
# This loop exists to help fill in gaps left from missing
# messages. It is not needed in a perfect world where
@@ -70,7 +64,8 @@ sub _add_message ($$) {
my $prev;
foreach my $ref ($refs =~ m/$MID_EXTRACT/go) {
# Find a Container object for the given Message-ID
- my $cont = _get_cont_for_id($id_table, $ref);
+ my $cont = $id_table->{$ref} //=
+ PublicInbox::SearchThread::Msg::ghost($ref);
# Link the References field's Containers together in
# the order implied by the References header
@@ -96,22 +91,31 @@ sub _add_message ($$) {
}
package PublicInbox::SearchThread::Msg;
+use base qw(PublicInbox::Smsg);
use strict;
use warnings;
use Carp qw(croak);
-sub new {
+# declare a ghost smsg (determined by absence of {blob})
+sub ghost {
bless {
- id => $_[1],
+ mid => $_[0],
children => {}, # becomes an array when sorted by ->order(...)
- }, $_[0];
+ }, __PACKAGE__;
+}
+
+# give a existing smsg the methods of this class
+sub cast {
+ my ($smsg) = @_;
+ $smsg->{children} = {};
+ bless $smsg, __PACKAGE__;
}
sub topmost {
my ($self) = @_;
my @q = ($self);
while (my $cont = shift @q) {
- return $cont if $cont->{smsg};
+ return $cont if $cont->{blob};
push @q, values %{$cont->{children}};
}
undef;
@@ -122,7 +126,7 @@ sub add_child {
croak "Cowardly refusing to become my own parent: $self"
if $self == $child;
- my $cid = $child->{id};
+ my $cid = $child->{mid};
# reparenting:
if (defined(my $parent = $child->{parent})) {
@@ -148,8 +152,13 @@ sub has_descendent {
# being folded/mangled by a MUA, and not a missing message.
sub visible ($$) {
my ($self, $ibx) = @_;
- ($self->{smsg} ||= eval { $ibx->smsg_by_mid($self->{id}) }) ||
- (scalar values %{$self->{children}});
+ return 1 if $self->{blob};
+ if (my $by_mid = $ibx->smsg_by_mid($self->{mid})) {
+ %$self = (%$self, %$by_mid);
+ 1;
+ } else {
+ (scalar values %{$self->{children}});
+ }
}
sub order_children {
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 4fbf59ef..4336e4d9 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -243,8 +243,8 @@ sub search_nav_bot {
sub sort_relevance {
[ sort {
- (eval { $b->topmost->{smsg}->{pct} } // 0) <=>
- (eval { $a->topmost->{smsg}->{pct} } // 0)
+ (eval { $b->topmost->{pct} } // 0) <=>
+ (eval { $a->topmost->{pct} } // 0)
} @{$_[0]} ]
}
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index b6d7acaf..9b62ed3c 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -297,11 +297,9 @@ sub _th_index_lite {
my $nr_c = scalar @$children;
my $nr_s = 0;
my $siblings;
- if (my $smsg = $node->{smsg}) {
- # delete saves about 200KB on a 1K message thread
- if (my $refs = delete $smsg->{references}) {
- ($$irt) = ($refs =~ m/$MID_EXTRACT\z/o);
- }
+ # delete saves about 200KB on a 1K message thread
+ if (my $refs = delete $node->{references}) {
+ ($$irt) = ($refs =~ m/$MID_EXTRACT\z/o);
}
my $irt_map = $mapping->{$$irt} if defined $$irt;
if (defined $irt_map) {
@@ -310,12 +308,12 @@ sub _th_index_lite {
$rv .= $pad . $irt_map->[0];
if ($idx > 0) {
my $prev = $siblings->[$idx - 1];
- my $pmid = $prev->{id};
+ my $pmid = $prev->{mid};
if ($idx > 2) {
my $s = ($idx - 1). ' preceding siblings ...';
$rv .= pad_link($pmid, $level, $s);
} elsif ($idx == 2) {
- my $ppmid = $siblings->[0]->{id};
+ my $ppmid = $siblings->[0]->{mid};
$rv .= $pad . $mapping->{$ppmid}->[0];
}
$rv .= $pad . $mapping->{$pmid}->[0];
@@ -328,26 +326,26 @@ sub _th_index_lite {
$attr =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self
$rv .= "<b>@ $attr";
if ($nr_c) {
- my $cmid = $children->[0]->{id};
+ my $cmid = $children->[0]->{mid};
$rv .= $pad . $mapping->{$cmid}->[0];
if ($nr_c > 2) {
my $s = ($nr_c - 1). ' more replies';
$rv .= pad_link($cmid, $level + 1, $s);
} elsif (my $cn = $children->[1]) {
- $rv .= $pad . $mapping->{$cn->{id}}->[0];
+ $rv .= $pad . $mapping->{$cn->{mid}}->[0];
}
}
my $next = $siblings->[$idx+1] if $siblings && $idx >= 0;
if ($next) {
- my $nmid = $next->{id};
+ my $nmid = $next->{mid};
$rv .= $pad . $mapping->{$nmid}->[0];
my $nnext = $nr_s - $idx;
if ($nnext > 2) {
my $s = ($nnext - 1).' subsequent siblings';
$rv .= pad_link($nmid, $level, $s);
} elsif (my $nn = $siblings->[$idx + 2]) {
- $rv .= $pad . $mapping->{$nn->{id}}->[0];
+ $rv .= $pad . $mapping->{$nn->{mid}}->[0];
}
}
$rv .= $pad ."<a\nhref=#r$id>$s_s, $s_c; $ctx->{s_nr}</a>\n";
@@ -369,7 +367,7 @@ sub walk_thread ($$$) {
sub pre_thread { # walk_thread callback
my ($ctx, $level, $node, $idx) = @_;
- $ctx->{mapping}->{$node->{id}} = [ '', $node, $idx, $level ];
+ $ctx->{mapping}->{$node->{mid}} = [ '', $node, $idx, $level ];
skel_dump($ctx, $level, $node);
}
@@ -388,8 +386,8 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback
my $node = shift @$q or next;
my $cl = $level + 1;
unshift @$q, map { ($cl, $_) } @{$node->{children}};
- if (my $smsg = $ctx->{-inbox}->smsg_mime($node->{smsg})) {
- return thread_index_entry($ctx, $level, $smsg);
+ if ($ctx->{-inbox}->smsg_mime($node)) {
+ return thread_index_entry($ctx, $level, $node);
} else {
return ghost_index_entry($ctx, $level, $node);
}
@@ -407,7 +405,7 @@ sub stream_thread ($$) {
my $node = shift @q or next;
my $cl = $level + 1;
unshift @q, map { ($cl, $_) } @{$node->{children}};
- $smsg = $ibx->smsg_mime($node->{smsg}) and last;
+ $smsg = $ibx->smsg_mime($node) and last;
}
return missing_thread($ctx) unless $smsg;
@@ -825,7 +823,7 @@ sub indent_for {
sub find_mid_root {
my ($ctx, $level, $node, $idx) = @_;
++$ctx->{root_idx} if $level == 0;
- if ($node->{id} eq $ctx->{mid}) {
+ if ($node->{mid} eq $ctx->{mid}) {
$ctx->{found_mid_at} = $ctx->{root_idx};
return 0;
}
@@ -899,8 +897,8 @@ sub dedupe_subject {
}
sub skel_dump { # walk_thread callback
- my ($ctx, $level, $node) = @_;
- my $smsg = $node->{smsg} or return _skel_ghost($ctx, $level, $node);
+ my ($ctx, $level, $smsg) = @_;
+ $smsg->{blob} or return _skel_ghost($ctx, $level, $smsg);
my $skel = $ctx->{skel};
my $cur = $ctx->{cur};
@@ -983,7 +981,7 @@ sub skel_dump { # walk_thread callback
sub _skel_ghost {
my ($ctx, $level, $node) = @_;
- my $mid = $node->{id};
+ my $mid = $node->{mid};
my $d = ' [not found] ';
$d .= ' ' if exists $ctx->{searchview};
$d .= indent_for($level) . th_pfx($level);
@@ -1006,18 +1004,23 @@ sub _skel_ghost {
sub sort_ds {
[ sort {
- (eval { $a->topmost->{smsg}->{ds} } || 0) <=>
- (eval { $b->topmost->{smsg}->{ds} } || 0)
+ (eval { $a->topmost->{ds} } || 0) <=>
+ (eval { $b->topmost->{ds} } || 0)
} @{$_[0]} ];
}
# accumulate recent topics if search is supported
# returns 200 if done, 404 if not
sub acc_topic { # walk_thread callback
- my ($ctx, $level, $node) = @_;
- my $mid = $node->{id};
- my $smsg = $node->{smsg} // $ctx->{-inbox}->smsg_by_mid($mid);
- if ($smsg) {
+ my ($ctx, $level, $smsg) = @_;
+ my $mid = $smsg->{mid};
+ my $has_blob = $smsg->{blob} // do {
+ if (my $by_mid = $ctx->{-inbox}->smsg_by_mid($mid)) {
+ %$smsg = (%$smsg, %$by_mid);
+ 1;
+ }
+ };
+ if ($has_blob) {
my $subj = subject_normalized($smsg->{subject});
$subj = '(no subject)' if $subj eq '';
my $ds = $smsg->{ds};
@@ -1208,7 +1211,7 @@ sub thread_adj_level {
sub ghost_index_entry {
my ($ctx, $level, $node) = @_;
my ($beg, $end) = thread_adj_level($ctx, $level);
- $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{id})
+ $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{mid})
. '</pre>' . $end;
}
diff --git a/t/thread-cycle.t b/t/thread-cycle.t
index e9ea0a27..d6545c6d 100644
--- a/t/thread-cycle.t
+++ b/t/thread-cycle.t
@@ -19,6 +19,7 @@ sub make_objs {
my $msg = $_;
$msg->{ds} ||= ++$n;
$msg->{references} =~ s/\s+/ /sg if $msg->{references};
+ $msg->{blob} = '0'x40; # any dummy value will do, here
my $simple = Email::Simple->create(header => [
'Message-ID' => "<$msg->{mid}>",
'References' => $msg->{references},
@@ -100,13 +101,13 @@ done_testing();
sub thread_to_s {
my ($msgs) = @_;
my $rootset = PublicInbox::SearchThread::thread($msgs, sub {
- [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] });
+ [ sort { $a->{mid} cmp $b->{mid} } @{$_[0]} ] });
my $st = '';
my @q = map { (0, $_) } @$rootset;
while (@q) {
my $level = shift @q;
my $node = shift @q or next;
- $st .= (" "x$level). "$node->{id}\n";
+ $st .= (" "x$level). "$node->{mid}\n";
my $cl = $level + 1;
unshift @q, map { ($cl, $_) } @{$node->{children}};
}
reply other threads:[~2020-04-17 9:28 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200417092849.6271-1-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).