user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 4/7] mbox: do not sort search results
Date: Thu,  5 Apr 2018 09:34:09 +0000	[thread overview]
Message-ID: <20180405093412.22348-5-e@80x24.org> (raw)
In-Reply-To: <20180405093412.22348-1-e@80x24.org>

Sorting large Xapian result sets (msets) is wasted work for mbox
downloads, since MUAs should thread and sort messages as the user desires.

This forces us to rework each of the mbox download mechanisms
to be more independent of each other, but might make things
easier to reason about.
---
 lib/PublicInbox/Mbox.pm   | 139 ++++++++++++++++++++++++++--------------------
 lib/PublicInbox/Search.pm |   6 +-
 2 files changed, 83 insertions(+), 62 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index c66ccaa..c5e1cb9 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -138,13 +138,24 @@ sub thread_mbox {
 	my ($ctx, $srch, $sfx) = @_;
 	eval { require IO::Compress::Gzip };
 	return sub { need_gzip(@_) } if $@;
-	my $prev = 0;
+	my $mid = $ctx->{mid};
+	my $msgs = $srch->get_thread($mid, 0);
+	return [404, [qw(Content-Type text/plain)], []] if !@$msgs;
+	my $prev = $msgs->[-1]->{num};
+	my $i = 0;
 	my $cb = sub {
-		my $msgs = $srch->get_thread($ctx->{mid}, $prev);
-		$prev = $msgs->[-1]->{num} if scalar(@$msgs);
-		$msgs;
+		while (1) {
+			if (my $smsg = $msgs->[$i++]) {
+				return $smsg;
+			}
+			# refill result set
+			$msgs = $srch->get_thread($mid, $prev);
+			return unless @$msgs;
+			$prev = $msgs->[-1]->{num};
+			$i = 0;
+		}
 	};
-	PublicInbox::MboxGz->response($ctx, $cb);
+	PublicInbox::MboxGz->response($ctx, $cb, $msgs->[0]->subject);
 }
 
 sub emit_range {
@@ -159,22 +170,55 @@ sub emit_range {
 	mbox_all($ctx, $query);
 }
 
+sub mbox_all_ids {
+	my ($ctx) = @_;
+	my $prev = 0;
+	my $ids = $ctx->{-inbox}->mm->ids_after(\$prev) or return
+		[404, [qw(Content-Type text/plain)], ["No results found\n"]];
+	my $i = 0;
+	my $over = $ctx->{srch}->{over_ro};
+	my $cb = sub {
+		do {
+			while ((my $num = $ids->[$i++])) {
+				my $smsg = $over->get_art($num) or next;
+				return $smsg;
+			}
+			$ids = $ctx->{-inbox}->mm->ids_after(\$prev);
+			$i = 0;
+		} while (@$ids);
+		undef;
+	};
+	return PublicInbox::MboxGz->response($ctx, $cb, 'all');
+}
+
 sub mbox_all {
 	my ($ctx, $query) = @_;
 
 	eval { require IO::Compress::Gzip };
 	return sub { need_gzip(@_) } if $@;
-	if ($query eq '') {
-		my $prev = 0;
-		my $cb = sub { $ctx->{-inbox}->mm->ids_after(\$prev) };
-		return PublicInbox::MboxGz->response($ctx, $cb, 'all');
-	}
-	my $opts = { offset => 0 };
+	return mbox_all_ids($ctx) if $query eq '';
+	my $opts = { mset => 2 };
 	my $srch = $ctx->{srch};
+	my $mset = $srch->query($query, $opts);
+	$opts->{offset} = $mset->size or
+			return [404, [qw(Content-Type text/plain)],
+				["No results found\n"]];
+	my $i = 0;
 	my $cb = sub { # called by MboxGz->getline
-		my $msgs = $srch->query($query, $opts);
-		$opts->{offset} += scalar @$msgs;
-		$msgs;
+		while (1) {
+			while (my $mi = (($mset->items)[$i++])) {
+				my $doc = $mi->get_document;
+				my $smsg = $srch->retry_reopen(sub {
+					PublicInbox::SearchMsg->load_doc($doc);
+				}) or next;
+				return $smsg;
+			}
+			# refill result set
+			$mset = $srch->query($query, $opts);
+			my $size = $mset->size or return;
+			$opts->{offset} += $size;
+			$i = 0;
+		}
 	};
 	PublicInbox::MboxGz->response($ctx, $cb, 'results-'.$query);
 }
@@ -206,7 +250,6 @@ sub new {
 		gz => IO::Compress::Gzip->new(\$buf, Time => 0),
 		cb => $cb,
 		ctx => $ctx,
-		msgs => [],
 	}, $class;
 }
 
@@ -214,60 +257,34 @@ sub response {
 	my ($class, $ctx, $cb, $fn) = @_;
 	my $body = $class->new($ctx, $cb);
 	# http://www.iana.org/assignments/media-types/application/gzip
-	$body->{hdr} = [ 'Content-Type', 'application/gzip' ];
-	$body->{fn} = $fn;
-	my $hdr = $body->getline; # fill in Content-Disposition filename
-	[ 200, $hdr, $body ];
-}
-
-sub set_filename ($$) {
-	my ($fn, $msg) = @_;
-	return to_filename($fn) if defined($fn);
-
-	PublicInbox::Mbox::subject_fn($msg);
+	my @h = qw(Content-Type application/gzip);
+	if ($fn) {
+		$fn = to_filename($fn);
+		push @h, 'Content-Disposition', "inline; filename=$fn.mbox.gz";
+	}
+	[ 200, \@h, $body ];
 }
 
 # called by Plack::Util::foreach or similar
 sub getline {
 	my ($self) = @_;
 	my $ctx = $self->{ctx} or return;
-	my $ibx = $ctx->{-inbox};
-	my $gz = $self->{gz};
-	my $msgs = $self->{msgs};
-	do {
-		# work on existing result set
-		while (defined(my $smsg = shift @$msgs)) {
-			# ids_after may return integers
-			ref($smsg) or
-				$smsg = $ctx->{srch}->{over_ro}->get_art($smsg);
-
-			my $msg = eval { $ibx->msg_by_smsg($smsg) } or next;
-			$msg = Email::Simple->new($msg);
-			$gz->write(PublicInbox::Mbox::msg_str($ctx, $msg,
-								$smsg->mid));
-
-			# use subject of first message as subject
-			if (my $hdr = delete $self->{hdr}) {
-				my $fn = set_filename($self->{fn}, $msg);
-				push @$hdr, 'Content-Disposition',
-						"inline; filename=$fn.mbox.gz";
-				return $hdr;
-			}
-			my $bref = $self->{buf};
-			if (length($$bref) >= 8192) {
-				my $ret = $$bref; # copy :<
-				${$self->{buf}} = '';
-				return $ret;
-			}
-
-			# be fair to other clients on public-inbox-httpd:
-			return '';
+	while (my $smsg = $self->{cb}->()) {
+		my $msg = $ctx->{-inbox}->msg_by_smsg($smsg) or next;
+		$msg = Email::Simple->new($msg);
+		$self->{gz}->write(PublicInbox::Mbox::msg_str($ctx, $msg,
+				$smsg->{mid}));
+		my $bref = $self->{buf};
+		if (length($$bref) >= 8192) {
+			my $ret = $$bref; # copy :<
+			${$self->{buf}} = '';
+			return $ret;
 		}
 
-		# refill result set
-		$msgs = $self->{msgs} = $self->{cb}->();
-	} while (@$msgs);
-	$gz->close;
+		# be fair to other clients on public-inbox-httpd:
+		return '';
+	}
+	delete($self->{gz})->close;
 	# signal that we're done and can return undef next call:
 	delete $self->{ctx};
 	${delete $self->{buf}};
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 4e014f4..9eb0728 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -219,7 +219,11 @@ sub _enquire_once {
 	$enquire->set_query($query);
 	$opts ||= {};
         my $desc = !$opts->{asc};
-	if ($opts->{relevance}) {
+	if (($opts->{mset} || 0) == 2) {
+		$enquire->set_docid_order(Search::Xapian::ENQ_ASCENDING());
+		$enquire->set_weighting_scheme(Search::Xapian::BoolWeight->new);
+		delete $self->{enquire};
+	} elsif ($opts->{relevance}) {
 		$enquire->set_sort_by_relevance_then_value(TS, $desc);
 	} else {
 		$enquire->set_sort_by_value_then_relevance(TS, $desc);
-- 
EW


  parent reply	other threads:[~2018-04-05  9:34 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-05  9:34 [PATCH 0/7] v2 odds and ends Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` [PATCH 1/7] v2writable: recount partitions after acquiring lock Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` [PATCH 2/7] searchmsg: remove unused `tid' and `path' methods Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` [PATCH 3/7] search: remove unnecessary OP_AND of query Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-04-05  9:34 ` [PATCH 5/7] searchview: minor cleanup Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` [PATCH 6/7] support altid mechanism for v2 Eric Wong (Contractor, The Linux Foundation)
2018-04-05  9:34 ` [PATCH 7/7] compact: better handling of over.sqlite3* files Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180405093412.22348-5-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).