user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 6/9] *idx: pass $smsg in more places instead of many args
  2020-03-20  8:18  7% ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
@ 2020-03-20  8:18  5%   ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

We can pass blessed PublicInbox::Smsg objects to internal
indexing APIs instead of having long parameter lists in some
places.  The end goal is to avoid parsing redundant information
each step of the way and hopefully make things more
understandable.
---
 lib/PublicInbox/OverIdx.pm        | 14 +++-------
 lib/PublicInbox/SearchIdx.pm      | 45 ++++++++++++++++++-------------
 lib/PublicInbox/SearchIdxShard.pm | 16 +++++++++--
 lib/PublicInbox/V2Writable.pm     |  8 +++++-
 t/search-thr-index.t              | 17 ++++++++++--
 5 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index f49dfa00..2d71956d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -245,15 +245,9 @@ sub subject_path ($) {
 }
 
 sub add_overview {
-	my ($self, $mime, $bytes, $num, $oid, $mid0, $times) = @_;
-	my $lines = $mime->body_raw =~ tr!\n!\n!;
-	my $smsg = bless {
-		mime => $mime,
-		mid => $mid0,
-		bytes => $bytes,
-		lines => $lines,
-		blob => $oid,
-	}, 'PublicInbox::Smsg';
+	my ($self, $mime, $smsg, $times) = @_;
+	$smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+	$smsg->{mime} = $mime; # XXX temporary?
 	my $hdr = $mime->header_obj;
 	my $mids = mids_for_index($hdr);
 	my $refs = parse_references($smsg, $hdr, $mids);
@@ -268,7 +262,7 @@ sub add_overview {
 	$dd = compress($dd);
 	my $ds = msg_timestamp($hdr, $times->{autime});
 	my $ts = msg_datestamp($hdr, $times->{cotime});
-	my $values = [ $ts, $ds, $num, $mids, $refs, $xpath, $dd ];
+	my $values = [ $ts, $ds, $smsg->{num}, $mids, $refs, $xpath, $dd ];
 	add_over($self, $values);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 32be9c3f..5ca819c3 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -306,9 +306,9 @@ sub index_xapian { # msg_iter callback
 	index_body($self, $_, /\A>/ ? 0 : $doc) for @sections;
 }
 
-sub add_xapian ($$$$$$) {
-	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
-	my $smsg = PublicInbox::Smsg->new($mime);
+sub add_xapian ($$$$) {
+	my ($self, $mime, $smsg, $mids) = @_;
+	$smsg->{mime} = $mime; # XXX dangerous
 	my $hdr = $mime->header_obj;
 	$smsg->{ds} = msg_datestamp($hdr, $self->{autime});
 	$smsg->{ts} = msg_timestamp($hdr, $self->{cotime});
@@ -338,9 +338,7 @@ sub add_xapian ($$$$$$) {
 			index_text($self, join(' ', @long), 1, 'XM');
 		}
 	}
-	$smsg->{to} = $smsg->{cc} = '';
-	$smsg->{blob} = $oid;
-	$smsg->{mid} = $mid0;
+	$smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP
 	PublicInbox::OverIdx::parse_references($smsg, $hdr, $mids);
 	my $data = $smsg->to_doc_data;
 	$doc->set_data($data);
@@ -355,7 +353,7 @@ sub add_xapian ($$$$$$) {
 		}
 	}
 	$doc->add_boolean_term('Q' . $_) foreach @$mids;
-	$self->{xdb}->replace_document($num, $doc);
+	$self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
 sub _msgmap_init ($) {
@@ -369,20 +367,25 @@ sub _msgmap_init ($) {
 
 sub add_message {
 	# mime = Email::MIME object
-	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $mids = mids_for_index($mime->header_obj);
-	$mid0 //= $mids->[0]; # v1 compatibility
-	$num //= do { # v1
+	$smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat
+	$smsg->{mid} //= $mids->[0]; # v1 compatibility
+	$smsg->{num} //= do { # v1
 		_msgmap_init($self);
 		index_mm($self, $mime);
 	};
 	eval {
-		if (need_xapian($self)) {
-			add_xapian($self, $mime, $num, $oid, $mids, $mid0);
+		# order matters, overview stores every possible piece of
+		# data in doc_data (deflated).  Xapian only stores a subset
+		# of the fields which exist in over.sqlite3.  We may stop
+		# storing doc_data in Xapian sometime after we get multi-inbox
+		# search working.
+		if (my $over = $self->{over}) { # v1 only
+			$over->add_overview($mime, $smsg, $self);
 		}
-		if (my $over = $self->{over}) {
-			$over->add_overview($mime, $bytes, $num, $oid, $mid0,
-						$self);
+		if (need_xapian($self)) {
+			add_xapian($self, $mime, $smsg, $mids);
 		}
 	};
 
@@ -390,7 +393,7 @@ sub add_message {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$num;
+	$smsg->{num};
 }
 
 # returns begin and end PostingIterator
@@ -530,9 +533,10 @@ sub unindex_mm {
 }
 
 sub index_both {
-	my ($self, $mime, $bytes, $blob) = @_;
+	my ($self, $mime, $smsg) = @_;
 	my $num = index_mm($self, $mime);
-	add_message($self, $mime, $bytes, $num, $blob);
+	$smsg->{num} = $num;
+	add_message($self, $mime, $smsg);
 }
 
 sub unindex_both {
@@ -595,8 +599,11 @@ sub read_log {
 				next;
 			}
 			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+			my $smsg = bless {}, 'PublicInbox::Smsg';
 			batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
-			$add_cb->($self, $mime, $bytes, $blob);
+			$smsg->{blob} = $blob;
+			$smsg->{bytes} = $bytes;
+			$add_cb->($self, $mime, $smsg);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
 			$D{$blob} = 1;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 74c624a4..d29e6090 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -76,7 +76,13 @@ sub shard_worker_loop ($$$$$) {
 			$artnum = int($artnum);
 			$self->{autime} = $autime;
 			$self->{cotime} = $cotime;
-			$self->add_message($mime, $n, $artnum, $oid, $mid0);
+			my $smsg = bless {
+				bytes => $len,
+				num => $artnum,
+				blob => $oid,
+				mid => $mid0,
+			}, 'PublicInbox::Smsg';
+			$self->add_message($mime, $smsg);
 		}
 	}
 	$self->worker_done;
@@ -95,7 +101,13 @@ sub index_raw {
 		$self->begin_txn_lazy;
 		$self->{autime} = $at;
 		$self->{cotime} = $ct;
-		$self->add_message($mime, $bytes, $artnum, $oid, $mid0);
+		my $smsg = bless {
+			bytes => $bytes,
+			num => $artnum,
+			blob => $oid,
+			mid => $mid0,
+		}, 'PublicInbox::Smsg';
+		$self->add_message($mime, $smsg);
 	}
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index d39a6f89..34dd139b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -150,7 +150,13 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$$$$) {
 	my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
-	$self->{over}->add_overview($mime, $len, $num, $oid, $mid0, $self);
+	my $smsg = bless {
+		bytes => $len,
+		num => $num,
+		blob => $oid,
+		mid => $mid0,
+	}, 'PublicInbox::Smsg';
+	$self->{over}->add_overview($mime, $smsg, $self);
 	my $idx = idx_shard($self, $num % $self->{shards});
 	$idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime, $self);
 	my $n = $self->{transact_bytes} += $len;
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
index 6a5fd919..f073304a 100644
--- a/t/search-thr-index.t
+++ b/t/search-thr-index.t
@@ -9,6 +9,7 @@ use PublicInbox::MID qw(mids);
 use Email::MIME;
 require_mods(qw(DBD::SQLite Search::Xapian));
 require PublicInbox::SearchIdx;
+require PublicInbox::Smsg;
 require PublicInbox::Inbox;
 my ($tmpdir, $for_destroy) = tmpdir();
 my $git_dir = "$tmpdir/a.git";
@@ -45,7 +46,13 @@ foreach (reverse split(/\n\n/, $data)) {
 	$mime->header_set('To' => 'git@vger.kernel.org');
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => ++$num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	push @mids, $mid;
 	ok($doc_id, 'message added: '. $mid);
 }
@@ -86,7 +93,13 @@ SELECT tid FROM over WHERE num = ? LIMIT 1
 
 	my $bytes = bytes::length($mime->as_string);
 	my $mid = mids($mime->header_obj)->[0];
-	my $doc_id = $rw->add_message($mime, $bytes, $num, 'ignored', $mid);
+	my $smsg = bless {
+		bytes => $bytes,
+		num => $num,
+		mid => $mid,
+		blob => '',
+	}, 'PublicInbox::Smsg';
+	my $doc_id = $rw->add_message($mime, $smsg);
 	ok($doc_id, 'message reindexed'. $mid);
 	is($doc_id, $num, "article number unchanged: $num");
 

^ permalink raw reply related	[relevance 5%]

* [PATCH 0/9] preserve time and date of initial commit
  @ 2020-03-20  8:18  7% ` Eric Wong
  2020-03-20  8:18  5%   ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-03-20  8:18 UTC (permalink / raw)
  To: meta

For messages lacking Date and/or Received headers, search
queries for "d:YYYYMMDD..YYYYMMDD" ranges can be unreliable in
mirrors, as can the $INBOX_URL/?t=$TIMESTAMP query which
only hits SQLite.

Yes, this ended up being a lot of work to deal with corner case
messages (probably most of which are spam), but there's also a
lot of internal cleanups which made the end result easier to
follow, I think...

The main fix is actually in 1/9, but it's gross.

Patch 2/9 fixes a small window where a race can happen and
cause searches to be off by a minute.

Patches 3-8 cleanup the mess left in 1 and 2,

Finally, patch 9 fixes the corner-case-of-corner-cases for
dealing with multi-MID messages which require a one-off queue to
store the git commit/author times instead of overloading msgmap.

Eric Wong (9):
  index: use git commit times on missing Date/Received
  v2writable: preserve timestamps from import if generated
  rename PublicInbox::SearchMsg => PublicInbox::Smsg
  smsg: to_doc_data: use existing fields
  overidx: parse_references: less error-prone args
  *idx: pass $smsg in more places instead of many args
  v2: pass smsg in more places
  *idx: pass smsg in even more places
  v2: SDBM-based multi Message-ID queue

 Documentation/mknews.perl                   |  4 +-
 Documentation/technical/data_structures.txt |  4 +-
 MANIFEST                                    |  4 +-
 lib/PublicInbox/ExtMsg.pm                   |  3 +-
 lib/PublicInbox/Feed.pm                     |  4 +-
 lib/PublicInbox/Import.pm                   | 19 +++--
 lib/PublicInbox/Inbox.pm                    |  2 +-
 lib/PublicInbox/Mbox.pm                     |  4 +-
 lib/PublicInbox/MsgTime.pm                  | 12 +--
 lib/PublicInbox/MultiMidQueue.pm            | 57 +++++++++++++
 lib/PublicInbox/NNTP.pm                     | 14 ++--
 lib/PublicInbox/Over.pm                     |  8 +-
 lib/PublicInbox/OverIdx.pm                  | 30 +++----
 lib/PublicInbox/Search.pm                   |  8 +-
 lib/PublicInbox/SearchIdx.pm                | 68 ++++++++++-----
 lib/PublicInbox/SearchIdxShard.pm           | 26 ++++--
 lib/PublicInbox/SearchView.pm               |  8 +-
 lib/PublicInbox/{SearchMsg.pm => Smsg.pm}   | 19 +++--
 lib/PublicInbox/SolverGit.pm                |  2 +-
 lib/PublicInbox/V2Writable.pm               | 84 ++++++++++++-------
 lib/PublicInbox/View.pm                     |  2 +-
 t/import.t                                  | 14 ++--
 t/index-git-times.t                         | 93 +++++++++++++++++++++
 t/multi-mid.t                               | 27 +++++-
 t/search-thr-index.t                        | 17 +++-
 t/thread-cycle.t                            |  2 +-
 26 files changed, 386 insertions(+), 149 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm
 rename lib/PublicInbox/{SearchMsg.pm => Smsg.pm} (92%)
 create mode 100644 t/index-git-times.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-03-05  5:13     [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-20  8:18  7% ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20  8:18  5%   ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).