user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional
  2018-07-18 16:52  6%         ` [PATCH v2 1/3] Making the search indexes optional Eric W. Biederman
@ 2018-07-18 16:53  5%           ` Eric W. Biederman
  0 siblings, 0 replies; 4+ results
From: Eric W. Biederman @ 2018-07-18 16:53 UTC (permalink / raw)
  To: Eric Wong; +Cc: meta, Eric W. Biederman

Create a new method add_xapian that holds all of the code to create
Xapian indexes.  The creation of this method simply involved
identifying the relevant code and moving it from add_message.

A call is added to add_xapian from add_message to keep everything
working as it currently does.  The new call is made conditional upon
index levels of 'full' and 'medium'.  These are the levels that index
positions and terms, the two things public-inbox uses Xapian to index.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 lib/PublicInbox/SearchIdx.pm | 172 ++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 83 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b19618c71508..8978914ab087 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -268,10 +268,95 @@ sub index_body ($$$) {
 	@$lines = ();
 }
 
+sub add_xapian ($$$$$) {
+	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
+	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $doc = $smsg->{doc};
+	my $subj = $smsg->subject;
+	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
+	my @ds = gmtime($smsg->ds);
+	my $yyyymmdd = strftime('%Y%m%d', @ds);
+	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
+	my $dt = strftime('%Y%m%d%H%M%S', @ds);
+	add_val($doc, PublicInbox::Search::DT(), $dt);
+
+	my $tg = $self->term_generator;
+
+	$tg->set_document($doc);
+	$self->index_text($subj, 1, 'S') if $subj;
+	$self->index_users($smsg);
+
+	msg_iter($mime, sub {
+		my ($part, $depth, @idx) = @{$_[0]};
+		my $ct = $part->content_type || 'text/plain';
+		my $fn = $part->filename;
+		if (defined $fn && $fn ne '') {
+			$self->index_text($fn, 1, 'XFN');
+		}
+
+		return if $ct =~ m!\btext/x?html\b!i;
+
+		my $s = eval { $part->body_str };
+		if ($@) {
+			if ($ct =~ m!\btext/plain\b!i) {
+				# Try to assume UTF-8 because Alpine
+				# seems to do wacky things and set
+				# charset=X-UNKNOWN
+				$part->charset_set('UTF-8');
+				$s = eval { $part->body_str };
+				$s = $part->body if $@;
+			}
+		}
+		defined $s or return;
+
+		my (@orig, @quot);
+		my $body = $part->body;
+		my @lines = split(/\n/, $body);
+		while (defined(my $l = shift @lines)) {
+			if ($l =~ /^>/) {
+				$self->index_body(\@orig, $doc) if @orig;
+				push @quot, $l;
+			} else {
+				$self->index_body(\@quot, 0) if @quot;
+				push @orig, $l;
+			}
+		}
+		$self->index_body(\@quot, 0) if @quot;
+		$self->index_body(\@orig, $doc) if @orig;
+	});
+
+	foreach my $mid (@$mids) {
+		$self->index_text($mid, 1, 'XM');
+
+		# because too many Message-IDs are prefixed with
+		# "Pine.LNX."...
+		if ($mid =~ /\w{12,}/) {
+			my @long = ($mid =~ /(\w{3,}+)/g);
+			$self->index_text(join(' ', @long), 1, 'XM');
+		}
+	}
+	$smsg->{to} = $smsg->{cc} = '';
+	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
+	my $data = $smsg->to_doc_data($oid, $mid0);
+	$doc->set_data($data);
+	if (my $altid = $self->{-altid}) {
+		foreach my $alt (@$altid) {
+			my $pfx = $alt->{xprefix};
+			foreach my $mid (@$mids) {
+				my $id = $alt->mid2alt($mid);
+				next unless defined $id;
+				$doc->add_boolean_term($pfx . $id);
+			}
+		}
+	}
+	$doc->add_boolean_term('Q' . $_) foreach @$mids;
+	$self->{xdb}->replace_document($num, $doc);
+}
+
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $doc_id;
+	my $xapianlevels = qr/\A(?:full|medium)\z/;
 	my $mids = mids($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
@@ -279,98 +364,19 @@ sub add_message {
 		$num = index_mm($self, $mime);
 	}
 	eval {
-		my $smsg = PublicInbox::SearchMsg->new($mime);
-		my $doc = $smsg->{doc};
-		my $subj = $smsg->subject;
-		add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-		my @ds = gmtime($smsg->ds);
-		my $yyyymmdd = strftime('%Y%m%d', @ds);
-		add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
-		my $dt = strftime('%Y%m%d%H%M%S', @ds);
-		add_val($doc, PublicInbox::Search::DT(), $dt);
-
-		my $tg = $self->term_generator;
-
-		$tg->set_document($doc);
-		$self->index_text($subj, 1, 'S') if $subj;
-		$self->index_users($smsg);
-
-		msg_iter($mime, sub {
-			my ($part, $depth, @idx) = @{$_[0]};
-			my $ct = $part->content_type || 'text/plain';
-			my $fn = $part->filename;
-			if (defined $fn && $fn ne '') {
-				$self->index_text($fn, 1, 'XFN');
-			}
-
-			return if $ct =~ m!\btext/x?html\b!i;
-
-			my $s = eval { $part->body_str };
-			if ($@) {
-				if ($ct =~ m!\btext/plain\b!i) {
-					# Try to assume UTF-8 because Alpine
-					# seems to do wacky things and set
-					# charset=X-UNKNOWN
-					$part->charset_set('UTF-8');
-					$s = eval { $part->body_str };
-					$s = $part->body if $@;
-				}
-			}
-			defined $s or return;
-
-			my (@orig, @quot);
-			my $body = $part->body;
-			my @lines = split(/\n/, $body);
-			while (defined(my $l = shift @lines)) {
-				if ($l =~ /^>/) {
-					$self->index_body(\@orig, $doc) if @orig;
-					push @quot, $l;
-				} else {
-					$self->index_body(\@quot, 0) if @quot;
-					push @orig, $l;
-				}
-			}
-			$self->index_body(\@quot, 0) if @quot;
-			$self->index_body(\@orig, $doc) if @orig;
-		});
-
-		foreach my $mid (@$mids) {
-			$self->index_text($mid, 1, 'XM');
-
-			# because too many Message-IDs are prefixed with
-			# "Pine.LNX."...
-			if ($mid =~ /\w{12,}/) {
-				my @long = ($mid =~ /(\w{3,}+)/g);
-				$self->index_text(join(' ', @long), 1, 'XM');
-			}
+		if ($self->{indexlevel} =~ $xapianlevels) {
+			$self->add_xapian($mime, $num, $oid, $mids, $mid0)
 		}
-		$smsg->{to} = $smsg->{cc} = '';
-		PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
-		my $data = $smsg->to_doc_data($oid, $mid0);
-		$doc->set_data($data);
-		if (my $altid = $self->{-altid}) {
-			foreach my $alt (@$altid) {
-				my $pfx = $alt->{xprefix};
-				foreach my $mid (@$mids) {
-					my $id = $alt->mid2alt($mid);
-					next unless defined $id;
-					$doc->add_boolean_term($pfx . $id);
-				}
-			}
-		}
-
 		if (my $over = $self->{over}) {
 			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
 		}
-		$doc->add_boolean_term('Q' . $_) foreach @$mids;
-		$self->{xdb}->replace_document($doc_id = $num, $doc);
 	};
 
 	if ($@) {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$doc_id;
+	$num;
 }
 
 # returns begin and end PostingIterator
-- 
2.17.1


^ permalink raw reply related	[relevance 5%]

* [PATCH v2 1/3] Making the search indexes optional
  @ 2018-07-18 16:52  6%         ` Eric W. Biederman
  2018-07-18 16:53  5%           ` [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman
  0 siblings, 1 reply; 4+ results
From: Eric W. Biederman @ 2018-07-18 16:52 UTC (permalink / raw)
  To: Eric Wong; +Cc: meta


This is my respin of these patches.  I have used the levels:
full, medium, basic.

I think basic conveys the message that it is ok to run with and you can
expect most things to work, better than minimal where it feels like
you don't know what will fail.

I have tweaked the reindex tests to run with all 3 different levels
so at least these code paths get exercised.

Eric W. Biederman (3):
      SearchIdx.pm: Make indexing search positions optional
      SearchIdx: Add the mechanism for making all Xapian indexing optional
      SearchIdx: Allow the amount of indexing be configured

 lib/PublicInbox/Config.pm    |   2 +-
 lib/PublicInbox/SearchIdx.pm | 256 +++++++++++++++++++++++--------------------
 t/v1reindex.t                |  43 +++++++-
 t/v2reindex.t                |  40 +++++++
 4 files changed, 220 insertions(+), 121 deletions(-)


^ permalink raw reply	[relevance 6%]

* [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional
  2018-07-17 23:27  7% [PATCH 0/3] Making the search indexes optional Eric W. Biederman
@ 2018-07-17 23:30  5% ` Eric W. Biederman
    1 sibling, 0 replies; 4+ results
From: Eric W. Biederman @ 2018-07-17 23:30 UTC (permalink / raw)
  To: Eric Wong; +Cc: meta, Eric W. Biederman

Create a new method index_message that holds all of the code to create
Xapian indexes.  The creation of this method simply involved
identifying the relevant code and moving it from add_message.

A call is added to index_message from add_message to keep everything
working as it currently does.  The new call is made conditional upon
index levels of 'positions' and 'terms', the two things public-inbox
uses Xapian to index.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 lib/PublicInbox/SearchIdx.pm | 171 ++++++++++++++++++-----------------
 1 file changed, 88 insertions(+), 83 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index cc92c389a152..deb87db3f88a 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -268,10 +268,94 @@ sub index_body ($$$) {
 	@$lines = ();
 }
 
+sub index_message ($$$$$) {
+	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
+	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $doc = $smsg->{doc};
+	my $subj = $smsg->subject;
+	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
+	my @ds = gmtime($smsg->ds);
+	my $yyyymmdd = strftime('%Y%m%d', @ds);
+	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
+	my $dt = strftime('%Y%m%d%H%M%S', @ds);
+	add_val($doc, PublicInbox::Search::DT(), $dt);
+
+	my $tg = $self->term_generator;
+
+	$tg->set_document($doc);
+	$self->index_text($subj, 1, 'S') if $subj;
+	$self->index_users($smsg);
+
+	msg_iter($mime, sub {
+		my ($part, $depth, @idx) = @{$_[0]};
+		my $ct = $part->content_type || 'text/plain';
+		my $fn = $part->filename;
+		if (defined $fn && $fn ne '') {
+			$self->index_text($fn, 1, 'XFN');
+		}
+
+		return if $ct =~ m!\btext/x?html\b!i;
+
+		my $s = eval { $part->body_str };
+		if ($@) {
+			if ($ct =~ m!\btext/plain\b!i) {
+				# Try to assume UTF-8 because Alpine
+				# seems to do wacky things and set
+				# charset=X-UNKNOWN
+				$part->charset_set('UTF-8');
+				$s = eval { $part->body_str };
+				$s = $part->body if $@;
+			}
+		}
+		defined $s or return;
+
+		my (@orig, @quot);
+		my $body = $part->body;
+		my @lines = split(/\n/, $body);
+		while (defined(my $l = shift @lines)) {
+			if ($l =~ /^>/) {
+				$self->index_body(\@orig, $doc) if @orig;
+				push @quot, $l;
+			} else {
+				$self->index_body(\@quot, 0) if @quot;
+				push @orig, $l;
+			}
+		}
+		$self->index_body(\@quot, 0) if @quot;
+		$self->index_body(\@orig, $doc) if @orig;
+	});
+
+	foreach my $mid (@$mids) {
+		$self->index_text($mid, 1, 'XM');
+
+		# because too many Message-IDs are prefixed with
+		# "Pine.LNX."...
+		if ($mid =~ /\w{12,}/) {
+			my @long = ($mid =~ /(\w{3,}+)/g);
+			$self->index_text(join(' ', @long), 1, 'XM');
+		}
+	}
+	$smsg->{to} = $smsg->{cc} = '';
+	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
+	my $data = $smsg->to_doc_data($oid, $mid0);
+	$doc->set_data($data);
+	if (my $altid = $self->{-altid}) {
+		foreach my $alt (@$altid) {
+			my $pfx = $alt->{xprefix};
+			foreach my $mid (@$mids) {
+				my $id = $alt->mid2alt($mid);
+				next unless defined $id;
+				$doc->add_boolean_term($pfx . $id);
+			}
+		}
+	}
+	$doc->add_boolean_term('Q' . $_) foreach @$mids;
+	$self->{xdb}->replace_document($num, $doc);
+}
+
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $doc_id;
 	my $mids = mids($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
@@ -279,98 +363,19 @@ sub add_message {
 		$num = index_mm($self, $mime);
 	}
 	eval {
-		my $smsg = PublicInbox::SearchMsg->new($mime);
-		my $doc = $smsg->{doc};
-		my $subj = $smsg->subject;
-		add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-		my @ds = gmtime($smsg->ds);
-		my $yyyymmdd = strftime('%Y%m%d', @ds);
-		add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
-		my $dt = strftime('%Y%m%d%H%M%S', @ds);
-		add_val($doc, PublicInbox::Search::DT(), $dt);
-
-		my $tg = $self->term_generator;
-
-		$tg->set_document($doc);
-		$self->index_text($subj, 1, 'S') if $subj;
-		$self->index_users($smsg);
-
-		msg_iter($mime, sub {
-			my ($part, $depth, @idx) = @{$_[0]};
-			my $ct = $part->content_type || 'text/plain';
-			my $fn = $part->filename;
-			if (defined $fn && $fn ne '') {
-				$self->index_text($fn, 1, 'XFN');
-			}
-
-			return if $ct =~ m!\btext/x?html\b!i;
-
-			my $s = eval { $part->body_str };
-			if ($@) {
-				if ($ct =~ m!\btext/plain\b!i) {
-					# Try to assume UTF-8 because Alpine
-					# seems to do wacky things and set
-					# charset=X-UNKNOWN
-					$part->charset_set('UTF-8');
-					$s = eval { $part->body_str };
-					$s = $part->body if $@;
-				}
-			}
-			defined $s or return;
-
-			my (@orig, @quot);
-			my $body = $part->body;
-			my @lines = split(/\n/, $body);
-			while (defined(my $l = shift @lines)) {
-				if ($l =~ /^>/) {
-					$self->index_body(\@orig, $doc) if @orig;
-					push @quot, $l;
-				} else {
-					$self->index_body(\@quot, 0) if @quot;
-					push @orig, $l;
-				}
-			}
-			$self->index_body(\@quot, 0) if @quot;
-			$self->index_body(\@orig, $doc) if @orig;
-		});
-
-		foreach my $mid (@$mids) {
-			$self->index_text($mid, 1, 'XM');
-
-			# because too many Message-IDs are prefixed with
-			# "Pine.LNX."...
-			if ($mid =~ /\w{12,}/) {
-				my @long = ($mid =~ /(\w{3,}+)/g);
-				$self->index_text(join(' ', @long), 1, 'XM');
-			}
+		if ($self->{indexlevel} =~ m/(positions|terms)/) {
+			$self->index_message($mime, $num, $oid, $mids, $mid0)
 		}
-		$smsg->{to} = $smsg->{cc} = '';
-		PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
-		my $data = $smsg->to_doc_data($oid, $mid0);
-		$doc->set_data($data);
-		if (my $altid = $self->{-altid}) {
-			foreach my $alt (@$altid) {
-				my $pfx = $alt->{xprefix};
-				foreach my $mid (@$mids) {
-					my $id = $alt->mid2alt($mid);
-					next unless defined $id;
-					$doc->add_boolean_term($pfx . $id);
-				}
-			}
-		}
-
 		if (my $over = $self->{over}) {
 			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
 		}
-		$doc->add_boolean_term('Q' . $_) foreach @$mids;
-		$self->{xdb}->replace_document($doc_id = $num, $doc);
 	};
 
 	if ($@) {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$doc_id;
+	$num;
 }
 
 # returns begin and end PostingIterator
-- 
2.17.1


^ permalink raw reply related	[relevance 5%]

* [PATCH 0/3] Making the search indexes optional
@ 2018-07-17 23:27  7% Eric W. Biederman
  2018-07-17 23:30  5% ` [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman
    0 siblings, 2 replies; 4+ results
From: Eric W. Biederman @ 2018-07-17 23:27 UTC (permalink / raw)
  To: Eric Wong; +Cc: meta


Here is the code to make the Xapian search indexes optional.

The first patch makes the term position database optional.
The second patch makes anything in Xapian optional.
Finally the last patch adds a config option.

At the end of the day it all looks simple and straightforward so I feel
good about the code.  At the very least it looks like a good starting
point.

What this code does not do is make the Xapian code modules optional, as
that is more involved and there is not much reward for it.  With a
little cleverness in moving around code that is probably possible in a
follow-up change.

Eric W. Biederman (3):
      SearchIdx.pm: Make indexing search positions optional
      SearchIdx: Add the mechanism for making all Xapian indexing optional
      SearchIdx: Allow the amount of indexing be configured

 lib/PublicInbox/Config.pm    |   2 +-
 lib/PublicInbox/SearchIdx.pm | 255 +++++++++++++++++++++++--------------------
 2 files changed, 137 insertions(+), 120 deletions(-)



^ permalink raw reply	[relevance 7%]

Results 1-4 of 4 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2018-07-17 23:27  7% [PATCH 0/3] Making the search indexes optional Eric W. Biederman
2018-07-17 23:30  5% ` [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman
2018-07-17 23:30     ` [PATCH 3/3] SearchIdx: Allow the amount of indexing be configured Eric W. Biederman
2018-07-18 10:22       ` Eric Wong
2018-07-18 16:00         ` Eric W. Biederman
2018-07-18 16:31           ` Eric Wong
2018-07-18 16:52  6%         ` [PATCH v2 1/3] Making the search indexes optional Eric W. Biederman
2018-07-18 16:53  5%           ` [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).