user/dev discussion of public-inbox itself
 help / color / Atom feed
From: "Eric W. Biederman" <ebiederm@xmission.com>
To: Eric Wong <e@80x24.org>
Cc: meta@public-inbox.org, "Eric W. Biederman" <ebiederm@xmission.com>
Subject: [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional
Date: Wed, 18 Jul 2018 11:53:24 -0500
Message-ID: <20180718165325.19834-2-ebiederm@xmission.com> (raw)
In-Reply-To: <87pnzkh4fx.fsf_-_@xmission.com>

Create a new method add_xapian that holds all of the code to create
Xapian indexes.  The creation of this method simpliy involved
idenitifying the relevant code and moving it from add_message.

A call is added to add_xapian from add_message to keep everything
working as it currently does.  The new call is made conditional upon
index levels of 'full' and 'medium'.  The index levels that index
positions and terms the two things public-inbox uses Xapian to index.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 lib/PublicInbox/SearchIdx.pm | 172 ++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 83 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b19618c71508..8978914ab087 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -268,10 +268,95 @@ sub index_body ($$$) {
 	@$lines = ();
 }
 
+sub add_xapian ($$$$$) {
+	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
+	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $doc = $smsg->{doc};
+	my $subj = $smsg->subject;
+	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
+	my @ds = gmtime($smsg->ds);
+	my $yyyymmdd = strftime('%Y%m%d', @ds);
+	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
+	my $dt = strftime('%Y%m%d%H%M%S', @ds);
+	add_val($doc, PublicInbox::Search::DT(), $dt);
+
+	my $tg = $self->term_generator;
+
+	$tg->set_document($doc);
+	$self->index_text($subj, 1, 'S') if $subj;
+	$self->index_users($smsg);
+
+	msg_iter($mime, sub {
+		my ($part, $depth, @idx) = @{$_[0]};
+		my $ct = $part->content_type || 'text/plain';
+		my $fn = $part->filename;
+		if (defined $fn && $fn ne '') {
+			$self->index_text($fn, 1, 'XFN');
+		}
+
+		return if $ct =~ m!\btext/x?html\b!i;
+
+		my $s = eval { $part->body_str };
+		if ($@) {
+			if ($ct =~ m!\btext/plain\b!i) {
+				# Try to assume UTF-8 because Alpine
+				# seems to do wacky things and set
+				# charset=X-UNKNOWN
+				$part->charset_set('UTF-8');
+				$s = eval { $part->body_str };
+				$s = $part->body if $@;
+			}
+		}
+		defined $s or return;
+
+		my (@orig, @quot);
+		my $body = $part->body;
+		my @lines = split(/\n/, $body);
+		while (defined(my $l = shift @lines)) {
+			if ($l =~ /^>/) {
+				$self->index_body(\@orig, $doc) if @orig;
+				push @quot, $l;
+			} else {
+				$self->index_body(\@quot, 0) if @quot;
+				push @orig, $l;
+			}
+		}
+		$self->index_body(\@quot, 0) if @quot;
+		$self->index_body(\@orig, $doc) if @orig;
+	});
+
+	foreach my $mid (@$mids) {
+		$self->index_text($mid, 1, 'XM');
+
+		# because too many Message-IDs are prefixed with
+		# "Pine.LNX."...
+		if ($mid =~ /\w{12,}/) {
+			my @long = ($mid =~ /(\w{3,}+)/g);
+			$self->index_text(join(' ', @long), 1, 'XM');
+		}
+	}
+	$smsg->{to} = $smsg->{cc} = '';
+	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
+	my $data = $smsg->to_doc_data($oid, $mid0);
+	$doc->set_data($data);
+	if (my $altid = $self->{-altid}) {
+		foreach my $alt (@$altid) {
+			my $pfx = $alt->{xprefix};
+			foreach my $mid (@$mids) {
+				my $id = $alt->mid2alt($mid);
+				next unless defined $id;
+				$doc->add_boolean_term($pfx . $id);
+			}
+		}
+	}
+	$doc->add_boolean_term('Q' . $_) foreach @$mids;
+	$self->{xdb}->replace_document($num, $doc);
+}
+
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $doc_id;
+	my $xapianlevels = qr/\A(?:full|medium)\z/;
 	my $mids = mids($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
@@ -279,98 +364,19 @@ sub add_message {
 		$num = index_mm($self, $mime);
 	}
 	eval {
-		my $smsg = PublicInbox::SearchMsg->new($mime);
-		my $doc = $smsg->{doc};
-		my $subj = $smsg->subject;
-		add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-		my @ds = gmtime($smsg->ds);
-		my $yyyymmdd = strftime('%Y%m%d', @ds);
-		add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
-		my $dt = strftime('%Y%m%d%H%M%S', @ds);
-		add_val($doc, PublicInbox::Search::DT(), $dt);
-
-		my $tg = $self->term_generator;
-
-		$tg->set_document($doc);
-		$self->index_text($subj, 1, 'S') if $subj;
-		$self->index_users($smsg);
-
-		msg_iter($mime, sub {
-			my ($part, $depth, @idx) = @{$_[0]};
-			my $ct = $part->content_type || 'text/plain';
-			my $fn = $part->filename;
-			if (defined $fn && $fn ne '') {
-				$self->index_text($fn, 1, 'XFN');
-			}
-
-			return if $ct =~ m!\btext/x?html\b!i;
-
-			my $s = eval { $part->body_str };
-			if ($@) {
-				if ($ct =~ m!\btext/plain\b!i) {
-					# Try to assume UTF-8 because Alpine
-					# seems to do wacky things and set
-					# charset=X-UNKNOWN
-					$part->charset_set('UTF-8');
-					$s = eval { $part->body_str };
-					$s = $part->body if $@;
-				}
-			}
-			defined $s or return;
-
-			my (@orig, @quot);
-			my $body = $part->body;
-			my @lines = split(/\n/, $body);
-			while (defined(my $l = shift @lines)) {
-				if ($l =~ /^>/) {
-					$self->index_body(\@orig, $doc) if @orig;
-					push @quot, $l;
-				} else {
-					$self->index_body(\@quot, 0) if @quot;
-					push @orig, $l;
-				}
-			}
-			$self->index_body(\@quot, 0) if @quot;
-			$self->index_body(\@orig, $doc) if @orig;
-		});
-
-		foreach my $mid (@$mids) {
-			$self->index_text($mid, 1, 'XM');
-
-			# because too many Message-IDs are prefixed with
-			# "Pine.LNX."...
-			if ($mid =~ /\w{12,}/) {
-				my @long = ($mid =~ /(\w{3,}+)/g);
-				$self->index_text(join(' ', @long), 1, 'XM');
-			}
+		if ($self->{indexlevel} =~ $xapianlevels) {
+			$self->add_xapian($mime, $num, $oid, $mids, $mid0)
 		}
-		$smsg->{to} = $smsg->{cc} = '';
-		PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
-		my $data = $smsg->to_doc_data($oid, $mid0);
-		$doc->set_data($data);
-		if (my $altid = $self->{-altid}) {
-			foreach my $alt (@$altid) {
-				my $pfx = $alt->{xprefix};
-				foreach my $mid (@$mids) {
-					my $id = $alt->mid2alt($mid);
-					next unless defined $id;
-					$doc->add_boolean_term($pfx . $id);
-				}
-			}
-		}
-
 		if (my $over = $self->{over}) {
 			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
 		}
-		$doc->add_boolean_term('Q' . $_) foreach @$mids;
-		$self->{xdb}->replace_document($doc_id = $num, $doc);
 	};
 
 	if ($@) {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$doc_id;
+	$num;
 }
 
 # returns begin and end PostingIterator
-- 
2.17.1


  parent reply index

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-17 23:27 [PATCH 0/3] Making the search indexes optional ebiederm
2018-07-17 23:30 ` [PATCH 1/3] SearchIdx.pm: Make indexing search positions optional Eric W. Biederman
2018-07-17 23:30 ` [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman
2018-07-17 23:30 ` [PATCH 3/3] SearchIdx: Allow the amount of indexing be configured Eric W. Biederman
2018-07-18 10:22   ` Eric Wong
2018-07-18 16:00     ` ebiederm
2018-07-18 16:31       ` Eric Wong
2018-07-18 16:52         ` [PATCH v2 1/3] Making the search indexes optional ebiederm
2018-07-18 16:53           ` [PATCH v2 1/3] SearchIdx.pm: Make indexing search positions optional Eric W. Biederman
2018-07-18 16:53           ` Eric W. Biederman [this message]
2018-07-18 16:53           ` [PATCH v2 3/3] SearchIdx: Allow the amount of indexing be configured Eric W. Biederman
2018-07-19 21:51             ` [PATCH] tests: fixup indexlevel setting in tests Eric Wong
2018-07-18 17:32           ` [PATCH v2 3/4] public-inbox-init: Initialize indexlevel ebiederm
2018-07-19  3:52           ` [PATCH v2 1/3] Making the search indexes optional Eric Wong
2018-07-19 18:47             ` ebiederm
2018-07-20  6:58               ` [PATCH] v1: allow upgrading indexlevel=basic to 'medium' or 'full' Eric Wong
2018-07-18 10:17 ` [PATCH 0/3] Making the search indexes optional Eric Wong

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180718165325.19834-2-ebiederm@xmission.com \
    --to=ebiederm@xmission.com \
    --cc=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox