user/dev discussion of public-inbox itself
 help / color / Atom feed
From: "Eric W. Biederman" <ebiederm@xmission.com>
To: Eric Wong <e@80x24.org>
Cc: meta@public-inbox.org, "Eric W. Biederman" <ebiederm@xmission.com>
Subject: [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional
Date: Tue, 17 Jul 2018 18:30:57 -0500
Message-ID: <20180717233058.30820-2-ebiederm@xmission.com> (raw)
In-Reply-To: <87a7qpjve8.fsf@xmission.com>

Create a new method index_message that holds all of the code to create
Xapian indexes.  The creation of this method simply involved
identifying the relevant code and moving it from add_message.

A call is added to index_message from add_message to keep everything
working as it currently does.  The new call is made conditional upon
index levels of 'positions' and 'terms', the two things public-inbox
uses Xapian to index.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 lib/PublicInbox/SearchIdx.pm | 171 ++++++++++++++++++-----------------
 1 file changed, 88 insertions(+), 83 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index cc92c389a152..deb87db3f88a 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -268,10 +268,94 @@ sub index_body ($$$) {
 	@$lines = ();
 }
 
+sub index_message ($$$$$) {
+	my ($self, $mime, $num, $oid, $mids, $mid0) = @_;
+	my $smsg = PublicInbox::SearchMsg->new($mime);
+	my $doc = $smsg->{doc};
+	my $subj = $smsg->subject;
+	add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
+	my @ds = gmtime($smsg->ds);
+	my $yyyymmdd = strftime('%Y%m%d', @ds);
+	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
+	my $dt = strftime('%Y%m%d%H%M%S', @ds);
+	add_val($doc, PublicInbox::Search::DT(), $dt);
+
+	my $tg = $self->term_generator;
+
+	$tg->set_document($doc);
+	$self->index_text($subj, 1, 'S') if $subj;
+	$self->index_users($smsg);
+
+	msg_iter($mime, sub {
+		my ($part, $depth, @idx) = @{$_[0]};
+		my $ct = $part->content_type || 'text/plain';
+		my $fn = $part->filename;
+		if (defined $fn && $fn ne '') {
+			$self->index_text($fn, 1, 'XFN');
+		}
+
+		return if $ct =~ m!\btext/x?html\b!i;
+
+		my $s = eval { $part->body_str };
+		if ($@) {
+			if ($ct =~ m!\btext/plain\b!i) {
+				# Try to assume UTF-8 because Alpine
+				# seems to do wacky things and set
+				# charset=X-UNKNOWN
+				$part->charset_set('UTF-8');
+				$s = eval { $part->body_str };
+				$s = $part->body if $@;
+			}
+		}
+		defined $s or return;
+
+		my (@orig, @quot);
+		my $body = $part->body;
+		my @lines = split(/\n/, $body);
+		while (defined(my $l = shift @lines)) {
+			if ($l =~ /^>/) {
+				$self->index_body(\@orig, $doc) if @orig;
+				push @quot, $l;
+			} else {
+				$self->index_body(\@quot, 0) if @quot;
+				push @orig, $l;
+			}
+		}
+		$self->index_body(\@quot, 0) if @quot;
+		$self->index_body(\@orig, $doc) if @orig;
+	});
+
+	foreach my $mid (@$mids) {
+		$self->index_text($mid, 1, 'XM');
+
+		# because too many Message-IDs are prefixed with
+		# "Pine.LNX."...
+		if ($mid =~ /\w{12,}/) {
+			my @long = ($mid =~ /(\w{3,}+)/g);
+			$self->index_text(join(' ', @long), 1, 'XM');
+		}
+	}
+	$smsg->{to} = $smsg->{cc} = '';
+	PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
+	my $data = $smsg->to_doc_data($oid, $mid0);
+	$doc->set_data($data);
+	if (my $altid = $self->{-altid}) {
+		foreach my $alt (@$altid) {
+			my $pfx = $alt->{xprefix};
+			foreach my $mid (@$mids) {
+				my $id = $alt->mid2alt($mid);
+				next unless defined $id;
+				$doc->add_boolean_term($pfx . $id);
+			}
+		}
+	}
+	$doc->add_boolean_term('Q' . $_) foreach @$mids;
+	$self->{xdb}->replace_document($num, $doc);
+}
+
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $doc_id;
 	my $mids = mids($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
@@ -279,98 +363,19 @@ sub add_message {
 		$num = index_mm($self, $mime);
 	}
 	eval {
-		my $smsg = PublicInbox::SearchMsg->new($mime);
-		my $doc = $smsg->{doc};
-		my $subj = $smsg->subject;
-		add_val($doc, PublicInbox::Search::TS(), $smsg->ts);
-		my @ds = gmtime($smsg->ds);
-		my $yyyymmdd = strftime('%Y%m%d', @ds);
-		add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
-		my $dt = strftime('%Y%m%d%H%M%S', @ds);
-		add_val($doc, PublicInbox::Search::DT(), $dt);
-
-		my $tg = $self->term_generator;
-
-		$tg->set_document($doc);
-		$self->index_text($subj, 1, 'S') if $subj;
-		$self->index_users($smsg);
-
-		msg_iter($mime, sub {
-			my ($part, $depth, @idx) = @{$_[0]};
-			my $ct = $part->content_type || 'text/plain';
-			my $fn = $part->filename;
-			if (defined $fn && $fn ne '') {
-				$self->index_text($fn, 1, 'XFN');
-			}
-
-			return if $ct =~ m!\btext/x?html\b!i;
-
-			my $s = eval { $part->body_str };
-			if ($@) {
-				if ($ct =~ m!\btext/plain\b!i) {
-					# Try to assume UTF-8 because Alpine
-					# seems to do wacky things and set
-					# charset=X-UNKNOWN
-					$part->charset_set('UTF-8');
-					$s = eval { $part->body_str };
-					$s = $part->body if $@;
-				}
-			}
-			defined $s or return;
-
-			my (@orig, @quot);
-			my $body = $part->body;
-			my @lines = split(/\n/, $body);
-			while (defined(my $l = shift @lines)) {
-				if ($l =~ /^>/) {
-					$self->index_body(\@orig, $doc) if @orig;
-					push @quot, $l;
-				} else {
-					$self->index_body(\@quot, 0) if @quot;
-					push @orig, $l;
-				}
-			}
-			$self->index_body(\@quot, 0) if @quot;
-			$self->index_body(\@orig, $doc) if @orig;
-		});
-
-		foreach my $mid (@$mids) {
-			$self->index_text($mid, 1, 'XM');
-
-			# because too many Message-IDs are prefixed with
-			# "Pine.LNX."...
-			if ($mid =~ /\w{12,}/) {
-				my @long = ($mid =~ /(\w{3,}+)/g);
-				$self->index_text(join(' ', @long), 1, 'XM');
-			}
+		if ($self->{indexlevel} =~ m/(positions|terms)/) {
+			$self->index_message($mime, $num, $oid, $mids, $mid0)
 		}
-		$smsg->{to} = $smsg->{cc} = '';
-		PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids);
-		my $data = $smsg->to_doc_data($oid, $mid0);
-		$doc->set_data($data);
-		if (my $altid = $self->{-altid}) {
-			foreach my $alt (@$altid) {
-				my $pfx = $alt->{xprefix};
-				foreach my $mid (@$mids) {
-					my $id = $alt->mid2alt($mid);
-					next unless defined $id;
-					$doc->add_boolean_term($pfx . $id);
-				}
-			}
-		}
-
 		if (my $over = $self->{over}) {
 			$over->add_overview($mime, $bytes, $num, $oid, $mid0);
 		}
-		$doc->add_boolean_term('Q' . $_) foreach @$mids;
-		$self->{xdb}->replace_document($doc_id = $num, $doc);
 	};
 
 	if ($@) {
 		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
-	$doc_id;
+	$num;
 }
 
 # returns begin and end PostingIterator
-- 
2.17.1


  parent reply index

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-17 23:27 [PATCH 0/3] Making the search indexes optional ebiederm
2018-07-17 23:30 ` [PATCH 1/3] SearchIdx.pm: Make indexing search positions optional Eric W. Biederman
2018-07-17 23:30 ` Eric W. Biederman [this message]
2018-07-17 23:30 ` [PATCH 3/3] SearchIdx: Allow the amount of indexing be configured Eric W. Biederman
2018-07-18 10:22   ` Eric Wong
2018-07-18 16:00     ` ebiederm
2018-07-18 16:31       ` Eric Wong
2018-07-18 16:52         ` [PATCH v2 1/3] Making the search indexes optional ebiederm
2018-07-18 16:53           ` [PATCH v2 1/3] SearchIdx.pm: Make indexing search positions optional Eric W. Biederman
2018-07-18 16:53           ` [PATCH v2 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional Eric W. Biederman
2018-07-18 16:53           ` [PATCH v2 3/3] SearchIdx: Allow the amount of indexing be configured Eric W. Biederman
2018-07-19 21:51             ` [PATCH] tests: fixup indexlevel setting in tests Eric Wong
2018-07-18 17:32           ` [PATCH v2 3/4] public-inbox-init: Initialize indexlevel ebiederm
2018-07-19  3:52           ` [PATCH v2 1/3] Making the search indexes optional Eric Wong
2018-07-19 18:47             ` ebiederm
2018-07-20  6:58               ` [PATCH] v1: allow upgrading indexlevel=basic to 'medium' or 'full' Eric Wong
2018-07-18 10:17 ` [PATCH 0/3] Making the search indexes optional Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180717233058.30820-2-ebiederm@xmission.com \
    --to=ebiederm@xmission.com \
    --cc=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox