From 675494c58ef7a39a92c79cbf02975b9da3991c0b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 13 Aug 2015 02:32:22 +0000 Subject: initial search backend implementation This shall allow us to search for replies/threads more easily. --- lib/PublicInbox/Search.pm | 539 +++++++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/SearchMsg.pm | 105 +++++++++ 2 files changed, 644 insertions(+) create mode 100644 lib/PublicInbox/Search.pm create mode 100644 lib/PublicInbox/SearchMsg.pm (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm new file mode 100644 index 00000000..fe4984e5 --- /dev/null +++ b/lib/PublicInbox/Search.pm @@ -0,0 +1,539 @@ +# Copyright (C) 2015, all contributors +# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +# based on notmuch, but with no concept of folders, files or flags +package PublicInbox::Search; +use strict; +use warnings; +use PublicInbox::SearchMsg; +use base qw/Exporter/; +use Digest::SHA qw//; +use Search::Xapian qw/:standard/; +require PublicInbox::View; +use Date::Parse qw/str2time/; +use POSIX qw//; +use Email::MIME; + +our @EXPORT = qw/xpfx mid_compressed/; + +use constant { + TS => 0, + SHA1HEX_LEN => 40, + SCHEMA_VERSION => 0, + LANG => 'english', + QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, +}; + +use constant MID_MAX => SHA1HEX_LEN; + +# setup prefixes +my %bool_pfx_internal = ( + type => 'T', # "mail" or "ghost" + mid => 'Q', # uniQue id (Message-ID or mid_compressed) +); + +my %bool_pfx_external = ( + path => 'XPATH', + thread => 'G', # newsGroup (or similar entity - e.g. 
a web forum name) + references => 'XREFS', + inreplyto => 'XIRT', +); + +my %prob_prefix = ( + subject => 'S', +); + +my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); + +sub xpfx { $all_pfx{$_[0]} } + +our %PFX2TERM_RMAP; +while (my ($k, $v) = each %all_pfx) { + next if $prob_prefix{$k}; + $PFX2TERM_RMAP{$v} = $k; +} + +my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail'); + +# this is idempotent +sub mid_compressed { + my ($mid) = @_; + return $mid if (length($mid) <= MID_MAX); + Digest::SHA::sha1_hex($mid); +} + +sub new { + my ($class, $git_dir, $writable) = @_; + # allow concurrent versions for easier rollback: + my $dir = "$git_dir/public-inbox/xapian" . SCHEMA_VERSION; + my $db; + + if ($writable) { # not used by the WWW interface + require Search::Xapian::WritableDatabase; + require File::Path; + File::Path::mkpath($dir); + $db = Search::Xapian::WritableDatabase->new($dir, + Search::Xapian::DB_CREATE_OR_OPEN); + } else { + $db = Search::Xapian::Database->new($dir); + } + bless { xdb => $db, git_dir => $git_dir }, $class; +} + +sub reopen { $_[0]->{xdb}->reopen } + +sub add_message { + my ($self, $mime) = @_; # mime = Email::MIME object + my $db = $self->{xdb}; + + my $doc_id; + my $mid = clean_mid($mime->header('Message-ID')); + $mid = mid_compressed($mid); + my $was_ghost = 0; + my $ct_msg = $mime->header('Content-Type') || 'text/plain'; + my $enc_msg = PublicInbox::View::enc_for($ct_msg); + + $db->begin_transaction; + eval { + my $smsg = $self->lookup_message($mid); + my $doc; + + if ($smsg) { + $smsg->ensure_metadata; + # convert a ghost to a regular message + # it will also clobber any existing regular message + $smsg->mime($mime); + $doc = $smsg->{doc}; + + my $type = xpfx('type'); + eval { + $doc->remove_term($type . 'ghost'); + $was_ghost = 1; + }; + + # probably does not exist: + eval { $doc->remove_term($type . 'mail') }; + $doc->add_term($type . 
'mail'); + } else { + $smsg = PublicInbox::SearchMsg->new($mime); + $doc = $smsg->{doc}; + $doc->add_term(xpfx('mid') . $mid); + } + + my $subj = $mime->header('Subject'); + $subj = '' unless defined $subj; + + if (length $subj) { + $doc->add_term(xpfx('subject') . $subj); + + my $path = subject_path($subj); + $doc->add_term(xpfx('path') . $path); + } + + my $from = $mime->header('From') || ''; + my @from; + + if ($from) { + @from = Email::Address->parse($from); + $from = $from[0]->name; + } + + my $ts = eval { str2time($mime->header('Date')) } || 0; + my $date = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); + $ts = Search::Xapian::sortable_serialise($ts); + $doc->add_value(PublicInbox::Search::TS, $ts); + + # this is what we show in index results: + $subj =~ tr/\n/ /; + $from =~ tr/\n/ /; + $doc->set_data("$mid\n$subj\n$from\n$date"); + + my $tg = $self->term_generator; + + $tg->set_document($doc); + $tg->index_text($subj, 1, 'S') if $subj; + $tg->increase_termpos; + $tg->index_text($subj) if $subj; + $tg->increase_termpos; + + if (@from) { + $tg->index_text($from[0]->format); + $tg->increase_termpos; + } + + $mime->walk_parts(sub { + my ($part) = @_; + return if $part->subparts; # walk_parts already recurses + my $ct = $part->content_type || $ct_msg; + + # account for filter bugs... 
+ $ct =~ m!\btext/plain\b!i or return; + + my $enc = PublicInbox::View::enc_for($ct, $enc_msg); + my (@orig, @quot); + foreach my $l (split(/\n/, $enc->decode($part->body))) { + if ($l =~ /^\s*>/) { + push @quot, $l; + } else { + push @orig, $l; + } + } + if (@quot) { + $tg->index_text(join("\n", @quot), 0); + $tg->increase_termpos; + } + if (@orig) { + $tg->index_text(join("\n", @orig)); + $tg->increase_termpos; + } + }); + + if ($was_ghost) { + $doc_id = $smsg->doc_id; + $self->link_message($smsg, 0); + $db->replace_document($doc_id, $doc); + } else { + $self->link_message($smsg, 0); + $doc_id = $db->add_document($doc); + } + }; + + if ($@) { + warn "failed to index message <$mid>: $@\n"; + $db->cancel_transaction; + } else { + $db->commit_transaction; + } + $doc_id; +} + +# returns deleted doc_id on success, undef on missing +sub remove_message { + my ($self, $mid) = @_; + my $db = $self->{xdb}; + my $doc_id; + $mid = clean_mid($mid); + $mid = mid_compressed($mid); + + $db->begin_transaction; + eval { + $doc_id = $self->find_unique_doc_id('mid', $mid); + $db->delete_document($doc_id) if defined $doc_id; + }; + + if ($@) { + warn "failed to remove message <$mid>: $@\n"; + $db->cancel_transaction; + } else { + $db->commit_transaction; + } + $db->commit; + $doc_id; +} + +# read-only +sub query { + my ($self, $query_string, $opts) = @_; + my $query = $self->qp->parse_query($query_string, QP_FLAGS); + + $query = Search::Xapian::Query->new(OP_AND, $mail_query, $query); + $self->do_enquire($query, $opts); +} + +# given a message ID, get replies to a message +sub get_replies { + my ($self, $mid, $opts) = @_; + $mid = clean_mid($mid); + $mid = mid_compressed($mid); + my $qp = $self->qp; + my $irt = $qp->parse_query("inreplyto:$mid", 0); + my $ref = $qp->parse_query("references:$mid", 0); + my $query = Search::Xapian::Query->new(OP_OR, $irt, $ref); + + $self->do_enquire($query); +} + +sub get_thread { + my ($self, $mid, $opts) = @_; + my $smsg = eval { 
$self->lookup_message($mid) }; + + return { count => 0, msgs => [] } unless $smsg; + my $qp = $self->qp; + my $qtid = $qp->parse_query('thread:'.$smsg->thread_id); + my $qsub = $qp->parse_query('path:'.$smsg->path); + my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); + $self->do_enquire($query); +} + +# private subs below + +sub do_enquire { + my ($self, $query, $opts) = @_; + my $enquire = $self->enquire; + + $enquire->set_query($query); + $enquire->set_sort_by_relevance_then_value(TS, 0); + $opts ||= {}; + my $offset = $opts->{offset} || 0; + my $limit = $opts->{limit} || 50; + my $mset = $enquire->get_mset($offset, $limit); + my @msgs = map { $_->get_document->get_data } $mset->items; + + { count => $mset->get_matches_estimated, msgs => \@msgs } +} + +# read-write +sub stemmer { Search::Xapian::Stem->new(LANG) } + +# read-only +sub qp { + my ($self) = @_; + + my $qp = $self->{query_parser}; + return $qp if $qp; + + # new parser + $qp = Search::Xapian::QueryParser->new; + $qp->set_default_op(OP_AND); + $qp->set_database($self->{xdb}); + $qp->set_stemmer($self->stemmer); + $qp->set_stemming_strategy(STEM_SOME); + $qp->add_valuerangeprocessor($self->ts_range_processor); + $qp->add_valuerangeprocessor($self->date_range_processor); + + while (my ($name, $prefix) = each %bool_pfx_external) { + $qp->add_boolean_prefix($name, $prefix); + } + + while (my ($name, $prefix) = each %prob_prefix) { + $qp->add_prefix($name, $prefix); + } + + $self->{query_parser} = $qp; +} + +sub term_generator { # write-only + my ($self) = @_; + + my $tg = $self->{term_generator}; + return $tg if $tg; + + $tg = Search::Xapian::TermGenerator->new; + $tg->set_stemmer($self->stemmer); + + $self->{term_generator} = $tg; +} + +sub next_doc_id { $_[0]->{xdb}->get_lastdocid + 1 } + +# increments last_thread_id counter +# returns a 64-bit integer represented as a hex string +sub next_thread_id { + my ($self) = @_; + my $db = $self->{xdb}; + my $last_thread_id = 
int($db->get_metadata('last_thread_id') || 0); + + $db->set_metadata('last_thread_id', ++$last_thread_id); + + $last_thread_id; +} + +sub ts_range_processor { + $_[0]->{tsrp} ||= Search::Xapian::NumberValueRangeProcessor->new(TS); +} + +sub date_range_processor { + $_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS); +} + +sub clean_mid { + my ($mid) = @_; + defined($mid) or die "no Message-ID"; + # MDA->precheck did more checking for us + $mid =~ s/\A\s*?\s*\z//; + $mid; +} + +sub link_message { + my ($self, $smsg, $is_ghost) = @_; + + if ($is_ghost) { + $smsg->ensure_metadata; + } else { + $self->link_message_to_parents($smsg); + } +} + +sub link_message_to_parents { + my ($self, $smsg) = @_; + my $doc = $smsg->{doc}; + my $mid = mid_compressed($smsg->mid); + my $mime = $smsg->mime; + my $refs = $mime->header('References'); + my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); + my $irt = $mime->header('In-Reply-To'); + if ($irt) { + if ($irt =~ /<([^>]+)>/) { + $irt = $1; + } + push @refs, $irt; + } + + my $tid; + if (@refs) { + @refs = map { mid_compressed($_) } @refs; + my %uniq; + @refs = grep { !$uniq{$_}++ } @refs; # uniq + + $doc->add_term(xpfx('inreplyto') . $refs[-1]); + + my $ref_pfx = xpfx('references'); + + # first ref *should* be the thread root, + # but we can never trust clients to do the right thing + my $ref = shift @refs; + $doc->add_term($ref_pfx . $ref); + $tid = $self->_resolve_mid_to_tid($ref); + + # the rest of the refs should point to this tid: + foreach $ref (@refs) { + $doc->add_term($ref_pfx . $ref); + my $ptid = $self->_resolve_mid_to_tid($ref); + if ($tid ne $ptid) { + $self->merge_threads($tid, $ptid); + } + } + } else { + $tid = $self->next_thread_id; + } + $doc->add_term(xpfx('thread') . 
$tid); +} + +sub lookup_message { + my ($self, $mid) = @_; + $mid = clean_mid($mid); + $mid = mid_compressed($mid); + + my $doc_id = $self->find_unique_doc_id('mid', $mid); + my $smsg; + if (defined $doc_id) { + # raises on error: + my $doc = $self->{xdb}->get_document($doc_id); + $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $smsg->doc_id($doc_id); + } + $smsg; +} + +sub find_unique_doc_id { + my ($self, $term, $value) = @_; + + my ($begin, $end) = $self->find_doc_ids($term, $value); + + return undef if $begin->equal($end); # not found + + my $rv = $begin->get_docid; + + # sanity check + $begin->inc; + $begin->equal($end) or die "Term '$term:$value' is not unique\n"; + $rv; +} + +# returns begin and end PostingIterator +sub find_doc_ids { + my ($self, $term, $value) = @_; + + $self->find_doc_ids_for_term(xpfx($term) . $value); +} + +# returns begin and end PostingIterator +sub find_doc_ids_for_term { + my ($self, $term) = @_; + my $db = $self->{xdb}; + + ($db->postlist_begin($term), $db->postlist_end($term)); +} + +# this will create a ghost as necessary +sub _resolve_mid_to_tid { + my ($self, $mid) = @_; + + my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid); + $smsg->thread_id; +} + +sub create_ghost { + my ($self, $mid, $tid) = @_; + + $mid = mid_compressed($mid); + $tid = $self->next_thread_id unless defined $tid; + + my $doc = Search::Xapian::Document->new; + $doc->add_term(xpfx('mid') . $mid); + $doc->add_term(xpfx('thread') . $tid); + $doc->add_term(xpfx('type') . 
'ghost'); + + my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $self->link_message($smsg, 1); + $self->{xdb}->add_document($doc); + + $smsg; +} + +sub merge_threads { + my ($self, $winner_tid, $loser_tid) = @_; + my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); + my $thread_pfx = xpfx('thread'); + my $db = $self->{xdb}; + + for (; $head != $tail; $head->inc) { + my $docid = $head->get_docid; + my $doc = $db->get_document($docid); + $doc->remove_term($thread_pfx . $loser_tid); + $doc->add_term($thread_pfx . $winner_tid); + $db->replace_document($docid, $doc); + } +} + +# normalize subjects so they are suitable as pathnames for URLs +sub subject_path { + my ($subj) = @_; + + $subj =~ s/\A\s+//; + $subj =~ s/\s+\z//; + $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) + $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj; +} + +sub do_cat_mail { + my ($git, $blob) = @_; + my $mime = eval { + my $str = $git->cat_file($blob); + Email::MIME->new($str); + }; + $@ ? undef : $mime; +} + +sub index_blob { + my ($self, $git, $blob) = @_; + my $mime = do_cat_mail($git, $blob) or return; + eval { $self->add_message($mime) }; + warn "W: index_blob $blob: $@\n" if $@; +} + +sub unindex_blob { + my ($self, $git, $blob) = @_; + my $mime = do_cat_mail($git, $blob) or return; + my $mid = $mime->header('Message-ID'); + eval { $self->remove_message($mid) } if defined $mid; + warn "W: unindex_blob $blob: $@\n" if $@; +} + +sub enquire { + my ($self) = @_; + $self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb}); +} + +1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm new file mode 100644 index 00000000..920804ac --- /dev/null +++ b/lib/PublicInbox/SearchMsg.pm @@ -0,0 +1,105 @@ +# Copyright (C) 2015, all contributors +# License: GPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +# based on notmuch, but with no concept of folders, files or flags +package PublicInbox::SearchMsg; +use strict; +use warnings; +use 
Search::Xapian; +our $PFX2TERM_RE = undef; + +sub new { + my ($class, $mime) = @_; + my $doc = Search::Xapian::Document->new; + $doc->add_term(PublicInbox::Search::xpfx('type') . 'mail'); + + bless { type => 'mail', doc => $doc, mime => $mime }, $class; +} + +sub wrap { + my ($class, $doc, $mid) = @_; + bless { doc => $doc, mime => undef, mid => $mid }, $class; +} + +sub ensure_metadata { + my ($self) = @_; + my $doc = $self->{doc}; + my $i = $doc->termlist_begin; + my $end = $doc->termlist_end; + + unless (defined $PFX2TERM_RE) { + my $or = join('|', keys %PublicInbox::Search::PFX2TERM_RMAP); + $PFX2TERM_RE = qr/\A($or)/; + } + + for (; $i != $end; $i->inc) { + my $val = $i->get_termname; + + if ($val =~ s/$PFX2TERM_RE//o) { + my $field = $PublicInbox::Search::PFX2TERM_RMAP{$1}; + if ($field eq 'references') { + my $refs = $self->{references} ||= []; + push @$refs, $val; + } else { + $self->{$field} = $val; + } + } + } +} + +sub mid { + my ($self, $mid) = @_; + + if (defined $mid) { + $self->{mid} = $mid; + } else { + $self->{mid} ||= $self->_extract_mid; + } +} + +sub _extract_mid { + my ($self) = @_; + + my $mid = $self->mime->header('Message-ID'); + if ($mid && $mid =~ /<([^>]+)>/) { + return $1; + } + return $mid; +} + +sub mime { + my ($self, $mime) = @_; + if (defined $mime) { + $self->{mime} = $mime; + } else { + # TODO load from git + $self->{mime}; + } +} + +sub doc_id { + my ($self, $doc_id) = @_; + if (defined $doc_id) { + $self->{doc_id} = $doc_id; + } else { + # TODO load from xapian + $self->{doc_id}; + } +} + +sub thread_id { + my ($self) = @_; + my $tid = $self->{thread}; + return $tid if defined $tid; + $self->ensure_metadata; + $self->{thread}; +} + +sub path { + my ($self) = @_; + my $path = $self->{path}; + return $path if defined $path; + $self->ensure_metadata; + $self->{path}; +} + +1; -- cgit v1.2.3-24-ge0c7 From 885250c3c289c96764e0eb9f432a389136d07088 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 09:28:31 +0000 Subject: 
search: implement index_sync to fixup indexer We need to make the indexer executable and installable while we're at it. --- lib/PublicInbox/Search.pm | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index fe4984e5..15bb9f62 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -226,7 +226,6 @@ sub remove_message { } else { $db->commit_transaction; } - $db->commit; $doc_id; } @@ -536,4 +535,42 @@ sub enquire { $self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb}); } +# indexes all unindexed messages +sub index_sync { + my ($self, $git) = @_; + my $db = $self->{xdb}; + my $latest = $db->get_metadata('last_commit'); + my $range = length $latest ? "$latest..HEAD" : 'HEAD'; + $latest = undef; + + my $hex = '[a-f0-9]'; + my $h40 = $hex .'{40}'; + my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; + my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; + + # get indexed messages + my @cmd = ('git', "--git-dir=$git->{git_dir}", "log", + qw/--reverse --no-notes --no-color --raw -r --no-abbrev/, + $range); + + my $pid = open(my $log, '-|', @cmd) or + die('open` '.join(' ', @cmd) . " pipe failed: $!\n"); + my $last; + while (my $line = <$log>) { + if ($line =~ /$addmsg/o) { + $self->index_blob($git, $1); + } elsif ($line =~ /$delmsg/o) { + $self->unindex_blob($git, $1); + } elsif ($line =~ /^commit ($h40)/o) { + my $commit = $1; + if (defined $latest) { + $db->set_metadata('last_commit', $latest) + } + $latest = $commit; + } + } + close $log; + $db->set_metadata('last_commit', $latest) if defined $latest; +} + 1; -- cgit v1.2.3-24-ge0c7 From d7fcdec712accc212bcfa35e50ade1233eb9beb3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 09:28:32 +0000 Subject: extract redundant Message-ID handling code Quit repeating ourselves and use a common MID module instead. 
--- lib/PublicInbox/Hval.pm | 13 +++---------- lib/PublicInbox/MID.pm | 27 +++++++++++++++++++++++++++ lib/PublicInbox/Search.pm | 31 ++++++------------------------- lib/PublicInbox/View.pm | 8 +++----- 4 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 lib/PublicInbox/MID.pm (limited to 'lib') diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 68f89546..d8b31c84 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -8,6 +8,7 @@ use warnings; use fields qw(raw href); use Encode qw(find_encoding); use URI::Escape qw(uri_escape_utf8); +use PublicInbox::MID qw/mid_clean mid_compressed/; my $enc_ascii = find_encoding('us-ascii'); @@ -25,16 +26,8 @@ sub new { sub new_msgid { my ($class, $msgid) = @_; - $msgid =~ s/\A\s*?\s*\z//; - - if (length($msgid) <= 40) { - $class->new($msgid); - } else { - require Digest::SHA; - my $hex = Digest::SHA::sha1_hex($msgid); - $class->new($msgid, $hex); - } + $msgid = mid_clean($msgid); + $class->new($msgid, mid_compressed($msgid)); } sub new_oneline { diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm new file mode 100644 index 00000000..e5a30a1b --- /dev/null +++ b/lib/PublicInbox/MID.pm @@ -0,0 +1,27 @@ +# Copyright (C) 2015, all contributors +# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +package PublicInbox::MID; +use strict; +use warnings; +use base qw/Exporter/; +our @EXPORT_OK = qw/mid_clean mid_compressed/; +use Digest::SHA qw/sha1_hex/; +use constant MID_MAX => 40; # SHA-1 hex length + +sub mid_clean { + my ($mid) = @_; + defined($mid) or die "no Message-ID"; + # MDA->precheck did more checking for us + $mid =~ s/\A\s*?\s*\z//; + $mid; +} + +# this is idempotent +sub mid_compressed { + my ($mid) = @_; + return $mid if (length($mid) <= MID_MAX); + sha1_hex($mid); +} + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 15bb9f62..e88bfb16 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,25 
+6,22 @@ use strict; use warnings; use PublicInbox::SearchMsg; use base qw/Exporter/; -use Digest::SHA qw//; use Search::Xapian qw/:standard/; require PublicInbox::View; use Date::Parse qw/str2time/; use POSIX qw//; use Email::MIME; +use PublicInbox::MID qw/mid_clean mid_compressed/; -our @EXPORT = qw/xpfx mid_compressed/; +our @EXPORT = qw/xpfx/; use constant { TS => 0, - SHA1HEX_LEN => 40, SCHEMA_VERSION => 0, LANG => 'english', QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; -use constant MID_MAX => SHA1HEX_LEN; - # setup prefixes my %bool_pfx_internal = ( type => 'T', # "mail" or "ghost" @@ -54,13 +51,6 @@ while (my ($k, $v) = each %all_pfx) { my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail'); -# this is idempotent -sub mid_compressed { - my ($mid) = @_; - return $mid if (length($mid) <= MID_MAX); - Digest::SHA::sha1_hex($mid); -} - sub new { my ($class, $git_dir, $writable) = @_; # allow concurrent versions for easier rollback: @@ -86,7 +76,7 @@ sub add_message { my $db = $self->{xdb}; my $doc_id; - my $mid = clean_mid($mime->header('Message-ID')); + my $mid = mid_clean($mime->header('Message-ID')); $mid = mid_compressed($mid); my $was_ghost = 0; my $ct_msg = $mime->header('Content-Type') || 'text/plain'; @@ -211,7 +201,7 @@ sub remove_message { my ($self, $mid) = @_; my $db = $self->{xdb}; my $doc_id; - $mid = clean_mid($mid); + $mid = mid_clean($mid); $mid = mid_compressed($mid); $db->begin_transaction; @@ -241,7 +231,7 @@ sub query { # given a message ID, get replies to a message sub get_replies { my ($self, $mid, $opts) = @_; - $mid = clean_mid($mid); + $mid = mid_clean($mid); $mid = mid_compressed($mid); my $qp = $self->qp; my $irt = $qp->parse_query("inreplyto:$mid", 0); @@ -344,15 +334,6 @@ sub date_range_processor { $_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS); } -sub clean_mid { - my ($mid) = @_; - defined($mid) or die "no Message-ID"; - # MDA->precheck did more checking for us - $mid =~ 
s/\A\s*?\s*\z//; - $mid; -} - sub link_message { my ($self, $smsg, $is_ghost) = @_; @@ -410,7 +391,7 @@ sub link_message_to_parents { sub lookup_message { my ($self, $mid) = @_; - $mid = clean_mid($mid); + $mid = mid_clean($mid); $mid = mid_compressed($mid); my $doc_id = $self->find_unique_doc_id('mid', $mid); diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 30759a30..c2dbb7ed 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -3,11 +3,12 @@ package PublicInbox::View; use strict; use warnings; -use PublicInbox::Hval; use URI::Escape qw/uri_escape_utf8/; use Encode qw/find_encoding/; use Encode::MIME::Header; use Email::MIME::ContentType qw/parse_content_type/; +use PublicInbox::Hval; +use PublicInbox::MID qw/mid_clean mid_compressed/; require POSIX; # TODO: make these constants tunable @@ -366,12 +367,9 @@ sub linkify_refs { } @_); } -require Digest::SHA; sub anchor_for { my ($msgid) = @_; - $msgid =~ s/\A\s*?\s*\z//; - 'm' . Digest::SHA::sha1_hex($msgid); + 'm' . 
mid_compressed(mid_clean($msgid)); } 1; -- cgit v1.2.3-24-ge0c7 From 7edf30e5349ab5566815e5050e9ba0f53e1d0bb9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 09:28:33 +0000 Subject: search: make search results more OO This will relieve callers of the need to decode the data we store internally in Xapian --- lib/PublicInbox/Search.pm | 34 ++++++------------- lib/PublicInbox/SearchMsg.pm | 81 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 87 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index e88bfb16..c9c12c0b 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -8,8 +8,6 @@ use PublicInbox::SearchMsg; use base qw/Exporter/; use Search::Xapian qw/:standard/; require PublicInbox::View; -use Date::Parse qw/str2time/; -use POSIX qw//; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compressed/; @@ -109,8 +107,7 @@ sub add_message { $doc->add_term(xpfx('mid') . $mid); } - my $subj = $mime->header('Subject'); - $subj = '' unless defined $subj; + my $subj = $smsg->subject; if (length $subj) { $doc->add_term(xpfx('subject') . $subj); @@ -119,23 +116,12 @@ sub add_message { $doc->add_term(xpfx('path') . 
$path); } - my $from = $mime->header('From') || ''; - my @from; - - if ($from) { - @from = Email::Address->parse($from); - $from = $from[0]->name; - } - - my $ts = eval { str2time($mime->header('Date')) } || 0; - my $date = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); - $ts = Search::Xapian::sortable_serialise($ts); + my $from = $smsg->from_name; + my $date = $smsg->date; + my $ts = Search::Xapian::sortable_serialise($smsg->ts); $doc->add_value(PublicInbox::Search::TS, $ts); - # this is what we show in index results: - $subj =~ tr/\n/ /; - $from =~ tr/\n/ /; - $doc->set_data("$mid\n$subj\n$from\n$date"); + $doc->set_data($smsg->to_doc_data); my $tg = $self->term_generator; @@ -145,10 +131,8 @@ sub add_message { $tg->index_text($subj) if $subj; $tg->increase_termpos; - if (@from) { - $tg->index_text($from[0]->format); - $tg->increase_termpos; - } + $tg->index_text($smsg->from->format); + $tg->increase_termpos; $mime->walk_parts(sub { my ($part) = @_; @@ -265,7 +249,9 @@ sub do_enquire { my $offset = $opts->{offset} || 0; my $limit = $opts->{limit} || 50; my $mset = $enquire->get_mset($offset, $limit); - my @msgs = map { $_->get_document->get_data } $mset->items; + my @msgs = map { + PublicInbox::SearchMsg->load_doc($_->get_document); + } $mset->items; { count => $mset->get_matches_estimated, msgs => \@msgs } } diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 920804ac..619a19d4 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -5,6 +5,10 @@ package PublicInbox::SearchMsg; use strict; use warnings; use Search::Xapian; +use Email::Address qw//; +use POSIX qw//; +use Date::Parse qw/str2time/; +use PublicInbox::MID qw/mid_clean mid_compressed/; our $PFX2TERM_RE = undef; sub new { @@ -20,6 +24,78 @@ sub wrap { bless { doc => $doc, mime => undef, mid => $mid }, $class; } +sub load_doc { + my ($class, $doc) = @_; + my ($mid, $subj, $from, $date) = split(/\n/, $doc->get_data); + bless { + doc => $doc, + mid => 
$mid, + subject => $subj, + date => $date, + from_name => $from, + }, $class; +} + +sub subject { + my ($self) = @_; + my $subj = $self->{subject}; + return $subj if defined $subj; + $subj = $self->{mime}->header('Subject'); + $subj = '' unless defined $subj; + $subj =~ tr/\n/ /; + $self->{subject} = $subj; +} + +sub from { + my ($self) = @_; + my $from = $self->mime->header('From') || ''; + my @from; + + if ($from) { + @from = Email::Address->parse($from); + $self->{from} = $from[0]; + $from = $from[0]->name; + } + $from =~ tr/\n/ /; + $self->{from_name} = $from; + $self->{from}; +} + +sub from_name { + my ($self) = @_; + my $from_name = $self->{from_name}; + return $from_name if defined $from_name; + $self->from; + $self->{from_name}; +} + +sub ts { + my ($self) = @_; + my $ts = $self->{ts}; + return $ts if $ts; + $self->{date} = undef; + $self->date; + $self->{ts}; +} + +sub date { + my ($self) = @_; + my $date = $self->{date}; + return $date if $date; + my $ts = eval { str2time($self->mime->header('Date')) } || 0; + $self->{ts} = $ts; + $self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); +} + +sub to_doc_data { + my ($self) = @_; + + $self->mid . "\n" . + $self->subject . "\n" . + $self->from_name . "\n". + $self->date; +} + sub ensure_metadata { my ($self) = @_; my $doc = $self->{doc}; @@ -60,10 +136,7 @@ sub _extract_mid { my ($self) = @_; my $mid = $self->mime->header('Message-ID'); - if ($mid && $mid =~ /<([^>]+)>/) { - return $1; - } - return $mid; + $mid ? mid_compressed(mid_clean($mid)) : $mid; } sub mime { -- cgit v1.2.3-24-ge0c7 From 226657eb31e326cc8329229e8ba0f63ff4c75083 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 09:28:34 +0000 Subject: view: display replies in per-message view This can be used to quickly scan for replies in a message without displaying an entire thread. 
--- lib/PublicInbox/SearchMsg.pm | 30 ++++++++++++++- lib/PublicInbox/View.pm | 87 +++++++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/WWW.pm | 20 ++++++++-- 3 files changed, 126 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 619a19d4..550521aa 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -6,9 +6,12 @@ use strict; use warnings; use Search::Xapian; use Email::Address qw//; +use Email::Simple qw//; use POSIX qw//; use Date::Parse qw/str2time/; use PublicInbox::MID qw/mid_clean mid_compressed/; +use Encode qw/find_encoding/; +my $enc_utf8 = find_encoding('UTF-8'); our $PFX2TERM_RE = undef; sub new { @@ -26,7 +29,9 @@ sub wrap { sub load_doc { my ($class, $doc) = @_; - my ($mid, $subj, $from, $date) = split(/\n/, $doc->get_data); + my $data = $doc->get_data; + $data = $enc_utf8->decode($data); + my ($mid, $subj, $from, $date) = split(/\n/, $data); bless { doc => $doc, mid => $mid, @@ -52,11 +57,11 @@ sub from { my @from; if ($from) { + $from =~ tr/\n/ /; @from = Email::Address->parse($from); $self->{from} = $from[0]; $from = $from[0]->name; } - $from =~ tr/\n/ /; $self->{from_name} = $from; $self->{from}; } @@ -122,6 +127,27 @@ sub ensure_metadata { } } +# for threading only +sub mini_mime { + my ($self) = @_; + $self->ensure_metadata; + my @h = ( + Subject => $self->subject, + 'X-PI-From' => $self->from_name, + 'X-PI-Date' => $self->date, + 'X-PI-TS' => $self->ts, + 'Message-ID' => "<$self->{mid}>", + ); + if (my $refs = $self->{references}) { + push @h, References => '<' . join('> <', @$refs) . 
'>'; + } + if (my $irt = $self->{inreplyto}) { + push @h, 'In-Reply-To' => "<$irt>"; + } + + Email::MIME->create(header_str => \@h); +} + sub mid { my ($self, $mid) = @_; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index c2dbb7ed..fcc98ab8 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -22,7 +22,7 @@ my $enc_utf8 = find_encoding('UTF-8'); # public functions: sub msg_html { - my ($class, $mime, $full_pfx, $footer) = @_; + my ($class, $mime, $full_pfx, $footer, $srch) = @_; if (defined $footer) { $footer = "\n" . $footer; } else { @@ -31,7 +31,7 @@ sub msg_html { headers_to_html_header($mime, $full_pfx) . multipart_text_as_html($mime, $full_pfx) . '
' . PRE_WRAP . - html_footer($mime, 1) . $footer . + html_footer($mime, 1, $full_pfx, $srch) . $footer . ''; } @@ -325,7 +325,7 @@ sub headers_to_html_header { } sub html_footer { - my ($mime, $standalone) = @_; + my ($mime, $standalone, $full_pfx, $srch) = @_; my %cc; # everyone else my $to; # this is the From address @@ -344,8 +344,8 @@ sub html_footer { my $subj = $mime->header('Subject') || ''; $subj = "Re: $subj" unless $subj =~ /\bRe:/; - my $irp = uri_escape_utf8( - $mime->header_obj->header_raw('Message-ID') || ''); + my $mid = $mime->header_obj->header_raw('Message-ID'); + my $irp = uri_escape_utf8($mid); delete $cc{$to}; $to = uri_escape_utf8($to); $subj = uri_escape_utf8($subj); @@ -353,9 +353,28 @@ sub html_footer { my $cc = uri_escape_utf8(join(',', sort values %cc)); my $href = "mailto:$to?In-Reply-To=$irp&Cc=${cc}&Subject=$subj"; + my $irt = ''; my $idx = $standalone ? " index" : ''; + if ($idx && $srch) { + my $res = $srch->get_replies($mid); + if (my $c = $res->{count}) { + $c = $c == 1 ? '1 reply' : "$c replies"; + $idx .= "\n$c:\n"; + thread_replies(\$idx, $mime, $res); + } else { + $idx .= "\n(no replies yet)\n"; + } + $irt = $mime->header_obj->header_raw('In-Reply-To'); + if ($irt) { + $irt = PublicInbox::Hval->new_msgid($irt); + $irt = $irt->as_href; + $irt = "parent "; + } else { + $irt = ' ' x length('parent '); + } + } - "reply' . $idx; + "$irtreply' . $idx; } sub linkify_refs { @@ -372,4 +391,60 @@ sub anchor_for { 'm' . 
mid_compressed(mid_clean($msgid)); } +# children are chronological +sub simple_sort_children { + sort { + (eval { $a->topmost->message->header('X-PI-TS') } || 0) <=> + (eval { $b->topmost->message->header('X-PI-TS') } || 0) + } @_; +} + +sub simple_dump { + my ($dst, $root, $node, $level) = @_; + $$dst .= ' ' x $level; + if (my $x = $node->message) { + my $mid = $x->header('Message-ID'); + if ($root->[0] ne $mid) { + my $s = clean_subj($x->header('Subject')); + if ($root->[1] eq $s) { + $s = ' '; + } else { + $s = PublicInbox::Hval->new($s); + $s = $s->as_html . ' '; + } + my $m = PublicInbox::Hval->new_msgid($mid); + my $f = PublicInbox::Hval->new($x->header('X-PI-From')); + my $d = PublicInbox::Hval->new($x->header('X-PI-Date')); + $m = $m->as_href . '.html'; + $f = $f->as_html; + $d = $d->as_html; + $$dst .= "` $s$f @ $d UTC\n"; + } + } + simple_dump($dst, $root, $node->child, $level + 1) if $node->child; + simple_dump($dst, $root, $node->next, $level) if $node->next; +} + +sub clean_subj { + my ($subj) = @_; + $subj =~ s/\A\s+//; + $subj =~ s/\s+\z//; + $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) + $subj =~ s/\s+/ /; + $subj; +} + +sub thread_replies { + my ($dst, $root, $res) = @_; + my @msgs = map { $_->mini_mime } @{$res->{msgs}}; + require PublicInbox::Thread; + $root->header_set('X-PI-TS', '0'); + my $th = PublicInbox::Thread->new($root, @msgs); + $th->thread; + $th->order(sub { simple_sort_children(@_) }); + $root = [ $root->header('Message-ID'), + clean_subj($root->header('Subject')) ]; + simple_dump($dst, $root, $_, 0) for $th->rootset; +} + 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 1814286b..32cc0b27 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -175,8 +175,10 @@ sub get_mid_html { my $pfx = "../f/$mid_href.html"; my $foot = footer($ctx); require Email::MIME; + my $mime = Email::MIME->new($x); + my $srch = searcher($ctx); [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ 
PublicInbox::View->msg_html(Email::MIME->new($x), $pfx, $foot) ] ]; + [ PublicInbox::View->msg_html($mime, $pfx, $foot, $srch) ] ]; } # /$LISTNAME/f/$MESSAGE_ID.html -> HTML content (fullquotes) @@ -185,10 +187,12 @@ sub get_full_html { my $x = mid2blob($ctx); return r404() unless $x; require PublicInbox::View; - require Email::MIME; my $foot = footer($ctx); + require Email::MIME; + my $mime = Email::MIME->new($x); + my $srch = searcher($ctx); [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View->msg_html(Email::MIME->new($x), undef, $foot)] ]; + [ PublicInbox::View->msg_html($mime, undef, $foot, $srch)] ]; } sub self_url { @@ -281,4 +285,14 @@ sub footer { ); } +# search support is optional, returns undef if Xapian is not installed +# or not configured for the given GIT_DIR +sub searcher { + my ($ctx) = @_; + eval { + require PublicInbox::Search; + PublicInbox::Search->new($ctx->{git_dir}); + }; +} + 1; -- cgit v1.2.3-24-ge0c7 From f79d00b03dfa4f3f3c13bee4654d243c1d2fcd97 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 23:41:21 +0000 Subject: thread: common sorting code We'll be sharing the same threading, so it makes sense to sort replies using the same code and message headers without repeating ourselves. This also standardizes on sorting on X-PI-TS (Unix epoch in seconds) instead of using X-PI-Date differently in two different places --- lib/PublicInbox/Feed.pm | 25 +++++-------------------- lib/PublicInbox/Thread.pm | 14 ++++++++++++++ lib/PublicInbox/View.pm | 12 ++---------- 3 files changed, 21 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index f7c2f329..b5325597 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -73,11 +73,13 @@ sub generate_html_index { '' . 
PRE_WRAP; # sort child messages in chronological order - $th->order(sub { mime_sort_children(@_) }); + $th->order(*PublicInbox::Thread::sort_ts); # except we sort top-level messages reverse chronologically my $state = [ time, {}, $first, 0 ]; - for (mime_sort_roots($th)) { dump_msg($_, 0, \$html, $state) } + for (PublicInbox::Thread::rsort_ts($th->rootset)) { + dump_msg($_, 0, \$html, $state) + } Email::Address->purge_cache; my $footer = nav_footer($args->{cgi}, $last, $feed_opts, $state); @@ -299,26 +301,9 @@ sub mime_load_for_sort { my $t = eval { str2time($mime->header('Date')) }; defined($t) or $t = 0; - $mime->header_set('X-PI-Date', $t); + $mime->header_set('X-PI-TS', $t); push @$messages, $mime; 1; } -# children are chronological -sub mime_sort_children { - sort { - $a->topmost->message->header('X-PI-Date') <=> - $b->topmost->message->header('X-PI-Date') - } @_; -} - -# parents are reverse chronological -sub mime_sort_roots { - my ($th) = @_; - sort { - (eval { $b->message->header('X-PI-Date') } || 0) <=> - (eval { $a->message->header('X-PI-Date') } || 0) - } $th->rootset; -} - 1; diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm index 7dabf243..58efb8dc 100644 --- a/lib/PublicInbox/Thread.pm +++ b/lib/PublicInbox/Thread.pm @@ -12,6 +12,20 @@ if ($Mail::Thread::VERSION <= 2.55) { eval q(sub _container_class { 'PublicInbox::Thread::Container' }); } +sub sort_ts { + sort { + (eval { $a->topmost->message->header('X-PI-TS') } || 0) <=> + (eval { $b->topmost->message->header('X-PI-TS') } || 0) + } @_; +} + +sub rsort_ts { + sort { + (eval { $b->topmost->message->header('X-PI-TS') } || 0) <=> + (eval { $a->topmost->message->header('X-PI-TS') } || 0) + } @_; +} + package PublicInbox::Thread::Container; use strict; use warnings; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index fcc98ab8..dcdb3109 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -67,7 +67,7 @@ sub index_entry { $subj = 
PublicInbox::Hval->new_oneline($subj)->as_html; my $pfx = (' ' x $level); - my $ts = $mime->header('X-PI-Date'); + my $ts = $mime->header('X-PI-TS'); my $fmt = '%Y-%m-%d %H:%M UTC'; $ts = POSIX::strftime($fmt, gmtime($ts)); @@ -391,14 +391,6 @@ sub anchor_for { 'm' . mid_compressed(mid_clean($msgid)); } -# children are chronological -sub simple_sort_children { - sort { - (eval { $a->topmost->message->header('X-PI-TS') } || 0) <=> - (eval { $b->topmost->message->header('X-PI-TS') } || 0) - } @_; -} - sub simple_dump { my ($dst, $root, $node, $level) = @_; $$dst .= ' ' x $level; @@ -441,7 +433,7 @@ sub thread_replies { $root->header_set('X-PI-TS', '0'); my $th = PublicInbox::Thread->new($root, @msgs); $th->thread; - $th->order(sub { simple_sort_children(@_) }); + $th->order(*PublicInbox::Thread::sort_ts); $root = [ $root->header('Message-ID'), clean_subj($root->header('Subject')) ]; simple_dump($dst, $root, $_, 0) for $th->rootset; -- cgit v1.2.3-24-ge0c7 From 82f67259c11387e3be45f72723e1940dffacdfc3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Aug 2015 23:57:39 +0000 Subject: view: reply threading adjustment Give changes in subject their own line to reduce line wrapping, but avoid showing any redundant subjects by maintaining a hash of subjects already displayed. 
--- lib/PublicInbox/View.pm | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index dcdb3109..fe4f2dfd 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -9,6 +9,7 @@ use Encode::MIME::Header; use Email::MIME::ContentType qw/parse_content_type/; use PublicInbox::Hval; use PublicInbox::MID qw/mid_clean mid_compressed/; +use Digest::SHA; require POSIX; # TODO: make these constants tunable @@ -393,37 +394,45 @@ sub anchor_for { sub simple_dump { my ($dst, $root, $node, $level) = @_; - $$dst .= ' ' x $level; + my $pfx = ' ' x $level; + $$dst .= $pfx; if (my $x = $node->message) { my $mid = $x->header('Message-ID'); if ($root->[0] ne $mid) { - my $s = clean_subj($x->header('Subject')); - if ($root->[1] eq $s) { - $s = ' '; + my $s = $x->header('Subject'); + my $h = hash_subj($s); + if ($root->[1]->{$h}) { + $s = ''; } else { + $root->[1]->{$h} = 1; $s = PublicInbox::Hval->new($s); - $s = $s->as_html . ' '; + $s = $s->as_html; } my $m = PublicInbox::Hval->new_msgid($mid); my $f = PublicInbox::Hval->new($x->header('X-PI-From')); my $d = PublicInbox::Hval->new($x->header('X-PI-Date')); $m = $m->as_href . '.html'; $f = $f->as_html; - $d = $d->as_html; - $$dst .= "` $s$f @ $d UTC\n"; + $d = $d->as_html . ' UTC'; + if (length($s) == 0) { + $$dst .= "` $f @ $d\n"; + } else { + $$dst .= "` $s\n" . 
+ "$pfx by $f @ $d\n"; + } } } simple_dump($dst, $root, $node->child, $level + 1) if $node->child; simple_dump($dst, $root, $node->next, $level) if $node->next; } -sub clean_subj { +sub hash_subj { my ($subj) = @_; $subj =~ s/\A\s+//; $subj =~ s/\s+\z//; $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) $subj =~ s/\s+/ /; - $subj; + Digest::SHA::sha1($subj); } sub thread_replies { @@ -435,7 +444,7 @@ sub thread_replies { $th->thread; $th->order(*PublicInbox::Thread::sort_ts); $root = [ $root->header('Message-ID'), - clean_subj($root->header('Subject')) ]; + { hash_subj($root->header('Subject')) => 1 } ]; simple_dump($dst, $root, $_, 0) for $th->rootset; } -- cgit v1.2.3-24-ge0c7 From 91e579a19735ba6ddd3cdee95795801732500c3e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 01:42:13 +0000 Subject: view: hoist out index_walk function We will reuse it for thread views when powered by Xapian. --- lib/PublicInbox/View.pm | 77 +++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index fe4f2dfd..66d3bcb8 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -92,41 +92,9 @@ sub index_entry { my $more = 'message'; # scan through all parts, looking for displayable text $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - my $ct = $part->content_type; - - # account for filter bugs... - return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i; - - my $enc = enc_for($ct, $enc_msg); - - if ($part_nr > 0) { - my $fn = $part->filename; - defined($fn) or $fn = "part #" . ($part_nr + 1); - $rv .= $pfx . 
add_filename_line($enc->decode($fn)); - } - - my $s = add_text_body_short($enc, $part, $part_nr, $fhref); - - # drop the remainder of git patches, they're usually better - # to review when the full message is viewed - $s =~ s!^---+\n.*\z!!ms and $more = 'more...'; - - # Drop signatures - $s =~ s/^-- \n.*\z//ms and $more = 'more...'; - - # kill any leading or trailing whitespace - $s =~ s/\A\s+//s; - $s =~ s/\s+\z//s; - - if (length $s) { - # add prefix: - $s =~ s/^/$pfx/sgm; - - $rv .= $s . "\n"; - } - ++$part_nr; + $rv .= index_walk($_[0], $pfx, $enc_msg, $part_nr, $fhref, + \$more); + $part_nr++; }); $rv .= "\n$pfx$more "; @@ -150,6 +118,45 @@ sub index_entry { # only private functions below. +sub index_walk { + my ($part, $pfx, $enc_msg, $part_nr, $fhref, $more) = @_; + my $rv = ''; + return $rv if $part->subparts; # walk_parts already recurses + my $ct = $part->content_type; + + # account for filter bugs... + return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i; + + my $enc = enc_for($ct, $enc_msg); + + if ($part_nr > 0) { + my $fn = $part->filename; + defined($fn) or $fn = "part #" . ($part_nr + 1); + $rv .= $pfx . add_filename_line($enc->decode($fn)); + } + + my $s = add_text_body_short($enc, $part, $part_nr, $fhref); + + # drop the remainder of git patches, they're usually better + # to review when the full message is viewed + $s =~ s!^---+\n.*\z!!ms and $$more = 'more...'; + + # Drop signatures + $s =~ s/^-- \n.*\z//ms and $$more = 'more...'; + + # kill any leading or trailing whitespace + $s =~ s/\A\s+//s; + $s =~ s/\s+\z//s; + + if (length $s) { + # add prefix: + $s =~ s/^/$pfx/sgm; + + $rv .= $s . "\n"; + } + $rv; +} + sub enc_for { my ($ct, $default) = @_; $default ||= $enc_utf8; -- cgit v1.2.3-24-ge0c7 From 57cf47ec49dee9f919460840ae94074ff807b695 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 02:17:14 +0000 Subject: www: /t/$MESSAGE_ID.html for threads This should bring up nearly the entire thread a given Message-ID is linked to. 
--- lib/PublicInbox/View.pm | 150 ++++++++++++++++++++++++++++++++++++++---------- lib/PublicInbox/WWW.pm | 51 +++++++++++++--- 2 files changed, 163 insertions(+), 38 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 66d3bcb8..c40a2a75 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -4,6 +4,7 @@ package PublicInbox::View; use strict; use warnings; use URI::Escape qw/uri_escape_utf8/; +use Date::Parse qw/str2time/; use Encode qw/find_encoding/; use Encode::MIME::Header; use Email::MIME::ContentType qw/parse_content_type/; @@ -16,6 +17,7 @@ require POSIX; use constant MAX_INLINE_QUOTED => 12; # half an 80x24 terminal use constant MAX_TRUNC_LEN => 72; use constant PRE_WRAP => ""; +use constant T_ANCHOR => '#u'; *ascii_html = *PublicInbox::Hval::ascii_html; @@ -43,9 +45,10 @@ sub feed_entry { } # this is already inside a
+# state = [ time, seen = {}, first_commit, page_nr = 0 ]
 sub index_entry {
-	my ($class, $mime, $level, $state) = @_;
-	my ($now, $seen, $first) = @$state;
+	my (undef, $mime, $level, $state) = @_;
+	my (undef, $seen, $first_commit) = @$state;
 	my $midx = $state->[3]++;
 	my ($prev, $next) = ($midx - 1, $midx + 1);
 	my $rv = '';
@@ -67,6 +70,15 @@ sub index_entry {
 	$from = PublicInbox::Hval->new_oneline($from)->as_html;
 	$subj = PublicInbox::Hval->new_oneline($subj)->as_html;
 	my $pfx = ('  ' x $level);
+	my $root_anchor = $seen->{root_anchor};
+	my $path;
+	my $more = 'permalink';
+	if ($root_anchor) {
+		$path = '../';
+		$subj = "$subj" if $root_anchor eq $id;
+	} else {
+		$path = '';
+	}
 
 	my $ts = $mime->header('X-PI-TS');
 	my $fmt = '%Y-%m-%d %H:%M UTC';
@@ -80,16 +92,18 @@ sub index_entry {
 	}
 	$rv .= "\n\n";
 
-	my $irp = $header_obj->header_raw('In-Reply-To');
-	my ($anchor_idx, $anchor);
-	if (defined $irp) {
-		$anchor_idx = anchor_for($irp);
+	my $irt = $header_obj->header_raw('In-Reply-To');
+	my ($anchor_idx, $anchor, $t_anchor);
+	if (defined $irt) {
+		$anchor_idx = anchor_for($irt);
 		$anchor = $seen->{$anchor_idx};
+		$t_anchor = T_ANCHOR;
+	} else {
+		$t_anchor = '';
 	}
 	my $href = $mid->as_href;
-	my $mhref = "m/$href.html";
-	my $fhref = "f/$href.html";
-	my $more = 'message';
+	my $mhref = "${path}m/$href.html";
+	my $fhref = "${path}f/$href.html";
 	# scan through all parts, looking for displayable text
 	$mime->walk_parts(sub {
 		$rv .= index_walk($_[0], $pfx, $enc_msg, $part_nr, $fhref,
@@ -98,24 +112,73 @@ sub index_entry {
 	});
 
 	$rv .= "\n$pfx$more ";
-	my $txt = "m/$href.txt";
+	my $txt = "${path}m/$href.txt";
 	$rv .= "raw ";
 	$rv .= html_footer($mime, 0);
 
-	if (defined $irp) {
+	if (defined $irt) {
 		unless (defined $anchor) {
-			my $v = PublicInbox::Hval->new_msgid($irp);
-			my $html = $v->as_html;
-			$anchor = 'm/' . $v->as_href . '.html';
+			my $v = PublicInbox::Hval->new_msgid($irt);
+			$v = $v->as_href;
+			$anchor = "${path}m/$v.html";
 			$seen->{$anchor_idx} = $anchor;
 		}
 		$rv .= " parent";
 	}
-	$rv .= " threadlink";
+
+	if ($first_commit) {
+		$rv .= " thread";
+	}
 
 	$rv . "\n\n";
 }
 
+sub thread_html {
+	my (undef, $ctx, $foot, $srch) = @_;
+	my $mid = mid_compressed($ctx->{mid});
+	my $res = $srch->get_thread($mid);
+	my $rv = '';
+	require PublicInbox::GitCatFile;
+	my $git = PublicInbox::GitCatFile->new($ctx->{git_dir});
+	my $nr = scalar @{$res->{msgs}};
+	return $rv if $nr == 0;
+	my @msgs;
+	while (my $smsg = shift @{$res->{msgs}}) {
+		my $m = $smsg->mid;
+
+		# Duplicated from WWW.pm
+		my ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);
+
+		unless (defined $x38) {
+			require Digest::SHA;
+			$m = Digest::SHA::sha1_hex($m);
+			($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);
+		}
+
+		# FIXME: duplicated code from Feed.pm
+		my $mime = eval {
+			my $str = $git->cat_file("HEAD:$x2/$x38");
+			Email::MIME->new($str);
+		};
+		unless ($@) {
+			my $t = eval { str2time($mime->header('Date')) };
+			defined($t) or $t = 0;
+			$mime->header_set('X-PI-TS', $t);
+			push @msgs, $mime;
+		}
+	}
+	require PublicInbox::Thread;
+	my $th = PublicInbox::Thread->new(@msgs);
+	$th->thread;
+	$th->order(*PublicInbox::Thread::sort_ts);
+	my $state = [ undef, { root_anchor => anchor_for($mid) }, undef, 0 ];
+	thread_entry(\$rv, $state, $_, 0) for $th->rootset;
+	my $final_anchor = $state->[3];
+	my $next = "end of thread\n";
+
+	$rv .= "

" . PRE_WRAP . $next . $foot . ""; +} + # only private functions below. sub index_walk { @@ -300,17 +363,15 @@ sub headers_to_html_header { my $header_obj = $mime->header_obj; my $mid = $header_obj->header_raw('Message-ID'); - if (defined $mid) { - $mid = PublicInbox::Hval->new_msgid($mid); - $rv .= 'Message-ID: <' . $mid->as_html . '> '; - my $href = $mid->as_href; - $href = "../m/$href" unless $full_pfx; - $rv .= "(raw)\n"; - } + $mid = PublicInbox::Hval->new_msgid($mid); + $rv .= 'Message-ID: <' . $mid->as_html . '> '; + my $href = $mid->as_href; + $href = "../m/$href" unless $full_pfx; + $rv .= "(raw)\n"; - my $irp = $header_obj->header_raw('In-Reply-To'); - if (defined $irp) { - my $v = PublicInbox::Hval->new_msgid($irp); + my $irt = $header_obj->header_raw('In-Reply-To'); + if (defined $irt) { + my $v = PublicInbox::Hval->new_msgid($irt); my $html = $v->as_html; my $href = $v->as_href; $rv .= "In-Reply-To: <"; @@ -319,7 +380,7 @@ sub headers_to_html_header { my $refs = $header_obj->header_raw('References'); if ($refs) { - $refs =~ s/\s*\Q$irp\E\s*// if (defined $irp); + $refs =~ s/\s*\Q$irt\E\s*// if (defined $irt); my @refs = ($refs =~ /<([^>]+)>/g); if (@refs) { $rv .= 'References: '. linkify_refs(@refs) . "\n"; @@ -353,17 +414,20 @@ sub html_footer { my $subj = $mime->header('Subject') || ''; $subj = "Re: $subj" unless $subj =~ /\bRe:/; my $mid = $mime->header_obj->header_raw('Message-ID'); - my $irp = uri_escape_utf8($mid); + my $irt = uri_escape_utf8($mid); delete $cc{$to}; $to = uri_escape_utf8($to); $subj = uri_escape_utf8($subj); my $cc = uri_escape_utf8(join(',', sort values %cc)); - my $href = "mailto:$to?In-Reply-To=$irp&Cc=${cc}&Subject=$subj"; + my $href = "mailto:$to?In-Reply-To=$irt&Cc=${cc}&Subject=$subj"; - my $irt = ''; my $idx = $standalone ? " index" : ''; if ($idx && $srch) { + $irt = $mime->header_obj->header_raw('In-Reply-To') || ''; + $mid = mid_compressed(mid_clean($mid)); + my $t_anchor = length $irt ? 
T_ANCHOR : ''; + $idx = " thread$idx"; my $res = $srch->get_replies($mid); if (my $c = $res->{count}) { $c = $c == 1 ? '1 reply' : "$c replies"; @@ -372,7 +436,6 @@ sub html_footer { } else { $idx .= "\n(no replies yet)\n"; } - $irt = $mime->header_obj->header_raw('In-Reply-To'); if ($irt) { $irt = PublicInbox::Hval->new_msgid($irt); $irt = $irt->as_href; @@ -380,6 +443,8 @@ sub html_footer { } else { $irt = ' ' x length('parent '); } + } else { + $irt = ''; } "$irtreply' . $idx; @@ -455,4 +520,29 @@ sub thread_replies { simple_dump($dst, $root, $_, 0) for $th->rootset; } +sub thread_html_head { + my ($mime) = @_; + my $s = PublicInbox::Hval->new_oneline($mime->header('Subject')); + $s = $s->as_html; + "$s" . PRE_WRAP + +} + +sub thread_entry { + my ($dst, $state, $node, $level) = @_; + # $state = [ $search_res, $seen, undef, 0 (msg_nr) ]; + # $seen is overloaded with 3 types of fields: + # 1) "root" => Message-ID, + # 2) seen subject hashes: sha1(subject) => 1 + # 3) anchors hashes: "#$sha1_hex" (same as $seen in index_entry) + if (my $mime = $node->message) { + if (length($$dst) == 0) { + $$dst .= thread_html_head($mime); + } + $$dst .= index_entry(undef, $mime, $level, $state); + } + thread_entry($dst, $state, $node->child, $level + 1) if $node->child; + thread_entry($dst, $state, $node->next, $level) if $node->next; +} + 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 32cc0b27..52e51c43 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -49,9 +49,15 @@ sub run { } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\.html\z!o) { invalid_list_mid(\%ctx, $1, $2) || get_full_html(\%ctx, $cgi); + # thread display + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) { + invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi); + # convenience redirects, order matters - } elsif ($path_info =~ m!$LISTNAME_RE/(?:m|f)/(\S+)\z!o) { - invalid_list_mid(\%ctx, $1, $2) || redirect_mid(\%ctx, $cgi); + } elsif ($path_info =~ 
m!$LISTNAME_RE/(m|f|t)/(\S+)\z!o) { + my $pfx = $2; + invalid_list_mid(\%ctx, $1, $3) || + redirect_mid(\%ctx, $cgi, $2); } else { r404(); @@ -170,9 +176,7 @@ sub get_mid_html { return r404() unless $x; require PublicInbox::View; - my $mid_href = PublicInbox::Hval::ascii_html( - uri_escape_utf8($ctx->{mid})); - my $pfx = "../f/$mid_href.html"; + my $pfx = msg_pfx($ctx); my $foot = footer($ctx); require Email::MIME; my $mime = Email::MIME->new($x); @@ -195,6 +199,18 @@ sub get_full_html { [ PublicInbox::View->msg_html($mime, undef, $foot, $srch)] ]; } +# /$LISTNAME/t/$MESSAGE_ID.html +sub get_thread { + my ($ctx, $cgi) = @_; + my $srch = searcher($ctx) or return need_search($ctx); + require PublicInbox::View; + my $foot = footer($ctx); + my $body = PublicInbox::View->thread_html($ctx, $foot, $srch) or + return r404(); + [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], + [ $body ] ]; +} + sub self_url { my ($cgi) = @_; ref($cgi) eq 'CGI' ? $cgi->self_url : $cgi->uri->as_string; @@ -206,10 +222,13 @@ sub redirect_list_index { } sub redirect_mid { - my ($ctx, $cgi) = @_; + my ($ctx, $cgi, $pfx) = @_; my $url = self_url($cgi); - $url =~ s!/f/!/m/!; - do_redirect($url . '.html'); + my $anchor = ''; + if (lc($pfx) eq 't') { + $anchor = '#u'; # is used to highlight in View.pm + } + do_redirect($url . ".html$anchor"); } sub do_redirect { @@ -295,4 +314,20 @@ sub searcher { }; } +sub need_search { + my ($ctx) = @_; + my $msg = <Search not available for this +public-inbox
Search is not available for this public-inbox
+Return to index
+EOF + [ 501, [ 'Content-Type' => 'text/html; charset=UTF-8' ], [ $msg ] ]; +} + +sub msg_pfx { + my ($ctx) = @_; + my $href = PublicInbox::Hval::ascii_html(uri_escape_utf8($ctx->{mid})); + "../f/$href.html"; +} + 1; -- cgit v1.2.3-24-ge0c7 From 7745ac38a4e2d4ae42e7192183f65c84a8e5662a Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 07:25:11 +0000 Subject: search: remove unnecessary xpfx export SearchMsg calls it with the full module path anyways. --- lib/PublicInbox/Search.pm | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index c9c12c0b..39b06b0a 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -5,14 +5,11 @@ package PublicInbox::Search; use strict; use warnings; use PublicInbox::SearchMsg; -use base qw/Exporter/; use Search::Xapian qw/:standard/; require PublicInbox::View; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compressed/; -our @EXPORT = qw/xpfx/; - use constant { TS => 0, SCHEMA_VERSION => 0, -- cgit v1.2.3-24-ge0c7 From eb5f82b20944d780ac3b2ff9a926c023da9468fd Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 08:14:40 +0000 Subject: implement /s/$SUBJECT_PATH.html lookups Quick-and-dirty wiring up of lookups to Subject: paths. This may prove more memorizable and easier-to-share than /t/$MESSAGE_ID.html links, but less strict. This changes our schema version to 1, since we now use lower-case subject paths. 
--- lib/PublicInbox/Search.pm | 15 ++++++++-- lib/PublicInbox/View.pm | 72 ++++++++++++++++++++++++++++++++++++++++------- lib/PublicInbox/WWW.pm | 20 ++++++++++++- 3 files changed, 93 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 39b06b0a..f4f00b25 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -12,7 +12,10 @@ use PublicInbox::MID qw/mid_clean mid_compressed/; use constant { TS => 0, - SCHEMA_VERSION => 0, + # SCHEMA_VERSION history + # 0 - initial + # 1 - subject_path is lower-cased + SCHEMA_VERSION => 1, LANG => 'english', QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; @@ -209,6 +212,12 @@ sub query { $self->do_enquire($query, $opts); } +sub get_subject_path { + my ($self, $path, $opts) = @_; + my $query = $self->qp->parse_query("path:$path", 0); + $self->do_enquire($query); +} + # given a message ID, get replies to a message sub get_replies { my ($self, $mid, $opts) = @_; @@ -461,13 +470,13 @@ sub merge_threads { # normalize subjects so they are suitable as pathnames for URLs sub subject_path { - my ($subj) = @_; + my $subj = pop; $subj =~ s/\A\s+//; $subj =~ s/\s+\z//; $subj =~ s/^(?:re|aw):\s*//i; # remove reply prefix (aw: German) $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; - $subj; + lc($subj); } sub do_cat_mail { diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index c40a2a75..696d7d5a 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -31,7 +31,7 @@ sub msg_html { } else { $footer = ''; } - headers_to_html_header($mime, $full_pfx) . + headers_to_html_header($mime, $full_pfx, $srch) . multipart_text_as_html($mime, $full_pfx) . '
' . PRE_WRAP . html_footer($mime, 1, $full_pfx, $srch) . $footer . @@ -179,6 +179,52 @@ sub thread_html { $rv .= "
" . PRE_WRAP . $next . $foot . ""; } +sub subject_path_html { + my (undef, $ctx, $foot, $srch) = @_; + my $path = $ctx->{subject_path}; + my $res = $srch->get_subject_path($path); + my $rv = ''; + require PublicInbox::GitCatFile; + my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); + my $nr = scalar @{$res->{msgs}}; + return $rv if $nr == 0; + my @msgs; + while (my $smsg = shift @{$res->{msgs}}) { + my $m = $smsg->mid; + + # Duplicated from WWW.pm + my ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); + + unless (defined $x38) { + require Digest::SHA; + $m = Digest::SHA::sha1_hex($m); + ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); + } + + # FIXME: duplicated code from Feed.pm + my $mime = eval { + my $str = $git->cat_file("HEAD:$x2/$x38"); + Email::MIME->new($str); + }; + unless ($@) { + my $t = eval { str2time($mime->header('Date')) }; + defined($t) or $t = 0; + $mime->header_set('X-PI-TS', $t); + push @msgs, $mime; + } + } + require PublicInbox::Thread; + my $th = PublicInbox::Thread->new(@msgs); + $th->thread; + $th->order(*PublicInbox::Thread::sort_ts); + my $state = [ undef, { root_anchor => 'dummy' }, undef, 0 ]; + thread_entry(\$rv, $state, $_, 0) for $th->rootset; + my $final_anchor = $state->[3]; + my $next = "end of thread\n"; + + $rv .= "
" . PRE_WRAP . $next . $foot . ""; +} + # only private functions below. sub index_walk { @@ -235,7 +281,7 @@ sub enc_for { } sub multipart_text_as_html { - my ($mime, $full_pfx) = @_; + my ($mime, $full_pfx, $srch) = @_; my $rv = ""; my $part_nr = 0; my $enc_msg = enc_for($mime->header("Content-Type")); @@ -339,7 +385,7 @@ sub add_text_body_full { } sub headers_to_html_header { - my ($mime, $full_pfx) = @_; + my ($mime, $full_pfx, $srch) = @_; my $rv = ""; my @title; @@ -347,18 +393,21 @@ sub headers_to_html_header { my $v = $mime->header($h); defined($v) && length($v) or next; $v = PublicInbox::Hval->new_oneline($v); - $rv .= "$h: " . $v->as_html . "\n"; if ($h eq 'From') { my @from = Email::Address->parse($v->raw); - $v = $from[0]->name; - unless (defined($v) && length($v)) { - $v = '<' . $from[0]->address . '>'; - } - $title[1] = ascii_html($v); + $title[1] = ascii_html($from[0]->name); } elsif ($h eq 'Subject') { $title[0] = $v->as_html; + if ($srch) { + my $path = $srch->subject_path($v->raw); + $rv .= "$h: "; + $rv .= $v->as_html . "\n"; + next; + } } + $rv .= "$h: " . $v->as_html . 
"\n"; + } my $header_obj = $mime->header_obj; @@ -510,6 +559,9 @@ sub hash_subj { sub thread_replies { my ($dst, $root, $res) = @_; my @msgs = map { $_->mini_mime } @{$res->{msgs}}; + foreach (@{$res->{msgs}}) { + print STDERR "smsg->path: <", $_->path, ">\n"; + } require PublicInbox::Thread; $root->header_set('X-PI-TS', '0'); my $th = PublicInbox::Thread->new($root, @msgs); @@ -532,7 +584,7 @@ sub thread_entry { my ($dst, $state, $node, $level) = @_; # $state = [ $search_res, $seen, undef, 0 (msg_nr) ]; # $seen is overloaded with 3 types of fields: - # 1) "root" => Message-ID, + # 1) "root_anchor" => anchor_for(Message-ID), # 2) seen subject hashes: sha1(subject) => 1 # 3) anchors hashes: "#$sha1_hex" (same as $seen in index_entry) if (my $mime = $node->message) { diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 52e51c43..7fe9b85b 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -53,8 +53,13 @@ sub run { } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) { invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi); + # subject_path display + } elsif ($path_info =~ m!$LISTNAME_RE/s/(\S+)\.html\z!o) { + my $sp = $2; + invalid_list(\%ctx, $1) || get_subject_path(\%ctx, $cgi, $sp); + # convenience redirects, order matters - } elsif ($path_info =~ m!$LISTNAME_RE/(m|f|t)/(\S+)\z!o) { + } elsif ($path_info =~ m!$LISTNAME_RE/(m|f|t|s)/(\S+)\z!o) { my $pfx = $2; invalid_list_mid(\%ctx, $1, $3) || redirect_mid(\%ctx, $cgi, $2); @@ -211,6 +216,19 @@ sub get_thread { [ $body ] ]; } +# /$LISTNAME/s/$SUBJECT_PATH.html +sub get_subject_path { + my ($ctx, $cgi, $sp) = @_; + $ctx->{subject_path} = $sp; + my $srch = searcher($ctx) or return need_search($ctx); + require PublicInbox::View; + my $foot = footer($ctx); + my $body = PublicInbox::View->subject_path_html($ctx, $foot, $srch) or + return r404(); + [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], + [ $body ] ]; +} + sub self_url { my ($cgi) = @_; ref($cgi) eq 'CGI' ? 
$cgi->self_url : $cgi->uri->as_string; -- cgit v1.2.3-24-ge0c7 From 5daa5a6a3da3e944c7757b8f3aebedef602f21a4 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 08:32:18 +0000 Subject: SearchMsg: ensure metadata for ghost messages mid Ghosts have no document data in them. Perhaps we should just rely on terms for Message-ID and avoid storing that in the document data... --- lib/PublicInbox/SearchMsg.pm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 550521aa..14a62eb6 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -152,9 +152,10 @@ sub mid { my ($self, $mid) = @_; if (defined $mid) { - $self->{mid} = $mid; + $self->{mid} = $mid; } else { - $self->{mid} ||= $self->_extract_mid; + $self->ensure_metadata; # needed for ghosts + $self->{mid} ||= $self->_extract_mid; } } -- cgit v1.2.3-24-ge0c7 From 9041b136ba7a106ed5ff33da4b6ae28c2a0f4333 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 16 Aug 2015 08:53:41 +0000 Subject: view: deduplicate common code for loading search results More to come later. 
--- lib/PublicInbox/MID.pm | 14 +++++++- lib/PublicInbox/View.pm | 91 +++++++++++++++++-------------------------------- lib/PublicInbox/WWW.pm | 15 ++------ 3 files changed, 48 insertions(+), 72 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index e5a30a1b..d097011b 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -4,7 +4,7 @@ package PublicInbox::MID; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/mid_clean mid_compressed/; +our @EXPORT_OK = qw/mid_clean mid_compressed mid2path/; use Digest::SHA qw/sha1_hex/; use constant MID_MAX => 40; # SHA-1 hex length @@ -24,4 +24,16 @@ sub mid_compressed { sha1_hex($mid); } +sub mid2path { + my ($mid) = @_; + my ($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); + + unless (defined $x38) { + # compatibility with old links (or short Message-IDs :) + $mid = sha1_hex($mid); + ($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); + } + "$x2/$x38"; +} + 1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 696d7d5a..575c5ffd 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -9,7 +9,7 @@ use Encode qw/find_encoding/; use Encode::MIME::Header; use Email::MIME::ContentType qw/parse_content_type/; use PublicInbox::Hval; -use PublicInbox::MID qw/mid_clean mid_compressed/; +use PublicInbox::MID qw/mid_clean mid_compressed mid2path/; use Digest::SHA; require POSIX; @@ -138,37 +138,11 @@ sub thread_html { my $mid = mid_compressed($ctx->{mid}); my $res = $srch->get_thread($mid); my $rv = ''; - require PublicInbox::GitCatFile; - my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); - my $nr = scalar @{$res->{msgs}}; + my $msgs = load_results($ctx, $res); + my $nr = scalar @$msgs; return $rv if $nr == 0; - my @msgs; - while (my $smsg = shift @{$res->{msgs}}) { - my $m = $smsg->mid; - - # Duplicated from WWW.pm - my ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - - unless (defined $x38) { - 
require Digest::SHA; - $m = Digest::SHA::sha1_hex($m); - ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - } - - # FIXME: duplicated code from Feed.pm - my $mime = eval { - my $str = $git->cat_file("HEAD:$x2/$x38"); - Email::MIME->new($str); - }; - unless ($@) { - my $t = eval { str2time($mime->header('Date')) }; - defined($t) or $t = 0; - $mime->header_set('X-PI-TS', $t); - push @msgs, $mime; - } - } require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@msgs); + my $th = PublicInbox::Thread->new(@$msgs); $th->thread; $th->order(*PublicInbox::Thread::sort_ts); my $state = [ undef, { root_anchor => anchor_for($mid) }, undef, 0 ]; @@ -184,37 +158,11 @@ sub subject_path_html { my $path = $ctx->{subject_path}; my $res = $srch->get_subject_path($path); my $rv = ''; - require PublicInbox::GitCatFile; - my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); - my $nr = scalar @{$res->{msgs}}; + my $msgs = load_results($ctx, $res); + my $nr = scalar @$msgs; return $rv if $nr == 0; - my @msgs; - while (my $smsg = shift @{$res->{msgs}}) { - my $m = $smsg->mid; - - # Duplicated from WWW.pm - my ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - - unless (defined $x38) { - require Digest::SHA; - $m = Digest::SHA::sha1_hex($m); - ($x2, $x38) = ($m =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - } - - # FIXME: duplicated code from Feed.pm - my $mime = eval { - my $str = $git->cat_file("HEAD:$x2/$x38"); - Email::MIME->new($str); - }; - unless ($@) { - my $t = eval { str2time($mime->header('Date')) }; - defined($t) or $t = 0; - $mime->header_set('X-PI-TS', $t); - push @msgs, $mime; - } - } require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@msgs); + my $th = PublicInbox::Thread->new(@$msgs); $th->thread; $th->order(*PublicInbox::Thread::sort_ts); my $state = [ undef, { root_anchor => 'dummy' }, undef, 0 ]; @@ -597,4 +545,29 @@ sub thread_entry { thread_entry($dst, $state, $node->next, $level) if $node->next; } +sub load_results { + my ($ctx, 
$res) = @_; + + require PublicInbox::GitCatFile; + my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); + my @msgs; + while (my $smsg = shift @{$res->{msgs}}) { + my $m = $smsg->mid; + my $path = mid2path($m); + + # FIXME: duplicated code from Feed.pm + my $mime = eval { + my $str = $git->cat_file("HEAD:$path"); + Email::MIME->new($str); + }; + unless ($@) { + my $t = eval { str2time($mime->header('Date')) }; + defined($t) or $t = 0; + $mime->header_set('X-PI-TS', $t); + push @msgs, $mime; + } + } + \@msgs; +} + 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 7fe9b85b..bbd438a2 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -141,19 +141,10 @@ sub get_index { # just returns a string ref for the blob in the current ctx sub mid2blob { my ($ctx) = @_; - my $hex = $ctx->{mid}; - my ($x2, $x38) = ($hex =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - - unless (defined $x38) { - # compatibility with old links - require Digest::SHA; - $hex = Digest::SHA::sha1_hex($hex); - ($x2, $x38) = ($hex =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); - defined $x38 or die "BUG: not a SHA-1 hex: $hex"; - } - + require PublicInbox::MID; + my $path = PublicInbox::MID::mid2path($ctx->{mid}); my @cmd = ('git', "--git-dir=$ctx->{git_dir}", - qw(cat-file blob), "HEAD:$x2/$x38"); + qw(cat-file blob), "HEAD:$path"); my $cmd = join(' ', @cmd); my $pid = open my $fh, '-|'; defined $pid or die "fork failed: $!\n"; -- cgit v1.2.3-24-ge0c7