diff options
-rw-r--r-- | lib/PublicInbox/NNTP.pm | 157 | ||||
-rw-r--r-- | lib/PublicInbox/NewsGroup.pm | 15 | ||||
-rw-r--r-- | lib/PublicInbox/Search.pm | 37 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 59 | ||||
-rw-r--r-- | lib/PublicInbox/SearchMsg.pm | 86 | ||||
-rw-r--r-- | public-inbox-nntpd | 4 | ||||
-rw-r--r-- | t/nntpd.t | 25 | ||||
-rw-r--r-- | t/search.t | 2 |
8 files changed, 236 insertions, 149 deletions
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 12760394..7fe7f2f3 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -5,6 +5,7 @@ use strict; use warnings; use base qw(Danga::Socket); use fields qw(nntpd article rbuf ng long_res); +use PublicInbox::Search; use PublicInbox::Msgmap; use PublicInbox::GitCatFile; use PublicInbox::MID qw(mid2path); @@ -23,10 +24,10 @@ use constant { sub now () { clock_gettime(CLOCK_MONOTONIC) }; -my @OVERVIEW = qw(Subject From Date Message-ID References Bytes Lines); -my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW) . ":\r\n"; -my $LIST_HEADERS = join("\r\n", qw(Subject From Date Message-ID References - :bytes :lines Xref To Cc)) . "\r\n"; +my @OVERVIEW = qw(Subject From Date Message-ID References); +my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines)) . ":\r\n"; +my $LIST_HEADERS = join("\r\n", @OVERVIEW, + qw(:bytes :lines Xref To Cc)) . "\r\n"; # disable commands with easy DoS potential: # LISTGROUP could get pretty bad, too... @@ -614,53 +615,42 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin } } -sub header_obj_for { - my ($srch, $mid) = @_; - eval { - my $smsg = $srch->lookup_message($mid); - $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); - $smsg->mini_mime->header_obj; - }; -}; +sub search_header_for { + my ($srch, $mid, $field) = @_; + my $smsg = $srch->lookup_message($mid) or return; + $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); + $smsg->$field; +} sub hdr_searchmsg ($$$$) { - my ($self, $xhdr, $hdr, $range) = @_; - my $filter; - if ($hdr eq 'date') { - $hdr = 'X-PI-TS'; - $filter = sub ($) { - strftime('%a, %d %b %Y %T %z', gmtime($_[0])); - }; - } - + my ($self, $xhdr, $field, $range) = @_; if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID my ($ng, $n) = mid_lookup($self, $1); return r430 unless $n; - if (my $srch = $ng->search) { - my $m = header_obj_for($srch, $range); - my $v = $m->header($hdr); - $v = $filter->($v) if defined $v && $filter; - hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); - } else { - hdr_slow($self, $xhdr, $hdr, $range); - } + my $v = search_header_for($ng->search, $range, $field); + hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); } else { # numeric range $range = $self->{article} unless defined $range; - my $srch = $self->{ng}->search or - return hdr_slow($self, $xhdr, $hdr, $range); + my $srch = $self->{ng}->search; my $mm = $self->{ng}->mm; my $r = get_range($self, $range); return $r unless ref $r; my ($beg, $end) = @$r; more($self, $xhdr ? r221 : r225); + my $off = 0; $self->long_response($beg, $end, sub { my ($i) = @_; - my $mid = $mm->mid_for($$i) or return; - my $m = header_obj_for($srch, $mid) or return; - my $v = $m->header($hdr); - defined $v or return; - $v = $filter->($v) if $filter; - more($self, "$$i $v"); + my $res = $srch->query_xover($beg, $end, $off); + my $msgs = $res->{msgs}; + my $nr = scalar @$msgs or return; + $off += $nr; + my $tmp = ''; + foreach my $s (@$msgs) { + $tmp .= $s->num . ' ' . $s->$field . "\r\n"; + } + do_more($self, $tmp); + # -1 to adjust for implicit increment in long_response + $$i = $nr ? $$i + $nr - 1 : long_response_limit; }); } } @@ -672,10 +662,13 @@ sub do_hdr ($$$;$) { hdr_message_id($self, $xhdr, $range); } elsif ($sub eq 'xref') { hdr_xref($self, $xhdr, $range); - } elsif ($sub =~ /\A(subject|references|date)\z/) { + } elsif ($sub =~ /\A(?:subject|references|date|from|to|cc| + bytes|lines)\z/x) { hdr_searchmsg($self, $xhdr, $sub, $range); + } elsif ($sub =~ /\A:(bytes|lines)\z/) { + hdr_searchmsg($self, $xhdr, $1, $range); } else { - hdr_slow($self, $xhdr, $header, $range); + $xhdr ? (r221 . "\r\n.") : "503 HDR not permitted on $header"; } } @@ -708,43 +701,16 @@ sub hdr_mid_response ($$$$$$) { my $res = ''; if ($xhdr) { $res .= r221 . "\r\n"; - $res .= "$mid $v\r\n" if defined $v; + $res .= "$mid $v\r\n"; } else { $res .= r225 . "\r\n"; - if (defined $v) { - my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid); - $res .= "$pfx $v\r\n"; - } + my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid); + $res .= "$pfx $v\r\n"; } res($self, $res .= '.'); undef; } -sub hdr_slow ($$$$) { - my ($self, $xhdr, $header, $range) = @_; - - if (defined $range && $range =~ /\A<.+>\z/) { # Message-ID - my $r = $self->art_lookup($range, 2); - return $r unless ref $r; - my ($n, $ng) = ($r->[0], $r->[5]); - my $v = hdr_val($r, $header); - hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); - } else { # numeric range - $range = $self->{article} unless defined $range; - my $r = get_range($self, $range); - return $r unless ref $r; - my ($beg, $end) = @$r; - more($self, $xhdr ? r221 : r225); - $self->long_response($beg, $end, sub { - my ($i) = @_; - $r = $self->art_lookup($$i, 2); - return unless ref $r; - defined($r = hdr_val($r, $header)) or return; - more($self, "$$i $r"); - }); - } -} - sub cmd_xrover ($;$) { my ($self, $range) = @_; my $ng = $self->{ng} or return '412 no newsgroup selected'; @@ -761,32 +727,38 @@ sub cmd_xrover ($;$) { $self->long_response($beg, $end, sub { my ($i) = @_; my $mid = $mm->mid_for($$i) or return; - my $m = header_obj_for($srch, $mid) or return; - my $h = $m->header('references'); - more($self, "$$i $h") if defined $h; + my $h = search_header_for($srch, $mid, 'references'); + more($self, "$$i $h"); }); } sub over_line ($$) { - my ($self, $r) = @_; - - more($self, join("\t", $r->[0], map { - my $h = hdr_val($r, $_); - defined $h ? $h : ''; - } @OVERVIEW )); + my ($num, $smsg) = @_; + # n.b. field access and procedural calls can be + # 10%-15% faster than OO method calls: + join("\t", $num, + $smsg->{subject}, + $smsg->{from}, + PublicInbox::SearchMsg::date($smsg), + '<'.PublicInbox::SearchMsg::mid($smsg).'>', + $smsg->{references}, + PublicInbox::SearchMsg::bytes($smsg), + PublicInbox::SearchMsg::lines($smsg)); } sub cmd_over ($;$) { my ($self, $range) = @_; - if ($range && $range =~ /\A<.+>\z/) { - my $r = $self->art_lookup($range, 2); - return '430 No article with that message-id' unless ref $r; + if ($range && $range =~ /\A<(.+)>\z/) { + my ($ng, $n) = mid_lookup($self, $1); + my $smsg = $ng->search->lookup_message($range) or + return '430 No article with that message-id'; more($self, '224 Overview information follows (multi-line)'); + $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); # Only set article number column if it's the current group - my $ng = $self->{ng}; - $r->[0] = 0 if (!$ng || $ng ne $r->[5]); - over_line($self, $r); + my $self_ng = $self->{ng}; + $n = 0 if (!$self_ng || $self_ng ne $ng); + more($self, over_line($n, $smsg)); '.'; } else { cmd_xover($self, $range); @@ -800,11 +772,22 @@ sub cmd_xover ($;$) { return $r unless ref $r; my ($beg, $end) = @$r; more($self, "224 Overview information follows for $beg to $end"); + my $srch = $self->{ng}->search; + my $off = 0; $self->long_response($beg, $end, sub { my ($i) = @_; - my $r = $self->art_lookup($$i, 2); - return unless ref $r; - over_line($self, $r); + my $res = $srch->query_xover($beg, $end, $off); + my $msgs = $res->{msgs}; + my $nr = scalar @$msgs or return; + $off += $nr; + + # OVERVIEW.FMT + more($self, join("\r\n", map { + over_line(PublicInbox::SearchMsg::num($_), $_); + } @$msgs)); + + # -1 to adjust for implicit increment in long_response + $$i = $nr ? $$i + $nr - 1 : long_response_limit; }); } diff --git a/lib/PublicInbox/NewsGroup.pm b/lib/PublicInbox/NewsGroup.pm index 1250b0d8..02e90111 100644 --- a/lib/PublicInbox/NewsGroup.pm +++ b/lib/PublicInbox/NewsGroup.pm @@ -6,6 +6,7 @@ use warnings; use Scalar::Util qw(weaken); require Danga::Socket; require PublicInbox::Msgmap; +require PublicInbox::Search; require PublicInbox::GitCatFile; sub new { @@ -36,11 +37,16 @@ sub gcf { }; } +sub usable { + my ($self) = @_; + eval { + PublicInbox::Msgmap->new($self->{git_dir}); + PublicInbox::Search->new($self->{git_dir}); + }; +} + sub mm { - my ($self, $check_only) = @_; - if ($check_only) { - return eval { PublicInbox::Msgmap->new($self->{git_dir}) }; - } + my ($self) = @_; $self->{mm} ||= eval { my $mm = PublicInbox::Msgmap->new($self->{git_dir}); @@ -53,7 +59,6 @@ sub mm { sub search { my ($self) = @_; $self->{search} ||= eval { - require PublicInbox::Search; my $search = PublicInbox::Search->new($self->{git_dir}); # may be needed if we run low on handles diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 695d56b3..1d13f4b8 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -4,7 +4,13 @@ package PublicInbox::Search; use strict; use warnings; -use constant TS => 0; + +# values for searching +use constant TS => 0; # timestamp +use constant NUM => 1; # NNTP article number +use constant BYTES => 2; # :bytes as defined in RFC 3977 +use constant LINES => 3; # :lines as defined in RFC 3977 + use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; use Email::MIME; @@ -26,8 +32,9 @@ use constant { # 6 - preserve References: order in document data # 7 - remove references and inreplyto terms # 8 - remove redundant/unneeded document data - # 9 - disable Message-ID compression - SCHEMA_VERSION => 9, + # 9 - disable Message-ID compression (SHA-1) + # 10 - optimize doc for NNTP overviews + SCHEMA_VERSION => 10, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector @@ -168,6 +175,30 @@ sub date_range_processor { $_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS); } +sub num_range_processor { + $_[0]->{nrp} ||= Search::Xapian::NumberValueRangeProcessor->new(NUM); +} + +# only used for NNTP server +sub query_xover { + my ($self, $beg, $end, $offset) = @_; + my $enquire = $self->enquire; + my $qp = Search::Xapian::QueryParser->new; + $qp->set_database($self->{xdb}); + $qp->add_valuerangeprocessor($self->num_range_processor); + my $query = $qp->parse_query("$beg..$end", QP_FLAGS); + $query = Search::Xapian::Query->new(OP_AND, $mail_query, $query); + $enquire->set_query($query); + $enquire->set_sort_by_value(NUM, 0); + my $limit = 200; + my $mset = $enquire->get_mset($offset, $limit); + my @msgs = map { + PublicInbox::SearchMsg->load_doc($_->get_document); + } $mset->items; + + { total => $mset->get_matches_estimated, msgs => \@msgs } +} + sub lookup_message { my ($self, $mid) = @_; $mid = mid_clean($mid); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 87243268..4b43369f 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -36,8 +36,14 @@ sub new { $self; } +sub add_val { + my ($doc, $col, $num) = @_; + $num = Search::Xapian::sortable_serialise($num); + $doc->add_value($col, $num); +} + sub add_message { - my ($self, $mime) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object my $db = $self->{xdb}; my $doc_id; @@ -80,8 +86,16 @@ sub add_message { $doc->add_term(xpfx('path') . mid_compress($path)); } - my $ts = Search::Xapian::sortable_serialise($smsg->ts); - $doc->add_value(PublicInbox::Search::TS, $ts); + add_val($doc, &PublicInbox::Search::TS, $smsg->ts); + + defined($num) and + add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and + add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $mime->body_raw =~ tr!\n!\n!); my $tg = $self->term_generator; @@ -91,7 +105,7 @@ sub add_message { $tg->index_text($subj) if $subj; $tg->increase_termpos; - $tg->index_text($smsg->from->format); + $tg->index_text($smsg->from); $tg->increase_termpos; $mime->walk_parts(sub { @@ -224,7 +238,7 @@ sub link_message_to_parents { } } if (@refs) { - $smsg->{references_sorted} = '<'.join('><', @refs).'>'; + $smsg->{references} = '<'.join('> <', @refs).'>'; # first ref *should* be the thread root, # but we can never trust clients to do the right thing @@ -245,8 +259,8 @@ sub link_message_to_parents { } sub index_blob { - my ($self, $git, $mime) = @_; - $self->add_message($mime); + my ($self, $git, $mime, $bytes, $num) = @_; + $self->add_message($mime, $bytes, $num); } sub unindex_blob { @@ -265,10 +279,22 @@ sub unindex_mm { $self->{mm}->mid_delete(mid_clean($mime->header('Message-ID'))); } -sub index_both { +sub index_mm2 { + my ($self, $git, $mime, $bytes) = @_; + my $num = $self->{mm}->num_for(mid_clean($mime->header('Message-ID'))); + index_blob($self, $git, $mime, $bytes, $num); +} + +sub unindex_mm2 { my ($self, $git, $mime) = @_; - index_blob($self, $git, $mime); - index_mm($self, $git, $mime); + $self->{mm}->mid_delete(mid_clean($mime->header('Message-ID'))); + unindex_blob($self, $git, $mime); +} + +sub index_both { + my ($self, $git, $mime, $bytes) = @_; + my $num = index_mm($self, $git, $mime); + index_blob($self, $git, $mime, $bytes, $num); } sub unindex_both { @@ -278,9 +304,9 @@ sub unindex_both { } sub do_cat_mail { - my ($git, $blob) = @_; + my ($git, $blob, $sizeref) = @_; my $mime = eval { - my $str = $git->cat_file($blob); + my $str = $git->cat_file($blob, $sizeref); Email::MIME->new($str); }; $@ ? undef : $mime; @@ -304,12 +330,13 @@ sub rlog { qw/--reverse --no-notes --no-color --raw -r --no-abbrev/, $range); my $latest; + my $bytes; my $pid = open(my $log, '-|', @cmd) or die('open` '.join(' ', @cmd) . " pipe failed: $!\n"); while (my $line = <$log>) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1) or next; - $add_cb->($self, $git, $mime); + my $mime = do_cat_mail($git, $1, \$bytes) or next; + $add_cb->($self, $git, $mime, $bytes); } elsif ($line =~ /$delmsg/o) { my $mime = do_cat_mail($git, $1) or next; $del_cb->($self, $git, $mime); @@ -354,11 +381,11 @@ sub _index_sync { $mm->{dbh}->commit; $mm->last_commit($lm) if defined $lm; - goto xapian_only; + $lx = $self->rlog($range, *index_mm2, *unindex_mm2); + $db->set_metadata('last_commit', $lx) if defined $lx; } } else { # user didn't install DBD::SQLite and DBI -xapian_only: $lx = $self->rlog($range, *index_blob, *unindex_blob); $db->set_metadata('last_commit', $lx) if defined $lx; } diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 8c55c929..8d49ee20 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -14,6 +14,7 @@ use Encode qw/find_encoding/; my $enc_utf8 = find_encoding('UTF-8'); our $PFX2TERM_RE = undef; use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000'; +use POSIX qw(strftime); sub new { my ($class, $mime) = @_; @@ -28,48 +29,67 @@ sub wrap { bless { doc => $doc, mime => undef, mid => $mid }, $class; } +sub get_val ($$) { + my ($doc, $col) = @_; + Search::Xapian::sortable_unserialise($doc->get_value($col)); +} + sub load_doc { my ($class, $doc) = @_; my $data = $doc->get_data; - my $ts = eval { - no strict 'subs'; - $doc->get_value(PublicInbox::Search::TS); - }; - $ts = Search::Xapian::sortable_unserialise($ts); + my $ts = get_val($doc, &PublicInbox::Search::TS); $data = $enc_utf8->decode($data); - my ($subj, $from, $refs) = split(/\n/, $data); + my ($subj, $from, $refs, $to, $cc) = split(/\n/, $data); bless { doc => $doc, subject => $subj, ts => $ts, - from_name => $from, - references_sorted => $refs, + from => $from, + references => $refs, + to => $to, + cc => $cc, }, $class; } -sub subject { +# :bytes and :lines metadata in RFC 3977 +sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) } +sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) } +sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) } + +sub __hdr ($$) { + my ($self, $field) = @_; + my $val = $self->{$field}; + return $val if defined $val; + + my $mime = $self->{mime} or return; + $val = $mime->header($field); + $val = '' unless defined $val; + $val =~ tr/\t\r\n/ /; + $self->{$field} = $val; +} + +sub subject ($) { __hdr($_[0], 'subject') } +sub to ($) { __hdr($_[0], 'to') } +sub cc ($) { __hdr($_[0], 'cc') } + +sub date ($) { my ($self) = @_; - my $subj = $self->{subject}; - return $subj if defined $subj; - $subj = $self->{mime}->header('Subject'); - $subj = '' unless defined $subj; - $subj =~ tr/\n/ /; - $self->{subject} = $subj; + my $date = __hdr($self, 'date'); + return $date if defined $date; + my $ts = $self->{ts}; + return unless defined $ts; + $self->{date} = strftime('%a, %d %b %Y %T %z', gmtime($ts)); } -sub from { +sub from ($) { my ($self) = @_; - my $from = $self->mime->header('From') || ''; - my @from; - - if ($from) { - $from =~ tr/\n/ /; - @from = Email::Address->parse($from); - $self->{from} = $from[0]; - $from = $from[0]->name; + my $from = __hdr($self, 'from'); + if (defined $from && !defined $self->{from_name}) { + $from =~ tr/\t\r\n/ /; + my @from = Email::Address->parse($from); + $self->{from_name} = $from[0]->name; } - $self->{from_name} = $from; - $self->{from}; + $from; } sub from_name { @@ -87,14 +107,13 @@ sub ts { sub to_doc_data { my ($self) = @_; - PublicInbox::Search::subject_summary($self->subject) . "\n" . - $self->from_name . "\n". - $self->references_sorted; + join("\n", $self->subject, $self->from, $self->references, + $self->to, $self->cc); } -sub references_sorted { +sub references { my ($self) = @_; - my $x = $self->{references_sorted}; + my $x = $self->{references}; defined $x ? $x : ''; } @@ -140,8 +159,7 @@ sub mini_mime { 'Message-ID' => "<$self->{mid}>", 'X-PI-TS' => $self->ts, ); - if (my $refs = $self->{references_sorted}) { - $refs =~ s/></> </g; + if (my $refs = $self->{references}) { push @h, References => $refs; } my $mime = Email::MIME->create(header_str => \@hs, header => \@h); @@ -156,7 +174,7 @@ sub mini_mime { $mime; } -sub mid { +sub mid ($;$) { my ($self, $mid) = @_; if (defined $mid) { diff --git a/public-inbox-nntpd b/public-inbox-nntpd index 82d68383..0035637d 100644 --- a/public-inbox-nntpd +++ b/public-inbox-nntpd @@ -52,8 +52,8 @@ sub refresh_groups () { $ng = $old_ng; } - # Only valid if Msgmap works - if ($ng->mm(1)) { + # Only valid if msgmap and search works + if ($ng->usable) { $new->{$g} = $ng; push @list, $ng; } @@ -151,7 +151,30 @@ EOF '<nntp@example.com>', '', '202', - '1' ] }, "XOVER works"); + '1' ] }, "XOVER range works"); + + is_deeply($n->xover('1'), { + '1' => ['hihi', + 'Me <me@example.com>', + 'Thu, 01 Jan 1970 06:06:06 +0000', + '<nntp@example.com>', + '', + '202', + '1' ] }, "XOVER by article works"); + + { + syswrite($s, "OVER $mid\r\n"); + $buf = ''; + do { + sysread($s, $buf, 4096, length($buf)); + } until ($buf =~ /^[^2]../ || $buf =~ /\r\n\.\r\n\z/); + my @r = split("\r\n", $buf); + like($r[0], qr/^224 /, 'got 224 response for OVER'); + is($r[1], "0\thihi\tMe <me\@example.com>\t" . + "Thu, 01 Jan 1970 06:06:06 +0000\t" . + "$mid\t\t202\t1", 'OVER by Message-ID works'); + is($r[2], '.', 'correctly terminated response'); + } ok(kill('TERM', $pid), 'killed nntpd'); $pid = undef; @@ -285,7 +285,7 @@ sub filter_mids { ok($doc_id > 0, "doc_id defined with circular reference"); my $smsg = $rw->lookup_message('circle@a'); $smsg->ensure_metadata; - is($smsg->references_sorted, '', "no references created"); + is($smsg->references, '', "no references created"); } done_testing(); |