diff options
Diffstat (limited to 'lib/PublicInbox/SearchIdx.pm')
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 190 |
1 files changed, 100 insertions, 90 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 257b83a5..4fd493d9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -9,7 +9,8 @@ package PublicInbox::SearchIdx; use strict; use v5.10.1; -use parent qw(PublicInbox::Search PublicInbox::Lock Exporter); +use parent qw(PublicInbox::Search PublicInbox::Lock PublicInbox::Umask + Exporter); use PublicInbox::Eml; use PublicInbox::Search qw(xap_terms); use PublicInbox::InboxWritable; @@ -21,7 +22,7 @@ use POSIX qw(strftime); use Fcntl qw(SEEK_SET); use Time::Local qw(timegm); use PublicInbox::OverIdx; -use PublicInbox::Spawn qw(spawn); +use PublicInbox::Spawn qw(run_wait popen_rd); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::Address; @@ -37,12 +38,13 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : # typical 32-bit system: (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); use constant DEBUG => !!$ENV{DEBUG}; -my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/; +my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; -my @VMD_MAP = (kw => 'K', L => 'L'); +my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; +our $PATCHID_BROKEN; sub new { my ($class, $ibx, $creat, $shard) = @_; @@ -62,6 +64,7 @@ sub new { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } + undef $PATCHID_BROKEN; # retry on new instances in case of upgrades $ibx = PublicInbox::InboxWritable->new($ibx); my $self = PublicInbox::Search->new($ibx); bless $self, $class; @@ -90,7 +93,7 @@ sub new { $self; } -sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels } +sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels } sub idx_release { my ($self, $wake) = @_; @@ -113,15 +116,15 @@ sub load_xapian_writable () { *sortable_serialise = $xap.'::sortable_serialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); - my $ver = (eval($xap.'::major_version()') << 16) | - (eval($xap.'::minor_version()') << 8) | - eval($xap.'::revision()'); - if ($ver >= 0x10400) { + my $ver = eval 'v'.join('.', eval($xap.'::major_version()'), + eval($xap.'::minor_version()'), + eval($xap.'::revision()')); + if ($ver ge v1.4) { # new flags in Xapian 1.4 $DB_NO_SYNC = 0x4; $DB_DANGEROUS = 0x10; } # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks - $X->{CLOEXEC_UNSET} = 1 if $ver >= 0x010215 && $ver <= 0x010218; + $X->{CLOEXEC_UNSET} = 1 if $ver ge v1.2.21 && $ver le v1.2.24; 1; } @@ -134,6 +137,7 @@ sub idx_acquire { load_xapian_writable(); $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } + my $owner = $self->{ibx} // $self->{eidx} // $self; if ($self->{creat}) { require File::Path; $self->lock_acquire; @@ -145,14 +149,13 @@ sub idx_acquire { File::Path::mkpath($dir); require PublicInbox::Syscall; PublicInbox::Syscall::nodatacow_dir($dir); - $self->{-set_has_threadid_once} = 1; - if (($self->{ibx} // $self->{eidx})->{-dangerous}) { - $flag |= $DB_DANGEROUS; - } + # owner == self for CodeSearchIdx + $self->{-set_has_threadid_once} = 1 if $owner != $self; + $flag |= $DB_DANGEROUS if $owner->{-dangerous}; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; + $flag |= $DB_NO_SYNC if $owner->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -177,9 +180,8 @@ sub term_generator ($) { # write-only sub index_phrase ($$$$) { my ($self, $text, $wdf_inc, $prefix) = @_; - my $tg = term_generator($self); - $tg->index_text($text, $wdf_inc, $prefix); - $tg->increase_termpos; + term_generator($self)->index_text($text, $wdf_inc, $prefix); + $self->{term_generator}->increase_termpos; } sub index_text ($$$$) { @@ -188,8 +190,8 @@ sub index_text ($$$$) { if ($self->{indexlevel} eq 'full') { index_phrase($self, $text, $wdf_inc, $prefix); } else { - my $tg = term_generator($self); - $tg->index_text_without_positions($text, $wdf_inc, $prefix); + term_generator($self)->index_text_without_positions( + $text, $wdf_inc, $prefix); } } @@ -270,7 +272,7 @@ sub index_diff ($$$) { push @$xnq, shift(@l); # skip base85 and empty lines - while (@l && ($l[0] =~ /$BASE85/o || + while (@l && ($l[0] =~ /\A$BASE85\h*\z/o || $l[0] !~ /\S/)) { shift @l; } @@ -350,6 +352,52 @@ sub index_diff ($$$) { index_text($self, join("\n", @$xnq), 1, 'XNQ'); } +sub index_body_text { + my ($self, $doc, $sref) = @_; + my $rd; + # start patch-id in parallel + if ($$sref =~ /^(?:diff|---|\+\+\+) /ms && !$PATCHID_BROKEN) { + my $git = ($self->{ibx} // $self->{eidx} // $self)->git; + my $fh = PublicInbox::IO::write_file '+>:utf8', undef, $$sref; + $fh->flush or die "flush: $!"; + sysseek($fh, 0, SEEK_SET); + $rd = popen_rd($git->cmd(qw(patch-id --stable)), undef, + { 0 => $fh }); + } + + # split off quoted and unquoted blocks: + my @sections = PublicInbox::MsgIter::split_quotes($$sref); + undef $$sref; # free memory + for my $txt (@sections) { + if ($txt =~ /\A>/) { + if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { + # get rid of Base-85 noise + $txt =~ s/^([>\h]+(?:literal|delta) + \x20[0-9]+\r?\n) + (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; + } + index_text($self, $txt, 0, 'XQUOT'); + } else { # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + index_diff($self, \$txt, $doc); + } else { + index_text($self, $txt, 1, 'XNQ'); + } + } + undef $txt; # free memory + } + if (defined $rd) { # reap `git patch-id' + (readline($rd) // '') =~ /\A([a-f0-9]{40,})/ and + $doc->add_term('XDFID'.$1); + if (!$rd->close) { + my $c = 'git patch-id --stable'; + $PATCHID_BROKEN = ($? >> 8) == 129; + $PATCHID_BROKEN ? warn("W: $c requires git v2.1.0+\n") + : warn("W: $c failed: \$?=$? (non-fatal)"); + } + } +} + sub index_xapian { # msg_iter callback my $part = $_[0]->[0]; # ignore $depth and $idx my ($self, $doc) = @{$_[1]}; @@ -369,37 +417,7 @@ sub index_xapian { # msg_iter callback my ($s, undef) = msg_part_text($part, $ct); defined $s or return; $_[0]->[0] = $part = undef; # free memory - - if ($s =~ /^(?:diff|---|\+\+\+) /ms) { - open(my $fh, '+>:utf8', undef) or die "open: $!"; - open(my $eh, '+>', undef) or die "open: $!"; - $fh->autoflush(1); - print $fh $s or die "print: $!"; - sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; - my $id = ($self->{ibx} // $self->{eidx})->git->qx( - [qw(patch-id --stable)], - {}, { 0 => $fh, 2 => $eh }); - $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1); - seek($eh, 0, SEEK_SET) or die "seek: $!"; - while (<$eh>) { warn $_ } - } - - # split off quoted and unquoted blocks: - my @sections = PublicInbox::MsgIter::split_quotes($s); - undef $s; # free memory - for my $txt (@sections) { - if ($txt =~ /\A>/) { - index_text($self, $txt, 0, 'XQUOT'); - } else { - # does it look like a diff? - if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - index_diff($self, \$txt, $doc); - } else { - index_text($self, $txt, 1, 'XNQ'); - } - } - undef $txt; # free memory - } + index_body_text($self, $doc, \$s); } sub index_list_id ($$$) { @@ -407,6 +425,7 @@ sub index_list_id ($$$) { for my $l ($hdr->header_raw('List-Id')) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; + $lid =~ tr/\n\t\r\0//d; # same rules as Message-ID $doc->add_boolean_term('G' . $lid); index_phrase($self, $lid, 1, 'XL'); # probabilistic } @@ -442,8 +461,7 @@ sub eml2doc ($$$;$) { add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); - my $tg = term_generator($self); - $tg->set_document($doc); + term_generator($self)->set_document($doc); index_headers($self, $smsg); if (defined(my $eidx_key = $smsg->{eidx_key})) { @@ -540,9 +558,7 @@ sub add_message { sub _get_doc ($$) { my ($self, $docid) = @_; - my $doc = eval { $self->{xdb}->get_document($docid) }; - $doc // do { - warn "E: $@\n" if $@; + $self->get_doc($docid) // do { warn "E: #$docid missing in Xapian\n"; undef; } @@ -600,17 +616,16 @@ sub set_vmd { my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; - my ($end, @rm, @add); + my ($v, @rm, @add); my @x = @VMD_MAP; + my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end); while (my ($field, $pfx) = splice(@x, 0, 2)) { my $set = $vmd->{$field} // next; my %keep = map { $_ => 1 } @$set; my %add = %keep; - $end //= $doc->termlist_end; - for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $v = $cur->get_termname; + $cur->skip_to($pfx); # works due to @VMD_MAP order + for (; $cur != $end; $cur++) { + $v = $cur->get_termname; $v =~ s/\A$pfx//s or next; $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v); } @@ -690,7 +705,7 @@ sub xdb_remove { my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; for my $docid (@docids) { eval { $xdb->delete_document($docid) }; - warn "E: #$docid not in in Xapian? $@\n" if $@; + warn "E: #$docid not in Xapian? $@\n" if $@; } } @@ -707,7 +722,6 @@ sub nr_quiet_rm { delete($_[0]->{-quiet_rm}) // 0 } sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; - my $len = length($objid); for (my $len = length($objid); $len >= 7; ) { $doc->add_term($pfx.$objid); $objid = substr($objid, 0, --$len); @@ -801,7 +815,8 @@ sub unindex_both { # git->cat_async callback sub with_umask { my $self = shift; - ($self->{ibx} // $self->{eidx})->with_umask(@_); + my $owner = $self->{ibx} // $self->{eidx}; + $owner ? $owner->with_umask(@_) : $self->SUPER::with_umask(@_) } # called by public-inbox-index @@ -819,10 +834,10 @@ sub index_sync { } sub check_size { # check_async cb for -index --max-size=... - my ($oid, $type, $size, $arg, $git) = @_; - (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; + my (undef, $oid, $type, $size, $arg) = @_; + ($type // '') eq 'blob' or die "E: bad $oid in $arg->{git}->{git_dir}"; if ($size <= $arg->{max_size}) { - $git->cat_async($oid, $arg->{index_oid}, $arg); + $arg->{git}->cat_async($oid, $arg->{index_oid}, $arg); } else { warn "W: skipping $oid ($size > $arg->{max_size})\n"; } @@ -904,6 +919,7 @@ sub process_stack { $arg->{autime} = $at; $arg->{cotime} = $ct; if ($sync->{max_size}) { + $arg->{git} = $git; $git->check_async($oid, \&check_size, $arg); } else { $git->cat_async($oid, \&index_both, $arg); @@ -964,7 +980,7 @@ sub log2stack ($$$) { $stk->push_rec('m', $at, $ct, $oid, $cmt); } } - close $fh or die "git log failed: \$?=$?"; + $fh->close or die "git log failed: \$?=$?"; $stk //= PublicInbox::IdxStack->new; $stk->read_prepare; } @@ -987,11 +1003,8 @@ sub prepare_stack ($$) { sub is_ancestor ($$$) { my ($git, $cur, $tip) = @_; return 0 unless $git->check($cur); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(merge-base --is-ancestor), $cur, $tip ]; - my $pid = spawn($cmd); - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0; + my $cmd = $git->cmd(qw(merge-base --is-ancestor), $cur, $tip); + run_wait($cmd) == 0; } sub need_update ($$$$) { @@ -1052,7 +1065,11 @@ sub _index_sync { my $ibx = $self->{ibx}; local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $ibx->git->batch_prepare; + + if ($X->{CLOEXEC_UNSET}) { + $ibx->git->cat_file($tip); + $ibx->git->check($tip); + } my $pr = $opt->{-progress}; my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; my $quit = quit_cb($sync); @@ -1088,8 +1105,10 @@ sub DESTROY { $_[0]->{lockfh} = undef; } -sub _begin_txn { +sub begin_txn_lazy { my ($self) = @_; + return if $self->{txn}; + my $restore = $self->with_umask; my $xdb = $self->{xdb} || idx_acquire($self); $self->{oidx}->begin_lazy if $self->{oidx}; $xdb->begin_transaction if $xdb; @@ -1097,13 +1116,8 @@ sub _begin_txn { $xdb; } -sub begin_txn_lazy { - my ($self) = @_; - $self->with_umask(\&_begin_txn, $self) if !$self->{txn}; -} - # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) -# This metadata is read by Admin::detect_indexlevel: +# This metadata is read by InboxWritable->detect_indexlevel: sub set_metadata_once { my ($self) = @_; @@ -1125,8 +1139,10 @@ sub set_metadata_once { } } -sub _commit_txn { +sub commit_txn_lazy { my ($self) = @_; + return unless delete($self->{txn}); + my $restore = $self->with_umask; if (my $eidx = $self->{eidx}) { $eidx->git->async_wait_all; $eidx->{transact_bytes} = 0; @@ -1138,12 +1154,6 @@ sub _commit_txn { $self->{oidx}->commit_lazy if $self->{oidx}; } -sub commit_txn_lazy { - my ($self) = @_; - delete($self->{txn}) and - $self->with_umask(\&_commit_txn, $self); -} - sub eidx_shard_new { my ($class, $eidx, $shard) = @_; my $self = bless { |