# Copyright (C) 2021 all contributors
# License: AGPL-3.0+

# front-end for the "lei patch-to-query" sub-command
package PublicInbox::LeiP2q;
use strict;
use v5.10.1;
use parent qw(PublicInbox::IPC);
use PublicInbox::Eml;
use PublicInbox::Smsg;
use PublicInbox::MsgIter qw(msg_part_text);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::Spawn qw(popen_rd);
use URI::Escape qw(uri_escape_utf8);

# A filename as it appears in diff headers: either "<prefix>/<rest>"
# (optionally starting with a double-quote) or the literal "/dev/null".
my $FN = qr!((?:"?[^/\n]+/[^\r\n]+)|/dev/null)!;

# Split $s into a list of search terms.  Each run of word characters,
# whitespace and a few punctuation characters becomes one term; a term
# is wrapped in double-quotes when it contains anything other than
# word characters and [./:\@-].  Returns an empty list when $s has no
# non-whitespace characters.
sub xphrase ($) {
	my ($s) = @_;
	return () unless $s =~ /\S/;
	# cf. xapian-core/queryparser/queryparser.lemony
	# [\./:\\\@] - is_phrase_generator (implicit phrase search)
	# FIXME not really sure about these..., we basically want to
	# extract the longest phrase possible that Xapian can handle
	map {
		s/\A\s*//;
		s/\s+\z//;
		m![^\./:\\\@\-\w]! ? qq("$_") : $_ ;
	} ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g);
}

# eml->each_part callback
#
# $p   - array ref; only the first element (the MIME part) is used,
#        $depth and @idx are ignored
# $lei - terms are appended to the array refs in $lei->{qterms}:
#        dfctx (context lines), dfpre/dfpost (blob IDs from "index"
#        lines), dfn (filenames), dfb (added lines), dfa (removed
#        lines) and dfhh (text following the hunk header)
#
# Parts for which msg_part_text yields no text are skipped.
sub extract_terms {
	my ($p, $lei) = @_;
	my $part = $p->[0]; # ignore $depth and @idx;
	my $ct = $part->content_type || 'text/plain';
	my ($s, undef) = msg_part_text($part, $ct);
	defined $s or return;
	my $in_diff;
	# TODO: b: nq: q:
	for (split(/\n/, $s)) {
		if ($in_diff && s/^ //) { # diff context
			push @{$lei->{qterms}->{dfctx}}, xphrase($_);
		} elsif (/^-- $/) { # email signature begins
			$in_diff = undef;
		} elsif (m!^diff --git $FN $FN!) {
			# wait until "---" and "+++" to capture filenames
			$in_diff = 1;
		} elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) {
			my ($oa, $ob) = ($1, $2);
			push @{$lei->{qterms}->{dfpre}}, $oa;
			push @{$lei->{qterms}->{dfpost}}, $ob;
			# who uses dfblob?
		} elsif (m!^(?:---|\+{3}) ($FN)!) {
			next if $1 eq '/dev/null';
			# drop everything up to and including the first "/"
			my $fn = (split(m!/!, git_unquote($1.''), 2))[1];
			push @{$lei->{qterms}->{dfn}}, xphrase($fn);
		} elsif ($in_diff && s/^\+//) { # diff added
			push @{$lei->{qterms}->{dfb}}, xphrase($_);
		} elsif ($in_diff && s/^-//) { # diff removed
			push @{$lei->{qterms}->{dfa}}, xphrase($_);
		} elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) {
			# traditional diff w/o -p
		} elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) {
			push @{$lei->{qterms}->{dfhh}}, xphrase($1);
		} elsif (/^(?:dis)similarity index/ ||
				/^(?:old|new) mode/ ||
				/^(?:deleted|new) file mode/ ||
				/^(?:copy|rename) (?:from|to) / ||
				/^(?:dis)?similarity index / ||
				/^\\ No newline at end of file/ ||
				/^Binary files .* differ/) {
			# extended diff header lines: nothing to extract, but
			# they must not end the diff.  (The first test largely
			# duplicates the fifth; it is kept so that matching
			# stays exactly as before.)
		} elsif ($_ eq '') {
			# possible to be in diff context, some mail may be
			# stripped by MUA or even GNU diff(1).  "git apply"
			# treats a bare "\n" as diff context, too
		} else {
			$in_diff = undef;
		}
	}
}

# search prefix => PublicInbox::Smsg fields which supply its terms
my %pfx2smsg = (
	t => [ qw(to) ],
	c => [ qw(cc) ],
	f => [ qw(from) ],
	tc => [ qw(to cc) ],
	tcf => [ qw(to cc from) ],
	a => [ qw(to cc from) ],
	s => [ qw(subject) ],
	bs => [ qw(subject) ], # body handled elsewhere
	d => [ qw(ds) ], # nonsense?
	dt => [ qw(ds) ], # ditto...
	rt => [ qw(ts) ], # ditto...
);

# via wq_do
#
# Reads one message or patch and writes a single query line via
# $lei->out.  Input is $self->{0} when set; otherwise $self->{input}
# is resolved through $lei->path_to_fd, opened as a file if it exists,
# or else handed to "git format-patch --stdout -1".
#
# The prefixes to emit come from $lei->{opt}->{want} (default:
# "dfpost7"), split on commas and spaces.  Each token may be:
#   - a bare operator (OR, XOR, AND, NOT, ADJ, NEAR, ADJ/N, NEAR/N),
#     copied into the query as-is
#   - a prefix with an optional leading "+" or "-" and an optional
#     trailing number and/or "*": the number is the minimum length
#     down to which each term is repeatedly shortened (alternatives
#     joined with "OR"), "*" appends a wildcard to each term
#   - a number with optional "d:", "dt:" or "rt:" prefix and optional
#     ".day(s)"/".week(s)" suffix; parsed but not yet emitted (TODO)
#
# With $lei->{opt}->{uri}, terms are URI-escaped and joined with "+"
# instead of spaces.
sub do_p2q {
	my ($self) = @_;
	my $lei = $self->{lei};
	my $want = $lei->{opt}->{want} // [ qw(dfpost7) ];
	my @want = split(/[, ]+/, "@$want");
	for (@want) {
		/\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|week)s?)?\z/ or next;
		my ($pfx, $n, $unit) = ($1, $2, $3);
		# $unit is undef when no suffix was given: count in days
		$n *= 86400 * (($unit // '') =~ /week/i ? 7 : 1);
		$_ = [ $pfx, $n ];
	}
	my $smsg = bless {}, 'PublicInbox::Smsg';
	my $in = $self->{0};
	my @cmd;
	unless ($in) {
		my $input = $self->{input};
		# NOTE(review): presumably path_to_fd reports its own
		# error before returning undef -- confirm
		my $devfd = $lei->path_to_fd($input) // return;
		if ($devfd >= 0) {
			$in = $lei->{$devfd};
		} elsif (-e $input) {
			open($in, '<', $input) or
				return $lei->fail("open < $input: $!");
		} else {
			@cmd = (qw(git format-patch --stdout -1), $input);
			$in = popen_rd(\@cmd, undef, { 2 => $lei->{2} });
		}
	};
	my $str = do { local $/; <$in> };
	# only the git pipe is closed (and its exit status checked) here
	@cmd && !close($in) and return $lei->fail("E: @cmd failed: $?");
	my $eml = PublicInbox::Eml->new(\$str);
	# NOTE(review): keys are the raw --want tokens, so the header
	# prefixes below are only selected when given bare (e.g. "s");
	# "s*" or "+s" never populate qterms->{s} -- confirm intent
	$lei->{diff_want} = +{ map { $_ => 1 } @want };
	$smsg->populate($eml);
	for my $pfx (keys %pfx2smsg) {
		next unless $lei->{diff_want}->{$pfx};
		for my $f (@{$pfx2smsg{$pfx}}) {
			my $v = $smsg->{$f} // next;
			push @{$lei->{qterms}->{$pfx}}, xphrase($v);
		}
	}
	$eml->each_part(\&extract_terms, $lei, 1);
	if ($lei->{opt}->{debug}) {
		# this file does not otherwise load PublicInbox::Config
		require PublicInbox::Config;
		my $json = ref(PublicInbox::Config->json)->new;
		$json->utf8->canonical->pretty;
		print { $lei->{2} } $json->encode($lei->{qterms});
	}
	my (@q, %seen);
	for my $pfx (@want) {
		if (ref($pfx) eq 'ARRAY') {
			# TODO: [ $prefix, $seconds ] time ranges are parsed
			# above but not emitted into the query, yet
		} elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! ||
				$pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) {
			push @q, $pfx;
		} else {
			my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : '';
			my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : '';
			my $x = delete($lei->{qterms}->{$pfx}) or next;
			my $star = $end =~ tr/*//d ? '*' : '';
			my $min_len = ($end || 0) + 0;

			# no wildcards for bool_pfx_external
			$star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/;
			$pfx = "$plusminus$pfx:";
			if ($min_len) {
				# emit each term along with every shorter
				# leading substring down to $min_len chars;
				# work on a copy so @$x is left untouched
				push @q, map {
					my $t = $_;
					my @alt = ($pfx.$t.$star);
					while (length($t) > $min_len) {
						chop $t;
						push @alt, 'OR', $pfx.$t.$star;
					}
					@alt;
				} @$x;
			} else {
				# %seen drops repeated terms
				push @q, map {
					my $k = $pfx.$_.$star;
					$seen{$k}++ ? () : $k
				} @$x;
			}
		}
	}
	if ($lei->{opt}->{uri}) {
		@q = (join('+', map { uri_escape_utf8($_) } @q));
	} else {
		@q = (join(' ', @q));
	}
	$lei->out(@q, "\n");
}

# the "lei patch-to-query" entry point
#
# Builds the worker object, hands it either stdin (when
# $lei->{opt}->{stdin} is set) or the $input argument, then queues
# do_p2q on a single worker and waits for it to finish.
sub lei_p2q {
	my ($lei, $input) = @_;
	my $self = bless {}, __PACKAGE__;
	if ($lei->{opt}->{stdin}) {
		$self->{0} = delete $lei->{0}; # guard from _lei_atfork_child
	} else {
		$self->{input} = $input;
	}
	my ($op_c, $ops) = $lei->workers_start($self, 1);
	$lei->{wq1} = $self;
	$self->wq_io_do('do_p2q', []);
	$self->wq_close;
	$lei->wait_wq_events($op_c, $ops);
}

# Post-fork hook: runs $lei's own _lei_atfork_child before the
# parent class's ipc_atfork_child.
sub ipc_atfork_child {
	my ($self) = @_;
	$self->{lei}->_lei_atfork_child;
	$self->SUPER::ipc_atfork_child;
}

1;