about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/PublicInbox/LEI.pm30
-rw-r--r--lib/PublicInbox/LeiP2q.pm197
2 files changed, 226 insertions, 1 deletions
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 0da24499..a2f8ffe7 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -179,6 +179,9 @@ our %CMD = ( # sorted in order of importance/use:
         qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s quiet|q
         lock=s@ kw|keywords|flags! C=s@),
         ],
+'p2q' => [ 'FILE|COMMIT_OID|--stdin',
+        "use a patch to generate a query for `lei q --stdin'",
+        qw(stdin| want|w=s@ uri debug) ],
 'config' => [ '[...]', sub {
                 'git-config(1) wrapper for '._config_path($_[0]);
         }, qw(config-file|system|global|file|f=s), # for conflict detection
@@ -238,6 +241,10 @@ my %OPTDESC = (
 'show        threads|t' => 'display entire thread a message belongs to',
 'q        threads|t+' =>
         'return all messages in the same threads as the actual match(es)',
+
+'want|w=s@' => [ 'PREFIX|dfpost|dfn', # common ones in help...
+                'search prefixes to extract (default: dfpost7)' ],
+
 'alert=s@' => ['CMD,:WINCH,:bell,<any command>',
         'run command(s) or perform ops when done writing to output ' .
         '(default: ":WINCH,:bell" with --mua and Maildir/IMAP output, ' .
@@ -331,7 +338,7 @@ my %CONFIG_KEYS = (
         'leistore.dir' => 'top-level storage location',
 );
 
-my @WQ_KEYS = qw(lxs l2m imp mrr cnv); # internal workers
+my @WQ_KEYS = qw(lxs l2m imp mrr cnv p2q); # internal workers
 
 # pronounced "exit": x_it(1 << 8) => exit(1); x_it(13) => SIGPIPE
 sub x_it ($$) {
@@ -673,6 +680,11 @@ sub lei_convert {
         PublicInbox::LeiConvert->call(@_);
 }
 
+sub lei_p2q {
+        require PublicInbox::LeiP2q;
+        PublicInbox::LeiP2q->call(@_);
+}
+
 sub lei_init {
         my ($self, $dir) = @_;
         my $cfg = _lei_cfg($self, 1);
@@ -854,6 +866,22 @@ sub poke_mua { # forces terminal MUAs to wake up and hopefully notice new mail
         }
 }
 
+my %path_to_fd = ('/dev/stdin' => 0, '/dev/stdout' => 1, '/dev/stderr' => 2);
+$path_to_fd{"/dev/fd/$_"} = $path_to_fd{"/proc/self/fd/$_"} for (0..2);
+sub fopen {
+        my ($self, $mode, $path) = @_;
+        rel2abs($self, $path);
+        $path =~ tr!/!/!s;
+        if (defined(my $fd = $path_to_fd{$path})) {
+                return $self->{$fd};
+        }
+        if ($path =~ m!\A/(?:dev|proc/self)/fd/[0-9]+\z!) {
+                return fail($self, "cannot open $path from daemon");
+        }
+        open my $fh, $mode, $path or return;
+        $fh;
+}
+
 # caller needs to "-t $self->{1}" to check if tty
 sub start_pager {
         my ($self) = @_;
diff --git a/lib/PublicInbox/LeiP2q.pm b/lib/PublicInbox/LeiP2q.pm
new file mode 100644
index 00000000..d1dd125e
--- /dev/null
+++ b/lib/PublicInbox/LeiP2q.pm
@@ -0,0 +1,197 @@
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# front-end for the "lei patch-to-query" sub-command
+package PublicInbox::LeiP2q;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC);
+use PublicInbox::Eml;
+use PublicInbox::Smsg;
+use PublicInbox::MsgIter qw(msg_part_text);
+use PublicInbox::Git qw(git_unquote);
+use PublicInbox::Spawn qw(popen_rd);
+use URI::Escape qw(uri_escape_utf8);
+
+sub xphrase ($) {
+        my ($s) = @_;
+        return () unless $s =~ /\S/;
+        # cf. xapian-core/queryparser/queryparser.lemony
+        # [\./:\\\@] - is_phrase_generator (implicit phrase search)
+        # FIXME not really sure about these..., we basically want to
+        # extract the longest phrase possible that Xapian can handle
+        map {
+                s/\A\s*//;
+                s/\s+\z//;
+                /[\|=><,\sA-Z]/ && !m![\./:\\\@]! ? qq("$_") : $_;
+        } ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g);
+}
+
+sub extract_terms { # eml->each_part callback
+        my ($p, $lei) = @_;
+        my $part = $p->[0]; # ignore $depth and @idx;
+        my $ct = $part->content_type || 'text/plain';
+        my ($s, undef) = msg_part_text($part, $ct);
+        defined $s or return;
+        my $in_diff;
+        # TODO: b: nq: q:
+        for (split(/\n/, $s)) {
+                if ($in_diff && s/^ //) { # diff context
+                        push @{$lei->{qterms}->{dfctx}}, xphrase($_);
+                } elsif (/^-- $/) { # email signature begins
+                        $in_diff = undef;
+                } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
+                        # wait until "---" and "+++" to capture filenames
+                        $in_diff = 1;
+                } elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) {
+                        my ($oa, $ob) = ($1, $2);
+                        push @{$lei->{qterms}->{dfpre}}, $oa;
+                        push @{$lei->{qterms}->{dfpost}}, $ob;
+                        # who uses dfblob?
+                } elsif (m!^(?:---|\+{3}) ("?[^/]+/.+)!) {
+                        my $fn = (split(m!/!, git_unquote($1.''), 2))[1];
+                        push @{$lei->{qterms}->{dfn}}, xphrase($fn);
+                } elsif ($in_diff && s/^\+//) { # diff added
+                        push @{$lei->{qterms}->{dfb}}, xphrase($_);
+                } elsif ($in_diff && s/^-//) { # diff removed
+                        push @{$lei->{qterms}->{dfa}}, xphrase($_);
+                } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) {
+                        push @{$lei->{qterms}->{dfhh}}, xphrase($1);
+                } elsif (/^(?:dis)similarity index/ ||
+                                /^(?:old|new) mode/ ||
+                                /^(?:deleted|new) file mode/ ||
+                                /^(?:copy|rename) (?:from|to) / ||
+                                /^(?:dis)?similarity index / ||
+                                /^\\ No newline at end of file/ ||
+                                /^Binary files .* differ/) {
+                } elsif ($_ eq '') {
+                        # possible to be in diff context, some mail may be
+                        # stripped by MUA or even GNU diff(1).  "git apply"
+                        # treats a bare "\n" as diff context, too
+                } else {
+                        $in_diff = undef;
+                }
+        }
+}
+
+my %pfx2smsg = (
+        t => [ qw(to) ],
+        c => [ qw(cc) ],
+        f => [ qw(from) ],
+        tc => [ qw(to cc) ],
+        tcf => [ qw(to cc from) ],
+        a => [ qw(to cc from) ],
+        s => [ qw(subject) ],
+        bs => [ qw(subject) ], # body handled elsewhere
+        d => [ qw(ds) ], # nonsense?
+        dt => [ qw(ds) ], # ditto...
+        rt => [ qw(ts) ], # ditto...
+);
+
+sub do_p2q { # via wq_do
+        my ($self) = @_;
+        my $lei = $self->{lei};
+        my $want = $lei->{opt}->{want} // [ qw(dfpost7) ];
+        my @want = split(/[, ]+/, "@$want");
+        for (@want) {
+                /\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|weeks)s?)?\z/ or next;
+                my ($pfx, $n, $unit) = ($1, $2, $3);
+                $n *= 86400 * ($unit =~ /week/i ? 7 : 1);
+                $_ = [ $pfx, $n ];
+        }
+        my $smsg = bless {}, 'PublicInbox::Smsg';
+        my $in = $self->{0};
+        unless ($in) {
+                my $input = $self->{input};
+                if (-e $input) {
+                        $in = $lei->fopen('<', $input) or
+                                return $lei->fail("open < $input: $!");
+                } else {
+                        my @cmd = (qw(git format-patch --stdout -1), $input);
+                        $in = popen_rd(\@cmd, undef, { 2 => $lei->{2} });
+                }
+        };
+        my $eml = PublicInbox::Eml->new(\(do { local $/; <$in> }));
+        $lei->{diff_want} = +{ map { $_ => 1 } @want };
+        $smsg->populate($eml);
+        while (my ($pfx, $fields) = each %pfx2smsg) {
+                next unless $lei->{diff_want}->{$pfx};
+                for my $f (@$fields) {
+                        my $v = $smsg->{$f} // next;
+                        push @{$lei->{qterms}->{$pfx}}, xphrase($v);
+                }
+        }
+        $eml->each_part(\&extract_terms, $lei, 1);
+        if ($lei->{opt}->{debug}) {
+                my $json = ref(PublicInbox::Config->json)->new;
+                $json->utf8->canonical->pretty;
+                $lei->err($json->encode($lei->{qterms}));
+        }
+        my (@q, %seen);
+        for my $pfx (@want) {
+                if (ref($pfx) eq 'ARRAY') {
+                        my ($p, $t_range) = @$pfx; # TODO
+
+                } elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! ||
+                                $pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) {
+                        push @q, $pfx;
+                } else {
+                        my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : '';
+                        my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : '';
+                        my $x = delete($lei->{qterms}->{$pfx}) or next;
+                        my $star = $end =~ tr/*//d ? '*' : '';
+                        my $min_len = ($end // 0) + 0;
+
+                        # no wildcards for bool_pfx_external
+                        $star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/;
+                        $pfx = "$plusminus$pfx:";
+                        if ($min_len) {
+                                push @q, map {
+                                        my @t = ($pfx.$_.$star);
+                                        while (length > $min_len) {
+                                                chop $_;
+                                                push @t, 'OR', $pfx.$_.$star;
+                                        }
+                                        @t;
+                                } @$x;
+                        } else {
+                                push @q, map {
+                                        my $k = $pfx.$_.$star;
+                                        $seen{$k}++ ? () : $k
+                                } @$x;
+                        }
+                }
+        }
+        if ($lei->{opt}->{uri}) {
+                @q = (join('+', map { uri_escape_utf8($_) } @q));
+        } else {
+                @q = (join(' ', @q));
+        }
+        $lei->out(@q, "\n");
+}
+
+sub call { # the "lei patch-to-query" entry point
+        my ($cls, $lei, $input) = @_;
+        my $self = $lei->{p2q} = bless {}, $cls;
+        if ($lei->{opt}->{stdin}) {
+                $self->{0} = delete $lei->{0}; # guard from lei_atfork_child
+        } else {
+                $self->{input} = $input;
+        }
+        my $op = $lei->workers_start($self, 'lei patch2query', 1, {
+                '' => [ $lei->{p2q_done} // $lei->can('dclose'), $lei ]
+        });
+        $self->wq_io_do('do_p2q', []);
+        $self->wq_close(1);
+        while ($op && $op->{sock}) { $op->event_step }
+}
+
+sub ipc_atfork_child {
+        my ($self) = @_;
+        my $lei = $self->{lei};
+        $lei->lei_atfork_child;
+        $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb();
+        $self->SUPER::ipc_atfork_child;
+}
+
+1;