about summary refs log tree commit homepage
path: root/lib/PublicInbox/LeiP2q.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/LeiP2q.pm')
-rw-r--r--lib/PublicInbox/LeiP2q.pm198
1 files changed, 198 insertions, 0 deletions
diff --git a/lib/PublicInbox/LeiP2q.pm b/lib/PublicInbox/LeiP2q.pm
new file mode 100644
index 00000000..68faa016
--- /dev/null
+++ b/lib/PublicInbox/LeiP2q.pm
@@ -0,0 +1,198 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# front-end for the "lei patch-to-query" sub-command
+package PublicInbox::LeiP2q;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC PublicInbox::LeiInput);
+use PublicInbox::Eml;
+use PublicInbox::Smsg;
+use PublicInbox::MsgIter qw(msg_part_text);
+use PublicInbox::Git qw(git_unquote);
+use PublicInbox::OnDestroy;
+use URI::Escape qw(uri_escape_utf8);
+my $FN = qr!((?:"?[^/\n]+/[^\r\n]+)|/dev/null)!;
+
+sub xphrase ($) {
+        my ($s) = @_;
+        return () unless $s =~ /\S/;
+        # cf. xapian-core/queryparser/queryparser.lemony
+        # [\./:\\\@] - is_phrase_generator (implicit phrase search)
+        # FIXME not really sure about these..., we basically want to
+        # extract the longest phrase possible that Xapian can handle
+        map {
+                s/\A\s*//;
+                s/\s+\z//;
+                m![^\./:\\\@\-\w]! ? qq("$_") : $_ ;
+        } ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g);
+}
+
+sub add_qterm ($$@) {
+        my ($self, $p, @v) = @_;
+        for (@v) {
+                $self->{qseen}->{"$p\0$_"} //=
+                        push(@{$self->{qterms}->{$p}}, $_);
+        }
+}
+
+sub extract_terms { # eml->each_part callback
+        my ($p, $self) = @_;
+        my $part = $p->[0]; # ignore $depth and @idx;
+        my $ct = $part->content_type || 'text/plain';
+        my ($s, undef) = msg_part_text($part, $ct);
+        defined $s or return;
+        my $in_diff;
+        # TODO: b: nq: q:
+        for (split(/\n/, $s)) {
+                if ($in_diff && s/^ //) { # diff context
+                        add_qterm($self, 'dfctx', xphrase($_));
+                } elsif (/^-- $/) { # email signature begins
+                        $in_diff = undef;
+                } elsif (m!^diff --git $FN $FN!) {
+                        # wait until "---" and "+++" to capture filenames
+                        $in_diff = 1;
+                } elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) {
+                        my ($oa, $ob) = ($1, $2);
+                        add_qterm($self, 'dfpre', $oa);
+                        add_qterm($self, 'dfpost', $ob);
+                        # who uses dfblob?
+                } elsif (m!^(?:---|\+{3}) ($FN)!) {
+                        next if $1 eq '/dev/null';
+                        my $fn = (split(m!/!, git_unquote($1.''), 2))[1];
+                        add_qterm($self, 'dfn', xphrase($fn));
+                } elsif ($in_diff && s/^\+//) { # diff added
+                        add_qterm($self, 'dfb', xphrase($_));
+                } elsif ($in_diff && s/^-//) { # diff removed
+                        add_qterm($self, 'dfa', xphrase($_));
+                } elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) {
+                        # traditional diff w/o -p
+                } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) {
+                        add_qterm($self, 'dfhh', xphrase($1));
+                } elsif (/^(?:dis)similarity index/ ||
+                                /^(?:old|new) mode/ ||
+                                /^(?:deleted|new) file mode/ ||
+                                /^(?:copy|rename) (?:from|to) / ||
+                                /^(?:dis)?similarity index / ||
+                                /^\\ No newline at end of file/ ||
+                                /^Binary files .* differ/) {
+                } elsif ($_ eq '') {
+                        # possible to be in diff context, some mail may be
+                        # stripped by MUA or even GNU diff(1).  "git apply"
+                        # treats a bare "\n" as diff context, too
+                } else {
+                        $in_diff = undef;
+                }
+        }
+}
+
+my %pfx2smsg = (
+        t => [ qw(to) ],
+        c => [ qw(cc) ],
+        f => [ qw(from) ],
+        tc => [ qw(to cc) ],
+        tcf => [ qw(to cc from) ],
+        a => [ qw(to cc from) ],
+        s => [ qw(subject) ],
+        bs => [ qw(subject) ], # body handled elsewhere
+        d => [ qw(ds) ], # nonsense?
+        dt => [ qw(ds) ], # ditto...
+        rt => [ qw(ts) ], # ditto...
+);
+
+sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh
+        my ($self, $eml) = @_;
+        my $diff_want = $self->{diff_want} // do {
+                my $want = $self->{lei}->{opt}->{want} // [ qw(dfpost7) ];
+                my @want = split(/[, ]+/, "@$want");
+                for (@want) {
+                        /\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|weeks)s?)?\z/
+                                or next;
+                        my ($pfx, $n, $unit) = ($1, $2, $3);
+                        $n *= 86400 * ($unit =~ /week/i ? 7 : 1);
+                        $_ = [ $pfx, $n ];
+                }
+                $self->{want_order} = \@want;
+                $self->{diff_want} = +{ map { $_ => 1 } @want };
+        };
+        my $smsg = bless {}, 'PublicInbox::Smsg';
+        $smsg->populate($eml);
+        while (my ($pfx, $fields) = each %pfx2smsg) {
+                next unless $diff_want->{$pfx};
+                for my $f (@$fields) {
+                        my $v = $smsg->{$f} // next;
+                        add_qterm($self, $pfx, xphrase($v));
+                }
+        }
+        $eml->each_part(\&extract_terms, $self, 1);
+}
+
+sub emit_query {
+        my ($self) = @_;
+        my $lei = $self->{lei};
+        if ($lei->{opt}->{debug}) {
+                my $json = ref(PublicInbox::Config->json)->new;
+                $json->utf8->canonical->pretty;
+                print { $lei->{2} } $json->encode($self->{qterms});
+        }
+        my (@q, %seen);
+        for my $pfx (@{$self->{want_order}}) {
+                if (ref($pfx) eq 'ARRAY') {
+                        my ($p, $t_range) = @$pfx; # TODO
+
+                } elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! ||
+                                $pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) {
+                        push @q, $pfx;
+                } else {
+                        my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : '';
+                        my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : '';
+                        my $x = delete($self->{qterms}->{$pfx}) or next;
+                        my $star = $end =~ tr/*//d ? '*' : '';
+                        my $min_len = ($end || 0) + 0;
+
+                        # no wildcards for bool_pfx_external
+                        $star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/;
+                        $pfx = "$plusminus$pfx:";
+                        if ($min_len) {
+                                push @q, map {
+                                        my @t = ($pfx.$_.$star);
+                                        while (length > $min_len) {
+                                                chop $_;
+                                                push @t, 'OR', $pfx.$_.$star;
+                                        }
+                                        @t;
+                                } @$x;
+                        } else {
+                                push @q, map {
+                                        my $k = $pfx.$_.$star;
+                                        $seen{$k}++ ? () : $k
+                                } @$x;
+                        }
+                }
+        }
+        if ($lei->{opt}->{uri}) {
+                @q = (join('+', map { uri_escape_utf8($_) } @q));
+        } else {
+                @q = (join(' ', @q));
+        }
+        $lei->out(@q, "\n");
+}
+
+sub lei_p2q { # the "lei patch-to-query" entry point
+        my ($lei, @inputs) = @_;
+        $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin};
+        my $self = bless { missing_ok => 1 }, __PACKAGE__;
+        $self->prepare_inputs($lei, \@inputs) or return;
+        $lei->wq1_start($self);
+}
+
+sub ipc_atfork_child {
+        my ($self) = @_;
+        PublicInbox::LeiInput::input_only_atfork_child($self);
+        on_destroy \&emit_query, $self;
+}
+
+no warnings 'once';
+*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done;
+
+1;