about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2014-04-11 22:17:07 +0000
committer Eric Wong <e@80x24.org> 2014-04-11 22:19:19 +0000
commit 546ff31b4c11d74f63aa4501659224dbf864c21c (patch)
tree 66fe3705b36013e594dfde7819e2a5cf848addc1 /lib
parent 6dd243c4706d6df9d95fb98f6bdcf4a0cd844bcd (diff)
download public-inbox-546ff31b4c11d74f63aa4501659224dbf864c21c.tar.gz
We may occasionally encounter horrid HTML which lynx cannot
handle, so improve error reporting.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Filter.pm 29
1 files changed, 15 insertions, 14 deletions
diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm
index 4a348217..e5a8fafe 100644
--- a/lib/PublicInbox/Filter.pm
+++ b/lib/PublicInbox/Filter.pm
@@ -10,7 +10,7 @@ use warnings;
 use Email::MIME;
 use Email::MIME::ContentType qw/parse_content_type/;
 use Email::Filter;
-use IPC::Open2;
+use IPC::Run;
 our $VERSION = '0.0.1';
 
 # start with the same defaults as mailman
@@ -41,7 +41,7 @@ sub run {
                 # HTML-only, non-multipart
                 my $body = $simple->body;
                 my $ct_parsed = parse_content_type($content_type);
-                dump_html($body, $ct_parsed->{attributes}->{charset});
+                dump_html(\$body, $ct_parsed->{attributes}->{charset});
                 replace_body($simple, $body);
                 return 1;
         } elsif ($content_type =~ m!\bmultipart/!i) {
@@ -80,28 +80,29 @@ sub html_part_to_text {
         my ($simple, $part) = @_;
         my $body = $part->body;
         my $ct_parsed = parse_content_type($part->content_type);
-        dump_html($body, $ct_parsed->{attributes}->{charset});
+        dump_html(\$body, $ct_parsed->{attributes}->{charset});
         replace_part($simple, $part, $body, 'text/plain');
 }
 
 # modifies $_[0] in place
 sub dump_html {
-        my $charset = $_[1] || 'US-ASCII';
-        my $cmd = "lynx -stdin -dump";
+        my ($body, $charset) = @_;
+        $charset ||= 'US-ASCII';
+        my @cmd = qw(lynx -stdin -stderr -dump);
+        my $out = "";
+        my $err = "";
 
         # be careful about remote command injection!
         if ($charset =~ /\A[A-Za-z0-9\-]+\z/) {
-                $cmd .= " -assume_charset=$charset";
+                push @cmd, "-assume_charset=$charset";
         }
-
-        my $pid = open2(my $out, my $in, $cmd);
-        print $in $_[0];
-        close $in;
-        {
-                local $/;
-                $_[0] = <$out>;
+        if (IPC::Run::run(\@cmd, $body, \$out, \$err)) {
+                $$body = $out;
+        } else {
+                # give them an ugly version:
+                $$body = "public-inbox HTML conversion failed: $err\n" .
+                         $$body . "\n";
         }
-        waitpid($pid, 0);
 }
 
 # this is to correct user errors and not expected to cover all corner cases