From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E7B6A2142D for ; Mon, 21 Jan 2019 20:52:59 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 28/37] t/check-www-inbox: use xmlstarlet to validate Atom if available Date: Mon, 21 Jan 2019 20:52:44 +0000 Message-Id: <20190121205253.10455-29-e@80x24.org> In-Reply-To: <20190121205253.10455-1-e@80x24.org> References: <20190121205253.10455-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: I almost forgot about this script; but remembering to test it against real-world data can be useful to hunt for bugs. --- t/check-www-inbox.perl | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl index 08e6247..7dd1eeb 100644 --- a/t/check-www-inbox.perl +++ b/t/check-www-inbox.perl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2016-2018 all contributors +# Copyright (C) 2016-2019 all contributors # License: AGPL-3.0+ # Parallel WWW checker my $usage = "$0 [-j JOBS] [-s SLOW_THRESHOLD] URL_OF_INBOX\n"; @@ -23,6 +23,16 @@ my %opts = ( GetOptions(%opts) or die "bad command-line args\n$usage"; my $root_url = shift or die $usage; +chomp(my $xmlstarlet = `which xmlstarlet 2>/dev/null`); +my $atom_check = eval { + require IPC::Run; + my $cmd = [ qw(xmlstarlet val -e -) ]; + sub { + my ($in, $out, $err) = @_; + IPC::Run::run($cmd, $in, $out, $err); + } +} if $xmlstarlet; + my %workers; $SIG{TERM} = sub { exit 0 }; $SIG{CHLD} = sub { @@ -146,7 +156,15 @@ sub worker_loop { # make sure the HTML source doesn't screw up terminals # when people curl the source (not remotely an expert # on languages or encodings, here). - next if $r->header('Content-Type') !~ m!\btext/html\b!; + my $ct = $r->header('Content-Type'); + if ($atom_check && $ct =~ m!\bapplication/atom\+xml\b!) { + my $raw = $r->decoded_content; + my ($out, $err) = ('', ''); + $atom_check->(\$raw, \$out, \$err) and + warn "Atom ($?) - $u - <1:$out> <2:$err>\n"; + } + + next if $ct !~ m!\btext/html\b!; my $dc = $r->decoded_content; if ($dc =~ /([\x00-\x08\x0d-\x1f\x7f-\x{99999999}]+)/s) { my $o = $1; -- EW