user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [RFC] www: set "<!DOCTYPE html>" everywhere
@ 2019-04-27 21:23 Eric Wong
  2019-04-28 22:32 ` Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2019-04-27 21:23 UTC (permalink / raw)
  To: meta

I'm no fan of the "Living Standard" quicksand that is HTML 5
(or of wasting 15 bytes on every response).  However, being able
to easily validate everything with tidy(1) seems alright...

t/check-www-inbox.perl now runs tidy(1) for every text/html
response.
---
 Documentation/txt2pre          |  2 +-
 lib/PublicInbox/Mbox.pm        |  2 +-
 lib/PublicInbox/Unsubscribe.pm |  2 +-
 lib/PublicInbox/WWW.pm         |  2 +-
 lib/PublicInbox/WwwStream.pm   |  2 +-
 t/check-www-inbox.perl         | 76 ++++++++++++++++++++--------------
 6 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/Documentation/txt2pre b/Documentation/txt2pre
index 4c4b2ca..4ad2372 100755
--- a/Documentation/txt2pre
+++ b/Documentation/txt2pre
@@ -19,7 +19,7 @@ $str = $l->linkify_1($str);
 $str = ascii_html($str);
 $str = $l->linkify_2($str);
 
-print '<html><head>',
+print '<!DOCTYPE html><html><head>',
   qq(<meta\nhttp-equiv="Content-Type"\ncontent="text/html; charset=utf-8"\n/>),
   "<title>$title</title>",
   "</head><body><pre>",  $str , '</pre></body></html>';
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 78dbe27..1e85573 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -222,7 +222,7 @@ sub need_gzip {
 	my $fh = $_[0]->([501, ['Content-Type' => 'text/html']]);
 	my $title = 'gzipped mbox not available';
 	$fh->write(<<EOF);
-<html><head><title>$title</title><body><pre>$title
+<!DOCTYPE html><html><head><title>$title</title><body><pre>$title
 The administrator needs to install the IO::Compress::Gzip Perl module
 to support gzipped mboxes.
 <a href="../">Return to index</a></pre></body></html>
diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm
index 11a347e..1e66011 100644
--- a/lib/PublicInbox/Unsubscribe.pm
+++ b/lib/PublicInbox/Unsubscribe.pm
@@ -134,7 +134,7 @@ sub finalize_unsub { # on POST
 
 sub r {
 	my ($self, $code, $title, @body) = @_;
-	[ $code, [ @CT_HTML ], [
+	[ $code, [ @CT_HTML ], [ '<!DOCTYPE html>' .
 		"<html><head><title>$title</title></head><body><pre>".
 		join("\n", "<b>$title</b>\n", @body) . '</pre><hr>'.
 		"<pre>This page is available under AGPL-3.0+\n" .
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 6e69001..6e46caa 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -314,7 +314,7 @@ sub searcher {
 sub need_search {
 	my ($ctx) = @_;
 	my $msg = <<EOF;
-<html><head><title>Search not available for this
+<!DOCTYPE html><html><head><title>Search not available for this
 public-inbox</title><body><pre>Search is not available for this public-inbox
 <a href="../">Return to index</a></pre></body></html>
 EOF
diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm
index 8b79923..811f6bc 100644
--- a/lib/PublicInbox/WwwStream.pm
+++ b/lib/PublicInbox/WwwStream.pm
@@ -64,7 +64,7 @@ sub _html_top ($) {
 	} else {
 		$top = '<pre>' . $top . "\n" . $links . '</pre>';
 	}
-	"<html><head><title>$title</title>" .
+	"<!DOCTYPE html><html><head><title>$title</title>" .
 		"<link\nrel=alternate\ntitle=\"Atom feed\"\n".
 		"href=\"$atom\"\ntype=\"application/atom+xml\"/>" .
 	        $ctx->{www}->style($upfx) .
diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl
index db292c5..0cab68e 100644
--- a/t/check-www-inbox.perl
+++ b/t/check-www-inbox.perl
@@ -30,37 +30,14 @@ my %opts = (
 GetOptions(%opts) or die "bad command-line args\n$usage";
 my $root_url = shift or die $usage;
 
-chomp(my $xmlstarlet = which('xmlstarlet'));
-my $atom_check = eval {
-	my $cmd = [ qw(xmlstarlet val -e -) ];
-	sub {
-		my ($in, $out, $err) = @_;
-		use autodie;
-		open my $in_fh, '+>', undef;
-		open my $out_fh, '+>', undef;
-		open my $err_fh, '+>', undef;
-		print $in_fh $$in;
-		$in_fh->flush;
-		sysseek($in_fh, 0, 0);
-		my $rdr = {
-			0 => fileno($in_fh),
-			1 => fileno($out_fh),
-			2 => fileno($err_fh),
-		};
-		my $pid = spawn($cmd, undef, $rdr);
-		defined $pid or die "spawn failure: $!";
-		while (waitpid($pid, 0) != $pid) {
-			next if $!{EINTR};
-			warn "waitpid(xmlstarlet, $pid) $!";
-			return $!;
-		}
-		sysseek($out_fh, 0, 0);
-		sysread($out_fh, $$out, -s $out_fh);
-		sysseek($err_fh, 0, 0);
-		sysread($err_fh, $$err, -s $err_fh);
-		$?
-	}
-} if $xmlstarlet;
+my $xmlstarlet = which('xmlstarlet');
+my $atom_check = cmd_check([ $xmlstarlet, qw(val -e -) ]) if $xmlstarlet;
+
+# FIXME: highlight creates empty spans:
+my @TIDY_OPT = qw(--drop-empty-elements 0);
+
+my $tidy = which('tidy');
+my $tidy_check = cmd_check([ $tidy, qw(-e -q), @TIDY_OPT ]) if $tidy;
 
 my %workers;
 $SIG{INT} = sub { exit 130 };
@@ -205,5 +182,42 @@ sub worker_loop {
 			my $c = Dumper($o);
 			warn "bad: $u $c\n";
 		}
+		if ($tidy_check) {
+			my $raw = $r->decoded_content;
+			my ($out, $err) = ('', '');
+			my $fail = $tidy_check->(\$raw, \$out, \$err);
+			warn "Tidy ($fail) - $u - <1:$out> <2:$err>\n" if $fail;
+		}
+	}
+}
+
+sub cmd_check {
+	my ($cmd) = @_;
+	sub {
+		my ($in, $out, $err) = @_;
+		use autodie;
+		open my $in_fh, '+>', undef;
+		open my $out_fh, '+>', undef;
+		open my $err_fh, '+>', undef;
+		print $in_fh $$in;
+		$in_fh->flush;
+		sysseek($in_fh, 0, 0);
+		my $rdr = {
+			0 => fileno($in_fh),
+			1 => fileno($out_fh),
+			2 => fileno($err_fh),
+		};
+		my $pid = spawn($cmd, undef, $rdr);
+		defined $pid or die "spawn failure: $!";
+		while (waitpid($pid, 0) != $pid) {
+			next if $!{EINTR};
+			warn "waitpid($cmd->[0], $pid) $!";
+			return $!;
+		}
+		sysseek($out_fh, 0, 0);
+		sysread($out_fh, $$out, -s $out_fh);
+		sysseek($err_fh, 0, 0);
+		sysread($err_fh, $$err, -s $err_fh);
+		$?
 	}
 }
-- 
EW

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [RFC] www: set "<!DOCTYPE html>" everywhere
  2019-04-27 21:23 [RFC] www: set "<!DOCTYPE html>" everywhere Eric Wong
@ 2019-04-28 22:32 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2019-04-28 22:32 UTC (permalink / raw)
  To: meta

Eric Wong <e@80x24.org> wrote:
> t/check-www-inbox.perl now runs tidy(1) for every text/html
> response, now.

tidy is definitely useful and exposed a bug with unescaped URLs
in ExtMsg:

https://public-inbox.org/git/20190428221229.22691-1-e@80x24.org/

> I'm no fan of the "Living Standard" quicksand that is HTML 5
> (or wasting 15 bytes on every response).  However, being easy
> to validate everything with tidy(1) seems alright...

However, the 15 wasted bytes at the beginning of every single
response still bother me.  AFAIK every HTML renderer works
fine without the doctype.

Also, our Atom feeds use XHTML (instead of HTML), since Atom
feed parsers need to understand XML, anyways, and may get
the non-overlapping parts of HTML wrong.


So, an alternate change could be merely prepending "<!doctype html>"
before we spawn tidy:

> --- a/t/check-www-inbox.perl
> +++ b/t/check-www-inbox.perl
> @@ -205,5 +182,42 @@ sub worker_loop {
>  			my $c = Dumper($o);
>  			warn "bad: $u $c\n";
>  		}
> +		if ($tidy_check) {
> +			my $raw = $r->decoded_content;

			my $raw = '<!doctype html>' . $r->decoded_content;

> +			my ($out, $err) = ('', '');
> +			my $fail = $tidy_check->(\$raw, \$out, \$err);
> +			warn "Tidy ($fail) - $u - <1:$out> <2:$err>\n" if $fail;
> +		}
> +	}
> +}

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-04-28 22:32 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-27 21:23 [RFC] www: set "<!DOCTYPE html>" everywhere Eric Wong
2019-04-28 22:32 ` Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).