#!/usr/bin/env -S perl -I .

use strict;
use warnings;

use WgetFeature qw(iri);
use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Force remote encoding to ISO-8859-1
#
# NOTE(review): the command line below passes --local-encoding=utf-8 and no
# --remote-encoding option, which does not match this title; confirm which of
# the two is intended before relying on it.
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html          mime = iso-8859-15, links written with iso-8859-15 bytes
# p1_français.html    mime = iso-8859-1, meta = utf-8 (deliberately wrong),
#                     link written with iso-8859-1 bytes
# p2_één.html         mime = utf-8
# p3_€€€.html         mime = text/plain (no charset)
#

# Raw byte sequences for the non-ASCII characters used in links (legacy
# single-byte encodings) and in the expected local file names (UTF-8).
my $ccedilla_l15 = "\xE7";
my $ccedilla_u8  = "\xC3\xA7";
my $eacute_l1    = "\xE9";
my $eacute_u8    = "\xC3\xA9";
my $eurosign_l15 = "\xA4";
my $eurosign_u8  = "\xE2\x82\xAC";

# NOTE(review): the HTML markup of the pages below was reconstructed from the
# visible text and from the otherwise unused *_l15 / *_l1 variables; non-ASCII
# page text is written as HTML entities so the bodies stay encoding-neutral.
# Confirm against the upstream test before merging.

# Start page: links to page 1 and page 3 using ISO-8859-15 encoded bytes.
my $pageindex = <<EOF;
<html>
<head>
  <title>Main Page</title>
</head>
<body>
  <p>
    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
  </p>
</body>
</html>
EOF

# Page 1: its meta tag announces a charset that does not match the link bytes;
# the HTTP Content-Type configured in %urls is expected to take precedence.
my $pagefrancais = <<EOF;
<html>
<head>
  <title>La seule page en fran&ccedil;ais</title>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
  <p>
    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
  </p>
</body>
</html>
EOF

# Page 2: leaf page, no outgoing links.
my $pageeen = <<EOF;
<html>
<head>
  <title>Die enkele nederlandstalige pagina</title>
</head>
<body>
  <p>
    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
  </p>
</body>
</html>
EOF

# Page 3: leaf page, no outgoing links.
my $pageeuro = <<EOF;
<html>
<head>
  <title>Euro page</title>
</head>
<body>
  <p>
    My tailor isn't rich anymore.
  </p>
</body>
</html>
EOF

# Body for a "not found" answer; not referenced by %urls below.
my $page404 = <<EOF;
<html>
<head>
  <title>404</title>
</head>
<body>
  <p>
    Nop nop nop...
  </p>
</body>
</html>
EOF

# Resources offered by the test HTTP server.
# code, msg, headers, content
my %urls = (
    '/index.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=ISO-8859-15",
        },
        content => $pageindex,
    },
    '/robots.txt' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => "",
    },
    '/p1_fran%C3%A7ais.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            # wrong charset specified by meta tag in $pagefrancais, overridden by HTTP Content-Type
            "Content-type" => "text/html; charset=iso-8859-1",
        },
        content => $pagefrancais,
    },
    '/p2_%C3%A9%C3%A9n.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=UTF-8",
        },
        content => $pageeen,
    },
    # %E2%82%AC is U+20AC (euro sign): byte 0xA4 read as ISO-8859-15.
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => $pageeuro,
    },
    # %C2%A4 is U+00A4 (currency sign): byte 0xA4 read as ISO-8859-1.
    '/p3_%C2%A4%C2%A4%C2%A4.html' => {  # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => $pageeuro,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --iri -e robots=on --trust-server-names --local-encoding=utf-8 -nH -r http://localhost:{{port}}/";

my $expected_error_code = 0;

# Files that must exist after the run; names are given as UTF-8 byte strings.
my %expected_downloaded_files = (
    'index.html' => {
        content => $pageindex,
    },
    'robots.txt' => {
        content => "",
    },
    "p1_fran${ccedilla_u8}ais.html" => {
        content => $pagefrancais,
    },
    "p2_${eacute_u8}${eacute_u8}n.html" => {
        content => $pageeen,
    },
    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => {
        content => $pageeuro,
    },
);

###############################################################################

my $the_test = HTTPTest->new (input => \%urls,
                              cmdline => $cmdline,
                              errcode => $expected_error_code,
                              output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4