Merge remote-tracking branch 'origin' into parallel-wget

This commit is contained in:
Giuseppe Scrivano 2013-10-13 17:53:35 +02:00
commit 93ad38686a
10 changed files with 67 additions and 8 deletions

View File

@ -1,3 +1,7 @@
2013-10-06 Tim Ruehsen <tim.ruehsen@gmx.de>
* wget.texi: add/explain quoting of wildcard patterns
2013-09-04 Tim Ruehsen <tim.ruehsen@gmx.de> 2013-09-04 Tim Ruehsen <tim.ruehsen@gmx.de>
* sample.wgetrc: added "secureprotocol" example * sample.wgetrc: added "secureprotocol" example

View File

@ -2100,6 +2100,8 @@ accept or reject (@pxref{Types of Files}). Note that if
any of the wildcard characters, @samp{*}, @samp{?}, @samp{[} or any of the wildcard characters, @samp{*}, @samp{?}, @samp{[} or
@samp{]}, appear in an element of @var{acclist} or @var{rejlist}, @samp{]}, appear in an element of @var{acclist} or @var{rejlist},
it will be treated as a pattern, rather than a suffix. it will be treated as a pattern, rather than a suffix.
In this case, you have to enclose the pattern in quotes to prevent
your shell from expanding it, as in @samp{-A "*.mp3"} or @samp{-A '*.mp3'}.
@item --accept-regex @var{urlregex} @item --accept-regex @var{urlregex}
@itemx --reject-regex @var{urlregex} @itemx --reject-regex @var{urlregex}
@ -2157,8 +2159,10 @@ dedicated @samp{--page-requisites} option.
Ignore case when matching files and directories. This influences the Ignore case when matching files and directories. This influences the
behavior of -R, -A, -I, and -X options, as well as globbing behavior of -R, -A, -I, and -X options, as well as globbing
implemented when downloading from FTP sites. For example, with this implemented when downloading from FTP sites. For example, with this
option, @samp{-A *.txt} will match @samp{file1.txt}, but also option, @samp{-A "*.txt"} will match @samp{file1.txt}, but also
@samp{file2.TXT}, @samp{file3.TxT}, and so on. @samp{file2.TXT}, @samp{file3.TxT}, and so on.
The quotes in the example are to prevent the shell from expanding the
pattern.
@item -H @item -H
@itemx --span-hosts @itemx --span-hosts

View File

@ -15,6 +15,11 @@
* utils.c (run_with_timeout): abort when there are more threads. * utils.c (run_with_timeout): abort when there are more threads.
2013-10-10 Giuseppe Scrivano <gscrivan@redhat.com>
* url.c (url_parse): Try to convert UTF-8 URLs to IDN.
* html-url.c (append_url): Parse URLs specifying an IRI structure.
2013-09-13 Tim Ruehsen <tim.ruehsen@gmx.de> 2013-09-13 Tim Ruehsen <tim.ruehsen@gmx.de>
* recur.c (download_child_p): fix compile error when * recur.c (download_child_p): fix compile error when

View File

@ -284,6 +284,10 @@ append_url (const char *link_uri, int position, int size,
const char *base = ctx->base ? ctx->base : ctx->parent_base; const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url; struct url *url;
struct iri *iri = iri_new ();
set_uri_encoding (iri, opt.locale, true);
iri->utf8_encode = true;
if (!base) if (!base)
{ {
DEBUGP (("%s: no base, merge will use \"%s\".\n", DEBUGP (("%s: no base, merge will use \"%s\".\n",
@ -301,7 +305,7 @@ append_url (const char *link_uri, int position, int size,
return NULL; return NULL;
} }
url = url_parse (link_uri, NULL, NULL, false); url = url_parse (link_uri, NULL, iri, false);
if (!url) if (!url)
{ {
DEBUGP (("%s: link \"%s\" doesn't parse.\n", DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -323,7 +327,7 @@ append_url (const char *link_uri, int position, int size,
quote_n (2, link_uri), quote_n (2, link_uri),
quotearg_n_style (3, escape_quoting_style, complete_uri))); quotearg_n_style (3, escape_quoting_style, complete_uri)));
url = url_parse (complete_uri, NULL, NULL, false); url = url_parse (complete_uri, NULL, iri, false);
if (!url) if (!url)
{ {
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -334,6 +338,8 @@ append_url (const char *link_uri, int position, int size,
xfree (complete_uri); xfree (complete_uri);
} }
iri_free (iri);
DEBUGP (("appending %s to urlpos.\n", quote (url->url))); DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
newel = xnew0 (struct urlpos); newel = xnew0 (struct urlpos);

View File

@ -702,7 +702,10 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
if (!iri->utf8_encode) if (!iri->utf8_encode)
new_url = NULL; new_url = NULL;
else else
iri->orig_url = xstrdup (url); {
iri->orig_url = xstrdup (url);
percent_encode = true;
}
} }
/* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/ /* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/

View File

@ -1,3 +1,15 @@
2013-10-10 Giuseppe Scrivano <gscrivan@redhat.com>
* Test-idn-robots-utf8.px: Remove -H.
* Test-idn-cmd.px: Likewise.
* Test-idn-cmd-utf8.px: Likewise.
Suggested by: Tim Ruehsen <tim.ruehsen@gmx.de>
2013-10-07 Tim Ruehsen <tim.ruehsen@gmx.de>
* Test-idn-robots.px: added punycoded and escaped URLs to follow;
removed -H.
2013-08-22 Tim Ruehsen <tim.ruehsen@gmx.de> 2013-08-22 Tim Ruehsen <tim.ruehsen@gmx.de>
* Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px. * Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px.

View File

@ -28,7 +28,7 @@ my %urls = (
}, },
); );
my $cmdline = $WgetTest::WGETPATH . " --iri -rH" my $cmdline = $WgetTest::WGETPATH . " --iri -r"
. " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8 $utf8_hostname"; . " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8 $utf8_hostname";
my $expected_error_code = 0; my $expected_error_code = 0;

View File

@ -28,7 +28,7 @@ my %urls = (
}, },
); );
my $cmdline = $WgetTest::WGETPATH . " --iri -rH" my $cmdline = $WgetTest::WGETPATH . " --iri -r"
. " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP $euc_jp_hostname"; . " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP $euc_jp_hostname";
my $expected_error_code = 0; my $expected_error_code = 0;

View File

@ -48,7 +48,7 @@ my %urls = (
}, },
); );
my $cmdline = $WgetTest::WGETPATH . " --iri -rH" my $cmdline = $WgetTest::WGETPATH . " --iri -r"
. " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8" . " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8"
. " http://$utf8_hostname/"; . " http://$utf8_hostname/";

View File

@ -9,11 +9,14 @@ use HTTPTest;
# " Kon'nichiwa <dot> Japan # " Kon'nichiwa <dot> Japan
my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334"; my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a'; my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
my $escaped_hostname = "%ba%a3%c6%fc%a4%cf.%c6%fc%cb%dc";
############################################################################### ###############################################################################
my $starter_file = <<EOF; my $starter_file = <<EOF;
<a href="http://$euc_jp_hostname/foo.txt">The link</a> <a href="http://$euc_jp_hostname/foo.txt">The link</a>
<a href="http://$punycoded_hostname/foo2.txt">The second link</a>
<a href="http://$escaped_hostname/foo3.txt">The third link</a>
EOF EOF
my $result_file = <<EOF; my $result_file = <<EOF;
@ -38,6 +41,22 @@ my %urls = (
}, },
content => $result_file, content => $result_file,
}, },
"http://$punycoded_hostname/foo2.txt" => {
code => "200",
msg => "Uh-huh2",
headers => {
'Content-Type' => 'text/plain',
},
content => $result_file,
},
"http://$punycoded_hostname/foo3.txt" => {
code => "200",
msg => "Uh-huh3",
headers => {
'Content-Type' => 'text/plain',
},
content => $result_file,
},
"http://$punycoded_hostname/robots.txt" => { "http://$punycoded_hostname/robots.txt" => {
code => "200", code => "200",
msg => "Uh-huh", msg => "Uh-huh",
@ -48,7 +67,7 @@ my %urls = (
}, },
); );
my $cmdline = $WgetTest::WGETPATH . " --iri -rH" my $cmdline = $WgetTest::WGETPATH . " --iri -r"
. " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP" . " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP"
. " http://$euc_jp_hostname/"; . " http://$euc_jp_hostname/";
@ -61,6 +80,12 @@ my %expected_downloaded_files = (
"$punycoded_hostname/foo.txt" => { "$punycoded_hostname/foo.txt" => {
content => $result_file, content => $result_file,
}, },
"$punycoded_hostname/foo2.txt" => {
content => $result_file,
},
"$punycoded_hostname/foo3.txt" => {
content => $result_file,
},
"$punycoded_hostname/robots.txt" => { "$punycoded_hostname/robots.txt" => {
content => '', content => '',
}, },