Merge remote-tracking branch 'origin' into parallel-wget

2025-03-14 11:50:18 +08:00 · 2013-10-13 17:53:35 +02:00 · 2013-10-13 17:53:35 +02:00 · 93ad38686a
commit 93ad38686a
parent 89024dda4f fbd9b9c16b
10 changed files with 67 additions and 8 deletions
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2013-10-06  Tim Ruehsen  <tim.ruehsen@gmx.de>
+
+	* wget.texi: add/explain quoting of wildcard patterns
+
 2013-09-04  Tim Ruehsen  <tim.ruehsen@gmx.de>

 	* sample.wgetrc: added "secureprotocol" example
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -2100,6 +2100,8 @@ accept or reject (@pxref{Types of Files}). Note that if
 any of the wildcard characters, @samp{*}, @samp{?}, @samp{[} or
@samp{]}, appear in an element of @var{acclist} or @var{rejlist},
 it will be treated as a pattern, rather than a suffix.
+In this case, you have to enclose the pattern in quotes to prevent
+your shell from expanding it, as in @samp{-A "*.mp3"} or @samp{-A '*.mp3'}.

@item --accept-regex @var{urlregex}
@itemx --reject-regex @var{urlregex}
@@ -2157,8 +2159,10 @@ dedicated @samp{--page-requisites} option.
 Ignore case when matching files and directories.  This influences the
 behavior of -R, -A, -I, and -X options, as well as globbing
 implemented when downloading from FTP sites.  For example, with this
-option, @samp{-A *.txt} will match @samp{file1.txt}, but also
+option, @samp{-A "*.txt"} will match @samp{file1.txt}, but also
@samp{file2.TXT}, @samp{file3.TxT}, and so on.
+The quotes in the example are to prevent the shell from expanding the
+pattern.

@item -H
@itemx --span-hosts
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -15,6 +15,11 @@

 	* utils.c (run_with_timeout): abort when there are more threads.

+2013-10-10  Giuseppe Scrivano  <gscrivan@redhat.com>
+
+	* url.c (url_parse): Try to convert UTF-8 URLs to IDN.
+	* html-url.c (append_url): Parse URLs specifying an IRI structure.
+
 2013-09-13  Tim Ruehsen  <tim.ruehsen@gmx.de>

 	* recur.c (download_child_p): fix compile error when
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -284,6 +284,10 @@ append_url (const char *link_uri, int position, int size,
  const char *base = ctx->base ? ctx->base : ctx->parent_base;
  struct url *url;

+  struct iri *iri = iri_new ();
+  set_uri_encoding (iri, opt.locale, true);
+  iri->utf8_encode = true;
+
  if (!base)
    {
      DEBUGP (("%s: no base, merge will use \"%s\".\n",
@@ -301,7 +305,7 @@ append_url (const char *link_uri, int position, int size,
          return NULL;
        }

-      url = url_parse (link_uri, NULL, NULL, false);
+      url = url_parse (link_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -323,7 +327,7 @@ append_url (const char *link_uri, int position, int size,
               quote_n (2, link_uri),
               quotearg_n_style (3, escape_quoting_style, complete_uri)));

-      url = url_parse (complete_uri, NULL, NULL, false);
+      url = url_parse (complete_uri, NULL, iri, false);
      if (!url)
        {
          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -334,6 +338,8 @@ append_url (const char *link_uri, int position, int size,
      xfree (complete_uri);
    }

+  iri_free (iri);
+
  DEBUGP (("appending %s to urlpos.\n", quote (url->url)));

  newel = xnew0 (struct urlpos);
--- a/src/url.c
+++ b/src/url.c
@@ -702,7 +702,10 @@ url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
      if (!iri->utf8_encode)
        new_url = NULL;
      else
-        iri->orig_url = xstrdup (url);
+        {
+          iri->orig_url = xstrdup (url);
+          percent_encode = true;
+        }
    }

  /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,15 @@
+2013-10-10  Giuseppe Scrivano  <gscrivan@redhat.com>
+
+	* Test-idn-robots-utf8.px: Remove -H.
+	* Test-idn-cmd.px: Likewise.
+	* Test-idn-cmd-utf8.px: Likewise.
+	Suggested by: Tim Ruehsen <tim.ruehsen@gmx.de>
+
+2013-10-07  Tim Ruehsen <tim.ruehsen@gmx.de>
+
+	* Test-idn-robots.px: added punycoded and escaped URLs to follow;
+	  removed -H
+
 2013-08-22  Tim Ruehsen <tim.ruehsen@gmx.de>

 	* Makefile.am (EXTRA_DIST): Add Test--httpsonly-r.px.
--- a/tests/Test-idn-cmd-utf8.px
+++ b/tests/Test-idn-cmd-utf8.px
@@ -28,7 +28,7 @@ my %urls = (
    },
 );

-my $cmdline = $WgetTest::WGETPATH . " --iri -rH"
+my $cmdline = $WgetTest::WGETPATH . " --iri -r"
    . " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8 $utf8_hostname";

 my $expected_error_code = 0;
--- a/tests/Test-idn-cmd.px
+++ b/tests/Test-idn-cmd.px
@@ -28,7 +28,7 @@ my %urls = (
    },
 );

-my $cmdline = $WgetTest::WGETPATH . " --iri -rH"
+my $cmdline = $WgetTest::WGETPATH . " --iri -r"
    . " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP $euc_jp_hostname";

 my $expected_error_code = 0;
--- a/tests/Test-idn-robots-utf8.px
+++ b/tests/Test-idn-robots-utf8.px
@@ -48,7 +48,7 @@ my %urls = (
    },
 );

-my $cmdline = $WgetTest::WGETPATH . " --iri -rH"
+my $cmdline = $WgetTest::WGETPATH . " --iri -r"
    . " -e http_proxy=localhost:{{port}} --local-encoding=UTF-8"
    . " http://$utf8_hostname/";

--- a/tests/Test-idn-robots.px
+++ b/tests/Test-idn-robots.px
@@ -9,11 +9,14 @@ use HTTPTest;
 # " Kon'nichiwa <dot> Japan
 my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
 my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+my $escaped_hostname = "%ba%a3%c6%fc%a4%cf.%c6%fc%cb%dc";

 ###############################################################################

 my $starter_file = <<EOF;
 <a href="http://$euc_jp_hostname/foo.txt">The link</a>
+<a href="http://$punycoded_hostname/foo2.txt">The second link</a>
+<a href="http://$escaped_hostname/foo3.txt">The third link</a>
 EOF

 my $result_file = <<EOF;
@@ -38,6 +41,22 @@ my %urls = (
        },
        content => $result_file,
    },
+    "http://$punycoded_hostname/foo2.txt" => {
+        code => "200",
+        msg => "Uh-huh2",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+    "http://$punycoded_hostname/foo3.txt" => {
+        code => "200",
+        msg => "Uh-huh3",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
    "http://$punycoded_hostname/robots.txt" => {
        code => "200",
        msg => "Uh-huh",
@@ -48,7 +67,7 @@ my %urls = (
    },
 );

-my $cmdline = $WgetTest::WGETPATH . " --iri -rH"
+my $cmdline = $WgetTest::WGETPATH . " --iri -r"
    . " -e http_proxy=localhost:{{port}} --local-encoding=EUC-JP"
    . " http://$euc_jp_hostname/";

@@ -61,6 +80,12 @@ my %expected_downloaded_files = (
    "$punycoded_hostname/foo.txt" => {
        content => $result_file,
    },
+    "$punycoded_hostname/foo2.txt" => {
+        content => $result_file,
+    },
+    "$punycoded_hostname/foo3.txt" => {
+        content => $result_file,
+    },
    "$punycoded_hostname/robots.txt" => {
        content => '',
    },