From d66a45f82859de0d7a92255dd73e544cf64ab7a9 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sat, 24 May 2008 23:57:56 +0200
Subject: [PATCH 01/58] Add autoconf code for IDN/IRIs support

---
 configure.ac | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/configure.ac b/configure.ac
index a994896b..8100f48b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -458,6 +458,60 @@ else
 fi
 AC_SUBST(COMMENT_IF_NO_POD2MAN)
 
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+AC_ARG_ENABLE(iri,
+  AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
+  [case "${enable_iri}" in
+    no)
+      dnl Disable IRIs checking
+      AC_MSG_NOTICE([disabling IRIs at user request])
+      iri=no
+      ;;
+    yes)
+      dnl IRIs explicitly enabled
+      iri=yes
+      force_iri=yes
+      ;;
+    auto)
+      dnl Auto-detect IRI
+      iri=yes
+      ;;
+    *)
+      AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+      ;;
+    esac
+  ], [
+    dnl If nothing is specified, assume auto-detection
+    iri=yes
+  ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+                                   [Support IDN/IRIs (needs GNU Libidn)]),
+                                   libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+  if test "$libidn" != ""; then
+    LDFLAGS="${LDFLAGS} -L$libidn/lib"
+    CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+  fi
+  AC_CHECK_HEADER(idna.h,
+    AC_CHECK_LIB(idn, stringprep_check_version,
+      [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+    iri=no)
+
+  if test "X$iri" != "Xno" ; then
+    AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+    AC_MSG_NOTICE([Enabling support for IRI.])
+  else
+    AC_MSG_WARN([Libidn not found])
+  fi
+fi
+
+
 dnl
 dnl Create output
 dnl

From d9cd5d220777a9e9abc8b54add709994d031d613 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 25 May 2008 03:34:28 +0200
Subject: [PATCH 02/58] Add options for enabling IRI support and forcing some
 encodings

---
 src/init.c    |  9 +++++++++
 src/main.c    | 30 ++++++++++++++++++++++++++++++
 src/options.h |  6 ++++++
 3 files changed, 45 insertions(+)

diff --git a/src/init.c b/src/init.c
index 97976553..8e8ed488 100644
--- a/src/init.c
+++ b/src/init.c
@@ -181,9 +181,15 @@ static const struct {
   { "inet6only",        &opt.ipv6_only,         cmd_boolean },
 #endif
   { "input",            &opt.input_filename,    cmd_file },
+#ifdef ENABLE_IRI
+  { "iri",              &opt.enable_iri,        cmd_boolean },
+#endif
   { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
   { "limitrate",        &opt.limit_rate,        cmd_bytes },
   { "loadcookies",      &opt.cookies_input,     cmd_file },
+#ifdef ENABLE_IRI
+  { "locale",           &opt.locale,            cmd_string },
+#endif
   { "logfile",          &opt.lfilename,         cmd_file },
   { "login",            &opt.ftp_user,          cmd_string },/* deprecated*/
   { "maxredirect",      &opt.max_redirect,      cmd_number },
@@ -223,6 +229,9 @@ static const struct {
   { "referer",          &opt.referer,           cmd_string },
   { "reject",           &opt.rejects,           cmd_vector },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
+#ifdef ENABLE_IRI
+  { "remoteencoding",   &opt.encoding_remote,   cmd_string },
+#endif
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
   { "restrictfilenames", NULL,                  cmd_spec_restrict_file_names },
   { "retrsymlinks",     &opt.retr_symlinks,     cmd_boolean },
diff --git a/src/main.c b/src/main.c
index d68cdbd6..4f033697 100644
--- a/src/main.c
+++ b/src/main.c
@@ -43,6 +43,9 @@ as that of the covered work.  */
 #include <assert.h>
 #include <errno.h>
 #include <time.h>
+#ifdef ENABLE_IRI
+#include <langinfo.h>
+#endif
 
 #include "utils.h"
 #include "init.h"
@@ -190,10 +193,16 @@ static struct cmdline_option option_data[] =
     { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
 #endif
     { "input-file", 'i', OPT_VALUE, "input", -1 },
+#ifdef ENABLE_IRI
+    { "iri", 0, OPT_BOOLEAN, "iri", -1 },
+#endif
     { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
     { "level", 'l', OPT_VALUE, "reclevel", -1 },
     { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
     { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+#ifdef ENABLE_IRI
+    { "locale", 0, OPT_VALUE, "locale", -1 },
+#endif
     { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
     { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
     { "no", 'n', OPT__NO, NULL, required_argument },
@@ -227,6 +236,9 @@ static struct cmdline_option option_data[] =
     { "referer", 0, OPT_VALUE, "referer", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+#ifdef ENABLE_IRI
+    { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
+#endif
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
     { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
     { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@@ -947,6 +959,24 @@ for details.\n\n"));
       exit (1);
     }
 
+#ifdef ENABLE_IRI
+  if (opt.enable_iri)
+    {
+      if (!opt.locale)
+        {
+          opt.locale = getenv ("CHARSET");
+
+          if (opt.locale == NULL)
+            opt.locale = nl_langinfo(CODESET);
+        }
+      else
+        {
+          /* sXXXav : check given locale */
+          logprintf (LOG_VERBOSE, "Check the locale...\n");
+        }
+    }
+#endif
+
   if (opt.ask_passwd)
     {
       opt.passwd = prompt_for_password ();
diff --git a/src/options.h b/src/options.h
index 6a6badb0..2927a37c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -237,6 +237,12 @@ struct options
   bool content_disposition;	/* Honor HTTP Content-Disposition header. */
   bool auth_without_challenge;  /* Issue Basic authentication creds without
                                    waiting for a challenge. */
+
+#ifdef ENABLE_IRI
+  bool enable_iri;
+  char *encoding_remote;
+  char *locale;
+#endif
 };
 
 extern struct options opt;

From 4aab9e8f0af29d8fb5d59746e4e5270080f95468 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 25 May 2008 03:49:00 +0200
Subject: [PATCH 03/58] Detect HTTP Content-Type server encoding

---
 src/http.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/http.c b/src/http.c
index 11dc9cc8..682258c0 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2040,9 +2040,32 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
       char *tmp = strchr (type, ';');
       if (tmp)
         {
+#ifdef ENABLE_IRI
+          char *tmp2 = tmp + 1;
+#endif
+
           while (tmp > type && c_isspace (tmp[-1]))
             --tmp;
           *tmp = '\0';
+
+#ifdef ENABLE_IRI
+          if (opt.enable_iri && *tmp2 != '\0' &&
+              (tmp = strstr (tmp2, "charset=")) != NULL)
+            {
+              tmp += 8;
+              tmp2 = tmp;
+
+              while (*tmp2 && !c_isspace (*tmp2))
+                tmp2++;
+
+              if (tmp2 > tmp)
+                {
+                  *tmp2 = '\0';
+                  /* sXXXav : check given charset */
+                  logprintf (LOG_VERBOSE, "HTTP charset: `%s'\n", tmp);
+                }
+            }
+#endif
         }
     }
   hs->newloc = resp_header_strdup (resp, "Location");

From ed558a83f6021fa0f2a138b302ece363c1d0783b Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 19 Jun 2008 22:07:03 +0200
Subject: [PATCH 04/58] The preferred way is to avoid #ifdef flooding, so take
 it that way. Introduce iri.c and iri.h for achieving it

---
 ChangeLog        |  4 +++
 configure.ac     |  4 +++
 src/ChangeLog    | 15 +++++++++++
 src/Makefile.am  |  8 ++++--
 src/build_info.c |  7 +++++
 src/http.c       | 25 ++++-------------
 src/init.c       |  6 -----
 src/iri.c        | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/iri.h        | 43 +++++++++++++++++++++++++++++
 src/main.c       | 13 ++++-----
 src/options.h    |  2 --
 11 files changed, 161 insertions(+), 36 deletions(-)
 create mode 100644 src/iri.c
 create mode 100644 src/iri.h

diff --git a/ChangeLog b/ChangeLog
index 407ce64c..8177dc97 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2008-06-14  Xavier Saint  <wget@sxav.eu>
+
+	* configure.ac: Add support for IRIs
+
 2008-05-29  Micah Cowan  <micah@cowan.name>
 
 	* po/*.po: Updated from TP (the 1.11.3 set).
diff --git a/configure.ac b/configure.ac
index 8100f48b..44f397b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -512,6 +512,10 @@ if test "X$iri" != "Xno"; then
 fi
 
 
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
 dnl
 dnl Create output
 dnl
diff --git a/src/ChangeLog b/src/ChangeLog
index 9af9267b..ac27e15a 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,18 @@
+2008-06-14  Xavier Saint  <wget@sxav.eu>
+
+ 	* iri.c, iri.h : New files.
+
+	* Makefile.am : Add files iri.h and conditional iri.c.
+
+	* build_info.c : Add compiled feature "iri".
+
+	* http.c : include iri.h and parse charset from Content-Type
+	header.
+
+	* init.c, main.c, options.h : if an options isn't supported
+	at compiled time, don't get rid off it and show a dummy
+	message instead if they are used.
+
 2008-06-13  Micah Cowan  <micah@cowan.name>
 
 	* build_info.c: ENABLE_NTLM, not HAVE_NTLM.
diff --git a/src/Makefile.am b/src/Makefile.am
index c8485cfd..379a9b8c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,6 +30,10 @@
 # Version: @VERSION@
 #
 
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
 # The following line is losing on some versions of make!
 DEFS     = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
 LIBS     = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@@ -39,10 +43,10 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c    \
 	       ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
 	       http.c init.c log.c main.c netrc.c progress.c ptimer.c     \
 	       recur.c res.c retr.c snprintf.c spider.c url.c	          \
-	       utils.c				          \
+	       utils.c $(IRI_OBJ)				          \
 	       connect.h convert.h cookies.h 		                  \
 	       ftp.h gen-md5.h hash.h host.h html-parse.h                 \
-	       http.h http-ntlm.h init.h log.h mswindows.h netrc.h        \
+	       http.h http-ntlm.h init.h iri.h log.h mswindows.h netrc.h  \
 	       options.h progress.h ptimer.h recur.h res.h retr.h         \
 	       spider.h ssl.h sysdep.h url.h utils.h wget.h
 nodist_wget_SOURCES = version.c
diff --git a/src/build_info.c b/src/build_info.c
index 1ac682a7..129bf726 100644
--- a/src/build_info.c
+++ b/src/build_info.c
@@ -100,6 +100,13 @@ const char* (compiled_features[]) =
 #else
   "-gettext",
 #endif
+
+#ifdef ENABLE_IRI
+  "+iri",
+#else
+  "-iri",
+#endif
+
   /* sentinel value */
   NULL
 };
diff --git a/src/http.c b/src/http.c
index 543165fd..741ed2c0 100644
--- a/src/http.c
+++ b/src/http.c
@@ -49,6 +49,7 @@ as that of the covered work.  */
 #include "retr.h"
 #include "connect.h"
 #include "netrc.h"
+#include "iri.h"
 #ifdef HAVE_SSL
 # include "ssl.h"
 #endif
@@ -2040,32 +2041,16 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
       char *tmp = strchr (type, ';');
       if (tmp)
         {
-#ifdef ENABLE_IRI
+          /* sXXXav: only needed if IRI support is enabled */
           char *tmp2 = tmp + 1;
-#endif
 
           while (tmp > type && c_isspace (tmp[-1]))
             --tmp;
           *tmp = '\0';
 
-#ifdef ENABLE_IRI
-          if (opt.enable_iri && *tmp2 != '\0' &&
-              (tmp = strstr (tmp2, "charset=")) != NULL)
-            {
-              tmp += 8;
-              tmp2 = tmp;
-
-              while (*tmp2 && !c_isspace (*tmp2))
-                tmp2++;
-
-              if (tmp2 > tmp)
-                {
-                  *tmp2 = '\0';
-                  /* sXXXav : check given charset */
-                  logprintf (LOG_VERBOSE, "HTTP charset: `%s'\n", tmp);
-                }
-            }
-#endif
+          /* Try to get remote encoding if needed */
+          if (opt.enable_iri && !opt.encoding_remote)
+            /* xxx = */ parse_charset (tmp2);
         }
     }
   hs->newloc = resp_header_strdup (resp, "Location");
diff --git a/src/init.c b/src/init.c
index 167c84fe..a7a4ee01 100644
--- a/src/init.c
+++ b/src/init.c
@@ -181,15 +181,11 @@ static const struct {
   { "inet6only",        &opt.ipv6_only,         cmd_boolean },
 #endif
   { "input",            &opt.input_filename,    cmd_file },
-#ifdef ENABLE_IRI
   { "iri",              &opt.enable_iri,        cmd_boolean },
-#endif
   { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
   { "limitrate",        &opt.limit_rate,        cmd_bytes },
   { "loadcookies",      &opt.cookies_input,     cmd_file },
-#ifdef ENABLE_IRI
   { "locale",           &opt.locale,            cmd_string },
-#endif
   { "logfile",          &opt.lfilename,         cmd_file },
   { "login",            &opt.ftp_user,          cmd_string },/* deprecated*/
   { "maxredirect",      &opt.max_redirect,      cmd_number },
@@ -229,9 +225,7 @@ static const struct {
   { "referer",          &opt.referer,           cmd_string },
   { "reject",           &opt.rejects,           cmd_vector },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
-#ifdef ENABLE_IRI
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
-#endif
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
   { "restrictfilenames", NULL,                  cmd_spec_restrict_file_names },
   { "retrsymlinks",     &opt.retr_symlinks,     cmd_boolean },
diff --git a/src/iri.c b/src/iri.c
new file mode 100644
index 00000000..e4f4622b
--- /dev/null
+++ b/src/iri.c
@@ -0,0 +1,70 @@
+/* IRI related functions.
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+   2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+#include "utils.h"
+#include "iri.h"
+
+
+/* Given a string containing "charset=XXX", return the encoding if found,
+   or NULL otherwise */
+char *
+parse_charset (char *str)
+{
+  char *charset;
+
+  if (!str || !*str)
+    return NULL;
+
+  str = strcasestr (str, "charset=");
+  if (!str)
+    return NULL;
+
+  str += 8;
+  charset = str;
+
+  /* sXXXav: which chars should be banned ??? */
+  while (*charset && !c_isspace (*charset))
+    charset++;
+
+  /* sXXXav: could strdupdelim return NULL ? */
+  charset = strdupdelim (str, charset);
+  logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
+
+  return charset;
+}
+
+
diff --git a/src/iri.h b/src/iri.h
new file mode 100644
index 00000000..d135e868
--- /dev/null
+++ b/src/iri.h
@@ -0,0 +1,43 @@
+/* Internationalization related declarations.
+   Copyright (C) 2000, 2007, 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#ifndef IRI_H
+#define IRI_H
+
+#ifdef ENABLE_IRI
+
+char *parse_charset (char *str);
+
+
+#else /* ENABLE_IRI */
+
+#define parse_charset(str)	/* no-op */
+
+#endif /* ENABLE_IRI */
+#endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index 9b449438..8002c1be 100644
--- a/src/main.c
+++ b/src/main.c
@@ -203,16 +203,12 @@ static struct cmdline_option option_data[] =
     { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
 #endif
     { "input-file", 'i', OPT_VALUE, "input", -1 },
-#ifdef ENABLE_IRI
     { "iri", 0, OPT_BOOLEAN, "iri", -1 },
-#endif
     { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
     { "level", 'l', OPT_VALUE, "reclevel", -1 },
     { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
     { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
-#ifdef ENABLE_IRI
     { "locale", 0, OPT_VALUE, "locale", -1 },
-#endif
     { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
     { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
     { "no", 'n', OPT__NO, NULL, required_argument },
@@ -246,9 +242,7 @@ static struct cmdline_option option_data[] =
     { "referer", 0, OPT_VALUE, "referer", -1 },
     { "reject", 'R', OPT_VALUE, "reject", -1 },
     { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
-#ifdef ENABLE_IRI
     { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
-#endif
     { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
     { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
     { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@@ -1085,6 +1079,13 @@ for details.\n\n"));
           logprintf (LOG_VERBOSE, "Check the locale...\n");
         }
     }
+#else
+  if (opt.enable_iri || opt.locale || opt.encoding_remote)
+    {
+      /* sXXXav : be more specific... */
+      printf(_("This version does not have support for IRIs\n"));
+      exit(1);
+    }
 #endif
 
   if (opt.ask_passwd)
diff --git a/src/options.h b/src/options.h
index 2927a37c..723f80a1 100644
--- a/src/options.h
+++ b/src/options.h
@@ -238,11 +238,9 @@ struct options
   bool auth_without_challenge;  /* Issue Basic authentication creds without
                                    waiting for a challenge. */
 
-#ifdef ENABLE_IRI
   bool enable_iri;
   char *encoding_remote;
   char *locale;
-#endif
 };
 
 extern struct options opt;

From 13fec855660ee55c43f64fe47fbc284f35ca6e6e Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 19 Jun 2008 22:33:38 +0200
Subject: [PATCH 05/58] Add "content-type" meta tag parsing for retrieving HTML
 page encoding.

---
 src/ChangeLog  |  8 ++++++++
 src/html-url.c | 20 ++++++++++++++++++++
 src/iri.h      |  2 +-
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index ac27e15a..e30990b0 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,11 @@
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+	* html-url.c : Add "content-type" meta tag parsing for
+	retrieving page encoding.
+
+	* iri.h : Make no-op version of parse_charset() return
+	NULL.
+
 2008-06-14  Xavier Saint  <wget@sxav.eu>
 
  	* iri.c, iri.h : New files.
diff --git a/src/html-url.c b/src/html-url.c
index 0eb66506..9b515432 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -42,6 +42,7 @@ as that of the covered work.  */
 #include "hash.h"
 #include "convert.h"
 #include "recur.h"              /* declaration of get_urls_html */
+#include "iri.h"
 
 struct map_context;
 
@@ -534,6 +535,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
           entry->link_expect_html = 1;
         }
     }
+  else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+    {
+      /* Handle stuff like:
+         <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+      char *mcharset;
+      char *content = find_attr (tag, "content", NULL);
+      if (!content)
+        return;
+
+      mcharset = parse_charset (content);
+      if (!mcharset)
+        return;
+
+      logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
+
+      /* sXXXav: Not used yet */
+      xfree (mcharset);
+    }
   else if (name && 0 == strcasecmp (name, "robots"))
     {
       /* Handle stuff like:
diff --git a/src/iri.h b/src/iri.h
index d135e868..2ac7d5e7 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -37,7 +37,7 @@ char *parse_charset (char *str);
 
 #else /* ENABLE_IRI */
 
-#define parse_charset(str)	/* no-op */
+#define parse_charset(str)	NULL
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */

From 2baf3239333fd28763ce4135c38d6e85dcbb8cfc Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 19 Jun 2008 23:10:06 +0200
Subject: [PATCH 06/58] Introduce find_locale() to find out local system
 encoding.

---
 src/ChangeLog |  7 +++++++
 src/iri.c     |  9 +++++++++
 src/iri.h     |  3 ++-
 src/main.c    | 15 +++++----------
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index e30990b0..ef69dca6 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,10 @@
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+	* iri.c, iri.h : Add a new function find_locale() to find
+	out the local system encoding.
+
+	* main.c : Make use of find_locale().
+
 2008-06-19  Xavier Saint  <wget@sxav.eu>
 
 	* html-url.c : Add "content-type" meta tag parsing for
diff --git a/src/iri.c b/src/iri.c
index e4f4622b..797ffa44 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -68,3 +68,12 @@ parse_charset (char *str)
 }
 
 
+/* Find the locale used, or fall back on a default value */
+char *
+find_locale (void)
+{
+  /* sXXXav, made our own function or use libidn one ?! */
+  return stringprep_locale_charset ();
+}
+
+
diff --git a/src/iri.h b/src/iri.h
index 2ac7d5e7..eb344d9f 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -33,11 +33,12 @@ as that of the covered work.  */
 #ifdef ENABLE_IRI
 
 char *parse_charset (char *str);
-
+char *find_locale (void);
 
 #else /* ENABLE_IRI */
 
 #define parse_charset(str)	NULL
+#define find_locale()		NULL
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index 8002c1be..fc41153b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -57,6 +57,7 @@ as that of the covered work.  */
 #include "convert.h"
 #include "spider.h"
 #include "http.h"               /* for save_cookies */
+#include "iri.h"
 
 #include <getopt.h>
 #include <getpass.h>
@@ -1067,17 +1068,11 @@ for details.\n\n"));
   if (opt.enable_iri)
     {
       if (!opt.locale)
-        {
-          opt.locale = getenv ("CHARSET");
+        opt.locale = find_locale ();
 
-          if (opt.locale == NULL)
-            opt.locale = nl_langinfo(CODESET);
-        }
-      else
-        {
-          /* sXXXav : check given locale */
-          logprintf (LOG_VERBOSE, "Check the locale...\n");
-        }
+      /* sXXXav : check given locale and remote encoding */
+
+      logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
     }
 #else
   if (opt.enable_iri || opt.locale || opt.encoding_remote)

From be546c20cb08f9c25a6bd98bcdf44394e3e8a846 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 19 Jun 2008 23:53:03 +0200
Subject: [PATCH 07/58] Set an initial value for IRIs options and some minor
 additions in iri.c

---
 src/ChangeLog | 8 ++++++++
 src/init.c    | 8 ++++++++
 src/iri.c     | 4 +++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index ef69dca6..7ad7c8ca 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,11 @@
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+	* iri.c : Include missing stringprep.h file and add a
+	cast.
+
+	* init.c : set a default initial value for opt.enable_iri,
+	opt.locale and opt.encoding_remote.
+
 2008-06-19  Xavier Saint  <wget@sxav.eu>
 
 	* iri.c, iri.h : Add a new function find_locale() to find
diff --git a/src/init.c b/src/init.c
index a7a4ee01..f56aa652 100644
--- a/src/init.c
+++ b/src/init.c
@@ -333,6 +333,14 @@ defaults (void)
   opt.restrict_files_case = restrict_no_case_restriction;
 
   opt.max_redirect = 20;
+
+#ifdef ENABLE_IRI
+  opt.enable_iri = true;
+#else
+  opt.enable_iri = false;
+#endif
+  opt.locale = NULL;
+  opt.encoding_remote = NULL;
 }
 
 /* Return the user's home directory (strdup-ed), or NULL if none is
diff --git a/src/iri.c b/src/iri.c
index 797ffa44..b87e6ebe 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -35,6 +35,8 @@ as that of the covered work.  */
 #include <assert.h>
 #include <string.h>
 
+#include <stringprep.h>
+
 #include "utils.h"
 #include "iri.h"
 
@@ -73,7 +75,7 @@ char *
 find_locale (void)
 {
   /* sXXXav, made our own function or use libidn one ?! */
-  return stringprep_locale_charset ();
+  return (char *) stringprep_locale_charset ();
 }
 
 

From e6376b47433be6a0df64b0cd87b2d5c2c53a66f1 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Fri, 20 Jun 2008 00:33:02 +0200
Subject: [PATCH 08/58] Introduce a new function check_encoding_name() for
 doing a basic check on encoding name validity

---
 src/ChangeLog |  7 +++++++
 src/iri.c     | 29 +++++++++++++++++++++++++++++
 src/iri.h     |  6 ++++--
 src/main.c    |  6 +++++-
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 7ad7c8ca..6dcaa279 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,10 @@
+2008-06-19  Xavier Saint  <wget@sxav.eu>
+
+	* iri.c, iri.h : New function check_encoding_name() as
+	a priliminary encoding name check.
+
+	* main.c, iri.c : Make use of check_encoding_name().
+
 2008-06-19  Xavier Saint  <wget@sxav.eu>
 
 	* iri.c : Include missing stringprep.h file and add a
diff --git a/src/iri.c b/src/iri.c
index b87e6ebe..fea7b150 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -64,6 +64,14 @@ parse_charset (char *str)
 
   /* sXXXav: could strdupdelim return NULL ? */
   charset = strdupdelim (str, charset);
+
+  /* Do a minimum check on the charset value */
+  if (!check_encoding_name (charset))
+    {
+      xfree (charset);
+      return NULL;
+    }
+
   logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
 
   return charset;
@@ -79,3 +87,24 @@ find_locale (void)
 }
 
 
+/* Basic check of an encoding name. */
+bool
+check_encoding_name (char *encoding)
+{
+  char *s = encoding;
+
+  while (*s)
+    {
+      if (!c_isascii(*s) || c_isspace(*s))
+        {
+          logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding));
+          return false;
+        }
+
+      s++;
+    }
+
+  return true;
+}
+
+
diff --git a/src/iri.h b/src/iri.h
index eb344d9f..85a7fb7f 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -34,11 +34,13 @@ as that of the covered work.  */
 
 char *parse_charset (char *str);
 char *find_locale (void);
+bool check_encoding_name (char *encoding);
 
 #else /* ENABLE_IRI */
 
-#define parse_charset(str)	NULL
-#define find_locale()		NULL
+#define parse_charset(str)		NULL
+#define find_locale()			NULL
+#define check_encoding_name(str)	false
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index fc41153b..53ea6b91 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1067,10 +1067,14 @@ for details.\n\n"));
 #ifdef ENABLE_IRI
   if (opt.enable_iri)
     {
+      if (opt.locale && !check_encoding_name(opt.locale))
+        opt.locale = NULL;
+
       if (!opt.locale)
         opt.locale = find_locale ();
 
-      /* sXXXav : check given locale and remote encoding */
+      if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote))
+        opt.encoding_remote = NULL;
 
       logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
     }

From 85ecaaea66d2b140347476081248154f5489e108 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Fri, 20 Jun 2008 00:37:15 +0200
Subject: [PATCH 09/58] Fix a typo in src/ChangeLog

---
 src/ChangeLog | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 6dcaa279..c707eedf 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,7 +1,7 @@
 2008-06-19  Xavier Saint  <wget@sxav.eu>
 
 	* iri.c, iri.h : New function check_encoding_name() as
-	a priliminary encoding name check.
+	a preliminary encoding name check.
 
 	* main.c, iri.c : Make use of check_encoding_name().
 

From 5bb11da009c2f3bc4381bc8009c57007fd86534e Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 26 Jun 2008 17:59:07 +0200
Subject: [PATCH 10/58] Basic support of IRIs.

---
 src/ChangeLog |   9 ++++
 src/iri.c     | 134 ++++++++++++++++++++++++++++++++++++++++++++++++--
 src/iri.h     |  10 ++--
 src/url.c     |  20 +++++++-
 4 files changed, 166 insertions(+), 7 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 6dcaa279..288ec11d 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,12 @@
+2008-06-26  Xavier Saint  <wget@sxav.eu>
+
+	* iri.c, iri.h : New functions locale_to_utf8() and
+	idn_encode() adding basic capabilities of IRI/IDN.
+
+	* url.c : Convert URLs from locale to UTF-8 allowing a basic
+	support of IRI/IDN
+
+
 2008-06-19  Xavier Saint  <wget@sxav.eu>
 
 	* iri.c, iri.h : New function check_encoding_name() as
diff --git a/src/iri.c b/src/iri.c
index fea7b150..5fb06d09 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -34,13 +34,22 @@ as that of the covered work.  */
 #include <stdlib.h>
 #include <assert.h>
 #include <string.h>
-
+#include <iconv.h>
 #include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "iri.h"
 
 
+static iconv_t locale2utf8;
+
+
+static bool open_locale_to_utf8 (void);
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
 /* Given a string containing "charset=XXX", return the encoding if found,
    or NULL otherwise */
 char *
@@ -77,7 +86,6 @@ parse_charset (char *str)
   return charset;
 }
 
-
 /* Find the locale used, or fall back on a default value */
 char *
 find_locale (void)
@@ -86,7 +94,6 @@ find_locale (void)
   return (char *) stringprep_locale_charset ();
 }
 
-
 /* Basic check of an encoding name. */
 bool
 check_encoding_name (char *encoding)
@@ -107,4 +114,125 @@ check_encoding_name (char *encoding)
   return true;
 }
 
+/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
+static bool
+open_locale_to_utf8 (void)
+{
+  if (locale2utf8)
+    return true;
+
+  /* sXXXav : That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale)
+    return false;
+
+  locale2utf8 = iconv_open ("UTF-8", opt.locale);
+  if (locale2utf8 != (iconv_t)(-1))
+    return true;
+
+  logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+             quote (opt.locale), quote("UTF-8"));
+  locale2utf8 = NULL;
+  return false;
+}
+
+/* Return a new string */
+const char *
+locale_to_utf8 (const char *str)
+{
+  char *new;
+
+  if (!strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  if (!open_locale_to_utf8 ())
+    return str;
+
+  if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
+    return (const char *) new;
+
+  return str;
+}
+
+/* */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* sXXXav : hummm hard to guess... */
+  size_t len, done, outlen = inlen * 2;
+  int invalid = 0, tooshort = 0;
+  char *s;
+
+  s = xmalloc (outlen + 1);
+  *out = s;
+  len = outlen;
+  done = 0;
+
+  /* sXXXav : put a maximum looping factor ??? */
+  for (;;)
+    {
+      if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
+        {
+          *out = s;
+          *(s + len - outlen - done) = '\0';
+          return true;
+        }
+
+      /* Incomplete or invalid multibyte sequence */
+      if (errno == EINVAL || errno == EILSEQ)
+        {
+          invalid++;
+          **out = *in;
+          in++;
+          inlen--;
+          (*out)++;
+          outlen--;
+        }
+      else if (errno == E2BIG) /* Output buffer full */ 
+        {
+          char *new;
+
+          tooshort++;
+          done = len;
+          outlen = done + inlen * 2;
+          new = xmalloc (outlen + 1);
+          memcpy (new, s, done);
+          xfree (s);
+          s = new;
+          len = outlen;
+          *out = s + done;
+        }
+      else /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
+          break;
+        }
+    }
+
+    return false;
+}
+
+/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL
+   on error. */
+char *idn_encode (char *host)
+{
+  char *new;
+  int ret;
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &new, 0);
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return new;
+}
 
diff --git a/src/iri.h b/src/iri.h
index 85a7fb7f..4488501d 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -35,12 +35,16 @@ as that of the covered work.  */
 char *parse_charset (char *str);
 char *find_locale (void);
 bool check_encoding_name (char *encoding);
+const char *locale_to_utf8 (const char *str);
+char *idn_encode (char *host);
 
 #else /* ENABLE_IRI */
 
-#define parse_charset(str)		NULL
-#define find_locale()			NULL
-#define check_encoding_name(str)	false
+#define parse_charset(str)          NULL
+#define find_locale()               NULL
+#define check_encoding_name(str)    false
+#define locale_to_utf8(str)         (str)
+#define idn_encode(str)             NULL
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/url.c b/src/url.c
index f5d621f9..48b23d6c 100644
--- a/src/url.c
+++ b/src/url.c
@@ -42,6 +42,7 @@ as that of the covered work.  */
 #include "utils.h"
 #include "url.h"
 #include "host.h"  /* for is_valid_ipv6_address */
+#include "iri.h"
 
 #ifdef TESTING
 #include "test.h"
@@ -670,6 +671,12 @@ url_parse (const char *url, int *error)
       goto error;
     }
 
+  if (opt.enable_iri)
+    {
+      url_unescape ((char *) url);
+      url = locale_to_utf8(url);
+    }
+
   url_encoded = reencode_escapes (url);
   p = url_encoded;
 
@@ -844,6 +851,17 @@ url_parse (const char *url, int *error)
       host_modified = true;
     }
 
+  if (opt.enable_iri)
+    {
+      char *new = idn_encode (u->host);
+      if (new)
+        {
+          xfree (u->host);
+          u->host = new;
+          host_modified = true;
+        }
+    }
+
   if (params_b)
     u->params = strdupdelim (params_b, params_e);
   if (query_b)
@@ -851,7 +869,7 @@ url_parse (const char *url, int *error)
   if (fragment_b)
     u->fragment = strdupdelim (fragment_b, fragment_e);
 
-  if (path_modified || u->fragment || host_modified || path_b == path_e)
+  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
     {
       /* If we suspect that a transformation has rendered what
          url_string might return different from URL_ENCODED, rebuild

From 4c9adcd1e4615ed4dba79958dc610f3367e5ade3 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 26 Jun 2008 19:14:14 +0200
Subject: [PATCH 11/58] Check for libiconv needed for IRIs support

---
 ChangeLog    |  4 ++++
 configure.ac | 13 +++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 8177dc97..89898414 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2008-06-26  Xavier Saint  <wget@sxav.eu>
+
+	* configure.ac : IRIs support required libiconv, check it.
+
 2008-06-14  Xavier Saint  <wget@sxav.eu>
 
 	* configure.ac: Add support for IRIs
diff --git a/configure.ac b/configure.ac
index 44f397b9..b2923436 100644
--- a/configure.ac
+++ b/configure.ac
@@ -493,6 +493,19 @@ AC_ARG_ENABLE(iri,
 AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
                                    [Support IDN/IRIs (needs GNU Libidn)]),
                                    libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+  AM_ICONV
+
+  if test "X$am_cv_func_iconv" != "Xyes"; then
+    iri=no
+    if test "X$force_iri" = "Xyes"; then
+      AC_MSG_ERROR([Libiconv is required for IRIs support])
+    else
+      AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
+    fi
+  fi
+fi
+
 if test "X$iri" != "Xno"; then
   if test "$libidn" != ""; then
     LDFLAGS="${LDFLAGS} -L$libidn/lib"

From 8c204b746399e3f3a42fbdadd47ad8831727c818 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 26 Jun 2008 21:42:37 +0200
Subject: [PATCH 12/58] Fix a typo in a comment: impplement -> implement

---
 src/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/log.c b/src/log.c
index e84e5c61..b62bf9dd 100644
--- a/src/log.c
+++ b/src/log.c
@@ -43,7 +43,7 @@ as that of the covered work.  */
 #include "utils.h"
 #include "log.h"
 
-/* This file impplement support for "logging".  Logging means printing
+/* This file implement support for "logging".  Logging means printing
    output, plus several additional features:
 
    - Cataloguing output by importance.  You can specify that a log

From 26252ac4ca2d62d08e80e77d1f613b0bdbdd9bc5 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Mon, 30 Jun 2008 20:03:01 +0200
Subject: [PATCH 13/58] escnonprint has been replaced by quotearg_style;
 reflect that change in comments too

---
 src/ftp-basic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ftp-basic.c b/src/ftp-basic.c
index 265a1e25..5f250959 100644
--- a/src/ftp-basic.c
+++ b/src/ftp-basic.c
@@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
         return FTPRERR;
 
       /* Strip trailing CRLF before printing the line, so that
-         escnonprint doesn't include bogus \012 and \015. */
+         quoting doesn't include bogus \012 and \015. */
       p = strchr (line, '\0');
       if (p > line && p[-1] == '\n')
         *--p = '\0';

From d687972c5052db9500f6d2cd689eee2c6f4c39ab Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Tue, 1 Jul 2008 19:28:24 +0200
Subject: [PATCH 14/58] Fix copyright year and some GNU coding style

---
 src/iri.c | 9 ++++-----
 src/iri.h | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 5fb06d09..1792ab62 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -1,6 +1,5 @@
 /* IRI related functions.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
-   2008 Free Software Foundation, Inc.
+   Copyright (C) 2008 Free Software Foundation, Inc.
 
 This file is part of GNU Wget.
 
@@ -102,9 +101,9 @@ check_encoding_name (char *encoding)
 
   while (*s)
     {
-      if (!c_isascii(*s) || c_isspace(*s))
+      if (!c_isascii (*s) || c_isspace (*s))
         {
-          logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding));
+          logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
           return false;
         }
 
@@ -136,7 +135,7 @@ open_locale_to_utf8 (void)
     return true;
 
   logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
-             quote (opt.locale), quote("UTF-8"));
+             quote (opt.locale), quote ("UTF-8"));
   locale2utf8 = NULL;
   return false;
 }
diff --git a/src/iri.h b/src/iri.h
index 4488501d..64858476 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -1,5 +1,5 @@
 /* Internationalization related declarations.
-   Copyright (C) 2000, 2007, 2008 Free Software Foundation, Inc.
+   Copyright (C) 2008 Free Software Foundation, Inc.
 
 This file is part of GNU Wget.
 

From 85185bde1b9729a27c3841560232266f77f13166 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Tue, 1 Jul 2008 19:34:37 +0200
Subject: [PATCH 15/58] Emit a message if we found invalid or incomplete
 multibyte sequences

---
 src/iri.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/iri.c b/src/iri.c
index 1792ab62..dfcb0578 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -185,6 +185,10 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
       /* Incomplete or invalid multibyte sequence */
       if (errno == EINVAL || errno == EILSEQ)
         {
+          if (!invalid)
+            logprintf (LOG_VERBOSE,
+                      "Incomplete or invalide multibyte sequence encountered\n");
+
           invalid++;
           **out = *in;
           in++;

From 99396653c22e54e13d9be63f6c333a4b33c6cbda Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Wed, 2 Jul 2008 16:37:28 +0200
Subject: [PATCH 16/58] Show also the hostname in the locale when possible

---
 src/ChangeLog |  8 ++++++++
 src/host.c    | 21 +++++++++++++++++++--
 src/iri.c     | 21 ++++++++++++++++++++-
 src/iri.h     |  2 ++
 4 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 9e31b1c4..7aca0527 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,11 @@
+2008-07-02  Xavier Saint  <wget@sxav.eu>
+
+	* iri.c, iri.h  : New function idn_decode() to decode ASCII
+	encoded hostname to the locale.
+
+	* host.c : Show hostname to be resolved both in locale and
+	ASCII encoded.
+
 2008-06-26  Xavier Saint  <wget@sxav.eu>
 
 	* iri.c, iri.h : New functions locale_to_utf8() and
diff --git a/src/host.c b/src/host.c
index fdb35b1c..8a1495f0 100644
--- a/src/host.c
+++ b/src/host.c
@@ -53,6 +53,7 @@ as that of the covered work.  */
 #include "host.h"
 #include "url.h"
 #include "hash.h"
+#include "iri.h"
 
 #ifndef NO_ADDRESS
 # define NO_ADDRESS NO_DATA
@@ -712,8 +713,24 @@ lookup_host (const char *host, int flags)
   /* No luck with the cache; resolve HOST. */
 
   if (!silent && !numeric_address)
-    logprintf (LOG_VERBOSE, _("Resolving %s... "), 
-               quotearg_style (escape_quoting_style, host));
+    {
+      char *str = NULL, *name = NULL;
+
+      if (opt.enable_iri && (name = idn_decode (host)) != NULL)
+        {
+          int len = strlen (host) + strlen (name) + 4;
+          str = xmalloc (len);
+          snprintf (str, len, "%s (%s)", name, host);
+          str[len-1] = '\0';
+          xfree (name);
+        }
+
+      logprintf (LOG_VERBOSE, _("Resolving %s... "), 
+                 quotearg_style (escape_quoting_style, str ? str : host));
+
+      if (str)
+        xfree (str);
+    }
 
 #ifdef ENABLE_IPV6
   {
diff --git a/src/iri.c b/src/iri.c
index dfcb0578..000f6550 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -220,7 +220,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
     return false;
 }
 
-/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL
+/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
    on error. */
 char *idn_encode (char *host)
 {
@@ -239,3 +239,22 @@ char *idn_encode (char *host)
   return new;
 }
 
+/* Try to decode an ASCII encoded host. Return the new domain in the locale on
+   success or NULL on error. */
+char *idn_decode (char *host)
+{
+  char *new;
+  int ret;
+
+  ret = idna_to_unicode_8zlz (host, &new, 0);
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return new;
+}
+
+
diff --git a/src/iri.h b/src/iri.h
index 64858476..3992d76d 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -37,6 +37,7 @@ char *find_locale (void);
 bool check_encoding_name (char *encoding);
 const char *locale_to_utf8 (const char *str);
 char *idn_encode (char *host);
+char *idn_decode (char *host);
 
 #else /* ENABLE_IRI */
 
@@ -45,6 +46,7 @@ char *idn_encode (char *host);
 #define check_encoding_name(str)    false
 #define locale_to_utf8(str)         (str)
 #define idn_encode(str)             NULL
+#define idn_decode(str)             NULL
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */

From 3781197ec61b6050222df10206c201c185c8fe2d Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Tue, 8 Jul 2008 00:29:02 +0200
Subject: [PATCH 17/58] Remove an always true condition

---
 src/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/connect.c b/src/connect.c
index 1e8f07e5..a6ff0b9b 100644
--- a/src/connect.c
+++ b/src/connect.c
@@ -266,7 +266,7 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
   if (print)
     {
       const char *txt_addr = print_address (ip);
-      if (print && 0 != strcmp (print, txt_addr))
+      if (0 != strcmp (print, txt_addr))
         logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
                    escnonprint_uri (print), txt_addr, port);
       else

From f50be2a403574a8d2cc01f4be714da9c2d6f748a Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Tue, 8 Jul 2008 00:42:09 +0200
Subject: [PATCH 18/58] Show the hostname we are connecting to in the locale
 when possible

---
 src/connect.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/connect.c b/src/connect.c
index a6ff0b9b..6cfdb4b7 100644
--- a/src/connect.c
+++ b/src/connect.c
@@ -58,6 +58,7 @@ as that of the covered work.  */
 #include "host.h"
 #include "connect.h"
 #include "hash.h"
+#include "iri.h"
 
 /* Define sockaddr_storage where unavailable (presumably on IPv4-only
    hosts).  */
@@ -267,8 +268,24 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
     {
       const char *txt_addr = print_address (ip);
       if (0 != strcmp (print, txt_addr))
-        logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
-                   escnonprint_uri (print), txt_addr, port);
+        {
+				  char *str = NULL, *name;
+
+          if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
+            {
+              int len = strlen (print) + strlen (name) + 4;
+              str = xmalloc (len);
+              snprintf (str, len, "%s (%s)", name, print);
+              str[len-1] = '\0';
+              xfree (name);
+            }
+
+          logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
+                     str ? str : escnonprint_uri (print), txt_addr, port);
+
+					if (str)
+					  xfree (str);
+        }
       else
         logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
     }

From 6c6e838338c31f1ac3c57c71e4ac34c0401bdf86 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Tue, 8 Jul 2008 00:44:08 +0200
Subject: [PATCH 19/58] No need for initial value

---
 src/host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/host.c b/src/host.c
index 8a1495f0..fb8158e5 100644
--- a/src/host.c
+++ b/src/host.c
@@ -714,7 +714,7 @@ lookup_host (const char *host, int flags)
 
   if (!silent && !numeric_address)
     {
-      char *str = NULL, *name = NULL;
+      char *str = NULL, *name;
 
       if (opt.enable_iri && (name = idn_decode (host)) != NULL)
         {

From 9a2ea3938d09643c6528c3b83b1db4c30f47d981 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 13:10:02 +0200
Subject: [PATCH 20/58] Basic IDN/IRI support

---
 src/host.c     |   4 +-
 src/html-url.c |  12 +++---
 src/http.c     |   6 +--
 src/iri.c      | 108 +++++++++++++++++++++++++++++++++++++++++++++++--
 src/iri.h      |  20 ++++++++-
 src/main.c     |  17 +++++---
 src/recur.c    |  35 +++++++++++-----
 src/retr.c     |  47 +++++++++++++--------
 src/url.c      |  11 +++--
 src/url.h      |   2 +-
 10 files changed, 209 insertions(+), 53 deletions(-)

diff --git a/src/host.c b/src/host.c
index fb8158e5..1226a274 100644
--- a/src/host.c
+++ b/src/host.c
@@ -716,7 +716,7 @@ lookup_host (const char *host, int flags)
     {
       char *str = NULL, *name;
 
-      if (opt.enable_iri && (name = idn_decode (host)) != NULL)
+      if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
         {
           int len = strlen (host) + strlen (name) + 4;
           str = xmalloc (len);
@@ -725,7 +725,7 @@ lookup_host (const char *host, int flags)
           xfree (name);
         }
 
-      logprintf (LOG_VERBOSE, _("Resolving %s... "), 
+      logprintf (LOG_VERBOSE, _("Resolving %s... "),
                  quotearg_style (escape_quoting_style, str ? str : host));
 
       if (str)
diff --git a/src/html-url.c b/src/html-url.c
index 9b515432..0d580f9a 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -274,6 +274,7 @@ append_url (const char *link_uri,
   struct urlpos *newel;
   const char *base = ctx->base ? ctx->base : ctx->parent_base;
   struct url *url;
+  bool utf8_encode = false;
 
   if (!base)
     {
@@ -292,7 +293,7 @@ append_url (const char *link_uri,
           return NULL;
         }
 
-      url = url_parse (link_uri, NULL);
+      url = url_parse (link_uri, NULL, &utf8_encode);
       if (!url)
         {
           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -311,7 +312,7 @@ append_url (const char *link_uri,
       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
                ctx->document_file, base, link_uri, complete_uri));
 
-      url = url_parse (complete_uri, NULL);
+      url = url_parse (complete_uri, NULL, &utf8_encode);
       if (!url)
         {
           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -549,9 +550,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
       if (!mcharset)
         return;
 
-      logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
+      /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
 
-      /* sXXXav: Not used yet */
+      set_current_charset (mcharset);
       xfree (mcharset);
     }
   else if (name && 0 == strcasecmp (name, "robots"))
@@ -660,6 +661,7 @@ get_urls_file (const char *file)
   struct file_memory *fm;
   struct urlpos *head, *tail;
   const char *text, *text_end;
+  bool utf8_encode = false;
 
   /* Load the file.  */
   fm = read_file (file);
@@ -711,7 +713,7 @@ get_urls_file (const char *file)
           url_text = merged;
         }
 
-      url = url_parse (url_text, &up_error_code);
+      url = url_parse (url_text, &up_error_code, &utf8_encode);
       if (!url)
         {
           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
diff --git a/src/http.c b/src/http.c
index a4571ad7..df9ca2bb 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1825,7 +1825,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
           hs->local_file = url_file_name (u);
         }
     }
-  
+
   /* TODO: perform this check only once. */
   if (!hs->existence_checked && file_exists_p (hs->local_file))
     {
@@ -1894,7 +1894,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
               local_dot_orig_file_exists = true;
               local_filename = filename_plus_orig_suffix;
             }
-        }      
+        }
 
       if (!local_dot_orig_file_exists)
         /* Couldn't stat() <file>.orig, so try to stat() <file>. */
@@ -2055,7 +2055,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
 
           /* Try to get remote encoding if needed */
           if (opt.enable_iri && !opt.encoding_remote)
-            /* xxx = */ parse_charset (tmp2);
+            set_current_charset (parse_charset (tmp2));
         }
     }
   hs->newloc = resp_header_strdup (resp, "Location");
diff --git a/src/iri.c b/src/iri.c
index 000f6550..32eb7210 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -41,6 +41,8 @@ as that of the covered work.  */
 #include "utils.h"
 #include "iri.h"
 
+char *remote;
+char *current;
 
 static iconv_t locale2utf8;
 
@@ -80,7 +82,7 @@ parse_charset (char *str)
       return NULL;
     }
 
-  logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
+  /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
 
   return charset;
 }
@@ -196,7 +198,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
           (*out)++;
           outlen--;
         }
-      else if (errno == E2BIG) /* Output buffer full */ 
+      else if (errno == E2BIG) /* Output buffer full */
         {
           char *new;
 
@@ -222,15 +224,29 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
 
 /* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
    on error. */
-char *idn_encode (char *host)
+char *
+idn_encode (char *host, bool utf8_encoded)
 {
   char *new;
   int ret;
 
+  /* Encode to UTF-8 if not done using current remote */
+  if (!utf8_encoded)
+    {
+      if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+        {
+          /* Nothing to encode or an error occured */
+          return NULL;
+        }
+
+      host = new;
+    }
+
   /* toASCII UTF-8 NULL terminated string */
   ret = idna_to_ascii_8z (host, &new, 0);
   if (ret != IDNA_SUCCESS)
     {
+      /* sXXXav : free new when needed ! */
       logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
                  quote (idna_strerror (ret)));
       return NULL;
@@ -241,7 +257,8 @@ char *idn_encode (char *host)
 
 /* Try to decode an ASCII encoded host. Return the new domain in the locale on
    success or NULL on error. */
-char *idn_decode (char *host)
+char *
+idn_decode (char *host)
 {
   char *new;
   int ret;
@@ -257,4 +274,87 @@ char *idn_decode (char *host)
   return new;
 }
 
+/* Return a new string */
+bool
+remote_to_utf8 (const char *str, const char **new)
+{
+  char *remote;
+  iconv_t cd;
+  bool ret = false;
+
+  if (opt.encoding_remote)
+    remote = opt.encoding_remote;
+  else if (current)
+    remote = current;
+  else
+    return false;
+
+  cd = iconv_open ("UTF-8", remote);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
+    ret = true;
+
+  iconv_close (cd);
+
+  /* Test if something was converted */
+  if (!strcmp (str, *new))
+    {
+      xfree ((char *) *new);
+      return false;
+    }
+
+  return ret;
+}
+
+char *get_remote_charset (void)
+{
+  return remote;
+}
+
+char *get_current_charset (void)
+{
+  return current;
+}
+
+void set_current_charset (char *charset)
+{
+  /*printf("[ current = `%s'\n", charset);*/
+
+  if (current)
+    xfree (current);
+
+  current = charset ? xstrdup (charset) : NULL;
+}
+
+void set_current_as_locale (void)
+{
+  /*printf("[ current = locale = `%s'\n", opt.locale);*/
+  if (current)
+    xfree (current);
+
+  /* sXXXav : assert opt.locale NULL ? */
+  current = xstrdup (opt.locale);
+}
+
+void
+set_remote_charset (char *charset)
+{
+  /*printf("[ remote = `%s'\n", charset);*/
+  if (remote)
+    xfree (remote);
+
+  remote = charset ? xstrdup (charset) : NULL;
+}
+
+void
+set_remote_as_current (void)
+{
+  /*printf("[ remote = current = `%s'\n", current);*/
+  if (remote)
+    xfree (remote);
+
+  remote = current ? xstrdup (current) : NULL;
+}
 
diff --git a/src/iri.h b/src/iri.h
index 3992d76d..837dbfdd 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -36,8 +36,16 @@ char *parse_charset (char *str);
 char *find_locale (void);
 bool check_encoding_name (char *encoding);
 const char *locale_to_utf8 (const char *str);
-char *idn_encode (char *host);
+char *idn_encode (char *host, bool utf8_encoded);
 char *idn_decode (char *host);
+char *get_remote_charset (void);
+char *get_current_charset (void);
+void set_current_charset (char *charset);
+void set_current_as_locale (void);
+void set_current_charset (char *charset);
+void set_remote_charset (char *charset);
+void set_remote_as_current (void);
+bool remote_to_utf8 (const char *str, const char **new);
 
 #else /* ENABLE_IRI */
 
@@ -45,8 +53,16 @@ char *idn_decode (char *host);
 #define find_locale()               NULL
 #define check_encoding_name(str)    false
 #define locale_to_utf8(str)         (str)
-#define idn_encode(str)             NULL
+#define idn_encode(str,encoded)     NULL
 #define idn_decode(str)             NULL
+#define get_remote_charset()        NULL
+#define get_current_charset()       NULL
+#define set_current_charset(str)
+#define set_current_as_locale()
+#define set_current_charset(str)
+#define set_remote_charset(str)
+#define set_remote_as_current()
+#define remote_to_utf8(a,b)         false
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index 53ea6b91..d0ff1d21 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1067,16 +1067,16 @@ for details.\n\n"));
 #ifdef ENABLE_IRI
   if (opt.enable_iri)
     {
-      if (opt.locale && !check_encoding_name(opt.locale))
+      if (opt.locale && !check_encoding_name (opt.locale))
         opt.locale = NULL;
 
       if (!opt.locale)
         opt.locale = find_locale ();
 
-      if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote))
+      if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
         opt.encoding_remote = NULL;
 
-      logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
+      /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
     }
 #else
   if (opt.enable_iri || opt.locale || opt.encoding_remote)
@@ -1190,21 +1190,26 @@ WARNING: Can't reopen standard output in binary mode;\n\
       char *filename = NULL, *redirected_URL = NULL;
       int dt;
 
+      set_current_as_locale ();
+
       if ((opt.recursive || opt.page_requisites)
           && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
         {
           int old_follow_ftp = opt.follow_ftp;
 
           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (url_scheme (*t) == SCHEME_FTP) 
+          if (url_scheme (*t) == SCHEME_FTP)
             opt.follow_ftp = 1;
-          
+
           status = retrieve_tree (*t);
 
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+        {
+          set_remote_as_current ();
+          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+        }
 
       if (opt.delete_after && file_exists_p(filename))
         {
diff --git a/src/recur.c b/src/recur.c
index d1d0f18d..e5f2b929 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -49,6 +49,7 @@ as that of the covered work.  */
 #include "res.h"
 #include "convert.h"
 #include "spider.h"
+#include "iri.h"
 
 /* Functions for maintaining the URL queue.  */
 
@@ -58,7 +59,7 @@ struct queue_element {
   int depth;                    /* the depth */
   bool html_allowed;            /* whether the document is allowed to
                                    be treated as HTML. */
-
+  char *remote_encoding;
   struct queue_element *next;   /* next element in queue */
 };
 
@@ -94,12 +95,18 @@ url_enqueue (struct url_queue *queue,
              const char *url, const char *referer, int depth, bool html_allowed)
 {
   struct queue_element *qel = xnew (struct queue_element);
+  char *charset = get_current_charset ();
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
   qel->html_allowed = html_allowed;
   qel->next = NULL;
 
+  if (charset)
+    qel->remote_encoding = xstrdup (charset);
+  else
+    qel->remote_encoding = NULL;
+
   ++queue->count;
   if (queue->count > queue->maxcount)
     queue->maxcount = queue->count;
@@ -107,6 +114,8 @@ url_enqueue (struct url_queue *queue,
   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
+  /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+
   if (queue->tail)
     queue->tail->next = qel;
   queue->tail = qel;
@@ -132,6 +141,10 @@ url_dequeue (struct url_queue *queue,
   if (!queue->head)
     queue->tail = NULL;
 
+  set_remote_charset (qel->remote_encoding);
+  if (qel->remote_encoding)
+    xfree (qel->remote_encoding);
+
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
@@ -177,6 +190,7 @@ uerr_t
 retrieve_tree (const char *start_url)
 {
   uerr_t status = RETROK;
+  bool utf8_encode = false;
 
   /* The queue of URLs we need to load. */
   struct url_queue *queue;
@@ -186,7 +200,7 @@ retrieve_tree (const char *start_url)
   struct hash_table *blacklist;
 
   int up_error_code;
-  struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+  struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
 
   if (!start_url_parsed)
     {
@@ -324,7 +338,7 @@ retrieve_tree (const char *start_url)
           if (children)
             {
               struct urlpos *child = children;
-              struct url *url_parsed = url_parsed = url_parse (url, NULL);
+              struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode);
               char *referer_url = url;
               bool strip_auth = (url_parsed != NULL
                                  && url_parsed->user != NULL);
@@ -360,18 +374,18 @@ retrieve_tree (const char *start_url)
             }
         }
 
-      if (file 
-          && (opt.delete_after 
+      if (file
+          && (opt.delete_after
               || opt.spider /* opt.recursive is implicitely true */
               || !acceptable (file)))
         {
           /* Either --delete-after was specified, or we loaded this
-             (otherwise unneeded because of --spider or rejected by -R) 
-             HTML file just to harvest its hyperlinks -- in either case, 
+             (otherwise unneeded because of --spider or rejected by -R)
+             HTML file just to harvest its hyperlinks -- in either case,
              delete the local file. */
           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                    opt.delete_after ? "--delete-after" :
-                   (opt.spider ? "--spider" : 
+                   (opt.spider ? "--spider" :
                     "recursive rejection criteria")));
           logprintf (LOG_VERBOSE,
                      (opt.delete_after || opt.spider
@@ -627,11 +641,12 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
   struct url *orig_parsed, *new_parsed;
   struct urlpos *upos;
   bool success;
+  bool utf8_encode = false;
 
-  orig_parsed = url_parse (original, NULL);
+  orig_parsed = url_parse (original, NULL, &utf8_encode);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, &utf8_encode);
   assert (new_parsed != NULL);
 
   upos = xnew0 (struct urlpos);
diff --git a/src/retr.c b/src/retr.c
index 179430ac..05ffe1d0 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,6 +51,7 @@ as that of the covered work.  */
 #include "hash.h"
 #include "convert.h"
 #include "ptimer.h"
+#include "iri.h"
 
 /* Total size of downloaded files.  Used to enforce quota.  */
 SUM_SIZE_INT total_downloaded_bytes;
@@ -612,6 +613,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   char *saved_post_data = NULL;
   char *saved_post_file_name = NULL;
 
+  bool utf8_encoded = opt.enable_iri;
+
   /* If dt is NULL, use local storage.  */
   if (!dt)
     {
@@ -624,7 +627,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (file)
     *file = NULL;
 
-  u = url_parse (url, &up_error_code);
+ second_try:
+  u = url_parse (url, &up_error_code, &utf8_encoded);
   if (!u)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@@ -632,6 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       return URLERROR;
     }
 
+  /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+
   if (!refurl)
     refurl = opt.referer;
 
@@ -645,8 +651,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   proxy = getproxy (u);
   if (proxy)
     {
+      /* sXXXav : support IRI for proxy */
+      bool proxy_utf8_encode = false;
       /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code);
+      proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
       if (!proxy_url)
         {
           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@@ -721,8 +729,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       xfree (mynewloc);
       mynewloc = construced_newloc;
 
+      utf8_encoded = opt.enable_iri;
+
       /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
       if (!newloc_parsed)
         {
           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@@ -769,16 +779,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       goto redirected;
     }
 
-  if (local_file)
+  /* Try to not encode in UTF-8 if fetching failed */
+  if (result != RETROK && utf8_encoded)
     {
-      if (*dt & RETROKF)
-        {
-          register_download (u->url, local_file);
-          if (redirection_count && 0 != strcmp (origurl, u->url))
-            register_redirection (origurl, u->url);
-          if (*dt & TEXTHTML)
-            register_html (u->url, local_file);
-        }
+      utf8_encoded = false;
+      /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+      goto second_try;
+    }
+
+  if (local_file && *dt & RETROKF)
+    {
+      register_download (u->url, local_file);
+      if (redirection_count && 0 != strcmp (origurl, u->url))
+        register_redirection (origurl, u->url);
+      if (*dt & TEXTHTML)
+        register_html (u->url, local_file);
     }
 
   if (file)
@@ -843,9 +858,9 @@ retrieve_from_file (const char *file, bool html, int *count)
           int old_follow_ftp = opt.follow_ftp;
 
           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (cur_url->url->scheme == SCHEME_FTP) 
+          if (cur_url->url->scheme == SCHEME_FTP)
             opt.follow_ftp = 1;
-          
+
           status = retrieve_tree (cur_url->url->url);
 
           opt.follow_ftp = old_follow_ftp;
@@ -1021,8 +1036,8 @@ getproxy (struct url *u)
 bool
 url_uses_proxy (const char *url)
 {
-  bool ret;
-  struct url *u = url_parse (url, NULL);
+  bool ret, utf8_encode = false;
+  struct url *u = url_parse (url, NULL, &utf8_encode);
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
diff --git a/src/url.c b/src/url.c
index 48b23d6c..32de9c75 100644
--- a/src/url.c
+++ b/src/url.c
@@ -641,7 +641,7 @@ static const char *parse_errors[] = {
    error, and if ERROR is not NULL, also set *ERROR to the appropriate
    error code. */
 struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, bool *utf8_encode)
 {
   struct url *u;
   const char *p;
@@ -671,10 +671,13 @@ url_parse (const char *url, int *error)
       goto error;
     }
 
-  if (opt.enable_iri)
+  if (opt.enable_iri && *utf8_encode)
     {
+      const char *new;
       url_unescape ((char *) url);
-      url = locale_to_utf8(url);
+      *utf8_encode = remote_to_utf8 (url, &new);
+      if (*utf8_encode)
+        url = new;
     }
 
   url_encoded = reencode_escapes (url);
@@ -853,7 +856,7 @@ url_parse (const char *url, int *error)
 
   if (opt.enable_iri)
     {
-      char *new = idn_encode (u->host);
+      char *new = idn_encode (u->host, *utf8_encode);
       if (new)
         {
           xfree (u->host);
diff --git a/src/url.h b/src/url.h
index 7c8bcfed..a174568e 100644
--- a/src/url.h
+++ b/src/url.h
@@ -84,7 +84,7 @@ struct url
 
 char *url_escape (const char *);
 
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, bool *);
 const char *url_error (int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);

From da6b3f4b614fb8b28bf388b66f21efc5d553ebb9 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 18:20:18 +0200
Subject: [PATCH 21/58] Use dt rather than result

---
 src/recur.c | 2 +-
 src/retr.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/recur.c b/src/recur.c
index e5f2b929..d8279c39 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -450,7 +450,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 
   if (string_set_contains (blacklist, url))
     {
-      if (opt.spider) 
+      if (opt.spider)
         {
           char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
           DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
diff --git a/src/retr.c b/src/retr.c
index 05ffe1d0..02106081 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -780,7 +780,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     }
 
   /* Try to not encode in UTF-8 if fetching failed */
-  if (result != RETROK && utf8_encoded)
+  if (!(*dt & RETROKF) && utf8_encoded)
     {
       utf8_encoded = false;
       /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/

From 1e9ced017082976d257a7a158d9b6aca49f3c690 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 18:47:52 +0200
Subject: [PATCH 22/58] Get rid of the supplementary bool pointer in url_parse
 () arguments; UGLY :)

---
 src/html-url.c | 14 +++++++++-----
 src/iri.c      | 27 +++++++++++++++++++++++++++
 src/iri.h      | 10 ++++++++++
 src/main.c     |  1 +
 src/recur.c    | 17 +++++++++++------
 src/retr.c     | 26 +++++++++++++++-----------
 src/url.c      | 12 +++++++-----
 src/url.h      |  2 +-
 8 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/src/html-url.c b/src/html-url.c
index 0d580f9a..5a0682d3 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -274,7 +274,6 @@ append_url (const char *link_uri,
   struct urlpos *newel;
   const char *base = ctx->base ? ctx->base : ctx->parent_base;
   struct url *url;
-  bool utf8_encode = false;
 
   if (!base)
     {
@@ -293,7 +292,9 @@ append_url (const char *link_uri,
           return NULL;
         }
 
-      url = url_parse (link_uri, NULL, &utf8_encode);
+      set_ugly_no_encode (true);
+      url = url_parse (link_uri, NULL);
+      set_ugly_no_encode (false);
       if (!url)
         {
           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -312,7 +313,9 @@ append_url (const char *link_uri,
       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
                ctx->document_file, base, link_uri, complete_uri));
 
-      url = url_parse (complete_uri, NULL, &utf8_encode);
+      set_ugly_no_encode (true);
+      url = url_parse (complete_uri, NULL);
+      set_ugly_no_encode (false);
       if (!url)
         {
           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -661,7 +664,6 @@ get_urls_file (const char *file)
   struct file_memory *fm;
   struct urlpos *head, *tail;
   const char *text, *text_end;
-  bool utf8_encode = false;
 
   /* Load the file.  */
   fm = read_file (file);
@@ -713,7 +715,9 @@ get_urls_file (const char *file)
           url_text = merged;
         }
 
-      url = url_parse (url_text, &up_error_code, &utf8_encode);
+      set_ugly_no_encode (true);
+      url = url_parse (url_text, &up_error_code);
+      set_ugly_no_encode (false);
       if (!url)
         {
           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
diff --git a/src/iri.c b/src/iri.c
index 32eb7210..e5be2cf8 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -43,6 +43,8 @@ as that of the covered work.  */
 
 char *remote;
 char *current;
+bool utf8_encode;
+bool ugly_no_encode;
 
 static iconv_t locale2utf8;
 
@@ -358,3 +360,28 @@ set_remote_as_current (void)
   remote = current ? xstrdup (current) : NULL;
 }
 
+void reset_utf8_encode (void)
+{
+  set_utf8_encode (opt.enable_iri);
+}
+
+void set_utf8_encode (bool encode)
+{
+  utf8_encode = encode;
+}
+
+bool get_utf8_encode (void)
+{
+  return utf8_encode;
+}
+
+void set_ugly_no_encode (bool ugly)
+{
+  ugly_no_encode = ugly;
+}
+
+bool get_ugly_no_encode (void)
+{
+  return ugly_no_encode;
+}
+
diff --git a/src/iri.h b/src/iri.h
index 837dbfdd..413fb2f6 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -46,6 +46,13 @@ void set_current_charset (char *charset);
 void set_remote_charset (char *charset);
 void set_remote_as_current (void);
 bool remote_to_utf8 (const char *str, const char **new);
+void reset_utf8_encode (void);
+void set_utf8_encode (bool encode);
+bool get_utf8_encode (void);
+
+/* ugly ugly ugly */
+void set_ugly_no_encode (bool ugly);
+bool get_ugly_no_encode (void);
 
 #else /* ENABLE_IRI */
 
@@ -63,6 +70,9 @@ bool remote_to_utf8 (const char *str, const char **new);
 #define set_remote_charset(str)
 #define set_remote_as_current()
 #define remote_to_utf8(a,b)         false
+#define reset_utf8_encode()
+#define set_utf8_encode(a)
+#define get_utf8_encode()           false
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index d0ff1d21..bf49bf89 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1191,6 +1191,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
       int dt;
 
       set_current_as_locale ();
+      set_ugly_no_encode (false);
 
       if ((opt.recursive || opt.page_requisites)
           && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
diff --git a/src/recur.c b/src/recur.c
index d8279c39..6f5da2ae 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -190,7 +190,6 @@ uerr_t
 retrieve_tree (const char *start_url)
 {
   uerr_t status = RETROK;
-  bool utf8_encode = false;
 
   /* The queue of URLs we need to load. */
   struct url_queue *queue;
@@ -200,8 +199,11 @@ retrieve_tree (const char *start_url)
   struct hash_table *blacklist;
 
   int up_error_code;
-  struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
+  struct url *start_url_parsed;
 
+  set_ugly_no_encode (true);
+  start_url_parsed= url_parse (start_url, &up_error_code);
+  set_ugly_no_encode (false);
   if (!start_url_parsed)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
@@ -338,7 +340,9 @@ retrieve_tree (const char *start_url)
           if (children)
             {
               struct urlpos *child = children;
-              struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode);
+              set_ugly_no_encode (true);
+              struct url *url_parsed = url_parse (url, NULL);
+              set_ugly_no_encode (false);
               char *referer_url = url;
               bool strip_auth = (url_parsed != NULL
                                  && url_parsed->user != NULL);
@@ -641,13 +645,14 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
   struct url *orig_parsed, *new_parsed;
   struct urlpos *upos;
   bool success;
-  bool utf8_encode = false;
 
-  orig_parsed = url_parse (original, NULL, &utf8_encode);
+  set_ugly_no_encode (true);
+  orig_parsed = url_parse (original, NULL);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL, &utf8_encode);
+  new_parsed = url_parse (redirected, NULL);
   assert (new_parsed != NULL);
+  set_ugly_no_encode (false);
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
diff --git a/src/retr.c b/src/retr.c
index 02106081..dd4978a7 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -613,8 +613,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   char *saved_post_data = NULL;
   char *saved_post_file_name = NULL;
 
-  bool utf8_encoded = opt.enable_iri;
-
   /* If dt is NULL, use local storage.  */
   if (!dt)
     {
@@ -627,8 +625,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (file)
     *file = NULL;
 
+  reset_utf8_encode ();
+
  second_try:
-  u = url_parse (url, &up_error_code, &utf8_encoded);
+  u = url_parse (url, &up_error_code);
   if (!u)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@@ -652,9 +652,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (proxy)
     {
       /* sXXXav : support IRI for proxy */
-      bool proxy_utf8_encode = false;
       /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
+      set_ugly_no_encode (true);
+      proxy_url = url_parse (proxy, &up_error_code);
+      set_ugly_no_encode (false);
       if (!proxy_url)
         {
           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@@ -729,10 +730,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       xfree (mynewloc);
       mynewloc = construced_newloc;
 
-      utf8_encoded = opt.enable_iri;
+      reset_utf8_encode ();
 
       /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
+      newloc_parsed = url_parse (mynewloc, &up_error_code);
       if (!newloc_parsed)
         {
           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@@ -780,9 +781,9 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     }
 
   /* Try to not encode in UTF-8 if fetching failed */
-  if (!(*dt & RETROKF) && utf8_encoded)
+  if (!(*dt & RETROKF) && get_utf8_encode ())
     {
-      utf8_encoded = false;
+      set_utf8_encode (false);
       /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
       goto second_try;
     }
@@ -1036,8 +1037,11 @@ getproxy (struct url *u)
 bool
 url_uses_proxy (const char *url)
 {
-  bool ret, utf8_encode = false;
-  struct url *u = url_parse (url, NULL, &utf8_encode);
+  bool ret;
+  struct url *u;
+  set_ugly_no_encode(true);
+  u= url_parse (url, NULL);
+  set_ugly_no_encode(false);
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
diff --git a/src/url.c b/src/url.c
index 32de9c75..c9489597 100644
--- a/src/url.c
+++ b/src/url.c
@@ -641,7 +641,7 @@ static const char *parse_errors[] = {
    error, and if ERROR is not NULL, also set *ERROR to the appropriate
    error code. */
 struct url *
-url_parse (const char *url, int *error, bool *utf8_encode)
+url_parse (const char *url, int *error)
 {
   struct url *u;
   const char *p;
@@ -671,12 +671,14 @@ url_parse (const char *url, int *error, bool *utf8_encode)
       goto error;
     }
 
-  if (opt.enable_iri && *utf8_encode)
+  if (opt.enable_iri && get_utf8_encode () && !get_ugly_no_encode ())
     {
       const char *new;
+      bool utf8_encode;
       url_unescape ((char *) url);
-      *utf8_encode = remote_to_utf8 (url, &new);
-      if (*utf8_encode)
+      utf8_encode = remote_to_utf8 (url, &new);
+      set_utf8_encode (utf8_encode);
+      if (utf8_encode)
         url = new;
     }
 
@@ -856,7 +858,7 @@ url_parse (const char *url, int *error, bool *utf8_encode)
 
   if (opt.enable_iri)
     {
-      char *new = idn_encode (u->host, *utf8_encode);
+      char *new = idn_encode (u->host, get_utf8_encode ());
       if (new)
         {
           xfree (u->host);
diff --git a/src/url.h b/src/url.h
index a174568e..7c8bcfed 100644
--- a/src/url.h
+++ b/src/url.h
@@ -84,7 +84,7 @@ struct url
 
 char *url_escape (const char *);
 
-struct url *url_parse (const char *, int *, bool *);
+struct url *url_parse (const char *, int *);
 const char *url_error (int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);

From 7410cb97644ba0b9e327b2c37b4e39fcec5b3690 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 18:52:20 +0200
Subject: [PATCH 23/58] In spider mode, do not report links as broken if they
 were utf8 encoded

---
 src/http.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/http.c b/src/http.c
index df9ca2bb..f79327c3 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2350,16 +2350,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
   uerr_t err, ret = TRYLIMEXC;
   time_t tmr = -1;               /* remote time-stamp */
   struct http_stat hstat;        /* HTTP status */
-  struct_stat st;  
+  struct_stat st;
   bool send_head_first = true;
 
   /* Assert that no value for *LOCAL_FILE was passed. */
   assert (local_file == NULL || *local_file == NULL);
-  
+
   /* Set LOCAL_FILE parameter. */
   if (local_file && opt.output_document)
     *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
-  
+
   /* Reset NEWLOC parameter. */
   *newloc = NULL;
 
@@ -2396,7 +2396,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
          retrieve the file. But if the output_document was given, then this
          test was already done and the file didn't exist. Hence the !opt.output_document */
       logprintf (LOG_VERBOSE, _("\
-File %s already there; not retrieving.\n\n"), 
+File %s already there; not retrieving.\n\n"),
                  quote (hstat.local_file));
       /* If the file is there, we suppose it's retrieved OK.  */
       *dt |= RETROKF;
@@ -2412,10 +2412,10 @@ File %s already there; not retrieving.\n\n"),
 
   /* Reset the counter. */
   count = 0;
-  
+
   /* Reset the document type. */
   *dt = 0;
-  
+
   /* Skip preliminary HEAD request if we're not in spider mode AND
    * if -O was given or HTTP Content-Disposition support is disabled. */
   if (!opt.spider
@@ -2424,21 +2424,21 @@ File %s already there; not retrieving.\n\n"),
 
   /* Send preliminary HEAD request if -N is given and we have an existing 
    * destination file. */
-  if (opt.timestamping 
+  if (opt.timestamping
       && !opt.content_disposition
       && file_exists_p (url_file_name (u)))
     send_head_first = true;
-  
+
   /* THE loop */
   do
     {
       /* Increment the pass counter.  */
       ++count;
       sleep_between_retrievals (count);
-      
+
       /* Get the current time string.  */
       tms = datetime_str (time (NULL));
-      
+
       if (opt.spider && !got_head)
         logprintf (LOG_VERBOSE, _("\
 Spider mode enabled. Check if remote file exists.\n"));
@@ -2447,20 +2447,20 @@ Spider mode enabled. Check if remote file exists.\n"));
       if (opt.verbose)
         {
           char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
-          
-          if (count > 1) 
+
+          if (count > 1)
             {
               char tmp[256];
               sprintf (tmp, _("(try:%2d)"), count);
               logprintf (LOG_NOTQUIET, "--%s--  %s  %s\n",
                          tms, tmp, hurl);
             }
-          else 
+          else
             {
               logprintf (LOG_NOTQUIET, "--%s--  %s\n",
                          tms, hurl);
             }
-          
+
 #ifdef WINDOWS
           ws_changetitle (hurl);
 #endif
@@ -2470,7 +2470,7 @@ Spider mode enabled. Check if remote file exists.\n"));
       /* Default document type is empty.  However, if spider mode is
          on or time-stamping is employed, HEAD_ONLY commands is
          encoded within *dt.  */
-      if (send_head_first && !got_head) 
+      if (send_head_first && !got_head)
         *dt |= HEAD_ONLY;
       else
         *dt &= ~HEAD_ONLY;
@@ -2507,7 +2507,7 @@ Spider mode enabled. Check if remote file exists.\n"));
 
       /* Time?  */
       tms = datetime_str (time (NULL));
-      
+
       /* Get the new location (with or without the redirection).  */
       if (hstat.newloc)
         *newloc = xstrdup (hstat.newloc);
@@ -2546,7 +2546,7 @@ Spider mode enabled. Check if remote file exists.\n"));
                          hstat.statcode);
               ret = WRONGCODE;
             }
-          else 
+          else
             {
               ret = NEWLOCATION;
             }
@@ -2562,7 +2562,7 @@ Spider mode enabled. Check if remote file exists.\n"));
           /* All possibilities should have been exhausted.  */
           abort ();
         }
-      
+
       if (!(*dt & RETROKF))
         {
           char *hurl = NULL;
@@ -2581,11 +2581,13 @@ Spider mode enabled. Check if remote file exists.\n"));
               continue;
             }
           /* Maybe we should always keep track of broken links, not just in
-           * spider mode.  */
-          else if (opt.spider)
+           * spider mode.
+           * Don't log error if it was utf8 encoded because we will try
+           * one unencoded. */
+          else if (opt.spider && !get_utf8_encode ())
             {
               /* #### Again: ugly ugly ugly! */
-              if (!hurl) 
+              if (!hurl)
                 hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
               nonexisting_url (hurl);
               logprintf (LOG_NOTQUIET, _("\
@@ -2594,7 +2596,7 @@ Remote file does not exist -- broken link!!!\n"));
           else
             {
               logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
-                         tms, hstat.statcode, 
+                         tms, hstat.statcode,
                          quotearg_style (escape_quoting_style, hstat.error));
             }
           logputs (LOG_VERBOSE, "\n");

From 24d68b7a25aa3def28bbd3681898078146239227 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 19:08:28 +0200
Subject: [PATCH 24/58] Add some comments in iri.c and change a variable name
 which was the same for a global and a local one

---
 src/iri.c | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index e5be2cf8..5108d999 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -41,14 +41,22 @@ as that of the covered work.  */
 #include "utils.h"
 #include "iri.h"
 
+/* Note: locale encoding is kept in options struct (opt.locale) */
+
+/* Hold the encoding used for the current fetch */
 char *remote;
+
+/* Hold the encoding for the future found links */
 char *current;
+
+/* Will/Is the current URL encoded in utf8 ? */
 bool utf8_encode;
+
+/* Force no utf8 encoding for url_parse () */
 bool ugly_no_encode;
 
 static iconv_t locale2utf8;
 
-
 static bool open_locale_to_utf8 (void);
 static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
 
@@ -93,7 +101,6 @@ parse_charset (char *str)
 char *
 find_locale (void)
 {
-  /* sXXXav, made our own function or use libidn one ?! */
   return (char *) stringprep_locale_charset ();
 }
 
@@ -144,7 +151,8 @@ open_locale_to_utf8 (void)
   return false;
 }
 
-/* Return a new string */
+/* Try converting string str from locale to UTF-8. Return a new string
+   on success, or str on error or if conversion isn't needed. */
 const char *
 locale_to_utf8 (const char *str)
 {
@@ -162,7 +170,9 @@ locale_to_utf8 (const char *str)
   return str;
 }
 
-/* */
+/* Do the conversion according to the passed conversion descriptor cd. *out
+   will contain the transcoded string on success. *out content is
+   unspecified otherwise. */
 static bool
 do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
 {
@@ -176,7 +186,6 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
   len = outlen;
   done = 0;
 
-  /* sXXXav : put a maximum looping factor ??? */
   for (;;)
     {
       if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
@@ -224,7 +233,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
     return false;
 }
 
-/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
+/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
    on error. */
 char *
 idn_encode (char *host, bool utf8_encoded)
@@ -257,8 +266,8 @@ idn_encode (char *host, bool utf8_encoded)
   return new;
 }
 
-/* Try to decode an ASCII encoded host. Return the new domain in the locale on
-   success or NULL on error. */
+/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
+   on success or NULL on error. */
 char *
 idn_decode (char *host)
 {
@@ -276,22 +285,23 @@ idn_decode (char *host)
   return new;
 }
 
-/* Return a new string */
+/* Try to transcode string str from remote encoding to UTF-8. On success, *new
+   contains the transcoded string. *new content is unspecified otherwise. */
 bool
 remote_to_utf8 (const char *str, const char **new)
 {
-  char *remote;
+  char *r;
   iconv_t cd;
   bool ret = false;
 
   if (opt.encoding_remote)
-    remote = opt.encoding_remote;
+    r = opt.encoding_remote;
   else if (current)
-    remote = current;
+    r = current;
   else
     return false;
 
-  cd = iconv_open ("UTF-8", remote);
+  cd = iconv_open ("UTF-8", r);
   if (cd == (iconv_t)(-1))
     return false;
 
@@ -323,7 +333,6 @@ char *get_current_charset (void)
 void set_current_charset (char *charset)
 {
   /*printf("[ current = `%s'\n", charset);*/
-
   if (current)
     xfree (current);
 

From 169a16fc7ddb348cc4f0a5ebd149f754b5042478 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 19:29:51 +0200
Subject: [PATCH 25/58] Make get_utf8_encode() directly aware of ugly_no_encode
 and remove get_ugly_no_encode()

---
 src/iri.c | 7 +------
 src/iri.h | 1 -
 src/url.c | 2 +-
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 5108d999..1f421d43 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -381,7 +381,7 @@ void set_utf8_encode (bool encode)
 
 bool get_utf8_encode (void)
 {
-  return utf8_encode;
+  return (!ugly_no_encode && utf8_encode);
 }
 
 void set_ugly_no_encode (bool ugly)
@@ -389,8 +389,3 @@ void set_ugly_no_encode (bool ugly)
   ugly_no_encode = ugly;
 }
 
-bool get_ugly_no_encode (void)
-{
-  return ugly_no_encode;
-}
-
diff --git a/src/iri.h b/src/iri.h
index 413fb2f6..58389813 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -52,7 +52,6 @@ bool get_utf8_encode (void);
 
 /* ugly ugly ugly */
 void set_ugly_no_encode (bool ugly);
-bool get_ugly_no_encode (void);
 
 #else /* ENABLE_IRI */
 
diff --git a/src/url.c b/src/url.c
index c9489597..beaf0fb2 100644
--- a/src/url.c
+++ b/src/url.c
@@ -671,7 +671,7 @@ url_parse (const char *url, int *error)
       goto error;
     }
 
-  if (opt.enable_iri && get_utf8_encode () && !get_ugly_no_encode ())
+  if (opt.enable_iri && get_utf8_encode ())
     {
       const char *new;
       bool utf8_encode;

From ee8ff7488f5402e7f252feabc2e9c70b64354605 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 19:31:09 +0200
Subject: [PATCH 26/58] Add a missing no-op macro for set_ugly_no_encode()

---
 src/iri.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/iri.h b/src/iri.h
index 58389813..50102df4 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -72,6 +72,7 @@ void set_ugly_no_encode (bool ugly);
 #define reset_utf8_encode()
 #define set_utf8_encode(a)
 #define get_utf8_encode()           false
+#define set_ugly_no_encode(a)
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */

From 5982054a98a20a00fdb0e701530af3e7a2981873 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sun, 20 Jul 2008 20:37:22 +0200
Subject: [PATCH 27/58] Use the right flags for idna conversion (RFC3987
 section 3.1)

---
 src/iri.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 1f421d43..c28d4f51 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -41,6 +41,9 @@ as that of the covered work.  */
 #include "utils.h"
 #include "iri.h"
 
+/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
+#define IDNA_FLAGS  IDNA_USE_STD3_ASCII_RULES
+
 /* Note: locale encoding is kept in options struct (opt.locale) */
 
 /* Hold the encoding used for the current fetch */
@@ -254,7 +257,7 @@ idn_encode (char *host, bool utf8_encoded)
     }
 
   /* toASCII UTF-8 NULL terminated string */
-  ret = idna_to_ascii_8z (host, &new, 0);
+  ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
   if (ret != IDNA_SUCCESS)
     {
       /* sXXXav : free new when needed ! */
@@ -274,7 +277,7 @@ idn_decode (char *host)
   char *new;
   int ret;
 
-  ret = idna_to_unicode_8zlz (host, &new, 0);
+  ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
   if (ret != IDNA_SUCCESS)
     {
       logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,

From c31e00b52d49632dd0f005269ab2b820c7fd2c34 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Mon, 21 Jul 2008 19:34:22 +0200
Subject: [PATCH 28/58] Do not free/duplicate current/remote encoding string if
 they aren't changed

---
 src/iri.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index c28d4f51..d23615ae 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -337,18 +337,27 @@ void set_current_charset (char *charset)
 {
   /*printf("[ current = `%s'\n", charset);*/
   if (current)
-    xfree (current);
+    {
+      /* Do nothing if already equal */
+      if (!strcasecmp (current, charset))
+        return;
+      xfree (current);
+    }
 
   current = charset ? xstrdup (charset) : NULL;
 }
 
 void set_current_as_locale (void)
 {
+  /* sXXXav : assert opt.locale NULL ? */
   /*printf("[ current = locale = `%s'\n", opt.locale);*/
   if (current)
-    xfree (current);
+    {
+      if (!strcasecmp (current, opt.locale))
+        return;
+      xfree (current);
+    }
 
-  /* sXXXav : assert opt.locale NULL ? */
   current = xstrdup (opt.locale);
 }
 
@@ -357,8 +366,12 @@ set_remote_charset (char *charset)
 {
   /*printf("[ remote = `%s'\n", charset);*/
   if (remote)
-    xfree (remote);
-
+    {
+      /* Do nothing if already equal */
+      if (!strcasecmp (remote, charset))
+        return;
+      xfree (remote);
+    }
   remote = charset ? xstrdup (charset) : NULL;
 }
 
@@ -367,7 +380,12 @@ set_remote_as_current (void)
 {
   /*printf("[ remote = current = `%s'\n", current);*/
   if (remote)
-    xfree (remote);
+    {
+      /* Do nothing if already equal */
+      if (current && !strcasecmp (remote, current))
+        return;
+      xfree (remote);
+    }
 
   remote = current ? xstrdup (current) : NULL;
 }

From d82f80ecab9bfef857d780f894cca7e890780ce0 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 24 Jul 2008 00:56:29 +0200
Subject: [PATCH 29/58] Change global variable model for state-object

---
 src/convert.c  |   2 +-
 src/html-url.c |  29 ++++++------
 src/html-url.h |   2 +-
 src/http.c     |  20 ++++----
 src/http.h     |   2 +-
 src/iri.c      | 126 +++++++++++++++----------------------------------
 src/iri.h      |  48 ++++++++-----------
 src/main.c     |  11 ++---
 src/recur.c    |  75 +++++++++++++++--------------
 src/res.c      |  13 +++--
 src/res.h      |   2 +-
 src/retr.c     |  53 ++++++++++++---------
 src/retr.h     |   3 +-
 src/url.c      |  37 ++++++++-------
 src/url.h      |   2 +-
 src/wget.h     |   3 ++
 16 files changed, 197 insertions(+), 231 deletions(-)

diff --git a/src/convert.c b/src/convert.c
index e72a4b0f..54004ad0 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
 
       /* Parse the file...  */
       urls = is_css ? get_urls_css_file (file, url) :
-                      get_urls_html (file, url, NULL);
+                      get_urls_html (file, url, NULL, NULL);
 
       /* We don't respect meta_disallow_follow here because, even if
          the file is not followed, we might still want to convert the
diff --git a/src/html-url.c b/src/html-url.c
index ef93a7e4..6e886083 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -44,7 +44,6 @@ as that of the covered work.  */
 #include "recur.h"
 #include "html-url.h"
 #include "css-url.h"
-#include "iri.h"
 
 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
 
@@ -175,6 +174,10 @@ static const char *additional_attributes[] = {
 static struct hash_table *interesting_tags;
 static struct hash_table *interesting_attributes;
 
+/* Will contain the (last) charset found in 'http-equiv=content-type'
+   meta tags  */
+static char *meta_charset;
+
 static void
 init_interesting (void)
 {
@@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size,
           return NULL;
         }
 
-      set_ugly_no_encode (true);
-      url = url_parse (link_uri, NULL);
-      set_ugly_no_encode (false);
+      url = url_parse (link_uri, NULL, NULL);
       if (!url)
         {
           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size,
       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
                ctx->document_file, base, link_uri, complete_uri));
 
-      set_ugly_no_encode (true);
-      url = url_parse (complete_uri, NULL);
-      set_ugly_no_encode (false);
+      url = url_parse (complete_uri, NULL, NULL);
       if (!url)
         {
           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
         return;
 
       /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
-
-      set_current_charset (mcharset);
-      xfree (mcharset);
+      xfree_null (meta_charset);
+      meta_charset = mcharset;
     }
   else if (name && 0 == strcasecmp (name, "robots"))
     {
@@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
    <base href=...> and does the right thing.  */
 
 struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+               struct iri *iri)
 {
   struct file_memory *fm;
   struct map_context ctx;
@@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
   map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                  NULL, interesting_attributes);
 
+  /* If meta charset isn't null, override content encoding */
+  if (iri && meta_charset)
+    set_content_encoding (iri, meta_charset);
+
   DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
   if (meta_disallow_follow)
     *meta_disallow_follow = ctx.nofollow;
@@ -750,9 +753,7 @@ get_urls_file (const char *file)
           url_text = merged;
         }
 
-      set_ugly_no_encode (true);
-      url = url_parse (url_text, &up_error_code);
-      set_ugly_no_encode (false);
+      url = url_parse (url_text, &up_error_code, NULL);
       if (!url)
         {
           logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
diff --git a/src/html-url.h b/src/html-url.h
index a94f0db6..2e9ec820 100644
--- a/src/html-url.h
+++ b/src/html-url.h
@@ -44,7 +44,7 @@ struct map_context {
 };
 
 struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
 struct urlpos *append_url (const char *, int, int, struct map_context *);
 void free_urlpos (struct urlpos *);
 
diff --git a/src/http.c b/src/http.c
index 5ec70d27..589e18ee 100644
--- a/src/http.c
+++ b/src/http.c
@@ -49,7 +49,6 @@ as that of the covered work.  */
 #include "retr.h"
 #include "connect.h"
 #include "netrc.h"
-#include "iri.h"
 #ifdef HAVE_SSL
 # include "ssl.h"
 #endif
@@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs)
    If PROXY is non-NULL, the connection will be made to the proxy
    server, and u->url will be requested.  */
 static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
+gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
+         struct iri *iri)
 {
   struct request *req;
 
@@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
 
           /* Try to get remote encoding if needed */
           if (opt.enable_iri && !opt.encoding_remote)
-            set_current_charset (parse_charset (tmp2));
+            {
+              tmp = parse_charset (tmp2);
+              if (tmp)
+                set_content_encoding (iri, tmp);
+            }
         }
     }
   hs->newloc = resp_header_strdup (resp, "Location");
@@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
    retried, and retried, and retried, and...  */
 uerr_t
 http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
-           int *dt, struct url *proxy)
+           int *dt, struct url *proxy, struct iri *iri)
 {
   int count;
   bool got_head = false;         /* used for time-stamping and filename detection */
@@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
         *dt &= ~SEND_NOCACHE;
 
       /* Try fetching the document, or at least its head.  */
-      err = gethttp (u, &hstat, dt, proxy);
+      err = gethttp (u, &hstat, dt, proxy, iri);
 
       /* Time?  */
       tms = datetime_str (time (NULL));
@@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n"));
             }
           /* Maybe we should always keep track of broken links, not just in
            * spider mode.
-           * Don't log error if it was utf8 encoded because we will try
-           * one unencoded. */
-          else if (opt.spider && !get_utf8_encode ())
+           * Don't log error if it was UTF-8 encoded because we will try
+           * once unencoded. */
+          else if (opt.spider && !iri->utf8_encode)
             {
               /* #### Again: ugly ugly ugly! */
               if (!hurl)
diff --git a/src/http.h b/src/http.h
index e0e66cea..4769e9d3 100644
--- a/src/http.h
+++ b/src/http.h
@@ -33,7 +33,7 @@ as that of the covered work.  */
 struct url;
 
 uerr_t http_loop (struct url *, char **, char **, const char *, int *,
-		  struct url *);
+		  struct url *, struct iri *);
 void save_cookies (void);
 void http_cleanup (void);
 time_t http_atotm (const char *);
diff --git a/src/iri.c b/src/iri.c
index d23615ae..783aa331 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -46,18 +46,6 @@ as that of the covered work.  */
 
 /* Note: locale encoding is kept in options struct (opt.locale) */
 
-/* Hold the encoding used for the current fetch */
-char *remote;
-
-/* Hold the encoding for the future found links */
-char *current;
-
-/* Will/Is the current URL encoded in utf8 ? */
-bool utf8_encode;
-
-/* Force no utf8 encoding for url_parse () */
-bool ugly_no_encode;
-
 static iconv_t locale2utf8;
 
 static bool open_locale_to_utf8 (void);
@@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
    on error. */
 char *
-idn_encode (char *host, bool utf8_encoded)
+idn_encode (struct iri *i, char *host)
 {
   char *new;
   int ret;
 
-  /* Encode to UTF-8 if not done using current remote */
-  if (!utf8_encoded)
+  /* Encode to UTF-8 if not done */
+  if (!i->utf8_encode)
     {
-      if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+      if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
         {
           /* Nothing to encode or an error occured */
           return NULL;
@@ -291,7 +279,7 @@ idn_decode (char *host)
 /* Try to transcode string str from remote encoding to UTF-8. On success, *new
    contains the transcoded string. *new content is unspecified otherwise. */
 bool
-remote_to_utf8 (const char *str, const char **new)
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
 {
   char *r;
   iconv_t cd;
@@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new)
 
   if (opt.encoding_remote)
     r = opt.encoding_remote;
-  else if (current)
-    r = current;
+  else if (i->uri_encoding)
+    r = i->uri_encoding;
   else
     return false;
 
@@ -323,90 +311,63 @@ remote_to_utf8 (const char *str, const char **new)
   return ret;
 }
 
-char *get_remote_charset (void)
+/* Allocate an IRI state object.  The URI encoding starts as a copy of
+   opt.encoding_remote (or NULL), the content encoding is unset, and
+   UTF-8 encoding follows opt.enable_iri.  Release with iri_free ().  */
+struct iri *
+iri_new (void)
 {
-  return remote;
-}
-
-char *get_current_charset (void)
-{
-  return current;
-}
-
-void set_current_charset (char *charset)
-{
-  /*printf("[ current = `%s'\n", charset);*/
-  if (current)
-    {
-      /* Do nothing if already equal */
-      if (!strcasecmp (current, charset))
-        return;
-      xfree (current);
-    }
-
-  current = charset ? xstrdup (charset) : NULL;
-}
-
-void set_current_as_locale (void)
-{
-  /* sXXXav : assert opt.locale NULL ? */
-  /*printf("[ current = locale = `%s'\n", opt.locale);*/
-  if (current)
-    {
-      if (!strcasecmp (current, opt.locale))
-        return;
-      xfree (current);
-    }
-
-  current = xstrdup (opt.locale);
+  struct iri *i = xmalloc (sizeof (struct iri));
+  i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
+  i->content_encoding = NULL;
+  i->utf8_encode = opt.enable_iri;
+  return i;
 }
 
+/* Free I together with the encoding strings it owns.  */
 void
-set_remote_charset (char *charset)
+iri_free (struct iri *i)
 {
-  /*printf("[ remote = `%s'\n", charset);*/
-  if (remote)
-    {
-      /* Do nothing if already equal */
-      if (!strcasecmp (remote, charset))
-        return;
-      xfree (remote);
-    }
-  remote = charset ? xstrdup (charset) : NULL;
+  xfree_null (i->uri_encoding);
+  xfree_null (i->content_encoding);
+  xfree (i);
 }
 
+/* Set the encoding of the URI to fetch to a copy of CHARSET (NULL unsets
+   it).  Does nothing when opt.encoding_remote is set or when CHARSET
+   already matches the stored value, ignoring case.  */
 void
-set_remote_as_current (void)
+set_uri_encoding (struct iri *i, char *charset)
 {
-  /*printf("[ remote = current = `%s'\n", current);*/
-  if (remote)
+  logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset ? charset : "None");
+  if (opt.encoding_remote)
+    return;
+  if (i->uri_encoding)
     {
-      /* Do nothing if already equal */
-      if (current && !strcasecmp (remote, current))
+      if (charset && !strcasecmp (i->uri_encoding, charset))
         return;
-      xfree (remote);
+      xfree (i->uri_encoding);
     }
 
-  remote = current ? xstrdup (current) : NULL;
+  i->uri_encoding = charset ? xstrdup (charset) : NULL;
 }
 
-void reset_utf8_encode (void)
+/* Set the encoding assumed for links found in the fetched document to a
+   copy of CHARSET (NULL unsets it).  Does nothing when
+   opt.encoding_remote is set or when CHARSET already matches.  */
+void
+set_content_encoding (struct iri *i, char *charset)
 {
-  set_utf8_encode (opt.enable_iri);
-}
-
-void set_utf8_encode (bool encode)
-{
-  utf8_encode = encode;
-}
-
-bool get_utf8_encode (void)
-{
-  return (!ugly_no_encode && utf8_encode);
-}
-
-void set_ugly_no_encode (bool ugly)
-{
-  ugly_no_encode = ugly;
+  logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset ? charset : "None");
+  if (opt.encoding_remote)
+    return;
+  if (i->content_encoding)
+    {
+      if (charset && !strcasecmp (i->content_encoding, charset))
+        return;
+      xfree (i->content_encoding);
+    }
+
+  i->content_encoding = charset ? xstrdup (charset) : NULL;
 }
 
diff --git a/src/iri.h b/src/iri.h
index 50102df4..173d0656 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -30,49 +30,41 @@ as that of the covered work.  */
 #ifndef IRI_H
 #define IRI_H
 
+struct iri {
+  char *uri_encoding;     /* Encoding of the uri to fetch */
+  char *content_encoding;  /* Encoding of links inside the fetched file */
+  bool utf8_encode;       /* Will/Is the current url encoded in utf8 */
+};
+
 #ifdef ENABLE_IRI
 
 char *parse_charset (char *str);
 char *find_locale (void);
 bool check_encoding_name (char *encoding);
 const char *locale_to_utf8 (const char *str);
-char *idn_encode (char *host, bool utf8_encoded);
+char *idn_encode (struct iri *i, char *host);
 char *idn_decode (char *host);
-char *get_remote_charset (void);
-char *get_current_charset (void);
-void set_current_charset (char *charset);
-void set_current_as_locale (void);
-void set_current_charset (char *charset);
-void set_remote_charset (char *charset);
-void set_remote_as_current (void);
-bool remote_to_utf8 (const char *str, const char **new);
-void reset_utf8_encode (void);
-void set_utf8_encode (bool encode);
-bool get_utf8_encode (void);
-
-/* ugly ugly ugly */
-void set_ugly_no_encode (bool ugly);
+bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
+struct iri *iri_new (void);
+void iri_free (struct iri *i);
+void set_uri_encoding (struct iri *i, char *charset);
+void set_content_encoding (struct iri *i, char *charset);
 
 #else /* ENABLE_IRI */
 
+struct iri dummy_iri;
+
 #define parse_charset(str)          NULL
 #define find_locale()               NULL
 #define check_encoding_name(str)    false
 #define locale_to_utf8(str)         (str)
-#define idn_encode(str,encoded)     NULL
+#define idn_encode(a,b)             NULL
 #define idn_decode(str)             NULL
-#define get_remote_charset()        NULL
-#define get_current_charset()       NULL
-#define set_current_charset(str)
-#define set_current_as_locale()
-#define set_current_charset(str)
-#define set_remote_charset(str)
-#define set_remote_as_current()
-#define remote_to_utf8(a,b)         false
-#define reset_utf8_encode()
-#define set_utf8_encode(a)
-#define get_utf8_encode()           false
-#define set_ugly_no_encode(a)
+#define remote_to_utf8(a,b,c)       false
+#define iri_new()                   (&dummy_iri)
+#define iri_free(a)
+#define set_uri_encoding(a,b)
+#define set_content_encoding(a,b)
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/main.c b/src/main.c
index 6135a67d..8cee194c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -57,7 +57,6 @@ as that of the covered work.  */
 #include "convert.h"
 #include "spider.h"
 #include "http.h"               /* for save_cookies */
-#include "iri.h"
 
 #include <getopt.h>
 #include <getpass.h>
@@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\
       char *filename = NULL, *redirected_URL = NULL;
       int dt;
 
-      set_current_as_locale ();
-      set_ugly_no_encode (false);
-
       if ((opt.recursive || opt.page_requisites)
           && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
         {
@@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\
         }
       else
         {
-          set_remote_as_current ();
-          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+          struct iri *i = iri_new ();
+          set_uri_encoding (i, opt.locale);
+          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
+                                 opt.recursive, i);
+          iri_free (i);
         }
 
       if (opt.delete_after && file_exists_p(filename))
diff --git a/src/recur.c b/src/recur.c
index 24b80ad4..e2f58d1c 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -61,7 +61,7 @@ struct queue_element {
   int depth;                    /* the depth */
   bool html_allowed;            /* whether the document is allowed to
                                    be treated as HTML. */
-  char *remote_encoding;
+  struct iri *iri;                /* sXXXav */
   bool css_allowed;             /* whether the document is allowed to
                                    be treated as CSS. */
   struct queue_element *next;   /* next element in queue */
@@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue)
    into it.  */
 
 static void
-url_enqueue (struct url_queue *queue,
+url_enqueue (struct url_queue *queue, struct iri *i,
              const char *url, const char *referer, int depth,
              bool html_allowed, bool css_allowed)
 {
   struct queue_element *qel = xnew (struct queue_element);
-  char *charset = get_current_charset ();
+  qel->iri = i;
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
@@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue,
   qel->css_allowed = css_allowed;
   qel->next = NULL;
 
-  if (charset)
-    qel->remote_encoding = xstrdup (charset);
-  else
-    qel->remote_encoding = NULL;
-
   ++queue->count;
   if (queue->count > queue->maxcount)
     queue->maxcount = queue->count;
@@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue,
   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
-  /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+  if (i)
+    printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
 
   if (queue->tail)
     queue->tail->next = qel;
@@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue,
    succeeded, or false if the queue is empty.  */
 
 static bool
-url_dequeue (struct url_queue *queue,
+url_dequeue (struct url_queue *queue, struct iri **i,
              const char **url, const char **referer, int *depth,
              bool *html_allowed, bool *css_allowed)
 {
@@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue,
   if (!queue->head)
     queue->tail = NULL;
 
-  set_remote_charset (qel->remote_encoding);
-  if (qel->remote_encoding)
-    xfree (qel->remote_encoding);
-
+  *i = qel->iri;
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
@@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue,
 }
 
 static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *);
+                              struct url *, struct hash_table *, struct iri *);
 static bool descend_redirect_p (const char *, const char *, int,
-                                struct url *, struct hash_table *);
+                                struct url *, struct hash_table *, struct iri *);
 
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
@@ -207,10 +200,10 @@ retrieve_tree (const char *start_url)
 
   int up_error_code;
   struct url *start_url_parsed;
+  struct iri *i = iri_new ();
+  set_uri_encoding (i, opt.locale);
 
-  set_ugly_no_encode (true);
-  start_url_parsed= url_parse (start_url, &up_error_code);
-  set_ugly_no_encode (false);
+  start_url_parsed = url_parse (start_url, &up_error_code, i);
   if (!start_url_parsed)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
@@ -223,7 +216,8 @@ retrieve_tree (const char *start_url)
 
   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
      just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+  url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+               false);
   string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
@@ -242,7 +236,7 @@ retrieve_tree (const char *start_url)
 
       /* Get the next URL from the queue... */
 
-      if (!url_dequeue (queue,
+      if (!url_dequeue (queue, (struct iri **) &i,
                         (const char **)&url, (const char **)&referer,
                         &depth, &html_allowed, &css_allowed))
         break;
@@ -283,7 +277,8 @@ retrieve_tree (const char *start_url)
           int dt = 0;
           char *redirected = NULL;
 
-          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+          status = retrieve_url (url, &file, &redirected, referer, &dt,
+                                 false, i);
 
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@ -311,7 +306,7 @@ retrieve_tree (const char *start_url)
               if (descend)
                 {
                   if (!descend_redirect_p (redirected, url, depth,
-                                           start_url_parsed, blacklist))
+                                           start_url_parsed, blacklist, i))
                     descend = false;
                   else
                     /* Make sure that the old pre-redirect form gets
@@ -363,7 +358,7 @@ retrieve_tree (const char *start_url)
           bool meta_disallow_follow = false;
           struct urlpos *children
             = is_css ? get_urls_css_file (file, url) :
-                       get_urls_html (file, url, &meta_disallow_follow);
+                       get_urls_html (file, url, &meta_disallow_follow, i);
 
           if (opt.use_robots && meta_disallow_follow)
             {
@@ -374,9 +369,8 @@ retrieve_tree (const char *start_url)
           if (children)
             {
               struct urlpos *child = children;
-              set_ugly_no_encode (true);
-              struct url *url_parsed = url_parse (url, NULL);
-              set_ugly_no_encode (false);
+              struct url *url_parsed = url_parse (url, NULL, i);
+              struct iri *ci;
               char *referer_url = url;
               bool strip_auth = (url_parsed != NULL
                                  && url_parsed->user != NULL);
@@ -393,9 +387,11 @@ retrieve_tree (const char *start_url)
                   if (dash_p_leaf_HTML && !child->link_inline_p)
                     continue;
                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                        blacklist))
+                                        blacklist, i))
                     {
-                      url_enqueue (queue, xstrdup (child->url->url),
+                      ci = iri_new ();
+                      set_uri_encoding (ci, i->content_encoding);
+                      url_enqueue (queue, ci, xstrdup (child->url->url),
                                    xstrdup (referer_url), depth + 1,
                                    child->link_expect_html,
                                    child->link_expect_css);
@@ -440,6 +436,7 @@ retrieve_tree (const char *start_url)
       xfree (url);
       xfree_null (referer);
       xfree_null (file);
+      iri_free (i);
     }
 
   /* If anything is left of the queue due to a premature exit, free it
@@ -448,9 +445,11 @@ retrieve_tree (const char *start_url)
     char *d1, *d2;
     int d3;
     bool d4, d5;
-    while (url_dequeue (queue,
+    struct iri *d6;
+    while (url_dequeue (queue, (struct iri **)&d6,
                         (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
       {
+        iri_free (d6);
         xfree (d1);
         xfree_null (d2);
       }
@@ -479,7 +478,8 @@ retrieve_tree (const char *start_url)
 
 static bool
 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
-                  struct url *start_url_parsed, struct hash_table *blacklist)
+                  struct url *start_url_parsed, struct hash_table *blacklist,
+                  struct iri *iri)
 {
   struct url *u = upos->url;
   const char *url = u->url;
@@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!specs)
         {
           char *rfile;
-          if (res_retrieve_file (url, &rfile))
+          if (res_retrieve_file (url, &rfile, iri))
             {
               specs = res_parse_from_file (rfile);
 
@@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 
 static bool
 descend_redirect_p (const char *redirected, const char *original, int depth,
-                    struct url *start_url_parsed, struct hash_table *blacklist)
+                    struct url *start_url_parsed, struct hash_table *blacklist,
+                    struct iri *iri)
 {
   struct url *orig_parsed, *new_parsed;
   struct urlpos *upos;
   bool success;
 
-  set_ugly_no_encode (true);
-  orig_parsed = url_parse (original, NULL);
+  orig_parsed = url_parse (original, NULL, NULL);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL);
   assert (new_parsed != NULL);
-  set_ugly_no_encode (false);
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
 
   success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist);
+                              start_url_parsed, blacklist, iri);
 
   url_free (orig_parsed);
   url_free (new_parsed);
diff --git a/src/res.c b/src/res.c
index 8c35f0e1..69abd12d 100644
--- a/src/res.c
+++ b/src/res.c
@@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
    Return true if robots were retrieved OK, false otherwise.  */
 
 bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
 {
+  struct iri *i = iri_new ();
   uerr_t err;
   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
   int saved_ts_val = opt.timestamping;
   int saved_sp_val = opt.spider;
 
+  /* Copy server URI encoding for a possible IDNA transformation, no need to
+     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+  set_uri_encoding (i, iri->uri_encoding);
+  i->utf8_encode = false;
+
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
   opt.timestamping = false;
   opt.spider       = false;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
   opt.timestamping = saved_ts_val;
-  opt.spider       = saved_sp_val;  
+  opt.spider       = saved_sp_val;
   xfree (robots_url);
+  iri_free (i);
 
   if (err != RETROK && *file != NULL)
     {
diff --git a/src/res.h b/src/res.h
index 94a57750..5439eaf9 100644
--- a/src/res.h
+++ b/src/res.h
@@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
 void res_register_specs (const char *, int, struct robot_specs *);
 struct robot_specs *res_get_specs (const char *, int);
 
-bool res_retrieve_file (const char *, char **);
+bool res_retrieve_file (const char *, char **, struct iri *);
 
 bool is_robots_txt_url (const char *);
 
diff --git a/src/retr.c b/src/retr.c
index 7a28ea32..e70f6e6e 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -598,7 +598,7 @@ static char *getproxy (struct url *);
 
 uerr_t
 retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt, bool recursive)
+              const char *refurl, int *dt, bool recursive, struct iri *iri)
 {
   uerr_t result;
   char *url;
@@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (file)
     *file = NULL;
 
-  reset_utf8_encode ();
-
  second_try:
-  u = url_parse (url, &up_error_code);
+  u = url_parse (url, &up_error_code, iri);
   if (!u)
     {
       logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       return URLERROR;
     }
 
-  /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+  printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
 
   if (!refurl)
     refurl = opt.referer;
@@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   proxy = getproxy (u);
   if (proxy)
     {
-      /* sXXXav : support IRI for proxy */
+      /* sXXXav : could a proxy include a path ??? */
+      struct iri *pi = iri_new ();
+      set_uri_encoding (pi, opt.locale);
+      pi->utf8_encode = false;
+
       /* Parse the proxy URL.  */
-      set_ugly_no_encode (true);
-      proxy_url = url_parse (proxy, &up_error_code);
-      set_ugly_no_encode (false);
+      proxy_url = url_parse (proxy, &up_error_code, NULL);
       if (!proxy_url)
         {
           logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
 #endif
       || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
     {
-      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
     }
   else if (u->scheme == SCHEME_FTP)
     {
@@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       xfree (mynewloc);
       mynewloc = construced_newloc;
 
-      reset_utf8_encode ();
+      /* Reset UTF-8 encoding state, keep the URI encoding and reset
+         the content encoding. */
+      iri->utf8_encode = opt.enable_iri;
+      set_content_encoding (iri, NULL);
 
       /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
       if (!newloc_parsed)
         {
           logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     }
 
   /* Try to not encode in UTF-8 if fetching failed */
-  if (!(*dt & RETROKF) && get_utf8_encode ())
+  if (!(*dt & RETROKF) && iri->utf8_encode)
     {
-      set_utf8_encode (false);
-      /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+      iri->utf8_encode = false;
+      printf ("[Fallbacking to non-utf8 for `%s'\n", url);
       goto second_try;
     }
 
@@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count)
 {
   uerr_t status;
   struct urlpos *url_list, *cur_url;
+  struct iri *iri = iri_new();
 
   char *input_file = NULL;
   const char *url = file;
 
   status = RETROK;             /* Suppose everything is OK.  */
   *count = 0;                  /* Reset the URL count.  */
-  
+
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_content_encoding (iri, opt.locale);
+
   if (url_has_scheme (url))
     {
       uerr_t status;
-      status = retrieve_url (url, &input_file, NULL, NULL, NULL, false);
+      status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri);
       if (status != RETROK)
         return status;
     }
   else
     input_file = (char *) file;
 
-  url_list = (html ? get_urls_html (input_file, NULL, NULL)
+  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
               : get_urls_file (input_file));
 
   for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count)
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
+	                       &dt, opt.recursive, iri);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
@@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url)
 {
   bool ret;
   struct url *u;
-  set_ugly_no_encode(true);
-  u= url_parse (url, NULL);
-  set_ugly_no_encode(false);
+  struct iri *i = iri_new();
+  /* url was given in the command line, so use locale as encoding */
+  set_uri_encoding (i, opt.locale);
+  u= url_parse (url, NULL, i);
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
diff --git a/src/retr.h b/src/retr.h
index ec55cfda..bb2e66d3 100644
--- a/src/retr.h
+++ b/src/retr.h
@@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
+                     bool, struct iri *);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
diff --git a/src/url.c b/src/url.c
index beaf0fb2..c7a3a721 100644
--- a/src/url.c
+++ b/src/url.c
@@ -641,7 +641,7 @@ static const char *parse_errors[] = {
    error, and if ERROR is not NULL, also set *ERROR to the appropriate
    error code. */
 struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri)
 {
   struct url *u;
   const char *p;
@@ -660,7 +660,7 @@ url_parse (const char *url, int *error)
   int port;
   char *user = NULL, *passwd = NULL;
 
-  char *url_encoded = NULL;
+  char *url_encoded = NULL, *new_url = NULL;
 
   int error_code;
 
@@ -671,20 +671,20 @@ url_parse (const char *url, int *error)
       goto error;
     }
 
-  if (opt.enable_iri && get_utf8_encode ())
+  if (iri && iri->utf8_encode)
     {
-      const char *new;
-      bool utf8_encode;
       url_unescape ((char *) url);
-      utf8_encode = remote_to_utf8 (url, &new);
-      set_utf8_encode (utf8_encode);
-      if (utf8_encode)
-        url = new;
+      iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+      if (!iri->utf8_encode)
+        new_url = NULL;
     }
 
-  url_encoded = reencode_escapes (url);
+  url_encoded = reencode_escapes (new_url ? new_url : url);
   p = url_encoded;
 
+  if (new_url && url_encoded != new_url)
+    xfree (new_url);
+
   p += strlen (supported_schemes[scheme].leading_string);
   uname_b = p;
   p = url_skip_credentials (p);
@@ -854,16 +854,17 @@ url_parse (const char *url, int *error)
     {
       url_unescape (u->host);
       host_modified = true;
-    }
 
-  if (opt.enable_iri)
-    {
-      char *new = idn_encode (u->host, get_utf8_encode ());
-      if (new)
+      /* Apply IDNA regardless of iri->utf8_encode status */
+      if (opt.enable_iri && iri)
         {
-          xfree (u->host);
-          u->host = new;
-          host_modified = true;
+          char *new = idn_encode (iri, u->host);
+          if (new)
+            {
+              xfree (u->host);
+              u->host = new;
+              host_modified = true;
+            }
         }
     }
 
diff --git a/src/url.h b/src/url.h
index 7c8bcfed..9c49c0b5 100644
--- a/src/url.h
+++ b/src/url.h
@@ -84,7 +84,7 @@ struct url
 
 char *url_escape (const char *);
 
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, struct iri *iri);
 const char *url_error (int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);
diff --git a/src/wget.h b/src/wget.h
index d87dfcac..b17b6709 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
 #include "quote.h"
 #include "quotearg.h"
 
+/* Likewise for struct iri definition */
+#include "iri.h"
+
 /* Useful macros used across the code: */
 
 /* The number of elements in an array.  For example:

From 3ae04f5fe4ae2025c177168be4a2c396627c2ffb Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 24 Jul 2008 14:32:31 +0200
Subject: [PATCH 30/58] Use DEBUGP instead of commenting out all the
 _wonderful_ printfs

---
 src/iri.c   | 4 ++--
 src/recur.c | 3 ++-
 src/retr.c  | 5 +++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 783aa331..44a262b8 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -331,7 +331,7 @@ iri_free (struct iri *i)
 void
 set_uri_encoding (struct iri *i, char *charset)
 {
-  logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
+  DEBUGP (("[IRI uri = `%s'\n", quote (charset)));
   if (opt.encoding_remote)
     return;
   if (i->uri_encoding)
@@ -347,7 +347,7 @@ set_uri_encoding (struct iri *i, char *charset)
 void
 set_content_encoding (struct iri *i, char *charset)
 {
-  logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
+  DEBUGP (("[IRI content = %s\n", quote (charset)));
   if (opt.encoding_remote)
     return;
   if (i->content_encoding)
diff --git a/src/recur.c b/src/recur.c
index e2f58d1c..aa83e9a6 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -116,7 +116,8 @@ url_enqueue (struct url_queue *queue, struct iri *i,
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
   if (i)
-    printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
+    DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url),
+               quote (i->uri_encoding)));
 
   if (queue->tail)
     queue->tail->next = qel;
diff --git a/src/retr.c b/src/retr.c
index ae8ef3ef..691b8f51 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -635,7 +635,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       return URLERROR;
     }
 
-  printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
+  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url),
+             quote (iri->uri_encoding), iri->utf8_encode));
 
   if (!refurl)
     refurl = opt.referer;
@@ -788,7 +789,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (!(*dt & RETROKF) && iri->utf8_encode)
     {
       iri->utf8_encode = false;
-      printf ("[Fallbacking to non-utf8 for `%s'\n", url);
+      DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
       goto second_try;
     }
 

From 8c513ef48725f2091baecb30717b178f3337b442 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Thu, 24 Jul 2008 14:34:48 +0200
Subject: [PATCH 31/58] Fix number of arguments of the no-op macro
 idn_encode()

---
 src/iri.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/iri.h b/src/iri.h
index 173d0656..cdc5c7fd 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -58,7 +58,7 @@ struct iri dummy_iri;
 #define find_locale()               NULL
 #define check_encoding_name(str)    false
 #define locale_to_utf8(str)         (str)
-#define idn_encode(a,b,c)           NULL
+#define idn_encode(a,b)             NULL
 #define idn_decode(str)             NULL
 #define remote_to_utf8(a,b,c)       false
 #define iri_new()                   (&dummy_iri)

From b967d49f79b6e0ce73559bd30d231bddc2e4b232 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Wed, 30 Jul 2008 10:15:55 +0200
Subject: [PATCH 32/58] opt.encoding_remote should not override opt.locale, add
 a force argument to set_uri_encoding()

---
 src/iri.c   | 4 ++--
 src/iri.h   | 4 ++--
 src/main.c  | 2 +-
 src/recur.c | 4 ++--
 src/res.c   | 2 +-
 src/retr.c  | 4 ++--
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 44a262b8..3ee99871 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -329,10 +329,10 @@ iri_free (struct iri *i)
 }
 
 void
-set_uri_encoding (struct iri *i, char *charset)
+set_uri_encoding (struct iri *i, char *charset, bool force)
 {
   DEBUGP (("[IRI uri = `%s'\n", quote (charset)));
-  if (opt.encoding_remote)
+  if (!force && opt.encoding_remote)
     return;
   if (i->uri_encoding)
     {
diff --git a/src/iri.h b/src/iri.h
index cdc5c7fd..e7f3fe3e 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -47,7 +47,7 @@ char *idn_decode (char *host);
 bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
 struct iri *iri_new (void);
 void iri_free (struct iri *i);
-void set_uri_encoding (struct iri *i, char *charset);
+void set_uri_encoding (struct iri *i, char *charset, bool force);
 void set_content_encoding (struct iri *i, char *charset);
 
 #else /* ENABLE_IRI */
@@ -63,7 +63,7 @@ struct iri dummy_iri;
 #define remote_to_utf8(a,b,c)       false
 #define iri_new()                   (&dummy_iri)
 #define iri_free(a)
-#define set_uri_encoding(a,b)
+#define set_uri_encoding(a,b,c)
 #define set_content_encoding(a,b)
 
 #endif /* ENABLE_IRI */
diff --git a/src/main.c b/src/main.c
index 8cee194c..799e5d63 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1206,7 +1206,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
       else
         {
           struct iri *i = iri_new ();
-          set_uri_encoding (i, opt.locale);
+          set_uri_encoding (i, opt.locale, true);
           status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
                                  opt.recursive, i);
           iri_free (i);
diff --git a/src/recur.c b/src/recur.c
index aa83e9a6..19ef8f1c 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -202,7 +202,7 @@ retrieve_tree (const char *start_url)
   int up_error_code;
   struct url *start_url_parsed;
   struct iri *i = iri_new ();
-  set_uri_encoding (i, opt.locale);
+  set_uri_encoding (i, opt.locale, true);
 
   start_url_parsed = url_parse (start_url, &up_error_code, i);
   if (!start_url_parsed)
@@ -391,7 +391,7 @@ retrieve_tree (const char *start_url)
                                         blacklist, i))
                     {
                       ci = iri_new ();
-                      set_uri_encoding (ci, i->content_encoding);
+                      set_uri_encoding (ci, i->content_encoding, false);
                       url_enqueue (queue, ci, xstrdup (child->url->url),
                                    xstrdup (referer_url), depth + 1,
                                    child->link_expect_html,
diff --git a/src/res.c b/src/res.c
index 69abd12d..0320d034 100644
--- a/src/res.c
+++ b/src/res.c
@@ -542,7 +542,7 @@ res_retrieve_file (const char *url, char **file, struct iri *iri)
 
   /* Copy server URI encoding for a possible IDNA transformation, no need to
      encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
-  set_uri_encoding (i, iri->uri_encoding);
+  set_uri_encoding (i, iri->uri_encoding, false);
   i->utf8_encode = false;
 
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
diff --git a/src/retr.c b/src/retr.c
index 691b8f51..111b745a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -653,7 +653,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     {
       /* sXXXav : could a proxy include a path ??? */
       struct iri *pi = iri_new ();
-      set_uri_encoding (pi, opt.locale);
+      set_uri_encoding (pi, opt.locale, true);
       pi->utf8_encode = false;
 
       /* Parse the proxy URL.  */
@@ -1083,7 +1083,7 @@ url_uses_proxy (const char *url)
   struct url *u;
   struct iri *i = iri_new();
   /* url was given in the command line, so use locale as encoding */
-  set_uri_encoding (i, opt.locale);
+  set_uri_encoding (i, opt.locale, true);
   u= url_parse (url, NULL, i);
   if (!u)
     return false;

From 042828f4690232e4e2d8b0787acb941d64b59b97 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Fri, 1 Aug 2008 14:58:37 +0200
Subject: [PATCH 33/58] Add a missing return...

---
 src/iri.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/iri.c b/src/iri.c
index 3ee99871..a45f3899 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -318,6 +318,7 @@ iri_new (void)
   i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
   i->content_encoding = NULL;
   i->utf8_encode = opt.enable_iri;
+  return i;
 }
 
 void

From bfd8a73f004b95d044741f4cb78ecad9de92bddc Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sat, 2 Aug 2008 11:22:14 +0200
Subject: [PATCH 34/58] quote*() functions don't cope well with a NULL arg

---
 src/iri.c   | 9 +++++----
 src/recur.c | 2 +-
 src/retr.c  | 3 ++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 3ee99871..9050e858 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -318,6 +318,7 @@ iri_new (void)
   i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
   i->content_encoding = NULL;
   i->utf8_encode = opt.enable_iri;
+  return i;
 }
 
 void
@@ -331,12 +332,12 @@ iri_free (struct iri *i)
 void
 set_uri_encoding (struct iri *i, char *charset, bool force)
 {
-  DEBUGP (("[IRI uri = `%s'\n", quote (charset)));
+  DEBUGP (("[IRI uri = `%s'\n", charset ? quote (charset) : "None"));
   if (!force && opt.encoding_remote)
     return;
   if (i->uri_encoding)
     {
-      if (!strcasecmp (i->uri_encoding, charset))
+      if (charset && !strcasecmp (i->uri_encoding, charset))
         return;
       xfree (i->uri_encoding);
     }
@@ -347,12 +348,12 @@ set_uri_encoding (struct iri *i, char *charset, bool force)
 void
 set_content_encoding (struct iri *i, char *charset)
 {
-  DEBUGP (("[IRI content = %s\n", quote (charset)));
+  DEBUGP (("[IRI content = %s\n", charset ? quote (charset) : "None"));
   if (opt.encoding_remote)
     return;
   if (i->content_encoding)
     {
-      if (!strcasecmp (i->content_encoding, charset))
+      if (charset && !strcasecmp (i->content_encoding, charset))
         return;
       xfree (i->content_encoding);
     }
diff --git a/src/recur.c b/src/recur.c
index 19ef8f1c..baeaed58 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -117,7 +117,7 @@ url_enqueue (struct url_queue *queue, struct iri *i,
 
   if (i)
     DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url),
-               quote (i->uri_encoding)));
+             i->uri_encoding ? quote (i->uri_encoding) : "None"));
 
   if (queue->tail)
     queue->tail->next = qel;
diff --git a/src/retr.c b/src/retr.c
index 111b745a..fa7f762d 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -636,7 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     }
 
   DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url),
-             quote (iri->uri_encoding), iri->utf8_encode));
+           iri->uri_encoding ? quote (iri->uri_encoding) : "None",
+           iri->utf8_encode));
 
   if (!refurl)
     refurl = opt.referer;

From da7adbaef4bb2c47a19db3e83620aed06ba9456e Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sat, 2 Aug 2008 12:17:03 +0200
Subject: [PATCH 35/58] Functional tests for IRI and HTTP

---
 tests/Test-iri-disabled.px      | 197 ++++++++++++++++++++++++++++
 tests/Test-iri-forced-remote.px | 208 +++++++++++++++++++++++++++++
 tests/Test-iri.px               | 225 ++++++++++++++++++++++++++++++++
 tests/run-px                    |   3 +
 4 files changed, 633 insertions(+)
 create mode 100755 tests/Test-iri-disabled.px
 create mode 100755 tests/Test-iri-forced-remote.px
 create mode 100755 tests/Test-iri.px

diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px
new file mode 100755
index 00000000..122537ff
--- /dev/null
+++ b/tests/Test-iri-disabled.px
@@ -0,0 +1,197 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 mime + file = iso-8859-1
+# p3_€€€.html                 no meta, mime = text/plain (no charset)
+#
+
+my $ccedilla_l15 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+my $eurosign_l15 = "\xA4";
+my $eurosign_u8 = "\xE2\x82\xAC";
+my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_l1}${eacute_l1}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
+        content => $pageeuro,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-disabled",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px
new file mode 100755
index 00000000..0d116d8f
--- /dev/null
+++ b/tests/Test-iri-forced-remote.px
@@ -0,0 +1,208 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+# Force remote encoding to ISO-8859-1
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 mime + file = iso-8859-1
+# p3_€€€.html                 no meta, mime = text/plain (no charset)
+#
+
+my $ccedilla_l15 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+my $eurosign_l15 = "\xA4";
+my $eurosign_u8 = "\xE2\x82\xAC";
+my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
+my $currency_l1 = "\xA4";
+my $currency_u8 = "\xC2\xA4";
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%C2%A4%C2%A4%C2%A4.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $pageeuro,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${currency_u8}${currency_u8}${currency_u8}.html" => {
+        content => $pageeuro,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-forced-remote",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
new file mode 100755
index 00000000..3f4cf3fd
--- /dev/null
+++ b/tests/Test-iri.px
@@ -0,0 +1,225 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 meta + file = utf-8, mime =iso-8859-1
+# p3_€€€.html                 meta + file = utf-8, mime = iso-8859-1
+# p4_méér.html                mime + file = utf-8
+#
+
+my $ccedilla_l15 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+my $eurosign_l15 = "\xA4";
+my $eurosign_u8 = "\xE2\x82\xAC";
+my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
+    Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    &Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
+    Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
+    <a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">M&eacute&eacute;r</a>
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeuro = <<EOF;
+<html>
+<head>
+  <title>Euro page</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    My tailor isn't rich anymore.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagemeer = <<EOF;
+<html>
+<head>
+  <title>Bekende supermarkt</title>
+</head>
+<body>
+  <p>
+    Ik ben toch niet gek !
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $pageeuro,
+    },
+    '/p3_%A4%A4%A4.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $pageeuro,
+    },
+    '/p4_m%C3%A9%C3%A9r.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=UTF-8",
+        },
+        content => $pagemeer,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri -nH -r http://localhost:{{port}}/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'index.html' => {
+        content => $pageindex,
+    },
+    'robots.txt' => {
+        content => "",
+    },
+    "p1_fran${ccedilla_l15}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+    "p3_${eurosign2_u8}${eurosign2_u8}${eurosign2_u8}.html" => {
+        content => $pageeuro,
+    },
+    "p4_m${eacute_u8}${eacute_u8}r.html" => {
+        content => $pagemeer,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 37f14324..865246e3 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -19,6 +19,9 @@ my @tests = (
     'Test-HTTP-Content-Disposition-1.px',
     'Test-HTTP-Content-Disposition-2.px',
     'Test-HTTP-Content-Disposition.px',
+    'Test-iri.px',
+    'Test-iri-disabled.px',
+    'Test-iri-forced-remote.px',
     'Test-N-current.px',
     'Test-N-smaller.px',
     'Test-N-no-info.px',

From 8d7c2219d1965fb1bda16d46bb45e8fe7dc60501 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sat, 2 Aug 2008 13:47:10 +0200
Subject: [PATCH 36/58] Test FTP IRI support

---
 tests/Test-ftp-iri-disabled.px | 50 ++++++++++++++++++++++++++++++++++
 tests/Test-ftp-iri-fallback.px | 46 +++++++++++++++++++++++++++++++
 tests/Test-ftp-iri.px          | 47 ++++++++++++++++++++++++++++++++
 tests/run-px                   |  3 ++
 4 files changed, 146 insertions(+)
 create mode 100755 tests/Test-ftp-iri-disabled.px
 create mode 100755 tests/Test-ftp-iri-fallback.px
 create mode 100755 tests/Test-ftp-iri.px

diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px
new file mode 100755
index 00000000..14d849da
--- /dev/null
+++ b/tests/Test-ftp-iri-disabled.px
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use FTPTest;
+
+
+###############################################################################
+
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+
+$francais =~ s/\n/\r\n/;
+
+
+# code, msg, headers, content
+my %urls = (
+    "/fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+    "/fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+
+my $the_test = FTPTest->new (name => "Test-ftp-iri-disabled",
+                             input => \%urls,
+                             cmdline => $cmdline,
+                             errcode => $expected_error_code,
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px
new file mode 100755
index 00000000..8902e0f9
--- /dev/null
+++ b/tests/Test-ftp-iri-fallback.px
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use FTPTest;
+
+
+###############################################################################
+
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+
+$francais =~ s/\n/\r\n/;
+
+# code, msg, headers, content
+my %urls = (
+    "/fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_l1}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+
+my $the_test = FTPTest->new (name => "Test-ftp-iri-fallback",
+                             input => \%urls,
+                             cmdline => $cmdline,
+                             errcode => $expected_error_code,
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px
new file mode 100755
index 00000000..d453669c
--- /dev/null
+++ b/tests/Test-ftp-iri.px
@@ -0,0 +1,47 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use FTPTest;
+
+
+###############################################################################
+
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+
+my $francais = <<EOF;
+Some text.
+EOF
+
+$francais =~ s/\n/\r\n/;
+
+
+# code, msg, headers, content
+my %urls = (
+    "/fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "fran${ccedilla_u8}ais.txt" => {
+        content => $francais,
+    },
+);
+
+###############################################################################
+
+my $the_test = FTPTest->new (name => "Test-ftp-iri",
+                             input => \%urls, 
+                             cmdline => $cmdline, 
+                             errcode => $expected_error_code, 
+                             output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 865246e3..172adcd7 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -16,6 +16,9 @@ my @tests = (
     'Test-E-k-K.px',
     'Test-E-k.px',
     'Test-ftp.px',
+    'Test-ftp-iri.px',
+    'Test-ftp-iri-fallback.px',
+    'Test-ftp-iri-disabled.px',
     'Test-HTTP-Content-Disposition-1.px',
     'Test-HTTP-Content-Disposition-2.px',
     'Test-HTTP-Content-Disposition.px',

From e2813c1e4fdf1f565f65197445695cc18485ddb3 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sun, 3 Aug 2008 20:02:35 +0200
Subject: [PATCH 37/58] Since wget use libidn function for finding the locale,
 langinfo.h is useless

---
 src/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/main.c b/src/main.c
index 799e5d63..79c35220 100644
--- a/src/main.c
+++ b/src/main.c
@@ -43,9 +43,6 @@ as that of the covered work.  */
 #include <assert.h>
 #include <errno.h>
 #include <time.h>
-#ifdef ENABLE_IRI
-#include <langinfo.h>
-#endif
 
 #include "utils.h"
 #include "init.h"

From cda8835de6b299d591f636ba960c66ad646a2b58 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sun, 3 Aug 2008 20:03:13 +0200
Subject: [PATCH 38/58] IRI support documentation, first attempt

---
 doc/wget.texi | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/doc/wget.texi b/doc/wget.texi
index 6f88e19a..9219f49c 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -675,6 +675,30 @@ Another instance where you'll get a garbled file if you try to use
 Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
 servers that support the @code{Range} header.
 
+@cindex iri support
+@cindex idn support
+@item --iri
+
+Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
+turn it off. IRI support is activated by default.
+
+You can set the default state of IRI support using the @code{iri} command in
+@file{.wgetrc}. That setting may be overridden from the command line.
+
+@cindex local encoding
+@cindex locale
+@item --locale=@var{encoding}
+
+Force Wget to use @var{encoding} as the default system encoding. That affects
+how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
+IRI support.
+
+Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
+environment variable to get the locale. If it fails, @sc{ascii} is used.
+
+You can set the default locale using the @code{locale} command in
+@file{.wgetrc}. That setting may be overridden from the command line.
+
 @cindex progress indicator
 @cindex dot style
 @item --progress=@var{type}
@@ -706,6 +730,21 @@ command line.  The exception is that, when the output is not a TTY, the
 ``dot'' progress will be favored over ``bar''.  To force the bar output,
 use @samp{--progress=bar:force}.
 
+@cindex remote encoding
+@item --remote-encoding=@var{encoding}
+
+Force Wget to use @var{encoding} as the default remote server encoding. That
+affects how Wget converts URIs found in files from remote encoding to
+@sc{utf-8} during a recursive fetch. This option is only useful for
+IRI support, for the interpretation of non-@sc{ascii} characters.
+
+For HTTP, remote encoding can be found in HTTP @code{Content-Type}
+header and in HTML @code{Content-Type http-equiv} meta tag.
+
+You can set the default encoding using the @code{remoteencoding}
+command in @file{.wgetrc}. That setting may be overridden from the
+command line.
+
 @item -N
 @itemx --timestamping
 Turn on time-stamping.  @xref{Time-Stamping}, for details.

From e4fd97c2eb9c7311a0cf8bf51bbf9d6cff16ae91 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sun, 3 Aug 2008 20:06:39 +0200
Subject: [PATCH 39/58] Add lines to .wgetrc sample file

---
 doc/sample.wgetrc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/sample.wgetrc b/doc/sample.wgetrc
index c69596bf..7ef9ef4a 100644
--- a/doc/sample.wgetrc
+++ b/doc/sample.wgetrc
@@ -113,3 +113,12 @@ waitretry = 10
 
 # To try ipv6 addresses first:
 #prefer-family = IPv6
+
+# Set default IRI support state
+#iri = off
+
+# Force the default system encoding
+#locale = UTF-8
+
+# Force the default remote server encoding
+#remoteencoding = UTF-8

From f8ffc7d0848e45c9c288c19332b99b6291188e66 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sun, 3 Aug 2008 20:38:00 +0200
Subject: [PATCH 40/58] Use --restrict-file-names=nocontrol during Test-iri*
 tests avoiding some special escaping

---
 tests/Test-iri-disabled.px      | 1 -
 tests/Test-iri-forced-remote.px | 1 -
 tests/Test-iri.px               | 5 ++---
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px
index 122537ff..17e43361 100755
--- a/tests/Test-iri-disabled.px
+++ b/tests/Test-iri-disabled.px
@@ -24,7 +24,6 @@ my $eacute_l1 = "\xE9";
 my $eacute_u8 = "\xC3\xA9";
 my $eurosign_l15 = "\xA4";
 my $eurosign_u8 = "\xE2\x82\xAC";
-my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
 
 my $pageindex = <<EOF;
 <html>
diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px
index 0d116d8f..1acd03a7 100755
--- a/tests/Test-iri-forced-remote.px
+++ b/tests/Test-iri-forced-remote.px
@@ -25,7 +25,6 @@ my $eacute_l1 = "\xE9";
 my $eacute_u8 = "\xC3\xA9";
 my $eurosign_l15 = "\xA4";
 my $eurosign_u8 = "\xE2\x82\xAC";
-my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
 my $currency_l1 = "\xA4";
 my $currency_u8 = "\xC2\xA4";
 
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
index 3f4cf3fd..d228721c 100755
--- a/tests/Test-iri.px
+++ b/tests/Test-iri.px
@@ -25,7 +25,6 @@ my $eacute_l1 = "\xE9";
 my $eacute_u8 = "\xC3\xA9";
 my $eurosign_l15 = "\xA4";
 my $eurosign_u8 = "\xE2\x82\xAC";
-my $eurosign2_u8 = "\xE2%82\xAC";	# version wget use... sXXXav
 
 my $pageindex = <<EOF;
 <html>
@@ -187,7 +186,7 @@ my %urls = (
     },
 );
 
-my $cmdline = $WgetTest::WGETPATH . " --iri -nH -r http://localhost:{{port}}/";
+my $cmdline = $WgetTest::WGETPATH . " --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/";
 
 my $expected_error_code = 0;
 
@@ -204,7 +203,7 @@ my %expected_downloaded_files = (
     "p2_${eacute_u8}${eacute_u8}n.html" => {
         content => $pageeen,
     },
-    "p3_${eurosign2_u8}${eurosign2_u8}${eurosign2_u8}.html" => {
+    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => {
         content => $pageeuro,
     },
     "p4_m${eacute_u8}${eacute_u8}r.html" => {

From c74bc2da704de7c291521093368b8bab7149909d Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Sun, 3 Aug 2008 22:30:12 +0200
Subject: [PATCH 41/58] Some cleanups in iri.c

---
 src/iri.c | 70 ++++++++++++++++++++++---------------------------------
 src/iri.h |  4 ++--
 2 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/src/iri.c b/src/iri.c
index 9050e858..dce9e2ed 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -46,9 +46,6 @@ as that of the covered work.  */
 
 /* Note: locale encoding is kept in options struct (opt.locale) */
 
-static iconv_t locale2utf8;
-
-static bool open_locale_to_utf8 (void);
 static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
 
 
@@ -119,27 +116,7 @@ check_encoding_name (char *encoding)
 static bool
 open_locale_to_utf8 (void)
 {
-  if (locale2utf8)
-    return true;
 
-  /* sXXXav : That shouldn't happen, just in case */
-  if (!opt.locale)
-    {
-      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
-      opt.locale = find_locale ();
-    }
-
-  if (!opt.locale)
-    return false;
-
-  locale2utf8 = iconv_open ("UTF-8", opt.locale);
-  if (locale2utf8 != (iconv_t)(-1))
-    return true;
-
-  logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
-             quote (opt.locale), quote ("UTF-8"));
-  locale2utf8 = NULL;
-  return false;
 }
 
 /* Try converting string str from locale to UTF-8. Return a new string
@@ -147,22 +124,35 @@ open_locale_to_utf8 (void)
 const char *
 locale_to_utf8 (const char *str)
 {
+  iconv_t l2u;
   char *new;
 
-  if (!strcasecmp (opt.locale, "utf-8"))
+  /* That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
     return str;
 
-  if (!open_locale_to_utf8 ())
-    return str;
+  l2u = iconv_open ("UTF-8", opt.locale);
+  if (l2u == (iconv_t)(-1))   /* iconv_open returns (iconv_t)-1 on failure */
+    {
+      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+                 quote_n (0, opt.locale), quote_n (1, "UTF-8"));
+      return str;
+    }
 
-  if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
+  if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new))
     return (const char *) new;
 
   return str;
 }
 
 /* Do the conversion according to the passed conversion descriptor cd. *out
-   will containes the transcoded string on success. *out content is
+   will contain the transcoded string on success. *out content is
    unspecified otherwise. */
 static bool
 do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
@@ -236,11 +226,7 @@ idn_encode (struct iri *i, char *host)
   if (!i->utf8_encode)
     {
       if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
-        {
-          /* Nothing to encode or an error occured */
-          return NULL;
-        }
-
+          return NULL;  /* Nothing to encode or an error occured */
       host = new;
     }
 
@@ -281,18 +267,13 @@ idn_decode (char *host)
 bool
 remote_to_utf8 (struct iri *i, const char *str, const char **new)
 {
-  char *r;
   iconv_t cd;
   bool ret = false;
 
-  if (opt.encoding_remote)
-    r = opt.encoding_remote;
-  else if (i->uri_encoding)
-    r = i->uri_encoding;
-  else
+  if (!i->uri_encoding)
     return false;
 
-  cd = iconv_open ("UTF-8", r);
+  cd = iconv_open ("UTF-8", i->uri_encoding);
   if (cd == (iconv_t)(-1))
     return false;
 
@@ -311,6 +292,7 @@ remote_to_utf8 (struct iri *i, const char *str, const char **new)
   return ret;
 }
 
+/* Allocate a new iri structure and return a pointer to it. */
 struct iri *
 iri_new (void)
 {
@@ -321,6 +303,7 @@ iri_new (void)
   return i;
 }
 
+/* Completely free an iri structure. */
 void
 iri_free (struct iri *i)
 {
@@ -329,10 +312,12 @@ iri_free (struct iri *i)
   xfree (i);
 }
 
+/* Set uri_encoding of struct iri i. If a remote encoding was specified, use
+   it unless force is true. */
 void
 set_uri_encoding (struct iri *i, char *charset, bool force)
 {
-  DEBUGP (("[IRI uri = `%s'\n", charset ? quote (charset) : "None"));
+  DEBUGP (("URI encoding = `%s'\n", charset ? quote (charset) : "None"));
   if (!force && opt.encoding_remote)
     return;
   if (i->uri_encoding)
@@ -345,10 +330,11 @@ set_uri_encoding (struct iri *i, char *charset, bool force)
   i->uri_encoding = charset ? xstrdup (charset) : NULL;
 }
 
+/* Set content_encoding of struct iri i. */
 void
 set_content_encoding (struct iri *i, char *charset)
 {
-  DEBUGP (("[IRI content = %s\n", charset ? quote (charset) : "None"));
+  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
   if (opt.encoding_remote)
     return;
   if (i->content_encoding)
diff --git a/src/iri.h b/src/iri.h
index e7f3fe3e..c024de72 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -31,9 +31,9 @@ as that of the covered work.  */
 #define IRI_H
 
 struct iri {
-  char *uri_encoding;     /* Encoding of the uri to fetch */
+  char *uri_encoding;      /* Encoding of the uri to fetch */
   char *content_encoding;  /* Encoding of links inside the fetched file */
-  bool utf8_encode;       /* Will/Is the current url encoded in utf8 */
+  bool utf8_encode;        /* Will/Is the current url encoded in utf8 */
 };
 
 #ifdef ENABLE_IRI

From 84395897ad2d1c107be470946daba744b2e7ebe8 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Mon, 4 Aug 2008 11:08:33 +0200
Subject: [PATCH 42/58] iri.h is already included in wget.h, so don't include
 it in C files

---
 src/connect.c | 1 -
 src/host.c    | 1 -
 src/iri.c     | 1 -
 src/recur.c   | 1 -
 src/retr.c    | 1 -
 src/url.c     | 1 -
 6 files changed, 6 deletions(-)

diff --git a/src/connect.c b/src/connect.c
index 6cfdb4b7..41258d26 100644
--- a/src/connect.c
+++ b/src/connect.c
@@ -58,7 +58,6 @@ as that of the covered work.  */
 #include "host.h"
 #include "connect.h"
 #include "hash.h"
-#include "iri.h"
 
 /* Define sockaddr_storage where unavailable (presumably on IPv4-only
    hosts).  */
diff --git a/src/host.c b/src/host.c
index 1226a274..bbf40222 100644
--- a/src/host.c
+++ b/src/host.c
@@ -53,7 +53,6 @@ as that of the covered work.  */
 #include "host.h"
 #include "url.h"
 #include "hash.h"
-#include "iri.h"
 
 #ifndef NO_ADDRESS
 # define NO_ADDRESS NO_DATA
diff --git a/src/iri.c b/src/iri.c
index dce9e2ed..ea4046af 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -39,7 +39,6 @@ as that of the covered work.  */
 #include <errno.h>
 
 #include "utils.h"
-#include "iri.h"
 
 /* RFC3987 section 3.1 mandates STD3 ASCII RULES */
 #define IDNA_FLAGS  IDNA_USE_STD3_ASCII_RULES
diff --git a/src/recur.c b/src/recur.c
index baeaed58..71fbe7bf 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -51,7 +51,6 @@ as that of the covered work.  */
 #include "html-url.h"
 #include "css-url.h"
 #include "spider.h"
-#include "iri.h"
 
 /* Functions for maintaining the URL queue.  */
 
diff --git a/src/retr.c b/src/retr.c
index fa7f762d..fe176eaf 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,7 +51,6 @@ as that of the covered work.  */
 #include "hash.h"
 #include "convert.h"
 #include "ptimer.h"
-#include "iri.h"
 #include "html-url.h"
 
 /* Total size of downloaded files.  Used to enforce quota.  */
diff --git a/src/url.c b/src/url.c
index c7a3a721..e79cf8a2 100644
--- a/src/url.c
+++ b/src/url.c
@@ -42,7 +42,6 @@ as that of the covered work.  */
 #include "utils.h"
 #include "url.h"
 #include "host.h"  /* for is_valid_ipv6_address */
-#include "iri.h"
 
 #ifdef TESTING
 #include "test.h"

From bb62e1aa9ed97e931ccd174f64c6e13e4d0439bb Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Mon, 4 Aug 2008 11:18:26 +0200
Subject: [PATCH 43/58] Update tests/ChangeLog for the 6 new tests for testing
 IRI support

---
 tests/ChangeLog | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 36bc35dc..ad18c14a 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,26 @@
+2008-08-03  Xavier Saint <wget@sxav.eu>
+
+	* Test-iri.px : HTTP recursive fetch for testing IRI support and
+	fallback.
+
+	* Test-iri-disabled.px : Same file structure as Test-iri.px but with
+	IRI support disabled
+
+	* Test-iri-forced-remote.px : There's a difference between ISO-8859-1
+	and ISO-8859-15 for character 0xA4 (respectively currency sign and
+	euro sign). So with a forced ISO-8859-1 remote encoding, wget should
+	see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
+	of using the ISO-8859-15 given by the server.
+
+	* Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
+	and expect wget to fetch the file UTF-8 encoded.
+
+	* Test-ftp-iri-fallback.px : Same as above but wget should fall back on
+	locale encoding to fetch the file.
+
+	* Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
+	disabled. The UTF-8 encoded file should not be retrieved.
+
 2008-06-22  Micah Cowan  <micah@cowan.name>
 
 	* Test-proxied-https-auth.px: Shift exit code so it falls in the

From 49061b72b630e248b4e1df0593a2198b2ed612fb Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Mon, 4 Aug 2008 11:21:45 +0200
Subject: [PATCH 44/58] Update doc/ChangeLog in regards to IRI support

---
 doc/ChangeLog | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/ChangeLog b/doc/ChangeLog
index 4f68780e..08d2f05e 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,12 @@
+2008-08-03  Xavier Saint  <wget@sxav.eu>
+
+	* wget.texi : Add option descriptions for the three new
+	options --iri, --locale and --remote-encoding related to
+	IRI support.
+
+	* sample.wgetrc : Add commented lines for the three new
+	commands iri, locale and remoteencoding related to IRI support.
+
 2008-07-17  Steven Schubiger  <stsc@members.fsf.org>
 
 	* wget.texi (Logging and Input File Options): Document

From 0d0a42514458629dd6875138e813110f39eded03 Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Thu, 7 Aug 2008 10:33:06 +0200
Subject: [PATCH 45/58] Correct a mis-merge: return type for url_error is
 char*, not const char*

---
 src/url.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/url.h b/src/url.h
index 69db1551..0748e214 100644
--- a/src/url.h
+++ b/src/url.h
@@ -85,7 +85,7 @@ struct url
 char *url_escape (const char *);
 
 struct url *url_parse (const char *, int *, struct iri *iri);
-const char *url_error (const char *, int);
+char *url_error (const char *, int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);
 void url_set_file (struct url *, const char *);

From e6b4e761d1f1439b1b2352f5eeaedd1ae5b9d76e Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Thu, 14 Aug 2008 17:42:16 +0200
Subject: [PATCH 46/58] Don't forget to free the iri struct

---
 src/retr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/retr.c b/src/retr.c
index fe176eaf..4731d9ee 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -928,6 +928,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
   /* Free the linked list of URL-s.  */
   free_urlpos (url_list);
 
+  iri_free (iri);
+
   return status;
 }
 

From 723dbfc818e3e5b22ec53fd093dca999290ebead Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Thu, 14 Aug 2008 18:26:53 +0200
Subject: [PATCH 47/58] Correct iri handling while fetching a remote file list
 with -i and provide a test

---
 src/main.c             |   2 +-
 src/recur.c            |  15 +++-
 src/recur.h            |   2 +-
 src/retr.c             |   8 +-
 tests/Test-iri-list.px | 173 +++++++++++++++++++++++++++++++++++++++++
 tests/run-px           |   1 +
 6 files changed, 195 insertions(+), 6 deletions(-)
 create mode 100755 tests/Test-iri-list.px

diff --git a/src/main.c b/src/main.c
index 79c35220..8d8d93fa 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1196,7 +1196,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
           if (url_scheme (*t) == SCHEME_FTP)
             opt.follow_ftp = 1;
 
-          status = retrieve_tree (*t);
+          status = retrieve_tree (*t, NULL);
 
           opt.follow_ftp = old_follow_ftp;
         }
diff --git a/src/recur.c b/src/recur.c
index 71fbe7bf..921c60c7 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -187,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int,
           options, add it to the queue. */
 
 uerr_t
-retrieve_tree (const char *start_url)
+retrieve_tree (const char *start_url, struct iri *pi)
 {
   uerr_t status = RETROK;
 
@@ -201,7 +201,18 @@ retrieve_tree (const char *start_url)
   int up_error_code;
   struct url *start_url_parsed;
   struct iri *i = iri_new ();
-  set_uri_encoding (i, opt.locale, true);
+
+#define COPYSTR(x)  ((x) ? xstrdup (x) : NULL)
+  /* Duplicate pi struct if not NULL */
+  if (pi)
+    {
+      i->uri_encoding = COPYSTR (pi->uri_encoding);
+      i->content_encoding = COPYSTR (pi->content_encoding);
+      i->utf8_encode = pi->utf8_encode;
+    }
+  else
+    set_uri_encoding (i, opt.locale, true);
+#undef COPYSTR
 
   start_url_parsed = url_parse (start_url, &up_error_code, i);
   if (!start_url_parsed)
diff --git a/src/recur.h b/src/recur.h
index 5ab26a95..515a382b 100644
--- a/src/recur.h
+++ b/src/recur.h
@@ -42,6 +42,6 @@ as that of the covered work.  */
 struct urlpos;
 
 void recursive_cleanup (void);
-uerr_t retrieve_tree (const char *);
+uerr_t retrieve_tree (const char *, struct iri *);
 
 #endif /* RECUR_H */
diff --git a/src/retr.c b/src/retr.c
index 4731d9ee..963d5044 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -651,7 +651,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   proxy = getproxy (u);
   if (proxy)
     {
-      /* sXXXav : could a proxy include a path ??? */
       struct iri *pi = iri_new ();
       set_uri_encoding (pi, opt.locale, true);
       pi->utf8_encode = false;
@@ -858,6 +857,7 @@ retrieve_from_file (const char *file, bool html, int *count)
   *count = 0;                  /* Reset the URL count.  */
 
   /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
   set_content_encoding (iri, opt.locale);
 
   if (url_has_scheme (url))
@@ -894,6 +894,10 @@ retrieve_from_file (const char *file, bool html, int *count)
           status = QUOTEXC;
           break;
         }
+
+      /* Reset UTF-8 encode status */
+      iri->utf8_encode = opt.enable_iri;
+
       if ((opt.recursive || opt.page_requisites)
           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
         {
@@ -903,7 +907,7 @@ retrieve_from_file (const char *file, bool html, int *count)
           if (cur_url->url->scheme == SCHEME_FTP)
             opt.follow_ftp = 1;
 
-          status = retrieve_tree (cur_url->url->url);
+          status = retrieve_tree (cur_url->url->url, iri);
 
           opt.follow_ftp = old_follow_ftp;
         }
diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px
new file mode 100755
index 00000000..51bb09fe
--- /dev/null
+++ b/tests/Test-iri-list.px
@@ -0,0 +1,173 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html                  mime + file = iso-8859-15
+# p1_français.html            meta + file = iso-8859-1, mime = utf-8
+# p2_één.html                 meta + file = utf-8, mime =iso-8859-1
+#
+
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+
+my $urllist = <<EOF;
+http://localhost:{{port}}/
+http://localhost:{{port}}/p1_fran${ccedilla_l1}ais.html
+http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html
+EOF
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+	Main page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    French page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    Dutch page.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {	# UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {	# UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/url_list.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $urllist,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri -d -i http://localhost:{{port}}/url_list.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'url_list.txt' => {
+        content => $urllist,
+    },
+    'index.html' => {
+        content => $pageindex,
+    },
+    "p1_fran${ccedilla_l1}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-list",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 172adcd7..51dec828 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -25,6 +25,7 @@ my @tests = (
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',
+    'Test-iri-list.px',
     'Test-N-current.px',
     'Test-N-smaller.px',
     'Test-N-no-info.px',

From a5c222fa798673319e930e944d8d59cd906361fc Mon Sep 17 00:00:00 2001
From: Xavier Saint <wget@sxav.eu>
Date: Thu, 14 Aug 2008 18:31:03 +0200
Subject: [PATCH 48/58] Update tests/Changelog for Test-iri-list.px

---
 tests/ChangeLog | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/ChangeLog b/tests/ChangeLog
index ad18c14a..f2179763 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,7 @@
+2008-08-14  Xavier Saint <wget@sxav.eu>
+	
+	* Test-iri-list.px : Fetch files from a remote list.
+
 2008-08-03  Xavier Saint <wget@sxav.eu>
 
 	* Test-iri.px : HTTP recursive fetch for testing IRI support and

From 1063191b33579ef411e17881125e926573839560 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Fri, 15 Aug 2008 14:41:15 +0200
Subject: [PATCH 49/58] Fix a double quoting

---
 src/iri.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/iri.c b/src/iri.c
index ea4046af..e3909d50 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -316,7 +316,7 @@ iri_free (struct iri *i)
 void
 set_uri_encoding (struct iri *i, char *charset, bool force)
 {
-  DEBUGP (("URI encoding = `%s'\n", charset ? quote (charset) : "None"));
+  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
   if (!force && opt.encoding_remote)
     return;
   if (i->uri_encoding)

From 5133d573667c75e5af2de1a4797d7610b05900a5 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Fri, 15 Aug 2008 15:03:38 +0200
Subject: [PATCH 50/58] Fixes: use the encoding reported by the remote server
 serving the list file, and use quote_n() when quoting several args

---
 src/retr.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/retr.c b/src/retr.c
index 0aa95072..28a6d874 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -636,8 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       return URLERROR;
     }
 
-  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url),
-           iri->uri_encoding ? quote (iri->uri_encoding) : "None",
+  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+           iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
            iri->utf8_encode));
 
   if (!refurl)
@@ -880,6 +880,10 @@ retrieve_from_file (const char *file, bool html, int *count)
 
       if (dt & TEXTHTML)
         html = true;
+
+      /* If we have found a content encoding, use it */
+      if (iri->content_encoding)
+	  set_uri_encoding (iri, iri->content_encoding, false);
     }
   else
     input_file = (char *) file;

From 26a3eea8e2f42c621ce6c40a93acf5ff1cd12220 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Fri, 15 Aug 2008 15:15:42 +0200
Subject: [PATCH 51/58] Remove commented-out *printf calls and use quote_n()
 for quoting several args

---
 src/html-url.c | 1 -
 src/main.c     | 2 --
 src/recur.c    | 4 ++--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/html-url.c b/src/html-url.c
index cbaffb25..c954cb97 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -571,7 +571,6 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
       if (!mcharset)
         return;
 
-      /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
       xfree_null (meta_charset);
       meta_charset = mcharset;
     }
diff --git a/src/main.c b/src/main.c
index c080394e..414b62bc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1076,8 +1076,6 @@ for details.\n\n"));
 
       if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
         opt.encoding_remote = NULL;
-
-      /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
     }
 #else
   if (opt.enable_iri || opt.locale || opt.encoding_remote)
diff --git a/src/recur.c b/src/recur.c
index a0bb8681..78682458 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -115,8 +115,8 @@ url_enqueue (struct url_queue *queue, struct iri *i,
   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 
   if (i)
-    DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url),
-             i->uri_encoding ? quote (i->uri_encoding) : "None"));
+    DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
+             i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
 
   if (queue->tail)
     queue->tail->next = qel;

From ab0b0a40904c6cbd4a21d61c6acf31ff11c30a71 Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Thu, 28 Aug 2008 02:45:29 -0700
Subject: [PATCH 52/58] IDN test.

---
 tests/ChangeLog                | 12 +++++++
 tests/HTTPServer.pm            |  3 +-
 tests/Test-idn-headers.px      | 65 ++++++++++++++++++++++++++++++++++
 tests/Test-proxy-auth-basic.px |  2 +-
 tests/run-px                   |  1 +
 5 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100755 tests/Test-idn-headers.px

diff --git a/tests/ChangeLog b/tests/ChangeLog
index f2179763..7eb37563 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,15 @@
+2008-08-28  Micah Cowan  <micah@cowan.name>
+
+	* HTTPServer.pm (run): Allow distinguishing between hostnames,
+	when used as a proxy.
+
+	* Test-idn-headers.px: Added.
+
+	* run-px: Added Test-idn-headers.px.
+
+	* Test-proxy-auth-basic.px: Use the full URL, rather than just the
+	path (made necessary by the accompanying change to HTTPServer.pm).
+
 2008-08-14  Xavier Saint <wget@sxav.eu>
 	
 	* Test-iri-list.px : Fetch files from a remote list.
diff --git a/tests/HTTPServer.pm b/tests/HTTPServer.pm
index b76f0985..01c36957 100644
--- a/tests/HTTPServer.pm
+++ b/tests/HTTPServer.pm
@@ -27,7 +27,8 @@ sub run {
         my $con = $self->accept();
         print STDERR "Accepted a new connection\n" if $log;
         while (my $req = $con->get_request) {
-            my $url_path = $req->url->path;
+            # Full URL, not $req->url->path: lets a proxy tell hostnames apart.
+            my $url_path = $req->url->as_string;
             if ($url_path =~ m{/$}) { # append 'index.html'
                 $url_path .= 'index.html';
             }
diff --git a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px
new file mode 100755
index 00000000..3289d5f5
--- /dev/null
+++ b/tests/Test-idn-headers.px
@@ -0,0 +1,65 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan", as EUC-JP bytes
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<a href="http://$euc_jp_hostname/">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    'http://start-here.com/start.html' => {
+        code => "200",
+        msg => "You want fries with that?",
+        headers => {
+            'Content-Type' => 'text/html; charset=EUC-JP',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} http://start-here.com/start.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'start-here.com/start.html' => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-headers",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/Test-proxy-auth-basic.px b/tests/Test-proxy-auth-basic.px
index e440a392..e3934d7d 100755
--- a/tests/Test-proxy-auth-basic.px
+++ b/tests/Test-proxy-auth-basic.px
@@ -11,7 +11,7 @@ my $wholefile = "You're all authenticated.\n";
 
 # code, msg, headers, content
 my %urls = (
-    '/needs-auth.txt' => {
+    'http://no.such.domain/needs-auth.txt' => {
         auth_method => 'Basic',
         user => 'fiddle-dee-dee',
         passwd => 'Dodgson',
diff --git a/tests/run-px b/tests/run-px
index c18c8d85..c2380d5b 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -23,6 +23,7 @@ my @tests = (
     'Test-HTTP-Content-Disposition-1.px',
     'Test-HTTP-Content-Disposition-2.px',
     'Test-HTTP-Content-Disposition.px',
+    'Test-idn-headers.px',
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',

From 171c71e09cc710e82c2fa6f3c4d08a678083b346 Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Thu, 28 Aug 2008 12:47:17 -0700
Subject: [PATCH 53/58] Test for IDN, based on meta-specified encoding.

---
 tests/ChangeLog        |  4 +--
 tests/Test-idn-meta.px | 66 ++++++++++++++++++++++++++++++++++++++++++
 tests/run-px           |  1 +
 3 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100755 tests/Test-idn-meta.px

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 7eb37563..867a82ec 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -3,9 +3,9 @@
 	* HTTPServer.pm (run): Allow distinguishing between hostnames,
 	when used as a proxy.
 
-	* Test-idn-headers.px: Added.
+	* Test-idn-headers.px, Test-idn-meta.px: Added.
 
-	* run-px: Added Test-idn-headers.px.
+	* run-px: Added Test-idn-headers.px, Test-idn-meta.px.
 
 	* Test-proxy-auth-basic.px: Use the full URL, rather than just the
 	path (made necessary by the accompanying change to HTTPServer.pm).
diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px
new file mode 100755
index 00000000..1397cf45
--- /dev/null
+++ b/tests/Test-idn-meta.px
@@ -0,0 +1,66 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan", as EUC-JP bytes
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<meta http-equiv="Content-Type" content="text/html; charset=EUC-JP" />
+<a href="http://$euc_jp_hostname/">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    'http://start-here.com/start.html' => {
+        code => "200",
+        msg => "You want fries with that?",
+        headers => {
+            'Content-Type' => 'text/html; charset=UTF-8',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} http://start-here.com/start.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'start-here.com/start.html' => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-meta",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index c2380d5b..50f33218 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -24,6 +24,7 @@ my @tests = (
     'Test-HTTP-Content-Disposition-2.px',
     'Test-HTTP-Content-Disposition.px',
     'Test-idn-headers.px',
+    'Test-idn-meta.px',
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',

From 523c3dfcbc3e6858ea94288554d67d3c1208a7c1 Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Tue, 9 Sep 2008 21:55:02 -0700
Subject: [PATCH 54/58] Test-idn-cmd.px.

---
 tests/ChangeLog       |  6 ++++++
 tests/Test-idn-cmd.px | 50 +++++++++++++++++++++++++++++++++++++++++++
 tests/run-px          |  1 +
 3 files changed, 57 insertions(+)
 create mode 100755 tests/Test-idn-cmd.px

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 867a82ec..7751be64 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2008-09-09  Micah Cowan  <micah@cowan.name>
+
+	* Test-idn-cmd.px: Added.
+
+	* run-px: Added Test-idn-cmd.px.
+
 2008-08-28  Micah Cowan  <micah@cowan.name>
 
 	* HTTPServer.pm (run): Allow distinguishing between hostnames,
diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px
new file mode 100755
index 00000000..a5c156a2
--- /dev/null
+++ b/tests/Test-idn-cmd.px
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan", as EUC-JP bytes
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} --locale=EUC-JP $euc_jp_hostname";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "$punycoded_hostname/index.html" => {
+        content => $result_file,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-cmd",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 50f33218..38520714 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -25,6 +25,7 @@ my @tests = (
     'Test-HTTP-Content-Disposition.px',
     'Test-idn-headers.px',
     'Test-idn-meta.px',
+    'Test-idn-cmd.px',
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',

From 66dd4bda74bb78915b92cac4e7bfd32a3fe9d957 Mon Sep 17 00:00:00 2001
From: Saint Xavier <wget@sxav.eu>
Date: Sat, 27 Sep 2008 11:13:21 +0200
Subject: [PATCH 55/58] IRI requirement: do not percent-encode already
 percent-encoded values (try1)

---
 src/html-url.c    |  6 +++---
 src/iri.c         |  2 ++
 src/iri.h         |  1 +
 src/recur.c       |  8 ++++----
 src/retr.c        | 13 ++++++++-----
 src/url.c         | 22 ++++++++++++++--------
 src/url.h         |  2 +-
 tests/Test-iri.px |  6 +++---
 8 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/src/html-url.c b/src/html-url.c
index c954cb97..e6ab2324 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -288,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
           return NULL;
         }
 
-      url = url_parse (link_uri, NULL, NULL);
+      url = url_parse (link_uri, NULL, NULL, false);
       if (!url)
         {
           DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@@ -307,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
       DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
                ctx->document_file, base, link_uri, complete_uri));
 
-      url = url_parse (complete_uri, NULL, NULL);
+      url = url_parse (complete_uri, NULL, NULL, false);
       if (!url)
         {
           DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@@ -752,7 +752,7 @@ get_urls_file (const char *file)
           url_text = merged;
         }
 
-      url = url_parse (url_text, &up_error_code, NULL);
+      url = url_parse (url_text, &up_error_code, NULL, false);
       if (!url)
         {
           char *error = url_error (url_text, up_error_code);
diff --git a/src/iri.c b/src/iri.c
index e3909d50..b1e0bf89 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -298,6 +298,7 @@ iri_new (void)
   struct iri *i = xmalloc (sizeof (struct iri));
   i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
   i->content_encoding = NULL;
+  i->orig_url = NULL;
   i->utf8_encode = opt.enable_iri;
   return i;
 }
@@ -308,6 +309,7 @@ iri_free (struct iri *i)
 {
   xfree_null (i->uri_encoding);
   xfree_null (i->content_encoding);
+  xfree_null (i->orig_url);
   xfree (i);
 }
 
diff --git a/src/iri.h b/src/iri.h
index c024de72..6ad2becf 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -33,6 +33,7 @@ as that of the covered work.  */
 struct iri {
   char *uri_encoding;      /* Encoding of the uri to fetch */
   char *content_encoding;  /* Encoding of links inside the fetched file */
+  char *orig_url;          /* Unconverted URL saved by url_parse */
   bool utf8_encode;        /* Will/Is the current url encoded in utf8 */
 };
 
diff --git a/src/recur.c b/src/recur.c
index 78682458..95581486 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -214,7 +214,7 @@ retrieve_tree (const char *start_url, struct iri *pi)
     set_uri_encoding (i, opt.locale, true);
 #undef COPYSTR
 
-  start_url_parsed = url_parse (start_url, &up_error_code, i);
+  start_url_parsed = url_parse (start_url, &up_error_code, i, true);
   if (!start_url_parsed)
     {
       char *error = url_error (start_url, up_error_code);
@@ -381,7 +381,7 @@ retrieve_tree (const char *start_url, struct iri *pi)
           if (children)
             {
               struct urlpos *child = children;
-              struct url *url_parsed = url_parse (url, NULL, i);
+              struct url *url_parsed = url_parse (url, NULL, i, false);
               struct iri *ci;
               char *referer_url = url;
               bool strip_auth = (url_parsed != NULL
@@ -694,10 +694,10 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
   struct urlpos *upos;
   bool success;
 
-  orig_parsed = url_parse (original, NULL, NULL);
+  orig_parsed = url_parse (original, NULL, NULL, false);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL, false);
   assert (new_parsed != NULL);
 
   upos = xnew0 (struct urlpos);
diff --git a/src/retr.c b/src/retr.c
index 28a6d874..fe4e3e76 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -626,7 +626,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     *file = NULL;
 
  second_try:
-  u = url_parse (url, &up_error_code, iri);
+  u = url_parse (url, &up_error_code, iri, true);
   if (!u)
     {
       char *error = url_error (url, up_error_code);
@@ -658,7 +658,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       pi->utf8_encode = false;
 
       /* Parse the proxy URL.  */
-      proxy_url = url_parse (proxy, &up_error_code, NULL);
+      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
       if (!proxy_url)
         {
           char *error = url_error (proxy, up_error_code);
@@ -739,9 +739,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
          the content encoding. */
       iri->utf8_encode = opt.enable_iri;
       set_content_encoding (iri, NULL);
+      /* Clear the pointer too: url_parse below reads iri->orig_url. */
+      xfree_null (iri->orig_url);
+      iri->orig_url = NULL;
 
       /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
       if (!newloc_parsed)
         {
           char *error = url_error (mynewloc, up_error_code);
@@ -794,7 +795,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
   if (!(*dt & RETROKF) && iri->utf8_encode)
     {
       iri->utf8_encode = false;
-      DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
+      DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
       goto second_try;
     }
 
@@ -907,6 +908,8 @@ retrieve_from_file (const char *file, bool html, int *count)
 
       /* Reset UTF-8 encode status */
       iri->utf8_encode = opt.enable_iri;
+      xfree_null (iri->orig_url);
+      iri->orig_url = NULL;
 
       if ((opt.recursive || opt.page_requisites)
           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
@@ -1100,7 +1103,7 @@ url_uses_proxy (const char *url)
   struct iri *i = iri_new();
   /* url was given in the command line, so use locale as encoding */
   set_uri_encoding (i, opt.locale, true);
-  u= url_parse (url, NULL, i);
+  u= url_parse (url, NULL, i, false);
   if (!u)
     return false;
   ret = getproxy (u) != NULL;
diff --git a/src/url.c b/src/url.c
index c937d056..8f067250 100644
--- a/src/url.c
+++ b/src/url.c
@@ -640,7 +640,7 @@ static const char *parse_errors[] = {
    error, and if ERROR is not NULL, also set *ERROR to the appropriate
    error code. */
 struct url *
-url_parse (const char *url, int *error, struct iri *iri)
+url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 {
   struct url *u;
   const char *p;
@@ -672,13 +672,19 @@ url_parse (const char *url, int *error, struct iri *iri)
 
   if (iri && iri->utf8_encode)
     {
-      url_unescape ((char *) url);
-      iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
       if (!iri->utf8_encode)
         new_url = NULL;
+      else
+        iri->orig_url = xstrdup (url);
     }
 
-  url_encoded = reencode_escapes (new_url ? new_url : url);
+  /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
+  if (percent_encode)
+    url_encoded = reencode_escapes (new_url ? new_url : url);
+  else
+     url_encoded = new_url ? new_url : url;
+
   p = url_encoded;
 
   if (new_url && url_encoded != new_url)
@@ -1992,12 +1998,12 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
 
 static int
 getchar_from_escaped_string (const char *str, char *c)
-{  
+{
   const char *p = str;
 
   assert (str && *str);
   assert (c);
-  
+
   if (p[0] == '%')
     {
       if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
@@ -2047,7 +2053,7 @@ are_urls_equal (const char *u1, const char *u2)
       p += pp;
       q += qq;
     }
-  
+
   return (*p == 0 && *q == 0 ? true : false);
 }
 
@@ -2156,7 +2162,7 @@ test_append_uri_pathel()
   } test_array[] = {
     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
   };
-  
+
   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 
     {
       struct growable dest;
diff --git a/src/url.h b/src/url.h
index 0748e214..2fa8d51c 100644
--- a/src/url.h
+++ b/src/url.h
@@ -84,7 +84,7 @@ struct url
 
 char *url_escape (const char *);
 
-struct url *url_parse (const char *, int *, struct iri *iri);
+struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);
 char *url_error (const char *, int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
index d228721c..ca6feddf 100755
--- a/tests/Test-iri.px
+++ b/tests/Test-iri.px
@@ -214,9 +214,9 @@ my %expected_downloaded_files = (
 ###############################################################################
 
 my $the_test = HTTPTest->new (name => "Test-iri",
-                              input => \%urls, 
-                              cmdline => $cmdline, 
-                              errcode => $expected_error_code, 
+                              input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
                               output => \%expected_downloaded_files);
 exit $the_test->run();
 

From 0fa023cfffc896d72ba36a8789154630e585435a Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Wed, 26 Nov 2008 07:14:27 -0800
Subject: [PATCH 56/58] More module-scoped warnings.

---
 tests/ChangeLog                 | 8 ++++++++
 tests/Test-ftp-iri-disabled.px  | 3 ++-
 tests/Test-ftp-iri-fallback.px  | 3 ++-
 tests/Test-ftp-iri.px           | 3 ++-
 tests/Test-idn-cmd.px           | 3 ++-
 tests/Test-idn-headers.px       | 3 ++-
 tests/Test-idn-meta.px          | 3 ++-
 tests/Test-iri-disabled.px      | 3 ++-
 tests/Test-iri-forced-remote.px | 3 ++-
 tests/Test-iri-list.px          | 3 ++-
 tests/Test-iri.px               | 3 ++-
 11 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 25f2ab40..ffe5fddd 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,11 @@
+2008-11-26  Micah Cowan  <micah@cowan.name>  (not copyrightable)
+
+	* Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
+	Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
+	Test-idn-meta.px, Test-iri-disabled.px,
+	Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
+	module-scope warnings.
+
 2008-11-12  Steven Schubiger  <stsc@members.fsf.org>
 
 	* Test-auth-basic.px, Test-auth-no-challenge.px,
diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px
index 14d849da..96122867 100755
--- a/tests/Test-ftp-iri-disabled.px
+++ b/tests/Test-ftp-iri-disabled.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use FTPTest;
 
diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px
index 8902e0f9..091fd008 100755
--- a/tests/Test-ftp-iri-fallback.px
+++ b/tests/Test-ftp-iri-fallback.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use FTPTest;
 
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px
index d453669c..78e2622c 100755
--- a/tests/Test-ftp-iri.px
+++ b/tests/Test-ftp-iri.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use FTPTest;
 
diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px
index a5c156a2..dba98183 100755
--- a/tests/Test-idn-cmd.px
+++ b/tests/Test-idn-cmd.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px
index 3289d5f5..f07621c3 100755
--- a/tests/Test-idn-headers.px
+++ b/tests/Test-idn-headers.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px
index 1397cf45..3d6e0563 100755
--- a/tests/Test-idn-meta.px
+++ b/tests/Test-idn-meta.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px
index 17e43361..02fc4d3a 100755
--- a/tests/Test-iri-disabled.px
+++ b/tests/Test-iri-disabled.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px
index 1acd03a7..8341d516 100755
--- a/tests/Test-iri-forced-remote.px
+++ b/tests/Test-iri-forced-remote.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px
index 51bb09fe..87cc33c8 100755
--- a/tests/Test-iri-list.px
+++ b/tests/Test-iri-list.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 
diff --git a/tests/Test-iri.px b/tests/Test-iri.px
index d228721c..662019e7 100755
--- a/tests/Test-iri.px
+++ b/tests/Test-iri.px
@@ -1,6 +1,7 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
 use strict;
+use warnings;
 
 use HTTPTest;
 

From c6f511e7897e706ebd644e5b573abb5caf9e36f2 Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Thu, 4 Dec 2008 13:57:18 -0800
Subject: [PATCH 57/58] Fix test names.

---
 tests/ChangeLog           | 5 +++++
 tests/Test-idn-cmd.px     | 2 +-
 tests/Test-idn-headers.px | 2 +-
 tests/Test-idn-meta.px    | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 16e7bd3b..8e1a63f6 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,8 @@
+2008-12-04  Micah Cowan  <micah@cowan.name> (not copyrightable)
+
+	* Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
+	Fix test names.
+
 2008-11-26  Micah Cowan  <micah@cowan.name>  (not copyrightable)
 
 	* Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px
index dba98183..2f979624 100755
--- a/tests/Test-idn-cmd.px
+++ b/tests/Test-idn-cmd.px
@@ -40,7 +40,7 @@ my %expected_downloaded_files = (
 
 ###############################################################################
 
-my $the_test = HTTPTest->new (name => "Test-iri-cmd",
+my $the_test = HTTPTest->new (name => "Test-idn-cmd",
                               input => \%urls, 
                               cmdline => $cmdline, 
                               errcode => $expected_error_code, 
diff --git a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px
index f07621c3..b94c1cde 100755
--- a/tests/Test-idn-headers.px
+++ b/tests/Test-idn-headers.px
@@ -55,7 +55,7 @@ my %expected_downloaded_files = (
 
 ###############################################################################
 
-my $the_test = HTTPTest->new (name => "Test-iri-headers",
+my $the_test = HTTPTest->new (name => "Test-idn-headers",
                               input => \%urls, 
                               cmdline => $cmdline, 
                               errcode => $expected_error_code, 
diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px
index 3d6e0563..2734e1ea 100755
--- a/tests/Test-idn-meta.px
+++ b/tests/Test-idn-meta.px
@@ -56,7 +56,7 @@ my %expected_downloaded_files = (
 
 ###############################################################################
 
-my $the_test = HTTPTest->new (name => "Test-iri-meta",
+my $the_test = HTTPTest->new (name => "Test-idn-meta",
                               input => \%urls, 
                               cmdline => $cmdline, 
                               errcode => $expected_error_code, 

From 5d0073b8f290dee2e9bad3e83230f6b57dd06beb Mon Sep 17 00:00:00 2001
From: Micah Cowan <micah@cowan.name>
Date: Thu, 4 Dec 2008 14:25:12 -0800
Subject: [PATCH 58/58] Robots idn test.

---
 tests/ChangeLog          |  3 ++
 tests/Test-idn-robots.px | 78 ++++++++++++++++++++++++++++++++++++++++
 tests/run-px             |  1 +
 3 files changed, 82 insertions(+)
 create mode 100755 tests/Test-idn-robots.px

diff --git a/tests/ChangeLog b/tests/ChangeLog
index 8e1a63f6..d9ba6531 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,5 +1,8 @@
 2008-12-04  Micah Cowan  <micah@cowan.name> (not copyrightable)
 
+	* run-px, Test-idn-robots.px: Added test for robots-file
+	downloads.
+
 	* Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
 	Fix test names.
 
diff --git a/tests/Test-idn-robots.px b/tests/Test-idn-robots.px
new file mode 100755
index 00000000..bc9084ef
--- /dev/null
+++ b/tests/Test-idn-robots.px
@@ -0,0 +1,78 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# "Kon'nichiwa" <dot> "Japan", as EUC-JP bytes
+my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334";
+my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a';
+
+###############################################################################
+
+my $starter_file = <<EOF;
+<a href="http://$euc_jp_hostname/foo.txt">The link</a>
+EOF
+
+my $result_file = <<EOF;
+Found me!
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    "http://$punycoded_hostname/index.html" => {
+        code => "200",
+        msg => "Yes, please",
+        headers => {
+            'Content-Type' => 'text/html; charset=EUC-JP',
+        },
+        content => $starter_file,
+    },
+    "http://$punycoded_hostname/foo.txt" => {
+        code => "200",
+        msg => "Uh-huh",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => $result_file,
+    },
+    "http://$punycoded_hostname/robots.txt" => {
+        code => "200",
+        msg => "Uh-huh",
+        headers => {
+            'Content-Type' => 'text/plain',
+        },
+        content => '',
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH"
+    . " -e http_proxy=localhost:{{port}} --locale=EUC-JP"
+    . " http://$euc_jp_hostname/";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "$punycoded_hostname/index.html" => {
+        content => $starter_file,
+    },
+    "$punycoded_hostname/foo.txt" => {
+        content => $result_file,
+    },
+    "$punycoded_hostname/robots.txt" => {
+        content => '',
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-idn-robots",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 3ab1c444..01d84995 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -34,6 +34,7 @@ my @tests = (
     'Test-idn-headers.px',
     'Test-idn-meta.px',
     'Test-idn-cmd.px',
+    'Test-idn-robots.px',
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',