From d66a45f82859de0d7a92255dd73e544cf64ab7a9 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sat, 24 May 2008 23:57:56 +0200 Subject: [PATCH 01/55] Add autoconf code for IDN/IRIs support --- configure.ac | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/configure.ac b/configure.ac index a994896b..8100f48b 100644 --- a/configure.ac +++ b/configure.ac @@ -458,6 +458,60 @@ else fi AC_SUBST(COMMENT_IF_NO_POD2MAN) + +dnl +dnl Check for IDN/IRIs +dnl + +AC_ARG_ENABLE(iri, + AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]), + [case "${enable_iri}" in + no) + dnl Disable IRIs checking + AC_MSG_NOTICE([disabling IRIs at user request]) + iri=no + ;; + yes) + dnl IRIs explicitly enabled + iri=yes + force_iri=yes + ;; + auto) + dnl Auto-detect IRI + iri=yes + ;; + *) + AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri']) + ;; + esac + ], [ + dnl If nothing is specified, assume auto-detection + iri=yes + ] +) + +AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], + [Support IDN/IRIs (needs GNU Libidn)]), + libidn=$withval, libidn="") +if test "X$iri" != "Xno"; then + if test "$libidn" != ""; then + LDFLAGS="${LDFLAGS} -L$libidn/lib" + CPPFLAGS="${CPPFLAGS} -I$libidn/include" + fi + AC_CHECK_HEADER(idna.h, + AC_CHECK_LIB(idn, stringprep_check_version, + [iri=yes LIBS="${LIBS} -lidn"], iri=no), + iri=no) + + if test "X$iri" != "Xno" ; then + AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.]) + AC_MSG_NOTICE([Enabling support for IRI.]) + else + AC_MSG_WARN([Libidn not found]) + fi +fi + + dnl dnl Create output dnl From d9cd5d220777a9e9abc8b54add709994d031d613 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 25 May 2008 03:34:28 +0200 Subject: [PATCH 02/55] Add options for enabling IRI support and forcing some encodings --- src/init.c | 9 +++++++++ src/main.c | 30 ++++++++++++++++++++++++++++++ src/options.h | 6 ++++++ 3 files changed, 45 insertions(+) diff --git a/src/init.c 
b/src/init.c index 97976553..8e8ed488 100644 --- a/src/init.c +++ b/src/init.c @@ -181,9 +181,15 @@ static const struct { { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, +#ifdef ENABLE_IRI + { "iri", &opt.enable_iri, cmd_boolean }, +#endif { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, +#ifdef ENABLE_IRI + { "locale", &opt.locale, cmd_string }, +#endif { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@ -223,6 +229,9 @@ static const struct { { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, +#ifdef ENABLE_IRI + { "remoteencoding", &opt.encoding_remote, cmd_string }, +#endif { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, diff --git a/src/main.c b/src/main.c index d68cdbd6..4f033697 100644 --- a/src/main.c +++ b/src/main.c @@ -43,6 +43,9 @@ as that of the covered work. 
*/ #include #include #include +#ifdef ENABLE_IRI +#include +#endif #include "utils.h" #include "init.h" @@ -190,10 +193,16 @@ static struct cmdline_option option_data[] = { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, +#ifdef ENABLE_IRI + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, +#endif { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, +#ifdef ENABLE_IRI + { "locale", 0, OPT_VALUE, "locale", -1 }, +#endif { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@ -227,6 +236,9 @@ static struct cmdline_option option_data[] = { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, +#ifdef ENABLE_IRI + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, +#endif { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@ -947,6 +959,24 @@ for details.\n\n")); exit (1); } +#ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (!opt.locale) + { + opt.locale = getenv ("CHARSET"); + + if (opt.locale == NULL) + opt.locale = nl_langinfo(CODESET); + } + else + { + /* sXXXav : check given locale */ + logprintf (LOG_VERBOSE, "Check the locale...\n"); + } + } +#endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); diff --git a/src/options.h b/src/options.h index 6a6badb0..2927a37c 100644 --- a/src/options.h +++ b/src/options.h @@ -237,6 +237,12 @@ struct options bool content_disposition; /* Honor HTTP Content-Disposition header. */ bool auth_without_challenge; /* Issue Basic authentication creds without waiting for a challenge. 
*/ + +#ifdef ENABLE_IRI + bool enable_iri; + char *encoding_remote; + char *locale; +#endif }; extern struct options opt; From 4aab9e8f0af29d8fb5d59746e4e5270080f95468 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 25 May 2008 03:49:00 +0200 Subject: [PATCH 03/55] Detect HTTP Content-Type server encoding --- src/http.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/http.c b/src/http.c index 11dc9cc8..682258c0 100644 --- a/src/http.c +++ b/src/http.c @@ -2040,9 +2040,32 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); char *tmp = strchr (type, ';'); if (tmp) { +#ifdef ENABLE_IRI + char *tmp2 = tmp + 1; +#endif + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + +#ifdef ENABLE_IRI + if (opt.enable_iri && *tmp2 != '\0' && + (tmp = strstr (tmp2, "charset=")) != NULL) + { + tmp += 8; + tmp2 = tmp; + + while (*tmp2 && !c_isspace (*tmp2)) + tmp2++; + + if (tmp2 > tmp) + { + *tmp2 = '\0'; + /* sXXXav : check given charset */ + logprintf (LOG_VERBOSE, "HTTP charset: `%s'\n", tmp); + } + } +#endif } } hs->newloc = resp_header_strdup (resp, "Location"); From ed558a83f6021fa0f2a138b302ece363c1d0783b Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 19 Jun 2008 22:07:03 +0200 Subject: [PATCH 04/55] The preferred way is to avoid #ifdef flooding, so take it that way. 
Introduce iri.c and iri.h for achieving it --- ChangeLog | 4 +++ configure.ac | 4 +++ src/ChangeLog | 15 +++++++++++ src/Makefile.am | 8 ++++-- src/build_info.c | 7 +++++ src/http.c | 25 ++++------------- src/init.c | 6 ----- src/iri.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ src/iri.h | 43 +++++++++++++++++++++++++++++ src/main.c | 13 ++++----- src/options.h | 2 -- 11 files changed, 161 insertions(+), 36 deletions(-) create mode 100644 src/iri.c create mode 100644 src/iri.h diff --git a/ChangeLog b/ChangeLog index 407ce64c..8177dc97 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --git a/configure.ac b/configure.ac index 8100f48b..44f397b9 100644 --- a/configure.ac +++ b/configure.ac @@ -512,6 +512,10 @@ if test "X$iri" != "Xno"; then fi +dnl Needed by src/Makefile.am +AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) + + dnl dnl Create output dnl diff --git a/src/ChangeLog b/src/ChangeLog index 9af9267b..ac27e15a 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,18 @@ +2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM. diff --git a/src/Makefile.am b/src/Makefile.am index c8485cfd..379a9b8c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,6 +30,10 @@ # Version: @VERSION@ # +if IRI_IS_ENABLED +IRI_OBJ = iri.c +endif + # The following line is losing on some versions of make! 
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ @@ -39,10 +43,10 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ recur.c res.c retr.c snprintf.c spider.c url.c \ - utils.c \ + utils.c $(IRI_OBJ) \ connect.h convert.h cookies.h \ ftp.h gen-md5.h hash.h host.h html-parse.h \ - http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ + http.h http-ntlm.h init.h iri.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ spider.h ssl.h sysdep.h url.h utils.h wget.h nodist_wget_SOURCES = version.c diff --git a/src/build_info.c b/src/build_info.c index 1ac682a7..129bf726 100644 --- a/src/build_info.c +++ b/src/build_info.c @@ -100,6 +100,13 @@ const char* (compiled_features[]) = #else "-gettext", #endif + +#ifdef ENABLE_IRI + "+iri", +#else + "-iri", +#endif + /* sentinel value */ NULL }; diff --git a/src/http.c b/src/http.c index 543165fd..741ed2c0 100644 --- a/src/http.c +++ b/src/http.c @@ -49,6 +49,7 @@ as that of the covered work. 
*/ #include "retr.h" #include "connect.h" #include "netrc.h" +#include "iri.h" #ifdef HAVE_SSL # include "ssl.h" #endif @@ -2040,32 +2041,16 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); char *tmp = strchr (type, ';'); if (tmp) { -#ifdef ENABLE_IRI + /* sXXXav: only needed if IRI support is enabled */ char *tmp2 = tmp + 1; -#endif while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; -#ifdef ENABLE_IRI - if (opt.enable_iri && *tmp2 != '\0' && - (tmp = strstr (tmp2, "charset=")) != NULL) - { - tmp += 8; - tmp2 = tmp; - - while (*tmp2 && !c_isspace (*tmp2)) - tmp2++; - - if (tmp2 > tmp) - { - *tmp2 = '\0'; - /* sXXXav : check given charset */ - logprintf (LOG_VERBOSE, "HTTP charset: `%s'\n", tmp); - } - } -#endif + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + /* xxx = */ parse_charset (tmp2); } } hs->newloc = resp_header_strdup (resp, "Location"); diff --git a/src/init.c b/src/init.c index 167c84fe..a7a4ee01 100644 --- a/src/init.c +++ b/src/init.c @@ -181,15 +181,11 @@ static const struct { { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, -#ifdef ENABLE_IRI { "iri", &opt.enable_iri, cmd_boolean }, -#endif { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, -#ifdef ENABLE_IRI { "locale", &opt.locale, cmd_string }, -#endif { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@ -229,9 +225,7 @@ static const struct { { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, -#ifdef ENABLE_IRI { "remoteencoding", &opt.encoding_remote, cmd_string }, -#endif { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, 
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, diff --git a/src/iri.c b/src/iri.c new file mode 100644 index 00000000..e4f4622b --- /dev/null +++ b/src/iri.c @@ -0,0 +1,70 @@ +/* IRI related functions. + Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, + 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at +your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#include "wget.h" + +#include +#include +#include +#include + +#include "utils.h" +#include "iri.h" + + +/* Given a string containing "charset=XXX", return the encoding if found, + or NULL otherwise */ +char * +parse_charset (char *str) +{ + char *charset; + + if (!str || !*str) + return NULL; + + str = strcasestr (str, "charset="); + if (!str) + return NULL; + + str += 8; + charset = str; + + /* sXXXav: which chars should be banned ??? 
*/ + while (*charset && !c_isspace (*charset)) + charset++; + + /* sXXXav: could strdupdelim return NULL ? */ + charset = strdupdelim (str, charset); + logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); + + return charset; +} + + diff --git a/src/iri.h b/src/iri.h new file mode 100644 index 00000000..d135e868 --- /dev/null +++ b/src/iri.h @@ -0,0 +1,43 @@ +/* Internationalization related declarations. + Copyright (C) 2000, 2007, 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. 
*/ + +#ifndef IRI_H +#define IRI_H + +#ifdef ENABLE_IRI + +char *parse_charset (char *str); + + +#else /* ENABLE_IRI */ + +#define parse_charset(str) /* no-op */ + +#endif /* ENABLE_IRI */ +#endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 9b449438..8002c1be 100644 --- a/src/main.c +++ b/src/main.c @@ -203,16 +203,12 @@ static struct cmdline_option option_data[] = { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, -#ifdef ENABLE_IRI { "iri", 0, OPT_BOOLEAN, "iri", -1 }, -#endif { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, -#ifdef ENABLE_IRI { "locale", 0, OPT_VALUE, "locale", -1 }, -#endif { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@ -246,9 +242,7 @@ static struct cmdline_option option_data[] = { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, -#ifdef ENABLE_IRI { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, -#endif { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@ -1085,6 +1079,13 @@ for details.\n\n")); logprintf (LOG_VERBOSE, "Check the locale...\n"); } } +#else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + { + /* sXXXav : be more specific... 
*/ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } #endif if (opt.ask_passwd) diff --git a/src/options.h b/src/options.h index 2927a37c..723f80a1 100644 --- a/src/options.h +++ b/src/options.h @@ -238,11 +238,9 @@ struct options bool auth_without_challenge; /* Issue Basic authentication creds without waiting for a challenge. */ -#ifdef ENABLE_IRI bool enable_iri; char *encoding_remote; char *locale; -#endif }; extern struct options opt; From 13fec855660ee55c43f64fe47fbc284f35ca6e6e Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 19 Jun 2008 22:33:38 +0200 Subject: [PATCH 05/55] Add "content-type" meta tag parsing for retrieving HTML page encoding. --- src/ChangeLog | 8 ++++++++ src/html-url.c | 20 ++++++++++++++++++++ src/iri.h | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/ChangeLog b/src/ChangeLog index ac27e15a..e30990b0 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-14 Xavier Saint * iri.c, iri.h : New files. diff --git a/src/html-url.c b/src/html-url.c index 0eb66506..9b515432 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -42,6 +42,7 @@ as that of the covered work. 
*/ #include "hash.h" #include "convert.h" #include "recur.h" /* declaration of get_urls_html */ +#include "iri.h" struct map_context; @@ -534,6 +535,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset)); + + /* sXXXav: Not used yet */ + xfree (mcharset); + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff like: diff --git a/src/iri.h b/src/iri.h index d135e868..2ac7d5e7 100644 --- a/src/iri.h +++ b/src/iri.h @@ -37,7 +37,7 @@ char *parse_charset (char *str); #else /* ENABLE_IRI */ -#define parse_charset(str) /* no-op */ +#define parse_charset(str) NULL #endif /* ENABLE_IRI */ #endif /* IRI_H */ From 2baf3239333fd28763ce4135c38d6e85dcbb8cfc Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 19 Jun 2008 23:10:06 +0200 Subject: [PATCH 06/55] Introduce find_locale() to find out local system encoding. --- src/ChangeLog | 7 +++++++ src/iri.c | 9 +++++++++ src/iri.h | 3 ++- src/main.c | 15 +++++---------- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index e30990b0..ef69dca6 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,10 @@ +2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). 
+ 2008-06-19 Xavier Saint * html-url.c : Add "content-type" meta tag parsing for diff --git a/src/iri.c b/src/iri.c index e4f4622b..797ffa44 100644 --- a/src/iri.c +++ b/src/iri.c @@ -68,3 +68,12 @@ parse_charset (char *str) } +/* Find the locale used, or fall back on a default value */ +char * +find_locale (void) +{ + /* sXXXav, made our own function or use libidn one ?! */ + return stringprep_locale_charset (); +} + + diff --git a/src/iri.h b/src/iri.h index 2ac7d5e7..eb344d9f 100644 --- a/src/iri.h +++ b/src/iri.h @@ -33,11 +33,12 @@ as that of the covered work. */ #ifdef ENABLE_IRI char *parse_charset (char *str); - +char *find_locale (void); #else /* ENABLE_IRI */ #define parse_charset(str) NULL +#define find_locale() NULL #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 8002c1be..fc41153b 100644 --- a/src/main.c +++ b/src/main.c @@ -57,6 +57,7 @@ as that of the covered work. */ #include "convert.h" #include "spider.h" #include "http.h" /* for save_cookies */ +#include "iri.h" #include #include @@ -1067,17 +1068,11 @@ for details.\n\n")); if (opt.enable_iri) { if (!opt.locale) - { - opt.locale = getenv ("CHARSET"); + opt.locale = find_locale (); - if (opt.locale == NULL) - opt.locale = nl_langinfo(CODESET); - } - else - { - /* sXXXav : check given locale */ - logprintf (LOG_VERBOSE, "Check the locale...\n"); - } + /* sXXXav : check given locale and remote encoding */ + + logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale)); } #else if (opt.enable_iri || opt.locale || opt.encoding_remote) From be546c20cb08f9c25a6bd98bcdf44394e3e8a846 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 19 Jun 2008 23:53:03 +0200 Subject: [PATCH 07/55] Set an initial value for IRIs options and some minor additions in iri.c --- src/ChangeLog | 8 ++++++++ src/init.c | 8 ++++++++ src/iri.c | 4 +++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/ChangeLog b/src/ChangeLog index ef69dca6..7ad7c8ca 100644 --- 
a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + 2008-06-19 Xavier Saint * iri.c, iri.h : Add a new function find_locale() to find diff --git a/src/init.c b/src/init.c index a7a4ee01..f56aa652 100644 --- a/src/init.c +++ b/src/init.c @@ -333,6 +333,14 @@ defaults (void) opt.restrict_files_case = restrict_no_case_restriction; opt.max_redirect = 20; + +#ifdef ENABLE_IRI + opt.enable_iri = true; +#else + opt.enable_iri = false; +#endif + opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is diff --git a/src/iri.c b/src/iri.c index 797ffa44..b87e6ebe 100644 --- a/src/iri.c +++ b/src/iri.c @@ -35,6 +35,8 @@ as that of the covered work. */ #include #include +#include + #include "utils.h" #include "iri.h" @@ -73,7 +75,7 @@ char * find_locale (void) { /* sXXXav, made our own function or use libidn one ?! */ - return stringprep_locale_charset (); + return (char *) stringprep_locale_charset (); } From e6376b47433be6a0df64b0cd87b2d5c2c53a66f1 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Fri, 20 Jun 2008 00:33:02 +0200 Subject: [PATCH 08/55] Introduce a new function check_encoding_name() for doing a basic check on encoding name validity --- src/ChangeLog | 7 +++++++ src/iri.c | 29 +++++++++++++++++++++++++++++ src/iri.h | 6 ++++-- src/main.c | 6 +++++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 7ad7c8ca..6dcaa279 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,10 @@ +2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a priliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). 
+ 2008-06-19 Xavier Saint * iri.c : Include missing stringprep.h file and add a diff --git a/src/iri.c b/src/iri.c index b87e6ebe..fea7b150 100644 --- a/src/iri.c +++ b/src/iri.c @@ -64,6 +64,14 @@ parse_charset (char *str) /* sXXXav: could strdupdelim return NULL ? */ charset = strdupdelim (str, charset); + + /* Do a minimum check on the charset value */ + if (!check_encoding_name (charset)) + { + xfree (charset); + return NULL; + } + logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); return charset; @@ -79,3 +87,24 @@ find_locale (void) } +/* Basic check of an encoding name. */ +bool +check_encoding_name (char *encoding) +{ + char *s = encoding; + + while (*s) + { + if (!c_isascii(*s) || c_isspace(*s)) + { + logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding)); + return false; + } + + s++; + } + + return true; +} + + diff --git a/src/iri.h b/src/iri.h index eb344d9f..85a7fb7f 100644 --- a/src/iri.h +++ b/src/iri.h @@ -34,11 +34,13 @@ as that of the covered work. 
*/ char *parse_charset (char *str); char *find_locale (void); +bool check_encoding_name (char *encoding); #else /* ENABLE_IRI */ -#define parse_charset(str) NULL -#define find_locale() NULL +#define parse_charset(str) NULL +#define find_locale() NULL +#define check_encoding_name(str) false #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index fc41153b..53ea6b91 100644 --- a/src/main.c +++ b/src/main.c @@ -1067,10 +1067,14 @@ for details.\n\n")); #ifdef ENABLE_IRI if (opt.enable_iri) { + if (opt.locale && !check_encoding_name(opt.locale)) + opt.locale = NULL; + if (!opt.locale) opt.locale = find_locale (); - /* sXXXav : check given locale and remote encoding */ + if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote)) + opt.encoding_remote = NULL; logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale)); } From 85ecaaea66d2b140347476081248154f5489e108 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Fri, 20 Jun 2008 00:37:15 +0200 Subject: [PATCH 09/55] Fix a typo in src/ChangeLog --- src/ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ChangeLog b/src/ChangeLog index 6dcaa279..c707eedf 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,7 +1,7 @@ 2008-06-19 Xavier Saint * iri.c, iri.h : New function check_encoding_name() as - a priliminary encoding name check. + a preliminary encoding name check. * main.c, iri.c : Make use of check_encoding_name(). From 5bb11da009c2f3bc4381bc8009c57007fd86534e Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 26 Jun 2008 17:59:07 +0200 Subject: [PATCH 10/55] Basic support of IRIs. 
--- src/ChangeLog | 9 ++++ src/iri.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/iri.h | 10 ++-- src/url.c | 20 +++++++- 4 files changed, 166 insertions(+), 7 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 6dcaa279..288ec11d 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,12 @@ +2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. + + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + + 2008-06-19 Xavier Saint * iri.c, iri.h : New function check_encoding_name() as diff --git a/src/iri.c b/src/iri.c index fea7b150..5fb06d09 100644 --- a/src/iri.c +++ b/src/iri.c @@ -34,13 +34,22 @@ as that of the covered work. */ #include #include #include - +#include #include +#include +#include #include "utils.h" #include "iri.h" +static iconv_t locale2utf8; + + +static bool open_locale_to_utf8 (void); +static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); + + /* Given a string containing "charset=XXX", return the encoding if found, or NULL otherwise */ char * @@ -77,7 +86,6 @@ parse_charset (char *str) return charset; } - /* Find the locale used, or fall back on a default value */ char * find_locale (void) @@ -86,7 +94,6 @@ find_locale (void) return (char *) stringprep_locale_charset (); } - /* Basic check of an encoding name. 
*/ bool check_encoding_name (char *encoding) @@ -107,4 +114,125 @@ check_encoding_name (char *encoding) return true; } +/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */ +static bool +open_locale_to_utf8 (void) +{ + if (locale2utf8) + return true; + + /* sXXXav : That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); + opt.locale = find_locale (); + } + + if (!opt.locale) + return false; + + locale2utf8 = iconv_open ("UTF-8", opt.locale); + if (locale2utf8 != (iconv_t)(-1)) + return true; + + logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", + quote (opt.locale), quote("UTF-8")); + locale2utf8 = NULL; + return false; +} + +/* Return a new string */ +const char * +locale_to_utf8 (const char *str) +{ + char *new; + + if (!strcasecmp (opt.locale, "utf-8")) + return str; + + if (!open_locale_to_utf8 ()) + return str; + + if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new)) + return (const char *) new; + + return str; +} + +/* */ +static bool +do_conversion (iconv_t cd, char *in, size_t inlen, char **out) +{ + /* sXXXav : hummm hard to guess... */ + size_t len, done, outlen = inlen * 2; + int invalid = 0, tooshort = 0; + char *s; + + s = xmalloc (outlen + 1); + *out = s; + len = outlen; + done = 0; + + /* sXXXav : put a maximum looping factor ??? 
*/ + for (;;) + { + if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) + { + *out = s; + *(s + len - outlen - done) = '\0'; + return true; + } + + /* Incomplete or invalid multibyte sequence */ + if (errno == EINVAL || errno == EILSEQ) + { + invalid++; + **out = *in; + in++; + inlen--; + (*out)++; + outlen--; + } + else if (errno == E2BIG) /* Output buffer full */ + { + char *new; + + tooshort++; + done = len; + outlen = done + inlen * 2; + new = xmalloc (outlen + 1); + memcpy (new, s, done); + xfree (s); + s = new; + len = outlen; + *out = s + done; + } + else /* Weird, we got an unspecified error */ + { + logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno); + break; + } + } + + return false; +} + +/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL + on error. */ +char *idn_encode (char *host) +{ + char *new; + int ret; + + /* toASCII UTF-8 NULL terminated string */ + ret = idna_to_ascii_8z (host, &new, 0); + if (ret != IDNA_SUCCESS) + { + logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} diff --git a/src/iri.h b/src/iri.h index 85a7fb7f..4488501d 100644 --- a/src/iri.h +++ b/src/iri.h @@ -35,12 +35,16 @@ as that of the covered work. */ char *parse_charset (char *str); char *find_locale (void); bool check_encoding_name (char *encoding); +const char *locale_to_utf8 (const char *str); +char *idn_encode (char *host); #else /* ENABLE_IRI */ -#define parse_charset(str) NULL -#define find_locale() NULL -#define check_encoding_name(str) false +#define parse_charset(str) NULL +#define find_locale() NULL +#define check_encoding_name(str) false +#define locale_to_utf8(str) (str) +#define idn_encode(str) NULL #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/url.c b/src/url.c index f5d621f9..48b23d6c 100644 --- a/src/url.c +++ b/src/url.c @@ -42,6 +42,7 @@ as that of the covered work. 
*/ #include "utils.h" #include "url.h" #include "host.h" /* for is_valid_ipv6_address */ +#include "iri.h" #ifdef TESTING #include "test.h" @@ -670,6 +671,12 @@ url_parse (const char *url, int *error) goto error; } + if (opt.enable_iri) + { + url_unescape ((char *) url); + url = locale_to_utf8(url); + } + url_encoded = reencode_escapes (url); p = url_encoded; @@ -844,6 +851,17 @@ url_parse (const char *url, int *error) host_modified = true; } + if (opt.enable_iri) + { + char *new = idn_encode (u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } + if (params_b) u->params = strdupdelim (params_b, params_e); if (query_b) @@ -851,7 +869,7 @@ url_parse (const char *url, int *error) if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild From 4c9adcd1e4615ed4dba79958dc610f3367e5ade3 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 26 Jun 2008 19:14:14 +0200 Subject: [PATCH 11/55] Check for libiconv needed for IRIs support --- ChangeLog | 4 ++++ configure.ac | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index 8177dc97..89898414 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. 
+ 2008-06-14 Xavier Saint * configure.ac: Add support for IRIs diff --git a/configure.ac b/configure.ac index 44f397b9..b2923436 100644 --- a/configure.ac +++ b/configure.ac @@ -493,6 +493,19 @@ AC_ARG_ENABLE(iri, AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], [Support IDN/IRIs (needs GNU Libidn)]), libidn=$withval, libidn="") +if test "X$iri" != "Xno"; then + AM_ICONV + + if test "X$am_cv_func_iconv" != "Xyes"; then + iri=no + if test "X$force_iri" = "Xyes"; then + AC_MSG_ERROR([Libiconv is required for IRIs support]) + else + AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found]) + fi + fi +fi + if test "X$iri" != "Xno"; then if test "$libidn" != ""; then LDFLAGS="${LDFLAGS} -L$libidn/lib" From 8c204b746399e3f3a42fbdadd47ad8831727c818 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 26 Jun 2008 21:42:37 +0200 Subject: [PATCH 12/55] Fix a typo in a comment: impplement -> implement --- src/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log.c b/src/log.c index e84e5c61..b62bf9dd 100644 --- a/src/log.c +++ b/src/log.c @@ -43,7 +43,7 @@ as that of the covered work. */ #include "utils.h" #include "log.h" -/* This file impplement support for "logging". Logging means printing +/* This file implement support for "logging". Logging means printing output, plus several additional features: - Cataloguing output by importance. 
You can specify that a log From 26252ac4ca2d62d08e80e77d1f613b0bdbdd9bc5 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Mon, 30 Jun 2008 20:03:01 +0200 Subject: [PATCH 13/55] escnonprint has been replaced by quotearg_style; reflect that change in comments too --- src/ftp-basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ftp-basic.c b/src/ftp-basic.c index 265a1e25..5f250959 100644 --- a/src/ftp-basic.c +++ b/src/ftp-basic.c @@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line) return FTPRERR; /* Strip trailing CRLF before printing the line, so that - escnonprint doesn't include bogus \012 and \015. */ + quoting doesn't include bogus \012 and \015. */ p = strchr (line, '\0'); if (p > line && p[-1] == '\n') *--p = '\0'; From d687972c5052db9500f6d2cd689eee2c6f4c39ab Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Tue, 1 Jul 2008 19:28:24 +0200 Subject: [PATCH 14/55] Fix copyright year and some GNU coding style --- src/iri.c | 9 ++++----- src/iri.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/iri.c b/src/iri.c index 5fb06d09..1792ab62 100644 --- a/src/iri.c +++ b/src/iri.c @@ -1,6 +1,5 @@ /* IRI related functions. - Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, - 2008 Free Software Foundation, Inc. + Copyright (C) 2008 Free Software Foundation, Inc. This file is part of GNU Wget. 
@@ -102,9 +101,9 @@ check_encoding_name (char *encoding) while (*s) { - if (!c_isascii(*s) || c_isspace(*s)) + if (!c_isascii (*s) || c_isspace (*s)) { - logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding)); + logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding)); return false; } @@ -136,7 +135,7 @@ open_locale_to_utf8 (void) return true; logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", - quote (opt.locale), quote("UTF-8")); + quote (opt.locale), quote ("UTF-8")); locale2utf8 = NULL; return false; } diff --git a/src/iri.h b/src/iri.h index 4488501d..64858476 100644 --- a/src/iri.h +++ b/src/iri.h @@ -1,5 +1,5 @@ /* Internationalization related declarations. - Copyright (C) 2000, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 2008 Free Software Foundation, Inc. This file is part of GNU Wget. From 85185bde1b9729a27c3841560232266f77f13166 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Tue, 1 Jul 2008 19:34:37 +0200 Subject: [PATCH 15/55] Emit a message if we found invalid or incomplete multibyte sequences --- src/iri.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/iri.c b/src/iri.c index 1792ab62..dfcb0578 100644 --- a/src/iri.c +++ b/src/iri.c @@ -185,6 +185,10 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Incomplete or invalid multibyte sequence */ if (errno == EINVAL || errno == EILSEQ) { + if (!invalid) + logprintf (LOG_VERBOSE, + "Incomplete or invalide multibyte sequence encountered\n"); + invalid++; **out = *in; in++; From 99396653c22e54e13d9be63f6c333a4b33c6cbda Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Wed, 2 Jul 2008 16:37:28 +0200 Subject: [PATCH 16/55] Show also the hostname in the locale when possible --- src/ChangeLog | 8 ++++++++ src/host.c | 21 +++++++++++++++++++-- src/iri.c | 21 ++++++++++++++++++++- src/iri.h | 2 ++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 9e31b1c4..7aca0527 100644 
--- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-26 Xavier Saint * iri.c, iri.h : New functions locale_to_utf8() and diff --git a/src/host.c b/src/host.c index fdb35b1c..8a1495f0 100644 --- a/src/host.c +++ b/src/host.c @@ -53,6 +53,7 @@ as that of the covered work. */ #include "host.h" #include "url.h" #include "hash.h" +#include "iri.h" #ifndef NO_ADDRESS # define NO_ADDRESS NO_DATA @@ -712,8 +713,24 @@ lookup_host (const char *host, int flags) /* No luck with the cache; resolve HOST. */ if (!silent && !numeric_address) - logprintf (LOG_VERBOSE, _("Resolving %s... "), - quotearg_style (escape_quoting_style, host)); + { + char *str = NULL, *name = NULL; + + if (opt.enable_iri && (name = idn_decode (host)) != NULL) + { + int len = strlen (host) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, host); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Resolving %s... "), + quotearg_style (escape_quoting_style, str ? str : host)); + + if (str) + xfree (str); + } #ifdef ENABLE_IPV6 { diff --git a/src/iri.c b/src/iri.c index dfcb0578..000f6550 100644 --- a/src/iri.c +++ b/src/iri.c @@ -220,7 +220,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) return false; } -/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL +/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL on error. */ char *idn_encode (char *host) { @@ -239,3 +239,22 @@ char *idn_encode (char *host) return new; } +/* Try to decode an ASCII encoded host. Return the new domain in the locale on + success or NULL on error. 
*/ +char *idn_decode (char *host) +{ + char *new; + int ret; + + ret = idna_to_unicode_8zlz (host, &new, 0); + if (ret != IDNA_SUCCESS) + { + logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + + diff --git a/src/iri.h b/src/iri.h index 64858476..3992d76d 100644 --- a/src/iri.h +++ b/src/iri.h @@ -37,6 +37,7 @@ char *find_locale (void); bool check_encoding_name (char *encoding); const char *locale_to_utf8 (const char *str); char *idn_encode (char *host); +char *idn_decode (char *host); #else /* ENABLE_IRI */ @@ -45,6 +46,7 @@ char *idn_encode (char *host); #define check_encoding_name(str) false #define locale_to_utf8(str) (str) #define idn_encode(str) NULL +#define idn_decode(str) NULL #endif /* ENABLE_IRI */ #endif /* IRI_H */ From 3781197ec61b6050222df10206c201c185c8fe2d Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Tue, 8 Jul 2008 00:29:02 +0200 Subject: [PATCH 17/55] Remove an always true condition --- src/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connect.c b/src/connect.c index 1e8f07e5..a6ff0b9b 100644 --- a/src/connect.c +++ b/src/connect.c @@ -266,7 +266,7 @@ connect_to_ip (const ip_address *ip, int port, const char *print) if (print) { const char *txt_addr = print_address (ip); - if (print && 0 != strcmp (print, txt_addr)) + if (0 != strcmp (print, txt_addr)) logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), escnonprint_uri (print), txt_addr, port); else From f50be2a403574a8d2cc01f4be714da9c2d6f748a Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Tue, 8 Jul 2008 00:42:09 +0200 Subject: [PATCH 18/55] Show the hostname we are connecting to in the locale when possible --- src/connect.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/connect.c b/src/connect.c index a6ff0b9b..6cfdb4b7 100644 --- a/src/connect.c +++ b/src/connect.c @@ -58,6 +58,7 @@ as that of the covered work. 
*/ #include "host.h" #include "connect.h" #include "hash.h" +#include "iri.h" /* Define sockaddr_storage where unavailable (presumably on IPv4-only hosts). */ @@ -267,8 +268,24 @@ connect_to_ip (const ip_address *ip, int port, const char *print) { const char *txt_addr = print_address (ip); if (0 != strcmp (print, txt_addr)) - logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), - escnonprint_uri (print), txt_addr, port); + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL) + { + int len = strlen (print) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, print); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), + str ? str : escnonprint_uri (print), txt_addr, port); + + if (str) + xfree (str); + } else logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); } From 6c6e838338c31f1ac3c57c71e4ac34c0401bdf86 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Tue, 8 Jul 2008 00:44:08 +0200 Subject: [PATCH 19/55] No need for initial value --- src/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host.c b/src/host.c index 8a1495f0..fb8158e5 100644 --- a/src/host.c +++ b/src/host.c @@ -714,7 +714,7 @@ lookup_host (const char *host, int flags) if (!silent && !numeric_address) { - char *str = NULL, *name = NULL; + char *str = NULL, *name; if (opt.enable_iri && (name = idn_decode (host)) != NULL) { From 9a2ea3938d09643c6528c3b83b1db4c30f47d981 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 13:10:02 +0200 Subject: [PATCH 20/55] Basic IDN/IRI support --- src/host.c | 4 +- src/html-url.c | 12 +++--- src/http.c | 6 +-- src/iri.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++-- src/iri.h | 20 ++++++++- src/main.c | 17 +++++--- src/recur.c | 35 +++++++++++----- src/retr.c | 47 +++++++++++++-------- src/url.c | 11 +++-- src/url.h | 2 +- 10 files changed, 209 insertions(+), 53 
deletions(-) diff --git a/src/host.c b/src/host.c index fb8158e5..1226a274 100644 --- a/src/host.c +++ b/src/host.c @@ -716,7 +716,7 @@ lookup_host (const char *host, int flags) { char *str = NULL, *name; - if (opt.enable_iri && (name = idn_decode (host)) != NULL) + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) { int len = strlen (host) + strlen (name) + 4; str = xmalloc (len); @@ -725,7 +725,7 @@ lookup_host (const char *host, int flags) xfree (name); } - logprintf (LOG_VERBOSE, _("Resolving %s... "), + logprintf (LOG_VERBOSE, _("Resolving %s... "), quotearg_style (escape_quoting_style, str ? str : host)); if (str) diff --git a/src/html-url.c b/src/html-url.c index 9b515432..0d580f9a 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -274,6 +274,7 @@ append_url (const char *link_uri, struct urlpos *newel; const char *base = ctx->base ? ctx->base : ctx->parent_base; struct url *url; + bool utf8_encode = false; if (!base) { @@ -292,7 +293,7 @@ append_url (const char *link_uri, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, &utf8_encode); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -311,7 +312,7 @@ append_url (const char *link_uri, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, &utf8_encode); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -549,9 +550,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset)); + /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ - /* sXXXav: Not used yet */ + set_current_charset (mcharset); xfree (mcharset); } else if (name && 0 == strcasecmp (name, "robots")) @@ -660,6 +661,7 @@ get_urls_file (const char *file) struct file_memory *fm; struct urlpos *head, *tail; const char 
*text, *text_end; + bool utf8_encode = false; /* Load the file. */ fm = read_file (file); @@ -711,7 +713,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, &utf8_encode); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/http.c b/src/http.c index a4571ad7..df9ca2bb 100644 --- a/src/http.c +++ b/src/http.c @@ -1825,7 +1825,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@ -1894,7 +1894,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@ -2055,7 +2055,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); /* Try to get remote encoding if needed */ if (opt.enable_iri && !opt.encoding_remote) - /* xxx = */ parse_charset (tmp2); + set_current_charset (parse_charset (tmp2)); } } hs->newloc = resp_header_strdup (resp, "Location"); diff --git a/src/iri.c b/src/iri.c index 000f6550..32eb7210 100644 --- a/src/iri.c +++ b/src/iri.c @@ -41,6 +41,8 @@ as that of the covered work. 
*/ #include "utils.h" #include "iri.h" +char *remote; +char *current; static iconv_t locale2utf8; @@ -80,7 +82,7 @@ parse_charset (char *str) return NULL; } - logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ return charset; } @@ -196,7 +198,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) (*out)++; outlen--; } - else if (errno == E2BIG) /* Output buffer full */ + else if (errno == E2BIG) /* Output buffer full */ { char *new; @@ -222,15 +224,29 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL on error. */ -char *idn_encode (char *host) +char * +idn_encode (char *host, bool utf8_encoded) { char *new; int ret; + /* Encode to UTF-8 if not done using current remote */ + if (!utf8_encoded) + { + if (!remote_to_utf8 ((const char *) host, (const char **) &new)) + { + /* Nothing to encode or an error occured */ + return NULL; + } + + host = new; + } + /* toASCII UTF-8 NULL terminated string */ ret = idna_to_ascii_8z (host, &new, 0); if (ret != IDNA_SUCCESS) { + /* sXXXav : free new when needed ! */ logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, quote (idna_strerror (ret))); return NULL; @@ -241,7 +257,8 @@ char *idn_encode (char *host) /* Try to decode an ASCII encoded host. Return the new domain in the locale on success or NULL on error. 
*/ -char *idn_decode (char *host) +char * +idn_decode (char *host) { char *new; int ret; @@ -257,4 +274,87 @@ char *idn_decode (char *host) return new; } +/* Return a new string */ +bool +remote_to_utf8 (const char *str, const char **new) +{ + char *remote; + iconv_t cd; + bool ret = false; + + if (opt.encoding_remote) + remote = opt.encoding_remote; + else if (current) + remote = current; + else + return false; + + cd = iconv_open ("UTF-8", remote); + if (cd == (iconv_t)(-1)) + return false; + + if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + ret = true; + + iconv_close (cd); + + /* Test if something was converted */ + if (!strcmp (str, *new)) + { + xfree ((char *) *new); + return false; + } + + return ret; +} + +char *get_remote_charset (void) +{ + return remote; +} + +char *get_current_charset (void) +{ + return current; +} + +void set_current_charset (char *charset) +{ + /*printf("[ current = `%s'\n", charset);*/ + + if (current) + xfree (current); + + current = charset ? xstrdup (charset) : NULL; +} + +void set_current_as_locale (void) +{ + /*printf("[ current = locale = `%s'\n", opt.locale);*/ + if (current) + xfree (current); + + /* sXXXav : assert opt.locale NULL ? */ + current = xstrdup (opt.locale); +} + +void +set_remote_charset (char *charset) +{ + /*printf("[ remote = `%s'\n", charset);*/ + if (remote) + xfree (remote); + + remote = charset ? xstrdup (charset) : NULL; +} + +void +set_remote_as_current (void) +{ + /*printf("[ remote = current = `%s'\n", current);*/ + if (remote) + xfree (remote); + + remote = current ? 
xstrdup (current) : NULL; +} diff --git a/src/iri.h b/src/iri.h index 3992d76d..837dbfdd 100644 --- a/src/iri.h +++ b/src/iri.h @@ -36,8 +36,16 @@ char *parse_charset (char *str); char *find_locale (void); bool check_encoding_name (char *encoding); const char *locale_to_utf8 (const char *str); -char *idn_encode (char *host); +char *idn_encode (char *host, bool utf8_encoded); char *idn_decode (char *host); +char *get_remote_charset (void); +char *get_current_charset (void); +void set_current_charset (char *charset); +void set_current_as_locale (void); +void set_current_charset (char *charset); +void set_remote_charset (char *charset); +void set_remote_as_current (void); +bool remote_to_utf8 (const char *str, const char **new); #else /* ENABLE_IRI */ @@ -45,8 +53,16 @@ char *idn_decode (char *host); #define find_locale() NULL #define check_encoding_name(str) false #define locale_to_utf8(str) (str) -#define idn_encode(str) NULL +#define idn_encode(str,encoded) NULL #define idn_decode(str) NULL +#define get_remote_charset() NULL +#define get_current_charset() NULL +#define set_current_charset(str) +#define set_current_as_locale() +#define set_current_charset(str) +#define set_remote_charset(str) +#define set_remote_as_current() +#define remote_to_utf8(a,b) false #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 53ea6b91..d0ff1d21 100644 --- a/src/main.c +++ b/src/main.c @@ -1067,16 +1067,16 @@ for details.\n\n")); #ifdef ENABLE_IRI if (opt.enable_iri) { - if (opt.locale && !check_encoding_name(opt.locale)) + if (opt.locale && !check_encoding_name (opt.locale)) opt.locale = NULL; if (!opt.locale) opt.locale = find_locale (); - if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote)) + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) opt.encoding_remote = NULL; - logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale)); + /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/ } #else if 
(opt.enable_iri || opt.locale || opt.encoding_remote) @@ -1190,21 +1190,26 @@ WARNING: Can't reopen standard output in binary mode;\n\ char *filename = NULL, *redirected_URL = NULL; int dt; + set_current_as_locale (); + if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (*t); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + set_remote_as_current (); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + } if (opt.delete_after && file_exists_p(filename)) { diff --git a/src/recur.c b/src/recur.c index d1d0f18d..e5f2b929 100644 --- a/src/recur.c +++ b/src/recur.c @@ -49,6 +49,7 @@ as that of the covered work. */ #include "res.h" #include "convert.h" #include "spider.h" +#include "iri.h" /* Functions for maintaining the URL queue. */ @@ -58,7 +59,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. 
*/ - + char *remote_encoding; struct queue_element *next; /* next element in queue */ }; @@ -94,12 +95,18 @@ url_enqueue (struct url_queue *queue, const char *url, const char *referer, int depth, bool html_allowed) { struct queue_element *qel = xnew (struct queue_element); + char *charset = get_current_charset (); qel->url = url; qel->referer = referer; qel->depth = depth; qel->html_allowed = html_allowed; qel->next = NULL; + if (charset) + qel->remote_encoding = xstrdup (charset); + else + qel->remote_encoding = NULL; + ++queue->count; if (queue->count > queue->maxcount) queue->maxcount = queue->count; @@ -107,6 +114,8 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/ + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@ -132,6 +141,10 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; + set_remote_charset (qel->remote_encoding); + if (qel->remote_encoding) + xfree (qel->remote_encoding); + *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -177,6 +190,7 @@ uerr_t retrieve_tree (const char *start_url) { uerr_t status = RETROK; + bool utf8_encode = false; /* The queue of URLs we need to load. 
*/ struct url_queue *queue; @@ -186,7 +200,7 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code); + struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode); if (!start_url_parsed) { @@ -324,7 +338,7 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode); char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -360,18 +374,18 @@ retrieve_tree (const char *start_url) } } - if (file - && (opt.delete_after + if (file + && (opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? 
"--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@ -627,11 +641,12 @@ descend_redirect_p (const char *redirected, const char *original, int depth, struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; + bool utf8_encode = false; - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, &utf8_encode); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, &utf8_encode); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); diff --git a/src/retr.c b/src/retr.c index 179430ac..05ffe1d0 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,6 +51,7 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" +#include "iri.h" /* Total size of downloaded files. Used to enforce quota. */ SUM_SIZE_INT total_downloaded_bytes; @@ -612,6 +613,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, char *saved_post_data = NULL; char *saved_post_file_name = NULL; + bool utf8_encoded = opt.enable_iri; + /* If dt is NULL, use local storage. */ if (!dt) { @@ -624,7 +627,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - u = url_parse (url, &up_error_code); + second_try: + u = url_parse (url, &up_error_code, &utf8_encoded); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -632,6 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } + /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/ + if (!refurl) refurl = opt.referer; @@ -645,8 +651,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + /* sXXXav : support IRI for proxy */ + bool proxy_utf8_encode = false; /* Parse the proxy URL. 
*/ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -721,8 +729,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + utf8_encoded = opt.enable_iri; + /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -769,16 +779,21 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (result != RETROK && utf8_encoded) { - if (*dt & RETROKF) - { - register_download (u->url, local_file); - if (redirection_count && 0 != strcmp (origurl, u->url)) - register_redirection (origurl, u->url); - if (*dt & TEXTHTML) - register_html (u->url, local_file); - } + utf8_encoded = false; + /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); } if (file) @@ -843,9 +858,9 @@ retrieve_from_file (const char *file, bool html, int *count) int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (cur_url->url->url); opt.follow_ftp = old_follow_ftp; @@ -1021,8 +1036,8 @@ getproxy (struct url *u) bool url_uses_proxy (const char *url) { - bool ret; - struct url *u = url_parse (url, NULL); + bool ret, utf8_encode = false; + struct url 
*u = url_parse (url, NULL, &utf8_encode); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/url.c b/src/url.c index 48b23d6c..32de9c75 100644 --- a/src/url.c +++ b/src/url.c @@ -641,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, bool *utf8_encode) { struct url *u; const char *p; @@ -671,10 +671,13 @@ url_parse (const char *url, int *error) goto error; } - if (opt.enable_iri) + if (opt.enable_iri && *utf8_encode) { + const char *new; url_unescape ((char *) url); - url = locale_to_utf8(url); + *utf8_encode = remote_to_utf8 (url, &new); + if (*utf8_encode) + url = new; } url_encoded = reencode_escapes (url); @@ -853,7 +856,7 @@ url_parse (const char *url, int *error) if (opt.enable_iri) { - char *new = idn_encode (u->host); + char *new = idn_encode (u->host, *utf8_encode); if (new) { xfree (u->host); diff --git a/src/url.h b/src/url.h index 7c8bcfed..a174568e 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, bool *); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); From da6b3f4b614fb8b28bf388b66f21efc5d553ebb9 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 18:20:18 +0200 Subject: [PATCH 21/55] Use dt rather than result --- src/recur.c | 2 +- src/retr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/recur.c b/src/recur.c index e5f2b929..d8279c39 100644 --- a/src/recur.c +++ b/src/recur.c @@ -450,7 +450,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { - if (opt.spider) + if (opt.spider) { char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); 
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); diff --git a/src/retr.c b/src/retr.c index 05ffe1d0..02106081 100644 --- a/src/retr.c +++ b/src/retr.c @@ -780,7 +780,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, } /* Try to not encode in UTF-8 if fetching failed */ - if (result != RETROK && utf8_encoded) + if (!(*dt & RETROKF) && utf8_encoded) { utf8_encoded = false; /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ From 1e9ced017082976d257a7a158d9b6aca49f3c690 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 18:47:52 +0200 Subject: [PATCH 22/55] Get rid of the supplementary bool pointer in url_parse () arguments; UGLY :) --- src/html-url.c | 14 +++++++++----- src/iri.c | 27 +++++++++++++++++++++++++++ src/iri.h | 10 ++++++++++ src/main.c | 1 + src/recur.c | 17 +++++++++++------ src/retr.c | 26 +++++++++++++++----------- src/url.c | 12 +++++++----- src/url.h | 2 +- 8 files changed, 81 insertions(+), 28 deletions(-) diff --git a/src/html-url.c b/src/html-url.c index 0d580f9a..5a0682d3 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -274,7 +274,6 @@ append_url (const char *link_uri, struct urlpos *newel; const char *base = ctx->base ? 
ctx->base : ctx->parent_base; struct url *url; - bool utf8_encode = false; if (!base) { @@ -293,7 +292,9 @@ append_url (const char *link_uri, return NULL; } - url = url_parse (link_uri, NULL, &utf8_encode); + set_ugly_no_encode (true); + url = url_parse (link_uri, NULL); + set_ugly_no_encode (false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -312,7 +313,9 @@ append_url (const char *link_uri, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL, &utf8_encode); + set_ugly_no_encode (true); + url = url_parse (complete_uri, NULL); + set_ugly_no_encode (false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -661,7 +664,6 @@ get_urls_file (const char *file) struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; - bool utf8_encode = false; /* Load the file. */ fm = read_file (file); @@ -713,7 +715,9 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code, &utf8_encode); + set_ugly_no_encode (true); + url = url_parse (url_text, &up_error_code); + set_ugly_no_encode (false); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/iri.c b/src/iri.c index 32eb7210..e5be2cf8 100644 --- a/src/iri.c +++ b/src/iri.c @@ -43,6 +43,8 @@ as that of the covered work. */ char *remote; char *current; +bool utf8_encode; +bool ugly_no_encode; static iconv_t locale2utf8; @@ -358,3 +360,28 @@ set_remote_as_current (void) remote = current ? 
xstrdup (current) : NULL; } +void reset_utf8_encode (void) +{ + set_utf8_encode (opt.enable_iri); +} + +void set_utf8_encode (bool encode) +{ + utf8_encode = encode; +} + +bool get_utf8_encode (void) +{ + return utf8_encode; +} + +void set_ugly_no_encode (bool ugly) +{ + ugly_no_encode = ugly; +} + +bool get_ugly_no_encode (void) +{ + return ugly_no_encode; +} + diff --git a/src/iri.h b/src/iri.h index 837dbfdd..413fb2f6 100644 --- a/src/iri.h +++ b/src/iri.h @@ -46,6 +46,13 @@ void set_current_charset (char *charset); void set_remote_charset (char *charset); void set_remote_as_current (void); bool remote_to_utf8 (const char *str, const char **new); +void reset_utf8_encode (void); +void set_utf8_encode (bool encode); +bool get_utf8_encode (void); + +/* ugly ugly ugly */ +void set_ugly_no_encode (bool ugly); +bool get_ugly_no_encode (void); #else /* ENABLE_IRI */ @@ -63,6 +70,9 @@ bool remote_to_utf8 (const char *str, const char **new); #define set_remote_charset(str) #define set_remote_as_current() #define remote_to_utf8(a,b) false +#define reset_utf8_encode() +#define set_utf8_encode(a) +#define get_utf8_encode() false #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index d0ff1d21..bf49bf89 100644 --- a/src/main.c +++ b/src/main.c @@ -1191,6 +1191,7 @@ WARNING: Can't reopen standard output in binary mode;\n\ int dt; set_current_as_locale (); + set_ugly_no_encode (false); if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) diff --git a/src/recur.c b/src/recur.c index d8279c39..6f5da2ae 100644 --- a/src/recur.c +++ b/src/recur.c @@ -190,7 +190,6 @@ uerr_t retrieve_tree (const char *start_url) { uerr_t status = RETROK; - bool utf8_encode = false; /* The queue of URLs we need to load. 
*/ struct url_queue *queue; @@ -200,8 +199,11 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode); + struct url *start_url_parsed; + set_ugly_no_encode (true); + start_url_parsed= url_parse (start_url, &up_error_code); + set_ugly_no_encode (false); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, @@ -338,7 +340,9 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode); + set_ugly_no_encode (true); + struct url *url_parsed = url_parse (url, NULL); + set_ugly_no_encode (false); char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -641,13 +645,14 @@ descend_redirect_p (const char *redirected, const char *original, int depth, struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; - bool utf8_encode = false; - orig_parsed = url_parse (original, NULL, &utf8_encode); + set_ugly_no_encode (true); + orig_parsed = url_parse (original, NULL); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL, &utf8_encode); + new_parsed = url_parse (redirected, NULL); assert (new_parsed != NULL); + set_ugly_no_encode (false); upos = xnew0 (struct urlpos); upos->url = new_parsed; diff --git a/src/retr.c b/src/retr.c index 02106081..dd4978a7 100644 --- a/src/retr.c +++ b/src/retr.c @@ -613,8 +613,6 @@ retrieve_url (const char *origurl, char **file, char **newloc, char *saved_post_data = NULL; char *saved_post_file_name = NULL; - bool utf8_encoded = opt.enable_iri; - /* If dt is NULL, use local storage. 
*/ if (!dt) { @@ -627,8 +625,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; + reset_utf8_encode (); + second_try: - u = url_parse (url, &up_error_code, &utf8_encoded); + u = url_parse (url, &up_error_code); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -652,9 +652,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (proxy) { /* sXXXav : support IRI for proxy */ - bool proxy_utf8_encode = false; /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode); + set_ugly_no_encode (true); + proxy_url = url_parse (proxy, &up_error_code); + set_ugly_no_encode (false); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -729,10 +730,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; - utf8_encoded = opt.enable_iri; + reset_utf8_encode (); /* Now, see if this new location makes sense. 
*/ - newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded); + newloc_parsed = url_parse (mynewloc, &up_error_code); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -780,9 +781,9 @@ retrieve_url (const char *origurl, char **file, char **newloc, } /* Try to not encode in UTF-8 if fetching failed */ - if (!(*dt & RETROKF) && utf8_encoded) + if (!(*dt & RETROKF) && get_utf8_encode ()) { - utf8_encoded = false; + set_utf8_encode (false); /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ goto second_try; } @@ -1036,8 +1037,11 @@ getproxy (struct url *u) bool url_uses_proxy (const char *url) { - bool ret, utf8_encode = false; - struct url *u = url_parse (url, NULL, &utf8_encode); + bool ret; + struct url *u; + set_ugly_no_encode(true); + u= url_parse (url, NULL); + set_ugly_no_encode(false); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/url.c b/src/url.c index 32de9c75..c9489597 100644 --- a/src/url.c +++ b/src/url.c @@ -641,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. 
*/ struct url * -url_parse (const char *url, int *error, bool *utf8_encode) +url_parse (const char *url, int *error) { struct url *u; const char *p; @@ -671,12 +671,14 @@ url_parse (const char *url, int *error, bool *utf8_encode) goto error; } - if (opt.enable_iri && *utf8_encode) + if (opt.enable_iri && get_utf8_encode () && !get_ugly_no_encode ()) { const char *new; + bool utf8_encode; url_unescape ((char *) url); - *utf8_encode = remote_to_utf8 (url, &new); - if (*utf8_encode) + utf8_encode = remote_to_utf8 (url, &new); + set_utf8_encode (utf8_encode); + if (utf8_encode) url = new; } @@ -856,7 +858,7 @@ url_parse (const char *url, int *error, bool *utf8_encode) if (opt.enable_iri) { - char *new = idn_encode (u->host, *utf8_encode); + char *new = idn_encode (u->host, get_utf8_encode ()); if (new) { xfree (u->host); diff --git a/src/url.h b/src/url.h index a174568e..7c8bcfed 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *, bool *); +struct url *url_parse (const char *, int *); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); From 7410cb97644ba0b9e327b2c37b4e39fcec5b3690 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 18:52:20 +0200 Subject: [PATCH 23/55] In spider mode, do not report links as broken if they were utf8 encoded --- src/http.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/http.c b/src/http.c index df9ca2bb..f79327c3 100644 --- a/src/http.c +++ b/src/http.c @@ -2350,16 +2350,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; /* Assert that no value for *LOCAL_FILE was passed. 
*/ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@ -2396,7 +2396,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), +File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@ -2412,10 +2412,10 @@ File %s already there; not retrieving.\n\n"), /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@ -2424,21 +2424,21 @@ File %s already there; not retrieving.\n\n"), /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition && file_exists_p (url_file_name (u))) send_head_first = true; - + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@ -2447,20 +2447,20 @@ Spider mode enabled. 
Check if remote file exists.\n")); if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@ -2470,7 +2470,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@ -2507,7 +2507,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Time? */ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@ -2546,7 +2546,7 @@ Spider mode enabled. Check if remote file exists.\n")); hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@ -2562,7 +2562,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@ -2581,11 +2581,13 @@ Spider mode enabled. Check if remote file exists.\n")); continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was utf8 encoded because we will try + * one unencoded. */ + else if (opt.spider && !get_utf8_encode ()) { /* #### Again: ugly ugly ugly! 
*/ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@ -2594,7 +2596,7 @@ Remote file does not exist -- broken link!!!\n")); else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); From 24d68b7a25aa3def28bbd3681898078146239227 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 19:08:28 +0200 Subject: [PATCH 24/55] Add some comments in iri.c and change a variable name which was the same for a global and a local one --- src/iri.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/iri.c b/src/iri.c index e5be2cf8..5108d999 100644 --- a/src/iri.c +++ b/src/iri.c @@ -41,14 +41,22 @@ as that of the covered work. */ #include "utils.h" #include "iri.h" +/* Note: locale encoding is kept in options struct (opt.locale) */ + +/* Hold the encoding used for the current fetch */ char *remote; + +/* Hold the encoding for the future found links */ char *current; + +/* Will/Is the current URL encoded in utf8 ? */ bool utf8_encode; + +/* Force no utf8 encoding for url_parse () */ bool ugly_no_encode; static iconv_t locale2utf8; - static bool open_locale_to_utf8 (void); static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); @@ -93,7 +101,6 @@ parse_charset (char *str) char * find_locale (void) { - /* sXXXav, made our own function or use libidn one ?! */ return (char *) stringprep_locale_charset (); } @@ -144,7 +151,8 @@ open_locale_to_utf8 (void) return false; } -/* Return a new string */ +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. 
*/ const char * locale_to_utf8 (const char *str) { @@ -162,7 +170,9 @@ locale_to_utf8 (const char *str) return str; } -/* */ +/* Do the conversion according to the passed conversion descriptor cd. *out + will containes the transcoded string on success. *out content is + unspecified otherwise. */ static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out) { @@ -176,7 +186,6 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) len = outlen; done = 0; - /* sXXXav : put a maximum looping factor ??? */ for (;;) { if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) @@ -224,7 +233,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) return false; } -/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL +/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */ char * idn_encode (char *host, bool utf8_encoded) @@ -257,8 +266,8 @@ idn_encode (char *host, bool utf8_encoded) return new; } -/* Try to decode an ASCII encoded host. Return the new domain in the locale on - success or NULL on error. */ +/* Try to decode an "ASCII encoded" host. Return the new domain in the locale + on success or NULL on error. */ char * idn_decode (char *host) { @@ -276,22 +285,23 @@ idn_decode (char *host) return new; } -/* Return a new string */ +/* Try to transcode string str from remote encoding to UTF-8. On success, *new + contains the transcoded string. *new content is unspecified otherwise. 
*/ bool remote_to_utf8 (const char *str, const char **new) { - char *remote; + char *r; iconv_t cd; bool ret = false; if (opt.encoding_remote) - remote = opt.encoding_remote; + r = opt.encoding_remote; else if (current) - remote = current; + r = current; else return false; - cd = iconv_open ("UTF-8", remote); + cd = iconv_open ("UTF-8", r); if (cd == (iconv_t)(-1)) return false; @@ -323,7 +333,6 @@ char *get_current_charset (void) void set_current_charset (char *charset) { /*printf("[ current = `%s'\n", charset);*/ - if (current) xfree (current); From 169a16fc7ddb348cc4f0a5ebd149f754b5042478 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 19:29:51 +0200 Subject: [PATCH 25/55] Make get_utf8_encode() directly aware of ugly_no_encode and remove get_ugly_no_encode() --- src/iri.c | 7 +------ src/iri.h | 1 - src/url.c | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/iri.c b/src/iri.c index 5108d999..1f421d43 100644 --- a/src/iri.c +++ b/src/iri.c @@ -381,7 +381,7 @@ void set_utf8_encode (bool encode) bool get_utf8_encode (void) { - return utf8_encode; + return (!ugly_no_encode && utf8_encode); } void set_ugly_no_encode (bool ugly) @@ -389,8 +389,3 @@ void set_ugly_no_encode (bool ugly) ugly_no_encode = ugly; } -bool get_ugly_no_encode (void) -{ - return ugly_no_encode; -} - diff --git a/src/iri.h b/src/iri.h index 413fb2f6..58389813 100644 --- a/src/iri.h +++ b/src/iri.h @@ -52,7 +52,6 @@ bool get_utf8_encode (void); /* ugly ugly ugly */ void set_ugly_no_encode (bool ugly); -bool get_ugly_no_encode (void); #else /* ENABLE_IRI */ diff --git a/src/url.c b/src/url.c index c9489597..beaf0fb2 100644 --- a/src/url.c +++ b/src/url.c @@ -671,7 +671,7 @@ url_parse (const char *url, int *error) goto error; } - if (opt.enable_iri && get_utf8_encode () && !get_ugly_no_encode ()) + if (opt.enable_iri && get_utf8_encode ()) { const char *new; bool utf8_encode; From ee8ff7488f5402e7f252feabc2e9c70b64354605 Mon Sep 17 00:00:00 2001 From: Saint 
Xavier Date: Sun, 20 Jul 2008 19:31:09 +0200 Subject: [PATCH 26/55] Add a missing no-op macro for set_ugly_no_encode() --- src/iri.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/iri.h b/src/iri.h index 58389813..50102df4 100644 --- a/src/iri.h +++ b/src/iri.h @@ -72,6 +72,7 @@ void set_ugly_no_encode (bool ugly); #define reset_utf8_encode() #define set_utf8_encode(a) #define get_utf8_encode() false +#define set_ugly_no_encode(a) #endif /* ENABLE_IRI */ #endif /* IRI_H */ From 5982054a98a20a00fdb0e701530af3e7a2981873 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 20:37:22 +0200 Subject: [PATCH 27/55] Use the right flags for idna conversion (RFC3987 section 3.1) --- src/iri.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/iri.c b/src/iri.c index 1f421d43..c28d4f51 100644 --- a/src/iri.c +++ b/src/iri.c @@ -41,6 +41,9 @@ as that of the covered work. */ #include "utils.h" #include "iri.h" +/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ +#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES + /* Note: locale encoding is kept in options struct (opt.locale) */ /* Hold the encoding used for the current fetch */ @@ -254,7 +257,7 @@ idn_encode (char *host, bool utf8_encoded) } /* toASCII UTF-8 NULL terminated string */ - ret = idna_to_ascii_8z (host, &new, 0); + ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS); if (ret != IDNA_SUCCESS) { /* sXXXav : free new when needed ! 
*/ @@ -274,7 +277,7 @@ idn_decode (char *host) char *new; int ret; - ret = idna_to_unicode_8zlz (host, &new, 0); + ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS); if (ret != IDNA_SUCCESS) { logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret, From c31e00b52d49632dd0f005269ab2b820c7fd2c34 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Mon, 21 Jul 2008 19:34:22 +0200 Subject: [PATCH 28/55] Do not free/duplicate current/remote encoding string if they aren't changed --- src/iri.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/iri.c b/src/iri.c index c28d4f51..d23615ae 100644 --- a/src/iri.c +++ b/src/iri.c @@ -337,18 +337,27 @@ void set_current_charset (char *charset) { /*printf("[ current = `%s'\n", charset);*/ if (current) - xfree (current); + { + /* Do nothing if already equal */ + if (!strcasecmp (current, charset)) + return; + xfree (current); + } current = charset ? xstrdup (charset) : NULL; } void set_current_as_locale (void) { + /* sXXXav : assert opt.locale NULL ? */ /*printf("[ current = locale = `%s'\n", opt.locale);*/ if (current) - xfree (current); + { + if (!strcasecmp (current, opt.locale)) + return; + xfree (current); + } - /* sXXXav : assert opt.locale NULL ? */ current = xstrdup (opt.locale); } @@ -357,8 +366,12 @@ set_remote_charset (char *charset) { /*printf("[ remote = `%s'\n", charset);*/ if (remote) - xfree (remote); - + { + /* Do nothing if already equal */ + if (!strcasecmp (remote, charset)) + return; + xfree (remote); + } remote = charset ? xstrdup (charset) : NULL; } @@ -367,7 +380,12 @@ set_remote_as_current (void) { /*printf("[ remote = current = `%s'\n", current);*/ if (remote) - xfree (remote); + { + /* Do nothing if already equal */ + if (current && !strcasecmp (remote, current)) + return; + xfree (remote); + } remote = current ? 
xstrdup (current) : NULL; } From d82f80ecab9bfef857d780f894cca7e890780ce0 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 24 Jul 2008 00:56:29 +0200 Subject: [PATCH 29/55] Change global variable model for state-object --- src/convert.c | 2 +- src/html-url.c | 29 ++++++------ src/html-url.h | 2 +- src/http.c | 20 ++++---- src/http.h | 2 +- src/iri.c | 126 +++++++++++++++---------------------------------- src/iri.h | 48 ++++++++----------- src/main.c | 11 ++--- src/recur.c | 75 +++++++++++++++-------------- src/res.c | 13 +++-- src/res.h | 2 +- src/retr.c | 53 ++++++++++++--------- src/retr.h | 3 +- src/url.c | 37 ++++++++------- src/url.h | 2 +- src/wget.h | 3 ++ 16 files changed, 197 insertions(+), 231 deletions(-) diff --git a/src/convert.c b/src/convert.c index e72a4b0f..54004ad0 100644 --- a/src/convert.c +++ b/src/convert.c @@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, /* Parse the file... */ urls = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, NULL); + get_urls_html (file, url, NULL, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the diff --git a/src/html-url.c b/src/html-url.c index ef93a7e4..6e886083 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -44,7 +44,6 @@ as that of the covered work. 
*/ #include "recur.h" #include "html-url.h" #include "css-url.h" -#include "iri.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -175,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - set_ugly_no_encode (true); - url = url_parse (link_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - set_ugly_no_encode (true); - url = url_parse (complete_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) return; /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ - - set_current_charset (mcharset); - xfree (mcharset); + xfree_null (meta_charset); + meta_charset = mcharset; } else if (name && 0 == strcasecmp (name, "robots")) { @@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. 
*/ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -750,9 +753,7 @@ get_urls_file (const char *file) url_text = merged; } - set_ugly_no_encode (true); - url = url_parse (url_text, &up_error_code); - set_ugly_no_encode (false); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/html-url.h b/src/html-url.h index a94f0db6..2e9ec820 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -44,7 +44,7 @@ struct map_context { }; struct urlpos *get_urls_file (const char *); -struct urlpos *get_urls_html (const char *, const char *, bool *); +struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); void free_urlpos (struct urlpos *); diff --git a/src/http.c b/src/http.c index 5ec70d27..589e18ee 100644 --- a/src/http.c +++ b/src/http.c @@ -49,7 +49,6 @@ as that of the covered work. */ #include "retr.h" #include "connect.h" #include "netrc.h" -#include "iri.h" #ifdef HAVE_SSL # include "ssl.h" #endif @@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs) If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. 
*/ static uerr_t -gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) +gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); /* Try to get remote encoding if needed */ if (opt.enable_iri && !opt.encoding_remote) - set_current_charset (parse_charset (tmp2)); + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); retried, and retried, and retried, and... */ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n")); *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); @@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n")); } /* Maybe we should always keep track of broken links, not just in * spider mode. - * Don't log error if it was utf8 encoded because we will try - * one unencoded. */ - else if (opt.spider && !get_utf8_encode ()) + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. */ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! */ if (!hurl) diff --git a/src/http.h b/src/http.h index e0e66cea..4769e9d3 100644 --- a/src/http.h +++ b/src/http.h @@ -33,7 +33,7 @@ as that of the covered work. 
*/ struct url; uerr_t http_loop (struct url *, char **, char **, const char *, int *, - struct url *); + struct url *, struct iri *); void save_cookies (void); void http_cleanup (void); time_t http_atotm (const char *); diff --git a/src/iri.c b/src/iri.c index d23615ae..783aa331 100644 --- a/src/iri.c +++ b/src/iri.c @@ -46,18 +46,6 @@ as that of the covered work. */ /* Note: locale encoding is kept in options struct (opt.locale) */ -/* Hold the encoding used for the current fetch */ -char *remote; - -/* Hold the encoding for the future found links */ -char *current; - -/* Will/Is the current URL encoded in utf8 ? */ -bool utf8_encode; - -/* Force no utf8 encoding for url_parse () */ -bool ugly_no_encode; - static iconv_t locale2utf8; static bool open_locale_to_utf8 (void); @@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */ char * -idn_encode (char *host, bool utf8_encoded) +idn_encode (struct iri *i, char *host) { char *new; int ret; - /* Encode to UTF-8 if not done using current remote */ - if (!utf8_encoded) + /* Encode to UTF-8 if not done */ + if (!i->utf8_encode) { - if (!remote_to_utf8 ((const char *) host, (const char **) &new)) + if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) { /* Nothing to encode or an error occured */ return NULL; @@ -291,7 +279,7 @@ idn_decode (char *host) /* Try to transcode string str from remote encoding to UTF-8. On success, *new contains the transcoded string. *new content is unspecified otherwise. 
*/ bool -remote_to_utf8 (const char *str, const char **new) +remote_to_utf8 (struct iri *i, const char *str, const char **new) { char *r; iconv_t cd; @@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new) if (opt.encoding_remote) r = opt.encoding_remote; - else if (current) - r = current; + else if (i->uri_encoding) + r = i->uri_encoding; else return false; @@ -323,90 +311,52 @@ remote_to_utf8 (const char *str, const char **new) return ret; } -char *get_remote_charset (void) +struct iri * +iri_new (void) { - return remote; -} - -char *get_current_charset (void) -{ - return current; -} - -void set_current_charset (char *charset) -{ - /*printf("[ current = `%s'\n", charset);*/ - if (current) - { - /* Do nothing if already equal */ - if (!strcasecmp (current, charset)) - return; - xfree (current); - } - - current = charset ? xstrdup (charset) : NULL; -} - -void set_current_as_locale (void) -{ - /* sXXXav : assert opt.locale NULL ? */ - /*printf("[ current = locale = `%s'\n", opt.locale);*/ - if (current) - { - if (!strcasecmp (current, opt.locale)) - return; - xfree (current); - } - - current = xstrdup (opt.locale); + struct iri *i = xmalloc (sizeof (struct iri)); + i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; + i->content_encoding = NULL; + i->utf8_encode = opt.enable_iri; } void -set_remote_charset (char *charset) +iri_free (struct iri *i) { - /*printf("[ remote = `%s'\n", charset);*/ - if (remote) - { - /* Do nothing if already equal */ - if (!strcasecmp (remote, charset)) - return; - xfree (remote); - } - remote = charset ? 
xstrdup (charset) : NULL; + xfree_null (i->uri_encoding); + xfree_null (i->content_encoding); + xfree (i); } void -set_remote_as_current (void) +set_uri_encoding (struct iri *i, char *charset) { - /*printf("[ remote = current = `%s'\n", current);*/ - if (remote) + logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->uri_encoding) { - /* Do nothing if already equal */ - if (current && !strcasecmp (remote, current)) + if (!strcasecmp (i->uri_encoding, charset)) return; - xfree (remote); + xfree (i->uri_encoding); } - remote = current ? xstrdup (current) : NULL; + i->uri_encoding = charset ? xstrdup (charset) : NULL; } -void reset_utf8_encode (void) +void +set_content_encoding (struct iri *i, char *charset) { - set_utf8_encode (opt.enable_iri); -} - -void set_utf8_encode (bool encode) -{ - utf8_encode = encode; -} - -bool get_utf8_encode (void) -{ - return (!ugly_no_encode && utf8_encode); -} - -void set_ugly_no_encode (bool ugly) -{ - ugly_no_encode = ugly; + logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->content_encoding) + { + if (!strcasecmp (i->content_encoding, charset)) + return; + xfree (i->content_encoding); + } + + i->content_encoding = charset ? xstrdup (charset) : NULL; } diff --git a/src/iri.h b/src/iri.h index 50102df4..173d0656 100644 --- a/src/iri.h +++ b/src/iri.h @@ -30,49 +30,41 @@ as that of the covered work. 
*/ #ifndef IRI_H #define IRI_H +struct iri { + char *uri_encoding; /* Encoding of the uri to fetch */ + char *content_encoding; /* Encoding of links inside the fetched file */ + bool utf8_encode; /* Will/Is the current url encoded in utf8 */ +}; + #ifdef ENABLE_IRI char *parse_charset (char *str); char *find_locale (void); bool check_encoding_name (char *encoding); const char *locale_to_utf8 (const char *str); -char *idn_encode (char *host, bool utf8_encoded); +char *idn_encode (struct iri *i, char *host); char *idn_decode (char *host); -char *get_remote_charset (void); -char *get_current_charset (void); -void set_current_charset (char *charset); -void set_current_as_locale (void); -void set_current_charset (char *charset); -void set_remote_charset (char *charset); -void set_remote_as_current (void); -bool remote_to_utf8 (const char *str, const char **new); -void reset_utf8_encode (void); -void set_utf8_encode (bool encode); -bool get_utf8_encode (void); - -/* ugly ugly ugly */ -void set_ugly_no_encode (bool ugly); +bool remote_to_utf8 (struct iri *i, const char *str, const char **new); +struct iri *iri_new (void); +void iri_free (struct iri *i); +void set_uri_encoding (struct iri *i, char *charset); +void set_content_encoding (struct iri *i, char *charset); #else /* ENABLE_IRI */ +struct iri dummy_iri; + #define parse_charset(str) NULL #define find_locale() NULL #define check_encoding_name(str) false #define locale_to_utf8(str) (str) -#define idn_encode(str,encoded) NULL +#define idn_encode(a,b,c) NULL #define idn_decode(str) NULL -#define get_remote_charset() NULL -#define get_current_charset() NULL -#define set_current_charset(str) -#define set_current_as_locale() -#define set_current_charset(str) -#define set_remote_charset(str) -#define set_remote_as_current() -#define remote_to_utf8(a,b) false -#define reset_utf8_encode() -#define set_utf8_encode(a) -#define get_utf8_encode() false -#define set_ugly_no_encode(a) +#define remote_to_utf8(a,b,c) false +#define 
iri_new() (&dummy_iri) +#define iri_free(a) +#define set_uri_encoding(a,b) +#define set_content_encoding(a,b) #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 6135a67d..8cee194c 100644 --- a/src/main.c +++ b/src/main.c @@ -57,7 +57,6 @@ as that of the covered work. */ #include "convert.h" #include "spider.h" #include "http.h" /* for save_cookies */ -#include "iri.h" #include #include @@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\ char *filename = NULL, *redirected_URL = NULL; int dt; - set_current_as_locale (); - set_ugly_no_encode (false); - if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) { @@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\ } else { - set_remote_as_current (); - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, + opt.recursive, i); + iri_free (i); } if (opt.delete_after && file_exists_p(filename)) diff --git a/src/recur.c b/src/recur.c index 24b80ad4..e2f58d1c 100644 --- a/src/recur.c +++ b/src/recur.c @@ -61,7 +61,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ - char *remote_encoding; + struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ struct queue_element *next; /* next element in queue */ @@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue) into it. 
*/ static void -url_enqueue (struct url_queue *queue, +url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); - char *charset = get_current_charset (); + qel->iri = i; qel->url = url; qel->referer = referer; qel->depth = depth; @@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue, qel->css_allowed = css_allowed; qel->next = NULL; - if (charset) - qel->remote_encoding = xstrdup (charset); - else - qel->remote_encoding = NULL; - ++queue->count; if (queue->count > queue->maxcount) queue->maxcount = queue->count; @@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); - /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/ + if (i) + printf ("[Enqueuing %s with %s\n", url, i->uri_encoding); if (queue->tail) queue->tail->next = qel; @@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue, succeeded, or false if the queue is empty. 
*/ static bool -url_dequeue (struct url_queue *queue, +url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { @@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; - set_remote_charset (qel->remote_encoding); - if (qel->remote_encoding) - xfree (qel->remote_encoding); - + *i = qel->iri; *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue, } static bool download_child_p (const struct urlpos *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); static bool descend_redirect_p (const char *, const char *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); /* Retrieve a part of the web beginning with START_URL. This used to @@ -207,10 +200,10 @@ retrieve_tree (const char *start_url) int up_error_code; struct url *start_url_parsed; + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); - set_ugly_no_encode (true); - start_url_parsed= url_parse (start_url, &up_error_code); - set_ugly_no_encode (false); + start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, @@ -223,7 +216,8 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); string_set_add (blacklist, start_url_parsed->url); while (1) @@ -242,7 +236,7 @@ retrieve_tree (const char *start_url) /* Get the next URL from the queue... 
*/ - if (!url_dequeue (queue, + if (!url_dequeue (queue, (struct iri **) &i, (const char **)&url, (const char **)&referer, &depth, &html_allowed, &css_allowed)) break; @@ -283,7 +277,8 @@ retrieve_tree (const char *start_url) int dt = 0; char *redirected = NULL; - status = retrieve_url (url, &file, &redirected, referer, &dt, false); + status = retrieve_url (url, &file, &redirected, referer, &dt, + false, i); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@ -311,7 +306,7 @@ retrieve_tree (const char *start_url) if (descend) { if (!descend_redirect_p (redirected, url, depth, - start_url_parsed, blacklist)) + start_url_parsed, blacklist, i)) descend = false; else /* Make sure that the old pre-redirect form gets @@ -363,7 +358,7 @@ retrieve_tree (const char *start_url) bool meta_disallow_follow = false; struct urlpos *children = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, &meta_disallow_follow); + get_urls_html (file, url, &meta_disallow_follow, i); if (opt.use_robots && meta_disallow_follow) { @@ -374,9 +369,8 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - set_ugly_no_encode (true); - struct url *url_parsed = url_parse (url, NULL); - set_ugly_no_encode (false); + struct url *url_parsed = url_parse (url, NULL, i); + struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -393,9 +387,11 @@ retrieve_tree (const char *start_url) if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) + blacklist, i)) { - url_enqueue (queue, xstrdup (child->url->url), + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding); + url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, child->link_expect_css); @@ -440,6 +436,7 @@ retrieve_tree (const char *start_url) xfree (url); xfree_null 
(referer); xfree_null (file); + iri_free (i); } /* If anything is left of the queue due to a premature exit, free it @@ -448,9 +445,11 @@ retrieve_tree (const char *start_url) char *d1, *d2; int d3; bool d4, d5; - while (url_dequeue (queue, + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { + iri_free (d6); xfree (d1); xfree_null (d2); } @@ -479,7 +478,8 @@ retrieve_tree (const char *start_url) static bool download_child_p (const struct urlpos *upos, struct url *parent, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *u = upos->url; const char *url = u->url; @@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (!specs) { char *rfile; - if (res_retrieve_file (url, &rfile)) + if (res_retrieve_file (url, &rfile, iri)) { specs = res_parse_from_file (rfile); @@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, static bool descend_redirect_p (const char *redirected, const char *original, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; - set_ugly_no_encode (true); - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, NULL); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, NULL); assert (new_parsed != NULL); - set_ugly_no_encode (false); upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist, iri); url_free (orig_parsed); url_free (new_parsed); diff --git a/src/res.c b/src/res.c index 8c35f0e1..69abd12d 
100644 --- a/src/res.c +++ b/src/res.c @@ -532,21 +532,28 @@ res_get_specs (const char *host, int port) Return true if robots were retrieved OK, false otherwise. */ bool -res_retrieve_file (const char *url, char **file) +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; int saved_sp_val = opt.spider; + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding); + i->utf8_encode = false; + logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i); opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { diff --git a/src/res.h b/src/res.h index 94a57750..5439eaf9 100644 --- a/src/res.h +++ b/src/res.h @@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *); void res_register_specs (const char *, int, struct robot_specs *); struct robot_specs *res_get_specs (const char *, int); -bool res_retrieve_file (const char *, char **); +bool res_retrieve_file (const char *, char **, struct iri *); bool is_robots_txt_url (const char *); diff --git a/src/retr.c b/src/retr.c index 7a28ea32..e70f6e6e 100644 --- a/src/retr.c +++ b/src/retr.c @@ -598,7 +598,7 @@ static char *getproxy (struct url *); uerr_t retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) + const char *refurl, int *dt, bool recursive, struct iri *iri) { uerr_t result; char *url; @@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, 
char **newloc, if (file) *file = NULL; - reset_utf8_encode (); - second_try: - u = url_parse (url, &up_error_code); + u = url_parse (url, &up_error_code, iri); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } - /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/ + printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode); if (!refurl) refurl = opt.referer; @@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { - /* sXXXav : support IRI for proxy */ + /* sXXXav : could a proxy include a path ??? */ + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - set_ugly_no_encode (true); - proxy_url = url_parse (proxy, &up_error_code); - set_ugly_no_encode (false); + proxy_url = url_parse (proxy, &up_error_code, NULL); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; - reset_utf8_encode (); + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); /* Now, see if this new location makes sense. 
*/ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, } /* Try to not encode in UTF-8 if fetching failed */ - if (!(*dt & RETROKF) && get_utf8_encode ()) + if (!(*dt & RETROKF) && iri->utf8_encode) { - set_utf8_encode (false); - /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ + iri->utf8_encode = false; + printf ("[Fallbacking to non-utf8 for `%s'\n", url); goto second_try; } @@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count) { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { uerr_t status; - status = retrieve_url (url, &input_file, NULL, NULL, NULL, false); + status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri); if (status != RETROK) return status; } else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? 
get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count) opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, + &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url) { bool ret; struct url *u; - set_ugly_no_encode(true); - u= url_parse (url, NULL); - set_ugly_no_encode(false); + struct iri *i = iri_new(); + /* url was given in the command line, so use locale as encoding */ + set_uri_encoding (i, opt.locale); + u= url_parse (url, NULL, i); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/retr.h b/src/retr.h index ec55cfda..bb2e66d3 100644 --- a/src/retr.h +++ b/src/retr.h @@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int); char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int); -uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool); +uerr_t retrieve_url (const char *, char **, char **, const char *, int *, + bool, struct iri *); uerr_t retrieve_from_file (const char *, bool, int *); const char *retr_rate (wgint, double); diff --git a/src/url.c b/src/url.c index beaf0fb2..c7a3a721 100644 --- a/src/url.c +++ b/src/url.c @@ -641,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. 
*/ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, struct iri *iri) { struct url *u; const char *p; @@ -660,7 +660,7 @@ url_parse (const char *url, int *error) int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; + char *url_encoded = NULL, *new_url = NULL; int error_code; @@ -671,20 +671,20 @@ url_parse (const char *url, int *error) goto error; } - if (opt.enable_iri && get_utf8_encode ()) + if (iri && iri->utf8_encode) { - const char *new; - bool utf8_encode; url_unescape ((char *) url); - utf8_encode = remote_to_utf8 (url, &new); - set_utf8_encode (utf8_encode); - if (utf8_encode) - url = new; + iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; } - url_encoded = reencode_escapes (url); + url_encoded = reencode_escapes (new_url ? new_url : url); p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@ -854,16 +854,17 @@ url_parse (const char *url, int *error) { url_unescape (u->host); host_modified = true; - } - if (opt.enable_iri) - { - char *new = idn_encode (u->host, get_utf8_encode ()); - if (new) + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) { - xfree (u->host); - u->host = new; - host_modified = true; + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } } } diff --git a/src/url.h b/src/url.h index 7c8bcfed..9c49c0b5 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, struct iri *iri); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --git a/src/wget.h b/src/wget.h index d87dfcac..b17b6709 
100644 --- a/src/wget.h +++ b/src/wget.h @@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT; #include "quote.h" #include "quotearg.h" +/* Likewise for struct iri definition */ +#include "iri.h" + /* Useful macros used across the code: */ /* The number of elements in an array. For example: From 3ae04f5fe4ae2025c177168be4a2c396627c2ffb Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 24 Jul 2008 14:32:31 +0200 Subject: [PATCH 30/55] Use DEBUGP instead of commenting out all the _wonderful_ printfs --- src/iri.c | 4 ++-- src/recur.c | 3 ++- src/retr.c | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/iri.c b/src/iri.c index 783aa331..44a262b8 100644 --- a/src/iri.c +++ b/src/iri.c @@ -331,7 +331,7 @@ iri_free (struct iri *i) void set_uri_encoding (struct iri *i, char *charset) { - logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset); + DEBUGP (("[IRI uri = `%s'\n", quote (charset))); if (opt.encoding_remote) return; if (i->uri_encoding) @@ -347,7 +347,7 @@ set_uri_encoding (struct iri *i, char *charset) void set_content_encoding (struct iri *i, char *charset) { - logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset); + DEBUGP (("[IRI content = %s\n", quote (charset))); if (opt.encoding_remote) return; if (i->content_encoding) diff --git a/src/recur.c b/src/recur.c index e2f58d1c..aa83e9a6 100644 --- a/src/recur.c +++ b/src/recur.c @@ -116,7 +116,8 @@ url_enqueue (struct url_queue *queue, struct iri *i, DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); if (i) - printf ("[Enqueuing %s with %s\n", url, i->uri_encoding); + DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url), + quote (i->uri_encoding))); if (queue->tail) queue->tail->next = qel; diff --git a/src/retr.c b/src/retr.c index ae8ef3ef..691b8f51 100644 --- a/src/retr.c +++ b/src/retr.c @@ -635,7 +635,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } - printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, 
iri->utf8_encode); + DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url), + quote (iri->uri_encoding), iri->utf8_encode)); if (!refurl) refurl = opt.referer; @@ -788,7 +789,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (!(*dt & RETROKF) && iri->utf8_encode) { iri->utf8_encode = false; - printf ("[Fallbacking to non-utf8 for `%s'\n", url); + DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url))); goto second_try; } From 8c513ef48725f2091baecb30717b178f3337b442 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 24 Jul 2008 14:34:48 +0200 Subject: [PATCH 31/55] Fix numbers of arguments of the no-op macro idn_encode() --- src/iri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iri.h b/src/iri.h index 173d0656..cdc5c7fd 100644 --- a/src/iri.h +++ b/src/iri.h @@ -58,7 +58,7 @@ struct iri dummy_iri; #define find_locale() NULL #define check_encoding_name(str) false #define locale_to_utf8(str) (str) -#define idn_encode(a,b,c) NULL +#define idn_encode(a,b) NULL #define idn_decode(str) NULL #define remote_to_utf8(a,b,c) false #define iri_new() (&dummy_iri) From b967d49f79b6e0ce73559bd30d231bddc2e4b232 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Wed, 30 Jul 2008 10:15:55 +0200 Subject: [PATCH 32/55] opt.remote_encoding should not override opt.locale, add a force arguments to set_uri_encoding() --- src/iri.c | 4 ++-- src/iri.h | 4 ++-- src/main.c | 2 +- src/recur.c | 4 ++-- src/res.c | 2 +- src/retr.c | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/iri.c b/src/iri.c index 44a262b8..3ee99871 100644 --- a/src/iri.c +++ b/src/iri.c @@ -329,10 +329,10 @@ iri_free (struct iri *i) } void -set_uri_encoding (struct iri *i, char *charset) +set_uri_encoding (struct iri *i, char *charset, bool force) { DEBUGP (("[IRI uri = `%s'\n", quote (charset))); - if (opt.encoding_remote) + if (!force && opt.encoding_remote) return; if (i->uri_encoding) { diff --git a/src/iri.h b/src/iri.h 
index cdc5c7fd..e7f3fe3e 100644 --- a/src/iri.h +++ b/src/iri.h @@ -47,7 +47,7 @@ char *idn_decode (char *host); bool remote_to_utf8 (struct iri *i, const char *str, const char **new); struct iri *iri_new (void); void iri_free (struct iri *i); -void set_uri_encoding (struct iri *i, char *charset); +void set_uri_encoding (struct iri *i, char *charset, bool force); void set_content_encoding (struct iri *i, char *charset); #else /* ENABLE_IRI */ @@ -63,7 +63,7 @@ struct iri dummy_iri; #define remote_to_utf8(a,b,c) false #define iri_new() (&dummy_iri) #define iri_free(a) -#define set_uri_encoding(a,b) +#define set_uri_encoding(a,b,c) #define set_content_encoding(a,b) #endif /* ENABLE_IRI */ diff --git a/src/main.c b/src/main.c index 8cee194c..799e5d63 100644 --- a/src/main.c +++ b/src/main.c @@ -1206,7 +1206,7 @@ WARNING: Can't reopen standard output in binary mode;\n\ else { struct iri *i = iri_new (); - set_uri_encoding (i, opt.locale); + set_uri_encoding (i, opt.locale, true); status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive, i); iri_free (i); diff --git a/src/recur.c b/src/recur.c index aa83e9a6..19ef8f1c 100644 --- a/src/recur.c +++ b/src/recur.c @@ -202,7 +202,7 @@ retrieve_tree (const char *start_url) int up_error_code; struct url *start_url_parsed; struct iri *i = iri_new (); - set_uri_encoding (i, opt.locale); + set_uri_encoding (i, opt.locale, true); start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) @@ -391,7 +391,7 @@ retrieve_tree (const char *start_url) blacklist, i)) { ci = iri_new (); - set_uri_encoding (ci, i->content_encoding); + set_uri_encoding (ci, i->content_encoding, false); url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, diff --git a/src/res.c b/src/res.c index 69abd12d..0320d034 100644 --- a/src/res.c +++ b/src/res.c @@ -542,7 +542,7 @@ res_retrieve_file (const char *url, char **file, struct iri *iri) /* Copy server URI 
encoding for a possible IDNA transformation, no need to encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ - set_uri_encoding (i, iri->uri_encoding); + set_uri_encoding (i, iri->uri_encoding, false); i->utf8_encode = false; logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); diff --git a/src/retr.c b/src/retr.c index 691b8f51..111b745a 100644 --- a/src/retr.c +++ b/src/retr.c @@ -653,7 +653,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, { /* sXXXav : could a proxy include a path ??? */ struct iri *pi = iri_new (); - set_uri_encoding (pi, opt.locale); + set_uri_encoding (pi, opt.locale, true); pi->utf8_encode = false; /* Parse the proxy URL. */ @@ -1083,7 +1083,7 @@ url_uses_proxy (const char *url) struct url *u; struct iri *i = iri_new(); /* url was given in the command line, so use locale as encoding */ - set_uri_encoding (i, opt.locale); + set_uri_encoding (i, opt.locale, true); u= url_parse (url, NULL, i); if (!u) return false; From 042828f4690232e4e2d8b0787acb941d64b59b97 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Fri, 1 Aug 2008 14:58:37 +0200 Subject: [PATCH 33/55] Add a missing return... --- src/iri.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/iri.c b/src/iri.c index 3ee99871..a45f3899 100644 --- a/src/iri.c +++ b/src/iri.c @@ -318,6 +318,7 @@ iri_new (void) i->uri_encoding = opt.encoding_remote ? 
xstrdup (opt.encoding_remote) : NULL; i->content_encoding = NULL; i->utf8_encode = opt.enable_iri; + return i; } void From bfd8a73f004b95d044741f4cb78ecad9de92bddc Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sat, 2 Aug 2008 11:22:14 +0200 Subject: [PATCH 34/55] quote*() functions don't like that much NULL arg --- src/iri.c | 9 +++++---- src/recur.c | 2 +- src/retr.c | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/iri.c b/src/iri.c index 3ee99871..9050e858 100644 --- a/src/iri.c +++ b/src/iri.c @@ -318,6 +318,7 @@ iri_new (void) i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; i->content_encoding = NULL; i->utf8_encode = opt.enable_iri; + return i; } void @@ -331,12 +332,12 @@ iri_free (struct iri *i) void set_uri_encoding (struct iri *i, char *charset, bool force) { - DEBUGP (("[IRI uri = `%s'\n", quote (charset))); + DEBUGP (("[IRI uri = `%s'\n", charset ? quote (charset) : "None")); if (!force && opt.encoding_remote) return; if (i->uri_encoding) { - if (!strcasecmp (i->uri_encoding, charset)) + if (charset && !strcasecmp (i->uri_encoding, charset)) return; xfree (i->uri_encoding); } @@ -347,12 +348,12 @@ set_uri_encoding (struct iri *i, char *charset, bool force) void set_content_encoding (struct iri *i, char *charset) { - DEBUGP (("[IRI content = %s\n", quote (charset))); + DEBUGP (("[IRI content = %s\n", charset ? quote (charset) : "None")); if (opt.encoding_remote) return; if (i->content_encoding) { - if (!strcasecmp (i->content_encoding, charset)) + if (charset && !strcasecmp (i->content_encoding, charset)) return; xfree (i->content_encoding); } diff --git a/src/recur.c b/src/recur.c index 19ef8f1c..baeaed58 100644 --- a/src/recur.c +++ b/src/recur.c @@ -117,7 +117,7 @@ url_enqueue (struct url_queue *queue, struct iri *i, if (i) DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url), - quote (i->uri_encoding))); + i->uri_encoding ? 
quote (i->uri_encoding) : "None")); if (queue->tail) queue->tail->next = qel; diff --git a/src/retr.c b/src/retr.c index 111b745a..fa7f762d 100644 --- a/src/retr.c +++ b/src/retr.c @@ -636,7 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, } DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url), - quote (iri->uri_encoding), iri->utf8_encode)); + iri->uri_encoding ? quote (iri->uri_encoding) : "None", + iri->utf8_encode)); if (!refurl) refurl = opt.referer; From da7adbaef4bb2c47a19db3e83620aed06ba9456e Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sat, 2 Aug 2008 12:17:03 +0200 Subject: [PATCH 35/55] Functional tests for IRI and HTTP --- tests/Test-iri-disabled.px | 197 ++++++++++++++++++++++++++++ tests/Test-iri-forced-remote.px | 208 +++++++++++++++++++++++++++++ tests/Test-iri.px | 225 ++++++++++++++++++++++++++++++++ tests/run-px | 3 + 4 files changed, 633 insertions(+) create mode 100755 tests/Test-iri-disabled.px create mode 100755 tests/Test-iri-forced-remote.px create mode 100755 tests/Test-iri.px diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px new file mode 100755 index 00000000..122537ff --- /dev/null +++ b/tests/Test-iri-disabled.px @@ -0,0 +1,197 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. 
http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html mime + file = iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; +my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nerderlangstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + +

+ Één is niet veel maar toch meer dan nul.
+ Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :) +

+ + +EOF + +my $pageeuro = < + + Euro page + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "200", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri=no -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_l1}${eacute_l1}n.html" => { + content => $pageeen, + }, + "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => { + content => $pageeuro, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-disabled", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px new file mode 100755 index 00000000..0d116d8f --- /dev/null +++ b/tests/Test-iri-forced-remote.px @@ -0,0 +1,208 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# Force remote encoding to ISO-8859-1 +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html mime + file = iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; +my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav +my $currency_l1 = "\xA4"; +my $currency_u8 = "\xC2\xA4"; + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nerderlangstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + +

+ Één is niet veel maar toch meer dan nul.
+ Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :) +

+ + +EOF + +my $pageeuro = < + + Euro page + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, + "p3_${currency_u8}${currency_u8}${currency_u8}.html" => { + content => $pageeuro, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-forced-remote", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri.px b/tests/Test-iri.px new file mode 100755 index 00000000..3f4cf3fd --- /dev/null +++ b/tests/Test-iri.px @@ -0,0 +1,225 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html meta + file = utf-8, mime =iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# p4_méér.html mime + file = utf-8 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; +my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nerderlangstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + + +

+ Één is niet veel maar toch meer dan nul.
+ Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
+ Méér +

+ + +EOF + +my $pageeuro = < + + Euro page + + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $pagemeer = < + + Bekende supermarkt + + +

+ Ik ben toch niet gek ! +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $pageeuro, + }, + '/p4_m%C3%A9%C3%A9r.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=UTF-8", + }, + content => $pagemeer, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, + "p3_${eurosign2_u8}${eurosign2_u8}${eurosign2_u8}.html" => { + content => $pageeuro, + }, + "p4_m${eacute_u8}${eacute_u8}r.html" => { + content => $pagemeer, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 37f14324..865246e3 100755 --- a/tests/run-px +++ b/tests/run-px @@ -19,6 +19,9 @@ my @tests = ( 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', + 'Test-iri.px', + 'Test-iri-disabled.px', + 'Test-iri-forced-remote.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px', From 8d7c2219d1965fb1bda16d46bb45e8fe7dc60501 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sat, 2 Aug 2008 13:47:10 +0200 Subject: [PATCH 36/55] Test FTP IRI support --- tests/Test-ftp-iri-disabled.px | 50 ++++++++++++++++++++++++++++++++++ tests/Test-ftp-iri-fallback.px | 46 +++++++++++++++++++++++++++++++ tests/Test-ftp-iri.px | 47 ++++++++++++++++++++++++++++++++ tests/run-px | 3 ++ 4 files changed, 146 insertions(+) create mode 100755 tests/Test-ftp-iri-disabled.px create mode 100755 tests/Test-ftp-iri-fallback.px create mode 100755 tests/Test-ftp-iri.px diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px new file mode 100755 index 00000000..14d849da --- /dev/null +++ b/tests/Test-ftp-iri-disabled.px @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w + 
+use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, + "/fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px new file mode 100755 index 00000000..8902e0f9 --- /dev/null +++ b/tests/Test-ftp-iri-fallback.px @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w + +use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px new file mode 100755 index 00000000..d453669c --- /dev/null +++ b/tests/Test-ftp-iri.px @@ -0,0 +1,47 @@ +#!/usr/bin/perl -w + +use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_u8}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 865246e3..172adcd7 100755 --- a/tests/run-px +++ b/tests/run-px @@ -16,6 +16,9 @@ my @tests = ( 'Test-E-k-K.px', 'Test-E-k.px', 'Test-ftp.px', + 'Test-ftp-iri.px', + 'Test-ftp-iri-fallback.px', + 'Test-ftp-iri-disabled.px', 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', From e2813c1e4fdf1f565f65197445695cc18485ddb3 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sun, 3 Aug 2008 
20:02:35 +0200 Subject: [PATCH 37/55] Since wget use libidn function for finding the locale, langinfo.h is useless --- src/main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main.c b/src/main.c index 799e5d63..79c35220 100644 --- a/src/main.c +++ b/src/main.c @@ -43,9 +43,6 @@ as that of the covered work. */ #include #include #include -#ifdef ENABLE_IRI -#include -#endif #include "utils.h" #include "init.h" From cda8835de6b299d591f636ba960c66ad646a2b58 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sun, 3 Aug 2008 20:03:13 +0200 Subject: [PATCH 38/55] IRI support documentation, first attempt --- doc/wget.texi | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/doc/wget.texi b/doc/wget.texi index 6f88e19a..9219f49c 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -675,6 +675,30 @@ Another instance where you'll get a garbled file if you try to use Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http} servers that support the @code{Range} header. +@cindex iri support +@cindex idn support +@item --iri + +Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to +turn it off. IRI support is activated by default. + +You can set the default state of IRI support using the @code{iri} command in +@file{.wgetrc}. That setting may be overridden from the command line. + +@cindex local encoding +@cindex locale +@item --locale=@var{encoding} + +Force Wget to use @var{encoding} as the default system encoding. That affects +how Wget converts URLs specified as arguments from locale to @sc{utf-8} for +IRI support. + +Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET} +environment variable to get the locale. If it fails, @sc{ascii} is used. + +You can set the default locale using the @code{locale} command in +@file{.wgetrc}. That setting may be overridden from the command line. + @cindex progress indicator @cindex dot style @item --progress=@var{type} @@ -706,6 +730,21 @@ command line. 
The exception is that, when the output is not a TTY, the ``dot'' progress will be favored over ``bar''. To force the bar output, use @samp{--progress=bar:force}. +@cindex remote encoding +@item --remote-encoding=@var{encoding} + +Force Wget to use @var{encoding} as the default remote server encoding. That +affects how Wget converts URIs found in files from remote encoding to +@sc{utf-8} during a recursive fetch. This option is only useful for +IRI support, for the interpretation of non-@sc{ascii} characters. + +For HTTP, remote encoding can be found in HTTP @code{Content-Type} +header and in HTML @code{Content-Type http-equiv} meta tag. + +You can set the default encoding using the @code{remoteencoding} +command in @file{.wgetrc}. That setting may be overridden from the +command line. + @item -N @itemx --timestamping Turn on time-stamping. @xref{Time-Stamping}, for details. From e4fd97c2eb9c7311a0cf8bf51bbf9d6cff16ae91 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sun, 3 Aug 2008 20:06:39 +0200 Subject: [PATCH 39/55] Add lines to .wgetrc sample file --- doc/sample.wgetrc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/sample.wgetrc b/doc/sample.wgetrc index c69596bf..7ef9ef4a 100644 --- a/doc/sample.wgetrc +++ b/doc/sample.wgetrc @@ -113,3 +113,12 @@ waitretry = 10 # To try ipv6 addresses first: #prefer-family = IPv6 + +# Set default IRI support state +#iri = off + +# Force the default system encoding +#locale = UTF-8 + +# Force the default remote server encoding +#remoteencoding = UTF-8 From f8ffc7d0848e45c9c288c19332b99b6291188e66 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sun, 3 Aug 2008 20:38:00 +0200 Subject: [PATCH 40/55] Use --restrict-file-names=nocontrol during Test-iri* tests avoiding some special escaping --- tests/Test-iri-disabled.px | 1 - tests/Test-iri-forced-remote.px | 1 - tests/Test-iri.px | 5 ++--- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px index 
122537ff..17e43361 100755 --- a/tests/Test-iri-disabled.px +++ b/tests/Test-iri-disabled.px @@ -24,7 +24,6 @@ my $eacute_l1 = "\xE9"; my $eacute_u8 = "\xC3\xA9"; my $eurosign_l15 = "\xA4"; my $eurosign_u8 = "\xE2\x82\xAC"; -my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav my $pageindex = < diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px index 0d116d8f..1acd03a7 100755 --- a/tests/Test-iri-forced-remote.px +++ b/tests/Test-iri-forced-remote.px @@ -25,7 +25,6 @@ my $eacute_l1 = "\xE9"; my $eacute_u8 = "\xC3\xA9"; my $eurosign_l15 = "\xA4"; my $eurosign_u8 = "\xE2\x82\xAC"; -my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav my $currency_l1 = "\xA4"; my $currency_u8 = "\xC2\xA4"; diff --git a/tests/Test-iri.px b/tests/Test-iri.px index 3f4cf3fd..d228721c 100755 --- a/tests/Test-iri.px +++ b/tests/Test-iri.px @@ -25,7 +25,6 @@ my $eacute_l1 = "\xE9"; my $eacute_u8 = "\xC3\xA9"; my $eurosign_l15 = "\xA4"; my $eurosign_u8 = "\xE2\x82\xAC"; -my $eurosign2_u8 = "\xE2%82\xAC"; # version wget use... sXXXav my $pageindex = < @@ -187,7 +186,7 @@ my %urls = ( }, ); -my $cmdline = $WgetTest::WGETPATH . " --iri -nH -r http://localhost:{{port}}/"; +my $cmdline = $WgetTest::WGETPATH . 
" --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/"; my $expected_error_code = 0; @@ -204,7 +203,7 @@ my %expected_downloaded_files = ( "p2_${eacute_u8}${eacute_u8}n.html" => { content => $pageeen, }, - "p3_${eurosign2_u8}${eurosign2_u8}${eurosign2_u8}.html" => { + "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => { content => $pageeuro, }, "p4_m${eacute_u8}${eacute_u8}r.html" => { From c74bc2da704de7c291521093368b8bab7149909d Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Sun, 3 Aug 2008 22:30:12 +0200 Subject: [PATCH 41/55] Some cleanups in iri.c --- src/iri.c | 70 ++++++++++++++++++++++--------------------------------- src/iri.h | 4 ++-- 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/src/iri.c b/src/iri.c index 9050e858..dce9e2ed 100644 --- a/src/iri.c +++ b/src/iri.c @@ -46,9 +46,6 @@ as that of the covered work. */ /* Note: locale encoding is kept in options struct (opt.locale) */ -static iconv_t locale2utf8; - -static bool open_locale_to_utf8 (void); static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); @@ -119,27 +116,7 @@ check_encoding_name (char *encoding) static bool open_locale_to_utf8 (void) { - if (locale2utf8) - return true; - /* sXXXav : That shouldn't happen, just in case */ - if (!opt.locale) - { - logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); - opt.locale = find_locale (); - } - - if (!opt.locale) - return false; - - locale2utf8 = iconv_open ("UTF-8", opt.locale); - if (locale2utf8 != (iconv_t)(-1)) - return true; - - logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", - quote (opt.locale), quote ("UTF-8")); - locale2utf8 = NULL; - return false; } /* Try converting string str from locale to UTF-8. 
Return a new string @@ -147,22 +124,35 @@ open_locale_to_utf8 (void) const char * locale_to_utf8 (const char *str) { + iconv_t l2u; char *new; - if (!strcasecmp (opt.locale, "utf-8")) + /* That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); + opt.locale = find_locale (); + } + + if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) return str; - if (!open_locale_to_utf8 ()) - return str; + l2u = iconv_open ("UTF-8", opt.locale); + if (l2u != (iconv_t)(-1)) + { + logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", + quote (opt.locale), quote ("UTF-8")); + return str; + } - if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new)) + if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new)) return (const char *) new; return str; } /* Do the conversion according to the passed conversion descriptor cd. *out - will containes the transcoded string on success. *out content is + will contain the transcoded string on success. *out content is unspecified otherwise. 
*/ static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out) @@ -236,11 +226,7 @@ idn_encode (struct iri *i, char *host) if (!i->utf8_encode) { if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) - { - /* Nothing to encode or an error occured */ - return NULL; - } - + return NULL; /* Nothing to encode or an error occured */ host = new; } @@ -281,18 +267,13 @@ idn_decode (char *host) bool remote_to_utf8 (struct iri *i, const char *str, const char **new) { - char *r; iconv_t cd; bool ret = false; - if (opt.encoding_remote) - r = opt.encoding_remote; - else if (i->uri_encoding) - r = i->uri_encoding; - else + if (!i->uri_encoding) return false; - cd = iconv_open ("UTF-8", r); + cd = iconv_open ("UTF-8", i->uri_encoding); if (cd == (iconv_t)(-1)) return false; @@ -311,6 +292,7 @@ remote_to_utf8 (struct iri *i, const char *str, const char **new) return ret; } +/* Allocate a new iri structure and return a pointer to it. */ struct iri * iri_new (void) { @@ -321,6 +303,7 @@ iri_new (void) return i; } +/* Completely free an iri structure. */ void iri_free (struct iri *i) { @@ -329,10 +312,12 @@ iri_free (struct iri *i) xfree (i); } +/* Set uri_encoding of struct iri i. If a remote encoding was specified, use + it unless force is true. */ void set_uri_encoding (struct iri *i, char *charset, bool force) { - DEBUGP (("[IRI uri = `%s'\n", charset ? quote (charset) : "None")); + DEBUGP (("URI encoding = `%s'\n", charset ? quote (charset) : "None")); if (!force && opt.encoding_remote) return; if (i->uri_encoding) @@ -345,10 +330,11 @@ set_uri_encoding (struct iri *i, char *charset, bool force) i->uri_encoding = charset ? xstrdup (charset) : NULL; } +/* Set content_encoding of struct iri i. */ void set_content_encoding (struct iri *i, char *charset) { - DEBUGP (("[IRI content = %s\n", charset ? quote (charset) : "None")); + DEBUGP (("URI content encoding = %s\n", charset ? 
quote (charset) : "None")); if (opt.encoding_remote) return; if (i->content_encoding) diff --git a/src/iri.h b/src/iri.h index e7f3fe3e..c024de72 100644 --- a/src/iri.h +++ b/src/iri.h @@ -31,9 +31,9 @@ as that of the covered work. */ #define IRI_H struct iri { - char *uri_encoding; /* Encoding of the uri to fetch */ + char *uri_encoding; /* Encoding of the uri to fetch */ char *content_encoding; /* Encoding of links inside the fetched file */ - bool utf8_encode; /* Will/Is the current url encoded in utf8 */ + bool utf8_encode; /* Will/Is the current url encoded in utf8 */ }; #ifdef ENABLE_IRI From 84395897ad2d1c107be470946daba744b2e7ebe8 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Mon, 4 Aug 2008 11:08:33 +0200 Subject: [PATCH 42/55] iri.h is already included in wget.h, so don't include it in C files --- src/connect.c | 1 - src/host.c | 1 - src/iri.c | 1 - src/recur.c | 1 - src/retr.c | 1 - src/url.c | 1 - 6 files changed, 6 deletions(-) diff --git a/src/connect.c b/src/connect.c index 6cfdb4b7..41258d26 100644 --- a/src/connect.c +++ b/src/connect.c @@ -58,7 +58,6 @@ as that of the covered work. */ #include "host.h" #include "connect.h" #include "hash.h" -#include "iri.h" /* Define sockaddr_storage where unavailable (presumably on IPv4-only hosts). */ diff --git a/src/host.c b/src/host.c index 1226a274..bbf40222 100644 --- a/src/host.c +++ b/src/host.c @@ -53,7 +53,6 @@ as that of the covered work. */ #include "host.h" #include "url.h" #include "hash.h" -#include "iri.h" #ifndef NO_ADDRESS # define NO_ADDRESS NO_DATA diff --git a/src/iri.c b/src/iri.c index dce9e2ed..ea4046af 100644 --- a/src/iri.c +++ b/src/iri.c @@ -39,7 +39,6 @@ as that of the covered work. */ #include #include "utils.h" -#include "iri.h" /* RFC3987 section 3.1 mandates STD3 ASCII RULES */ #define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES diff --git a/src/recur.c b/src/recur.c index baeaed58..71fbe7bf 100644 --- a/src/recur.c +++ b/src/recur.c @@ -51,7 +51,6 @@ as that of the covered work. 
*/ #include "html-url.h" #include "css-url.h" #include "spider.h" -#include "iri.h" /* Functions for maintaining the URL queue. */ diff --git a/src/retr.c b/src/retr.c index fa7f762d..fe176eaf 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,7 +51,6 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" -#include "iri.h" #include "html-url.h" /* Total size of downloaded files. Used to enforce quota. */ diff --git a/src/url.c b/src/url.c index c7a3a721..e79cf8a2 100644 --- a/src/url.c +++ b/src/url.c @@ -42,7 +42,6 @@ as that of the covered work. */ #include "utils.h" #include "url.h" #include "host.h" /* for is_valid_ipv6_address */ -#include "iri.h" #ifdef TESTING #include "test.h" From bb62e1aa9ed97e931ccd174f64c6e13e4d0439bb Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Mon, 4 Aug 2008 11:18:26 +0200 Subject: [PATCH 43/55] Update tests/ChangeLog for the 6 new tests for testing IRI support --- tests/ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/ChangeLog b/tests/ChangeLog index 36bc35dc..ad18c14a 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,26 @@ +2008-08-03 Xavier Saint + + * Test-iri.px : HTTP recursive fetch for testing IRI support and + fallback. + + * Test-iri-disabled.px : Same file structure as Test-iri.px but with + IRI support disabled + + * Test-iri-forced-remote.px : There's a difference between ISO-8859-1 + and ISO-8859-15 for character 0xA4 (respectively currency sign and + euro sign). So with a forced ISO-8859-1 remote encoding, wget should + see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead + of using the ISO-8859-15 given by the server. + + * Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale + and expect wget to fetch the file UTF-8 encoded. + + * Test-ftp-iri-fallback.px : Same as above but wget should fallback on + locale encoding to fetch the file. 
+ + * Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support + disabled. The UTF-8 encoded file should not be retrieved. + 2008-06-22 Micah Cowan * Test-proxied-https-auth.px: Shift exit code so it falls in the From 49061b72b630e248b4e1df0593a2198b2ed612fb Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Mon, 4 Aug 2008 11:21:45 +0200 Subject: [PATCH 44/55] Update doc/ChangeLog in regards to IRI support --- doc/ChangeLog | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/ChangeLog b/doc/ChangeLog index 4f68780e..08d2f05e 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,12 @@ +2008-08-03 Xavier Saint + + * wget.texi : Add option descriptions for the three new + options --iri, --locale and --remote-encoding related to + IRI support. + + * sample.wgetrc : Add commented lines for the three new + commands iri, locale and remoteencoding related to IRI support. + 2008-07-17 Steven Schubiger * wget.texi (Logging and Input File Options): Document From 0d0a42514458629dd6875138e813110f39eded03 Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 7 Aug 2008 10:33:06 +0200 Subject: [PATCH 45/55] Correct a mis-merge: return type for url_error is char*, not const char* --- src/url.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/url.h b/src/url.h index 69db1551..0748e214 100644 --- a/src/url.h +++ b/src/url.h @@ -85,7 +85,7 @@ struct url char *url_escape (const char *); struct url *url_parse (const char *, int *, struct iri *iri); -const char *url_error (const char *, int); +char *url_error (const char *, int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); void url_set_file (struct url *, const char *); From e6b4e761d1f1439b1b2352f5eeaedd1ae5b9d76e Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 17:42:16 +0200 Subject: [PATCH 46/55] Don't forget to free the iri struct --- src/retr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/retr.c b/src/retr.c index 
fe176eaf..4731d9ee 100644 --- a/src/retr.c +++ b/src/retr.c @@ -928,6 +928,8 @@ Removing file due to --delete-after in retrieve_from_file():\n")); /* Free the linked list of URL-s. */ free_urlpos (url_list); + iri_free (iri); + return status; } From 723dbfc818e3e5b22ec53fd093dca999290ebead Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 18:26:53 +0200 Subject: [PATCH 47/55] Correct iri handling while fetching a remote file list with -i and provide a test --- src/main.c | 2 +- src/recur.c | 15 +++- src/recur.h | 2 +- src/retr.c | 8 +- tests/Test-iri-list.px | 173 +++++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 6 files changed, 195 insertions(+), 6 deletions(-) create mode 100755 tests/Test-iri-list.px diff --git a/src/main.c b/src/main.c index 79c35220..8d8d93fa 100644 --- a/src/main.c +++ b/src/main.c @@ -1196,7 +1196,7 @@ WARNING: Can't reopen standard output in binary mode;\n\ if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - status = retrieve_tree (*t); + status = retrieve_tree (*t, NULL); opt.follow_ftp = old_follow_ftp; } diff --git a/src/recur.c b/src/recur.c index 71fbe7bf..921c60c7 100644 --- a/src/recur.c +++ b/src/recur.c @@ -187,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int, options, add it to the queue. */ uerr_t -retrieve_tree (const char *start_url) +retrieve_tree (const char *start_url, struct iri *pi) { uerr_t status = RETROK; @@ -201,7 +201,18 @@ retrieve_tree (const char *start_url) int up_error_code; struct url *start_url_parsed; struct iri *i = iri_new (); - set_uri_encoding (i, opt.locale, true); + +#define COPYSTR(x) (x) ? 
xstrdup(x) : NULL; + /* Duplicate pi struct if not NULL */ + if (pi) + { + i->uri_encoding = COPYSTR (pi->uri_encoding); + i->content_encoding = COPYSTR (pi->content_encoding); + i->utf8_encode = pi->utf8_encode; + } + else + set_uri_encoding (i, opt.locale, true); +#undef COPYSTR start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) diff --git a/src/recur.h b/src/recur.h index 5ab26a95..515a382b 100644 --- a/src/recur.h +++ b/src/recur.h @@ -42,6 +42,6 @@ as that of the covered work. */ struct urlpos; void recursive_cleanup (void); -uerr_t retrieve_tree (const char *); +uerr_t retrieve_tree (const char *, struct iri *); #endif /* RECUR_H */ diff --git a/src/retr.c b/src/retr.c index 4731d9ee..963d5044 100644 --- a/src/retr.c +++ b/src/retr.c @@ -651,7 +651,6 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { - /* sXXXav : could a proxy include a path ??? */ struct iri *pi = iri_new (); set_uri_encoding (pi, opt.locale, true); pi->utf8_encode = false; @@ -858,6 +857,7 @@ retrieve_from_file (const char *file, bool html, int *count) *count = 0; /* Reset the URL count. 
*/ /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); set_content_encoding (iri, opt.locale); if (url_has_scheme (url)) @@ -894,6 +894,10 @@ retrieve_from_file (const char *file, bool html, int *count) status = QUOTEXC; break; } + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { @@ -903,7 +907,7 @@ retrieve_from_file (const char *file, bool html, int *count) if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - status = retrieve_tree (cur_url->url->url); + status = retrieve_tree (cur_url->url->url, iri); opt.follow_ftp = old_follow_ftp; } diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px new file mode 100755 index 00000000..51bb09fe --- /dev/null +++ b/tests/Test-iri-list.px @@ -0,0 +1,173 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html meta + file = utf-8, mime =iso-8859-1 +# + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; + +my $urllist = < + + Main Page + + +

+ Main page. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ French page. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + + +

+ Dutch page. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/url_list.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $urllist, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri -d -i http://localhost:{{port}}/url_list.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'url_list.txt' => { + content => $urllist, + }, + 'index.html' => { + content => $pageindex, + }, + "p1_fran${ccedilla_l1}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-list", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 172adcd7..51dec828 100755 --- a/tests/run-px +++ b/tests/run-px @@ -25,6 +25,7 @@ my @tests = ( 'Test-iri.px', 'Test-iri-disabled.px', 'Test-iri-forced-remote.px', + 'Test-iri-list.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px', From a5c222fa798673319e930e944d8d59cd906361fc Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 18:31:03 +0200 Subject: [PATCH 48/55] Update tests/Changelog for Test-iri-list.px --- tests/ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ChangeLog b/tests/ChangeLog index ad18c14a..f2179763 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,7 @@ +2008-08-14 Xavier Saint + + * Test-iri-list.px : Fetch files from a remote list. 
+ 2008-08-03 Xavier Saint * Test-iri.px : HTTP recursive fetch for testing IRI support and From 1063191b33579ef411e17881125e926573839560 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Fri, 15 Aug 2008 14:41:15 +0200 Subject: [PATCH 49/55] Fix a double quoting --- src/iri.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iri.c b/src/iri.c index ea4046af..e3909d50 100644 --- a/src/iri.c +++ b/src/iri.c @@ -316,7 +316,7 @@ iri_free (struct iri *i) void set_uri_encoding (struct iri *i, char *charset, bool force) { - DEBUGP (("URI encoding = `%s'\n", charset ? quote (charset) : "None")); + DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None")); if (!force && opt.encoding_remote) return; if (i->uri_encoding) From 5133d573667c75e5af2de1a4797d7610b05900a5 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Fri, 15 Aug 2008 15:03:38 +0200 Subject: [PATCH 50/55] Fixes: use encoding got from the remote server serving the list file and use quote_n() for quote'ing several args --- src/retr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/retr.c b/src/retr.c index 0aa95072..28a6d874 100644 --- a/src/retr.c +++ b/src/retr.c @@ -636,8 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } - DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote (url), - iri->uri_encoding ? quote (iri->uri_encoding) : "None", + DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url), + iri->uri_encoding ? 
quote_n (1, iri->uri_encoding) : "None", iri->utf8_encode)); if (!refurl) @@ -880,6 +880,10 @@ retrieve_from_file (const char *file, bool html, int *count) if (dt & TEXTHTML) html = true; + + /* If we have found a content encoding, use it */ + if (iri->content_encoding) + set_uri_encoding (iri, iri->content_encoding, false); } else input_file = (char *) file; From 26a3eea8e2f42c621ce6c40a93acf5ff1cd12220 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Fri, 15 Aug 2008 15:15:42 +0200 Subject: [PATCH 51/55] Removed commented *printf and use quote_n() for quoting several args --- src/html-url.c | 1 - src/main.c | 2 -- src/recur.c | 4 ++-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/html-url.c b/src/html-url.c index cbaffb25..c954cb97 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -571,7 +571,6 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ xfree_null (meta_charset); meta_charset = mcharset; } diff --git a/src/main.c b/src/main.c index c080394e..414b62bc 100644 --- a/src/main.c +++ b/src/main.c @@ -1076,8 +1076,6 @@ for details.\n\n")); if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) opt.encoding_remote = NULL; - - /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/ } #else if (opt.enable_iri || opt.locale || opt.encoding_remote) diff --git a/src/recur.c b/src/recur.c index a0bb8681..78682458 100644 --- a/src/recur.c +++ b/src/recur.c @@ -115,8 +115,8 @@ url_enqueue (struct url_queue *queue, struct iri *i, DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); if (i) - DEBUGP (("[IRI Enqueuing %s with %s\n", quote (url), - i->uri_encoding ? quote (i->uri_encoding) : "None")); + DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url), + i->uri_encoding ? 
quote_n (1, i->uri_encoding) : "None")); if (queue->tail) queue->tail->next = qel; From ab0b0a40904c6cbd4a21d61c6acf31ff11c30a71 Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Thu, 28 Aug 2008 02:45:29 -0700 Subject: [PATCH 52/55] IDN test. --- tests/ChangeLog | 12 +++++++ tests/HTTPServer.pm | 3 +- tests/Test-idn-headers.px | 65 ++++++++++++++++++++++++++++++++++ tests/Test-proxy-auth-basic.px | 2 +- tests/run-px | 1 + 5 files changed, 81 insertions(+), 2 deletions(-) create mode 100755 tests/Test-idn-headers.px diff --git a/tests/ChangeLog b/tests/ChangeLog index f2179763..7eb37563 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,15 @@ +2008-08-28 Micah Cowan + + * HTTPServer.pm (run): Allow distinguishing between hostnames, + when used as a proxy. + + * Test-idn-headers.px: Added. + + * run-px: Added Test-idn-headers.px. + + * Test-proxy-auth-basic.px: Use the full URL, rather than just the + path (made necessary by the accompanying change to HTTPServer.pm). + 2008-08-14 Xavier Saint * Test-iri-list.px : Fetch files from a remote list. 
diff --git a/tests/HTTPServer.pm b/tests/HTTPServer.pm index b76f0985..01c36957 100644 --- a/tests/HTTPServer.pm +++ b/tests/HTTPServer.pm @@ -27,7 +27,8 @@ sub run { my $con = $self->accept(); print STDERR "Accepted a new connection\n" if $log; while (my $req = $con->get_request) { - my $url_path = $req->url->path; + #my $url_path = $req->url->path; + my $url_path = $req->url->as_string; if ($url_path =~ m{/$}) { # append 'index.html' $url_path .= 'index.html'; } diff --git a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px new file mode 100755 index 00000000..3289d5f5 --- /dev/null +++ b/tests/Test-idn-headers.px @@ -0,0 +1,65 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# " Kon'nichiwa Japan +my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334"; +my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a'; + +############################################################################### + +my $starter_file = <The link +EOF + +my $result_file = < { + code => "200", + msg => "You want fries with that?", + headers => { + 'Content-Type' => 'text/html; charset=EUC-JP', + }, + content => $starter_file, + }, + "http://$punycoded_hostname/index.html" => { + code => "200", + msg => "Yes, please", + headers => { + 'Content-Type' => 'text/plain', + }, + content => $result_file, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH" + . 
" -e http_proxy=localhost:{{port}} http://start-here.com/start.html"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'start-here.com/start.html' => { + content => $starter_file, + }, + "$punycoded_hostname/index.html" => { + content => $result_file, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-headers", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-proxy-auth-basic.px b/tests/Test-proxy-auth-basic.px index e440a392..e3934d7d 100755 --- a/tests/Test-proxy-auth-basic.px +++ b/tests/Test-proxy-auth-basic.px @@ -11,7 +11,7 @@ my $wholefile = "You're all authenticated.\n"; # code, msg, headers, content my %urls = ( - '/needs-auth.txt' => { + 'http://no.such.domain/needs-auth.txt' => { auth_method => 'Basic', user => 'fiddle-dee-dee', passwd => 'Dodgson', diff --git a/tests/run-px b/tests/run-px index c18c8d85..c2380d5b 100755 --- a/tests/run-px +++ b/tests/run-px @@ -23,6 +23,7 @@ my @tests = ( 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', + 'Test-idn-headers.px', 'Test-iri.px', 'Test-iri-disabled.px', 'Test-iri-forced-remote.px', From 171c71e09cc710e82c2fa6f3c4d08a678083b346 Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Thu, 28 Aug 2008 12:47:17 -0700 Subject: [PATCH 53/55] Test for IDN, based on meta-specified encoding. 
--- tests/ChangeLog | 4 +-- tests/Test-idn-meta.px | 66 ++++++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100755 tests/Test-idn-meta.px diff --git a/tests/ChangeLog b/tests/ChangeLog index 7eb37563..867a82ec 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -3,9 +3,9 @@ * HTTPServer.pm (run): Allow distinguishing between hostnames, when used as a proxy. - * Test-idn-headers.px: Added. + * Test-idn-headers.px, Test-idn-meta.px: Added. - * run-px: Added Test-idn-headers.px. + * run-px: Added Test-idn-headers.px, Test-idn-meta.px. * Test-proxy-auth-basic.px: Use the full URL, rather than just the path (made necessary by the accompanying change to HTTPServer.pm). diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px new file mode 100755 index 00000000..1397cf45 --- /dev/null +++ b/tests/Test-idn-meta.px @@ -0,0 +1,66 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# " Kon'nichiwa Japan +my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334"; +my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a'; + +############################################################################### + +my $starter_file = < +The link +EOF + +my $result_file = < { + code => "200", + msg => "You want fries with that?", + headers => { + 'Content-Type' => 'text/html; charset=UTF-8', + }, + content => $starter_file, + }, + "http://$punycoded_hostname/index.html" => { + code => "200", + msg => "Yes, please", + headers => { + 'Content-Type' => 'text/plain', + }, + content => $result_file, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH" + . 
" -e http_proxy=localhost:{{port}} http://start-here.com/start.html"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'start-here.com/start.html' => { + content => $starter_file, + }, + "$punycoded_hostname/index.html" => { + content => $result_file, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-meta", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index c2380d5b..50f33218 100755 --- a/tests/run-px +++ b/tests/run-px @@ -24,6 +24,7 @@ my @tests = ( 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', 'Test-idn-headers.px', + 'Test-idn-meta.px', 'Test-iri.px', 'Test-iri-disabled.px', 'Test-iri-forced-remote.px', From 523c3dfcbc3e6858ea94288554d67d3c1208a7c1 Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Tue, 9 Sep 2008 21:55:02 -0700 Subject: [PATCH 54/55] Test-idn-cmd.px. --- tests/ChangeLog | 6 ++++++ tests/Test-idn-cmd.px | 50 +++++++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 3 files changed, 57 insertions(+) create mode 100755 tests/Test-idn-cmd.px diff --git a/tests/ChangeLog b/tests/ChangeLog index 867a82ec..7751be64 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2008-09-09 Micah Cowan + + * Test-idn-cmd.px: Added. + + * run-px: Added Test-idn-cmd.px. 
+ 2008-08-28 Micah Cowan * HTTPServer.pm (run): Allow distinguishing between hostnames, diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px new file mode 100755 index 00000000..a5c156a2 --- /dev/null +++ b/tests/Test-idn-cmd.px @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# " Kon'nichiwa Japan +my $euc_jp_hostname = "\272\243\306\374\244\317.\306\374\313\334"; +my $punycoded_hostname = 'xn--v9ju72g90p.xn--wgv71a'; + +############################################################################### + +my $result_file = < { + code => "200", + msg => "Yes, please", + headers => { + 'Content-Type' => 'text/plain', + }, + content => $result_file, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --debug --iri -rH" + . " -e http_proxy=localhost:{{port}} --locale=EUC-JP $euc_jp_hostname"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "$punycoded_hostname/index.html" => { + content => $result_file, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-cmd", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 50f33218..38520714 100755 --- a/tests/run-px +++ b/tests/run-px @@ -25,6 +25,7 @@ my @tests = ( 'Test-HTTP-Content-Disposition.px', 'Test-idn-headers.px', 'Test-idn-meta.px', + 'Test-idn-cmd.px', 'Test-iri.px', 'Test-iri-disabled.px', 'Test-iri-forced-remote.px', From 0fa023cfffc896d72ba36a8789154630e585435a Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Wed, 26 Nov 2008 07:14:27 -0800 Subject: [PATCH 55/55] More module-scoped warnings. 
--- tests/ChangeLog | 8 ++++++++ tests/Test-ftp-iri-disabled.px | 3 ++- tests/Test-ftp-iri-fallback.px | 3 ++- tests/Test-ftp-iri.px | 3 ++- tests/Test-idn-cmd.px | 3 ++- tests/Test-idn-headers.px | 3 ++- tests/Test-idn-meta.px | 3 ++- tests/Test-iri-disabled.px | 3 ++- tests/Test-iri-forced-remote.px | 3 ++- tests/Test-iri-list.px | 3 ++- tests/Test-iri.px | 3 ++- 11 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/ChangeLog b/tests/ChangeLog index 25f2ab40..ffe5fddd 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,11 @@ +2008-11-26 Micah Cowan (not copyrightable) + + * Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px, + Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px, + Test-idn-meta.px, Test-iri-disabled.px, + Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More + module-scope warnings. + 2008-11-12 Steven Schubiger * Test-auth-basic.px, Test-auth-no-challenge.px, diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px index 14d849da..96122867 100755 --- a/tests/Test-ftp-iri-disabled.px +++ b/tests/Test-ftp-iri-disabled.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use FTPTest; diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px index 8902e0f9..091fd008 100755 --- a/tests/Test-ftp-iri-fallback.px +++ b/tests/Test-ftp-iri-fallback.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use FTPTest; diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px index d453669c..78e2622c 100755 --- a/tests/Test-ftp-iri.px +++ b/tests/Test-ftp-iri.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use FTPTest; diff --git a/tests/Test-idn-cmd.px b/tests/Test-idn-cmd.px index a5c156a2..dba98183 100755 --- a/tests/Test-idn-cmd.px +++ b/tests/Test-idn-cmd.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git 
a/tests/Test-idn-headers.px b/tests/Test-idn-headers.px index 3289d5f5..f07621c3 100755 --- a/tests/Test-idn-headers.px +++ b/tests/Test-idn-headers.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git a/tests/Test-idn-meta.px b/tests/Test-idn-meta.px index 1397cf45..3d6e0563 100755 --- a/tests/Test-idn-meta.px +++ b/tests/Test-idn-meta.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px index 17e43361..02fc4d3a 100755 --- a/tests/Test-iri-disabled.px +++ b/tests/Test-iri-disabled.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px index 1acd03a7..8341d516 100755 --- a/tests/Test-iri-forced-remote.px +++ b/tests/Test-iri-forced-remote.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px index 51bb09fe..87cc33c8 100755 --- a/tests/Test-iri-list.px +++ b/tests/Test-iri-list.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest; diff --git a/tests/Test-iri.px b/tests/Test-iri.px index d228721c..662019e7 100755 --- a/tests/Test-iri.px +++ b/tests/Test-iri.px @@ -1,6 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use strict; +use warnings; use HTTPTest;