2008-06-20 04:07:03 +08:00
|
|
|
/* IRI related functions.
|
2011-01-01 20:19:37 +08:00
|
|
|
Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
|
2008-06-20 04:07:03 +08:00
|
|
|
|
|
|
|
This file is part of GNU Wget.
|
|
|
|
|
|
|
|
GNU Wget is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 3 of the License, or (at
|
|
|
|
your option) any later version.
|
|
|
|
|
|
|
|
GNU Wget is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with Wget. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
Additional permission under GNU GPL version 3 section 7
|
|
|
|
|
|
|
|
If you modify this program, or any covered work, by linking or
|
|
|
|
combining it with the OpenSSL project's OpenSSL library (or a
|
|
|
|
modified version of that library), containing parts covered by the
|
|
|
|
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
|
|
|
|
grants you additional permission to convey the resulting work.
|
|
|
|
Corresponding Source for a non-source form of such a combination
|
|
|
|
shall include the source code for the parts of OpenSSL used as well
|
|
|
|
as that of the covered work. */
|
|
|
|
|
|
|
|
#include "wget.h"
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
2008-06-26 23:59:07 +08:00
|
|
|
#include <iconv.h>
|
2008-06-20 05:53:03 +08:00
|
|
|
#include <stringprep.h>
|
2008-06-26 23:59:07 +08:00
|
|
|
#include <idna.h>
|
|
|
|
#include <errno.h>
|
2008-06-20 05:53:03 +08:00
|
|
|
|
2008-06-20 04:07:03 +08:00
|
|
|
#include "utils.h"
|
2014-10-06 22:32:37 +08:00
|
|
|
#include "url.h"
|
2014-11-20 17:52:25 +08:00
|
|
|
#include "c-strcase.h"
|
2008-06-20 04:07:03 +08:00
|
|
|
|
2008-07-21 02:37:22 +08:00
|
|
|
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
|
|
|
|
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
|
|
|
|
|
2008-07-21 01:08:28 +08:00
|
|
|
/* Note: locale encoding is kept in options struct (opt.locale) */
|
|
|
|
|
2008-06-20 04:07:03 +08:00
|
|
|
/* Look for a "charset=" token (matched without regard to case) in STR and
   extract the encoding name that follows it.  The name runs up to the first
   whitespace character or the end of the string.  Return a copy of the name
   that the caller must release, or NULL when STR is NULL or empty, when no
   "charset=" token is present, or when the name is rejected by
   check_encoding_name.  */
char *
parse_charset (char *str)
{
  char *start;
  char *end;
  char *name;

  if (str == NULL || *str == '\0')
    return NULL;

  start = strcasestr (str, "charset=");
  if (start == NULL)
    return NULL;

  /* Step over the 8 characters of "charset=" itself.  */
  start += 8;

  /* sXXXav: which chars should be banned ??? */
  for (end = start; *end != '\0' && !c_isspace (*end); end++)
    ;

  /* sXXXav: could strdupdelim return NULL ? */
  name = strdupdelim (start, end);

  /* Do a minimum check on the charset value */
  if (check_encoding_name (name))
    return name;

  xfree (name);
  return NULL;
}
|
|
|
|
|
2008-06-20 05:10:06 +08:00
|
|
|
/* Return the name of the charset used by the current locale, exactly as
   reported by stringprep_locale_charset (the const qualifier is cast
   away).  */
char *
find_locale (void)
{
  char *charset = (char *) stringprep_locale_charset ();

  return charset;
}
|
|
|
|
|
2008-06-20 06:33:02 +08:00
|
|
|
/* Basic check of an encoding name. */
|
|
|
|
bool
|
|
|
|
check_encoding_name (char *encoding)
|
|
|
|
{
|
|
|
|
char *s = encoding;
|
|
|
|
|
|
|
|
while (*s)
|
|
|
|
{
|
2008-07-02 01:28:24 +08:00
|
|
|
if (!c_isascii (*s) || c_isspace (*s))
|
2008-06-20 06:33:02 +08:00
|
|
|
{
|
2009-07-06 00:46:13 +08:00
|
|
|
logprintf (LOG_VERBOSE, _("Encoding %s isn't valid\n"), quote (encoding));
|
2008-06-20 06:33:02 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
s++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2008-07-21 01:08:28 +08:00
|
|
|
/* Do the conversion according to the passed conversion descriptor cd. *out
|
2008-08-04 04:30:12 +08:00
|
|
|
will contain the transcoded string on success. *out content is
|
2008-07-21 01:08:28 +08:00
|
|
|
unspecified otherwise. */
|
2008-06-26 23:59:07 +08:00
|
|
|
static bool
|
2014-10-06 22:32:37 +08:00
|
|
|
do_conversion (const char *tocode, const char *fromcode, char *in, size_t inlen, char **out)
|
2008-06-26 23:59:07 +08:00
|
|
|
{
|
2014-10-06 22:32:37 +08:00
|
|
|
iconv_t cd;
|
2008-06-26 23:59:07 +08:00
|
|
|
/* sXXXav : hummm hard to guess... */
|
2014-10-06 22:32:37 +08:00
|
|
|
size_t len, done, outlen;
|
2008-06-26 23:59:07 +08:00
|
|
|
int invalid = 0, tooshort = 0;
|
2014-10-06 22:32:37 +08:00
|
|
|
char *s, *in_org, *in_save;
|
2008-06-26 23:59:07 +08:00
|
|
|
|
2014-10-06 22:32:37 +08:00
|
|
|
cd = iconv_open (tocode, fromcode);
|
|
|
|
if (cd == (iconv_t)(-1))
|
|
|
|
{
|
2014-11-06 04:57:18 +08:00
|
|
|
logprintf (LOG_VERBOSE, _("Conversion from %s to UTF-8 isn't supported\n"),
|
|
|
|
quote (opt.locale));
|
|
|
|
*out = NULL;
|
2014-10-06 22:32:37 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* iconv() has to work on an unescaped string */
|
|
|
|
in_org = in;
|
|
|
|
in_save = in = strndup(in, inlen);
|
|
|
|
url_unescape(in);
|
|
|
|
inlen = strlen(in);
|
|
|
|
|
|
|
|
len = outlen = inlen * 2;
|
|
|
|
*out = s = xmalloc (outlen + 1);
|
2008-06-26 23:59:07 +08:00
|
|
|
done = 0;
|
|
|
|
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
|
|
|
|
{
|
|
|
|
*out = s;
|
|
|
|
*(s + len - outlen - done) = '\0';
|
2014-10-06 22:32:37 +08:00
|
|
|
xfree(in_save);
|
|
|
|
iconv_close(cd);
|
2014-11-06 04:57:18 +08:00
|
|
|
DEBUGP (("converted '%s' (%s) -> '%s' (%s)\n", in_org, fromcode, *out, tocode));
|
2008-06-26 23:59:07 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Incomplete or invalid multibyte sequence */
|
|
|
|
if (errno == EINVAL || errno == EILSEQ)
|
|
|
|
{
|
2008-07-02 01:34:37 +08:00
|
|
|
if (!invalid)
|
|
|
|
logprintf (LOG_VERBOSE,
|
2009-07-28 10:58:06 +08:00
|
|
|
_("Incomplete or invalid multibyte sequence encountered\n"));
|
2008-07-02 01:34:37 +08:00
|
|
|
|
2008-06-26 23:59:07 +08:00
|
|
|
invalid++;
|
|
|
|
**out = *in;
|
|
|
|
in++;
|
|
|
|
inlen--;
|
|
|
|
(*out)++;
|
|
|
|
outlen--;
|
|
|
|
}
|
2008-07-20 19:10:02 +08:00
|
|
|
else if (errno == E2BIG) /* Output buffer full */
|
2008-06-26 23:59:07 +08:00
|
|
|
{
|
|
|
|
char *new;
|
|
|
|
|
|
|
|
tooshort++;
|
|
|
|
done = len;
|
|
|
|
outlen = done + inlen * 2;
|
|
|
|
new = xmalloc (outlen + 1);
|
|
|
|
memcpy (new, s, done);
|
|
|
|
xfree (s);
|
|
|
|
s = new;
|
|
|
|
len = outlen;
|
|
|
|
*out = s + done;
|
|
|
|
}
|
|
|
|
else /* Weird, we got an unspecified error */
|
|
|
|
{
|
2009-07-06 00:46:13 +08:00
|
|
|
logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
|
2008-06-26 23:59:07 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-06 22:32:37 +08:00
|
|
|
xfree(in_save);
|
|
|
|
iconv_close(cd);
|
2014-11-06 04:57:18 +08:00
|
|
|
DEBUGP (("converted '%s' (%s) -> '%s' (%s)\n", in_org, fromcode, *out, tocode));
|
2008-06-26 23:59:07 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-10-06 22:32:37 +08:00
|
|
|
/* Try converting string str from locale to UTF-8. Return a new string
|
|
|
|
on success, or str on error or if conversion isn't needed. */
|
|
|
|
const char *
|
|
|
|
locale_to_utf8 (const char *str)
|
|
|
|
{
|
|
|
|
char *new;
|
|
|
|
|
|
|
|
/* That shouldn't happen, just in case */
|
|
|
|
if (!opt.locale)
|
|
|
|
{
|
|
|
|
logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n"));
|
|
|
|
opt.locale = find_locale ();
|
|
|
|
}
|
|
|
|
|
2014-11-20 17:52:25 +08:00
|
|
|
if (!opt.locale || !c_strcasecmp (opt.locale, "utf-8"))
|
2014-10-06 22:32:37 +08:00
|
|
|
return str;
|
|
|
|
|
|
|
|
if (do_conversion ("UTF-8", opt.locale, (char *) str, strlen ((char *) str), &new))
|
|
|
|
return (const char *) new;
|
|
|
|
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2008-07-21 01:08:28 +08:00
|
|
|
/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
|
2008-06-26 23:59:07 +08:00
|
|
|
on error. */
|
2008-07-20 19:10:02 +08:00
|
|
|
char *
|
2008-07-24 06:56:29 +08:00
|
|
|
idn_encode (struct iri *i, char *host)
|
2008-06-26 23:59:07 +08:00
|
|
|
{
|
|
|
|
char *new;
|
|
|
|
int ret;
|
|
|
|
|
2008-07-24 06:56:29 +08:00
|
|
|
/* Encode to UTF-8 if not done */
|
|
|
|
if (!i->utf8_encode)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
2008-07-24 06:56:29 +08:00
|
|
|
if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
|
2008-08-04 04:30:12 +08:00
|
|
|
return NULL; /* Nothing to encode or an error occured */
|
2008-07-20 19:10:02 +08:00
|
|
|
host = new;
|
|
|
|
}
|
|
|
|
|
2008-06-26 23:59:07 +08:00
|
|
|
/* toASCII UTF-8 NULL terminated string */
|
2008-07-21 02:37:22 +08:00
|
|
|
ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
|
2008-06-26 23:59:07 +08:00
|
|
|
if (ret != IDNA_SUCCESS)
|
|
|
|
{
|
2008-07-20 19:10:02 +08:00
|
|
|
/* sXXXav : free new when needed ! */
|
2009-07-06 00:46:13 +08:00
|
|
|
logprintf (LOG_VERBOSE, _("idn_encode failed (%d): %s\n"), ret,
|
2008-06-26 23:59:07 +08:00
|
|
|
quote (idna_strerror (ret)));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return new;
|
|
|
|
}
|
2008-06-20 06:33:02 +08:00
|
|
|
|
2008-07-21 01:08:28 +08:00
|
|
|
/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
|
|
|
|
on success or NULL on error. */
|
2008-07-20 19:10:02 +08:00
|
|
|
char *
|
|
|
|
idn_decode (char *host)
|
2008-07-02 22:37:28 +08:00
|
|
|
{
|
|
|
|
char *new;
|
|
|
|
int ret;
|
|
|
|
|
2008-07-21 02:37:22 +08:00
|
|
|
ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
|
2008-07-02 22:37:28 +08:00
|
|
|
if (ret != IDNA_SUCCESS)
|
|
|
|
{
|
2009-07-06 00:46:13 +08:00
|
|
|
logprintf (LOG_VERBOSE, _("idn_decode failed (%d): %s\n"), ret,
|
2008-07-02 22:37:28 +08:00
|
|
|
quote (idna_strerror (ret)));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return new;
|
|
|
|
}
|
|
|
|
|
2008-07-21 01:08:28 +08:00
|
|
|
/* Try to transcode string str from remote encoding to UTF-8. On success, *new
|
|
|
|
contains the transcoded string. *new content is unspecified otherwise. */
|
2008-07-20 19:10:02 +08:00
|
|
|
bool
|
2014-05-12 05:20:49 +08:00
|
|
|
remote_to_utf8 (struct iri *iri, const char *str, const char **new)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
|
|
|
bool ret = false;
|
|
|
|
|
2014-05-12 05:20:49 +08:00
|
|
|
if (!iri->uri_encoding)
|
2008-07-20 19:10:02 +08:00
|
|
|
return false;
|
|
|
|
|
2011-07-26 15:27:08 +08:00
|
|
|
/* When `i->uri_encoding' == "UTF-8" there is nothing to convert. But we must
|
|
|
|
test for non-ASCII symbols for correct hostname processing in `idn_encode'
|
|
|
|
function. */
|
2014-11-20 17:52:25 +08:00
|
|
|
if (!c_strcasecmp (iri->uri_encoding, "UTF-8"))
|
2011-07-26 15:27:08 +08:00
|
|
|
{
|
2014-05-12 05:20:49 +08:00
|
|
|
const char *p = str;
|
|
|
|
for (p = str; *p; p++)
|
2014-11-03 21:17:02 +08:00
|
|
|
if (*p < 0 || *p > 127)
|
2011-07-26 15:27:08 +08:00
|
|
|
{
|
|
|
|
*new = strdup (str);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-10-06 22:32:37 +08:00
|
|
|
if (do_conversion ("UTF-8", iri->uri_encoding, (char *) str, strlen (str), (char **) new))
|
2008-07-20 19:10:02 +08:00
|
|
|
ret = true;
|
|
|
|
|
|
|
|
/* Test if something was converted */
|
2014-11-06 04:57:18 +08:00
|
|
|
if (*new && !strcmp (str, *new))
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
|
|
|
xfree ((char *) *new);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-08-04 04:30:12 +08:00
|
|
|
/* Allocate a new iri structure and return a pointer to it. */
|
2008-07-24 06:56:29 +08:00
|
|
|
struct iri *
|
|
|
|
iri_new (void)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
2009-06-29 16:07:12 +08:00
|
|
|
struct iri *i = xmalloc (sizeof *i);
|
2008-07-24 06:56:29 +08:00
|
|
|
i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
|
|
|
|
i->content_encoding = NULL;
|
2008-09-27 17:13:21 +08:00
|
|
|
i->orig_url = NULL;
|
2008-07-24 06:56:29 +08:00
|
|
|
i->utf8_encode = opt.enable_iri;
|
2008-08-02 17:22:14 +08:00
|
|
|
return i;
|
2008-07-20 19:10:02 +08:00
|
|
|
}
|
|
|
|
|
2009-06-29 16:07:12 +08:00
|
|
|
struct iri *iri_dup (const struct iri *src)
|
|
|
|
{
|
|
|
|
struct iri *i = xmalloc (sizeof *i);
|
|
|
|
i->uri_encoding = src->uri_encoding ? xstrdup (src->uri_encoding) : NULL;
|
|
|
|
i->content_encoding = (src->content_encoding ?
|
|
|
|
xstrdup (src->content_encoding) : NULL);
|
|
|
|
i->orig_url = src->orig_url ? xstrdup (src->orig_url) : NULL;
|
|
|
|
i->utf8_encode = src->utf8_encode;
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
2008-08-04 04:30:12 +08:00
|
|
|
/* Completely free an iri structure. */
|
2008-07-24 06:56:29 +08:00
|
|
|
void
|
|
|
|
iri_free (struct iri *i)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
2008-07-24 06:56:29 +08:00
|
|
|
xfree_null (i->uri_encoding);
|
|
|
|
xfree_null (i->content_encoding);
|
2008-09-27 17:13:21 +08:00
|
|
|
xfree_null (i->orig_url);
|
2008-07-24 06:56:29 +08:00
|
|
|
xfree (i);
|
2008-07-20 19:10:02 +08:00
|
|
|
}
|
|
|
|
|
2008-08-04 04:30:12 +08:00
|
|
|
/* Set uri_encoding of struct iri i. If a remote encoding was specified, use
|
|
|
|
it unless force is true. */
|
2008-07-20 19:10:02 +08:00
|
|
|
void
|
2008-07-30 16:15:55 +08:00
|
|
|
set_uri_encoding (struct iri *i, char *charset, bool force)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
2008-08-15 20:41:15 +08:00
|
|
|
DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
|
2008-07-30 16:15:55 +08:00
|
|
|
if (!force && opt.encoding_remote)
|
2008-07-24 06:56:29 +08:00
|
|
|
return;
|
|
|
|
if (i->uri_encoding)
|
2008-07-22 01:34:22 +08:00
|
|
|
{
|
2014-11-20 17:52:25 +08:00
|
|
|
if (charset && !c_strcasecmp (i->uri_encoding, charset))
|
2008-07-22 01:34:22 +08:00
|
|
|
return;
|
2008-07-24 06:56:29 +08:00
|
|
|
xfree (i->uri_encoding);
|
2008-07-22 01:34:22 +08:00
|
|
|
}
|
2008-07-24 06:56:29 +08:00
|
|
|
|
|
|
|
i->uri_encoding = charset ? xstrdup (charset) : NULL;
|
2008-07-20 19:10:02 +08:00
|
|
|
}
|
|
|
|
|
2008-08-04 04:30:12 +08:00
|
|
|
/* Set content_encoding of struct iri i. */
|
2008-07-20 19:10:02 +08:00
|
|
|
void
|
2008-07-24 06:56:29 +08:00
|
|
|
set_content_encoding (struct iri *i, char *charset)
|
2008-07-20 19:10:02 +08:00
|
|
|
{
|
2008-08-04 04:30:12 +08:00
|
|
|
DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
|
2008-07-24 06:56:29 +08:00
|
|
|
if (opt.encoding_remote)
|
|
|
|
return;
|
|
|
|
if (i->content_encoding)
|
2008-07-22 01:34:22 +08:00
|
|
|
{
|
2014-11-20 17:52:25 +08:00
|
|
|
if (charset && !c_strcasecmp (i->content_encoding, charset))
|
2008-07-22 01:34:22 +08:00
|
|
|
return;
|
2008-07-24 06:56:29 +08:00
|
|
|
xfree (i->content_encoding);
|
2008-07-22 01:34:22 +08:00
|
|
|
}
|
2008-07-20 19:10:02 +08:00
|
|
|
|
2008-07-24 06:56:29 +08:00
|
|
|
i->content_encoding = charset ? xstrdup (charset) : NULL;
|
2008-07-21 00:47:52 +08:00
|
|
|
}
|