mirror of
https://github.com/mirror/wget.git
synced 2025-01-21 01:30:32 +08:00
Basic support of IRIs.
This commit is contained in:
parent
e6376b4743
commit
5bb11da009
@ -1,3 +1,12 @@
|
||||
2008-06-26 Xavier Saint <wget@sxav.eu>
|
||||
|
||||
* iri.c, iri.h : New functions locale_to_utf8() and
|
||||
idn_encode() adding basic capabilities of IRI/IDN.
|
||||
|
||||
* url.c : Convert URLs from locale to UTF-8 allowing a basic
|
||||
support of IRI/IDN.
|
||||
|
||||
|
||||
2008-06-19 Xavier Saint <wget@sxav.eu>
|
||||
|
||||
* iri.c, iri.h : New function check_encoding_name() as
|
||||
|
134
src/iri.c
134
src/iri.c
@ -34,13 +34,22 @@ as that of the covered work. */
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iconv.h>
|
||||
#include <stringprep.h>
|
||||
#include <idna.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "utils.h"
|
||||
#include "iri.h"
|
||||
|
||||
|
||||
static iconv_t locale2utf8;
|
||||
|
||||
|
||||
static bool open_locale_to_utf8 (void);
|
||||
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
|
||||
|
||||
|
||||
/* Given a string containing "charset=XXX", return the encoding if found,
|
||||
or NULL otherwise */
|
||||
char *
|
||||
@ -77,7 +86,6 @@ parse_charset (char *str)
|
||||
return charset;
|
||||
}
|
||||
|
||||
|
||||
/* Find the locale used, or fall back on a default value */
|
||||
char *
|
||||
find_locale (void)
|
||||
@ -86,7 +94,6 @@ find_locale (void)
|
||||
return (char *) stringprep_locale_charset ();
|
||||
}
|
||||
|
||||
|
||||
/* Basic check of an encoding name. */
|
||||
bool
|
||||
check_encoding_name (char *encoding)
|
||||
@ -107,4 +114,125 @@ check_encoding_name (char *encoding)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
|
||||
static bool
|
||||
open_locale_to_utf8 (void)
|
||||
{
|
||||
if (locale2utf8)
|
||||
return true;
|
||||
|
||||
/* sXXXav : That shouldn't happen, just in case */
|
||||
if (!opt.locale)
|
||||
{
|
||||
logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
|
||||
opt.locale = find_locale ();
|
||||
}
|
||||
|
||||
if (!opt.locale)
|
||||
return false;
|
||||
|
||||
locale2utf8 = iconv_open ("UTF-8", opt.locale);
|
||||
if (locale2utf8 != (iconv_t)(-1))
|
||||
return true;
|
||||
|
||||
logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
|
||||
quote (opt.locale), quote("UTF-8"));
|
||||
locale2utf8 = NULL;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Return a new string */
|
||||
const char *
|
||||
locale_to_utf8 (const char *str)
|
||||
{
|
||||
char *new;
|
||||
|
||||
if (!strcasecmp (opt.locale, "utf-8"))
|
||||
return str;
|
||||
|
||||
if (!open_locale_to_utf8 ())
|
||||
return str;
|
||||
|
||||
if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
|
||||
return (const char *) new;
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
/* */
|
||||
static bool
|
||||
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
|
||||
{
|
||||
/* sXXXav : hummm hard to guess... */
|
||||
size_t len, done, outlen = inlen * 2;
|
||||
int invalid = 0, tooshort = 0;
|
||||
char *s;
|
||||
|
||||
s = xmalloc (outlen + 1);
|
||||
*out = s;
|
||||
len = outlen;
|
||||
done = 0;
|
||||
|
||||
/* sXXXav : put a maximum looping factor ??? */
|
||||
for (;;)
|
||||
{
|
||||
if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
|
||||
{
|
||||
*out = s;
|
||||
*(s + len - outlen - done) = '\0';
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Incomplete or invalid multibyte sequence */
|
||||
if (errno == EINVAL || errno == EILSEQ)
|
||||
{
|
||||
invalid++;
|
||||
**out = *in;
|
||||
in++;
|
||||
inlen--;
|
||||
(*out)++;
|
||||
outlen--;
|
||||
}
|
||||
else if (errno == E2BIG) /* Output buffer full */
|
||||
{
|
||||
char *new;
|
||||
|
||||
tooshort++;
|
||||
done = len;
|
||||
outlen = done + inlen * 2;
|
||||
new = xmalloc (outlen + 1);
|
||||
memcpy (new, s, done);
|
||||
xfree (s);
|
||||
s = new;
|
||||
len = outlen;
|
||||
*out = s + done;
|
||||
}
|
||||
else /* Weird, we got an unspecified error */
|
||||
{
|
||||
logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL
|
||||
on error. */
|
||||
char *idn_encode (char *host)
|
||||
{
|
||||
char *new;
|
||||
int ret;
|
||||
|
||||
/* toASCII UTF-8 NULL terminated string */
|
||||
ret = idna_to_ascii_8z (host, &new, 0);
|
||||
if (ret != IDNA_SUCCESS)
|
||||
{
|
||||
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
|
||||
quote (idna_strerror (ret)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return new;
|
||||
}
|
||||
|
||||
|
10
src/iri.h
10
src/iri.h
@ -35,12 +35,16 @@ as that of the covered work. */
|
||||
char *parse_charset (char *str);
|
||||
char *find_locale (void);
|
||||
bool check_encoding_name (char *encoding);
|
||||
const char *locale_to_utf8 (const char *str);
|
||||
char *idn_encode (char *host);
|
||||
|
||||
#else /* ENABLE_IRI */
|
||||
|
||||
#define parse_charset(str) NULL
|
||||
#define find_locale() NULL
|
||||
#define check_encoding_name(str) false
|
||||
#define parse_charset(str) NULL
|
||||
#define find_locale() NULL
|
||||
#define check_encoding_name(str) false
|
||||
#define locale_to_utf8(str) (str)
|
||||
#define idn_encode(str) NULL
|
||||
|
||||
#endif /* ENABLE_IRI */
|
||||
#endif /* IRI_H */
|
||||
|
20
src/url.c
20
src/url.c
@ -42,6 +42,7 @@ as that of the covered work. */
|
||||
#include "utils.h"
|
||||
#include "url.h"
|
||||
#include "host.h" /* for is_valid_ipv6_address */
|
||||
#include "iri.h"
|
||||
|
||||
#ifdef TESTING
|
||||
#include "test.h"
|
||||
@ -670,6 +671,12 @@ url_parse (const char *url, int *error)
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (opt.enable_iri)
|
||||
{
|
||||
url_unescape ((char *) url);
|
||||
url = locale_to_utf8(url);
|
||||
}
|
||||
|
||||
url_encoded = reencode_escapes (url);
|
||||
p = url_encoded;
|
||||
|
||||
@ -844,6 +851,17 @@ url_parse (const char *url, int *error)
|
||||
host_modified = true;
|
||||
}
|
||||
|
||||
if (opt.enable_iri)
|
||||
{
|
||||
char *new = idn_encode (u->host);
|
||||
if (new)
|
||||
{
|
||||
xfree (u->host);
|
||||
u->host = new;
|
||||
host_modified = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (params_b)
|
||||
u->params = strdupdelim (params_b, params_e);
|
||||
if (query_b)
|
||||
@ -851,7 +869,7 @@ url_parse (const char *url, int *error)
|
||||
if (fragment_b)
|
||||
u->fragment = strdupdelim (fragment_b, fragment_e);
|
||||
|
||||
if (path_modified || u->fragment || host_modified || path_b == path_e)
|
||||
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
|
||||
{
|
||||
/* If we suspect that a transformation has rendered what
|
||||
url_string might return different from URL_ENCODED, rebuild
|
||||
|
Loading…
Reference in New Issue
Block a user