Basic support of IRIs.

This commit is contained in:
Saint Xavier 2008-06-26 17:59:07 +02:00
parent e6376b4743
commit 5bb11da009
4 changed files with 166 additions and 7 deletions

View File

@ -1,3 +1,12 @@
2008-06-26 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New functions locale_to_utf8() and
idn_encode() adding basic capabilities of IRI/IDN.
* url.c : Convert URLs from the locale encoding to UTF-8, allowing
basic support of IRI/IDN.
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function check_encoding_name() as

134
src/iri.c
View File

@ -34,13 +34,22 @@ as that of the covered work. */
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <stringprep.h>
#include <idna.h>
#include <errno.h>
#include "utils.h"
#include "iri.h"
static iconv_t locale2utf8;
static bool open_locale_to_utf8 (void);
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
/* Given a string containing "charset=XXX", return the encoding if found,
or NULL otherwise */
char *
@ -77,7 +86,6 @@ parse_charset (char *str)
return charset;
}
/* Find the locale used, or fall back on a default value */
char *
find_locale (void)
@ -86,7 +94,6 @@ find_locale (void)
return (char *) stringprep_locale_charset ();
}
/* Basic check of an encoding name. */
bool
check_encoding_name (char *encoding)
@ -107,4 +114,125 @@ check_encoding_name (char *encoding)
return true;
}
/* Make sure the file-level locale -> UTF-8 iconv descriptor (locale2utf8)
   is open.  Return true when it is usable, false when no locale could be
   determined or iconv cannot convert from that locale to UTF-8.  */
static bool
open_locale_to_utf8 (void)
{
  iconv_t cd;

  /* An earlier call already opened the descriptor.  */
  if (locale2utf8)
    return true;

  /* The locale is normally set before we get here; if it is not, look it
     up now and give up when even that yields nothing.  */
  if (!opt.locale)
    {
      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
      opt.locale = find_locale ();
      if (!opt.locale)
        return false;
    }

  cd = iconv_open ("UTF-8", opt.locale);
  if (cd == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
                 quote (opt.locale), quote("UTF-8"));
      /* Keep the "not opened" marker so a later call tries again.  */
      locale2utf8 = NULL;
      return false;
    }

  locale2utf8 = cd;
  return true;
}
/* Convert STR from the locale's encoding to UTF-8.

   Return STR itself (not a copy) when the locale is already UTF-8, when
   the locale -> UTF-8 converter cannot be opened, or when the conversion
   fails.  Otherwise return the newly allocated string produced by
   do_conversion ().  Callers can tell the two cases apart by comparing
   the result with STR.  */
const char *
locale_to_utf8 (const char *str)
{
  char *converted;

  /* opt.locale may still be unset at this point; open_locale_to_utf8 ()
     below knows how to recover from that, so only compare the name when
     there is one instead of handing NULL to strcasecmp.  */
  if (opt.locale && !strcasecmp (opt.locale, "utf-8"))
    return str;

  if (!open_locale_to_utf8 ())
    return str;

  /* do_conversion takes a non-const input pointer because iconv does.  */
  if (do_conversion (locale2utf8, (char *) str, strlen (str), &converted))
    return converted;

  return str;
}
/* Convert the INLEN bytes at IN using the iconv descriptor CD.

   On success, store a newly allocated, NUL-terminated result in *OUT and
   return true; the buffer comes from xmalloc and is handed to the caller.
   Input bytes that iconv rejects as an invalid or incomplete multibyte
   sequence are copied to the output unchanged and conversion resumes after
   them.  On any other iconv error, log it, release the buffer, set *OUT to
   NULL and return false.  */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
  /* First guess at the output size; the buffer grows whenever iconv
     reports E2BIG.  CAP never counts the byte reserved for the NUL.  */
  size_t cap = inlen * 2;
  size_t avail = cap;           /* free bytes left after WP */
  char *buf = xmalloc (cap + 1);
  char *wp = buf;               /* next output position inside BUF */

  for (;;)
    {
      int err;

      if (iconv (cd, &in, &inlen, &wp, &avail) != (size_t)(-1))
        {
          *wp = '\0';
          *out = buf;
          return true;
        }

      /* Remember errno before any other call can overwrite it.  */
      err = errno;

      if (err != E2BIG && err != EINVAL && err != EILSEQ)
        {
          /* Weird, we got an unspecified error */
          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
          break;
        }

      /* Enlarge the buffer when iconv ran out of room, and also when a
         rejected byte has to be copied but no free byte is left.  iconv
         may stop with a few unused bytes, so the amount really written is
         taken from WP rather than assumed to be the whole buffer.  The
         constant guarantees the free space grows on every pass even when
         little input remains.  */
      if (err == E2BIG || avail == 0)
        {
          size_t used = (size_t) (wp - buf);
          size_t newcap = cap + inlen * 2 + 8;
          char *bigger = xmalloc (newcap + 1);

          memcpy (bigger, buf, used);
          xfree (buf);
          buf = bigger;
          wp = buf + used;
          cap = newcap;
          avail = cap - used;
        }

      /* Incomplete or invalid multibyte sequence: pass one byte through
         untouched and retry from the next one.  */
      if (err != E2BIG && inlen > 0)
        {
          *wp++ = *in++;
          inlen--;
          avail--;
        }
    }

  xfree (buf);
  *out = NULL;
  return false;
}
/* Run the NUL-terminated UTF-8 host name HOST through idna_to_ascii_8z ().
   Return the string that call produced on success; on failure log the
   libidn error and return NULL.  */
char *
idn_encode (char *host)
{
  char *ascii;
  int rc = idna_to_ascii_8z (host, &ascii, 0);

  if (rc == IDNA_SUCCESS)
    return ascii;

  logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", rc,
             quote (idna_strerror (rc)));
  return NULL;
}

View File

@ -35,12 +35,16 @@ as that of the covered work. */
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (char *host);
#else /* ENABLE_IRI */
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
#define idn_encode(str) NULL
#endif /* ENABLE_IRI */
#endif /* IRI_H */

View File

@ -42,6 +42,7 @@ as that of the covered work. */
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
#include "iri.h"
#ifdef TESTING
#include "test.h"
@ -670,6 +671,12 @@ url_parse (const char *url, int *error)
goto error;
}
if (opt.enable_iri)
{
url_unescape ((char *) url);
url = locale_to_utf8(url);
}
url_encoded = reencode_escapes (url);
p = url_encoded;
@ -844,6 +851,17 @@ url_parse (const char *url, int *error)
host_modified = true;
}
if (opt.enable_iri)
{
char *new = idn_encode (u->host);
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
if (params_b)
u->params = strdupdelim (params_b, params_e);
if (query_b)
@ -851,7 +869,7 @@ url_parse (const char *url, int *error)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e)
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild