From 8b9cabe00460434123bd265a2ab60b0a4f9e048c Mon Sep 17 00:00:00 2001 From: hniksic Date: Sat, 9 Apr 2005 04:48:31 -0700 Subject: [PATCH] [svn] Don't mark ~ as unsafe, it confuses too many sites. --- src/ChangeLog | 11 +++++++++++ src/url.c | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 43d7494f..9be27d80 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,14 @@ +2005-04-09 Hrvoje Niksic + + * url.c: Use "static const" in preference to "const static". + Sun's cc warns that "storage class after type is obsolescent". + + * url.c (urlchr_table): Don't mark ~ as unsafe, too many broken + web sites are confused when ~ is changed to %7E. Their servers + redirect /%7Efoo/ to /~foo/, which Wget again accesses using %7E, + causing further redirections, therefore looping infinitely. See + Debian bug #301624 for an example. + 2005-04-09 Hrvoje Niksic * alloca.c: Include wget.h to be able to use xmalloc. In addition diff --git a/src/url.c b/src/url.c index 82b4bcea..2606c4de 100644 --- a/src/url.c +++ b/src/url.c @@ -87,13 +87,14 @@ static int path_simplify PARAMS ((char *)); changing the meaning of the URL. For example, you can't decode "/foo/%2f/bar" into "/foo///bar" because the number and contents of path components is different. Non-reserved characters can be - changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget - uses the rfc1738 set of reserved characters, plus "$" and ",", as - recommended by rfc2396. + changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The + unsafe characters are loosely based on rfc1738, plus "$" and ",", + as recommended by rfc2396, and minus "~", which is very frequently + used (and sometimes unrecognized as %7E by broken servers). - An unsafe characters is the one that should be encoded when URLs - are placed in foreign environments. E.g. space and newline are - unsafe in HTTP contexts because HTTP uses them as separator and + An unsafe character is the one that should be encoded when URLs are + placed in foreign environments. E.g. space and newline are unsafe + in HTTP contexts because HTTP uses them as separator and line terminator, so they must be encoded to %20 and %0A respectively. "*" is unsafe in shell context, etc. @@ -117,7 +118,7 @@ enum { #define U urlchr_unsafe #define RU R|U -const static unsigned char urlchr_table[256] = +static const unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ @@ -134,7 +135,7 @@ const static unsigned char urlchr_table[256] = U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ - 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */ + 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, @@ -1269,7 +1270,7 @@ enum { translate file name back to URL, this would become important crucial. Right now, it's better to be minimal in escaping. */ -const static unsigned char filechr_table[256] = +static const unsigned char filechr_table[256] = { UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */ C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */