mirror of
https://github.com/mirror/wget.git
synced 2025-01-21 09:41:06 +08:00
[svn] Don't needlessly decode % escapes in URLs.
This commit is contained in:
parent
4c4c440401
commit
5aba2a5850
@ -1,3 +1,8 @@
|
|||||||
|
2005-05-07 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
|
* url.c (decide_copy_method): Never cause reencode_escapes to
|
||||||
|
decode % escapes; it is too intrusive and breaks some servers.
|
||||||
|
|
||||||
2005-05-07 Hrvoje Niksic <hniksic@xemacs.org>
|
2005-05-07 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
* http.c (gethttp): When tunnelling SSL traffic over proxy with
|
* http.c (gethttp): When tunnelling SSL traffic over proxy with
|
||||||
|
112
src/url.c
112
src/url.c
@ -255,34 +255,29 @@ url_escape_allow_passthrough (const char *s)
|
|||||||
return url_escape_1 (s, urlchr_unsafe, 1);
|
return url_escape_1 (s, urlchr_unsafe, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
|
enum copy_method { cm_encode, cm_passthrough };
|
||||||
|
|
||||||
|
/* Decide whether to encode or pass through the char at P. This used
|
||||||
|
to be a macro, but it got a little too convoluted. */
|
||||||
|
|
||||||
/* Decide whether to encode, decode, or pass through the char at P.
|
|
||||||
This used to be a macro, but it got a little too convoluted. */
|
|
||||||
static inline enum copy_method
|
static inline enum copy_method
|
||||||
decide_copy_method (const char *p)
|
decide_copy_method (const char *p)
|
||||||
{
|
{
|
||||||
if (*p == '%')
|
if (*p == '%')
|
||||||
{
|
{
|
||||||
if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
|
if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
|
||||||
{
|
/* Prior to 1.10 this decoded %HH escapes corresponding to
|
||||||
/* %xx sequence: decode it, unless it would decode to an
|
"safe" chars, but that proved too obtrusive -- it's better
|
||||||
unsafe or a reserved char; in that case, leave it as
|
to always preserve the escapes found in the URL. */
|
||||||
is. */
|
return cm_passthrough;
|
||||||
char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
|
|
||||||
if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
|
|
||||||
return CM_PASSTHROUGH;
|
|
||||||
else
|
|
||||||
return CM_DECODE;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
/* Garbled %.. sequence: encode `%'. */
|
/* Garbled %.. sequence: encode `%'. */
|
||||||
return CM_ENCODE;
|
return cm_encode;
|
||||||
}
|
}
|
||||||
else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
|
else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
|
||||||
return CM_ENCODE;
|
return cm_encode;
|
||||||
else
|
else
|
||||||
return CM_PASSTHROUGH;
|
return cm_passthrough;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Translate a %-escaped (but possibly non-conformant) input string S
|
/* Translate a %-escaped (but possibly non-conformant) input string S
|
||||||
@ -292,16 +287,15 @@ decide_copy_method (const char *p)
|
|||||||
|
|
||||||
After a URL has been run through this function, the protocols that
|
After a URL has been run through this function, the protocols that
|
||||||
use `%' as the quote character can use the resulting string as-is,
|
use `%' as the quote character can use the resulting string as-is,
|
||||||
while those that don't call url_unescape() to get to the intended
|
while those that don't can use url_unescape to get to the intended
|
||||||
data. This function is also stable: after an input string is
|
data. This function is stable: once the input is transformed,
|
||||||
transformed the first time, all further transformations of the
|
further transformations of the result yield the same output.
|
||||||
result yield the same result string.
|
|
||||||
|
|
||||||
Let's discuss why this function is needed.
|
Let's discuss why this function is needed.
|
||||||
|
|
||||||
Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
|
Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since
|
||||||
space character would mess up the HTTP request, it needs to be
|
a raw space character would mess up the HTTP request, it needs to
|
||||||
quoted, like this:
|
be quoted, like this:
|
||||||
|
|
||||||
GET /abc%20def HTTP/1.0
|
GET /abc%20def HTTP/1.0
|
||||||
|
|
||||||
@ -312,14 +306,15 @@ decide_copy_method (const char *p)
|
|||||||
part of URL syntax, "%20" is the correct way to denote a literal
|
part of URL syntax, "%20" is the correct way to denote a literal
|
||||||
space on the Wget command line. This leaves us in the conclusion
|
space on the Wget command line. This leaves us in the conclusion
|
||||||
that in that case Wget should not call url_escape, but leave the
|
that in that case Wget should not call url_escape, but leave the
|
||||||
`%20' as is.
|
`%20' as is. This is clearly contradictory, but it only gets
|
||||||
|
worse.
|
||||||
|
|
||||||
And what if the requested URI is `abc%20 def'? If we call
|
What if the requested URI is `abc%20 def'? If we call url_escape,
|
||||||
url_escape, we end up with `/abc%2520%20def', which is almost
|
we end up with `/abc%2520%20def', which is almost certainly not
|
||||||
certainly not intended. If we don't call url_escape, we are left
|
intended. If we don't call url_escape, we are left with the
|
||||||
with the embedded space and cannot complete the request. What the
|
embedded space and cannot complete the request. What the user
|
||||||
user meant was for Wget to request `/abc%20%20def', and this is
|
meant was for Wget to request `/abc%20%20def', and this is where
|
||||||
where reencode_escapes kicks in.
|
reencode_escapes kicks in.
|
||||||
|
|
||||||
Wget used to solve this by first decoding %-quotes, and then
|
Wget used to solve this by first decoding %-quotes, and then
|
||||||
encoding all the "unsafe" characters found in the resulting string.
|
encoding all the "unsafe" characters found in the resulting string.
|
||||||
@ -333,25 +328,25 @@ decide_copy_method (const char *p)
|
|||||||
literal plus. reencode_escapes correctly translates the above to
|
literal plus. reencode_escapes correctly translates the above to
|
||||||
"a%2B+b", i.e. returns the original string.
|
"a%2B+b", i.e. returns the original string.
|
||||||
|
|
||||||
This function uses an algorithm proposed by Anon Sricharoenchai:
|
This function uses a modified version of the algorithm originally
|
||||||
|
proposed by Anon Sricharoenchai:
|
||||||
|
|
||||||
1. Encode all URL_UNSAFE and the "%" that are not followed by 2
|
* Encode all "unsafe" characters, except those that are also
|
||||||
hexdigits.
|
"reserved", to %XX. See urlchr_table for which characters are
|
||||||
|
unsafe and reserved.
|
||||||
|
|
||||||
2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
|
* Encode the "%" characters not followed by two hex digits to
|
||||||
"+".
|
"%25".
|
||||||
|
|
||||||
...except that this code conflates the two steps, and decides
|
* Pass through all other characters and %XX escapes as-is. (Up to
|
||||||
whether to encode, decode, or pass through each character in turn.
|
Wget 1.10 this decoded %XX escapes corresponding to "safe"
|
||||||
The function still uses two passes, but their logic is the same --
|
characters, but that was obtrusive and broke some servers.)
|
||||||
the first pass exists merely for the sake of allocation. Another
|
|
||||||
small difference is that we include `+' to URL_RESERVED.
|
|
||||||
|
|
||||||
Anon's test case:
|
Anon's test case:
|
||||||
|
|
||||||
"http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
|
"http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
|
||||||
->
|
->
|
||||||
"http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
|
"http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
|
||||||
|
|
||||||
Simpler test cases:
|
Simpler test cases:
|
||||||
|
|
||||||
@ -372,7 +367,6 @@ reencode_escapes (const char *s)
|
|||||||
int oldlen, newlen;
|
int oldlen, newlen;
|
||||||
|
|
||||||
int encode_count = 0;
|
int encode_count = 0;
|
||||||
int decode_count = 0;
|
|
||||||
|
|
||||||
/* First, pass through the string to see if there's anything to do,
|
/* First, pass through the string to see if there's anything to do,
|
||||||
and to calculate the new length. */
|
and to calculate the new length. */
|
||||||
@ -380,25 +374,21 @@ reencode_escapes (const char *s)
|
|||||||
{
|
{
|
||||||
switch (decide_copy_method (p1))
|
switch (decide_copy_method (p1))
|
||||||
{
|
{
|
||||||
case CM_ENCODE:
|
case cm_encode:
|
||||||
++encode_count;
|
++encode_count;
|
||||||
break;
|
break;
|
||||||
case CM_DECODE:
|
case cm_passthrough:
|
||||||
++decode_count;
|
|
||||||
break;
|
|
||||||
case CM_PASSTHROUGH:
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!encode_count && !decode_count)
|
if (!encode_count)
|
||||||
/* The string is good as it is. */
|
/* The string is good as it is. */
|
||||||
return (char *)s; /* C const model sucks. */
|
return (char *) s; /* C const model sucks. */
|
||||||
|
|
||||||
oldlen = p1 - s;
|
oldlen = p1 - s;
|
||||||
/* Each encoding adds two characters (hex digits), while each
|
/* Each encoding adds two characters (hex digits). */
|
||||||
decoding removes two characters. */
|
newlen = oldlen + 2 * encode_count;
|
||||||
newlen = oldlen + 2 * (encode_count - decode_count);
|
|
||||||
newstr = xmalloc (newlen + 1);
|
newstr = xmalloc (newlen + 1);
|
||||||
|
|
||||||
p1 = s;
|
p1 = s;
|
||||||
@ -408,7 +398,7 @@ reencode_escapes (const char *s)
|
|||||||
{
|
{
|
||||||
switch (decide_copy_method (p1))
|
switch (decide_copy_method (p1))
|
||||||
{
|
{
|
||||||
case CM_ENCODE:
|
case cm_encode:
|
||||||
{
|
{
|
||||||
unsigned char c = *p1++;
|
unsigned char c = *p1++;
|
||||||
*p2++ = '%';
|
*p2++ = '%';
|
||||||
@ -416,11 +406,7 @@ reencode_escapes (const char *s)
|
|||||||
*p2++ = XNUM_TO_DIGIT (c & 0xf);
|
*p2++ = XNUM_TO_DIGIT (c & 0xf);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case CM_DECODE:
|
case cm_passthrough:
|
||||||
*p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
|
|
||||||
p1 += 3; /* skip %xx */
|
|
||||||
break;
|
|
||||||
case CM_PASSTHROUGH:
|
|
||||||
*p2++ = *p1++;
|
*p2++ = *p1++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -869,8 +855,9 @@ url_parse (const char *url, int *error)
|
|||||||
host_modified = lowercase_str (u->host);
|
host_modified = lowercase_str (u->host);
|
||||||
|
|
||||||
/* Decode %HH sequences in host name. This is important not so much
|
/* Decode %HH sequences in host name. This is important not so much
|
||||||
to support %HH sequences, but to support binary characters (which
|
to support %HH sequences in host names (which other browser
|
||||||
will have been converted to %HH by reencode_escapes). */
|
don't), but to support binary characters (which will have been
|
||||||
|
converted to %HH by reencode_escapes). */
|
||||||
if (strchr (u->host, '%'))
|
if (strchr (u->host, '%'))
|
||||||
{
|
{
|
||||||
url_unescape (u->host);
|
url_unescape (u->host);
|
||||||
@ -1539,11 +1526,6 @@ url_file_name (const struct url *u)
|
|||||||
"back up one element". Single leading and trailing slashes are
|
"back up one element". Single leading and trailing slashes are
|
||||||
preserved.
|
preserved.
|
||||||
|
|
||||||
This function does not handle URL escapes explicitly. If you're
|
|
||||||
passing paths from URLs, make sure to unquote "%2e" and "%2E" to
|
|
||||||
".", so that this function can find the dots. (Wget's URL parser
|
|
||||||
calls reencode_escapes, which see.)
|
|
||||||
|
|
||||||
For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
|
For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
|
||||||
test examples are provided below. If you change anything in this
|
test examples are provided below. If you change anything in this
|
||||||
function, run test_path_simplify to make sure you haven't broken a
|
function, run test_path_simplify to make sure you haven't broken a
|
||||||
|
Loading…
Reference in New Issue
Block a user