Cope better with unclosed HTML tags.

This commit is contained in:
Giuseppe Scrivano 2010-05-30 14:01:10 +02:00
parent 05fccaeed2
commit 1b2092fd06
3 changed files with 21 additions and 9 deletions

2
NEWS
View File

@ -19,6 +19,8 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
** Set new cookies after an authorization failure.
** Exit with failure if -k is specified and -O is not a regular file.
** Cope better with unclosed HTML tags.
* Changes in Wget 1.12

View File

@ -1,3 +1,9 @@
2010-05-30 Giuseppe Scrivano <gscrivano@gnu.org>
* html-parse.c (NAME_CHAR_P): Consider '<' an invalid character.
(advance_declaration): Close the tag if '<' is found.
(map_html_tags): Likewise.
2010-05-27 Giuseppe Scrivano <gscrivano@gnu.org>
* main.c (main): Exit with failure when -k is specified and -O is not

View File

@ -528,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
'=', '>', '/'.
'=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
&& (x) != '=' && (x) != '>' && (x) != '/')
&& (x) != '=' && (x) != '<' && (x) != '>' \
&& (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
@ -619,6 +620,7 @@ advance_declaration (const char *beg, const char *end)
case '\n':
ch = *p++;
break;
case '<':
case '>':
state = AC_S_DONE;
break;
@ -926,7 +928,7 @@ map_html_tags (const char *text, int size,
}
}
if (end_tag && *p != '>')
if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
@ -958,12 +960,12 @@ map_html_tags (const char *text, int size,
/* ^ */
ADVANCE (p);
SKIP_WS (p);
if (*p != '>')
/* After the closing slash only '>' (end of this tag) or '<' (start of the
   next tag, i.e. this one was left unclosed) is acceptable; any other
   character means this is not a well-formed tag, so back out.  Both
   inequalities must hold, hence `&&'.  */
if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
if (*p == '>')
if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
@ -978,7 +980,8 @@ map_html_tags (const char *text, int size,
/* Establish bounds of attribute value. */
SKIP_WS (p);
if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
@ -1015,7 +1018,7 @@ map_html_tags (const char *text, int size,
newline_seen = true;
continue;
}
else if (newline_seen && *p == '>')
else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
@ -1040,7 +1043,7 @@ map_html_tags (const char *text, int size,
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
while (!c_isspace (*p) && *p != '>')
while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */
@ -1138,7 +1141,8 @@ map_html_tags (const char *text, int size,
}
mapfun (&taginfo, maparg);
ADVANCE (p);
if (*p != '<')
ADVANCE (p);
}
goto look_for_tag;