[svn] Squash embedded newlines in links.

This commit is contained in:
hniksic 2003-11-26 08:37:04 -08:00
parent 1b2dce0493
commit 3f84a5e00e
3 changed files with 21 additions and 12 deletions

View File

@ -1,3 +1,8 @@
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
* html-parse.c (convert_and_copy): Remove embedded newlines when
AP_TRIM_BLANKS is specified.
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org> 2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
* ftp.c: Set con->csock to -1 where rbuf_uninitialize was * ftp.c: Set con->csock to -1 where rbuf_uninitialize was

View File

@ -360,17 +360,16 @@ enum {
the ASCII range when copying the string. the ASCII range when copying the string.
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
of text. */ of text, as well as embedded newlines. */
static void static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{ {
int old_tail = pool->tail; int old_tail = pool->tail;
int size;
/* First, skip blanks if required. We must do this before entities /* Skip blanks if required. We must do this before entities are
are processed, so that blanks can still be inserted as, for processed, so that blanks can still be inserted as, for instance,
instance, `&#32;'. */ `&#32;'. */
if (flags & AP_TRIM_BLANKS) if (flags & AP_TRIM_BLANKS)
{ {
while (beg < end && ISSPACE (*beg)) while (beg < end && ISSPACE (*beg))
@ -378,7 +377,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
while (end > beg && ISSPACE (end[-1])) while (end > beg && ISSPACE (end[-1]))
--end; --end;
} }
size = end - beg;
if (flags & AP_DECODE_ENTITIES) if (flags & AP_DECODE_ENTITIES)
{ {
@ -391,15 +389,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
never lengthen it. */ never lengthen it. */
const char *from = beg; const char *from = beg;
char *to; char *to;
int squash_newlines = flags & AP_TRIM_BLANKS;
POOL_GROW (pool, end - beg); POOL_GROW (pool, end - beg);
to = pool->contents + pool->tail; to = pool->contents + pool->tail;
while (from < end) while (from < end)
{ {
if (*from != '&') if (*from == '&')
*to++ = *from++;
else
{ {
int entity = decode_entity (&from, end); int entity = decode_entity (&from, end);
if (entity != -1) if (entity != -1)
@ -407,6 +404,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
else else
*to++ = *from++; *to++ = *from++;
} }
else if ((*from == '\n' || *from == '\r') && squash_newlines)
++from;
else
*to++ = *from++;
} }
/* Verify that we haven't exceeded the original size. (It /* Verify that we haven't exceeded the original size. (It
shouldn't happen, hence the assert.) */ shouldn't happen, hence the assert.) */

View File

@ -612,9 +612,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
init_interesting (); init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
generate <a href=" foo"> instead of <a href="foo"> (Netscape generate <a href=" foo"> instead of <a href="foo"> (browsers
ignores spaces as well.) If you really mean space, use &#32; or ignore spaces as well.) If you really mean space, use &#32; or
%20. */ %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
e.g. in <img src="foo.[newline]html">. Such newlines are also
ignored by IE and Mozilla and are presumably introduced by
writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES; flags = MHT_TRIM_VALUES;
if (opt.strict_comments) if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS; flags |= MHT_STRICT_COMMENTS;