mirror of
https://github.com/mirror/wget.git
synced 2025-01-25 20:00:42 +08:00
274 lines
6.9 KiB
C
274 lines
6.9 KiB
C
/* Collect URLs from CSS source.
   Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables. You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL". If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so. If you do not wish to do
so, delete this exception statement from your version. */
|
|
|
|
/*
  Note that this is not an actual CSS parser, but just a lexical
  scanner with a tiny bit more smarts bolted on top. A full parser
  is somewhat overkill for this job. The only things we're interested
  in are @import rules and url() tokens, so it's easy enough to
  grab those without truly understanding the input. The only downside
  to this is that we might be coerced into downloading files that
  a browser would ignore. That might merit some more investigation.
*/
|
|
|
|
#include <config.h>
|
|
|
|
#include <stdio.h>
|
|
#ifdef HAVE_STRING_H
|
|
# include <string.h>
|
|
#else
|
|
# include <strings.h>
|
|
#endif
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <errno.h>
|
|
#include <assert.h>
|
|
|
|
#include "wget.h"
|
|
#include "utils.h"
|
|
#include "convert.h"
|
|
#include "html-url.h"
|
|
#include "css-tokens.h"
|
|
|
|
/* from lex.yy.c */
|
|
extern char *yytext;
|
|
extern int yyleng;
|
|
typedef struct yy_buffer_state *YY_BUFFER_STATE;
|
|
extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
|
|
extern int yylex (void);
|
|
|
|
#if 1
/* Printable names for the lexer's token codes, intended for debug
   output.  The table is presumably ordered to match the numeric token
   values declared in css-tokens.h -- verify before indexing it with a
   token returned by yylex().  */
const char *token_names[] = {
  "CSSEOF", "S", "CDO", "CDC", "INCLUDES", "DASHMATCH",
  "LBRACE", "PLUS", "GREATER", "COMMA",
  "STRING", "INVALID", "IDENT", "HASH",
  "IMPORT_SYM", "PAGE_SYM", "MEDIA_SYM", "CHARSET_SYM", "IMPORTANT_SYM",
  "EMS", "EXS", "LENGTH", "ANGLE", "TIME", "FREQ",
  "DIMENSION", "PERCENTAGE", "NUMBER",
  "URI", "FUNCTION"
};
#endif
|
|
|
|
/*
  Given a detected URI token, get only the URI specified within.
  Also adjust the starting position and length of the string.

  A URI can be specified with or without quotes, and the quotes
  can be single or double quotes.  In addition there can be
  whitespace after the opening parenthesis and before the closing
  parenthesis.

  AT is the buffer holding the token, *POS the offset of the token
  inside AT and *LENGTH the token's length in bytes.

  On success *POS and *LENGTH are updated to describe the bare URI
  (without "url(", surrounding whitespace, quotes or the closing
  parenthesis) and a newly allocated, NUL-terminated copy of it is
  returned; the callers in this file release it with xfree().  The
  copy may be the empty string, e.g. for "url()" or "url( )".

  Returns NULL, leaving *POS and *LENGTH untouched, when the token is
  shorter than "url()" or does not start with "url(" (compared
  case-insensitively).
*/
char *
get_uri_string (const char *at, int *pos, int *length)
{
  char *uri;

  /* The shortest well-formed token is "url()"; anything shorter would
     make the length go negative below.  */
  if (*length < 5)
    return NULL;

  if (0 != strncasecmp (at + *pos, "url(", 4))
    return NULL;

  *pos += 4;
  *length -= 5; /* url() */

  /* Skip leading space.  The length bound stops a whitespace-only
     body such as "url(  )" from walking past the end of the token.
     The cast keeps isspace() defined for bytes >= 0x80 on platforms
     where plain char is signed.  */
  while (*length > 0 && isspace ((unsigned char) at[*pos]))
    {
      (*pos)++;
      (*length)--;
    }

  /* Skip trailing space, again never shrinking below zero.  */
  while (*length > 0 && isspace ((unsigned char) at[*pos + *length - 1]))
    {
      (*length)--;
    }

  /* Trim off quotes; both the opening and the closing quote must fit
     in what is left.  */
  if (*length >= 2 && (at[*pos] == '\'' || at[*pos] == '"'))
    {
      (*pos)++;
      *length -= 2;
    }

  uri = xmalloc (*length + 1);
  if (uri)
    {
      memcpy (uri, at + *pos, *length);
      uri[*length] = '\0';
    }

  return uri;
}
|
|
|
|
void
|
|
get_urls_css (struct map_context *ctx, int offset, int buf_length)
|
|
{
|
|
int token;
|
|
/*char tmp[2048];*/
|
|
int buffer_pos = 0;
|
|
int pos, length;
|
|
char *uri;
|
|
|
|
/*
|
|
strncpy(tmp,ctx->text + offset, buf_length);
|
|
tmp[buf_length] = '\0';
|
|
DEBUGP (("get_urls_css: \"%s\"\n", tmp));
|
|
*/
|
|
|
|
/* tell flex to scan from this buffer */
|
|
yy_scan_bytes (ctx->text + offset, buf_length);
|
|
|
|
while((token = yylex()) != CSSEOF)
|
|
{
|
|
/*DEBUGP (("%s ", token_names[token]));*/
|
|
/* @import "foo.css"
|
|
or @import url(foo.css)
|
|
*/
|
|
if(token == IMPORT_SYM)
|
|
{
|
|
do {
|
|
buffer_pos += yyleng;
|
|
} while((token = yylex()) == S);
|
|
|
|
/*DEBUGP (("%s ", token_names[token]));*/
|
|
|
|
if (token == STRING || token == URI)
|
|
{
|
|
/*DEBUGP (("Got URI "));*/
|
|
pos = buffer_pos + offset;
|
|
length = yyleng;
|
|
|
|
if (token == URI)
|
|
{
|
|
uri = get_uri_string (ctx->text, &pos, &length);
|
|
}
|
|
else
|
|
{
|
|
/* cut out quote characters */
|
|
pos++;
|
|
length -= 2;
|
|
uri = xmalloc (length + 1);
|
|
strncpy (uri, yytext + 1, length);
|
|
uri[length] = '\0';
|
|
}
|
|
|
|
if (uri)
|
|
{
|
|
struct urlpos *up = append_url (uri, pos, length, ctx);
|
|
DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
|
|
|
|
if (up)
|
|
{
|
|
up->link_inline_p = 1;
|
|
up->link_css_p = 1;
|
|
up->link_expect_css = 1;
|
|
}
|
|
|
|
xfree(uri);
|
|
}
|
|
}
|
|
}
|
|
/* background-image: url(foo.png)
|
|
note that we don't care what
|
|
property this is actually on.
|
|
*/
|
|
else if(token == URI)
|
|
{
|
|
pos = buffer_pos + offset;
|
|
length = yyleng;
|
|
uri = get_uri_string (ctx->text, &pos, &length);
|
|
|
|
if (uri)
|
|
{
|
|
struct urlpos *up = append_url (uri, pos, length, ctx);
|
|
DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
|
|
if (up)
|
|
{
|
|
up->link_inline_p = 1;
|
|
up->link_css_p = 1;
|
|
}
|
|
|
|
xfree (uri);
|
|
}
|
|
}
|
|
buffer_pos += yyleng;
|
|
}
|
|
DEBUGP (("\n"));
|
|
}
|
|
|
|
struct urlpos *
|
|
get_urls_css_file (const char *file, const char *url)
|
|
{
|
|
struct file_memory *fm;
|
|
struct map_context ctx;
|
|
|
|
/* Load the file. */
|
|
fm = read_file (file);
|
|
if (!fm)
|
|
{
|
|
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
|
return NULL;
|
|
}
|
|
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
|
|
|
ctx.text = fm->content;
|
|
ctx.head = ctx.tail = NULL;
|
|
ctx.base = NULL;
|
|
ctx.parent_base = url ? url : opt.base_href;
|
|
ctx.document_file = file;
|
|
ctx.nofollow = 0;
|
|
|
|
get_urls_css (&ctx, 0, fm->length);
|
|
read_file_free (fm);
|
|
return ctx.head;
|
|
}
|