diff --git a/src/html.c b/src/html.c deleted file mode 100644 index 5ab05679..00000000 --- a/src/html.c +++ /dev/null @@ -1,684 +0,0 @@ -/* A simple HTML parser. - Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. - -This file is part of Wget. - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ - -#include - -#include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif -#include -#include -#include -#include - -#include "wget.h" -#include "url.h" -#include "utils.h" -#include "ftp.h" -#include "html.h" - -#ifndef errno -extern int errno; -#endif - -static state_t global_state; - -struct tag_attr { - char *tag; - char *attr; -}; - - -/* Match a string against a null-terminated list of identifiers. */ -static int -idmatch (struct tag_attr *tags, const char *tag, const char *attr) -{ - int i, j; - - if (tag == NULL || attr == NULL) - return FALSE; - - for (i = 0; tags[i].tag; i++) - /* Loop through all the tags wget ever cares about. */ - if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr)) - /* The tag and attribute matched one of the ones wget cares about. */ - { - if (opt.ignore_tags) - /* --ignore-tags was specified. Do not match these specific tags. - --ignore-tags takes precedence over --follow-tags, so we process - --ignore first and fall through if there's no match. */ - for (j = 0; opt.ignore_tags[j] != NULL; j++) - /* Loop through all the tags this user doesn't care about. */ - if (strcasecmp(opt.ignore_tags[j], tag) == EQ) - return FALSE; - - if (opt.follow_tags) - /* --follow-tags was specified. Only match these specific tags, so - return FALSE if we don't match one of them. */ - { - for (j = 0; opt.follow_tags[j] != NULL; j++) - /* Loop through all the tags this user cares about. */ - if (strcasecmp(opt.follow_tags[j], tag) == EQ) - return TRUE; - - return FALSE; /* wasn't one of the explicitly desired tags */ - } - - /* If we get to here, --follow-tags isn't being used, and --ignore-tags, - if specified, didn't include this tag, so it's okay to follow. */ - return TRUE; - } - - return FALSE; /* not one of the tag/attribute pairs wget ever cares about */ -} - -/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags - describing URLs to follow. When a tag is encountered, extract its - components (as described by html_allow[] array), and return the - address and the length of the string. Return NULL if no URL is - found. */ -const char * -htmlfindurl (const char *buf, int bufsize, int *size, int init, - int dash_p_leaf_HTML) -{ - const char *p, *ph; - state_t *s = &global_state; - - /* NULL-terminated list of tags and modifiers someone would want to - follow -- feel free to edit to suit your needs: */ - static struct tag_attr html_allow[] = { - { "script", "src" }, - { "img", "src" }, - { "img", "href" }, - { "body", "background" }, - { "frame", "src" }, - { "iframe", "src" }, - { "fig", "src" }, - { "overlay", "src" }, - { "applet", "code" }, - { "script", "src" }, - { "embed", "src" }, - { "bgsound", "src" }, - { "img", "lowsrc" }, - { "input", "src" }, - { "layer", "src" }, - { "table", "background"}, - { "th", "background"}, - { "td", "background"}, - /* Tags below this line are treated specially. */ - { "a", "href" }, - { "area", "href" }, - { "base", "href" }, - { "link", "href" }, - { "link", "rel" }, - { "meta", "content" }, - { NULL, NULL } - }; - - if (init) - { - DEBUGP (("Resetting a parser state.\n")); - memset (s, 0, sizeof (*s)); - } - - while (1) - { - const char* link_href = NULL; - const char* link_rel = NULL; - int link_href_saved_size = 0; /* init. just to shut up warning */ - - if (!bufsize) - break; - /* Let's look for a tag, if we are not already in one. */ - if (!s->at_value) - { - /* Find '<'. */ - if (*buf != '<') - for (; bufsize && *buf != '<'; ++buf, --bufsize); - if (!bufsize) - break; - /* Skip spaces. */ - for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; - ++buf, --bufsize); - if (!bufsize) - break; - p = buf; - /* Find the tag end. */ - for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; - ++buf, --bufsize); - if (!bufsize) - break; - if (*buf == '=') - { - /* is illegal. Just skip it. */ - ++buf, --bufsize; - continue; - } - if (p == buf) - { - /* *buf == '>'. */ - ++buf, --bufsize; - continue; - } - s->tag = strdupdelim (p, buf); - if (*buf == '>') - { - xfree (s->tag); - s->tag = NULL; - ++buf, --bufsize; - continue; - } - } - else /* s->at_value */ - { - /* Reset AT_VALUE. */ - s->at_value = 0; - /* If in quotes, just skip out of them and continue living. */ - if (s->in_quote) - { - s->in_quote = 0; - for (; bufsize && *buf != s->quote_char; ++buf, --bufsize); - if (!bufsize) - break; - ++buf, --bufsize; - } - if (!bufsize) - break; - if (*buf == '>') - { - FREE_MAYBE (s->tag); - FREE_MAYBE (s->attr); - s->tag = s->attr = NULL; - continue; - } - } - /* Find the attributes. */ - do - { - FREE_MAYBE (s->attr); - s->attr = NULL; - if (!bufsize) - break; - /* Skip the spaces if we have them. We don't have them at - places like something. - ^ no spaces here */ - if (ISSPACE (*buf)) - for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>'; - ++buf, --bufsize); - if (!bufsize || *buf == '>') - break; - if (*buf == '=') - { - /* This is the case of , which is - illegal. Just skip it. */ - ++buf, --bufsize; - continue; - } - p = buf; - /* Find the attribute end. */ - for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '='; - ++buf, --bufsize); - if (!bufsize || *buf == '>') - break; - /* Construct the attribute. */ - s->attr = strdupdelim (p, buf); - /* Now we must skip the spaces to find '='. */ - if (*buf != '=') - { - for (; bufsize && ISSPACE (*buf) && *buf != '>'; - ++buf, --bufsize); - if (!bufsize || *buf == '>') - break; - } - /* If we still don't have '=', something is amiss. */ - if (*buf != '=') - continue; - /* Find the beginning of attribute value by skipping the - spaces. */ - ++buf, --bufsize; - for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize); - if (!bufsize || *buf == '>') - break; - ph = NULL; - /* The value of an attribute can, but does not have to be - quoted. */ - if (*buf == '\"' || *buf == '\'') - { - s->in_quote = 1; - s->quote_char = *buf; - p = buf + 1; - for (++buf, --bufsize; - bufsize && *buf != s->quote_char && *buf != '\n'; - ++buf, --bufsize) - if (!ph && *buf == '#' && *(buf - 1) != '&') - ph = buf; - if (!bufsize) - { - s->in_quote = 0; - break; - } - if (*buf == '\n') - { - /* #### Is the following logic good? - - Obviously no longer in quote. It might be well - to check whether '>' was encountered, but that - would be encouraging writers of invalid HTMLs, - and we don't want that, now do we? */ - s->in_quote = 0; - continue; - } - } - else - { - p = buf; - for (; bufsize && !ISSPACE (*buf) && *buf != '>'; - ++buf, --bufsize) - if (!ph && *buf == '#' && *(buf - 1) != '&') - ph = buf; - if (!bufsize) - break; - } - /* If '#' was found unprotected in a URI, it is probably an - HTML marker, or color spec. */ - *size = (ph ? ph : buf) - p; - /* The URI is liable to be returned if: - 1) *size != 0; - 2) its tag and attribute are found in html_allow. */ - if (*size && idmatch (html_allow, s->tag, s->attr)) - { - if (strcasecmp(s->tag, "a") == EQ || - strcasecmp(s->tag, "area") == EQ) - { - /* Only follow these if we're not at a -p leaf node, as they - always link to external documents. */ - if (!dash_p_leaf_HTML) - { - s->at_value = 1; - return p; - } - } - else if (!strcasecmp (s->tag, "base") && - !strcasecmp (s->attr, "href")) - { - FREE_MAYBE (s->base); - s->base = strdupdelim (p, buf); - } - else if (strcasecmp(s->tag, "link") == EQ) - { - if (strcasecmp(s->attr, "href") == EQ) - { - link_href = p; - link_href_saved_size = *size; /* for restoration below */ - } - else if (strcasecmp(s->attr, "rel") == EQ) - link_rel = p; - - if (link_href != NULL && link_rel != NULL) - /* Okay, we've now seen this tag's HREF and REL - attributes (they may be in either order), so it's now - possible to decide if we want to traverse it. */ - if (!dash_p_leaf_HTML || - strncasecmp(link_rel, "stylesheet", - sizeof("stylesheet") - 1) == EQ) - /* In the normal case, all tags are fair game. - - In the special case of when -p is active, however, and - we're at a leaf node (relative to the -l max. depth) in - the HTML document tree, the only tag we'll - follow is a , as it's necessary - for displaying this document properly. We won't follow - other tags, like , for - instance, as they refer to external documents. - - Note that the above strncasecmp() will incorrectly - consider something like ' tag, size is currently set to the size for - REL's value -- set it to what it was when we were - looking at HREF's value. */ - *size = link_href_saved_size; - - s->at_value = 1; - return link_href; - } - } - else if (!strcasecmp (s->tag, "meta") && - !strcasecmp (s->attr, "content")) - { - /* Some pages use a META tag to specify that the page - be refreshed by a new page after a given number of - seconds. We need to attempt to extract an URL for - the new page from the other garbage present. The - general format for this is: - - - So we just need to skip past the "0; URL=" - garbage to get to the URL. META tags are also - used for specifying random things like the page - author's name and what editor was used to create - it. So we need to be careful to ignore them and - not assume that an URL will be present at all. */ - for (; *size && ISDIGIT (*p); p++, *size -= 1); - if (*p == ';') - { - for (p++, *size -= 1; - *size && ISSPACE (*p); - p++, *size -= 1) ; - if (!strncasecmp (p, "URL=", 4)) - { - p += 4, *size -= 4; - s->at_value = 1; - return p; - } - } - } - else - { - s->at_value = 1; - return p; - } - } - /* Exit from quote. */ - if (*buf == s->quote_char) - { - s->in_quote = 0; - ++buf, --bufsize; - } - } while (*buf != '>'); - FREE_MAYBE (s->tag); - FREE_MAYBE (s->attr); - s->tag = s->attr = NULL; - if (!bufsize) - break; - } - - FREE_MAYBE (s->tag); - FREE_MAYBE (s->attr); - FREE_MAYBE (s->base); - memset (s, 0, sizeof (*s)); /* just to be sure */ - DEBUGP (("HTML parser ends here (state destroyed).\n")); - return NULL; -} - -/* The function returns the base reference of HTML buffer id, or NULL - if one wasn't defined for that buffer. */ -const char * -html_base (void) -{ - return global_state.base; -} - -/* Create a malloc'ed copy of text in the range [beg, end), but with - the HTML entities processed. Recognized entities are <, >, - &, ",   and the numerical entities. */ - -char * -html_decode_entities (const char *beg, const char *end) -{ - char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */ - const char *from = beg; - char *to = newstr; - - while (from < end) - { - if (*from != '&') - *to++ = *from++; - else - { - const char *save = from; - int remain; - - if (++from == end) goto lose; - remain = end - from; - - if (*from == '#') - { - int numeric; - ++from; - if (from == end || !ISDIGIT (*from)) goto lose; - for (numeric = 0; from < end && ISDIGIT (*from); from++) - numeric = 10 * numeric + (*from) - '0'; - if (from < end && ISALPHA (*from)) goto lose; - numeric &= 0xff; - *to++ = numeric; - } -#define FROB(literal) (remain >= (sizeof (literal) - 1) \ - && !memcmp (from, literal, sizeof (literal) - 1) \ - && (*(from + sizeof (literal) - 1) == ';' \ - || remain == sizeof (literal) - 1 \ - || !ISALNUM (*(from + sizeof (literal) - 1)))) - else if (FROB ("lt")) - *to++ = '<', from += 2; - else if (FROB ("gt")) - *to++ = '>', from += 2; - else if (FROB ("amp")) - *to++ = '&', from += 3; - else if (FROB ("quot")) - *to++ = '\"', from += 4; - /* We don't implement the "Added Latin 1" entities proposed - by rfc1866 (except for nbsp), because it is unnecessary - in the context of Wget, and would require hashing to work - efficiently. */ - else if (FROB ("nbsp")) - *to++ = 160, from += 4; - else - goto lose; -#undef FROB - /* If the entity was followed by `;', we step over the `;'. - Otherwise, it was followed by either a non-alphanumeric - or EOB, in which case we do nothing. */ - if (from < end && *from == ';') - ++from; - continue; - - lose: - /* This was not an entity after all. Back out. */ - from = save; - *to++ = *from++; - } - } - *to++ = '\0'; - /* #### Should we try to do this: */ -#if 0 - newstr = xrealloc (newstr, to - newstr); -#endif - return newstr; -} - -/* The function returns the pointer to the malloc-ed quoted version of - string s. It will recognize and quote numeric and special graphic - entities, as per RFC1866: - - `&' -> `&' - `<' -> `<' - `>' -> `>' - `"' -> `"' - - No other entities are recognized or replaced. */ -static char * -html_quote_string (const char *s) -{ - const char *b = s; - char *p, *res; - int i; - - /* Pass through the string, and count the new size. */ - for (i = 0; *s; s++, i++) - { - if (*s == '&') - i += 4; /* `amp;' */ - else if (*s == '<' || *s == '>') - i += 3; /* `lt;' and `gt;' */ - else if (*s == '\"') - i += 5; /* `quot;' */ - } - res = (char *)xmalloc (i + 1); - s = b; - for (p = res; *s; s++) - { - switch (*s) - { - case '&': - *p++ = '&'; - *p++ = 'a'; - *p++ = 'm'; - *p++ = 'p'; - *p++ = ';'; - break; - case '<': case '>': - *p++ = '&'; - *p++ = (*s == '<' ? 'l' : 'g'); - *p++ = 't'; - *p++ = ';'; - break; - case '\"': - *p++ = '&'; - *p++ = 'q'; - *p++ = 'u'; - *p++ = 'o'; - *p++ = 't'; - *p++ = ';'; - break; - default: - *p++ = *s; - } - } - *p = '\0'; - return res; -} - -/* The function creates an HTML index containing references to given - directories and files on the appropriate host. The references are - FTP. */ -uerr_t -ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f) -{ - FILE *fp; - char *upwd; - char *htclfile; /* HTML-clean file name */ - - if (!opt.dfp) - { - fp = fopen (file, "wb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return FOPENERR; - } - } - else - fp = opt.dfp; - if (u->user) - { - char *tmpu, *tmpp; /* temporary, clean user and passwd */ - - tmpu = CLEANDUP (u->user); - tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL; - upwd = (char *)xmalloc (strlen (tmpu) - + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); - sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : ""); - xfree (tmpu); - FREE_MAYBE (tmpp); - } - else - upwd = xstrdup (""); - fprintf (fp, "\n"); - fprintf (fp, "\n\n"); - fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); - fprintf (fp, "\n\n\n

"); - fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); - fprintf (fp, "

\n
\n
\n");
-  while (f)
-    {
-      fprintf (fp, "  ");
-      if (f->tstamp != -1)
-	{
-	  /* #### Should we translate the months? */
-	  static char *months[] = {
-	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
-	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
-	  };
-	  struct tm *ptm = localtime ((time_t *)&f->tstamp);
-
-	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
-		  ptm->tm_mday);
-	  if (ptm->tm_hour)
-	    fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
-	  else
-	    fprintf (fp, "       ");
-	}
-      else
-	fprintf (fp, _("time unknown       "));
-      switch (f->type)
-	{
-	case FT_PLAINFILE:
-	  fprintf (fp, _("File        "));
-	  break;
-	case FT_DIRECTORY:
-	  fprintf (fp, _("Directory   "));
-	  break;
-	case FT_SYMLINK:
-	  fprintf (fp, _("Link        "));
-	  break;
-	default:
-	  fprintf (fp, _("Not sure    "));
-	  break;
-	}
-      htclfile = html_quote_string (f->name);
-      fprintf (fp, "host, u->port);
-      if (*u->dir != '/')
-	putc ('/', fp);
-      fprintf (fp, "%s", u->dir);
-      if (*u->dir)
-	putc ('/', fp);
-      fprintf (fp, "%s", htclfile);
-      if (f->type == FT_DIRECTORY)
-	putc ('/', fp);
-      fprintf (fp, "\">%s", htclfile);
-      if (f->type == FT_DIRECTORY)
-	putc ('/', fp);
-      fprintf (fp, " ");
-      if (f->type == FT_PLAINFILE)
-	fprintf (fp, _(" (%s bytes)"), legible (f->size));
-      else if (f->type == FT_SYMLINK)
-	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
-      putc ('\n', fp);
-      xfree (htclfile);
-      f = f->next;
-    }
-  fprintf (fp, "
\n\n\n"); - xfree (upwd); - if (!opt.dfp) - fclose (fp); - else - fflush (fp); - return FTPOK; -}