[svn] Several fixes for recursive spider mode.

This commit is contained in:
mtortonesi 2006-08-24 08:27:57 -07:00
parent a53296c501
commit 0dbef4ccb4
9 changed files with 317 additions and 152 deletions

View File

@ -1,3 +1,27 @@
2006-08-24 Mauro Tortonesi <mauro@ferrara.linux.it>
* Makefile.in: Added spider.c to the list of files to compile and
spider.h to the list of header files. Updated copyright information.
* http.c: Major changes to recursive spider mode. Now for every
resource we are supposed to check, we send a HEAD request to find out
if it exists. If the resource is a HTML file, we retrieve it and parse
it to discover links to other resources.
* recur.c: Ditto.
* res.c (res_retrieve_file): Disable opt.timestamping and opt.spider
when retrieving robots.txt. Updated copyright information.
* convert.c: Moved code tracking broken links to spider.c.
* convert.h: Ditto.
* spider.c: Created new file to keep track of visited URLs in spider
mode.
* spider.h: Ditto.
2006-08-21 Mauro Tortonesi <mauro@ferrara.linux.it>
* http.c: Fixed timestamping-related bug.

View File

@ -1,5 +1,5 @@
# Makefile for `wget' utility
# Copyright (C) 1995-2005 Free Software Foundation, Inc.
# Copyright (C) 1995-2006 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -76,8 +76,8 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \
ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
res.o retr.o safe-ctype.o snprintf.o $(SSL_OBJ) url.o \
utils.o version.o xmalloc.o
res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \
url.o utils.o version.o xmalloc.o
.SUFFIXES:
.SUFFIXES: .c .o
@ -99,8 +99,8 @@ wget$(exeext): $(OBJ)
$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
progress.h ptimer.h recur.h res.h retr.h safe-ctype.h ssl.h \
sysdep.h url.h utils.h wget.h xmalloc.h
progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
#
# Dependencies for installing

View File

@ -54,8 +54,6 @@ struct hash_table *dl_url_file_map;
conversion after Wget is done. */
struct hash_table *downloaded_html_set;
static struct hash_table *nonexisting_urls_hash;
static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
@ -835,7 +833,6 @@ register_html (const char *url, const char *file)
}
static void downloaded_files_free (void);
static void nonexisting_urls_free (void);
/* Cleanup the data structures associated with this file. */
@ -857,7 +854,6 @@ convert_cleanup (void)
if (downloaded_html_set)
string_set_free (downloaded_html_set);
downloaded_files_free ();
nonexisting_urls_free ();
if (converted_files)
string_set_free (converted_files);
}
@ -957,122 +953,6 @@ downloaded_files_free (void)
downloaded_files_hash = NULL;
}
}
/* Remembers broken links. */
struct broken_urls_list
{
char *url;
struct broken_urls_list *next;
};
static bool
in_list (const struct broken_urls_list *list, const char *url)
{
const struct broken_urls_list *ptr;
for (ptr = list; ptr; ptr = ptr->next)
{
/* str[case]cmp is inadequate for URL comparison */
if (are_urls_equal (url, ptr->url) == 0) return true;
}
return false;
}
void
nonexisting_url (const char *url, const char *referrer)
{
struct broken_urls_list *list;
/* Ignore robots.txt URLs */
if (is_robots_txt_url (url))
return;
if (!nonexisting_urls_hash)
nonexisting_urls_hash = make_string_hash_table (0);
list = hash_table_get (nonexisting_urls_hash, url);
if (!list)
{
list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
list->url = referrer ? xstrdup (referrer) : NULL;
hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
}
else if (list && !in_list (list, referrer))
{
/* Append referrer at the end of the list */
struct broken_urls_list *newnode;
while (list->next) list = list->next;
newnode = xnew0 (struct broken_urls_list);
newnode->url = xstrdup (referrer);
list->next = newnode;
}
}
static void
nonexisting_urls_free (void)
{
if (nonexisting_urls_hash)
{
hash_table_iterator iter;
for (hash_table_iterate (nonexisting_urls_hash, &iter);
hash_table_iter_next (&iter);
)
{
xfree (iter.key);
xfree (iter.value);
}
hash_table_destroy (nonexisting_urls_hash);
nonexisting_urls_hash = NULL;
}
}
void
print_broken_links (void)
{
hash_table_iterator iter;
int num_elems;
if (!nonexisting_urls_hash)
{
logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
return;
}
num_elems = hash_table_count (nonexisting_urls_hash);
assert (num_elems > 0);
if (num_elems > 1)
{
logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
num_elems);
}
else
{
logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
}
for (hash_table_iterate (nonexisting_urls_hash, &iter);
hash_table_iter_next (&iter);
)
{
struct broken_urls_list *list;
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
for (list = (struct broken_urls_list *) iter.value;
list;
list = list->next)
{
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
}
}
logputs (LOG_NOTQUIET, "\n");
}
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic

View File

@ -104,7 +104,4 @@ void convert_cleanup (void);
char *html_quote_string (const char *);
void nonexisting_url (const char *, const char *);
void print_broken_links (void);
#endif /* CONVERT_H */

View File

@ -2281,6 +2281,10 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
/* Get the current time string. */
tms = time_str (time (NULL));
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
/* Print fetch message, if opt.verbose. */
if (opt.verbose)
{
@ -2308,8 +2312,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
if ((opt.spider && !opt.recursive)
|| (opt.timestamping && !got_head)
if (((opt.spider || opt.timestamping) && !got_head)
|| (opt.always_rest && !got_name))
*dt |= HEAD_ONLY;
else
@ -2412,13 +2415,22 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
hurl = url_string (u, true);
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
}
if (opt.spider && opt.recursive)
/* Maybe we should always keep track of broken links, not just in
* spider mode. */
if (opt.spider)
{
if (!hurl) hurl = url_string (u, true);
nonexisting_url (hurl, referer);
/* #### Again: ugly ugly ugly! */
if (!hurl)
hurl = url_string (u, true);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
Remote file does not exist -- broken link!!!\n"));
}
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
tms, hstat.statcode, escnonprint (hstat.error));
}
logputs (LOG_VERBOSE, "\n");
ret = WRONGCODE;
xfree_null (hurl);
@ -2447,10 +2459,12 @@ Last-modified header invalid -- time-stamp ignored.\n"));
/* The time-stamping section. */
if (opt.timestamping)
{
if (hstat.orig_file_name) /* Perform this check only if the file we're
supposed to download already exists. */
if (hstat.orig_file_name) /* Perform the following checks only
if the file we're supposed to
download already exists. */
{
if (hstat.remote_time && tmr != (time_t) (-1))
if (hstat.remote_time &&
tmr != (time_t) (-1))
{
/* Now time-stamping can be used validly. Time-stamping
means that if the sizes of the local and remote file
@ -2459,7 +2473,8 @@ Last-modified header invalid -- time-stamp ignored.\n"));
download procedure is resumed. */
if (hstat.orig_file_tstamp >= tmr)
{
if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
if (hstat.contlen == -1
|| hstat.orig_file_size == hstat.contlen)
{
logprintf (LOG_VERBOSE, _("\
Server file no newer than local file `%s' -- not retrieving.\n\n"),
@ -2493,6 +2508,33 @@ The sizes do not match (local %s) -- retrieving.\n"),
restart_loop = true;
}
if (opt.spider)
{
if (opt.recursive)
{
if (*dt & TEXTHTML)
{
logputs (LOG_VERBOSE, _("\
Remote file exists and could contain links to other resources -- retrieving.\n\n"));
restart_loop = true;
}
else
{
logprintf (LOG_VERBOSE, _("\
Remote file exists but does not contain any link -- not retrieving.\n\n"));
ret = RETRUNNEEDED;
goto exit;
}
}
else
{
logprintf (LOG_VERBOSE, _("\
Remote file exists but recursion is disabled -- not retrieving.\n\n"));
ret = RETRUNNEEDED;
goto exit;
}
}
got_head = true; /* no more time-stamping */
*dt &= ~HEAD_ONLY;
count = 0; /* the retrieve count for HEAD is reset */
@ -2502,7 +2544,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
}
if ((tmr != (time_t) (-1))
&& (!opt.spider || opt.recursive)
&& ((hstat.len == hstat.contlen) ||
((hstat.res == 0) && (hstat.contlen == -1))))
{
@ -2521,14 +2562,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
}
/* End of time-stamping section. */
if (opt.spider && !opt.recursive)
{
logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
escnonprint (hstat.error));
ret = RETROK;
goto exit;
}
tmrate = retr_rate (hstat.rd_size, hstat.dltime);
total_download_time += hstat.dltime;

View File

@ -274,6 +274,11 @@ retrieve_tree (const char *start_url)
}
}
if (opt.spider)
{
visited_url (url, referer);
}
if (descend
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
{
@ -365,6 +370,7 @@ retrieve_tree (const char *start_url)
file);
if (unlink (file))
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
logputs (LOG_VERBOSE, "\n");
register_delete_file (file);
}
@ -420,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (string_set_contains (blacklist, url))
{
if (opt.spider)
{
char *referrer = url_string (parent, true);
DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
visited_url (url, referrer);
xfree (referrer);
}
DEBUGP (("Already on the black list.\n"));
goto out;
}

View File

@ -1,5 +1,5 @@
/* Support for Robot Exclusion Standard (RES).
Copyright (C) 2001 Free Software Foundation, Inc.
Copyright (C) 2001,2006 Free Software Foundation, Inc.
This file is part of Wget.
@ -539,10 +539,16 @@ res_retrieve_file (const char *url, char **file)
{
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
xfree (robots_url);
if (err != RETROK && *file != NULL)

175
src/spider.c Normal file
View File

@ -0,0 +1,175 @@
/* Keep track of visited URLs in spider mode.
Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables. You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL". If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so. If you do not wish to do
so, delete this exception statement from your version. */
#include <config.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include "wget.h"
#include "spider.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "res.h"
static struct hash_table *visited_urls_hash;
static struct hash_table *nonexisting_urls_set;
/* Cleanup the data structures associated with this file. */
void
spider_cleanup (void)
{
if (visited_urls_hash)
{
free_keys_and_values (visited_urls_hash);
hash_table_destroy (visited_urls_hash);
visited_urls_hash = NULL;
}
if (nonexisting_urls_set)
string_set_free (nonexisting_urls_set);
}
/* Remembers visited files. */
struct url_list
{
char *url;
struct url_list *next;
};
static bool
in_url_list_p (const struct url_list *list, const char *url, bool verbose)
{
const struct url_list *ptr;
for (ptr = list; ptr; ptr = ptr->next)
{
/* str[case]cmp is inadequate for URL comparison */
if (are_urls_equal (url, ptr->url))
return true;
}
return false;
}
void
visited_url (const char *url, const char *referrer)
{
struct url_list *list;
/* Ignore robots.txt URLs */
if (is_robots_txt_url (url))
return;
if (!visited_urls_hash)
visited_urls_hash = make_string_hash_table (0);
list = hash_table_get (visited_urls_hash, url);
if (!list)
{
list = (struct url_list *) xnew0 (struct url_list);
list->url = referrer ? xstrdup (referrer) : NULL;
hash_table_put (visited_urls_hash, xstrdup (url), list);
}
else if (referrer && !in_url_list_p (list, referrer, false))
{
/* Append referrer at the end of the list */
struct url_list *newnode;
while (list->next)
list = list->next;
newnode = (struct url_list *) xnew0 (struct url_list);
newnode->url = xstrdup (referrer);
list->next = newnode;
}
}
/* Remembers broken links. */
void
nonexisting_url (const char *url)
{
/* Ignore robots.txt URLs */
if (is_robots_txt_url (url))
return;
if (!nonexisting_urls_set)
nonexisting_urls_set = make_string_hash_table (0);
string_set_add (nonexisting_urls_set, url);
}
void
print_broken_links (void)
{
hash_table_iterator iter;
int num_elems;
if (!nonexisting_urls_set)
{
logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
return;
}
num_elems = hash_table_count (nonexisting_urls_set);
assert (num_elems > 0);
if (num_elems > 1)
{
logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
num_elems);
}
else
{
logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
}
for (hash_table_iterate (nonexisting_urls_set, &iter);
hash_table_iter_next (&iter); )
{
struct url_list *list;
const char *url = (const char *) iter.key;
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), url);
for (list = (struct url_list *) hash_table_get (visited_urls_hash, url);
list; list = list->next)
{
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
}
}
logputs (LOG_NOTQUIET, "\n");
}
/*
* vim: et ts=2 sw=2
*/

37
src/spider.h Normal file
View File

@ -0,0 +1,37 @@
/* Declarations for spider.c
Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables. You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL". If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so. If you do not wish to do
so, delete this exception statement from your version. */
#ifndef SPIDER_H
#define SPIDER_H
void visited_url (const char *, const char *);
void nonexisting_url (const char *);
void print_broken_links (void);
#endif /* SPIDER_H */