mirror of
https://github.com/mirror/wget.git
synced 2025-01-24 19:30:54 +08:00
[svn] Several fixes for recursive spider mode.
This commit is contained in:
parent
a53296c501
commit
0dbef4ccb4
@ -1,3 +1,27 @@
|
|||||||
|
2006-08-24 Mauro Tortonesi <mauro@ferrara.linux.it>
|
||||||
|
|
||||||
|
* Makefile.in: Added spider.c to the list of files to compile and
|
||||||
|
spider.h to the list of header files. Updated copyright information.
|
||||||
|
|
||||||
|
* http.c: Major changes to recursive spider mode. Now for every
|
||||||
|
resource we are supposed to check, we send a HEAD request to find out
|
||||||
|
if it exists. If the resource is a HTML file, we retrieve it and parse
|
||||||
|
it to discover links to other resources.
|
||||||
|
|
||||||
|
* recur.c: Ditto.
|
||||||
|
|
||||||
|
* res.c (res_retrieve_file): Disable opt.timestamping and opt.spider
|
||||||
|
when retrieving robots.txt. Updated copyright information.
|
||||||
|
|
||||||
|
* convert.c: Moved code tracking broken links to spider.c.
|
||||||
|
|
||||||
|
* convert.h: Ditto.
|
||||||
|
|
||||||
|
* spider.c: Created new file to keep track of visited URLs in spider
|
||||||
|
mode.
|
||||||
|
|
||||||
|
* spider.h: Ditto.
|
||||||
|
|
||||||
2006-08-21 Mauro Tortonesi <mauro@ferrara.linux.it>
|
2006-08-21 Mauro Tortonesi <mauro@ferrara.linux.it>
|
||||||
|
|
||||||
* http.c: Fixed timestamping-related bug.
|
* http.c: Fixed timestamping-related bug.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# Makefile for `wget' utility
|
# Makefile for `wget' utility
|
||||||
# Copyright (C) 1995-2005 Free Software Foundation, Inc.
|
# Copyright (C) 1995-2006 Free Software Foundation, Inc.
|
||||||
|
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
@ -76,8 +76,8 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \
|
|||||||
ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
|
ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
|
||||||
host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
|
host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
|
||||||
log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
|
log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
|
||||||
res.o retr.o safe-ctype.o snprintf.o $(SSL_OBJ) url.o \
|
res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \
|
||||||
utils.o version.o xmalloc.o
|
url.o utils.o version.o xmalloc.o
|
||||||
|
|
||||||
.SUFFIXES:
|
.SUFFIXES:
|
||||||
.SUFFIXES: .c .o
|
.SUFFIXES: .c .o
|
||||||
@ -96,11 +96,11 @@ wget$(exeext): $(OBJ)
|
|||||||
# time, and it's a lot safer than attempting to get all the
|
# time, and it's a lot safer than attempting to get all the
|
||||||
# dependencies right.
|
# dependencies right.
|
||||||
|
|
||||||
$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
|
$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
|
||||||
gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
|
gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
|
||||||
http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
|
http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
|
||||||
progress.h ptimer.h recur.h res.h retr.h safe-ctype.h ssl.h \
|
progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \
|
||||||
sysdep.h url.h utils.h wget.h xmalloc.h
|
spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
|
||||||
|
|
||||||
#
|
#
|
||||||
# Dependencies for installing
|
# Dependencies for installing
|
||||||
|
120
src/convert.c
120
src/convert.c
@ -54,8 +54,6 @@ struct hash_table *dl_url_file_map;
|
|||||||
conversion after Wget is done. */
|
conversion after Wget is done. */
|
||||||
struct hash_table *downloaded_html_set;
|
struct hash_table *downloaded_html_set;
|
||||||
|
|
||||||
static struct hash_table *nonexisting_urls_hash;
|
|
||||||
|
|
||||||
static void convert_links (const char *, struct urlpos *);
|
static void convert_links (const char *, struct urlpos *);
|
||||||
|
|
||||||
/* This function is called when the retrieval is done to convert the
|
/* This function is called when the retrieval is done to convert the
|
||||||
@ -835,7 +833,6 @@ register_html (const char *url, const char *file)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void downloaded_files_free (void);
|
static void downloaded_files_free (void);
|
||||||
static void nonexisting_urls_free (void);
|
|
||||||
|
|
||||||
/* Cleanup the data structures associated with this file. */
|
/* Cleanup the data structures associated with this file. */
|
||||||
|
|
||||||
@ -857,7 +854,6 @@ convert_cleanup (void)
|
|||||||
if (downloaded_html_set)
|
if (downloaded_html_set)
|
||||||
string_set_free (downloaded_html_set);
|
string_set_free (downloaded_html_set);
|
||||||
downloaded_files_free ();
|
downloaded_files_free ();
|
||||||
nonexisting_urls_free ();
|
|
||||||
if (converted_files)
|
if (converted_files)
|
||||||
string_set_free (converted_files);
|
string_set_free (converted_files);
|
||||||
}
|
}
|
||||||
@ -957,122 +953,6 @@ downloaded_files_free (void)
|
|||||||
downloaded_files_hash = NULL;
|
downloaded_files_hash = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Remembers broken links. */
|
|
||||||
|
|
||||||
struct broken_urls_list
|
|
||||||
{
|
|
||||||
char *url;
|
|
||||||
struct broken_urls_list *next;
|
|
||||||
};
|
|
||||||
|
|
||||||
static bool
|
|
||||||
in_list (const struct broken_urls_list *list, const char *url)
|
|
||||||
{
|
|
||||||
const struct broken_urls_list *ptr;
|
|
||||||
|
|
||||||
for (ptr = list; ptr; ptr = ptr->next)
|
|
||||||
{
|
|
||||||
/* str[case]cmp is inadequate for URL comparison */
|
|
||||||
if (are_urls_equal (url, ptr->url) == 0) return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
nonexisting_url (const char *url, const char *referrer)
|
|
||||||
{
|
|
||||||
struct broken_urls_list *list;
|
|
||||||
|
|
||||||
/* Ignore robots.txt URLs */
|
|
||||||
if (is_robots_txt_url (url))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (!nonexisting_urls_hash)
|
|
||||||
nonexisting_urls_hash = make_string_hash_table (0);
|
|
||||||
|
|
||||||
list = hash_table_get (nonexisting_urls_hash, url);
|
|
||||||
if (!list)
|
|
||||||
{
|
|
||||||
list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
|
|
||||||
list->url = referrer ? xstrdup (referrer) : NULL;
|
|
||||||
hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
|
|
||||||
}
|
|
||||||
else if (list && !in_list (list, referrer))
|
|
||||||
{
|
|
||||||
/* Append referrer at the end of the list */
|
|
||||||
struct broken_urls_list *newnode;
|
|
||||||
|
|
||||||
while (list->next) list = list->next;
|
|
||||||
|
|
||||||
newnode = xnew0 (struct broken_urls_list);
|
|
||||||
newnode->url = xstrdup (referrer);
|
|
||||||
list->next = newnode;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
nonexisting_urls_free (void)
|
|
||||||
{
|
|
||||||
if (nonexisting_urls_hash)
|
|
||||||
{
|
|
||||||
hash_table_iterator iter;
|
|
||||||
for (hash_table_iterate (nonexisting_urls_hash, &iter);
|
|
||||||
hash_table_iter_next (&iter);
|
|
||||||
)
|
|
||||||
{
|
|
||||||
xfree (iter.key);
|
|
||||||
xfree (iter.value);
|
|
||||||
}
|
|
||||||
hash_table_destroy (nonexisting_urls_hash);
|
|
||||||
nonexisting_urls_hash = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
print_broken_links (void)
|
|
||||||
{
|
|
||||||
hash_table_iterator iter;
|
|
||||||
int num_elems;
|
|
||||||
|
|
||||||
if (!nonexisting_urls_hash)
|
|
||||||
{
|
|
||||||
logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
num_elems = hash_table_count (nonexisting_urls_hash);
|
|
||||||
assert (num_elems > 0);
|
|
||||||
|
|
||||||
if (num_elems > 1)
|
|
||||||
{
|
|
||||||
logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
|
|
||||||
num_elems);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (hash_table_iterate (nonexisting_urls_hash, &iter);
|
|
||||||
hash_table_iter_next (&iter);
|
|
||||||
)
|
|
||||||
{
|
|
||||||
struct broken_urls_list *list;
|
|
||||||
|
|
||||||
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
|
|
||||||
|
|
||||||
for (list = (struct broken_urls_list *) iter.value;
|
|
||||||
list;
|
|
||||||
list = list->next)
|
|
||||||
{
|
|
||||||
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
logputs (LOG_NOTQUIET, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* The function returns the pointer to the malloc-ed quoted version of
|
/* The function returns the pointer to the malloc-ed quoted version of
|
||||||
string s. It will recognize and quote numeric and special graphic
|
string s. It will recognize and quote numeric and special graphic
|
||||||
|
@ -104,7 +104,4 @@ void convert_cleanup (void);
|
|||||||
|
|
||||||
char *html_quote_string (const char *);
|
char *html_quote_string (const char *);
|
||||||
|
|
||||||
void nonexisting_url (const char *, const char *);
|
|
||||||
void print_broken_links (void);
|
|
||||||
|
|
||||||
#endif /* CONVERT_H */
|
#endif /* CONVERT_H */
|
||||||
|
73
src/http.c
73
src/http.c
@ -2281,6 +2281,10 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
|||||||
/* Get the current time string. */
|
/* Get the current time string. */
|
||||||
tms = time_str (time (NULL));
|
tms = time_str (time (NULL));
|
||||||
|
|
||||||
|
if (opt.spider && !got_head)
|
||||||
|
logprintf (LOG_VERBOSE, _("\
|
||||||
|
Spider mode enabled. Check if remote file exists.\n"));
|
||||||
|
|
||||||
/* Print fetch message, if opt.verbose. */
|
/* Print fetch message, if opt.verbose. */
|
||||||
if (opt.verbose)
|
if (opt.verbose)
|
||||||
{
|
{
|
||||||
@ -2308,8 +2312,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
|||||||
/* Default document type is empty. However, if spider mode is
|
/* Default document type is empty. However, if spider mode is
|
||||||
on or time-stamping is employed, HEAD_ONLY commands is
|
on or time-stamping is employed, HEAD_ONLY commands is
|
||||||
encoded within *dt. */
|
encoded within *dt. */
|
||||||
if ((opt.spider && !opt.recursive)
|
if (((opt.spider || opt.timestamping) && !got_head)
|
||||||
|| (opt.timestamping && !got_head)
|
|
||||||
|| (opt.always_rest && !got_name))
|
|| (opt.always_rest && !got_name))
|
||||||
*dt |= HEAD_ONLY;
|
*dt |= HEAD_ONLY;
|
||||||
else
|
else
|
||||||
@ -2412,13 +2415,22 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
|||||||
hurl = url_string (u, true);
|
hurl = url_string (u, true);
|
||||||
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
|
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
|
||||||
}
|
}
|
||||||
if (opt.spider && opt.recursive)
|
/* Maybe we should always keep track of broken links, not just in
|
||||||
|
* spider mode. */
|
||||||
|
if (opt.spider)
|
||||||
{
|
{
|
||||||
if (!hurl) hurl = url_string (u, true);
|
/* #### Again: ugly ugly ugly! */
|
||||||
nonexisting_url (hurl, referer);
|
if (!hurl)
|
||||||
|
hurl = url_string (u, true);
|
||||||
|
nonexisting_url (hurl);
|
||||||
|
logprintf (LOG_NOTQUIET, _("\
|
||||||
|
Remote file does not exist -- broken link!!!\n"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
|
||||||
|
tms, hstat.statcode, escnonprint (hstat.error));
|
||||||
}
|
}
|
||||||
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
|
|
||||||
tms, hstat.statcode, escnonprint (hstat.error));
|
|
||||||
logputs (LOG_VERBOSE, "\n");
|
logputs (LOG_VERBOSE, "\n");
|
||||||
ret = WRONGCODE;
|
ret = WRONGCODE;
|
||||||
xfree_null (hurl);
|
xfree_null (hurl);
|
||||||
@ -2447,10 +2459,12 @@ Last-modified header invalid -- time-stamp ignored.\n"));
|
|||||||
/* The time-stamping section. */
|
/* The time-stamping section. */
|
||||||
if (opt.timestamping)
|
if (opt.timestamping)
|
||||||
{
|
{
|
||||||
if (hstat.orig_file_name) /* Perform this check only if the file we're
|
if (hstat.orig_file_name) /* Perform the following checks only
|
||||||
supposed to download already exists. */
|
if the file we're supposed to
|
||||||
|
download already exists. */
|
||||||
{
|
{
|
||||||
if (hstat.remote_time && tmr != (time_t) (-1))
|
if (hstat.remote_time &&
|
||||||
|
tmr != (time_t) (-1))
|
||||||
{
|
{
|
||||||
/* Now time-stamping can be used validly. Time-stamping
|
/* Now time-stamping can be used validly. Time-stamping
|
||||||
means that if the sizes of the local and remote file
|
means that if the sizes of the local and remote file
|
||||||
@ -2459,7 +2473,8 @@ Last-modified header invalid -- time-stamp ignored.\n"));
|
|||||||
download procedure is resumed. */
|
download procedure is resumed. */
|
||||||
if (hstat.orig_file_tstamp >= tmr)
|
if (hstat.orig_file_tstamp >= tmr)
|
||||||
{
|
{
|
||||||
if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
|
if (hstat.contlen == -1
|
||||||
|
|| hstat.orig_file_size == hstat.contlen)
|
||||||
{
|
{
|
||||||
logprintf (LOG_VERBOSE, _("\
|
logprintf (LOG_VERBOSE, _("\
|
||||||
Server file no newer than local file `%s' -- not retrieving.\n\n"),
|
Server file no newer than local file `%s' -- not retrieving.\n\n"),
|
||||||
@ -2493,6 +2508,33 @@ The sizes do not match (local %s) -- retrieving.\n"),
|
|||||||
restart_loop = true;
|
restart_loop = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (opt.spider)
|
||||||
|
{
|
||||||
|
if (opt.recursive)
|
||||||
|
{
|
||||||
|
if (*dt & TEXTHTML)
|
||||||
|
{
|
||||||
|
logputs (LOG_VERBOSE, _("\
|
||||||
|
Remote file exists and could contain links to other resources -- retrieving.\n\n"));
|
||||||
|
restart_loop = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, _("\
|
||||||
|
Remote file exists but does not contain any link -- not retrieving.\n\n"));
|
||||||
|
ret = RETRUNNEEDED;
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, _("\
|
||||||
|
Remote file exists but recursion is disabled -- not retrieving.\n\n"));
|
||||||
|
ret = RETRUNNEEDED;
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
got_head = true; /* no more time-stamping */
|
got_head = true; /* no more time-stamping */
|
||||||
*dt &= ~HEAD_ONLY;
|
*dt &= ~HEAD_ONLY;
|
||||||
count = 0; /* the retrieve count for HEAD is reset */
|
count = 0; /* the retrieve count for HEAD is reset */
|
||||||
@ -2502,7 +2544,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((tmr != (time_t) (-1))
|
if ((tmr != (time_t) (-1))
|
||||||
&& (!opt.spider || opt.recursive)
|
|
||||||
&& ((hstat.len == hstat.contlen) ||
|
&& ((hstat.len == hstat.contlen) ||
|
||||||
((hstat.res == 0) && (hstat.contlen == -1))))
|
((hstat.res == 0) && (hstat.contlen == -1))))
|
||||||
{
|
{
|
||||||
@ -2521,14 +2562,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
|
|||||||
}
|
}
|
||||||
/* End of time-stamping section. */
|
/* End of time-stamping section. */
|
||||||
|
|
||||||
if (opt.spider && !opt.recursive)
|
|
||||||
{
|
|
||||||
logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
|
|
||||||
escnonprint (hstat.error));
|
|
||||||
ret = RETROK;
|
|
||||||
goto exit;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmrate = retr_rate (hstat.rd_size, hstat.dltime);
|
tmrate = retr_rate (hstat.rd_size, hstat.dltime);
|
||||||
total_download_time += hstat.dltime;
|
total_download_time += hstat.dltime;
|
||||||
|
|
||||||
|
13
src/recur.c
13
src/recur.c
@ -274,6 +274,11 @@ retrieve_tree (const char *start_url)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (opt.spider)
|
||||||
|
{
|
||||||
|
visited_url (url, referer);
|
||||||
|
}
|
||||||
|
|
||||||
if (descend
|
if (descend
|
||||||
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
|
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
|
||||||
{
|
{
|
||||||
@ -365,6 +370,7 @@ retrieve_tree (const char *start_url)
|
|||||||
file);
|
file);
|
||||||
if (unlink (file))
|
if (unlink (file))
|
||||||
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
|
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
|
||||||
|
logputs (LOG_VERBOSE, "\n");
|
||||||
register_delete_file (file);
|
register_delete_file (file);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -420,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
|
|
||||||
if (string_set_contains (blacklist, url))
|
if (string_set_contains (blacklist, url))
|
||||||
{
|
{
|
||||||
|
if (opt.spider)
|
||||||
|
{
|
||||||
|
char *referrer = url_string (parent, true);
|
||||||
|
DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
|
||||||
|
visited_url (url, referrer);
|
||||||
|
xfree (referrer);
|
||||||
|
}
|
||||||
DEBUGP (("Already on the black list.\n"));
|
DEBUGP (("Already on the black list.\n"));
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/* Support for Robot Exclusion Standard (RES).
|
/* Support for Robot Exclusion Standard (RES).
|
||||||
Copyright (C) 2001 Free Software Foundation, Inc.
|
Copyright (C) 2001,2006 Free Software Foundation, Inc.
|
||||||
|
|
||||||
This file is part of Wget.
|
This file is part of Wget.
|
||||||
|
|
||||||
@ -539,10 +539,16 @@ res_retrieve_file (const char *url, char **file)
|
|||||||
{
|
{
|
||||||
uerr_t err;
|
uerr_t err;
|
||||||
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
||||||
|
int saved_ts_val = opt.timestamping;
|
||||||
|
int saved_sp_val = opt.spider;
|
||||||
|
|
||||||
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
|
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
|
||||||
*file = NULL;
|
*file = NULL;
|
||||||
|
opt.timestamping = false;
|
||||||
|
opt.spider = false;
|
||||||
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
|
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
|
||||||
|
opt.timestamping = saved_ts_val;
|
||||||
|
opt.spider = saved_sp_val;
|
||||||
xfree (robots_url);
|
xfree (robots_url);
|
||||||
|
|
||||||
if (err != RETROK && *file != NULL)
|
if (err != RETROK && *file != NULL)
|
||||||
|
175
src/spider.c
Normal file
175
src/spider.c
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
/* Keep track of visited URLs in spider mode.
|
||||||
|
Copyright (C) 2006 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of GNU Wget.
|
||||||
|
|
||||||
|
GNU Wget is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
GNU Wget is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with Wget; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
In addition, as a special exception, the Free Software Foundation
|
||||||
|
gives permission to link the code of its release of Wget with the
|
||||||
|
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||||
|
that use the same license as the "OpenSSL" library), and distribute
|
||||||
|
the linked executables. You must obey the GNU General Public License
|
||||||
|
in all respects for all of the code used other than "OpenSSL". If you
|
||||||
|
modify this file, you may extend this exception to your version of the
|
||||||
|
file, but you are not obligated to do so. If you do not wish to do
|
||||||
|
so, delete this exception statement from your version. */
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#include "wget.h"
|
||||||
|
#include "spider.h"
|
||||||
|
#include "url.h"
|
||||||
|
#include "utils.h"
|
||||||
|
#include "hash.h"
|
||||||
|
#include "res.h"
|
||||||
|
|
||||||
|
|
||||||
|
static struct hash_table *visited_urls_hash;
|
||||||
|
static struct hash_table *nonexisting_urls_set;
|
||||||
|
|
||||||
|
/* Cleanup the data structures associated with this file. */
|
||||||
|
|
||||||
|
void
|
||||||
|
spider_cleanup (void)
|
||||||
|
{
|
||||||
|
if (visited_urls_hash)
|
||||||
|
{
|
||||||
|
free_keys_and_values (visited_urls_hash);
|
||||||
|
hash_table_destroy (visited_urls_hash);
|
||||||
|
visited_urls_hash = NULL;
|
||||||
|
}
|
||||||
|
if (nonexisting_urls_set)
|
||||||
|
string_set_free (nonexisting_urls_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remembers visited files. */
|
||||||
|
|
||||||
|
struct url_list
|
||||||
|
{
|
||||||
|
char *url;
|
||||||
|
struct url_list *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool
|
||||||
|
in_url_list_p (const struct url_list *list, const char *url, bool verbose)
|
||||||
|
{
|
||||||
|
const struct url_list *ptr;
|
||||||
|
|
||||||
|
for (ptr = list; ptr; ptr = ptr->next)
|
||||||
|
{
|
||||||
|
/* str[case]cmp is inadequate for URL comparison */
|
||||||
|
if (are_urls_equal (url, ptr->url))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
visited_url (const char *url, const char *referrer)
|
||||||
|
{
|
||||||
|
struct url_list *list;
|
||||||
|
|
||||||
|
/* Ignore robots.txt URLs */
|
||||||
|
if (is_robots_txt_url (url))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (!visited_urls_hash)
|
||||||
|
visited_urls_hash = make_string_hash_table (0);
|
||||||
|
|
||||||
|
list = hash_table_get (visited_urls_hash, url);
|
||||||
|
if (!list)
|
||||||
|
{
|
||||||
|
list = (struct url_list *) xnew0 (struct url_list);
|
||||||
|
list->url = referrer ? xstrdup (referrer) : NULL;
|
||||||
|
hash_table_put (visited_urls_hash, xstrdup (url), list);
|
||||||
|
}
|
||||||
|
else if (referrer && !in_url_list_p (list, referrer, false))
|
||||||
|
{
|
||||||
|
/* Append referrer at the end of the list */
|
||||||
|
struct url_list *newnode;
|
||||||
|
|
||||||
|
while (list->next)
|
||||||
|
list = list->next;
|
||||||
|
|
||||||
|
newnode = (struct url_list *) xnew0 (struct url_list);
|
||||||
|
newnode->url = xstrdup (referrer);
|
||||||
|
list->next = newnode;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remembers broken links. */
|
||||||
|
void
|
||||||
|
nonexisting_url (const char *url)
|
||||||
|
{
|
||||||
|
/* Ignore robots.txt URLs */
|
||||||
|
if (is_robots_txt_url (url))
|
||||||
|
return;
|
||||||
|
if (!nonexisting_urls_set)
|
||||||
|
nonexisting_urls_set = make_string_hash_table (0);
|
||||||
|
string_set_add (nonexisting_urls_set, url);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
print_broken_links (void)
|
||||||
|
{
|
||||||
|
hash_table_iterator iter;
|
||||||
|
int num_elems;
|
||||||
|
|
||||||
|
if (!nonexisting_urls_set)
|
||||||
|
{
|
||||||
|
logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
num_elems = hash_table_count (nonexisting_urls_set);
|
||||||
|
assert (num_elems > 0);
|
||||||
|
|
||||||
|
if (num_elems > 1)
|
||||||
|
{
|
||||||
|
logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
|
||||||
|
num_elems);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (hash_table_iterate (nonexisting_urls_set, &iter);
|
||||||
|
hash_table_iter_next (&iter); )
|
||||||
|
{
|
||||||
|
struct url_list *list;
|
||||||
|
const char *url = (const char *) iter.key;
|
||||||
|
|
||||||
|
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), url);
|
||||||
|
|
||||||
|
for (list = (struct url_list *) hash_table_get (visited_urls_hash, url);
|
||||||
|
list; list = list->next)
|
||||||
|
{
|
||||||
|
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logputs (LOG_NOTQUIET, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vim: et ts=2 sw=2
|
||||||
|
*/
|
||||||
|
|
37
src/spider.h
Normal file
37
src/spider.h
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
/* Declarations for spider.c
|
||||||
|
Copyright (C) 2006 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of GNU Wget.
|
||||||
|
|
||||||
|
GNU Wget is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
GNU Wget is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with Wget; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
In addition, as a special exception, the Free Software Foundation
|
||||||
|
gives permission to link the code of its release of Wget with the
|
||||||
|
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||||
|
that use the same license as the "OpenSSL" library), and distribute
|
||||||
|
the linked executables. You must obey the GNU General Public License
|
||||||
|
in all respects for all of the code used other than "OpenSSL". If you
|
||||||
|
modify this file, you may extend this exception to your version of the
|
||||||
|
file, but you are not obligated to do so. If you do not wish to do
|
||||||
|
so, delete this exception statement from your version. */
|
||||||
|
|
||||||
|
#ifndef SPIDER_H
|
||||||
|
#define SPIDER_H
|
||||||
|
|
||||||
|
void visited_url (const char *, const char *);
|
||||||
|
void nonexisting_url (const char *);
|
||||||
|
void print_broken_links (void);
|
||||||
|
|
||||||
|
#endif /* SPIDER_H */
|
Loading…
Reference in New Issue
Block a user