Add option to write URL rejections to a tab-delimited CSV log.

* main.c: Add "--rejected-log" option.
 * init.c: Add "rejectedlog" command.
 * options.h: Add "rejected_log" parameter string.
 * wget.texi: Add brief documentation on new --rejected-log option.
 * recur.c: Optionally log details of URLs not traversed.
   Add reject_reason enum.
   (download_child_p -> download_child): Return a reject_reason.
   (descend_redirect_p -> descend_redirect): Return a reject_reason.
   (retrieve_tree): Support logging reasons for rejection.
   Add write_reject_log_header that writes a CSV format header to a file.
   Add write_reject_log_url that writes a url struct to a file in CSV format.
   Add write_reject_log_reason that writes the URL and parent URL as well as the
   rejection reason to a CSV file.
 * Test--rejected-log.px: Add a basic test for the --rejected-log command.
 * tests/Makefile.am: Run Test--rejected-log.px.

This allows you to figure out why URLs are being rejected, along with some
context around each rejection. CSV is used as the output format since it can be
easily parsed; it's delimited by tabs instead of commas to allow using all
(quoted) URL characters, and it includes column names, which may be used for
compatibility.
This commit is contained in:
Jookia 2015-07-31 23:41:36 +10:00 committed by Giuseppe Scrivano
parent 670eb924e7
commit e4db00d74d
7 changed files with 308 additions and 32 deletions

View File

@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
@cindex specify config
@item --config=@var{FILE}
Specify the location of a startup file you wish to use.
@item --rejected-log=@var{logfile}
Logs all URL rejections to @var{logfile} as tab-separated values, preceded by
a header line naming the columns. Each entry includes the reason for the
rejection, the rejected URL and the parent URL it was found in.
@end table
@node Download Options, Directory Options, Logging and Input File Options, Invoking

View File

@ -274,6 +274,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
{ "reject", &opt.rejects, cmd_vector },
{ "rejectedlog", &opt.rejected_log, cmd_file },
{ "rejectregex", &opt.rejectregex_s, cmd_string },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
@ -1856,6 +1857,7 @@ cleanup (void)
xfree (opt.post_data);
xfree (opt.body_data);
xfree (opt.body_file);
xfree (opt.rejected_log);
#endif /* DEBUG_MALLOC */
}

View File

@ -321,6 +321,7 @@ static struct cmdline_option option_data[] =
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
{ "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
#ifdef HAVE_METALINK
{ "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
@ -576,6 +577,8 @@ Logging and input file:\n"),
--config=FILE specify config file to use\n"),
N_("\
--no-config do not read any config file\n"),
N_("\
--rejected-log=FILE log reasons for URL rejection to FILE\n"),
"\n",
N_("\

View File

@ -296,6 +296,8 @@ struct options
name. */
bool report_bps; /*Output bandwidth in bits format*/
char *rejected_log; /* The file to log rejected URLS to. */
#ifdef HAVE_HSTS
bool hsts;
char *hsts_file;

View File

@ -182,11 +182,19 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
return ret;
}
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, struct url *, int,
struct url *, struct hash_table *, struct iri *);
/* Outcome of the download_child / descend_redirect checks.  SUCCESS means
   the URL passed every test; every other value names the test that caused
   the URL to be rejected:

     BLACKLIST    the URL is already on the blacklist.
     NOTHTTPS     opt.https_only is set and the URL's scheme is not HTTPS.
     NONHTTP      the scheme is not HTTP-like, and it is not FTP with
                  opt.follow_ftp enabled.
     ABSOLUTE     opt.relative_only is set and the link is not relative.
     DOMAIN       accept_domain refused the URL.
     PARENT       following the URL would escape the start URL's directory
                  while no_parent is on.
     LIST         accdir refused the URL's directory (excluded/not-included).
     REGEX        accept_url refused the URL (excluded/not-included through
                  regex).
     RULES        the file name does not match the acc/rej rules.
     SPANNEDHOST  the host differs from the parent's host.
     ROBOTS       robots.txt forbids the URL.

   The names are also written to the rejected log by
   write_reject_log_reason.

   NOTE(review): these identifiers are unprefixed; names such as DOMAIN may
   collide with macros from system headers on some platforms -- confirm.  */
typedef enum
{
SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
RULES, SPANNEDHOST, ROBOTS
} reject_reason;
static reject_reason download_child (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *, struct iri *);
static reject_reason descend_redirect (const char *, struct url *, int,
struct url *, struct hash_table *, struct iri *);
static void write_reject_log_header (FILE *);
static void write_reject_log_reason (FILE *, reject_reason, struct url *,
struct url *);
/* Retrieve a part of the web beginning with START_URL. This used to
be called "recursive retrieval", because the old function was
@ -244,6 +252,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
false);
blacklist_add (blacklist, start_url_parsed->url);
FILE *rejectedlog = 0; /* Don't write a rejected log. */
if (opt.rejected_log)
{
rejectedlog = fopen (opt.rejected_log, "w");
write_reject_log_header (rejectedlog);
if (!rejectedlog)
logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
}
while (1)
{
bool descend = false;
@ -266,9 +283,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
break;
/* ...and download it. Note that this download is in most cases
unconditional, as download_child_p already makes sure a file
unconditional, as download_child already makes sure a file
doesn't get enqueued twice -- and yet this check is here, and
not in download_child_p. This is so that if you run `wget -r
not in download_child. This is so that if you run `wget -r
URL1 URL2', and a random URL is encountered once under URL1
and again under URL2, but at a different (possibly smaller)
depth, we want the URL's children to be taken into account
@ -337,13 +354,19 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
want to follow it. */
if (descend)
{
if (!descend_redirect_p (redirected, url_parsed, depth,
start_url_parsed, blacklist, i))
descend = false;
reject_reason r = descend_redirect (redirected, url_parsed,
depth, start_url_parsed, blacklist, i);
if (r == SUCCESS)
{
/* Make sure that the old pre-redirect form gets
blacklisted. */
blacklist_add (blacklist, url);
}
else
/* Make sure that the old pre-redirect form gets
blacklisted. */
blacklist_add (blacklist, url);
{
write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
descend = false;
}
}
xfree (url);
@ -425,8 +448,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
continue;
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist, i))
reject_reason r = download_child (child, url_parsed, depth,
start_url_parsed, blacklist, i);
if (r == SUCCESS)
{
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
@ -439,6 +463,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
same URL twice. */
blacklist_add (blacklist, child->url->url);
}
else
{
write_reject_log_reason (rejectedlog, r, child->url, url_parsed);
}
}
if (strip_auth)
@ -478,6 +506,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
iri_free (i);
}
if (rejectedlog)
fclose (rejectedlog);
/* If anything is left of the queue due to a premature exit, free it
now. */
{
@ -513,14 +544,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
by storing these URLs to BLACKLIST. This may or may not help. It
will help if those URLs are encountered many times. */
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static reject_reason
download_child (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
bool u_scheme_like_http;
reject_reason reason = SUCCESS;
DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
@ -529,11 +561,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}
DEBUGP (("Already on the black list.\n"));
reason = BLACKLIST;
goto out;
}
@ -563,6 +596,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.https_only && u->scheme != SCHEME_HTTPS)
{
DEBUGP (("Not following non-HTTPS links.\n"));
reason = NOTHTTPS;
goto out;
}
#endif
@ -574,6 +608,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
reason = NONHTTP;
goto out;
}
@ -583,6 +618,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
reason = ABSOLUTE;
goto out;
}
@ -591,6 +627,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
reason = DOMAIN;
goto out;
}
@ -610,6 +647,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
u->dir, start_url_parsed->dir));
reason = PARENT;
goto out;
}
}
@ -622,12 +660,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accdir (u->dir))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
reason = LIST;
goto out;
}
}
if (!accept_url (url))
{
DEBUGP (("%s is excluded/not-included through regex.\n", url));
reason = REGEX;
goto out;
}
@ -652,6 +692,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
reason = RULES;
goto out;
}
}
@ -662,6 +703,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host));
reason = SPANNEDHOST;
goto out;
}
@ -704,35 +746,36 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
blacklist_add (blacklist, url);
reason = ROBOTS;
goto out;
}
}
/* The URL has passed all the tests. It can be placed in the
download queue. */
DEBUGP (("Decided to load it.\n"));
out:
return true;
if (reason == SUCCESS)
/* The URL has passed all the tests. It can be placed in the
download queue. */
DEBUGP (("Decided to load it.\n"));
else
DEBUGP (("Decided NOT to load it.\n"));
out:
DEBUGP (("Decided NOT to load it.\n"));
return false;
return reason;
}
/* This function determines whether we will consider downloading the
children of a URL whose download resulted in a redirection,
possibly to another host, etc. It is needed very rarely, and thus
it is merely a simple-minded wrapper around download_child_p. */
it is merely a simple-minded wrapper around download_child. */
static bool
descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
static reject_reason
descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *new_parsed;
struct urlpos *upos;
bool success;
reject_reason reason;
assert (orig_parsed != NULL);
@ -742,10 +785,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
reason = download_child (upos, orig_parsed, depth,
start_url_parsed, blacklist, iri);
if (success)
if (reason == SUCCESS)
blacklist_add (blacklist, upos->url->url);
else
DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
@ -753,7 +796,89 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
url_free (new_parsed);
xfree (upos);
return success;
return reason;
}
/* Emit the column-name line of the rejected-URL log to F.  Nothing is
   written when F is NULL.  */
static void
write_reject_log_header (FILE *f)
{
  /* Note: Update this header when columns change in any way.  The "U_"
     columns describe the rejected URL and the "P_" columns its parent.  */
  static const char header[] =
    "REASON\t"
    "U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t"
    "P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n";

  if (f != NULL)
    fputs (header, f);
}
/* Write URL to the rejected-URL log F as eight tab-separated fields: the
   escaped full URL, the scheme name, host, port, path, params, query and
   fragment.  Missing params, query or fragment are written as empty
   fields.  No trailing tab or newline is emitted; the caller supplies the
   separators.  Nothing is written when F is NULL.  Internal use only.  */
static void
write_reject_log_url (FILE *f, struct url *url)
{
  const char *scheme_str;
  char *escaped_str;

  if (!f)
    return;

  switch (url->scheme)
    {
      case SCHEME_HTTP:  scheme_str = "SCHEME_HTTP";    break;
#ifdef HAVE_SSL
      case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS";   break;
#endif
      case SCHEME_FTP:   scheme_str = "SCHEME_FTP";     break;
      /* Covers SCHEME_INVALID and any scheme that has no name above, so
         that a null pointer is never handed to the "%s" conversion.  */
      default:           scheme_str = "SCHEME_INVALID"; break;
    }

  /* The full URL is escaped before it is written; the individual
     components below are written as stored in URL.  */
  escaped_str = url_escape (url->url);

  fprintf (f, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
           escaped_str,
           scheme_str,
           url->host,
           url->port,
           url->path,
           url->params ? url->params : "",
           url->query ? url->query : "",
           url->fragment ? url->fragment : "");

  free (escaped_str);
}
/* Write one line to the rejected-URL log F describing why URL was
   rejected: the name of the reason R, then the fields of URL, then the
   fields of PARENT (the URL it was found in), all separated by tabs and
   terminated by a newline.  The format is comma separated values but with
   tabs.  Nothing is written when F is NULL.  */
static void
write_reject_log_reason (FILE *f, reject_reason r, struct url *url,
                         struct url *parent)
{
  /* Fallback for a value of R that has no case below, so that a null
     pointer is never passed to "%s".  No default label is used, which
     lets the compiler keep warning about unhandled enumerators.  */
  const char *reason_str = "UNKNOWN";

  if (!f)
    return;

  switch (r)
    {
      case SUCCESS:     reason_str = "SUCCESS";     break;
      case BLACKLIST:   reason_str = "BLACKLIST";   break;
      case NOTHTTPS:    reason_str = "NOTHTTPS";    break;
      case NONHTTP:     reason_str = "NONHTTP";     break;
      case ABSOLUTE:    reason_str = "ABSOLUTE";    break;
      case DOMAIN:      reason_str = "DOMAIN";      break;
      case PARENT:      reason_str = "PARENT";      break;
      case LIST:        reason_str = "LIST";        break;
      case REGEX:       reason_str = "REGEX";       break;
      case RULES:       reason_str = "RULES";       break;
      case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
      case ROBOTS:      reason_str = "ROBOTS";      break;
    }

  fprintf (f, "%s\t", reason_str);
  write_reject_log_url (f, url);
  fprintf (f, "\t");
  write_reject_log_url (f, parent);
  fprintf (f, "\n");
}
/* vim:set sts=2 sw=2 cino+={s: */

View File

@ -127,6 +127,7 @@ PX_TESTS = \
Test--start-pos.px \
Test--start-pos--continue.px \
Test--httpsonly-r.px \
Test--rejected-log.px \
Test-204.px
EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \

138
tests/Test--rejected-log.px Executable file
View File

@ -0,0 +1,138 @@
#!/usr/bin/env perl
use strict;
use warnings;
use HTTPTest;
###############################################################################
# Fixture pages for the recursive crawl.  They link to each other in a chain
# (main -> second -> third).  "{{port}}" is presumably replaced with the test
# server's port by the test harness -- confirm in HTTPTest.pm.

# Entry page: links only to the second page.
my $mainpage = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Recurse to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
</p>
</body>
</html>
EOF
# Second page: links on to the third page and back to the main page; the
# back link is the one expected to show up as BLACKLIST in the log.
my $secondpage = <<EOF;
<html>
<head>
<title>Second Page</title>
</head>
<body>
<p>
Recurse to a <a href="http://localhost:{{port}}/thirdpage.html">third page</a>.
Try the blacklisted <a href="http://localhost:{{port}}/index.html">main page</a>.
</p>
</body>
</html>
EOF
# Third page: links to a file disallowed by robots.txt and to a different
# host; these are expected to show up as ROBOTS and SPANNEDHOST in the log.
my $thirdpage = <<EOF;
<html>
<head>
<title>Third Page</title>
</head>
<body>
<p>
Try a hidden <a href="http://localhost:{{port}}/dummy.txt">dummy file</a>.
Try to leave to <a href="http://no.such.domain/">another domain</a>.
</p>
</body>
</html>
EOF
# robots.txt content: forbids /dummy.txt for every user agent.
my $robots = <<EOF;
User-agent: *
Disallow: /dummy.txt
EOF
my $log = <<EOF;
REASON U_URL U_SCHEME U_HOST U_PORT U_PATH U_PARAMS U_QUERY U_FRAGMENT P_URL P_SCHEME P_HOST P_PORT P_PATH P_PARAMS P_QUERY P_FRAGMENT
BLACKLIST http%3A//localhost%3A{{port}}/index.html SCHEME_HTTP localhost {{port}} index.html http%3A//localhost%3A{{port}}/secondpage.html SCHEME_HTTP localhost {{port}} secondpage.html
ROBOTS http%3A//localhost%3A{{port}}/dummy.txt SCHEME_HTTP localhost {{port}} dummy.txt http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
SPANNEDHOST http%3A//no.such.domain/ SCHEME_HTTP no.such.domain 80 http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
EOF
# Resources offered by the test HTTP server, keyed by request path.
# Each entry gives: code, msg, headers, content.
# /dummy.txt is offered here but disallowed by the robots.txt fixture.
my %urls = (
'/index.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $mainpage,
},
'/secondpage.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $secondpage,
},
'/thirdpage.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $thirdpage,
},
'/dummy.txt' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
'/robots.txt' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/plain",
},
content => $robots
},
);
# Crawl recursively (-r) from index.html without creating directories (-nd),
# writing the rejection log to log.csv.
my $cmdline = $WgetTest::WGETPATH . " -nd -r --rejected-log log.csv http://localhost:{{port}}/index.html";
# wget is expected to exit with status 0.
my $expected_error_code = 0;
# Files expected after the run, with their contents.  dummy.txt is absent
# on purpose (robots.txt disallows it), and log.csv must match $log.
my %expected_downloaded_files = (
"index.html" => {
content => $mainpage,
},
"secondpage.html" => {
content => $secondpage,
},
"thirdpage.html" => {
content => $thirdpage,
},
"robots.txt" => {
content => $robots,
},
"log.csv" => {
content => $log,
},
);
###############################################################################
# Build the test from the fixtures above and use the result of run() as this
# script's exit status.  Presumably HTTPTest serves %urls, runs $cmdline and
# compares the exit code and output files -- confirm in HTTPTest.pm.
my $the_test = HTTPTest->new (input => \%urls,
cmdline => $cmdline,
errcode => $expected_error_code,
output => \%expected_downloaded_files);
exit $the_test->run();
# vim: et ts=4 sw=4