mirror of
https://github.com/mirror/wget.git
synced 2025-01-08 19:30:41 +08:00
[svn] Fixes for recursive spider mode.
This commit is contained in:
parent
79f66dfd15
commit
60c88ee992
@ -1,3 +1,22 @@
|
|||||||
|
2006-06-28 Mauro Tortonesi <mauro@ferrara.linux.it>
|
||||||
|
|
||||||
|
* res.c: Implemented is_robots_txt_url function for detection of
|
||||||
|
robots.txt URLs and related test routine.
|
||||||
|
|
||||||
|
* res.h: Ditto.
|
||||||
|
|
||||||
|
* url.c: Implemented are_urls_equal function for URL comparison and
|
||||||
|
related testing routine.
|
||||||
|
|
||||||
|
* url.h: Ditto.
|
||||||
|
|
||||||
|
* convert.c: Fixes for recursive spider mode: don't consider
|
||||||
|
non-existing robots.txt as a broken link, and use are_urls_equal
|
||||||
|
instead of strcasecmp for referrer URL comparison.
|
||||||
|
|
||||||
|
* test.c: Call test routines for are_urls_equal and
|
||||||
|
is_robots_txt_url.
|
||||||
|
|
||||||
2006-06-26 Hrvoje Niksic <hniksic@xemacs.org>
|
2006-06-26 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
* wget.h (wgint): Typedef to any 64-bit (or larger) type we can
|
* wget.h (wgint): Typedef to any 64-bit (or larger) type we can
|
||||||
|
462
src/convert.c
462
src/convert.c
@ -45,6 +45,7 @@ so, delete this exception statement from your version. */
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "hash.h"
|
#include "hash.h"
|
||||||
#include "ptimer.h"
|
#include "ptimer.h"
|
||||||
|
#include "res.h"
|
||||||
|
|
||||||
static struct hash_table *dl_file_url_map;
|
static struct hash_table *dl_file_url_map;
|
||||||
struct hash_table *dl_url_file_map;
|
struct hash_table *dl_url_file_map;
|
||||||
@ -99,13 +100,13 @@ convert_all_links (void)
|
|||||||
char *file = file_array[i];
|
char *file = file_array[i];
|
||||||
|
|
||||||
/* Determine the URL of the HTML file. get_urls_html will need
|
/* Determine the URL of the HTML file. get_urls_html will need
|
||||||
it. */
|
it. */
|
||||||
url = hash_table_get (dl_file_url_map, file);
|
url = hash_table_get (dl_file_url_map, file);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
DEBUGP (("Apparently %s has been removed.\n", file));
|
DEBUGP (("Apparently %s has been removed.\n", file));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUGP (("Scanning %s (from %s)\n", file, url));
|
DEBUGP (("Scanning %s (from %s)\n", file, url));
|
||||||
|
|
||||||
@ -117,48 +118,48 @@ convert_all_links (void)
|
|||||||
links that have been followed from other files. */
|
links that have been followed from other files. */
|
||||||
|
|
||||||
for (cur_url = urls; cur_url; cur_url = cur_url->next)
|
for (cur_url = urls; cur_url; cur_url = cur_url->next)
|
||||||
{
|
{
|
||||||
char *local_name;
|
char *local_name;
|
||||||
struct url *u = cur_url->url;
|
struct url *u = cur_url->url;
|
||||||
|
|
||||||
if (cur_url->link_base_p)
|
if (cur_url->link_base_p)
|
||||||
{
|
{
|
||||||
/* Base references have been resolved by our parser, so
|
/* Base references have been resolved by our parser, so
|
||||||
we turn the base URL into an empty string. (Perhaps
|
we turn the base URL into an empty string. (Perhaps
|
||||||
we should remove the tag entirely?) */
|
we should remove the tag entirely?) */
|
||||||
cur_url->convert = CO_NULLIFY_BASE;
|
cur_url->convert = CO_NULLIFY_BASE;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We decide the direction of conversion according to whether
|
/* We decide the direction of conversion according to whether
|
||||||
a URL was downloaded. Downloaded URLs will be converted
|
a URL was downloaded. Downloaded URLs will be converted
|
||||||
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
||||||
local_name = hash_table_get (dl_url_file_map, u->url);
|
local_name = hash_table_get (dl_url_file_map, u->url);
|
||||||
|
|
||||||
/* Decide on the conversion type. */
|
/* Decide on the conversion type. */
|
||||||
if (local_name)
|
if (local_name)
|
||||||
{
|
{
|
||||||
/* We've downloaded this URL. Convert it to relative
|
/* We've downloaded this URL. Convert it to relative
|
||||||
form. We do this even if the URL already is in
|
form. We do this even if the URL already is in
|
||||||
relative form, because our directory structure may
|
relative form, because our directory structure may
|
||||||
not be identical to that on the server (think `-nd',
|
not be identical to that on the server (think `-nd',
|
||||||
`--cut-dirs', etc.) */
|
`--cut-dirs', etc.) */
|
||||||
cur_url->convert = CO_CONVERT_TO_RELATIVE;
|
cur_url->convert = CO_CONVERT_TO_RELATIVE;
|
||||||
cur_url->local_name = xstrdup (local_name);
|
cur_url->local_name = xstrdup (local_name);
|
||||||
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* We haven't downloaded this URL. If it's not already
|
/* We haven't downloaded this URL. If it's not already
|
||||||
complete (including a full host name), convert it to
|
complete (including a full host name), convert it to
|
||||||
that form, so it can be reached while browsing this
|
that form, so it can be reached while browsing this
|
||||||
HTML locally. */
|
HTML locally. */
|
||||||
if (!cur_url->link_complete_p)
|
if (!cur_url->link_complete_p)
|
||||||
cur_url->convert = CO_CONVERT_TO_COMPLETE;
|
cur_url->convert = CO_CONVERT_TO_COMPLETE;
|
||||||
cur_url->local_name = NULL;
|
cur_url->local_name = NULL;
|
||||||
DEBUGP (("will convert url %s to complete\n", u->url));
|
DEBUGP (("will convert url %s to complete\n", u->url));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert the links in the file. */
|
/* Convert the links in the file. */
|
||||||
convert_links (file, urls);
|
convert_links (file, urls);
|
||||||
@ -171,13 +172,13 @@ convert_all_links (void)
|
|||||||
secs = ptimer_measure (timer);
|
secs = ptimer_measure (timer);
|
||||||
ptimer_destroy (timer);
|
ptimer_destroy (timer);
|
||||||
logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
|
logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
|
||||||
file_count, print_decimal (secs));
|
file_count, print_decimal (secs));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void write_backup_file (const char *, downloaded_file_t);
|
static void write_backup_file (const char *, downloaded_file_t);
|
||||||
static const char *replace_attr (const char *, int, FILE *, const char *);
|
static const char *replace_attr (const char *, int, FILE *, const char *);
|
||||||
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
|
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
|
||||||
const char *, int);
|
const char *, int);
|
||||||
static char *local_quote_string (const char *);
|
static char *local_quote_string (const char *);
|
||||||
static char *construct_relative (const char *, const char *);
|
static char *construct_relative (const char *, const char *);
|
||||||
|
|
||||||
@ -205,11 +206,11 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
struct urlpos *dry;
|
struct urlpos *dry;
|
||||||
for (dry = links; dry; dry = dry->next)
|
for (dry = links; dry; dry = dry->next)
|
||||||
if (dry->convert != CO_NOCONVERT)
|
if (dry->convert != CO_NOCONVERT)
|
||||||
++dry_count;
|
++dry_count;
|
||||||
if (!dry_count)
|
if (!dry_count)
|
||||||
{
|
{
|
||||||
logputs (LOG_VERBOSE, _("nothing to do.\n"));
|
logputs (LOG_VERBOSE, _("nothing to do.\n"));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,7 +218,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
if (!fm)
|
if (!fm)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
||||||
file, strerror (errno));
|
file, strerror (errno));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -232,7 +233,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
if (unlink (file) < 0 && errno != ENOENT)
|
if (unlink (file) < 0 && errno != ENOENT)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
|
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
|
||||||
file, strerror (errno));
|
file, strerror (errno));
|
||||||
read_file_free (fm);
|
read_file_free (fm);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -241,7 +242,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
if (!fp)
|
if (!fp)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
||||||
file, strerror (errno));
|
file, strerror (errno));
|
||||||
read_file_free (fm);
|
read_file_free (fm);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -254,16 +255,16 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
char *url_start = fm->content + link->pos;
|
char *url_start = fm->content + link->pos;
|
||||||
|
|
||||||
if (link->pos >= fm->length)
|
if (link->pos >= fm->length)
|
||||||
{
|
{
|
||||||
DEBUGP (("Something strange is going on. Please investigate."));
|
DEBUGP (("Something strange is going on. Please investigate."));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* If the URL is not to be converted, skip it. */
|
/* If the URL is not to be converted, skip it. */
|
||||||
if (link->convert == CO_NOCONVERT)
|
if (link->convert == CO_NOCONVERT)
|
||||||
{
|
{
|
||||||
DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
|
DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Echo the file contents, up to the offending URL's opening
|
/* Echo the file contents, up to the offending URL's opening
|
||||||
quote, to the outfile. */
|
quote, to the outfile. */
|
||||||
@ -271,52 +272,52 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
p = url_start;
|
p = url_start;
|
||||||
|
|
||||||
switch (link->convert)
|
switch (link->convert)
|
||||||
{
|
{
|
||||||
case CO_CONVERT_TO_RELATIVE:
|
case CO_CONVERT_TO_RELATIVE:
|
||||||
/* Convert absolute URL to relative. */
|
/* Convert absolute URL to relative. */
|
||||||
{
|
{
|
||||||
char *newname = construct_relative (file, link->local_name);
|
char *newname = construct_relative (file, link->local_name);
|
||||||
char *quoted_newname = local_quote_string (newname);
|
char *quoted_newname = local_quote_string (newname);
|
||||||
|
|
||||||
if (!link->link_refresh_p)
|
if (!link->link_refresh_p)
|
||||||
p = replace_attr (p, link->size, fp, quoted_newname);
|
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||||
else
|
else
|
||||||
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
|
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
|
||||||
link->refresh_timeout);
|
link->refresh_timeout);
|
||||||
|
|
||||||
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
||||||
link->url->url, newname, link->pos, file));
|
link->url->url, newname, link->pos, file));
|
||||||
xfree (newname);
|
xfree (newname);
|
||||||
xfree (quoted_newname);
|
xfree (quoted_newname);
|
||||||
++to_file_count;
|
++to_file_count;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CO_CONVERT_TO_COMPLETE:
|
case CO_CONVERT_TO_COMPLETE:
|
||||||
/* Convert the link to absolute URL. */
|
/* Convert the link to absolute URL. */
|
||||||
{
|
{
|
||||||
char *newlink = link->url->url;
|
char *newlink = link->url->url;
|
||||||
char *quoted_newlink = html_quote_string (newlink);
|
char *quoted_newlink = html_quote_string (newlink);
|
||||||
|
|
||||||
if (!link->link_refresh_p)
|
if (!link->link_refresh_p)
|
||||||
p = replace_attr (p, link->size, fp, quoted_newlink);
|
p = replace_attr (p, link->size, fp, quoted_newlink);
|
||||||
else
|
else
|
||||||
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
|
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
|
||||||
link->refresh_timeout);
|
link->refresh_timeout);
|
||||||
|
|
||||||
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
||||||
newlink, link->pos, file));
|
newlink, link->pos, file));
|
||||||
xfree (quoted_newlink);
|
xfree (quoted_newlink);
|
||||||
++to_url_count;
|
++to_url_count;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CO_NULLIFY_BASE:
|
case CO_NULLIFY_BASE:
|
||||||
/* Change the base href to "". */
|
/* Change the base href to "". */
|
||||||
p = replace_attr (p, link->size, fp, "");
|
p = replace_attr (p, link->size, fp, "");
|
||||||
break;
|
break;
|
||||||
case CO_NOCONVERT:
|
case CO_NOCONVERT:
|
||||||
abort ();
|
abort ();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Output the rest of the file. */
|
/* Output the rest of the file. */
|
||||||
@ -359,7 +360,7 @@ construct_relative (const char *basefile, const char *linkfile)
|
|||||||
for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
|
for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
|
||||||
{
|
{
|
||||||
if (*b == '/')
|
if (*b == '/')
|
||||||
start = (b - basefile) + 1;
|
start = (b - basefile) + 1;
|
||||||
}
|
}
|
||||||
basefile += start;
|
basefile += start;
|
||||||
linkfile += start;
|
linkfile += start;
|
||||||
@ -380,7 +381,7 @@ construct_relative (const char *basefile, const char *linkfile)
|
|||||||
for (b = basefile; *b; b++)
|
for (b = basefile; *b; b++)
|
||||||
{
|
{
|
||||||
if (*b == '/')
|
if (*b == '/')
|
||||||
++basedirs;
|
++basedirs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Construct LINK as explained above. */
|
/* Construct LINK as explained above. */
|
||||||
@ -410,12 +411,12 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
|||||||
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
|
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
|
||||||
{
|
{
|
||||||
/* Just write "orig" over "html". We need to do it this way
|
/* Just write "orig" over "html". We need to do it this way
|
||||||
because when we're checking to see if we've downloaded the
|
because when we're checking to see if we've downloaded the
|
||||||
file before (to see if we can skip downloading it), we don't
|
file before (to see if we can skip downloading it), we don't
|
||||||
know if it's a text/html file. Therefore we don't know yet
|
know if it's a text/html file. Therefore we don't know yet
|
||||||
at that stage that -E is going to cause us to tack on
|
at that stage that -E is going to cause us to tack on
|
||||||
".html", so we need to compare vs. the original URL plus
|
".html", so we need to compare vs. the original URL plus
|
||||||
".orig", not the original URL plus ".html.orig". */
|
".orig", not the original URL plus ".html.orig". */
|
||||||
filename_plus_orig_suffix = alloca (filename_len + 1);
|
filename_plus_orig_suffix = alloca (filename_len + 1);
|
||||||
strcpy (filename_plus_orig_suffix, file);
|
strcpy (filename_plus_orig_suffix, file);
|
||||||
strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
|
strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
|
||||||
@ -440,25 +441,25 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
|||||||
{
|
{
|
||||||
/* Rename <file> to <file>.orig before former gets written over. */
|
/* Rename <file> to <file>.orig before former gets written over. */
|
||||||
if (rename (file, filename_plus_orig_suffix) != 0)
|
if (rename (file, filename_plus_orig_suffix) != 0)
|
||||||
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
|
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
|
||||||
file, filename_plus_orig_suffix, strerror (errno));
|
file, filename_plus_orig_suffix, strerror (errno));
|
||||||
|
|
||||||
/* Remember that we've already written a .orig backup for this file.
|
/* Remember that we've already written a .orig backup for this file.
|
||||||
Note that we never free this memory since we need it till the
|
Note that we never free this memory since we need it till the
|
||||||
convert_all_links() call, which is one of the last things the
|
convert_all_links() call, which is one of the last things the
|
||||||
program does before terminating. BTW, I'm not sure if it would be
|
program does before terminating. BTW, I'm not sure if it would be
|
||||||
safe to just set 'converted_file_ptr->string' to 'file' below,
|
safe to just set 'converted_file_ptr->string' to 'file' below,
|
||||||
rather than making a copy of the string... Another note is that I
|
rather than making a copy of the string... Another note is that I
|
||||||
thought I could just add a field to the urlpos structure saying
|
thought I could just add a field to the urlpos structure saying
|
||||||
that we'd written a .orig file for this URL, but that didn't work,
|
that we'd written a .orig file for this URL, but that didn't work,
|
||||||
so I had to make this separate list.
|
so I had to make this separate list.
|
||||||
-- Dan Harkless <wget@harkless.org>
|
-- Dan Harkless <wget@harkless.org>
|
||||||
|
|
||||||
This [adding a field to the urlpos structure] didn't work
|
This [adding a field to the urlpos structure] didn't work
|
||||||
because convert_file() is called from convert_all_links at
|
because convert_file() is called from convert_all_links at
|
||||||
the end of the retrieval with a freshly built new urlpos
|
the end of the retrieval with a freshly built new urlpos
|
||||||
list.
|
list.
|
||||||
-- Hrvoje Niksic <hniksic@xemacs.org>
|
-- Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
*/
|
*/
|
||||||
string_set_add (converted_files, file);
|
string_set_add (converted_files, file);
|
||||||
}
|
}
|
||||||
@ -472,9 +473,9 @@ static const char *
|
|||||||
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
|
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
|
||||||
{
|
{
|
||||||
bool quote_flag = false;
|
bool quote_flag = false;
|
||||||
char quote_char = '\"'; /* use "..." for quoting, unless the
|
char quote_char = '\"'; /* use "..." for quoting, unless the
|
||||||
original value is quoted, in which
|
original value is quoted, in which
|
||||||
case reuse its quoting char. */
|
case reuse its quoting char. */
|
||||||
const char *frag_beg, *frag_end;
|
const char *frag_beg, *frag_end;
|
||||||
|
|
||||||
/* Structure of our string is:
|
/* Structure of our string is:
|
||||||
@ -489,7 +490,7 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text)
|
|||||||
quote_char = *p;
|
quote_char = *p;
|
||||||
quote_flag = true;
|
quote_flag = true;
|
||||||
++p;
|
++p;
|
||||||
size -= 2; /* disregard opening and closing quote */
|
size -= 2; /* disregard opening and closing quote */
|
||||||
}
|
}
|
||||||
putc (quote_char, fp);
|
putc (quote_char, fp);
|
||||||
fputs (new_text, fp);
|
fputs (new_text, fp);
|
||||||
@ -511,13 +512,13 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text)
|
|||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
|
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
|
||||||
const char *new_text, int timeout)
|
const char *new_text, int timeout)
|
||||||
{
|
{
|
||||||
/* "0; URL=..." */
|
/* "0; URL=..." */
|
||||||
char *new_with_timeout = (char *)alloca (numdigit (timeout)
|
char *new_with_timeout = (char *)alloca (numdigit (timeout)
|
||||||
+ 6 /* "; URL=" */
|
+ 6 /* "; URL=" */
|
||||||
+ strlen (new_text)
|
+ strlen (new_text)
|
||||||
+ 1);
|
+ 1);
|
||||||
sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
|
sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
|
||||||
|
|
||||||
return replace_attr (p, size, fp, new_with_timeout);
|
return replace_attr (p, size, fp, new_with_timeout);
|
||||||
@ -538,21 +539,21 @@ find_fragment (const char *beg, int size, const char **bp, const char **ep)
|
|||||||
for (; beg < end; beg++)
|
for (; beg < end; beg++)
|
||||||
{
|
{
|
||||||
switch (*beg)
|
switch (*beg)
|
||||||
{
|
{
|
||||||
case '&':
|
case '&':
|
||||||
saw_amp = true;
|
saw_amp = true;
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
if (!saw_amp)
|
if (!saw_amp)
|
||||||
{
|
{
|
||||||
*bp = beg;
|
*bp = beg;
|
||||||
*ep = end;
|
*ep = end;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/* fallthrough */
|
/* fallthrough */
|
||||||
default:
|
default:
|
||||||
saw_amp = false;
|
saw_amp = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -588,26 +589,26 @@ local_quote_string (const char *file)
|
|||||||
switch (*from)
|
switch (*from)
|
||||||
{
|
{
|
||||||
case '%':
|
case '%':
|
||||||
*to++ = '%';
|
*to++ = '%';
|
||||||
*to++ = '2';
|
*to++ = '2';
|
||||||
*to++ = '5';
|
*to++ = '5';
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
*to++ = '%';
|
*to++ = '%';
|
||||||
*to++ = '2';
|
*to++ = '2';
|
||||||
*to++ = '3';
|
*to++ = '3';
|
||||||
break;
|
break;
|
||||||
case '?':
|
case '?':
|
||||||
if (opt.html_extension)
|
if (opt.html_extension)
|
||||||
{
|
{
|
||||||
*to++ = '%';
|
*to++ = '%';
|
||||||
*to++ = '3';
|
*to++ = '3';
|
||||||
*to++ = 'F';
|
*to++ = 'F';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* fallthrough */
|
/* fallthrough */
|
||||||
default:
|
default:
|
||||||
*to++ = *from;
|
*to++ = *from;
|
||||||
}
|
}
|
||||||
*to = '\0';
|
*to = '\0';
|
||||||
|
|
||||||
@ -618,11 +619,11 @@ local_quote_string (const char *file)
|
|||||||
downloaded_html_list, and downloaded_html_set. Other code calls
|
downloaded_html_list, and downloaded_html_set. Other code calls
|
||||||
these functions to let us know that a file has been downloaded. */
|
these functions to let us know that a file has been downloaded. */
|
||||||
|
|
||||||
#define ENSURE_TABLES_EXIST do { \
|
#define ENSURE_TABLES_EXIST do { \
|
||||||
if (!dl_file_url_map) \
|
if (!dl_file_url_map) \
|
||||||
dl_file_url_map = make_string_hash_table (0); \
|
dl_file_url_map = make_string_hash_table (0); \
|
||||||
if (!dl_url_file_map) \
|
if (!dl_url_file_map) \
|
||||||
dl_url_file_map = make_string_hash_table (0); \
|
dl_url_file_map = make_string_hash_table (0); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* Return true if S1 and S2 are the same, except for "/index.html".
|
/* Return true if S1 and S2 are the same, except for "/index.html".
|
||||||
@ -704,7 +705,7 @@ dissociate_urls_from_file (const char *file)
|
|||||||
{
|
{
|
||||||
/* Can't use hash_table_iter_* because the table mutates while mapping. */
|
/* Can't use hash_table_iter_* because the table mutates while mapping. */
|
||||||
hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
|
hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
|
||||||
(char *) file);
|
(char *) file);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Register that URL has been successfully downloaded to FILE. This
|
/* Register that URL has been successfully downloaded to FILE. This
|
||||||
@ -727,29 +728,29 @@ register_download (const char *url, const char *file)
|
|||||||
if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
|
if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
|
||||||
{
|
{
|
||||||
if (0 == strcmp (url, old_url))
|
if (0 == strcmp (url, old_url))
|
||||||
/* We have somehow managed to download the same URL twice.
|
/* We have somehow managed to download the same URL twice.
|
||||||
Nothing to do. */
|
Nothing to do. */
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (match_except_index (url, old_url)
|
if (match_except_index (url, old_url)
|
||||||
&& !hash_table_contains (dl_url_file_map, url))
|
&& !hash_table_contains (dl_url_file_map, url))
|
||||||
/* The two URLs differ only in the "index.html" ending. For
|
/* The two URLs differ only in the "index.html" ending. For
|
||||||
example, one is "http://www.server.com/", and the other is
|
example, one is "http://www.server.com/", and the other is
|
||||||
"http://www.server.com/index.html". Don't remove the old
|
"http://www.server.com/index.html". Don't remove the old
|
||||||
one, just add the new one as a non-canonical entry. */
|
one, just add the new one as a non-canonical entry. */
|
||||||
goto url_only;
|
goto url_only;
|
||||||
|
|
||||||
hash_table_remove (dl_file_url_map, file);
|
hash_table_remove (dl_file_url_map, file);
|
||||||
xfree (old_file);
|
xfree (old_file);
|
||||||
xfree (old_url);
|
xfree (old_url);
|
||||||
|
|
||||||
/* Remove all the URLs that point to this file. Yes, there can
|
/* Remove all the URLs that point to this file. Yes, there can
|
||||||
be more than one such URL, because we store redirections as
|
be more than one such URL, because we store redirections as
|
||||||
multiple entries in dl_url_file_map. For example, if URL1
|
multiple entries in dl_url_file_map. For example, if URL1
|
||||||
redirects to URL2 which gets downloaded to FILE, we map both
|
redirects to URL2 which gets downloaded to FILE, we map both
|
||||||
URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
|
URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
|
||||||
only points to URL2.) When another URL gets loaded to FILE,
|
only points to URL2.) When another URL gets loaded to FILE,
|
||||||
we want both URL1 and URL2 dissociated from it.
|
we want both URL1 and URL2 dissociated from it.
|
||||||
|
|
||||||
This is a relatively expensive operation because it performs
|
This is a relatively expensive operation because it performs
|
||||||
a linear search of the whole hash table, but it should be
|
a linear search of the whole hash table, but it should be
|
||||||
@ -922,10 +923,10 @@ downloaded_file (downloaded_file_t mode, const char *file)
|
|||||||
if (mode == CHECK_FOR_FILE)
|
if (mode == CHECK_FOR_FILE)
|
||||||
{
|
{
|
||||||
if (!downloaded_files_hash)
|
if (!downloaded_files_hash)
|
||||||
return FILE_NOT_ALREADY_DOWNLOADED;
|
return FILE_NOT_ALREADY_DOWNLOADED;
|
||||||
ptr = hash_table_get (downloaded_files_hash, file);
|
ptr = hash_table_get (downloaded_files_hash, file);
|
||||||
if (!ptr)
|
if (!ptr)
|
||||||
return FILE_NOT_ALREADY_DOWNLOADED;
|
return FILE_NOT_ALREADY_DOWNLOADED;
|
||||||
return *ptr;
|
return *ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -949,9 +950,9 @@ downloaded_files_free (void)
|
|||||||
{
|
{
|
||||||
hash_table_iterator iter;
|
hash_table_iterator iter;
|
||||||
for (hash_table_iterate (downloaded_files_hash, &iter);
|
for (hash_table_iterate (downloaded_files_hash, &iter);
|
||||||
hash_table_iter_next (&iter);
|
hash_table_iter_next (&iter);
|
||||||
)
|
)
|
||||||
xfree (iter.key);
|
xfree (iter.key);
|
||||||
hash_table_destroy (downloaded_files_hash);
|
hash_table_destroy (downloaded_files_hash);
|
||||||
downloaded_files_hash = NULL;
|
downloaded_files_hash = NULL;
|
||||||
}
|
}
|
||||||
@ -972,8 +973,8 @@ in_list (const struct broken_urls_list *list, const char *url)
|
|||||||
|
|
||||||
for (ptr = list; ptr; ptr = ptr->next)
|
for (ptr = list; ptr; ptr = ptr->next)
|
||||||
{
|
{
|
||||||
/* TODO: strcasecmp may not be appropriate to compare URLs */
|
/* str[case]cmp is inadequate for URL comparison */
|
||||||
if (strcasecmp (url, ptr->url) == 0) return true;
|
if (are_urls_equal (url, ptr->url) == 0) return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@ -983,6 +984,10 @@ void
|
|||||||
nonexisting_url (const char *url, const char *referrer)
|
nonexisting_url (const char *url, const char *referrer)
|
||||||
{
|
{
|
||||||
struct broken_urls_list *list;
|
struct broken_urls_list *list;
|
||||||
|
|
||||||
|
/* Ignore robots.txt URLs */
|
||||||
|
if (is_robots_txt_url (url))
|
||||||
|
return;
|
||||||
|
|
||||||
if (!nonexisting_urls_hash)
|
if (!nonexisting_urls_hash)
|
||||||
nonexisting_urls_hash = make_string_hash_table (0);
|
nonexisting_urls_hash = make_string_hash_table (0);
|
||||||
@ -1014,12 +1019,12 @@ nonexisting_urls_free (void)
|
|||||||
{
|
{
|
||||||
hash_table_iterator iter;
|
hash_table_iterator iter;
|
||||||
for (hash_table_iterate (nonexisting_urls_hash, &iter);
|
for (hash_table_iterate (nonexisting_urls_hash, &iter);
|
||||||
hash_table_iter_next (&iter);
|
hash_table_iter_next (&iter);
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
xfree (iter.key);
|
xfree (iter.key);
|
||||||
xfree (iter.value);
|
xfree (iter.value);
|
||||||
}
|
}
|
||||||
hash_table_destroy (nonexisting_urls_hash);
|
hash_table_destroy (nonexisting_urls_hash);
|
||||||
nonexisting_urls_hash = NULL;
|
nonexisting_urls_hash = NULL;
|
||||||
}
|
}
|
||||||
@ -1055,12 +1060,12 @@ print_broken_links (void)
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
struct broken_urls_list *list;
|
struct broken_urls_list *list;
|
||||||
|
|
||||||
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
|
logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
|
||||||
|
|
||||||
for (list = (struct broken_urls_list *) iter.value;
|
for (list = (struct broken_urls_list *) iter.value;
|
||||||
list;
|
list;
|
||||||
list = list->next)
|
list = list->next)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
|
logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
|
||||||
}
|
}
|
||||||
@ -1091,52 +1096,57 @@ html_quote_string (const char *s)
|
|||||||
for (i = 0; *s; s++, i++)
|
for (i = 0; *s; s++, i++)
|
||||||
{
|
{
|
||||||
if (*s == '&')
|
if (*s == '&')
|
||||||
i += 4; /* `amp;' */
|
i += 4; /* `amp;' */
|
||||||
else if (*s == '<' || *s == '>')
|
else if (*s == '<' || *s == '>')
|
||||||
i += 3; /* `lt;' and `gt;' */
|
i += 3; /* `lt;' and `gt;' */
|
||||||
else if (*s == '\"')
|
else if (*s == '\"')
|
||||||
i += 5; /* `quot;' */
|
i += 5; /* `quot;' */
|
||||||
else if (*s == ' ')
|
else if (*s == ' ')
|
||||||
i += 4; /* #32; */
|
i += 4; /* #32; */
|
||||||
}
|
}
|
||||||
res = xmalloc (i + 1);
|
res = xmalloc (i + 1);
|
||||||
s = b;
|
s = b;
|
||||||
for (p = res; *s; s++)
|
for (p = res; *s; s++)
|
||||||
{
|
{
|
||||||
switch (*s)
|
switch (*s)
|
||||||
{
|
{
|
||||||
case '&':
|
case '&':
|
||||||
*p++ = '&';
|
*p++ = '&';
|
||||||
*p++ = 'a';
|
*p++ = 'a';
|
||||||
*p++ = 'm';
|
*p++ = 'm';
|
||||||
*p++ = 'p';
|
*p++ = 'p';
|
||||||
*p++ = ';';
|
*p++ = ';';
|
||||||
break;
|
break;
|
||||||
case '<': case '>':
|
case '<': case '>':
|
||||||
*p++ = '&';
|
*p++ = '&';
|
||||||
*p++ = (*s == '<' ? 'l' : 'g');
|
*p++ = (*s == '<' ? 'l' : 'g');
|
||||||
*p++ = 't';
|
*p++ = 't';
|
||||||
*p++ = ';';
|
*p++ = ';';
|
||||||
break;
|
break;
|
||||||
case '\"':
|
case '\"':
|
||||||
*p++ = '&';
|
*p++ = '&';
|
||||||
*p++ = 'q';
|
*p++ = 'q';
|
||||||
*p++ = 'u';
|
*p++ = 'u';
|
||||||
*p++ = 'o';
|
*p++ = 'o';
|
||||||
*p++ = 't';
|
*p++ = 't';
|
||||||
*p++ = ';';
|
*p++ = ';';
|
||||||
break;
|
break;
|
||||||
case ' ':
|
case ' ':
|
||||||
*p++ = '&';
|
*p++ = '&';
|
||||||
*p++ = '#';
|
*p++ = '#';
|
||||||
*p++ = '3';
|
*p++ = '3';
|
||||||
*p++ = '2';
|
*p++ = '2';
|
||||||
*p++ = ';';
|
*p++ = ';';
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
*p++ = *s;
|
*p++ = *s;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*p = '\0';
|
*p = '\0';
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vim: et ts=2 sw=2
|
||||||
|
*/
|
||||||
|
|
||||||
|
272
src/res.c
272
src/res.c
@ -84,6 +84,10 @@ so, delete this exception statement from your version. */
|
|||||||
#include "retr.h"
|
#include "retr.h"
|
||||||
#include "res.h"
|
#include "res.h"
|
||||||
|
|
||||||
|
#ifdef TESTING
|
||||||
|
#include "test.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
struct path_info {
|
struct path_info {
|
||||||
char *path;
|
char *path;
|
||||||
bool allowedp;
|
bool allowedp;
|
||||||
@ -104,7 +108,7 @@ struct robot_specs {
|
|||||||
|
|
||||||
static void
|
static void
|
||||||
match_user_agent (const char *agent, int length,
|
match_user_agent (const char *agent, int length,
|
||||||
bool *matches, bool *exact_match)
|
bool *matches, bool *exact_match)
|
||||||
{
|
{
|
||||||
if (length == 1 && *agent == '*')
|
if (length == 1 && *agent == '*')
|
||||||
{
|
{
|
||||||
@ -128,7 +132,7 @@ match_user_agent (const char *agent, int length,
|
|||||||
|
|
||||||
static void
|
static void
|
||||||
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
|
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
|
||||||
bool allowedp, bool exactp)
|
bool allowedp, bool exactp)
|
||||||
{
|
{
|
||||||
struct path_info pp;
|
struct path_info pp;
|
||||||
if (path_b < path_e && *path_b == '/')
|
if (path_b < path_e && *path_b == '/')
|
||||||
@ -142,11 +146,11 @@ add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
|
|||||||
if (specs->count > specs->size)
|
if (specs->count > specs->size)
|
||||||
{
|
{
|
||||||
if (specs->size == 0)
|
if (specs->size == 0)
|
||||||
specs->size = 1;
|
specs->size = 1;
|
||||||
else
|
else
|
||||||
specs->size <<= 1;
|
specs->size <<= 1;
|
||||||
specs->paths = xrealloc (specs->paths,
|
specs->paths = xrealloc (specs->paths,
|
||||||
specs->size * sizeof (struct path_info));
|
specs->size * sizeof (struct path_info));
|
||||||
}
|
}
|
||||||
specs->paths[specs->count - 1] = pp;
|
specs->paths[specs->count - 1] = pp;
|
||||||
}
|
}
|
||||||
@ -176,12 +180,12 @@ prune_non_exact (struct robot_specs *specs)
|
|||||||
|
|
||||||
#define EOL(p) ((p) >= lineend)
|
#define EOL(p) ((p) >= lineend)
|
||||||
|
|
||||||
#define SKIP_SPACE(p) do { \
|
#define SKIP_SPACE(p) do { \
|
||||||
while (!EOL (p) && ISSPACE (*p)) \
|
while (!EOL (p) && ISSPACE (*p)) \
|
||||||
++p; \
|
++p; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define FIELD_IS(string_literal) \
|
#define FIELD_IS(string_literal) \
|
||||||
BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
|
BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
|
||||||
|
|
||||||
/* Parse textual RES specs beginning with SOURCE of length LENGTH.
|
/* Parse textual RES specs beginning with SOURCE of length LENGTH.
|
||||||
@ -245,113 +249,113 @@ res_parse (const char *source, int length)
|
|||||||
const char *value_b, *value_e;
|
const char *value_b, *value_e;
|
||||||
|
|
||||||
if (p == end)
|
if (p == end)
|
||||||
break;
|
break;
|
||||||
lineend_real = memchr (p, '\n', end - p);
|
lineend_real = memchr (p, '\n', end - p);
|
||||||
if (lineend_real)
|
if (lineend_real)
|
||||||
++lineend_real;
|
++lineend_real;
|
||||||
else
|
else
|
||||||
lineend_real = end;
|
lineend_real = end;
|
||||||
lineend = lineend_real;
|
lineend = lineend_real;
|
||||||
|
|
||||||
/* Before doing anything else, check whether the line is empty
|
/* Before doing anything else, check whether the line is empty
|
||||||
or comment-only. */
|
or comment-only. */
|
||||||
SKIP_SPACE (p);
|
SKIP_SPACE (p);
|
||||||
if (EOL (p) || *p == '#')
|
if (EOL (p) || *p == '#')
|
||||||
goto next;
|
goto next;
|
||||||
|
|
||||||
/* Make sure the end-of-line comments are respected by setting
|
/* Make sure the end-of-line comments are respected by setting
|
||||||
lineend to a location preceding the first comment. Real line
|
lineend to a location preceding the first comment. Real line
|
||||||
ending remains in lineend_real. */
|
ending remains in lineend_real. */
|
||||||
for (lineend = p; lineend < lineend_real; lineend++)
|
for (lineend = p; lineend < lineend_real; lineend++)
|
||||||
if ((lineend == p || ISSPACE (*(lineend - 1)))
|
if ((lineend == p || ISSPACE (*(lineend - 1)))
|
||||||
&& *lineend == '#')
|
&& *lineend == '#')
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Ignore trailing whitespace in the same way. */
|
/* Ignore trailing whitespace in the same way. */
|
||||||
while (lineend > p && ISSPACE (*(lineend - 1)))
|
while (lineend > p && ISSPACE (*(lineend - 1)))
|
||||||
--lineend;
|
--lineend;
|
||||||
|
|
||||||
assert (!EOL (p));
|
assert (!EOL (p));
|
||||||
|
|
||||||
field_b = p;
|
field_b = p;
|
||||||
while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
|
while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
|
||||||
++p;
|
++p;
|
||||||
field_e = p;
|
field_e = p;
|
||||||
|
|
||||||
SKIP_SPACE (p);
|
SKIP_SPACE (p);
|
||||||
if (field_b == field_e || EOL (p) || *p != ':')
|
if (field_b == field_e || EOL (p) || *p != ':')
|
||||||
{
|
{
|
||||||
DEBUGP (("Ignoring malformed line %d", line_count));
|
DEBUGP (("Ignoring malformed line %d", line_count));
|
||||||
goto next;
|
goto next;
|
||||||
}
|
}
|
||||||
++p; /* skip ':' */
|
++p; /* skip ':' */
|
||||||
SKIP_SPACE (p);
|
SKIP_SPACE (p);
|
||||||
|
|
||||||
value_b = p;
|
value_b = p;
|
||||||
while (!EOL (p))
|
while (!EOL (p))
|
||||||
++p;
|
++p;
|
||||||
value_e = p;
|
value_e = p;
|
||||||
|
|
||||||
/* Finally, we have a syntactically valid line. */
|
/* Finally, we have a syntactically valid line. */
|
||||||
if (FIELD_IS ("user-agent"))
|
if (FIELD_IS ("user-agent"))
|
||||||
{
|
{
|
||||||
/* We have to support several cases:
|
/* We have to support several cases:
|
||||||
|
|
||||||
--previous records--
|
--previous records--
|
||||||
|
|
||||||
User-Agent: foo
|
User-Agent: foo
|
||||||
User-Agent: Wget
|
User-Agent: Wget
|
||||||
User-Agent: bar
|
User-Agent: bar
|
||||||
... matching record ...
|
... matching record ...
|
||||||
|
|
||||||
User-Agent: baz
|
User-Agent: baz
|
||||||
User-Agent: qux
|
User-Agent: qux
|
||||||
... non-matching record ...
|
... non-matching record ...
|
||||||
|
|
||||||
User-Agent: *
|
User-Agent: *
|
||||||
... matching record, but will be pruned later ...
|
... matching record, but will be pruned later ...
|
||||||
|
|
||||||
We have to respect `User-Agent' at the beginning of each
|
We have to respect `User-Agent' at the beginning of each
|
||||||
new record simply because we don't know if we're going to
|
new record simply because we don't know if we're going to
|
||||||
encounter "Wget" among the agents or not. Hence,
|
encounter "Wget" among the agents or not. Hence,
|
||||||
match_user_agent is called when record_count != 0.
|
match_user_agent is called when record_count != 0.
|
||||||
|
|
||||||
But if record_count is 0, we have to keep calling it
|
But if record_count is 0, we have to keep calling it
|
||||||
until it matches, and if that happens, we must not call
|
until it matches, and if that happens, we must not call
|
||||||
it any more, until the next record. Hence the other part
|
it any more, until the next record. Hence the other part
|
||||||
of the condition. */
|
of the condition. */
|
||||||
if (record_count != 0 || user_agent_applies == false)
|
if (record_count != 0 || user_agent_applies == false)
|
||||||
match_user_agent (value_b, value_e - value_b,
|
match_user_agent (value_b, value_e - value_b,
|
||||||
&user_agent_applies, &user_agent_exact);
|
&user_agent_applies, &user_agent_exact);
|
||||||
if (user_agent_exact)
|
if (user_agent_exact)
|
||||||
found_exact = true;
|
found_exact = true;
|
||||||
record_count = 0;
|
record_count = 0;
|
||||||
}
|
}
|
||||||
else if (FIELD_IS ("allow"))
|
else if (FIELD_IS ("allow"))
|
||||||
{
|
{
|
||||||
if (user_agent_applies)
|
if (user_agent_applies)
|
||||||
{
|
{
|
||||||
add_path (specs, value_b, value_e, true, user_agent_exact);
|
add_path (specs, value_b, value_e, true, user_agent_exact);
|
||||||
}
|
}
|
||||||
++record_count;
|
++record_count;
|
||||||
}
|
}
|
||||||
else if (FIELD_IS ("disallow"))
|
else if (FIELD_IS ("disallow"))
|
||||||
{
|
{
|
||||||
if (user_agent_applies)
|
if (user_agent_applies)
|
||||||
{
|
{
|
||||||
bool allowed = false;
|
bool allowed = false;
|
||||||
if (value_b == value_e)
|
if (value_b == value_e)
|
||||||
/* Empty "disallow" line means everything is *allowed*! */
|
/* Empty "disallow" line means everything is *allowed*! */
|
||||||
allowed = true;
|
allowed = true;
|
||||||
add_path (specs, value_b, value_e, allowed, user_agent_exact);
|
add_path (specs, value_b, value_e, allowed, user_agent_exact);
|
||||||
}
|
}
|
||||||
++record_count;
|
++record_count;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
DEBUGP (("Ignoring unknown field at line %d", line_count));
|
DEBUGP (("Ignoring unknown field at line %d", line_count));
|
||||||
goto next;
|
goto next;
|
||||||
}
|
}
|
||||||
|
|
||||||
next:
|
next:
|
||||||
p = lineend_real;
|
p = lineend_real;
|
||||||
@ -361,15 +365,15 @@ res_parse (const char *source, int length)
|
|||||||
if (found_exact)
|
if (found_exact)
|
||||||
{
|
{
|
||||||
/* We've encountered an exactly matching user-agent. Throw out
|
/* We've encountered an exactly matching user-agent. Throw out
|
||||||
all the stuff with user-agent: *. */
|
all the stuff with user-agent: *. */
|
||||||
prune_non_exact (specs);
|
prune_non_exact (specs);
|
||||||
}
|
}
|
||||||
else if (specs->size > specs->count)
|
else if (specs->size > specs->count)
|
||||||
{
|
{
|
||||||
/* add_path normally over-allocates specs->paths. Reallocate it
|
/* add_path normally over-allocates specs->paths. Reallocate it
|
||||||
to the correct size in order to conserve some memory. */
|
to the correct size in order to conserve some memory. */
|
||||||
specs->paths = xrealloc (specs->paths,
|
specs->paths = xrealloc (specs->paths,
|
||||||
specs->count * sizeof (struct path_info));
|
specs->count * sizeof (struct path_info));
|
||||||
specs->size = specs->count;
|
specs->size = specs->count;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -387,7 +391,7 @@ res_parse_from_file (const char *filename)
|
|||||||
if (!fm)
|
if (!fm)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
|
logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
|
||||||
filename, strerror (errno));
|
filename, strerror (errno));
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
specs = res_parse (fm->content, fm->length);
|
specs = res_parse (fm->content, fm->length);
|
||||||
@ -411,16 +415,16 @@ free_specs (struct robot_specs *specs)
|
|||||||
that number is not a numerical representation of '/', decode C and
|
that number is not a numerical representation of '/', decode C and
|
||||||
advance the pointer. */
|
advance the pointer. */
|
||||||
|
|
||||||
#define DECODE_MAYBE(c, ptr) do { \
|
#define DECODE_MAYBE(c, ptr) do { \
|
||||||
if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
|
if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
|
||||||
{ \
|
{ \
|
||||||
char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
|
char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
|
||||||
if (decoded != '/') \
|
if (decoded != '/') \
|
||||||
{ \
|
{ \
|
||||||
c = decoded; \
|
c = decoded; \
|
||||||
ptr += 2; \
|
ptr += 2; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* The inner matching engine: return true if RECORD_PATH matches
|
/* The inner matching engine: return true if RECORD_PATH matches
|
||||||
@ -438,13 +442,13 @@ matches (const char *record_path, const char *url_path)
|
|||||||
char rc = *rp;
|
char rc = *rp;
|
||||||
char uc = *up;
|
char uc = *up;
|
||||||
if (!rc)
|
if (!rc)
|
||||||
return true;
|
return true;
|
||||||
if (!uc)
|
if (!uc)
|
||||||
return false;
|
return false;
|
||||||
DECODE_MAYBE(rc, rp);
|
DECODE_MAYBE(rc, rp);
|
||||||
DECODE_MAYBE(uc, up);
|
DECODE_MAYBE(uc, up);
|
||||||
if (rc != uc)
|
if (rc != uc)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -461,11 +465,11 @@ res_match_path (const struct robot_specs *specs, const char *path)
|
|||||||
for (i = 0; i < specs->count; i++)
|
for (i = 0; i < specs->count; i++)
|
||||||
if (matches (specs->paths[i].path, path))
|
if (matches (specs->paths[i].path, path))
|
||||||
{
|
{
|
||||||
bool allowedp = specs->paths[i].allowedp;
|
bool allowedp = specs->paths[i].allowedp;
|
||||||
DEBUGP (("%s path %s because of rule `%s'.\n",
|
DEBUGP (("%s path %s because of rule `%s'.\n",
|
||||||
allowedp ? "Allowing" : "Rejecting",
|
allowedp ? "Allowing" : "Rejecting",
|
||||||
path, specs->paths[i].path));
|
path, specs->paths[i].path));
|
||||||
return allowedp;
|
return allowedp;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -475,12 +479,12 @@ res_match_path (const struct robot_specs *specs, const char *path)
|
|||||||
static struct hash_table *registered_specs;
|
static struct hash_table *registered_specs;
|
||||||
|
|
||||||
/* Stolen from cookies.c. */
|
/* Stolen from cookies.c. */
|
||||||
#define SET_HOSTPORT(host, port, result) do { \
|
#define SET_HOSTPORT(host, port, result) do { \
|
||||||
int HP_len = strlen (host); \
|
int HP_len = strlen (host); \
|
||||||
result = alloca (HP_len + 1 + numdigit (port) + 1); \
|
result = alloca (HP_len + 1 + numdigit (port) + 1); \
|
||||||
memcpy (result, host, HP_len); \
|
memcpy (result, host, HP_len); \
|
||||||
result[HP_len] = ':'; \
|
result[HP_len] = ':'; \
|
||||||
number_to_string (result + HP_len + 1, port); \
|
number_to_string (result + HP_len + 1, port); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* Register RES specs that belong to server on HOST:PORT. They will
|
/* Register RES specs that belong to server on HOST:PORT. They will
|
||||||
@ -499,7 +503,7 @@ res_register_specs (const char *host, int port, struct robot_specs *specs)
|
|||||||
if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
|
if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
|
||||||
{
|
{
|
||||||
if (old)
|
if (old)
|
||||||
free_specs (old);
|
free_specs (old);
|
||||||
hash_table_put (registered_specs, hp_old, specs);
|
hash_table_put (registered_specs, hp_old, specs);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -544,14 +548,25 @@ res_retrieve_file (const char *url, char **file)
|
|||||||
if (err != RETROK && *file != NULL)
|
if (err != RETROK && *file != NULL)
|
||||||
{
|
{
|
||||||
/* If the file is not retrieved correctly, but retrieve_url
|
/* If the file is not retrieved correctly, but retrieve_url
|
||||||
allocated the file name, deallocate it here so that the
|
allocated the file name, deallocate it here so that the
|
||||||
caller doesn't have to worry about it. */
|
caller doesn't have to worry about it. */
|
||||||
xfree (*file);
|
xfree (*file);
|
||||||
*file = NULL;
|
*file = NULL;
|
||||||
}
|
}
|
||||||
return err == RETROK;
|
return err == RETROK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
is_robots_txt_url (const char *url)
|
||||||
|
{
|
||||||
|
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
||||||
|
bool ret = are_urls_equal (url, robots_url);
|
||||||
|
|
||||||
|
xfree (robots_url);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
res_cleanup (void)
|
res_cleanup (void)
|
||||||
{
|
{
|
||||||
@ -559,13 +574,44 @@ res_cleanup (void)
|
|||||||
{
|
{
|
||||||
hash_table_iterator iter;
|
hash_table_iterator iter;
|
||||||
for (hash_table_iterate (registered_specs, &iter);
|
for (hash_table_iterate (registered_specs, &iter);
|
||||||
hash_table_iter_next (&iter);
|
hash_table_iter_next (&iter);
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
xfree (iter.key);
|
xfree (iter.key);
|
||||||
free_specs (iter.value);
|
free_specs (iter.value);
|
||||||
}
|
}
|
||||||
hash_table_destroy (registered_specs);
|
hash_table_destroy (registered_specs);
|
||||||
registered_specs = NULL;
|
registered_specs = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef TESTING
|
||||||
|
|
||||||
|
const char *
|
||||||
|
test_is_robots_txt_url()
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct {
|
||||||
|
char *url;
|
||||||
|
bool expected_result;
|
||||||
|
} test_array[] = {
|
||||||
|
{ "http://www.yoyodyne.com/robots.txt", true },
|
||||||
|
{ "http://www.yoyodyne.com/somepath/", false },
|
||||||
|
{ "http://www.yoyodyne.com/somepath/robots.txt", false },
|
||||||
|
};
|
||||||
|
|
||||||
|
for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
|
||||||
|
{
|
||||||
|
mu_assert ("test_is_robots_txt_url: wrong result",
|
||||||
|
is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* TESTING */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vim: et ts=2 sw=2
|
||||||
|
*/
|
||||||
|
|
||||||
|
@ -42,6 +42,8 @@ struct robot_specs *res_get_specs (const char *, int);
|
|||||||
|
|
||||||
bool res_retrieve_file (const char *, char **);
|
bool res_retrieve_file (const char *, char **);
|
||||||
|
|
||||||
|
bool is_robots_txt_url (const char *);
|
||||||
|
|
||||||
void res_cleanup (void);
|
void res_cleanup (void);
|
||||||
|
|
||||||
#endif /* RES_H */
|
#endif /* RES_H */
|
||||||
|
@ -40,6 +40,8 @@ const char *test_subdir_p();
|
|||||||
const char *test_dir_matches_p();
|
const char *test_dir_matches_p();
|
||||||
const char *test_cmd_spec_restrict_file_names();
|
const char *test_cmd_spec_restrict_file_names();
|
||||||
const char *test_append_uri_pathel();
|
const char *test_append_uri_pathel();
|
||||||
|
const char *test_are_urls_equal();
|
||||||
|
const char *test_is_robots_txt_url();
|
||||||
|
|
||||||
int tests_run;
|
int tests_run;
|
||||||
|
|
||||||
@ -51,6 +53,8 @@ all_tests()
|
|||||||
mu_run_test (test_dir_matches_p);
|
mu_run_test (test_dir_matches_p);
|
||||||
mu_run_test (test_cmd_spec_restrict_file_names);
|
mu_run_test (test_cmd_spec_restrict_file_names);
|
||||||
mu_run_test (test_append_uri_pathel);
|
mu_run_test (test_append_uri_pathel);
|
||||||
|
mu_run_test (test_are_urls_equal);
|
||||||
|
mu_run_test (test_is_robots_txt_url);
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -97,4 +97,6 @@ int mkalldirs (const char *);
|
|||||||
char *rewrite_shorthand_url (const char *);
|
char *rewrite_shorthand_url (const char *);
|
||||||
bool schemes_are_similar_p (enum url_scheme a, enum url_scheme b);
|
bool schemes_are_similar_p (enum url_scheme a, enum url_scheme b);
|
||||||
|
|
||||||
|
bool are_urls_equal (const char *u1, const char *u2);
|
||||||
|
|
||||||
#endif /* URL_H */
|
#endif /* URL_H */
|
||||||
|
Loading…
Reference in New Issue
Block a user