diff --git a/src/ChangeLog b/src/ChangeLog index 51c194f8..563abbf3 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,22 @@ +2006-06-28 Mauro Tortonesi + + * res.c: Implemented is_robots_txt_url function for detection of + robots.txt URLs and related test routine. + + * res.h: Ditto. + + * url.c: Implemented are_urls_equal function for URL comparison and + related testing routine. + + * url.h: Ditto. + + * convert.c: Fixes for recursive spider mode: don't consider + non-existing robots.txt as a broken link, and use are_urls_equal + instead of strcasecmp for referrer URLs comparison. + + * test.c: Call test routines for are_urls_equal and + is_robots_txt_url. + 2006-06-26 Hrvoje Niksic * wget.h (wgint): Typedef to any 64-bit (or larger) type we can diff --git a/src/convert.c b/src/convert.c index 98133733..cd4873ab 100644 --- a/src/convert.c +++ b/src/convert.c @@ -45,6 +45,7 @@ so, delete this exception statement from your version. */ #include "utils.h" #include "hash.h" #include "ptimer.h" +#include "res.h" static struct hash_table *dl_file_url_map; struct hash_table *dl_url_file_map; @@ -99,13 +100,13 @@ convert_all_links (void) char *file = file_array[i]; /* Determine the URL of the HTML file. get_urls_html will need - it. */ + it. */ url = hash_table_get (dl_file_url_map, file); if (!url) - { - DEBUGP (("Apparently %s has been removed.\n", file)); - continue; - } + { + DEBUGP (("Apparently %s has been removed.\n", file)); + continue; + } DEBUGP (("Scanning %s (from %s)\n", file, url)); @@ -117,48 +118,48 @@ convert_all_links (void) links that have been followed from other files. */ for (cur_url = urls; cur_url; cur_url = cur_url->next) - { - char *local_name; - struct url *u = cur_url->url; + { + char *local_name; + struct url *u = cur_url->url; - if (cur_url->link_base_p) - { - /* Base references have been resolved by our parser, so - we turn the base URL into an empty string. (Perhaps - we should remove the tag entirely?)
*/ - cur_url->convert = CO_NULLIFY_BASE; - continue; - } + if (cur_url->link_base_p) + { + /* Base references have been resolved by our parser, so + we turn the base URL into an empty string. (Perhaps + we should remove the tag entirely?) */ + cur_url->convert = CO_NULLIFY_BASE; + continue; + } - /* We decide the direction of conversion according to whether - a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. */ - local_name = hash_table_get (dl_url_file_map, u->url); + /* We decide the direction of conversion according to whether + a URL was downloaded. Downloaded URLs will be converted + ABS2REL, whereas non-downloaded will be converted REL2ABS. */ + local_name = hash_table_get (dl_url_file_map, u->url); - /* Decide on the conversion type. */ - if (local_name) - { - /* We've downloaded this URL. Convert it to relative + /* Decide on the conversion type. */ + if (local_name) + { + /* We've downloaded this URL. Convert it to relative form. We do this even if the URL already is in relative form, because our directory structure may not be identical to that on the server (think `-nd', `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; - cur_url->local_name = xstrdup (local_name); - DEBUGP (("will convert url %s to local %s\n", u->url, local_name)); - } - else - { - /* We haven't downloaded this URL. If it's not already + cur_url->convert = CO_CONVERT_TO_RELATIVE; + cur_url->local_name = xstrdup (local_name); + DEBUGP (("will convert url %s to local %s\n", u->url, local_name)); + } + else + { + /* We haven't downloaded this URL. If it's not already complete (including a full host name), convert it to that form, so it can be reached while browsing this HTML locally. 
*/ - if (!cur_url->link_complete_p) - cur_url->convert = CO_CONVERT_TO_COMPLETE; - cur_url->local_name = NULL; - DEBUGP (("will convert url %s to complete\n", u->url)); - } - } + if (!cur_url->link_complete_p) + cur_url->convert = CO_CONVERT_TO_COMPLETE; + cur_url->local_name = NULL; + DEBUGP (("will convert url %s to complete\n", u->url)); + } + } /* Convert the links in the file. */ convert_links (file, urls); @@ -171,13 +172,13 @@ convert_all_links (void) secs = ptimer_measure (timer); ptimer_destroy (timer); logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"), - file_count, print_decimal (secs)); + file_count, print_decimal (secs)); } static void write_backup_file (const char *, downloaded_file_t); static const char *replace_attr (const char *, int, FILE *, const char *); static const char *replace_attr_refresh_hack (const char *, int, FILE *, - const char *, int); + const char *, int); static char *local_quote_string (const char *); static char *construct_relative (const char *, const char *); @@ -205,11 +206,11 @@ convert_links (const char *file, struct urlpos *links) struct urlpos *dry; for (dry = links; dry; dry = dry->next) if (dry->convert != CO_NOCONVERT) - ++dry_count; + ++dry_count; if (!dry_count) { - logputs (LOG_VERBOSE, _("nothing to do.\n")); - return; + logputs (LOG_VERBOSE, _("nothing to do.\n")); + return; } } @@ -217,7 +218,7 @@ convert_links (const char *file, struct urlpos *links) if (!fm) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), - file, strerror (errno)); + file, strerror (errno)); return; } @@ -232,7 +233,7 @@ convert_links (const char *file, struct urlpos *links) if (unlink (file) < 0 && errno != ENOENT) { logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"), - file, strerror (errno)); + file, strerror (errno)); read_file_free (fm); return; } @@ -241,7 +242,7 @@ convert_links (const char *file, struct urlpos *links) if (!fp) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), - 
file, strerror (errno)); + file, strerror (errno)); read_file_free (fm); return; } @@ -254,16 +255,16 @@ convert_links (const char *file, struct urlpos *links) char *url_start = fm->content + link->pos; if (link->pos >= fm->length) - { - DEBUGP (("Something strange is going on. Please investigate.")); - break; - } + { + DEBUGP (("Something strange is going on. Please investigate.")); + break; + } /* If the URL is not to be converted, skip it. */ if (link->convert == CO_NOCONVERT) - { - DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos)); - continue; - } + { + DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos)); + continue; + } /* Echo the file contents, up to the offending URL's opening quote, to the outfile. */ @@ -271,52 +272,52 @@ convert_links (const char *file, struct urlpos *links) p = url_start; switch (link->convert) - { - case CO_CONVERT_TO_RELATIVE: - /* Convert absolute URL to relative. */ - { - char *newname = construct_relative (file, link->local_name); - char *quoted_newname = local_quote_string (newname); + { + case CO_CONVERT_TO_RELATIVE: + /* Convert absolute URL to relative. */ + { + char *newname = construct_relative (file, link->local_name); + char *quoted_newname = local_quote_string (newname); - if (!link->link_refresh_p) - p = replace_attr (p, link->size, fp, quoted_newname); - else - p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, - link->refresh_timeout); + if (!link->link_refresh_p) + p = replace_attr (p, link->size, fp, quoted_newname); + else + p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, + link->refresh_timeout); - DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", - link->url->url, newname, link->pos, file)); - xfree (newname); - xfree (quoted_newname); - ++to_file_count; - break; - } - case CO_CONVERT_TO_COMPLETE: - /* Convert the link to absolute URL. 
*/ - { - char *newlink = link->url->url; - char *quoted_newlink = html_quote_string (newlink); + DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", + link->url->url, newname, link->pos, file)); + xfree (newname); + xfree (quoted_newname); + ++to_file_count; + break; + } + case CO_CONVERT_TO_COMPLETE: + /* Convert the link to absolute URL. */ + { + char *newlink = link->url->url; + char *quoted_newlink = html_quote_string (newlink); - if (!link->link_refresh_p) - p = replace_attr (p, link->size, fp, quoted_newlink); - else - p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink, - link->refresh_timeout); + if (!link->link_refresh_p) + p = replace_attr (p, link->size, fp, quoted_newlink); + else + p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink, + link->refresh_timeout); - DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", - newlink, link->pos, file)); - xfree (quoted_newlink); - ++to_url_count; - break; - } - case CO_NULLIFY_BASE: - /* Change the base href to "". */ - p = replace_attr (p, link->size, fp, ""); - break; - case CO_NOCONVERT: - abort (); - break; - } + DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", + newlink, link->pos, file)); + xfree (quoted_newlink); + ++to_url_count; + break; + } + case CO_NULLIFY_BASE: + /* Change the base href to "". */ + p = replace_attr (p, link->size, fp, ""); + break; + case CO_NOCONVERT: + abort (); + break; + } } /* Output the rest of the file. */ @@ -359,7 +360,7 @@ construct_relative (const char *basefile, const char *linkfile) for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l) { if (*b == '/') - start = (b - basefile) + 1; + start = (b - basefile) + 1; } basefile += start; linkfile += start; @@ -380,7 +381,7 @@ construct_relative (const char *basefile, const char *linkfile) for (b = basefile; *b; b++) { if (*b == '/') - ++basedirs; + ++basedirs; } /* Construct LINK as explained above. 
*/ @@ -410,12 +411,12 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return) if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) { /* Just write "orig" over "html". We need to do it this way - because when we're checking to see if we've downloaded the - file before (to see if we can skip downloading it), we don't - know if it's a text/html file. Therefore we don't know yet - at that stage that -E is going to cause us to tack on - ".html", so we need to compare vs. the original URL plus - ".orig", not the original URL plus ".html.orig". */ + because when we're checking to see if we've downloaded the + file before (to see if we can skip downloading it), we don't + know if it's a text/html file. Therefore we don't know yet + at that stage that -E is going to cause us to tack on + ".html", so we need to compare vs. the original URL plus + ".orig", not the original URL plus ".html.orig". */ filename_plus_orig_suffix = alloca (filename_len + 1); strcpy (filename_plus_orig_suffix, file); strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig"); @@ -440,25 +441,25 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return) { /* Rename to .orig before former gets written over. */ if (rename (file, filename_plus_orig_suffix) != 0) - logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), - file, filename_plus_orig_suffix, strerror (errno)); + logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), + file, filename_plus_orig_suffix, strerror (errno)); /* Remember that we've already written a .orig backup for this file. - Note that we never free this memory since we need it till the - convert_all_links() call, which is one of the last things the - program does before terminating. BTW, I'm not sure if it would be - safe to just set 'converted_file_ptr->string' to 'file' below, - rather than making a copy of the string... 
Another note is that I - thought I could just add a field to the urlpos structure saying - that we'd written a .orig file for this URL, but that didn't work, - so I had to make this separate list. - -- Dan Harkless + Note that we never free this memory since we need it till the + convert_all_links() call, which is one of the last things the + program does before terminating. BTW, I'm not sure if it would be + safe to just set 'converted_file_ptr->string' to 'file' below, + rather than making a copy of the string... Another note is that I + thought I could just add a field to the urlpos structure saying + that we'd written a .orig file for this URL, but that didn't work, + so I had to make this separate list. + -- Dan Harkless This [adding a field to the urlpos structure] didn't work because convert_file() is called from convert_all_links at the end of the retrieval with a freshly built new urlpos list. - -- Hrvoje Niksic + -- Hrvoje Niksic */ string_set_add (converted_files, file); } @@ -472,9 +473,9 @@ static const char * replace_attr (const char *p, int size, FILE *fp, const char *new_text) { bool quote_flag = false; - char quote_char = '\"'; /* use "..." for quoting, unless the - original value is quoted, in which - case reuse its quoting char. */ + char quote_char = '\"'; /* use "..." for quoting, unless the + original value is quoted, in which + case reuse its quoting char. 
*/ const char *frag_beg, *frag_end; /* Structure of our string is: @@ -489,7 +490,7 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text) quote_char = *p; quote_flag = true; ++p; - size -= 2; /* disregard opening and closing quote */ + size -= 2; /* disregard opening and closing quote */ } putc (quote_char, fp); fputs (new_text, fp); @@ -511,13 +512,13 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text) static const char * replace_attr_refresh_hack (const char *p, int size, FILE *fp, - const char *new_text, int timeout) + const char *new_text, int timeout) { /* "0; URL=..." */ char *new_with_timeout = (char *)alloca (numdigit (timeout) - + 6 /* "; URL=" */ - + strlen (new_text) - + 1); + + 6 /* "; URL=" */ + + strlen (new_text) + + 1); sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text); return replace_attr (p, size, fp, new_with_timeout); @@ -538,21 +539,21 @@ find_fragment (const char *beg, int size, const char **bp, const char **ep) for (; beg < end; beg++) { switch (*beg) - { - case '&': - saw_amp = true; - break; - case '#': - if (!saw_amp) - { - *bp = beg; - *ep = end; - return true; - } - /* fallthrough */ - default: - saw_amp = false; - } + { + case '&': + saw_amp = true; + break; + case '#': + if (!saw_amp) + { + *bp = beg; + *ep = end; + return true; + } + /* fallthrough */ + default: + saw_amp = false; + } } return false; } @@ -588,26 +589,26 @@ local_quote_string (const char *file) switch (*from) { case '%': - *to++ = '%'; - *to++ = '2'; - *to++ = '5'; - break; + *to++ = '%'; + *to++ = '2'; + *to++ = '5'; + break; case '#': - *to++ = '%'; - *to++ = '2'; - *to++ = '3'; - break; + *to++ = '%'; + *to++ = '2'; + *to++ = '3'; + break; case '?': - if (opt.html_extension) - { - *to++ = '%'; - *to++ = '3'; - *to++ = 'F'; - break; - } - /* fallthrough */ + if (opt.html_extension) + { + *to++ = '%'; + *to++ = '3'; + *to++ = 'F'; + break; + } + /* fallthrough */ default: - *to++ = *from; + *to++ = *from; } *to = 
'\0'; @@ -618,11 +619,11 @@ local_quote_string (const char *file) downloaded_html_list, and downloaded_html_set. Other code calls these functions to let us know that a file has been downloaded. */ -#define ENSURE_TABLES_EXIST do { \ - if (!dl_file_url_map) \ - dl_file_url_map = make_string_hash_table (0); \ - if (!dl_url_file_map) \ - dl_url_file_map = make_string_hash_table (0); \ +#define ENSURE_TABLES_EXIST do { \ + if (!dl_file_url_map) \ + dl_file_url_map = make_string_hash_table (0); \ + if (!dl_url_file_map) \ + dl_url_file_map = make_string_hash_table (0); \ } while (0) /* Return true if S1 and S2 are the same, except for "/index.html". @@ -704,7 +705,7 @@ dissociate_urls_from_file (const char *file) { /* Can't use hash_table_iter_* because the table mutates while mapping. */ hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper, - (char *) file); + (char *) file); } /* Register that URL has been successfully downloaded to FILE. This @@ -727,29 +728,29 @@ register_download (const char *url, const char *file) if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) { if (0 == strcmp (url, old_url)) - /* We have somehow managed to download the same URL twice. - Nothing to do. */ - return; + /* We have somehow managed to download the same URL twice. + Nothing to do. */ + return; if (match_except_index (url, old_url) - && !hash_table_contains (dl_url_file_map, url)) - /* The two URLs differ only in the "index.html" ending. For - example, one is "http://www.server.com/", and the other is - "http://www.server.com/index.html". Don't remove the old - one, just add the new one as a non-canonical entry. */ - goto url_only; + && !hash_table_contains (dl_url_file_map, url)) + /* The two URLs differ only in the "index.html" ending. For + example, one is "http://www.server.com/", and the other is + "http://www.server.com/index.html". Don't remove the old + one, just add the new one as a non-canonical entry. 
*/ - goto url_only; + && !hash_table_contains (dl_url_file_map, url)) + /* The two URLs differ only in the "index.html" ending. For + example, one is "http://www.server.com/", and the other is + "http://www.server.com/index.html". Don't remove the old + one, just add the new one as a non-canonical entry. */ + goto url_only; hash_table_remove (dl_file_url_map, file); xfree (old_file); xfree (old_url); /* Remove all the URLs that point to this file. Yes, there can - be more than one such URL, because we store redirections as - multiple entries in dl_url_file_map. For example, if URL1 - redirects to URL2 which gets downloaded to FILE, we map both - URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map - only points to URL2.) When another URL gets loaded to FILE, - we want both URL1 and URL2 dissociated from it. + be more than one such URL, because we store redirections as + multiple entries in dl_url_file_map. For example, if URL1 + redirects to URL2 which gets downloaded to FILE, we map both + URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map + only points to URL2.) When another URL gets loaded to FILE, + we want both URL1 and URL2 dissociated from it. This is a relatively expensive operation because it performs a linear search of the whole hash table, but it should be @@ -922,10 +923,10 @@ downloaded_file (downloaded_file_t mode, const char *file) if (mode == CHECK_FOR_FILE) { if (!downloaded_files_hash) - return FILE_NOT_ALREADY_DOWNLOADED; + return FILE_NOT_ALREADY_DOWNLOADED; ptr = hash_table_get (downloaded_files_hash, file); if (!ptr) - return FILE_NOT_ALREADY_DOWNLOADED; + return FILE_NOT_ALREADY_DOWNLOADED; return *ptr; } @@ -949,9 +950,9 @@ downloaded_files_free (void) { hash_table_iterator iter; for (hash_table_iterate (downloaded_files_hash, &iter); - hash_table_iter_next (&iter); - ) - xfree (iter.key); + hash_table_iter_next (&iter); + ) + xfree (iter.key); hash_table_destroy (downloaded_files_hash); downloaded_files_hash = NULL; } @@ -972,8 +973,8 @@ in_list (const struct broken_urls_list *list, const char *url) for (ptr = list; ptr; ptr = ptr->next) { - /* TODO: strcasecmp may not be appropriate to compare URLs */ - if (strcasecmp (url, ptr->url) == 0) return true; + /* str[case]cmp is inadequate for URL comparison; are_urls_equal returns true on a match */ + if (are_urls_equal
(url, ptr->url)) return true; } return false; @@ -983,6 +984,10 @@ void nonexisting_url (const char *url, const char *referrer) { struct broken_urls_list *list; + + /* Ignore robots.txt URLs */ + if (is_robots_txt_url (url)) + return; if (!nonexisting_urls_hash) nonexisting_urls_hash = make_string_hash_table (0); @@ -1014,12 +1019,12 @@ nonexisting_urls_free (void) { hash_table_iterator iter; for (hash_table_iterate (nonexisting_urls_hash, &iter); - hash_table_iter_next (&iter); - ) + hash_table_iter_next (&iter); + ) { - xfree (iter.key); - xfree (iter.value); - } + xfree (iter.key); + xfree (iter.value); + } hash_table_destroy (nonexisting_urls_hash); nonexisting_urls_hash = NULL; } @@ -1055,12 +1060,12 @@ print_broken_links (void) ) { struct broken_urls_list *list; - + logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key); for (list = (struct broken_urls_list *) iter.value; list; - list = list->next) + list = list->next) { logprintf (LOG_NOTQUIET, _(" %s\n"), list->url); } @@ -1091,52 +1096,57 @@ html_quote_string (const char *s) for (i = 0; *s; s++, i++) { if (*s == '&') - i += 4; /* `amp;' */ + i += 4; /* `amp;' */ else if (*s == '<' || *s == '>') - i += 3; /* `lt;' and `gt;' */ + i += 3; /* `lt;' and `gt;' */ else if (*s == '\"') - i += 5; /* `quot;' */ + i += 5; /* `quot;' */ else if (*s == ' ') - i += 4; /* #32; */ + i += 4; /* #32; */ } res = xmalloc (i + 1); s = b; for (p = res; *s; s++) { switch (*s) - { - case '&': - *p++ = '&'; - *p++ = 'a'; - *p++ = 'm'; - *p++ = 'p'; - *p++ = ';'; - break; - case '<': case '>': - *p++ = '&'; - *p++ = (*s == '<' ?
'l' : 'g'); - *p++ = 't'; - *p++ = ';'; - break; - case '\"': - *p++ = '&'; - *p++ = 'q'; - *p++ = 'u'; - *p++ = 'o'; - *p++ = 't'; - *p++ = ';'; - break; - case ' ': - *p++ = '&'; - *p++ = '#'; - *p++ = '3'; - *p++ = '2'; - *p++ = ';'; - break; - default: - *p++ = *s; - } + { + case '&': + *p++ = '&'; + *p++ = 'a'; + *p++ = 'm'; + *p++ = 'p'; + *p++ = ';'; + break; + case '<': case '>': + *p++ = '&'; + *p++ = (*s == '<' ? 'l' : 'g'); + *p++ = 't'; + *p++ = ';'; + break; + case '\"': + *p++ = '&'; + *p++ = 'q'; + *p++ = 'u'; + *p++ = 'o'; + *p++ = 't'; + *p++ = ';'; + break; + case ' ': + *p++ = '&'; + *p++ = '#'; + *p++ = '3'; + *p++ = '2'; + *p++ = ';'; + break; + default: + *p++ = *s; + } } *p = '\0'; return res; } + +/* + * vim: et ts=2 sw=2 + */ + diff --git a/src/res.c b/src/res.c index 656f2895..103bc4e7 100644 --- a/src/res.c +++ b/src/res.c @@ -84,6 +84,10 @@ so, delete this exception statement from your version. */ #include "retr.h" #include "res.h" +#ifdef TESTING +#include "test.h" +#endif + struct path_info { char *path; bool allowedp; @@ -104,7 +108,7 @@ struct robot_specs { static void match_user_agent (const char *agent, int length, - bool *matches, bool *exact_match) + bool *matches, bool *exact_match) { if (length == 1 && *agent == '*') { @@ -128,7 +132,7 @@ match_user_agent (const char *agent, int length, static void add_path (struct robot_specs *specs, const char *path_b, const char *path_e, - bool allowedp, bool exactp) + bool allowedp, bool exactp) { struct path_info pp; if (path_b < path_e && *path_b == '/') @@ -142,11 +146,11 @@ add_path (struct robot_specs *specs, const char *path_b, const char *path_e, if (specs->count > specs->size) { if (specs->size == 0) - specs->size = 1; + specs->size = 1; else - specs->size <<= 1; + specs->size <<= 1; specs->paths = xrealloc (specs->paths, - specs->size * sizeof (struct path_info)); + specs->size * sizeof (struct path_info)); } specs->paths[specs->count - 1] = pp; } @@ -176,12 +180,12 @@ 
prune_non_exact (struct robot_specs *specs) #define EOL(p) ((p) >= lineend) -#define SKIP_SPACE(p) do { \ - while (!EOL (p) && ISSPACE (*p)) \ - ++p; \ +#define SKIP_SPACE(p) do { \ + while (!EOL (p) && ISSPACE (*p)) \ + ++p; \ } while (0) -#define FIELD_IS(string_literal) \ +#define FIELD_IS(string_literal) \ BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal) /* Parse textual RES specs beginning with SOURCE of length LENGTH. @@ -245,113 +249,113 @@ res_parse (const char *source, int length) const char *value_b, *value_e; if (p == end) - break; + break; lineend_real = memchr (p, '\n', end - p); if (lineend_real) - ++lineend_real; + ++lineend_real; else - lineend_real = end; + lineend_real = end; lineend = lineend_real; /* Before doing anything else, check whether the line is empty - or comment-only. */ + or comment-only. */ SKIP_SPACE (p); if (EOL (p) || *p == '#') - goto next; + goto next; /* Make sure the end-of-line comments are respected by setting - lineend to a location preceding the first comment. Real line - ending remains in lineend_real. */ + lineend to a location preceding the first comment. Real line + ending remains in lineend_real. */ for (lineend = p; lineend < lineend_real; lineend++) - if ((lineend == p || ISSPACE (*(lineend - 1))) - && *lineend == '#') - break; + if ((lineend == p || ISSPACE (*(lineend - 1))) + && *lineend == '#') + break; /* Ignore trailing whitespace in the same way. */ while (lineend > p && ISSPACE (*(lineend - 1))) - --lineend; + --lineend; assert (!EOL (p)); field_b = p; while (!EOL (p) && (ISALNUM (*p) || *p == '-')) - ++p; + ++p; field_e = p; SKIP_SPACE (p); if (field_b == field_e || EOL (p) || *p != ':') - { - DEBUGP (("Ignoring malformed line %d", line_count)); - goto next; - } - ++p; /* skip ':' */ + { + DEBUGP (("Ignoring malformed line %d", line_count)); + goto next; + } + ++p; /* skip ':' */ SKIP_SPACE (p); value_b = p; while (!EOL (p)) - ++p; + ++p; value_e = p; /* Finally, we have a syntactically valid line. 
*/ if (FIELD_IS ("user-agent")) - { - /* We have to support several cases: + { + /* We have to support several cases: - --previous records-- + --previous records-- - User-Agent: foo - User-Agent: Wget - User-Agent: bar - ... matching record ... + User-Agent: foo + User-Agent: Wget + User-Agent: bar + ... matching record ... - User-Agent: baz - User-Agent: qux - ... non-matching record ... + User-Agent: baz + User-Agent: qux + ... non-matching record ... - User-Agent: * - ... matching record, but will be pruned later ... + User-Agent: * + ... matching record, but will be pruned later ... - We have to respect `User-Agent' at the beginning of each - new record simply because we don't know if we're going to - encounter "Wget" among the agents or not. Hence, - match_user_agent is called when record_count != 0. + We have to respect `User-Agent' at the beginning of each + new record simply because we don't know if we're going to + encounter "Wget" among the agents or not. Hence, + match_user_agent is called when record_count != 0. - But if record_count is 0, we have to keep calling it - until it matches, and if that happens, we must not call - it any more, until the next record. Hence the other part - of the condition. */ - if (record_count != 0 || user_agent_applies == false) - match_user_agent (value_b, value_e - value_b, - &user_agent_applies, &user_agent_exact); - if (user_agent_exact) - found_exact = true; - record_count = 0; - } + But if record_count is 0, we have to keep calling it + until it matches, and if that happens, we must not call + it any more, until the next record. Hence the other part + of the condition. 
*/ + if (record_count != 0 || user_agent_applies == false) + match_user_agent (value_b, value_e - value_b, + &user_agent_applies, &user_agent_exact); + if (user_agent_exact) + found_exact = true; + record_count = 0; + } else if (FIELD_IS ("allow")) - { - if (user_agent_applies) - { - add_path (specs, value_b, value_e, true, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + add_path (specs, value_b, value_e, true, user_agent_exact); + } + ++record_count; + } else if (FIELD_IS ("disallow")) - { - if (user_agent_applies) - { - bool allowed = false; - if (value_b == value_e) - /* Empty "disallow" line means everything is *allowed*! */ - allowed = true; - add_path (specs, value_b, value_e, allowed, user_agent_exact); - } - ++record_count; - } + { + if (user_agent_applies) + { + bool allowed = false; + if (value_b == value_e) + /* Empty "disallow" line means everything is *allowed*! */ + allowed = true; + add_path (specs, value_b, value_e, allowed, user_agent_exact); + } + ++record_count; + } else - { - DEBUGP (("Ignoring unknown field at line %d", line_count)); - goto next; - } + { + DEBUGP (("Ignoring unknown field at line %d", line_count)); + goto next; + } next: p = lineend_real; @@ -361,15 +365,15 @@ res_parse (const char *source, int length) if (found_exact) { /* We've encountered an exactly matching user-agent. Throw out - all the stuff with user-agent: *. */ + all the stuff with user-agent: *. */ prune_non_exact (specs); } else if (specs->size > specs->count) { /* add_path normally over-allocates specs->paths. Reallocate it - to the correct size in order to conserve some memory. */ + to the correct size in order to conserve some memory. 
*/ specs->paths = xrealloc (specs->paths, - specs->count * sizeof (struct path_info)); + specs->count * sizeof (struct path_info)); specs->size = specs->count; } @@ -387,7 +391,7 @@ res_parse_from_file (const char *filename) if (!fm) { logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"), - filename, strerror (errno)); + filename, strerror (errno)); return NULL; } specs = res_parse (fm->content, fm->length); @@ -411,16 +415,16 @@ free_specs (struct robot_specs *specs) that number is not a numerical representation of '/', decode C and advance the pointer. */ -#define DECODE_MAYBE(c, ptr) do { \ - if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \ - { \ - char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ - if (decoded != '/') \ - { \ - c = decoded; \ - ptr += 2; \ - } \ - } \ +#define DECODE_MAYBE(c, ptr) do { \ + if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \ + { \ + char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ + if (decoded != '/') \ + { \ + c = decoded; \ + ptr += 2; \ + } \ + } \ } while (0) /* The inner matching engine: return true if RECORD_PATH matches @@ -438,13 +442,13 @@ matches (const char *record_path, const char *url_path) char rc = *rp; char uc = *up; if (!rc) - return true; + return true; if (!uc) - return false; + return false; DECODE_MAYBE(rc, rp); DECODE_MAYBE(uc, up); if (rc != uc) - return false; + return false; } } @@ -461,11 +465,11 @@ res_match_path (const struct robot_specs *specs, const char *path) for (i = 0; i < specs->count; i++) if (matches (specs->paths[i].path, path)) { - bool allowedp = specs->paths[i].allowedp; - DEBUGP (("%s path %s because of rule `%s'.\n", - allowedp ? "Allowing" : "Rejecting", - path, specs->paths[i].path)); - return allowedp; + bool allowedp = specs->paths[i].allowedp; + DEBUGP (("%s path %s because of rule `%s'.\n", + allowedp ? 
"Allowing" : "Rejecting", + path, specs->paths[i].path)); + return allowedp; } return true; } @@ -475,12 +479,12 @@ res_match_path (const struct robot_specs *specs, const char *path) static struct hash_table *registered_specs; /* Stolen from cookies.c. */ -#define SET_HOSTPORT(host, port, result) do { \ - int HP_len = strlen (host); \ - result = alloca (HP_len + 1 + numdigit (port) + 1); \ - memcpy (result, host, HP_len); \ - result[HP_len] = ':'; \ - number_to_string (result + HP_len + 1, port); \ +#define SET_HOSTPORT(host, port, result) do { \ + int HP_len = strlen (host); \ + result = alloca (HP_len + 1 + numdigit (port) + 1); \ + memcpy (result, host, HP_len); \ + result[HP_len] = ':'; \ + number_to_string (result + HP_len + 1, port); \ } while (0) /* Register RES specs that below to server on HOST:PORT. They will @@ -499,7 +503,7 @@ res_register_specs (const char *host, int port, struct robot_specs *specs) if (hash_table_get_pair (registered_specs, hp, &hp_old, &old)) { if (old) - free_specs (old); + free_specs (old); hash_table_put (registered_specs, hp_old, specs); } else @@ -544,14 +548,25 @@ res_retrieve_file (const char *url, char **file) if (err != RETROK && *file != NULL) { /* If the file is not retrieved correctly, but retrieve_url - allocated the file name, deallocate is here so that the - caller doesn't have to worry about it. */ + allocated the file name, deallocate is here so that the + caller doesn't have to worry about it. 
*/ xfree (*file); *file = NULL; } return err == RETROK; } +bool +is_robots_txt_url (const char *url) +{ + char *robots_url = uri_merge (url, RES_SPECS_LOCATION); + bool ret = are_urls_equal (url, robots_url); + + xfree (robots_url); + + return ret; +} + void res_cleanup (void) { @@ -559,13 +574,44 @@ res_cleanup (void) { hash_table_iterator iter; for (hash_table_iterate (registered_specs, &iter); - hash_table_iter_next (&iter); - ) - { - xfree (iter.key); - free_specs (iter.value); - } + hash_table_iter_next (&iter); + ) + { + xfree (iter.key); + free_specs (iter.value); + } hash_table_destroy (registered_specs); registered_specs = NULL; } } + +#ifdef TESTING + +const char * +test_is_robots_txt_url() +{ + int i; + struct { + char *url; + bool expected_result; + } test_array[] = { + { "http://www.yoyodyne.com/robots.txt", true }, + { "http://www.yoyodyne.com/somepath/", false }, + { "http://www.yoyodyne.com/somepath/robots.txt", false }, + }; + + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) + { + mu_assert ("test_is_robots_txt_url: wrong result", + is_robots_txt_url (test_array[i].url) == test_array[i].expected_result); + } + + return NULL; +} + +#endif /* TESTING */ + +/* + * vim: et ts=2 sw=2 + */ + diff --git a/src/res.h b/src/res.h index c0fd2b43..2b129025 100644 --- a/src/res.h +++ b/src/res.h @@ -42,6 +42,8 @@ struct robot_specs *res_get_specs (const char *, int); bool res_retrieve_file (const char *, char **); +bool is_robots_txt_url (const char *); + void res_cleanup (void); #endif /* RES_H */ diff --git a/src/test.c b/src/test.c index bc24feaa..164a6d99 100644 --- a/src/test.c +++ b/src/test.c @@ -40,6 +40,8 @@ const char *test_subdir_p(); const char *test_dir_matches_p(); const char *test_cmd_spec_restrict_file_names(); const char *test_append_uri_pathel(); +const char *test_are_urls_equal(); +const char *test_is_robots_txt_url(); int tests_run; @@ -51,6 +53,8 @@ all_tests() mu_run_test (test_dir_matches_p); mu_run_test 
(test_cmd_spec_restrict_file_names); mu_run_test (test_append_uri_pathel); + mu_run_test (test_are_urls_equal); + mu_run_test (test_is_robots_txt_url); return NULL; } diff --git a/src/url.c b/src/url.c index 1d199fea..5025e937 100644 --- a/src/url.c +++ b/src/url.c @@ -48,10 +48,10 @@ so, delete this exception statement from your version. */ #endif enum { - scm_disabled = 1, /* for https when OpenSSL fails to init. */ - scm_has_params = 2, /* whether scheme has ;params */ - scm_has_query = 4, /* whether scheme has ?query */ - scm_has_fragment = 8 /* whether scheme has #fragment */ + scm_disabled = 1, /* for https when OpenSSL fails to init. */ + scm_has_params = 2, /* whether scheme has ;params */ + scm_has_query = 4, /* whether scheme has ?query */ + scm_has_fragment = 8 /* whether scheme has #fragment */ }; struct scheme_data @@ -69,14 +69,14 @@ struct scheme_data /* Supported schemes: */ static struct scheme_data supported_schemes[] = { - { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment }, + { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment }, #ifdef HAVE_SSL - { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment }, + { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment }, #endif - { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment }, + { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment }, /* SCHEME_INVALID */ - { NULL, NULL, -1, 0 } + { NULL, NULL, -1, 0 } }; /* Forward declarations: */ @@ -169,30 +169,30 @@ static const unsigned char urlchr_table[256] = static void url_unescape (char *s) { - char *t = s; /* t - tortoise */ - char *h = s; /* h - hare */ + char *t = s; /* t - tortoise */ + char *h = s; /* h - hare */ for (; *h; h++, t++) { if (*h != '%') - { - copychar: - *t = *h; - } + { + copychar: + *t = *h; + } else - { - char c; - /* Do nothing if '%' is not followed by two hex digits. 
*/ - if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2]))) - goto copychar; - c = X2DIGITS_TO_NUM (h[1], h[2]); - /* Don't unescape %00 because there is no way to insert it - into a C string without effectively truncating it. */ - if (c == '\0') - goto copychar; - *t = c; - h += 2; - } + { + char c; + /* Do nothing if '%' is not followed by two hex digits. */ + if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2]))) + goto copychar; + c = X2DIGITS_TO_NUM (h[1], h[2]); + /* Don't unescape %00 because there is no way to insert it + into a C string without effectively truncating it. */ + if (c == '\0') + goto copychar; + *t = c; + h += 2; + } } *t = '\0'; } @@ -214,7 +214,7 @@ url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough) for (p1 = s; *p1; p1++) if (urlchr_test (*p1, mask)) - addition += 2; /* Two more characters (hex digits) */ + addition += 2; /* Two more characters (hex digits) */ if (!addition) return allow_passthrough ? (char *)s : xstrdup (s); @@ -228,14 +228,14 @@ url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough) { /* Quote the characters that match the test mask. */ if (urlchr_test (*p1, mask)) - { - unsigned char c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); - } + { + unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); + } else - *p2++ = *p1++; + *p2++ = *p1++; } assert (p2 - newstr == newlen); *p2 = '\0'; @@ -273,10 +273,10 @@ char_needs_escaping (const char *p) if (*p == '%') { if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2))) - return false; + return false; else - /* Garbled %.. sequence: encode `%'. */ - return true; + /* Garbled %.. sequence: encode `%'. */ + return true; } else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) return true; @@ -379,7 +379,7 @@ reencode_escapes (const char *s) if (!encode_count) /* The string is good as it is. 
*/ - return (char *) s; /* C const model sucks. */ + return (char *) s; /* C const model sucks. */ oldlen = p1 - s; /* Each encoding adds two characters (hex digits). */ @@ -394,10 +394,10 @@ reencode_escapes (const char *s) while (*p1) if (char_needs_escaping (p1)) { - unsigned char c = *p1++; - *p2++ = '%'; - *p2++ = XNUM_TO_DIGIT (c >> 4); - *p2++ = XNUM_TO_DIGIT (c & 0xf); + unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); } else *p2++ = *p1++; @@ -417,12 +417,12 @@ url_scheme (const char *url) for (i = 0; supported_schemes[i].leading_string; i++) if (0 == strncasecmp (url, supported_schemes[i].leading_string, - strlen (supported_schemes[i].leading_string))) + strlen (supported_schemes[i].leading_string))) { - if (!(supported_schemes[i].flags & scm_disabled)) - return (enum url_scheme) i; - else - return SCHEME_INVALID; + if (!(supported_schemes[i].flags & scm_disabled)) + return (enum url_scheme) i; + else + return SCHEME_INVALID; } return SCHEME_INVALID; @@ -489,11 +489,11 @@ parse_credentials (const char *beg, const char *end, char **user, char **passwd) const char *userend; if (beg == end) - return false; /* empty user name */ + return false; /* empty user name */ colon = memchr (beg, ':', end - beg); if (colon == beg) - return false; /* again empty user name */ + return false; /* again empty user name */ if (colon) { @@ -549,10 +549,10 @@ rewrite_shorthand_url (const char *url) if (p && *p == ':') { /* Colon indicates ftp, as in foo.bar.com:path. Check for - special case of http port number ("localhost:10000"). */ + special case of http port number ("localhost:10000"). */ int digits = strspn (p + 1, "0123456789"); if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0')) - goto http; + goto http; /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". 
*/ ret = aprintf ("ftp://%s", url); @@ -592,8 +592,8 @@ lowercase_str (char *str) for (; *str; str++) if (ISUPPER (*str)) { - changed = true; - *str = TOLOWER (*str); + changed = true; + *str = TOLOWER (*str); } return changed; } @@ -616,21 +616,21 @@ init_seps (enum url_scheme scheme) } static const char *parse_errors[] = { -#define PE_NO_ERROR 0 +#define PE_NO_ERROR 0 N_("No error"), -#define PE_UNSUPPORTED_SCHEME 1 +#define PE_UNSUPPORTED_SCHEME 1 N_("Unsupported scheme"), -#define PE_INVALID_HOST_NAME 2 +#define PE_INVALID_HOST_NAME 2 N_("Invalid host name"), -#define PE_BAD_PORT_NUMBER 3 +#define PE_BAD_PORT_NUMBER 3 N_("Bad port number"), -#define PE_INVALID_USER_NAME 4 +#define PE_INVALID_USER_NAME 4 N_("Invalid user name"), -#define PE_UNTERMINATED_IPV6_ADDRESS 5 +#define PE_UNTERMINATED_IPV6_ADDRESS 5 N_("Unterminated IPv6 numeric address"), -#define PE_IPV6_NOT_SUPPORTED 6 +#define PE_IPV6_NOT_SUPPORTED 6 N_("IPv6 addresses not supported"), -#define PE_INVALID_IPV6_ADDRESS 7 +#define PE_INVALID_IPV6_ADDRESS 7 N_("Invalid IPv6 numeric address") }; @@ -701,26 +701,26 @@ url_parse (const char *url, int *error) if (*p == '[') { /* Handle IPv6 address inside square brackets. Ideally we'd - just look for the terminating ']', but rfc2732 mandates - rejecting invalid IPv6 addresses. */ + just look for the terminating ']', but rfc2732 mandates + rejecting invalid IPv6 addresses. */ /* The address begins after '['. */ host_b = p + 1; host_e = strchr (host_b, ']'); if (!host_e) - { - error_code = PE_UNTERMINATED_IPV6_ADDRESS; - goto error; - } + { + error_code = PE_UNTERMINATED_IPV6_ADDRESS; + goto error; + } #ifdef ENABLE_IPV6 /* Check if the IPv6 address is valid. */ if (!is_valid_ipv6_address(host_b, host_e)) - { - error_code = PE_INVALID_IPV6_ADDRESS; - goto error; - } + { + error_code = PE_INVALID_IPV6_ADDRESS; + goto error; + } /* Continue parsing after the closing ']'. 
*/ p = host_e + 1; @@ -730,22 +730,22 @@ url_parse (const char *url, int *error) #endif /* The closing bracket must be followed by a separator or by the - null char. */ + null char. */ /* http://[::1]... */ /* ^ */ if (!strchr (seps, *p)) - { - /* Trailing garbage after []-delimited IPv6 address. */ - error_code = PE_INVALID_HOST_NAME; - goto error; - } + { + /* Trailing garbage after []-delimited IPv6 address. */ + error_code = PE_INVALID_HOST_NAME; + goto error; + } } else { p = strpbrk_or_eos (p, seps); host_e = p; } - ++seps; /* advance to '/' */ + ++seps; /* advance to '/' */ if (host_b == host_e) { @@ -767,24 +767,24 @@ url_parse (const char *url, int *error) /* Allow empty port, as per rfc2396. */ if (port_b != port_e) - for (port = 0, pp = port_b; pp < port_e; pp++) - { - if (!ISDIGIT (*pp)) - { - /* http://host:12randomgarbage/blah */ - /* ^ */ - error_code = PE_BAD_PORT_NUMBER; - goto error; - } - port = 10 * port + (*pp - '0'); - /* Check for too large port numbers here, before we have - a chance to overflow on bogus port values. */ - if (port > 0xffff) - { - error_code = PE_BAD_PORT_NUMBER; - goto error; - } - } + for (port = 0, pp = port_b; pp < port_e; pp++) + { + if (!ISDIGIT (*pp)) + { + /* http://host:12randomgarbage/blah */ + /* ^ */ + error_code = PE_BAD_PORT_NUMBER; + goto error; + } + port = 10 * port + (*pp - '0'); + /* Check for too large port numbers here, before we have + a chance to overflow on bogus port values. */ + if (port > 0xffff) + { + error_code = PE_BAD_PORT_NUMBER; + goto error; + } + } } /* Advance to the first separator *after* '/' (either ';' or '?', depending on the scheme). */ @@ -792,10 +792,10 @@ url_parse (const char *url, int *error) /* Get the optional parts of URL, each part being delimited by current location and the position of the next separator. 
*/ -#define GET_URL_PART(sepchar, var) do { \ - if (*p == sepchar) \ - var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \ - ++seps; \ +#define GET_URL_PART(sepchar, var) do { \ + if (*p == sepchar) \ + var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \ + ++seps; \ } while (0) GET_URL_PART ('/', path); @@ -815,10 +815,10 @@ url_parse (const char *url, int *error) /* ^ ^ */ /* uname_b uname_e */ if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) - { - error_code = PE_INVALID_USER_NAME; - goto error; - } + { + error_code = PE_INVALID_USER_NAME; + goto error; + } } u = xnew0 (struct url); @@ -854,19 +854,19 @@ url_parse (const char *url, int *error) if (path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what - url_string might return different from URL_ENCODED, rebuild - u->url using url_string. */ + url_string might return different from URL_ENCODED, rebuild + u->url using url_string. */ u->url = url_string (u, false); if (url_encoded != url) - xfree ((char *) url_encoded); + xfree ((char *) url_encoded); } else { if (url_encoded == url) - u->url = xstrdup (url); + u->url = xstrdup (url); else - u->url = url_encoded; + u->url = url_encoded; } return u; @@ -956,14 +956,14 @@ full_path_length (const struct url *url) static void full_path_write (const struct url *url, char *where) { -#define FROB(el, chr) do { \ - char *f_el = url->el; \ - if (f_el) { \ - int l = strlen (f_el); \ - *where++ = chr; \ - memcpy (where, f_el, l); \ - where += l; \ - } \ +#define FROB(el, chr) do { \ + char *f_el = url->el; \ + if (f_el) { \ + int l = strlen (f_el); \ + *where++ = chr; \ + memcpy (where, f_el, l); \ + where += l; \ + } \ } while (0) FROB (path, '/'); @@ -998,17 +998,17 @@ unescape_single_char (char *str, char chr) { const char c1 = XNUM_TO_DIGIT (chr >> 4); const char c2 = XNUM_TO_DIGIT (chr & 0xf); - char *h = str; /* hare */ - char *t = str; /* tortoise */ + char *h = str; /* hare 
*/ + char *t = str; /* tortoise */ for (; *h; h++, t++) { if (h[0] == '%' && h[1] == c1 && h[2] == c2) - { - *t = chr; - h += 2; - } + { + *t = chr; + h += 2; + } else - *t = *h; + *t = *h; } *t = '\0'; } @@ -1136,27 +1136,27 @@ mkalldirs (const char *path) if ((stat (t, &st) == 0)) { if (S_ISDIR (st.st_mode)) - { - xfree (t); - return 0; - } + { + xfree (t); + return 0; + } else - { - /* If the dir exists as a file name, remove it first. This - is *only* for Wget to work with buggy old CERN http - servers. Here is the scenario: When Wget tries to - retrieve a directory without a slash, e.g. - http://foo/bar (bar being a directory), CERN server will - not redirect it too http://foo/bar/ -- it will generate a - directory listing containing links to bar/file1, - bar/file2, etc. Wget will lose because it saves this - HTML listing to a file `bar', so it cannot create the - directory. To work around this, if the file of the same - name exists, we just remove it and create the directory - anyway. */ - DEBUGP (("Removing %s because of directory danger!\n", t)); - unlink (t); - } + { + /* If the dir exists as a file name, remove it first. This + is *only* for Wget to work with buggy old CERN http + servers. Here is the scenario: When Wget tries to + retrieve a directory without a slash, e.g. + http://foo/bar (bar being a directory), CERN server will + not redirect it too http://foo/bar/ -- it will generate a + directory listing containing links to bar/file1, + bar/file2, etc. Wget will lose because it saves this + HTML listing to a file `bar', so it cannot create the + directory. To work around this, if the file of the same + name exists, we just remove it and create the directory + anyway. */ + DEBUGP (("Removing %s because of directory danger!\n", t)); + unlink (t); + } } res = make_directory (t); if (res != 0) @@ -1185,9 +1185,9 @@ struct growable { the current TAIL position. If necessary, this will grow the string and update its allocated size. 
If the string is already large enough to take TAIL+APPEND_COUNT characters, this does nothing. */ -#define GROW(g, append_size) do { \ - struct growable *G_ = g; \ - DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ +#define GROW(g, append_size) do { \ + struct growable *G_ = g; \ + DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ } while (0) /* Return the tail position of the string. */ @@ -1220,9 +1220,9 @@ append_char (char ch, struct growable *dest) } enum { - filechr_not_unix = 1, /* unusable on Unix, / and \0 */ - filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ - filechr_control = 4 /* a control character, e.g. 0-31 */ + filechr_not_unix = 1, /* unusable on Unix, / and \0 */ + filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ + filechr_control = 4 /* a control character, e.g. 0-31 */ }; #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask)) @@ -1297,7 +1297,7 @@ UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */ static void append_uri_pathel (const char *b, const char *e, bool escaped, - struct growable *dest) + struct growable *dest) { const char *p; int quoted, outlen; @@ -1344,24 +1344,24 @@ append_uri_pathel (const char *b, const char *e, bool escaped, if (!quoted) { /* If there's nothing to quote, we can simply append the string - without processing it again. */ + without processing it again. 
*/ memcpy (TAIL (dest), b, outlen); } else { char *q = TAIL (dest); for (p = b; p < e; p++) - { - if (!FILE_CHAR_TEST (*p, mask)) - *q++ = *p; - else - { - unsigned char ch = *p; - *q++ = '%'; - *q++ = XNUM_TO_DIGIT (ch >> 4); - *q++ = XNUM_TO_DIGIT (ch & 0xf); - } - } + { + if (!FILE_CHAR_TEST (*p, mask)) + *q++ = *p; + else + { + unsigned char ch = *p; + *q++ = '%'; + *q++ = XNUM_TO_DIGIT (ch >> 4); + *q++ = XNUM_TO_DIGIT (ch & 0xf); + } + } assert (q - TAIL (dest) == outlen); } @@ -1378,7 +1378,7 @@ append_uri_pathel (const char *b, const char *e, bool escaped, *q = TOUPPER (*q); } } - + TAIL_INCR (dest, outlen); } @@ -1409,13 +1409,13 @@ append_dir_structure (const struct url *u, struct growable *dest) for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1) { if (cut-- > 0) - continue; + continue; if (pathel == next) - /* Ignore empty pathels. */ - continue; + /* Ignore empty pathels. */ + continue; if (dest->tail) - append_char ('/', dest); + append_char ('/', dest); append_uri_pathel (pathel, next, true, dest); } } @@ -1426,7 +1426,7 @@ append_dir_structure (const struct url *u, struct growable *dest) char * url_file_name (const struct url *u) { - struct growable fnres; /* stands for "file name result" */ + struct growable fnres; /* stands for "file name result" */ const char *u_file, *u_query; char *fname, *unique; @@ -1445,30 +1445,30 @@ url_file_name (const struct url *u) if (opt.dirstruct) { if (opt.protocol_directories) - { - if (fnres.tail) - append_char ('/', &fnres); - append_string (supported_schemes[u->scheme].name, &fnres); - } + { + if (fnres.tail) + append_char ('/', &fnres); + append_string (supported_schemes[u->scheme].name, &fnres); + } if (opt.add_hostdir) - { - if (fnres.tail) - append_char ('/', &fnres); - if (0 != strcmp (u->host, "..")) - append_string (u->host, &fnres); - else - /* Host name can come from the network; malicious DNS may - allow ".." to be resolved, causing us to write to - "../". Defang such host names. 
*/ - append_string ("%2E%2E", &fnres); - if (u->port != scheme_default_port (u->scheme)) - { - char portstr[24]; - number_to_string (portstr, u->port); - append_char (FN_PORT_SEP, &fnres); - append_string (portstr, &fnres); - } - } + { + if (fnres.tail) + append_char ('/', &fnres); + if (0 != strcmp (u->host, "..")) + append_string (u->host, &fnres); + else + /* Host name can come from the network; malicious DNS may + allow ".." to be resolved, causing us to write to + "../". Defang such host names. */ + append_string ("%2E%2E", &fnres); + if (u->port != scheme_default_port (u->scheme)) + { + char portstr[24]; + number_to_string (portstr, u->port); + append_char (FN_PORT_SEP, &fnres); + append_string (portstr, &fnres); + } + } append_dir_structure (u, &fnres); } @@ -1528,8 +1528,8 @@ url_file_name (const struct url *u) static bool path_simplify (char *path) { - char *h = path; /* hare */ - char *t = path; /* tortoise */ + char *h = path; /* hare */ + char *t = path; /* tortoise */ char *end = strchr (path, '\0'); while (h < end) @@ -1537,45 +1537,45 @@ path_simplify (char *path) /* Hare should be at the beginning of a path element. */ if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) - { - /* Ignore "./". */ - h += 2; - } + { + /* Ignore "./". */ + h += 2; + } else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) - { - /* Handle "../" by retreating the tortoise by one path - element -- but not past beggining. */ - if (t > path) - { - /* Move backwards until T hits the beginning of the - previous path element or the beginning of path. */ - for (--t; t > path && t[-1] != '/'; t--) - ; - } - h += 3; - } + { + /* Handle "../" by retreating the tortoise by one path + element -- but not past beggining. */ + if (t > path) + { + /* Move backwards until T hits the beginning of the + previous path element or the beginning of path. */ + for (--t; t > path && t[-1] != '/'; t--) + ; + } + h += 3; + } else - { - /* A regular path element. 
If H hasn't advanced past T, - simply skip to the next path element. Otherwise, copy - the path element until the next slash. */ - if (t == h) - { - /* Skip the path element, including the slash. */ - while (h < end && *h != '/') - t++, h++; - if (h < end) - t++, h++; - } - else - { - /* Copy the path element, including the final slash. */ - while (h < end && *h != '/') - *t++ = *h++; - if (h < end) - *t++ = *h++; - } - } + { + /* A regular path element. If H hasn't advanced past T, + simply skip to the next path element. Otherwise, copy + the path element until the next slash. */ + if (t == h) + { + /* Skip the path element, including the slash. */ + while (h < end && *h != '/') + t++, h++; + if (h < end) + t++, h++; + } + else + { + /* Copy the path element, including the final slash. */ + while (h < end && *h != '/') + *t++ = *h++; + if (h < end) + *t++ = *h++; + } + } } if (t != h) @@ -1594,7 +1594,7 @@ path_end (const char *url) enum url_scheme scheme = url_scheme (url); const char *seps; if (scheme == SCHEME_INVALID) - scheme = SCHEME_HTTP; /* use http semantics for rel links */ + scheme = SCHEME_HTTP; /* use http semantics for rel links */ /* +2 to ignore the first two separators ':' and '/' */ seps = init_seps (scheme) + 2; return strpbrk_or_eos (url, seps); @@ -1638,7 +1638,7 @@ uri_merge (const char *base, const char *link) else if (*link == '?') { /* LINK points to the same location, but changes the query - string. Examples: */ + string. 
Examples: */ /* uri_merge("path", "?new") -> "path?new" */ /* uri_merge("path?foo", "?new") -> "path?new" */ /* uri_merge("path?foo#bar", "?new") -> "path?new" */ @@ -1658,7 +1658,7 @@ uri_merge (const char *base, const char *link) int baselength; const char *end1 = strchr (base, '#'); if (!end1) - end1 = base + strlen (base); + end1 = base + strlen (base); baselength = end1 - base; merge = xmalloc (baselength + linklength + 1); memcpy (merge, base, baselength); @@ -1668,8 +1668,8 @@ uri_merge (const char *base, const char *link) else if (*link == '/' && *(link + 1) == '/') { /* LINK begins with "//" and so is a net path: we need to - replace everything after (and including) the double slash - with LINK. */ + replace everything after (and including) the double slash + with LINK. */ /* uri_merge("foo", "//new/bar") -> "//new/bar" */ /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ @@ -1682,112 +1682,112 @@ uri_merge (const char *base, const char *link) /* Look for first slash. */ slash = memchr (base, '/', end - base); /* If found slash and it is a double slash, then replace - from this point, else default to replacing from the - beginning. */ + from this point, else default to replacing from the + beginning. */ if (slash && *(slash + 1) == '/') - start_insert = slash; + start_insert = slash; else - start_insert = base; + start_insert = base; span = start_insert - base; merge = xmalloc (span + linklength + 1); if (span) - memcpy (merge, base, span); + memcpy (merge, base, span); memcpy (merge + span, link, linklength); merge[span + linklength] = '\0'; } else if (*link == '/') { /* LINK is an absolute path: we need to replace everything - after (and including) the FIRST slash with LINK. + after (and including) the FIRST slash with LINK. - So, if BASE is "http://host/whatever/foo/bar", and LINK is - "/qux/xyzzy", our result should be - "http://host/qux/xyzzy". 
*/ + So, if BASE is "http://host/whatever/foo/bar", and LINK is + "/qux/xyzzy", our result should be + "http://host/qux/xyzzy". */ int span; const char *slash; const char *start_insert = NULL; /* for gcc to shut up. */ const char *pos = base; bool seen_slash_slash = false; /* We're looking for the first slash, but want to ignore - double slash. */ + double slash. */ again: slash = memchr (pos, '/', end - pos); if (slash && !seen_slash_slash) - if (*(slash + 1) == '/') - { - pos = slash + 2; - seen_slash_slash = true; - goto again; - } + if (*(slash + 1) == '/') + { + pos = slash + 2; + seen_slash_slash = true; + goto again; + } /* At this point, SLASH is the location of the first / after - "//", or the first slash altogether. START_INSERT is the - pointer to the location where LINK will be inserted. When - examining the last two examples, keep in mind that LINK - begins with '/'. */ + "//", or the first slash altogether. START_INSERT is the + pointer to the location where LINK will be inserted. When + examining the last two examples, keep in mind that LINK + begins with '/'. 
*/ if (!slash && !seen_slash_slash) - /* example: "foo" */ - /* ^ */ - start_insert = base; + /* example: "foo" */ + /* ^ */ + start_insert = base; else if (!slash && seen_slash_slash) - /* example: "http://foo" */ - /* ^ */ - start_insert = end; + /* example: "http://foo" */ + /* ^ */ + start_insert = end; else if (slash && !seen_slash_slash) - /* example: "foo/bar" */ - /* ^ */ - start_insert = base; + /* example: "foo/bar" */ + /* ^ */ + start_insert = base; else if (slash && seen_slash_slash) - /* example: "http://something/" */ - /* ^ */ - start_insert = slash; + /* example: "http://something/" */ + /* ^ */ + start_insert = slash; span = start_insert - base; merge = xmalloc (span + linklength + 1); if (span) - memcpy (merge, base, span); + memcpy (merge, base, span); memcpy (merge + span, link, linklength); merge[span + linklength] = '\0'; } else { /* LINK is a relative URL: we need to replace everything - after last slash (possibly empty) with LINK. + after last slash (possibly empty) with LINK. - So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", - our result should be "whatever/foo/qux/xyzzy". */ + So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", + our result should be "whatever/foo/qux/xyzzy". */ bool need_explicit_slash = false; int span; const char *start_insert; const char *last_slash = find_last_char (base, end, '/'); if (!last_slash) - { - /* No slash found at all. Replace what we have with LINK. */ - start_insert = base; - } + { + /* No slash found at all. Replace what we have with LINK. 
*/ + start_insert = base; + } else if (last_slash && last_slash >= base + 2 - && last_slash[-2] == ':' && last_slash[-1] == '/') - { - /* example: http://host" */ - /* ^ */ - start_insert = end + 1; - need_explicit_slash = true; - } + && last_slash[-2] == ':' && last_slash[-1] == '/') + { + /* example: http://host" */ + /* ^ */ + start_insert = end + 1; + need_explicit_slash = true; + } else - { - /* example: "whatever/foo/bar" */ - /* ^ */ - start_insert = last_slash + 1; - } + { + /* example: "whatever/foo/bar" */ + /* ^ */ + start_insert = last_slash + 1; + } span = start_insert - base; merge = xmalloc (span + linklength + 1); if (span) - memcpy (merge, base, span); + memcpy (merge, base, span); if (need_explicit_slash) - merge[span - 1] = '/'; + merge[span - 1] = '/'; memcpy (merge + span, link, linklength); merge[span + linklength] = '\0'; } @@ -1795,10 +1795,10 @@ uri_merge (const char *base, const char *link) return merge; } -#define APPEND(p, s) do { \ - int len = strlen (s); \ - memcpy (p, s, len); \ - p += len; \ +#define APPEND(p, s) do { \ + int len = strlen (s); \ + memcpy (p, s, len); \ + p += len; \ } while (0) /* Use this instead of password when the actual password is supposed @@ -1834,12 +1834,12 @@ url_string (const struct url *url, bool hide_password) { quoted_user = url_escape_allow_passthrough (url->user); if (url->passwd) - { - if (hide_password) - quoted_passwd = HIDDEN_PASSWORD; - else - quoted_passwd = url_escape_allow_passthrough (url->passwd); - } + { + if (hide_password) + quoted_passwd = HIDDEN_PASSWORD; + else + quoted_passwd = url_escape_allow_passthrough (url->passwd); + } } /* In the unlikely event that the host name contains non-printable @@ -1854,17 +1854,17 @@ url_string (const struct url *url, bool hide_password) brackets_around_host = strchr (quoted_host, ':') != NULL; size = (strlen (scheme_str) - + strlen (quoted_host) - + (brackets_around_host ? 2 : 0) - + fplen - + 1); + + strlen (quoted_host) + + (brackets_around_host ? 
2 : 0) + + fplen + + 1); if (url->port != scheme_port) size += 1 + numdigit (url->port); if (quoted_user) { size += 1 + strlen (quoted_user); if (quoted_passwd) - size += 1 + strlen (quoted_passwd); + size += 1 + strlen (quoted_passwd); } p = result = xmalloc (size); @@ -1874,10 +1874,10 @@ url_string (const struct url *url, bool hide_password) { APPEND (p, quoted_user); if (quoted_passwd) - { - *p++ = ':'; - APPEND (p, quoted_passwd); - } + { + *p++ = ':'; + APPEND (p, quoted_passwd); + } *p++ = '@'; } @@ -1926,6 +1926,64 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b) return false; } +static int +getchar_from_escaped_string (const char *str, char *c) +{ + const char *p = str; + + assert (str && *str); + assert (c); + + if (p[0] == '%') + { + if (p[1] == 0) + return 0; /* error: invalid string */ + + if (p[1] == '%') + { + *c = '%'; + return 1; + } + else + { + if (p[2] == 0) + return 0; /* error: invalid string */ + + *c = X2DIGITS_TO_NUM (p[1], p[2]); + + return 3; + } + } + else + { + *c = p[0]; + } + + return 1; +} + +bool +are_urls_equal (const char *u1, const char *u2) +{ + const char *p, *q; + int pp, qq; + char ch1, ch2; + + p = u1; + q = u2; + + while (*p + && (pp = getchar_from_escaped_string (p, &ch1)) + && (qq = getchar_from_escaped_string (q, &ch2)) + && (TOLOWER(ch1) == TOLOWER(ch2))) + { + p += pp; + q += qq; + } + + return (*p == 0 && *q == 0 ? true : false); +} + #if 0 /* Debugging and testing support for path_simplify. 
*/ @@ -1948,16 +2006,16 @@ run_test (char *test, char *expected_result, bool expected_change) if (0 != strcmp (test_copy, expected_result)) { printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n", - test, expected_result, test_copy); + test, expected_result, test_copy); } if (modified != expected_change) { if (expected_change) - printf ("Expected modification with path_simplify(\"%s\").\n", - test); + printf ("Expected modification with path_simplify(\"%s\").\n", + test); else - printf ("Expected no modification with path_simplify(\"%s\").\n", - test); + printf ("Expected no modification with path_simplify(\"%s\").\n", + test); } xfree (test_copy); } @@ -1969,28 +2027,28 @@ test_path_simplify (void) char *test, *result; bool should_modify; } tests[] = { - { "", "", false }, - { ".", "", true }, - { "./", "", true }, - { "..", "", true }, - { "../", "", true }, - { "foo", "foo", false }, - { "foo/bar", "foo/bar", false }, - { "foo///bar", "foo///bar", false }, - { "foo/.", "foo/", true }, - { "foo/./", "foo/", true }, - { "foo./", "foo./", false }, - { "foo/../bar", "bar", true }, - { "foo/../bar/", "bar/", true }, - { "foo/bar/..", "foo/", true }, - { "foo/bar/../x", "foo/x", true }, - { "foo/bar/../x/", "foo/x/", true }, - { "foo/..", "", true }, - { "foo/../..", "", true }, - { "foo/../../..", "", true }, - { "foo/../../bar/../../baz", "baz", true }, - { "a/b/../../c", "c", true }, - { "./a/../b", "b", true } + { "", "", false }, + { ".", "", true }, + { "./", "", true }, + { "..", "", true }, + { "../", "", true }, + { "foo", "foo", false }, + { "foo/bar", "foo/bar", false }, + { "foo///bar", "foo///bar", false }, + { "foo/.", "foo/", true }, + { "foo/./", "foo/", true }, + { "foo./", "foo./", false }, + { "foo/../bar", "bar", true }, + { "foo/../bar/", "bar/", true }, + { "foo/bar/..", "foo/", true }, + { "foo/bar/../x", "foo/x", true }, + { "foo/bar/../x/", "foo/x/", true }, + { "foo/..", "", true }, + { "foo/../..", "", true }, + { 
"foo/../../..", "", true }, + { "foo/../../bar/../../baz", "baz", true }, + { "a/b/../../c", "c", true }, + { "./a/../b", "b", true } }; int i; @@ -2036,5 +2094,33 @@ test_append_uri_pathel() return NULL; } +const char* +test_are_urls_equal() +{ + int i; + struct { + char *url1; + char *url2; + bool expected_result; + } test_array[] = { + { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true }, + { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false }, + { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false }, + { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true }, + }; + + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) + { + mu_assert ("test_are_urls_equal: wrong result", + are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result); + } + + return NULL; +} + #endif /* TESTING */ +/* + * vim: et ts=2 sw=2 + */ + diff --git a/src/url.h b/src/url.h index d907efaa..9f5834ec 100644 --- a/src/url.h +++ b/src/url.h @@ -97,4 +97,6 @@ int mkalldirs (const char *); char *rewrite_shorthand_url (const char *); bool schemes_are_similar_p (enum url_scheme a, enum url_scheme b); +bool are_urls_equal (const char *u1, const char *u2); + #endif /* URL_H */