[svn] Fix overzealous URL-removal in register_download.

Published in <sxszo4yqq91.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-12-04 19:51:23 -08:00
parent 936b074bd9
commit 8a2ab60263
2 changed files with 91 additions and 18 deletions

View File

@ -1,3 +1,10 @@
2001-12-05 Hrvoje Niksic <hniksic@arsdigita.com>
* recur.c (convert_all_links): Guard against duplicates in
downloaded_html_files.
(register_download): Don't invalidate similar-looking URLs.
(match_except_index): New function.
2001-12-05  Hrvoje Niksic  <hniksic@arsdigita.com>
	* utils.c (path_simplify): Document with test cases.

View File

@ -613,6 +613,60 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
dl_url_file_map = make_string_hash_table (0); \ dl_url_file_map = make_string_hash_table (0); \
} while (0) } while (0)
/* Return 1 if S1 and S2 are the same, except for "/index.html".  The
   five cases in which it returns one are (substitute any substring
   for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */

static int
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring. */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal. */
    return 1;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
    return 0;
  else if (*s1)
    /* S1 is the longer one. */
    lng = s1;
  else
    /* S2 is the longer one. */
    lng = s2;

  /* foo            */   /* foo/           */
  /* foo/index.html */   /* or             */
  /*    ^           */   /* foo/index.html */
                         /*    ^           */

  if (*lng != '/')
    /* The right-hand case. */
    --lng;

  if (*lng == '/' && *(lng + 1) == '\0')
    /* foo  */
    /* foo/ */
    return 1;

  return 0 == strcmp (lng, "/index.html");
}
static int static int
dissociate_urls_from_file_mapper (void *key, void *value, void *arg) dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
{ {
@ -652,14 +706,10 @@ register_download (const char *url, const char *file)
ENSURE_TABLES_EXIST; ENSURE_TABLES_EXIST;
/* With some forms of retrieval, it is possible, although not /* With some forms of retrieval, it is possible, although not likely
likely, for different URLs to resolve to the same file name. For or particularly desirable. If both are downloaded, the second
example, "http://www.server.com/" and download will override the first one. When that happens,
"http://www.server.com/index.html" will both resolve to the same dissociate the old file name from the URL. */
file, "index.html". If both are downloaded, the second download
will override the first one.
If that happens, dissociate the old file name from the URL. */
if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
{ {
@ -668,6 +718,14 @@ register_download (const char *url, const char *file)
Nothing to do. */ Nothing to do. */
return; return;
if (match_except_index (url, old_url)
&& !hash_table_contains (dl_url_file_map, url))
/* The two URLs differ only in the "index.html" ending. For
example, one is "http://www.server.com/", and the other is
"http://www.server.com/index.html". Don't remove the old
one, just add the new one as a non-canonical entry. */
goto url_only;
hash_table_remove (dl_file_url_map, file); hash_table_remove (dl_file_url_map, file);
xfree (old_file); xfree (old_file);
xfree (old_url); xfree (old_url);
@ -694,6 +752,7 @@ register_download (const char *url, const char *file)
assert (!hash_table_contains (dl_url_file_map, url)); assert (!hash_table_contains (dl_url_file_map, url));
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
url_only:
hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
} }
@ -761,11 +820,11 @@ void
convert_all_links (void) convert_all_links (void)
{ {
slist *html; slist *html;
struct wget_timer *timer;
long msecs; long msecs;
int file_count = 0; int file_count = 0;
timer = wtimer_new (); struct wget_timer *timer = wtimer_new ();
struct hash_table *seen = make_string_hash_table (0);
/* Destructively reverse downloaded_html_files to get it in the right order. /* Destructively reverse downloaded_html_files to get it in the right order.
recursive_retrieve() used slist_prepend() consistently. */ recursive_retrieve() used slist_prepend() consistently. */
@ -775,20 +834,26 @@ convert_all_links (void)
{ {
struct urlpos *urls, *cur_url; struct urlpos *urls, *cur_url;
char *url; char *url;
char *file = html->string;
/* Guard against duplicates. */
if (string_set_contains (seen, file))
continue;
string_set_add (seen, file);
/* Determine the URL of the HTML file. get_urls_html will need /* Determine the URL of the HTML file. get_urls_html will need
it. */ it. */
url = hash_table_get (dl_file_url_map, html->string); url = hash_table_get (dl_file_url_map, file);
if (!url) if (!url)
{ {
DEBUGP (("Apparently %s has been removed.\n", html->string)); DEBUGP (("Apparently %s has been removed.\n", file));
continue; continue;
} }
DEBUGP (("Rescanning %s (from %s)\n", html->string, url)); DEBUGP (("Scanning %s (from %s)\n", file, url));
/* Parse the HTML file... */ /* Parse the HTML file... */
urls = get_urls_html (html->string, url, NULL); urls = get_urls_html (file, url, NULL);
/* We don't respect meta_disallow_follow here because, even if /* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the the file is not followed, we might still want to convert the
@ -812,9 +877,6 @@ convert_all_links (void)
a URL was downloaded. Downloaded URLs will be converted a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS. */ ABS2REL, whereas non-downloaded will be converted REL2ABS. */
local_name = hash_table_get (dl_url_file_map, u->url); local_name = hash_table_get (dl_url_file_map, u->url);
if (local_name)
DEBUGP (("%s marked for conversion, local %s\n",
u->url, local_name));
/* Decide on the conversion type. */ /* Decide on the conversion type. */
if (local_name) if (local_name)
@ -826,6 +888,7 @@ convert_all_links (void)
`--cut-dirs', etc.) */ `--cut-dirs', etc.) */
cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->convert = CO_CONVERT_TO_RELATIVE;
cur_url->local_name = xstrdup (local_name); cur_url->local_name = xstrdup (local_name);
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
} }
else else
{ {
@ -836,11 +899,12 @@ convert_all_links (void)
if (!cur_url->link_complete_p) if (!cur_url->link_complete_p)
cur_url->convert = CO_CONVERT_TO_COMPLETE; cur_url->convert = CO_CONVERT_TO_COMPLETE;
cur_url->local_name = NULL; cur_url->local_name = NULL;
DEBUGP (("will convert url %s to complete\n", u->url));
} }
} }
/* Convert the links in the file. */ /* Convert the links in the file. */
convert_links (html->string, urls); convert_links (file, urls);
++file_count; ++file_count;
/* Free the data. */ /* Free the data. */
@ -851,6 +915,8 @@ convert_all_links (void)
wtimer_delete (timer); wtimer_delete (timer);
logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"), logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
file_count, (double)msecs / 1000); file_count, (double)msecs / 1000);
string_set_free (seen);
} }
/* Cleanup the data structures associated with recursive retrieving /* Cleanup the data structures associated with recursive retrieving