diff --git a/src/ChangeLog b/src/ChangeLog
index e32012db..47d568b4 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,6 +1,10 @@
 2014-11-26  Tim Ruehsen  <tim.ruehsen@gmx.de>
 
-	* src/recur.c: Fix blacklisting of URLs to download
+	* recur.c: Add space after function names
+
+2014-11-26  Tim Ruehsen  <tim.ruehsen@gmx.de>
+
+	* recur.c: Fix blacklisting of URLs to download
 	  Fixes a reported crash and prevents multiple downloads of
 	  the same file in case the URL is escaped in different ways.
 
diff --git a/src/recur.c b/src/recur.c
index 348e9a69..33d5a142 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -160,16 +160,16 @@ url_dequeue (struct url_queue *queue, struct iri **i,
   return true;
 }
 
-static void blacklist_add(struct hash_table *blacklist, const char *url)
+static void blacklist_add (struct hash_table *blacklist, const char *url)
 {
-  char *url_unescaped = xstrdup(url);
+  char *url_unescaped = xstrdup (url);
 
   url_unescape (url_unescaped);
   string_set_add (blacklist, url_unescaped);
   xfree (url_unescaped);
 }
 
-static int blacklist_contains(struct hash_table *blacklist, const char *url)
+static int blacklist_contains (struct hash_table *blacklist, const char *url)
 {
   char *url_unescaped = xstrdup(url);
   int ret;
@@ -241,7 +241,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
      just URL so we enqueue the canonical form of the URL.  */
   url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
                false);
-  blacklist_add(blacklist, start_url_parsed->url);
+  blacklist_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
@@ -332,7 +332,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
           else
             /* Make sure that the old pre-redirect form gets
                blacklisted. */
-            blacklist_add(blacklist, url);
+            blacklist_add (blacklist, url);
         }
 
       xfree (url);
@@ -343,7 +343,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
               xfree (url);
               url = xstrdup (url_parsed->url);
             }
-          url_free(url_parsed);
+          url_free (url_parsed);
         }
 
       if (opt.spider)
@@ -425,7 +425,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                   /* We blacklist the URL we have enqueued, because we
                      don't want to enqueue (and hence download) the same
                      URL twice.  */
-                  blacklist_add(blacklist, child->url->url);
+                  blacklist_add (blacklist, child->url->url);
                 }
             }
 
@@ -497,7 +497,7 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
    URL is to be descended to.  This is only ever called from
    retrieve_tree, but is in a separate function for clarity.
 
-   The most expensive checks (such as those for robots) are memorized
+   The most expensive checks (such as those for robots) are memoized
    by storing these URLs to BLACKLIST.  This may or may not help.  It
    will help if those URLs are encountered many times.  */
 
@@ -512,7 +512,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 
   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 
-  if (blacklist_contains(blacklist, url))
+  if (blacklist_contains (blacklist, url))
     {
       if (opt.spider)
         {
@@ -691,7 +691,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!res_match_path (specs, u->path))
         {
           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
-          blacklist_add(blacklist, url);
+          blacklist_add (blacklist, url);
           goto out;
         }
     }
@@ -734,7 +734,7 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
                               start_url_parsed, blacklist, iri);
 
   if (success)
-    blacklist_add(blacklist, upos->url->url);
+    blacklist_add (blacklist, upos->url->url);
   else
     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 
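
The functional part of this patch is the blacklist_add / blacklist_contains pair: both unescape the URL before touching the blacklist, so differently escaped spellings of the same URL (e.g. "%7Euser" vs. "~user") collapse to one canonical key, and the same file can no longer be enqueued and downloaded twice. The standalone sketch below illustrates that normalize-before-insert idea under stated assumptions; it does not use Wget's internal hash_table / string_set_add / url_unescape API, and percent_decode, seen_add and seen_contains are hypothetical names invented for the example (it also assumes POSIX strdup).

/* Minimal sketch of the normalize-before-blacklist technique, outside
   of Wget.  All names here are illustrative, not Wget's internal API. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Decode %XX escapes in place; malformed escapes are left untouched. */
static void
percent_decode (char *s)
{
  char *w = s;

  for (; *s; s++, w++)
    {
      if (s[0] == '%' && isxdigit ((unsigned char) s[1])
          && isxdigit ((unsigned char) s[2]))
        {
          char hex[3] = { s[1], s[2], '\0' };
          *w = (char) strtol (hex, NULL, 16);
          s += 2;
        }
      else
        *w = *s;
    }
  *w = '\0';
}

/* Toy blacklist: a growable array of canonical URLs, linear lookup. */
struct seen { char **v; size_t n; };

static int
seen_contains (struct seen *b, const char *url)
{
  char *u = strdup (url);
  int found = 0;
  size_t i;

  percent_decode (u);                 /* compare canonical forms only */
  for (i = 0; i < b->n; i++)
    if (strcmp (b->v[i], u) == 0)
      {
        found = 1;
        break;
      }
  free (u);
  return found;
}

static void
seen_add (struct seen *b, const char *url)
{
  char *u = strdup (url);

  percent_decode (u);                 /* store the canonical form */
  b->v = realloc (b->v, (b->n + 1) * sizeof *b->v);
  b->v[b->n++] = u;
}

int
main (void)
{
  struct seen b = { NULL, 0 };

  seen_add (&b, "http://example.com/%7Euser/index.html");
  /* A different spelling of the same resource is found, so a crawler
     using this check would not enqueue it a second time.  */
  printf ("%d\n", seen_contains (&b, "http://example.com/~user/index.html"));
  return 0;
}

Note how both helpers duplicate, decode and then discard the caller's string rather than mutating it, mirroring the xstrdup / url_unescape / xfree sequence in the patch: the queue keeps the original escaped URL for the actual request, while the blacklist only ever sees canonical forms.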