1999-12-02 15:42:23 +08:00
|
|
|
|
/* Handling of recursive HTTP retrieving.
|
2001-11-25 11:10:34 +08:00
|
|
|
|
Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-05-28 03:35:15 +08:00
|
|
|
|
This file is part of GNU Wget.
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-05-28 03:35:15 +08:00
|
|
|
|
GNU Wget is free software; you can redistribute it and/or modify
|
1999-12-02 15:42:23 +08:00
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
2001-05-28 03:35:15 +08:00
|
|
|
|
GNU Wget is distributed in the hope that it will be useful,
|
1999-12-02 15:42:23 +08:00
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
2001-05-28 03:35:15 +08:00
|
|
|
|
along with Wget; if not, write to the Free Software
|
1999-12-02 15:42:23 +08:00
|
|
|
|
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
|
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
|
# include <string.h>
|
|
|
|
|
#else
|
|
|
|
|
# include <strings.h>
|
|
|
|
|
#endif /* HAVE_STRING_H */
|
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
|
# include <unistd.h>
|
|
|
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
|
|
|
|
|
|
#include "wget.h"
|
|
|
|
|
#include "url.h"
|
|
|
|
|
#include "recur.h"
|
|
|
|
|
#include "utils.h"
|
|
|
|
|
#include "retr.h"
|
|
|
|
|
#include "ftp.h"
|
|
|
|
|
#include "fnmatch.h"
|
|
|
|
|
#include "host.h"
|
2000-11-20 04:50:10 +08:00
|
|
|
|
#include "hash.h"
|
2001-11-18 10:17:30 +08:00
|
|
|
|
#include "res.h"
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2000-12-18 02:12:02 +08:00
|
|
|
|
#ifndef errno
|
|
|
|
|
extern int errno;
|
|
|
|
|
#endif
|
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
extern char *version_string;
|
|
|
|
|
|
2000-11-20 04:50:10 +08:00
|
|
|
|
static struct hash_table *dl_file_url_map;
|
|
|
|
|
static struct hash_table *dl_url_file_map;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-03-31 10:05:54 +08:00
|
|
|
|
/* List of HTML files downloaded in this Wget run. Used for link
|
2001-11-25 11:10:34 +08:00
|
|
|
|
conversion after Wget is done. This list should only be traversed
|
|
|
|
|
in order. If you need to check whether a file has been downloaded,
|
|
|
|
|
use a hash table, e.g. dl_file_url_map. */
|
2001-03-31 10:05:54 +08:00
|
|
|
|
static slist *downloaded_html_files;
|
2001-12-05 05:03:35 +08:00
|
|
|
|
|
|
|
|
|
static void register_delete_file PARAMS ((const char *));
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
/* Functions for maintaining the URL queue. */
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* One pending entry of the URL queue.  Entries are chained into a
   singly-linked list through NEXT.  */
struct queue_element
{
  const char *url;		/* URL to be retrieved */
  const char *referer;		/* document that linked to URL; may be NULL */
  int depth;			/* depth value recorded when URL was queued */
  struct queue_element *next;	/* entry queued after this one, or NULL */
};
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* A FIFO of queue_element entries, plus bookkeeping counters that are
   reported in the debug output.  */
struct url_queue
{
  struct queue_element *head;	/* oldest entry; the next one dequeued */
  struct queue_element *tail;	/* newest entry; new ones are linked after it */
  int count;			/* number of entries currently queued */
  int maxcount;			/* largest value COUNT has reached so far */
};
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Create a URL queue. */
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
static struct url_queue *
|
|
|
|
|
url_queue_new (void)
|
|
|
|
|
{
|
|
|
|
|
struct url_queue *queue = xmalloc (sizeof (*queue));
|
|
|
|
|
memset (queue, '\0', sizeof (*queue));
|
|
|
|
|
return queue;
|
|
|
|
|
}
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Release the storage occupied by QUEUE itself.  Entries that are
   still linked into the queue are not freed here, so the caller is
   expected to have dequeued them beforehand.  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
|
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Enqueue a URL in the queue. The queue is FIFO: the items will be
|
|
|
|
|
retrieved ("dequeued") from the queue in the order they were placed
|
|
|
|
|
into it. */
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
url_enqueue (struct url_queue *queue,
|
|
|
|
|
const char *url, const char *referer, int depth)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
struct queue_element *qel = xmalloc (sizeof (*qel));
|
|
|
|
|
qel->url = url;
|
|
|
|
|
qel->referer = referer;
|
|
|
|
|
qel->depth = depth;
|
|
|
|
|
qel->next = NULL;
|
|
|
|
|
|
|
|
|
|
++queue->count;
|
|
|
|
|
if (queue->count > queue->maxcount)
|
|
|
|
|
queue->maxcount = queue->count;
|
|
|
|
|
|
|
|
|
|
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
|
|
|
|
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
|
|
|
|
|
|
|
|
|
if (queue->tail)
|
|
|
|
|
queue->tail->next = qel;
|
|
|
|
|
queue->tail = qel;
|
|
|
|
|
|
|
|
|
|
if (!queue->head)
|
|
|
|
|
queue->head = queue->tail;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Take a URL out of the queue. Return 1 if this operation succeeded,
|
|
|
|
|
or 0 if the queue is empty. */
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
url_dequeue (struct url_queue *queue,
|
|
|
|
|
const char **url, const char **referer, int *depth)
|
|
|
|
|
{
|
|
|
|
|
struct queue_element *qel = queue->head;
|
|
|
|
|
|
|
|
|
|
if (!qel)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
queue->head = queue->head->next;
|
|
|
|
|
if (!queue->head)
|
|
|
|
|
queue->tail = NULL;
|
|
|
|
|
|
|
|
|
|
*url = qel->url;
|
|
|
|
|
*referer = qel->referer;
|
|
|
|
|
*depth = qel->depth;
|
|
|
|
|
|
|
|
|
|
--queue->count;
|
|
|
|
|
|
|
|
|
|
DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
|
|
|
|
|
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
|
|
|
|
|
|
|
|
|
xfree (qel);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
2001-11-26 05:03:30 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* Forward declarations of the two predicates retrieve_tree uses to
   decide what gets enqueued; both are defined after it.  */
static int download_child_p PARAMS ((const struct urlpos *, struct url *,
				     int, struct url *,
				     struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *,
				       int, struct url *,
				       struct hash_table *));
|
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
/* Retrieve a part of the web beginning with START_URL. This used to
|
|
|
|
|
be called "recursive retrieval", because the old function was
|
|
|
|
|
recursive and implemented depth-first search. retrieve_tree on the
|
|
|
|
|
other hand implements breadth-search traversal of the tree, which
|
|
|
|
|
results in much nicer ordering of downloads.
|
|
|
|
|
|
|
|
|
|
The algorithm this function uses is simple:
|
|
|
|
|
|
|
|
|
|
1. put START_URL in the queue.
|
|
|
|
|
2. while there are URLs in the queue:
|
|
|
|
|
|
|
|
|
|
3. get next URL from the queue.
|
|
|
|
|
4. download it.
|
|
|
|
|
5. if the URL is HTML and its depth does not exceed maximum depth,
|
|
|
|
|
get the list of URLs embedded therein.
|
|
|
|
|
6. for each of those URLs do the following:
|
|
|
|
|
|
|
|
|
|
7. if the URL is not one of those downloaded before, and if it
|
|
|
|
|
satisfies the criteria specified by the various command-line
|
|
|
|
|
options, add it to the queue. */
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
|
|
|
|
uerr_t
|
2001-11-25 11:10:34 +08:00
|
|
|
|
retrieve_tree (const char *start_url)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
uerr_t status = RETROK;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* The queue of URLs we need to load. */
|
|
|
|
|
struct url_queue *queue = url_queue_new ();
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-26 05:03:30 +08:00
|
|
|
|
/* The URLs we do not wish to enqueue, because they are already in
|
|
|
|
|
the queue, but haven't been downloaded yet. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
struct hash_table *blacklist = make_string_hash_table (0);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* We'll need various components of this, so better get it over with
|
|
|
|
|
now. */
|
|
|
|
|
struct url *start_url_parsed = url_parse (start_url, NULL);
|
2000-11-20 04:50:10 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
url_enqueue (queue, xstrdup (start_url), NULL, 0);
|
|
|
|
|
string_set_add (blacklist, start_url);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
while (1)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
int descend = 0;
|
|
|
|
|
char *url, *referer, *file = NULL;
|
|
|
|
|
int depth;
|
|
|
|
|
boolean dash_p_leaf_HTML = FALSE;
|
|
|
|
|
|
2000-11-02 07:17:31 +08:00
|
|
|
|
if (downloaded_exceeds_quota ())
|
1999-12-02 15:42:23 +08:00
|
|
|
|
break;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
if (status == FWRITEERR)
|
|
|
|
|
break;
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* Get the next URL from the queue... */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
if (!url_dequeue (queue,
|
|
|
|
|
(const char **)&url, (const char **)&referer,
|
|
|
|
|
&depth))
|
|
|
|
|
break;
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* ...and download it. Note that this download is in most cases
|
|
|
|
|
unconditional, as download_child_p already makes sure a file
|
|
|
|
|
doesn't get enqueued twice -- and yet this check is here, and
|
|
|
|
|
not in download_child_p. This is so that if you run `wget -r
|
|
|
|
|
URL1 URL2', and a random URL is encountered once under URL1
|
|
|
|
|
and again under URL2, but at a different (possibly smaller)
|
|
|
|
|
depth, we want the URL's children to be taken into account
|
|
|
|
|
the second time. */
|
|
|
|
|
if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
|
|
|
|
|
url, (char *)hash_table_get (dl_url_file_map, url)));
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
int dt = 0;
|
|
|
|
|
char *redirected = NULL;
|
|
|
|
|
int oldrec = opt.recursive;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
opt.recursive = 0;
|
|
|
|
|
status = retrieve_url (url, &file, &redirected, NULL, &dt);
|
|
|
|
|
opt.recursive = oldrec;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
if (file && status == RETROK
|
|
|
|
|
&& (dt & RETROKF) && (dt & TEXTHTML))
|
|
|
|
|
descend = 1;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
if (redirected)
|
|
|
|
|
{
|
|
|
|
|
/* We have been redirected, possibly to another host, or
|
|
|
|
|
different path, or wherever. Check whether we really
|
|
|
|
|
want to follow it. */
|
|
|
|
|
if (descend)
|
|
|
|
|
{
|
|
|
|
|
if (!descend_redirect_p (redirected, url, depth,
|
|
|
|
|
start_url_parsed, blacklist))
|
|
|
|
|
descend = 0;
|
|
|
|
|
else
|
|
|
|
|
/* Make sure that the old pre-redirect form gets
|
|
|
|
|
blacklisted. */
|
|
|
|
|
string_set_add (blacklist, url);
|
|
|
|
|
}
|
2001-11-26 09:11:48 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
xfree (url);
|
|
|
|
|
url = redirected;
|
|
|
|
|
}
|
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
if (descend
|
|
|
|
|
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-12-01 11:06:41 +08:00
|
|
|
|
if (opt.page_requisites
|
|
|
|
|
&& (depth == opt.reclevel || depth == opt.reclevel + 1))
|
|
|
|
|
{
|
|
|
|
|
/* When -p is specified, we are allowed to exceed the
|
|
|
|
|
maximum depth, but only for the "inline" links,
|
|
|
|
|
i.e. those that are needed to display the page.
|
|
|
|
|
Originally this could exceed the depth at most by
|
|
|
|
|
one, but we allow one more level so that the leaf
|
|
|
|
|
pages that contain frames can be loaded
|
|
|
|
|
correctly. */
|
|
|
|
|
dash_p_leaf_HTML = TRUE;
|
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
else
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Either -p wasn't specified or it was and we've
|
2001-12-01 11:06:41 +08:00
|
|
|
|
already spent the two extra (pseudo-)levels that it
|
2001-11-25 11:10:34 +08:00
|
|
|
|
affords us, so we need to bail out. */
|
|
|
|
|
DEBUGP (("Not descending further; at depth %d, max. %d.\n",
|
|
|
|
|
depth, opt.reclevel));
|
|
|
|
|
descend = 0;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
/* If the downloaded document was HTML, parse it and enqueue the
|
|
|
|
|
links it contains. */
|
|
|
|
|
|
|
|
|
|
if (descend)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
int meta_disallow_follow = 0;
|
2001-12-01 05:17:53 +08:00
|
|
|
|
struct urlpos *children
|
|
|
|
|
= get_urls_html (file, url, &meta_disallow_follow);
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
if (opt.use_robots && meta_disallow_follow)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
free_urlpos (children);
|
|
|
|
|
children = NULL;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
if (children)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
struct urlpos *child = children;
|
|
|
|
|
struct url *url_parsed = url_parsed = url_parse (url, NULL);
|
|
|
|
|
assert (url_parsed != NULL);
|
|
|
|
|
|
|
|
|
|
for (; child; child = child->next)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-26 02:40:55 +08:00
|
|
|
|
if (child->ignore_when_downloading)
|
|
|
|
|
continue;
|
2001-12-01 05:17:53 +08:00
|
|
|
|
if (dash_p_leaf_HTML && !child->link_inline_p)
|
|
|
|
|
continue;
|
2001-12-05 05:03:35 +08:00
|
|
|
|
if (download_child_p (child, url_parsed, depth, start_url_parsed,
|
|
|
|
|
blacklist))
|
2001-11-25 11:10:34 +08:00
|
|
|
|
{
|
|
|
|
|
url_enqueue (queue, xstrdup (child->url->url),
|
|
|
|
|
xstrdup (url), depth + 1);
|
|
|
|
|
/* We blacklist the URL we have enqueued, because we
|
|
|
|
|
don't want to enqueue (and hence download) the
|
|
|
|
|
same URL twice. */
|
|
|
|
|
string_set_add (blacklist, child->url->url);
|
|
|
|
|
}
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
url_free (url_parsed);
|
|
|
|
|
free_urlpos (children);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
if (opt.delete_after || (file && !acceptable (file)))
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Either --delete-after was specified, or we loaded this
|
|
|
|
|
otherwise rejected (e.g. by -R) HTML file just so we
|
|
|
|
|
could harvest its hyperlinks -- in either case, delete
|
|
|
|
|
the local file. */
|
|
|
|
|
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
|
|
|
|
|
opt.delete_after ? "--delete-after" :
|
|
|
|
|
"recursive rejection criteria"));
|
|
|
|
|
logprintf (LOG_VERBOSE,
|
2001-11-26 09:11:48 +08:00
|
|
|
|
(opt.delete_after
|
|
|
|
|
? _("Removing %s.\n")
|
2001-11-25 11:10:34 +08:00
|
|
|
|
: _("Removing %s since it should be rejected.\n")),
|
|
|
|
|
file);
|
|
|
|
|
if (unlink (file))
|
|
|
|
|
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
|
2001-12-05 05:03:35 +08:00
|
|
|
|
register_delete_file (file);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
xfree (url);
|
|
|
|
|
FREE_MAYBE (referer);
|
|
|
|
|
FREE_MAYBE (file);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If anything is left of the queue due to a premature exit, free it
|
|
|
|
|
now. */
|
|
|
|
|
{
|
|
|
|
|
char *d1, *d2;
|
|
|
|
|
int d3;
|
|
|
|
|
while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
|
|
|
|
|
{
|
|
|
|
|
xfree (d1);
|
|
|
|
|
FREE_MAYBE (d2);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
url_queue_delete (queue);
|
|
|
|
|
|
|
|
|
|
if (start_url_parsed)
|
|
|
|
|
url_free (start_url_parsed);
|
|
|
|
|
string_set_free (blacklist);
|
|
|
|
|
|
|
|
|
|
if (downloaded_exceeds_quota ())
|
|
|
|
|
return QUOTEXC;
|
|
|
|
|
else if (status == FWRITEERR)
|
|
|
|
|
return FWRITEERR;
|
|
|
|
|
else
|
|
|
|
|
return RETROK;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Based on the context provided by retrieve_tree, decide whether a
|
|
|
|
|
URL is to be descended to. This is only ever called from
|
2001-11-26 05:03:30 +08:00
|
|
|
|
retrieve_tree, but is in a separate function for clarity.
|
|
|
|
|
|
|
|
|
|
The most expensive checks (such as those for robots) are memoized
|
|
|
|
|
by storing these URLs to BLACKLIST. This may or may not help. It
|
|
|
|
|
will help if those URLs are encountered many times. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
static int
|
2001-12-05 05:03:35 +08:00
|
|
|
|
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|
|
|
|
struct url *start_url_parsed, struct hash_table *blacklist)
|
2001-11-25 11:10:34 +08:00
|
|
|
|
{
|
|
|
|
|
struct url *u = upos->url;
|
|
|
|
|
const char *url = u->url;
|
|
|
|
|
|
|
|
|
|
DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
|
|
|
|
|
|
|
|
|
|
if (string_set_contains (blacklist, url))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("Already on the black list.\n"));
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Several things to check for:
|
|
|
|
|
1. if scheme is not http, and we don't load it
|
|
|
|
|
2. check for relative links (if relative_only is set)
|
|
|
|
|
3. check for domain
|
|
|
|
|
4. check for no-parent
|
|
|
|
|
5. check for excludes && includes
|
|
|
|
|
6. check for suffix
|
|
|
|
|
7. check for same host (if spanhost is unset), with possible
|
|
|
|
|
gethostbyname baggage
|
|
|
|
|
8. check for robots.txt
|
|
|
|
|
|
|
|
|
|
Addendum: If the URL is FTP, and it is to be loaded, only the
|
|
|
|
|
domain and suffix settings are "stronger".
|
|
|
|
|
|
|
|
|
|
Note that .html files will get loaded regardless of suffix rules
|
|
|
|
|
(but that is remedied later with unlink) unless the depth equals
|
|
|
|
|
the maximum depth.
|
|
|
|
|
|
|
|
|
|
More time- and memory- consuming tests should be put later on
|
|
|
|
|
the list. */
|
|
|
|
|
|
|
|
|
|
/* 1. Schemes other than HTTP are normally not recursed into. */
|
|
|
|
|
if (u->scheme != SCHEME_HTTP
|
|
|
|
|
&& !(u->scheme == SCHEME_FTP && opt.follow_ftp))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("Not following non-HTTP schemes.\n"));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 2. If it is an absolute link and they are not followed, throw it
|
|
|
|
|
out. */
|
|
|
|
|
if (u->scheme == SCHEME_HTTP)
|
|
|
|
|
if (opt.relative_only && !upos->link_relative_p)
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("It doesn't really look like a relative link.\n"));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 3. If its domain is not to be accepted/looked-up, chuck it
|
|
|
|
|
out. */
|
|
|
|
|
if (!accept_domain (u))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("The domain was not accepted.\n"));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 4. Check for parent directory.
|
|
|
|
|
|
|
|
|
|
If we descended to a different host or changed the scheme, ignore
|
2001-12-01 05:17:53 +08:00
|
|
|
|
opt.no_parent. Also ignore it for documents needed to display
|
|
|
|
|
the parent page when in -p mode. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
if (opt.no_parent
|
2001-11-30 01:04:28 +08:00
|
|
|
|
&& u->scheme == start_url_parsed->scheme
|
|
|
|
|
&& 0 == strcasecmp (u->host, start_url_parsed->host)
|
2001-12-01 05:17:53 +08:00
|
|
|
|
&& u->port == start_url_parsed->port
|
|
|
|
|
&& !(opt.page_requisites && upos->link_inline_p))
|
2001-11-25 11:10:34 +08:00
|
|
|
|
{
|
2001-11-30 01:04:28 +08:00
|
|
|
|
if (!frontcmp (start_url_parsed->dir, u->dir))
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-30 01:04:28 +08:00
|
|
|
|
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
|
|
|
|
|
u->dir, start_url_parsed->dir));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* 5. If the file does not match the acceptance list, or is on the
|
|
|
|
|
rejection list, chuck it out. The same goes for the directory
|
|
|
|
|
exclusion and inclusion lists. */
|
|
|
|
|
if (opt.includes || opt.excludes)
|
|
|
|
|
{
|
|
|
|
|
if (!accdir (u->dir, ALLABS))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* 6. */
|
|
|
|
|
{
|
2001-11-26 05:03:30 +08:00
|
|
|
|
char *suf;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Check for acceptance/rejection rules. We ignore these rules
|
|
|
|
|
for HTML documents because they might lead to other files which
|
|
|
|
|
need to be downloaded. Of course, we don't know which
|
|
|
|
|
documents are HTML before downloading them, so we guess.
|
|
|
|
|
|
|
|
|
|
A file is subject to acceptance/rejection rules if:
|
|
|
|
|
|
|
|
|
|
* u->file is not "" (i.e. it is not a directory)
|
|
|
|
|
and either:
|
|
|
|
|
+ there is no file suffix,
|
|
|
|
|
+ or there is a suffix, but is not "html" or "htm",
|
|
|
|
|
+ both:
|
|
|
|
|
- recursion is not infinite,
|
|
|
|
|
- and we are at its very end. */
|
|
|
|
|
|
|
|
|
|
if (u->file[0] != '\0'
|
|
|
|
|
&& ((suf = suffix (url)) == NULL
|
|
|
|
|
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|
2001-12-01 05:17:53 +08:00
|
|
|
|
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
|
2001-11-25 11:10:34 +08:00
|
|
|
|
{
|
|
|
|
|
if (!acceptable (u->file))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
|
|
|
|
|
url, u->file));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 7. */
|
|
|
|
|
if (u->scheme == parent->scheme)
|
|
|
|
|
if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
|
|
|
|
|
u->host, parent->host));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
goto out;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* 8. */
|
|
|
|
|
if (opt.use_robots && u->scheme == SCHEME_HTTP)
|
|
|
|
|
{
|
|
|
|
|
struct robot_specs *specs = res_get_specs (u->host, u->port);
|
|
|
|
|
if (!specs)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
char *rfile;
|
|
|
|
|
if (res_retrieve_file (url, &rfile))
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
specs = res_parse_from_file (rfile);
|
|
|
|
|
xfree (rfile);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* If we cannot get real specs, at least produce
|
|
|
|
|
dummy ones so that we can register them and stop
|
|
|
|
|
trying to retrieve them. */
|
|
|
|
|
specs = res_parse ("", 0);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
res_register_specs (u->host, u->port, specs);
|
|
|
|
|
}
|
2000-10-24 11:43:47 +08:00
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Now that we have (or don't have) robots.txt specs, we can
|
|
|
|
|
check what they say. */
|
|
|
|
|
if (!res_match_path (specs, u->path))
|
|
|
|
|
{
|
|
|
|
|
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
|
2001-11-26 05:03:30 +08:00
|
|
|
|
string_set_add (blacklist, url);
|
|
|
|
|
goto out;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
/* The URL has passed all the tests. It can be placed in the
|
|
|
|
|
download queue. */
|
|
|
|
|
DEBUGP (("Decided to load it.\n"));
|
|
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
DEBUGP (("Decided NOT to load it.\n"));
|
|
|
|
|
|
|
|
|
|
return 0;
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-26 09:11:48 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* This function determines whether we will consider downloading the
|
|
|
|
|
children of a URL whose download resulted in a redirection,
|
|
|
|
|
possibly to another host, etc. It is needed very rarely, and thus
|
|
|
|
|
it is merely a simple-minded wrapper around download_child_p. */
|
2001-11-26 09:11:48 +08:00
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
descend_redirect_p (const char *redirected, const char *original, int depth,
|
|
|
|
|
struct url *start_url_parsed, struct hash_table *blacklist)
|
|
|
|
|
{
|
|
|
|
|
struct url *orig_parsed, *new_parsed;
|
|
|
|
|
struct urlpos *upos;
|
|
|
|
|
int success;
|
|
|
|
|
|
|
|
|
|
orig_parsed = url_parse (original, NULL);
|
|
|
|
|
assert (orig_parsed != NULL);
|
|
|
|
|
|
|
|
|
|
new_parsed = url_parse (redirected, NULL);
|
|
|
|
|
assert (new_parsed != NULL);
|
|
|
|
|
|
|
|
|
|
upos = xmalloc (sizeof (struct urlpos));
|
|
|
|
|
memset (upos, 0, sizeof (*upos));
|
|
|
|
|
upos->url = new_parsed;
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
success = download_child_p (upos, orig_parsed, depth,
|
|
|
|
|
start_url_parsed, blacklist);
|
2001-11-26 09:11:48 +08:00
|
|
|
|
|
|
|
|
|
url_free (orig_parsed);
|
|
|
|
|
url_free (new_parsed);
|
|
|
|
|
xfree (upos);
|
|
|
|
|
|
|
|
|
|
if (!success)
|
|
|
|
|
DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
|
|
|
|
|
|
|
|
|
|
return success;
|
|
|
|
|
}
|
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* Create dl_file_url_map and dl_url_file_map if they do not exist
   yet.  Wrapped in do ... while (0) so that it behaves as a single
   statement.  */
#define ENSURE_TABLES_EXIST do {				\
  if (dl_file_url_map == NULL)					\
    dl_file_url_map = make_string_hash_table (0);		\
  if (dl_url_file_map == NULL)					\
    dl_url_file_map = make_string_hash_table (0);		\
} while (0)
|
|
|
|
|
|
2001-12-05 11:51:23 +08:00
|
|
|
|
/* Return 1 if S1 and S2 are the same, except for a trailing "/" or
   "/index.html" that only one of them carries; return 0 otherwise.
   The cases in which it returns 1 are (substitute any string for
   "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1

   Identical strings always match, including two empty strings.  If
   exactly one of the strings is empty, the result is 0.  */

static int
match_except_index (const char *s1, const char *s2)
{
  const char *start = s1;
  const char *lng;

  /* Skip the common prefix.  *S1 being non-zero and equal to *S2
     implies that *S2 is non-zero as well.  */
  while (*s1 && *s1 == *s2)
    {
      ++s1;
      ++s2;
    }

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal. */
    return 1;

  if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
    return 0;

  if (s1 == start)
    /* One string is empty and the other is not.  This must be ruled
       out explicitly, or `lng - 1' below would read outside the
       array.  */
    return 0;

  /* LNG is the unmatched remainder of the longer string.  */
  lng = *s1 ? s1 : s2;

  /* foo            */            /* foo/           */
  /* foo/index.html */  /* or */  /* foo/index.html */
  /*    ^           */            /*     ^          */

  if (*lng != '/')
    /* The right-hand case: the common prefix may have ended in the
       slash, so step back to the last common character.  */
    --lng;

  if (lng[0] == '/' && lng[1] == '\0')
    /* foo  */
    /* foo/ */
    return 1;

  return 0 == strcmp (lng, "/index.html");
}
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
static int
|
|
|
|
|
dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
|
|
|
|
|
{
|
|
|
|
|
char *mapping_url = (char *)key;
|
|
|
|
|
char *mapping_file = (char *)value;
|
|
|
|
|
char *file = (char *)arg;
|
|
|
|
|
|
|
|
|
|
if (0 == strcmp (mapping_file, file))
|
|
|
|
|
{
|
|
|
|
|
hash_table_remove (dl_url_file_map, mapping_url);
|
|
|
|
|
xfree (mapping_url);
|
|
|
|
|
xfree (mapping_file);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Continue mapping. */
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Remove all associations from various URLs to FILE from dl_url_file_map. */
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
dissociate_urls_from_file (const char *file)
|
|
|
|
|
{
|
|
|
|
|
hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
|
|
|
|
|
(char *)file);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Register that URL has been successfully downloaded to FILE. This
|
|
|
|
|
is used by the link conversion code to convert references to URLs
|
|
|
|
|
to references to local files. It is also being used to check if a
|
|
|
|
|
URL has already been downloaded. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-03-31 10:05:54 +08:00
|
|
|
|
void
|
|
|
|
|
register_download (const char *url, const char *file)
|
|
|
|
|
{
|
2001-12-05 05:03:35 +08:00
|
|
|
|
char *old_file, *old_url;
|
|
|
|
|
|
|
|
|
|
ENSURE_TABLES_EXIST;
|
|
|
|
|
|
2001-12-05 11:51:23 +08:00
|
|
|
|
/* With some forms of retrieval, it is possible, although not likely
|
|
|
|
|
or particularly desirable. If both are downloaded, the second
|
|
|
|
|
download will override the first one. When that happens,
|
|
|
|
|
dissociate the old file name from the URL. */
|
2001-12-05 05:03:35 +08:00
|
|
|
|
|
|
|
|
|
if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
|
|
|
|
|
{
|
|
|
|
|
if (0 == strcmp (url, old_url))
|
|
|
|
|
/* We have somehow managed to download the same URL twice.
|
|
|
|
|
Nothing to do. */
|
|
|
|
|
return;
|
|
|
|
|
|
2001-12-05 11:51:23 +08:00
|
|
|
|
if (match_except_index (url, old_url)
|
|
|
|
|
&& !hash_table_contains (dl_url_file_map, url))
|
|
|
|
|
/* The two URLs differ only in the "index.html" ending. For
|
|
|
|
|
example, one is "http://www.server.com/", and the other is
|
|
|
|
|
"http://www.server.com/index.html". Don't remove the old
|
|
|
|
|
one, just add the new one as a non-canonical entry. */
|
|
|
|
|
goto url_only;
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
hash_table_remove (dl_file_url_map, file);
|
|
|
|
|
xfree (old_file);
|
|
|
|
|
xfree (old_url);
|
|
|
|
|
|
|
|
|
|
/* Remove all the URLs that point to this file. Yes, there can
|
|
|
|
|
be more than one such URL, because we store redirections as
|
|
|
|
|
multiple entries in dl_url_file_map. For example, if URL1
|
|
|
|
|
redirects to URL2 which gets downloaded to FILE, we map both
|
|
|
|
|
URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map
|
|
|
|
|
only points to URL2.) When another URL gets loaded to FILE,
|
|
|
|
|
we want both URL1 and URL2 dissociated from it.
|
|
|
|
|
|
|
|
|
|
This is a relatively expensive operation because it performs
|
|
|
|
|
a linear search of the whole hash table, but it should be
|
|
|
|
|
called very rarely, only when two URLs resolve to the same
|
|
|
|
|
file name, *and* the "<file>.1" extensions are turned off.
|
|
|
|
|
In other words, almost never. */
|
|
|
|
|
dissociate_urls_from_file (file);
|
|
|
|
|
}
|
|
|
|
|
|
2001-12-14 03:18:31 +08:00
|
|
|
|
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
|
|
|
|
|
|
|
|
|
|
url_only:
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* A URL->FILE mapping is not possible without a FILE->URL mapping.
|
|
|
|
|
If the latter were present, it should have been removed by the
|
2001-12-14 03:18:31 +08:00
|
|
|
|
above `if'. So we could write:
|
|
|
|
|
|
|
|
|
|
assert (!hash_table_contains (dl_url_file_map, url));
|
|
|
|
|
|
|
|
|
|
The above is correct when running in recursive mode where the
|
|
|
|
|
same URL always resolves to the same file. But if you do
|
|
|
|
|
something like:
|
|
|
|
|
|
|
|
|
|
wget URL URL
|
|
|
|
|
|
|
|
|
|
then the first URL will resolve to "FILE", and the other to
|
|
|
|
|
"FILE.1". In that case, FILE.1 will not be found in
|
|
|
|
|
dl_file_url_map, but URL will still point to FILE in
|
|
|
|
|
dl_url_file_map. */
|
|
|
|
|
if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
|
|
|
|
|
{
|
|
|
|
|
hash_table_remove (dl_url_file_map, url);
|
|
|
|
|
xfree (old_url);
|
|
|
|
|
xfree (old_file);
|
|
|
|
|
}
|
2001-12-05 05:03:35 +08:00
|
|
|
|
|
|
|
|
|
hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
|
2001-03-31 10:05:54 +08:00
|
|
|
|
}
|
|
|
|
|
|
2001-11-25 11:10:34 +08:00
|
|
|
|
/* Register that FROM has been redirected to TO. This assumes that TO
|
|
|
|
|
is successfully downloaded and already registered using
|
|
|
|
|
register_download() above. */
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
register_redirection (const char *from, const char *to)
|
|
|
|
|
{
|
|
|
|
|
char *file;
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
ENSURE_TABLES_EXIST;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
file = hash_table_get (dl_url_file_map, to);
|
|
|
|
|
assert (file != NULL);
|
|
|
|
|
if (!hash_table_contains (dl_url_file_map, from))
|
|
|
|
|
hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
|
|
|
|
|
}
|
|
|
|
|
|
2001-12-05 05:03:35 +08:00
|
|
|
|
/* Register that the file has been deleted. */
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
register_delete_file (const char *file)
|
|
|
|
|
{
|
|
|
|
|
char *old_url, *old_file;
|
|
|
|
|
|
|
|
|
|
ENSURE_TABLES_EXIST;
|
|
|
|
|
|
|
|
|
|
if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
hash_table_remove (dl_file_url_map, file);
|
|
|
|
|
xfree (old_file);
|
|
|
|
|
xfree (old_url);
|
|
|
|
|
dissociate_urls_from_file (file);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Register that FILE is an HTML file that has been downloaded. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-03-31 10:05:54 +08:00
|
|
|
|
void
|
|
|
|
|
register_html (const char *url, const char *file)
|
|
|
|
|
{
|
|
|
|
|
if (!opt.convert_links)
|
|
|
|
|
return;
|
|
|
|
|
downloaded_html_files = slist_prepend (downloaded_html_files, file);
|
|
|
|
|
}
|
|
|
|
|
|
2001-11-26 09:11:48 +08:00
|
|
|
|
/* This function is called when the retrieval is done to convert the
|
|
|
|
|
links that have been downloaded. It has to be called at the end of
|
|
|
|
|
the retrieval, because only then does Wget know conclusively which
|
|
|
|
|
URLs have been downloaded, and which not, so it can tell which
|
|
|
|
|
direction to convert to.
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-26 09:11:48 +08:00
|
|
|
|
The "direction" means that the URLs to the files that have been
|
|
|
|
|
downloaded get converted to the relative URL which will point to
|
|
|
|
|
that file. And the other URLs get converted to the remote URL on
|
|
|
|
|
the server.
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-11-26 09:11:48 +08:00
|
|
|
|
All the downloaded HTMLs are kept in downloaded_html_files, and
|
|
|
|
|
downloaded URLs in urls_downloaded. All the information is
|
|
|
|
|
extracted from these two lists. */
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
convert_all_links (void)
|
|
|
|
|
{
|
|
|
|
|
slist *html;
|
2001-11-26 05:03:30 +08:00
|
|
|
|
long msecs;
|
|
|
|
|
int file_count = 0;
|
|
|
|
|
|
2001-12-05 11:51:23 +08:00
|
|
|
|
struct wget_timer *timer = wtimer_new ();
|
|
|
|
|
struct hash_table *seen = make_string_hash_table (0);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
|
2001-03-31 10:05:54 +08:00
|
|
|
|
/* Destructively reverse downloaded_html_files to get it in the right order.
|
2000-11-21 10:06:36 +08:00
|
|
|
|
recursive_retrieve() used slist_prepend() consistently. */
|
2001-03-31 10:05:54 +08:00
|
|
|
|
downloaded_html_files = slist_nreverse (downloaded_html_files);
|
2000-11-21 10:06:36 +08:00
|
|
|
|
|
2001-03-31 10:05:54 +08:00
|
|
|
|
for (html = downloaded_html_files; html; html = html->next)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2001-11-25 11:10:34 +08:00
|
|
|
|
struct urlpos *urls, *cur_url;
|
2000-11-20 04:50:10 +08:00
|
|
|
|
char *url;
|
2001-12-05 11:51:23 +08:00
|
|
|
|
char *file = html->string;
|
|
|
|
|
|
|
|
|
|
/* Guard against duplicates. */
|
|
|
|
|
if (string_set_contains (seen, file))
|
|
|
|
|
continue;
|
|
|
|
|
string_set_add (seen, file);
|
2000-11-20 04:50:10 +08:00
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
/* Determine the URL of the HTML file. get_urls_html will need
|
|
|
|
|
it. */
|
2001-12-05 11:51:23 +08:00
|
|
|
|
url = hash_table_get (dl_file_url_map, file);
|
2001-12-05 05:03:35 +08:00
|
|
|
|
if (!url)
|
|
|
|
|
{
|
2001-12-05 11:51:23 +08:00
|
|
|
|
DEBUGP (("Apparently %s has been removed.\n", file));
|
2001-12-05 05:03:35 +08:00
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2001-12-05 11:51:23 +08:00
|
|
|
|
DEBUGP (("Scanning %s (from %s)\n", file, url));
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
/* Parse the HTML file... */
|
2001-12-05 11:51:23 +08:00
|
|
|
|
urls = get_urls_html (file, url, NULL);
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2000-11-21 10:06:36 +08:00
|
|
|
|
/* We don't respect meta_disallow_follow here because, even if
|
|
|
|
|
the file is not followed, we might still want to convert the
|
|
|
|
|
links that have been followed from other files. */
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2000-11-21 10:06:36 +08:00
|
|
|
|
for (cur_url = urls; cur_url; cur_url = cur_url->next)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2000-11-20 04:50:10 +08:00
|
|
|
|
char *local_name;
|
2001-11-25 11:10:34 +08:00
|
|
|
|
struct url *u = cur_url->url;
|
2000-11-21 10:06:36 +08:00
|
|
|
|
|
2001-11-26 02:40:55 +08:00
|
|
|
|
if (cur_url->link_base_p)
|
|
|
|
|
{
|
|
|
|
|
/* Base references have been resolved by our parser, so
|
|
|
|
|
we turn the base URL into an empty string. (Perhaps
|
|
|
|
|
we should remove the tag entirely?) */
|
|
|
|
|
cur_url->convert = CO_NULLIFY_BASE;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
/* We decide the direction of conversion according to whether
|
|
|
|
|
a URL was downloaded. Downloaded URLs will be converted
|
2000-11-20 04:50:10 +08:00
|
|
|
|
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
|
|
|
|
local_name = hash_table_get (dl_url_file_map, u->url);
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
2001-11-26 02:40:55 +08:00
|
|
|
|
/* Decide on the conversion type. */
|
2000-11-20 04:50:10 +08:00
|
|
|
|
if (local_name)
|
1999-12-02 15:42:23 +08:00
|
|
|
|
{
|
2000-11-21 10:06:36 +08:00
|
|
|
|
/* We've downloaded this URL. Convert it to relative
|
|
|
|
|
form. We do this even if the URL already is in
|
|
|
|
|
relative form, because our directory structure may
|
|
|
|
|
not be identical to that on the server (think `-nd',
|
|
|
|
|
`--cut-dirs', etc.) */
|
|
|
|
|
cur_url->convert = CO_CONVERT_TO_RELATIVE;
|
|
|
|
|
cur_url->local_name = xstrdup (local_name);
|
2001-12-05 11:51:23 +08:00
|
|
|
|
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2000-11-21 10:06:36 +08:00
|
|
|
|
/* We haven't downloaded this URL. If it's not already
|
|
|
|
|
complete (including a full host name), convert it to
|
|
|
|
|
that form, so it can be reached while browsing this
|
|
|
|
|
HTML locally. */
|
|
|
|
|
if (!cur_url->link_complete_p)
|
|
|
|
|
cur_url->convert = CO_CONVERT_TO_COMPLETE;
|
|
|
|
|
cur_url->local_name = NULL;
|
2001-12-05 11:51:23 +08:00
|
|
|
|
DEBUGP (("will convert url %s to complete\n", u->url));
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2001-11-26 05:03:30 +08:00
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
/* Convert the links in the file. */
|
2001-12-05 11:51:23 +08:00
|
|
|
|
convert_links (file, urls);
|
2001-11-26 05:03:30 +08:00
|
|
|
|
++file_count;
|
|
|
|
|
|
1999-12-02 15:42:23 +08:00
|
|
|
|
/* Free the data. */
|
|
|
|
|
free_urlpos (urls);
|
|
|
|
|
}
|
2001-11-26 05:03:30 +08:00
|
|
|
|
|
|
|
|
|
msecs = wtimer_elapsed (timer);
|
|
|
|
|
wtimer_delete (timer);
|
|
|
|
|
logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
|
|
|
|
|
file_count, (double)msecs / 1000);
|
2001-12-05 11:51:23 +08:00
|
|
|
|
|
|
|
|
|
string_set_free (seen);
|
1999-12-02 15:42:23 +08:00
|
|
|
|
}
|
2001-11-25 11:10:34 +08:00
|
|
|
|
|
|
|
|
|
/* Cleanup the data structures associated with recursive retrieving
|
|
|
|
|
(the variables above). */
|
|
|
|
|
void
|
|
|
|
|
recursive_cleanup (void)
|
|
|
|
|
{
|
|
|
|
|
if (dl_file_url_map)
|
|
|
|
|
{
|
|
|
|
|
free_keys_and_values (dl_file_url_map);
|
|
|
|
|
hash_table_destroy (dl_file_url_map);
|
|
|
|
|
dl_file_url_map = NULL;
|
|
|
|
|
}
|
|
|
|
|
if (dl_url_file_map)
|
|
|
|
|
{
|
|
|
|
|
free_keys_and_values (dl_url_file_map);
|
|
|
|
|
hash_table_destroy (dl_url_file_map);
|
|
|
|
|
dl_url_file_map = NULL;
|
|
|
|
|
}
|
|
|
|
|
slist_free (downloaded_html_files);
|
|
|
|
|
downloaded_html_files = NULL;
|
|
|
|
|
}
|