mirror of
https://github.com/mirror/wget.git
synced 2025-01-10 20:30:09 +08:00
b0b1c815c1
- use mmap() to read whole files in core instead of allocating memory and read'ing it. - use a new, more general, HTML parser (html-parse.c) and interface to it from Wget (html-url.c). - respect <meta name=robots content=nofollow> (easy with the new HTML parser). - use hash tables instead of linked lists in places where the lists were used to facilitate mappings. - rewrite the code in host.c to be more readable and faster (hash tables instead of home-grown lists.) - make convert_links properly convert partial URLs to complete ones for those URLs that have *not* been downloaded. - use HTTP persistent connections where available. very simple-minded, caches the last connection to the server. Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
118 lines
3.9 KiB
C
118 lines
3.9 KiB
C
/* Declarations for url.c.
   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */

/* Include guard: the matching #endif is at the end of this header.  */
#ifndef URL_H
#define URL_H

/* Duplicate the string X: if it contains unsafe characters the copy
   comes from encode_string, otherwise from xstrdup.

   X appears twice in the expansion (once as the argument of
   contains_unsafe and once in whichever branch is selected), so do
   not pass an expression with side effects.  */
#define CLEANDUP(x) (contains_unsafe (x) ? encode_string (x) : xstrdup (x))

/* Structure containing info on a URL. */
|
|
struct urlinfo
|
|
{
|
|
char *url; /* Unchanged URL */
|
|
uerr_t proto; /* URL protocol */
|
|
char *host; /* Extracted hostname */
|
|
unsigned short port;
|
|
char ftp_type;
|
|
char *path, *dir, *file, *qstring;
|
|
/* Path, dir, file, and query string
|
|
(properly decoded) */
|
|
char *user, *passwd; /* Username and password */
|
|
struct urlinfo *proxy; /* The exact string to pass to proxy
|
|
server */
|
|
char *referer; /* The source from which the request
|
|
URI was obtained */
|
|
char *local; /* The local filename of the URL
|
|
document */
|
|
};
|
|
|
|
/* Flags attached to a URL.  Every enumerator occupies its own bit
   (0x1, 0x2, 0x4, 0x8).  */
enum uflags
{
  URELATIVE = 0x0001,           /* Is URL relative? */
  UNOPROTO  = 0x0002,           /* Is URL without a protocol? */
  UABS2REL  = 0x0004,           /* Convert absolute to relative? */
  UREL2ABS  = 0x0008            /* Convert relative to absolute? */
};

/* A structure that defines the whereabouts of a URL, i.e. its
|
|
position in an HTML document, etc. */
|
|
typedef struct _urlpos
|
|
{
|
|
char *url; /* URL */
|
|
char *local_name; /* Local file to which it was saved */
|
|
enum uflags flags; /* Various flags */
|
|
int pos, size; /* Relative position in the buffer */
|
|
struct _urlpos *next; /* Next struct in list */
|
|
} urlpos;
|
|
|
|
/* downloaded_file() takes a parameter of this type and returns this
   type.  The enumerators are numbered consecutively from 0 in the
   order listed.  */
typedef enum
{
  /* Return enumerators: */
  FILE_NOT_ALREADY_DOWNLOADED = 0,

  /* Return / parameter enumerators: */
  FILE_DOWNLOADED_NORMALLY,
  FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,

  /* Parameter enumerators: */
  CHECK_FOR_FILE
} downloaded_file_t;

/* Function declarations */
|
|
|
|
int skip_url PARAMS ((const char *));
|
|
|
|
int contains_unsafe PARAMS ((const char *));
|
|
char *encode_string PARAMS ((const char *));
|
|
|
|
struct urlinfo *newurl PARAMS ((void));
|
|
void freeurl PARAMS ((struct urlinfo *, int));
|
|
uerr_t urlproto PARAMS ((const char *));
|
|
int skip_proto PARAMS ((const char *));
|
|
int has_proto PARAMS ((const char *));
|
|
int skip_uname PARAMS ((const char *));
|
|
|
|
uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int));
|
|
char *str_url PARAMS ((const struct urlinfo *, int));
|
|
int url_equal PARAMS ((const char *, const char *));
|
|
|
|
urlpos *get_urls_file PARAMS ((const char *));
|
|
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
|
|
void free_urlpos PARAMS ((urlpos *));
|
|
|
|
char *url_concat PARAMS ((const char *, const char *));
|
|
|
|
void rotate_backups PARAMS ((const char *));
|
|
int mkalldirs PARAMS ((const char *));
|
|
char *url_filename PARAMS ((const struct urlinfo *));
|
|
void opt_url PARAMS ((struct urlinfo *));
|
|
|
|
char *getproxy PARAMS ((uerr_t));
|
|
int no_proxy_match PARAMS ((const char *, const char **));
|
|
|
|
void convert_links PARAMS ((const char *, urlpos *));
|
|
urlpos *add_url PARAMS ((urlpos *, const char *, const char *));
|
|
|
|
downloaded_file_t downloaded_file PARAMS ((downloaded_file_t, const char *));
|
|
|
|
#endif /* URL_H */