mirror of
https://github.com/mirror/wget.git
synced 2024-12-28 05:40:08 +08:00
[svn] A bunch of new features:
- use mmap() to read whole files in core instead of allocating memory and read'ing it. - use a new, more general, HTML parser (html-parse.c) and interface to it from Wget (html-url.c). - respect <meta name=robots content=nofollow> (easy with the new HTML parser). - use hash tables instead of linked lists in places where the lists were used to facilitate mappings. - rewrite the code in host.c to be more readable and faster (hash tables instead of home-grown lists.) - make convert_links properly convert partial URLs to complete ones for those URLs that have *not* been downloaded. - use HTTP persistent connections where available. very simple-minded, caches the last connection to the server. Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
This commit is contained in:
parent
ccf31643ab
commit
b0b1c815c1
@ -1,3 +1,7 @@
|
||||
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* configure.in: Test for MMAP.
|
||||
|
||||
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* windows/config.h.ms: snprintf and vsnprintf exist under Windows.
|
||||
|
12
TODO
12
TODO
@ -49,15 +49,6 @@ changes.
|
||||
* Make `-k' check for files that were downloaded in the past and convert links
|
||||
to them in newly-downloaded documents.
|
||||
|
||||
* -k should convert relative references to absolute if not downloaded.
|
||||
|
||||
* -k should convert "hostless absolute" URLs, like <A HREF="/index.html">.
|
||||
However, Brian McMahon <bm@iucr.org> wants the old incorrect behavior to still
|
||||
be available as an option, as he depends on it to allow mirrors of his site to
|
||||
send CGI queries to his original site, but still get graphics off of the
|
||||
mirror site. Perhaps this would be better dealt with by adding an option to
|
||||
tell -k not to convert certain URL patterns?
|
||||
|
||||
* Add option to clobber existing file names (no `.N' suffixes).
|
||||
|
||||
* Introduce a concept of "boolean" options. For instance, every
|
||||
@ -85,9 +76,6 @@ changes.
|
||||
* Allow size limit to files (perhaps with an option to download oversize files
|
||||
up through the limit or not at all, to get more functionality than [u]limit.
|
||||
|
||||
* Recognize HTML comments correctly. Add more options for handling
|
||||
bogus HTML found all over the 'net.
|
||||
|
||||
* Implement breadth-first retrieval.
|
||||
|
||||
* Download to .in* when mirroring.
|
||||
|
350
configure
vendored
350
configure
vendored
@ -2040,15 +2040,55 @@ EOF
|
||||
|
||||
fi
|
||||
|
||||
for ac_func in strdup strstr strcasecmp strncasecmp
|
||||
for ac_hdr in unistd.h
|
||||
do
|
||||
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
|
||||
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
|
||||
echo "configure:2048: checking for $ac_hdr" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2053 "configure"
|
||||
#include "confdefs.h"
|
||||
#include <$ac_hdr>
|
||||
EOF
|
||||
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
|
||||
{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
|
||||
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
|
||||
if test -z "$ac_err"; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_header_$ac_safe=yes"
|
||||
else
|
||||
echo "$ac_err" >&5
|
||||
echo "configure: failed program was:" >&5
|
||||
cat conftest.$ac_ext >&5
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_header_$ac_safe=no"
|
||||
fi
|
||||
rm -f conftest*
|
||||
fi
|
||||
if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
|
||||
echo "$ac_t""yes" 1>&6
|
||||
ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
|
||||
cat >> confdefs.h <<EOF
|
||||
#define $ac_tr_hdr 1
|
||||
EOF
|
||||
|
||||
else
|
||||
echo "$ac_t""no" 1>&6
|
||||
fi
|
||||
done
|
||||
|
||||
for ac_func in getpagesize
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2047: checking for $ac_func" >&5
|
||||
echo "configure:2087: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2052 "configure"
|
||||
#line 2092 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2071,7 +2111,233 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
echo "configure: failed program was:" >&5
|
||||
cat conftest.$ac_ext >&5
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=no"
|
||||
fi
|
||||
rm -f conftest*
|
||||
fi
|
||||
|
||||
if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then
|
||||
echo "$ac_t""yes" 1>&6
|
||||
ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
|
||||
cat >> confdefs.h <<EOF
|
||||
#define $ac_tr_func 1
|
||||
EOF
|
||||
|
||||
else
|
||||
echo "$ac_t""no" 1>&6
|
||||
fi
|
||||
done
|
||||
|
||||
echo $ac_n "checking for working mmap""... $ac_c" 1>&6
|
||||
echo "configure:2140: checking for working mmap" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
if test "$cross_compiling" = yes; then
|
||||
ac_cv_func_mmap_fixed_mapped=no
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2148 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
/* Thanks to Mike Haertel and Jim Avera for this test.
|
||||
Here is a matrix of mmap possibilities:
|
||||
mmap private not fixed
|
||||
mmap private fixed at somewhere currently unmapped
|
||||
mmap private fixed at somewhere already mapped
|
||||
mmap shared not fixed
|
||||
mmap shared fixed at somewhere currently unmapped
|
||||
mmap shared fixed at somewhere already mapped
|
||||
For private mappings, we should verify that changes cannot be read()
|
||||
back from the file, nor mmap's back from the file at a different
|
||||
address. (There have been systems where private was not correctly
|
||||
implemented like the infamous i386 svr4.0, and systems where the
|
||||
VM page cache was not coherent with the filesystem buffer cache
|
||||
like early versions of FreeBSD and possibly contemporary NetBSD.)
|
||||
For shared mappings, we should conversely verify that changes get
|
||||
propogated back to all the places they're supposed to be.
|
||||
|
||||
Grep wants private fixed already mapped.
|
||||
The main things grep needs to know about mmap are:
|
||||
* does it exist and is it safe to write into the mmap'd area
|
||||
* how to use it (BSD variants) */
|
||||
#include <sys/types.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
/* This mess was copied from the GNU getpagesize.h. */
|
||||
#ifndef HAVE_GETPAGESIZE
|
||||
# ifdef HAVE_UNISTD_H
|
||||
# include <unistd.h>
|
||||
# endif
|
||||
|
||||
/* Assume that all systems that can run configure have sys/param.h. */
|
||||
# ifndef HAVE_SYS_PARAM_H
|
||||
# define HAVE_SYS_PARAM_H 1
|
||||
# endif
|
||||
|
||||
# ifdef _SC_PAGESIZE
|
||||
# define getpagesize() sysconf(_SC_PAGESIZE)
|
||||
# else /* no _SC_PAGESIZE */
|
||||
# ifdef HAVE_SYS_PARAM_H
|
||||
# include <sys/param.h>
|
||||
# ifdef EXEC_PAGESIZE
|
||||
# define getpagesize() EXEC_PAGESIZE
|
||||
# else /* no EXEC_PAGESIZE */
|
||||
# ifdef NBPG
|
||||
# define getpagesize() NBPG * CLSIZE
|
||||
# ifndef CLSIZE
|
||||
# define CLSIZE 1
|
||||
# endif /* no CLSIZE */
|
||||
# else /* no NBPG */
|
||||
# ifdef NBPC
|
||||
# define getpagesize() NBPC
|
||||
# else /* no NBPC */
|
||||
# ifdef PAGESIZE
|
||||
# define getpagesize() PAGESIZE
|
||||
# endif /* PAGESIZE */
|
||||
# endif /* no NBPC */
|
||||
# endif /* no NBPG */
|
||||
# endif /* no EXEC_PAGESIZE */
|
||||
# else /* no HAVE_SYS_PARAM_H */
|
||||
# define getpagesize() 8192 /* punt totally */
|
||||
# endif /* no HAVE_SYS_PARAM_H */
|
||||
# endif /* no _SC_PAGESIZE */
|
||||
|
||||
#endif /* no HAVE_GETPAGESIZE */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" { void *malloc(unsigned); }
|
||||
#else
|
||||
char *malloc();
|
||||
#endif
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
char *data, *data2, *data3;
|
||||
int i, pagesize;
|
||||
int fd;
|
||||
|
||||
pagesize = getpagesize();
|
||||
|
||||
/*
|
||||
* First, make a file with some known garbage in it.
|
||||
*/
|
||||
data = malloc(pagesize);
|
||||
if (!data)
|
||||
exit(1);
|
||||
for (i = 0; i < pagesize; ++i)
|
||||
*(data + i) = rand();
|
||||
umask(0);
|
||||
fd = creat("conftestmmap", 0600);
|
||||
if (fd < 0)
|
||||
exit(1);
|
||||
if (write(fd, data, pagesize) != pagesize)
|
||||
exit(1);
|
||||
close(fd);
|
||||
|
||||
/*
|
||||
* Next, try to mmap the file at a fixed address which
|
||||
* already has something else allocated at it. If we can,
|
||||
* also make sure that we see the same garbage.
|
||||
*/
|
||||
fd = open("conftestmmap", O_RDWR);
|
||||
if (fd < 0)
|
||||
exit(1);
|
||||
data2 = malloc(2 * pagesize);
|
||||
if (!data2)
|
||||
exit(1);
|
||||
data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1);
|
||||
if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_FIXED, fd, 0L))
|
||||
exit(1);
|
||||
for (i = 0; i < pagesize; ++i)
|
||||
if (*(data + i) != *(data2 + i))
|
||||
exit(1);
|
||||
|
||||
/*
|
||||
* Finally, make sure that changes to the mapped area
|
||||
* do not percolate back to the file as seen by read().
|
||||
* (This is a bug on some variants of i386 svr4.0.)
|
||||
*/
|
||||
for (i = 0; i < pagesize; ++i)
|
||||
*(data2 + i) = *(data2 + i) + 1;
|
||||
data3 = malloc(pagesize);
|
||||
if (!data3)
|
||||
exit(1);
|
||||
if (read(fd, data3, pagesize) != pagesize)
|
||||
exit(1);
|
||||
for (i = 0; i < pagesize; ++i)
|
||||
if (*(data + i) != *(data3 + i))
|
||||
exit(1);
|
||||
close(fd);
|
||||
unlink("conftestmmap");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
EOF
|
||||
if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
|
||||
then
|
||||
ac_cv_func_mmap_fixed_mapped=yes
|
||||
else
|
||||
echo "configure: failed program was:" >&5
|
||||
cat conftest.$ac_ext >&5
|
||||
rm -fr conftest*
|
||||
ac_cv_func_mmap_fixed_mapped=no
|
||||
fi
|
||||
rm -fr conftest*
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6
|
||||
if test $ac_cv_func_mmap_fixed_mapped = yes; then
|
||||
cat >> confdefs.h <<\EOF
|
||||
#define HAVE_MMAP 1
|
||||
EOF
|
||||
|
||||
fi
|
||||
|
||||
for ac_func in strdup strstr strcasecmp strncasecmp
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2313: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2318 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
#include <assert.h>
|
||||
/* Override any gcc2 internal prototype to avoid an error. */
|
||||
/* We use char because int might match the return type of a gcc2
|
||||
builtin and then its argument prototype would still apply. */
|
||||
char $ac_func();
|
||||
|
||||
int main() {
|
||||
|
||||
/* The GNU C library defines this for functions which it implements
|
||||
to always fail with ENOSYS. Some functions are actually named
|
||||
something starting with __ and the normal name is an alias. */
|
||||
#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
|
||||
choke me
|
||||
#else
|
||||
$ac_func();
|
||||
#endif
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2098,12 +2364,12 @@ done
|
||||
for ac_func in gettimeofday mktime strptime
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2102: checking for $ac_func" >&5
|
||||
echo "configure:2368: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2107 "configure"
|
||||
#line 2373 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2126,7 +2392,7 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2130: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2153,12 +2419,12 @@ done
|
||||
for ac_func in strerror snprintf vsnprintf select signal symlink access isatty
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2157: checking for $ac_func" >&5
|
||||
echo "configure:2423: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2162 "configure"
|
||||
#line 2428 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2181,7 +2447,7 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2185: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2208,12 +2474,12 @@ done
|
||||
for ac_func in uname gethostname
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2212: checking for $ac_func" >&5
|
||||
echo "configure:2478: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2217 "configure"
|
||||
#line 2483 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2236,7 +2502,7 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2264,12 +2530,12 @@ done
|
||||
for ac_func in gethostbyname
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2268: checking for $ac_func" >&5
|
||||
echo "configure:2534: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2273 "configure"
|
||||
#line 2539 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2292,7 +2558,7 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2314,7 +2580,7 @@ EOF
|
||||
else
|
||||
echo "$ac_t""no" 1>&6
|
||||
echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
|
||||
echo "configure:2318: checking for gethostbyname in -lnsl" >&5
|
||||
echo "configure:2584: checking for gethostbyname in -lnsl" >&5
|
||||
ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
|
||||
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
@ -2322,7 +2588,7 @@ else
|
||||
ac_save_LIBS="$LIBS"
|
||||
LIBS="-lnsl $LIBS"
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2326 "configure"
|
||||
#line 2592 "configure"
|
||||
#include "confdefs.h"
|
||||
/* Override any gcc2 internal prototype to avoid an error. */
|
||||
/* We use char because int might match the return type of a gcc2
|
||||
@ -2333,7 +2599,7 @@ int main() {
|
||||
gethostbyname()
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2337: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_lib_$ac_lib_var=yes"
|
||||
else
|
||||
@ -2367,7 +2633,7 @@ done
|
||||
|
||||
|
||||
echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6
|
||||
echo "configure:2371: checking for socket in -lsocket" >&5
|
||||
echo "configure:2637: checking for socket in -lsocket" >&5
|
||||
ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'`
|
||||
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
@ -2375,7 +2641,7 @@ else
|
||||
ac_save_LIBS="$LIBS"
|
||||
LIBS="-lsocket $LIBS"
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2379 "configure"
|
||||
#line 2645 "configure"
|
||||
#include "confdefs.h"
|
||||
/* Override any gcc2 internal prototype to avoid an error. */
|
||||
/* We use char because int might match the return type of a gcc2
|
||||
@ -2386,7 +2652,7 @@ int main() {
|
||||
socket()
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_lib_$ac_lib_var=yes"
|
||||
else
|
||||
@ -2417,7 +2683,7 @@ fi
|
||||
if test "x${with_socks}" = xyes
|
||||
then
|
||||
echo $ac_n "checking for main in -lresolv""... $ac_c" 1>&6
|
||||
echo "configure:2421: checking for main in -lresolv" >&5
|
||||
echo "configure:2687: checking for main in -lresolv" >&5
|
||||
ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'`
|
||||
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
@ -2425,14 +2691,14 @@ else
|
||||
ac_save_LIBS="$LIBS"
|
||||
LIBS="-lresolv $LIBS"
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2429 "configure"
|
||||
#line 2695 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
int main() {
|
||||
main()
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2436: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_lib_$ac_lib_var=yes"
|
||||
else
|
||||
@ -2460,7 +2726,7 @@ else
|
||||
fi
|
||||
|
||||
echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6
|
||||
echo "configure:2464: checking for Rconnect in -lsocks" >&5
|
||||
echo "configure:2730: checking for Rconnect in -lsocks" >&5
|
||||
ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'`
|
||||
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
@ -2468,7 +2734,7 @@ else
|
||||
ac_save_LIBS="$LIBS"
|
||||
LIBS="-lsocks $LIBS"
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2472 "configure"
|
||||
#line 2738 "configure"
|
||||
#include "confdefs.h"
|
||||
/* Override any gcc2 internal prototype to avoid an error. */
|
||||
/* We use char because int might match the return type of a gcc2
|
||||
@ -2479,7 +2745,7 @@ int main() {
|
||||
Rconnect()
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2483: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_lib_$ac_lib_var=yes"
|
||||
else
|
||||
@ -2511,7 +2777,7 @@ fi
|
||||
ALL_LINGUAS="cs de hr it no pl pt_BR ru"
|
||||
|
||||
echo $ac_n "checking whether NLS is requested""... $ac_c" 1>&6
|
||||
echo "configure:2515: checking whether NLS is requested" >&5
|
||||
echo "configure:2781: checking whether NLS is requested" >&5
|
||||
# Check whether --enable-nls or --disable-nls was given.
|
||||
if test "${enable_nls+set}" = set; then
|
||||
enableval="$enable_nls"
|
||||
@ -2528,7 +2794,7 @@ fi
|
||||
# Extract the first word of "msgfmt", so it can be a program name with args.
|
||||
set dummy msgfmt; ac_word=$2
|
||||
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
|
||||
echo "configure:2532: checking for $ac_word" >&5
|
||||
echo "configure:2798: checking for $ac_word" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
@ -2562,7 +2828,7 @@ fi
|
||||
# Extract the first word of "xgettext", so it can be a program name with args.
|
||||
set dummy xgettext; ac_word=$2
|
||||
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
|
||||
echo "configure:2566: checking for $ac_word" >&5
|
||||
echo "configure:2832: checking for $ac_word" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
@ -2597,7 +2863,7 @@ fi
|
||||
# Extract the first word of "gmsgfmt", so it can be a program name with args.
|
||||
set dummy gmsgfmt; ac_word=$2
|
||||
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
|
||||
echo "configure:2601: checking for $ac_word" >&5
|
||||
echo "configure:2867: checking for $ac_word" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
@ -2647,17 +2913,17 @@ fi
|
||||
do
|
||||
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
|
||||
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
|
||||
echo "configure:2651: checking for $ac_hdr" >&5
|
||||
echo "configure:2917: checking for $ac_hdr" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2656 "configure"
|
||||
#line 2922 "configure"
|
||||
#include "confdefs.h"
|
||||
#include <$ac_hdr>
|
||||
EOF
|
||||
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
|
||||
{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
|
||||
{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
|
||||
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
|
||||
if test -z "$ac_err"; then
|
||||
rm -rf conftest*
|
||||
@ -2687,12 +2953,12 @@ done
|
||||
for ac_func in gettext
|
||||
do
|
||||
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
|
||||
echo "configure:2691: checking for $ac_func" >&5
|
||||
echo "configure:2957: checking for $ac_func" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2696 "configure"
|
||||
#line 2962 "configure"
|
||||
#include "confdefs.h"
|
||||
/* System header to define __stub macros and hopefully few prototypes,
|
||||
which can conflict with char $ac_func(); below. */
|
||||
@ -2715,7 +2981,7 @@ $ac_func();
|
||||
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_func_$ac_func=yes"
|
||||
else
|
||||
@ -2737,7 +3003,7 @@ EOF
|
||||
else
|
||||
echo "$ac_t""no" 1>&6
|
||||
echo $ac_n "checking for gettext in -lintl""... $ac_c" 1>&6
|
||||
echo "configure:2741: checking for gettext in -lintl" >&5
|
||||
echo "configure:3007: checking for gettext in -lintl" >&5
|
||||
ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'`
|
||||
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
@ -2745,7 +3011,7 @@ else
|
||||
ac_save_LIBS="$LIBS"
|
||||
LIBS="-lintl $LIBS"
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 2749 "configure"
|
||||
#line 3015 "configure"
|
||||
#include "confdefs.h"
|
||||
/* Override any gcc2 internal prototype to avoid an error. */
|
||||
/* We use char because int might match the return type of a gcc2
|
||||
@ -2756,7 +3022,7 @@ int main() {
|
||||
gettext()
|
||||
; return 0; }
|
||||
EOF
|
||||
if { (eval echo configure:2760: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
|
||||
rm -rf conftest*
|
||||
eval "ac_cv_lib_$ac_lib_var=yes"
|
||||
else
|
||||
@ -2824,7 +3090,7 @@ do
|
||||
# Extract the first word of "$ac_prog", so it can be a program name with args.
|
||||
set dummy $ac_prog; ac_word=$2
|
||||
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
|
||||
echo "configure:2828: checking for $ac_word" >&5
|
||||
echo "configure:3094: checking for $ac_word" >&5
|
||||
if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then
|
||||
echo $ac_n "(cached) $ac_c" 1>&6
|
||||
else
|
||||
|
@ -160,6 +160,7 @@ dnl
|
||||
dnl Checks for library functions.
|
||||
dnl
|
||||
AC_FUNC_ALLOCA
|
||||
AC_FUNC_MMAP
|
||||
AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp)
|
||||
AC_CHECK_FUNCS(gettimeofday mktime strptime)
|
||||
AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)
|
||||
|
@ -1,3 +1,8 @@
|
||||
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* wget.texi (Robots): Document that we now support the meta tag
|
||||
exclusion.
|
||||
|
||||
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* wget.texi: Use --- consistently.
|
||||
|
@ -2548,8 +2548,8 @@ this:
|
||||
|
||||
This is explained in some detail at
|
||||
@url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}.
|
||||
Unfortunately, Wget does not support this method of robot exclusion yet,
|
||||
but it will be implemented in the next release.
|
||||
Wget supports this method of robot exclusion in addition to the usual
|
||||
@file{/robots.txt} exclusion.
|
||||
|
||||
@node Security Considerations, Contributors, Robots, Appendices
|
||||
@section Security Considerations
|
||||
|
BIN
po/pt_BR.gmo
BIN
po/pt_BR.gmo
Binary file not shown.
114
src/ChangeLog
114
src/ChangeLog
@ -1,3 +1,117 @@
|
||||
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* retr.c (get_contents): If use_expected, make sure that the
|
||||
appropriate amount of data is being read.
|
||||
|
||||
* http.c (gethttp): Check for both `Keep-Alive: ...' and
|
||||
`Connection: Keep-Alive'.
|
||||
|
||||
* wget.h (DEBUGP): Call debug_logprintf only if opt.debug is
|
||||
turned on.
|
||||
|
||||
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* http.c (connection_available_p): Use it.
|
||||
|
||||
* connect.c (test_socket_open): New function.
|
||||
|
||||
* http.c (gethttp): Support persistent connections. Based on the
|
||||
ideas, and partly on code, by Sam Horrocks <sam@daemoninc.com>.
|
||||
(register_persistent): New function.
|
||||
(connection_available_p): Ditto.
|
||||
(invalidate_connection): Ditto.
|
||||
|
||||
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* url.c (convert_links): Handle UREL2ABS case.
|
||||
|
||||
* recur.c (recursive_retrieve): Instead of the list
|
||||
urls_downloaded, use hash tables dl_file_url_map and
|
||||
dl_url_file_map.
|
||||
(convert_all_links): Use them to retrieve data.
|
||||
|
||||
* host.c (clean_hosts): Free the hash tables.
|
||||
|
||||
* main.c (private_initialize): Call host_init().
|
||||
|
||||
* host.c (store_hostaddress): Use a saner, hash table-based data
|
||||
model.
|
||||
(realhost): Ditto.
|
||||
(host_init): Initialize the hash tables.
|
||||
|
||||
2000-11-18 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* utils.c (slist_append): Eviscerate NOSORT. Hash tables are now
|
||||
used for what the sorted slists used to be used for.
|
||||
(slist_contains): Don't rely on the list being sorted.
|
||||
(slist_append): Simplify the code.
|
||||
|
||||
* recur.c (recursive_cleanup): Use free_string_set.
|
||||
|
||||
* utils.c (string_set_add, string_set_exists, string_set_free):
|
||||
New functions for easier freeing of hash tables whose keys are
|
||||
strdup'ed strings.
|
||||
|
||||
* recur.c (recursive_retrieve): Use the hash table functions for
|
||||
storing undesirable URLs.
|
||||
|
||||
* hash.c: New file.
|
||||
|
||||
2000-11-17 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* main.c (private_initialize): Call url_init.
|
||||
(main): Call private_initialize.
|
||||
|
||||
* url.c (unsafe_char_table): New table.
|
||||
(UNSAFE_CHAR): Use it.
|
||||
(init_unsafe_char_table): New function.
|
||||
(url_init): New function; call init_unsafe_char_table.
|
||||
|
||||
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* html-url.c (handle_link): Handle HTML fragment identifiers.
|
||||
|
||||
* recur.c (recursive_retrieve): If norobot info is respected and
|
||||
the file is specified not to be followed by robots, respect that.
|
||||
|
||||
* html-url.c (collect_tags_mapper): Handle <meta name=robots
|
||||
content=X>. For us the important cases are where X is NONE or
|
||||
where X contains NOFOLLOW.
|
||||
(get_urls_html): Propagate that information to the caller.
|
||||
|
||||
2000-11-13 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* url.c (convert_links): Unlink the file we might be reading from
|
||||
before writing to it.
|
||||
(convert_links): Use alloca instead of malloc for
|
||||
filename_plus_orig_suffix.
|
||||
|
||||
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* url.c (get_urls_file): Ditto.
|
||||
(convert_links): Ditto.
|
||||
|
||||
* html-url.c (get_urls_html): Use read_file() instead of
|
||||
load_file().
|
||||
|
||||
* utils.c (read_file): New function, instead of the old
|
||||
load_file().
|
||||
(read_file_free): Ditto.
|
||||
|
||||
* url.c (findurl): Search only for the supported protocols.
|
||||
(convert_links): Use fwrite() when writing out a region of
|
||||
characters.
|
||||
|
||||
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* ftp-ls.c: Move html_quote_string and ftp_index here.
|
||||
|
||||
* url.c: Remove get_urls_html, since that's now in html-url.c.
|
||||
|
||||
* html-url.c: New file.
|
||||
|
||||
* html-parse.c: New file.
|
||||
|
||||
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* mswindows.h: Define snprintf and vsnprintf to _snprintf and
|
||||
|
@ -57,9 +57,10 @@ MD5_OBJ = @MD5_OBJ@
|
||||
OPIE_OBJ = @OPIE_OBJ@
|
||||
|
||||
OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \
|
||||
ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \
|
||||
http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \
|
||||
recur$o retr$o snprintf$o url$o utils$o version$o
|
||||
ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \
|
||||
html-parse$o html-url$o http$o init$o log$o main$o \
|
||||
$(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \
|
||||
url$o utils$o version$o
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .o ._c ._o
|
||||
@ -133,26 +134,31 @@ TAGS: *.c *.h
|
||||
|
||||
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||||
|
||||
cmpt$o: config.h wget.h sysdep.h options.h
|
||||
connect$o: config.h wget.h sysdep.h options.h connect.h host.h
|
||||
fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h
|
||||
ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h
|
||||
ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h
|
||||
ftp-opie$o: config.h wget.h sysdep.h options.h md5.h
|
||||
ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h
|
||||
getopt$o: wget.h sysdep.h options.h
|
||||
headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h
|
||||
host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h
|
||||
html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h
|
||||
http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h
|
||||
init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h
|
||||
log$o: config.h wget.h sysdep.h options.h utils.h
|
||||
main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h
|
||||
md5$o: wget.h sysdep.h options.h md5.h
|
||||
mswindows$o: config.h winsock.h wget.h sysdep.h options.h url.h
|
||||
netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h
|
||||
rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h
|
||||
recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h
|
||||
retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h
|
||||
url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h
|
||||
utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h
|
||||
cmpt$o: wget.h
|
||||
connect$o: wget.h connect.h host.h
|
||||
fnmatch$o: wget.h fnmatch.h
|
||||
ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h
|
||||
ftp-ls$o: wget.h utils.h ftp.h url.h
|
||||
ftp-opie$o: wget.h md5.h
|
||||
ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h
|
||||
getopt$o: wget.h getopt.h
|
||||
hash$o: wget.h utils.h hash.h
|
||||
headers$o: wget.h connect.h rbuf.h headers.h
|
||||
host$o: wget.h utils.h host.h url.h hash.h
|
||||
html-parse$o: wget.h html-parse.h
|
||||
html-url$o: wget.h html-parse.h url.h utils.h
|
||||
html$o: wget.h url.h utils.h ftp.h
|
||||
http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h
|
||||
init$o: wget.h utils.h init.h host.h recur.h netrc.h
|
||||
log$o: wget.h utils.h
|
||||
main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h
|
||||
md5$o: wget.h md5.h
|
||||
mswindows$o: wget.h url.h
|
||||
netrc$o: wget.h utils.h netrc.h init.h
|
||||
rbuf$o: wget.h rbuf.h connect.h
|
||||
recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h
|
||||
retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h
|
||||
snprintf$o:
|
||||
url$o: wget.h utils.h url.h host.h
|
||||
utils$o: wget.h utils.h fnmatch.h hash.h
|
||||
version$o:
|
||||
|
@ -101,6 +101,9 @@ char *alloca ();
|
||||
/* Define if you have the uname function. */
|
||||
#undef HAVE_UNAME
|
||||
|
||||
/* Define if you have a working version of mmap. */
|
||||
#undef HAVE_MMAP
|
||||
|
||||
/* Define if you have the gethostname function. */
|
||||
#undef HAVE_GETHOSTNAME
|
||||
|
||||
|
@ -107,6 +107,37 @@ make_connection (int *sock, char *hostname, unsigned short port)
|
||||
return NOCONERROR;
|
||||
}
|
||||
|
||||
/* Heuristically check whether the connection on SOCK is still usable.
   Returns 1 when it looks open, 0 when it appears closed.

   The check (from Andrew Maholski's code in the Unix Socket FAQ)
   select()s on the descriptor with a ~1 microsecond timeout.  A
   timeout means nothing is pending on an idle connection, which is
   taken to mean the peer has not closed it.  Conversely, a readable
   idle connection holds either an EOF or stale bytes, so it is
   reported as not open.  Without select() we cannot tell, and
   optimistically report the socket as open. */
int
test_socket_open (int sock)
{
#ifdef HAVE_SELECT
  struct timeval timeout;
  fd_set readable;

  FD_ZERO (&readable);
  FD_SET (sock, &readable);

  /* Wait one microsecond. */
  timeout.tv_sec = 0;
  timeout.tv_usec = 1;

  /* A return of zero means select() timed out, i.e. the connection is
     idle and considered still valid.  Anything else (readability or
     an error) is treated as a dead connection. */
  return select (sock + 1, &readable, NULL, NULL, &timeout) == 0;
#else
  /* Without select, it's hard to know for sure; assume open. */
  return 1;
#endif
}
|
||||
|
||||
/* Bind the local port PORT. This does all the necessary work, which
|
||||
is creating a socket, setting SO_REUSEADDR option on it, then
|
||||
calling bind() and listen(). If *PORT is 0, a random port is
|
||||
|
173
src/ftp-ls.c
173
src/ftp-ls.c
@ -36,6 +36,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "wget.h"
|
||||
#include "utils.h"
|
||||
#include "ftp.h"
|
||||
#include "url.h"
|
||||
|
||||
/* Converts symbolic permissions to number-style ones, e.g. string
|
||||
rwxr-xr-x to 755. For now, it knows nothing of
|
||||
@ -388,3 +389,175 @@ ftp_parse_ls (const char *file)
|
||||
{
|
||||
return ftp_parse_unix_ls (file);
|
||||
}
|
||||
|
||||
/* Stuff for creating FTP index. */
|
||||
|
||||
/* Return a freshly malloc-ed copy of S with the four characters
   special to HTML replaced by their entities, as per RFC 1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'

   No other characters are touched.  The caller owns (and frees) the
   returned string. */
static char *
html_quote_string (const char *s)
{
  const char *from;
  char *to, *quoted;
  int len = 0;

  /* First pass: measure the length of the quoted result. */
  for (from = s; *from; from++)
    switch (*from)
      {
      case '&':
	len += 5;		/* `&amp;' */
	break;
      case '<':
      case '>':
	len += 4;		/* `&lt;' and `&gt;' */
	break;
      case '\"':
	len += 6;		/* `&quot;' */
	break;
      default:
	++len;
      }
  quoted = (char *)xmalloc (len + 1);

  /* Second pass: copy, expanding the entities. */
  for (from = s, to = quoted; *from; from++)
    switch (*from)
      {
      case '&':
	memcpy (to, "&amp;", 5);
	to += 5;
	break;
      case '<':
	memcpy (to, "&lt;", 4);
	to += 4;
	break;
      case '>':
	memcpy (to, "&gt;", 4);
	to += 4;
	break;
      case '\"':
	memcpy (to, "&quot;", 6);
	to += 6;
	break;
      default:
	*to++ = *from;
      }
  *to = '\0';
  return quoted;
}
|
||||
|
||||
/* Create an HTML index for the FTP directory listing F, written to
   FILE -- or to opt.dfp when the user redirected document output.
   Each listing entry becomes an ftp:// link back to the origin host
   described by U, with "user:password@" credentials embedded when U
   carries them.  Returns FTPOK on success, FOPENERR when FILE cannot
   be opened for writing. */
uerr_t
ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
{
  FILE *fp;
  char *upwd;			/* "user[:password]@" prefix, or "" */
  char *htclfile;		/* HTML-clean file name */

  /* Write to the user-supplied output stream if there is one;
     otherwise open FILE ourselves. */
  if (!opt.dfp)
    {
      fp = fopen (file, "wb");
      if (!fp)
	{
	  logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
	  return FOPENERR;
	}
    }
  else
    fp = opt.dfp;
  if (u->user)
    {
      char *tmpu, *tmpp;        /* temporary, clean user and passwd */

      /* Build the "user[:password]@" prefix from HTML-quoted copies
	 of the credentials.  +2 covers the '@' and the NUL. */
      tmpu = CLEANDUP (u->user);
      tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
      upwd = (char *)xmalloc (strlen (tmpu)
			     + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
      sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
      free (tmpu);
      FREE_MAYBE (tmpp);
    }
  else
    upwd = xstrdup ("");
  /* Emit the page header; the title repeats as the <h1>. */
  fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
  fprintf (fp, "<html>\n<head>\n<title>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</h1>\n<hr>\n<pre>\n");
  /* One <pre> line per directory entry: timestamp, type, link. */
  while (f)
    {
      fprintf (fp, " ");
      if (f->tstamp != -1)
	{
	  /* #### Should we translate the months? */
	  static char *months[] = {
	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
	  };
	  /* NOTE(review): the cast assumes f->tstamp is
	     layout-compatible with time_t -- confirm tstamp's
	     declared type in struct fileinfo. */
	  struct tm *ptm = localtime ((time_t *)&f->tstamp);

	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
		  ptm->tm_mday);
	  /* An hour of 0 is treated as "time unknown" and padded with
	     blanks instead. */
	  if (ptm->tm_hour)
	    fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
	  else
	    fprintf (fp, " ");
	}
      else
	fprintf (fp, _("time unknown "));
      switch (f->type)
	{
	case FT_PLAINFILE:
	  fprintf (fp, _("File "));
	  break;
	case FT_DIRECTORY:
	  fprintf (fp, _("Directory "));
	  break;
	case FT_SYMLINK:
	  fprintf (fp, _("Link "));
	  break;
	default:
	  fprintf (fp, _("Not sure "));
	  break;
	}
      htclfile = html_quote_string (f->name);
      /* NOTE(review): "%hu" expects unsigned short -- confirm the
	 declared type of u->port. */
      fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
      /* Make sure exactly one '/' separates host:port from the
	 directory, and the directory from the file name. */
      if (*u->dir != '/')
	putc ('/', fp);
      fprintf (fp, "%s", u->dir);
      if (*u->dir)
	putc ('/', fp);
      fprintf (fp, "%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "\">%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "</a> ");
      if (f->type == FT_PLAINFILE)
	fprintf (fp, _(" (%s bytes)"), legible (f->size));
      else if (f->type == FT_SYMLINK)
	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
      putc ('\n', fp);
      free (htclfile);
      f = f->next;
    }
  fprintf (fp, "</pre>\n</body>\n</html>\n");
  free (upwd);
  /* Close only streams we opened ourselves; flush the caller's stream
     so output appears immediately. */
  if (!opt.dfp)
    fclose (fp);
  else
    fflush (fp);
  return FTPOK;
}
|
||||
|
@ -40,7 +40,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "rbuf.h"
|
||||
#include "retr.h"
|
||||
#include "ftp.h"
|
||||
#include "html.h"
|
||||
#include "connect.h"
|
||||
#include "host.h"
|
||||
#include "fnmatch.h"
|
||||
@ -722,7 +721,7 @@ Error in server response, closing control connection.\n"));
|
||||
}
|
||||
reset_timer ();
|
||||
/* Get the contents of the document. */
|
||||
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf);
|
||||
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0);
|
||||
con->dltime = elapsed_time ();
|
||||
tms = time_str (NULL);
|
||||
tmrate = rate (*len - restval, con->dltime);
|
||||
|
@ -92,4 +92,6 @@ typedef struct
|
||||
struct fileinfo *ftp_parse_ls PARAMS ((const char *));
|
||||
uerr_t ftp_loop PARAMS ((struct urlinfo *, int *));
|
||||
|
||||
uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *);
|
||||
|
||||
#endif /* FTP_H */
|
||||
|
403
src/hash.c
Normal file
403
src/hash.c
Normal file
@ -0,0 +1,403 @@
|
||||
/* Hash tables.
|
||||
Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <stdlib.h>
#include <assert.h>
#include <limits.h>

#include "wget.h"
#include "utils.h"

#include "hash.h"
|
||||
|
||||
#ifdef STANDALONE
|
||||
# define xmalloc malloc
|
||||
# define xrealloc realloc
|
||||
#endif
|
||||
|
||||
/* This file implements simple hash tables based on linear probing.
|
||||
The hash table stores key-value pairs in a contiguous array. Both
|
||||
key and value are void pointers that the hash and test functions
|
||||
know how to handle.
|
||||
|
||||
Although Knuth & co. recommend double hashing over linear probing,
|
||||
we use the latter because it accesses array elements sequentially
|
||||
in case of a collision, yielding in better cache behaviour and
|
||||
ultimately in better speed. To avoid collision problems with
|
||||
linear probing, we make sure that the table grows as soon as the
|
||||
fullness/size ratio exceeds 75%. */
|
||||
|
||||
struct ht_pair {
|
||||
void *key;
|
||||
void *value;
|
||||
};
|
||||
|
||||
struct hash_table {
|
||||
unsigned long (*hash_function) (const void *);
|
||||
int (*test_function) (const void *, const void *);
|
||||
|
||||
int size; /* size of the array */
|
||||
int fullness; /* number of non-empty fields */
|
||||
int count; /* number of non-empty, non-deleted
|
||||
fields. */
|
||||
|
||||
struct ht_pair *pairs;
|
||||
};
|
||||
|
||||
#define ENTRY_DELETED ((void *)0xdeadbeef)
|
||||
|
||||
#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
|
||||
#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL)
|
||||
|
||||
/* Find a prime near, but greater than or equal to, SIZE.  Hash table
   sizes are kept prime so that modulo-based hashing spreads keys
   evenly.

   Fix over the original version: the table entries are unsigned long
   and the largest of them exceed INT_MAX, so comparing them against a
   signed int -- and returning one through the int return type --
   relied on implementation-defined conversions.  Entries that cannot
   survive the int return type are now skipped.  A non-positive SIZE
   simply yields the smallest prime. */
int
prime_size (int size)
{
  static const unsigned long primes [] = {
    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
    1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
  };
  unsigned long wanted = size > 0 ? (unsigned long) size : 0;
  size_t i;

  for (i = 0; i < sizeof primes / sizeof primes[0]; i++)
    {
      if (primes[i] > (unsigned long) INT_MAX)
	break;			/* would not fit the int return type */
      if (primes[i] >= wanted)
	return (int) primes[i];
    }
  /* No suitable prime in the table; fall back to SIZE itself, as the
     original code did.  */
  return size;
}
|
||||
|
||||
/* Create a hash table of INITIAL_SIZE with hash function
|
||||
HASH_FUNCTION and test function TEST_FUNCTION. If you wish to
|
||||
start out with a "small" table which will be regrown as needed,
|
||||
specify 0 as INITIAL_SIZE. */
|
||||
|
||||
struct hash_table *
|
||||
hash_table_new (int initial_size,
|
||||
unsigned long (*hash_function) (const void *),
|
||||
int (*test_function) (const void *, const void *))
|
||||
{
|
||||
struct hash_table *ht
|
||||
= (struct hash_table *)xmalloc (sizeof (struct hash_table));
|
||||
ht->hash_function = hash_function;
|
||||
ht->test_function = test_function;
|
||||
ht->size = prime_size (initial_size);
|
||||
ht->fullness = 0;
|
||||
ht->count = 0;
|
||||
ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
|
||||
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
|
||||
return ht;
|
||||
}
|
||||
|
||||
/* Free the storage owned by hash table HT itself: the slot array and
   the table structure.  Keys and values stored in the table are NOT
   freed; they belong to the caller. */
void
hash_table_destroy (struct hash_table *ht)
{
  free (ht->pairs);
  free (ht);
}
|
||||
|
||||
/* Get the value that corresponds to the key KEY in the hash table HT.
|
||||
If no value is found, return NULL. Note that NULL is a legal value
|
||||
for value; if you are storing NULLs in your hash table, you can use
|
||||
hash_table_exists to be sure that a (possibly NULL) value exists in
|
||||
the table. */
|
||||
|
||||
void *
|
||||
hash_table_get (struct hash_table *ht, const void *key)
|
||||
{
|
||||
int location = ht->hash_function (key) % ht->size;
|
||||
while (1)
|
||||
{
|
||||
struct ht_pair *the_pair = ht->pairs + location;
|
||||
if (EMPTY_ENTRY_P (the_pair->key))
|
||||
return NULL;
|
||||
else if (DELETED_ENTRY_P (the_pair->key)
|
||||
|| !ht->test_function (key, the_pair->key))
|
||||
{
|
||||
++location;
|
||||
if (location == ht->size)
|
||||
location = 0;
|
||||
}
|
||||
else
|
||||
return the_pair->value;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return 1 if KEY exists in HT, 0 otherwise. */
|
||||
|
||||
int
|
||||
hash_table_exists (struct hash_table *ht, const void *key)
|
||||
{
|
||||
int location = ht->hash_function (key) % ht->size;
|
||||
while (1)
|
||||
{
|
||||
struct ht_pair *the_pair = ht->pairs + location;
|
||||
if (EMPTY_ENTRY_P (the_pair->key))
|
||||
return 0;
|
||||
else if (DELETED_ENTRY_P (the_pair->key)
|
||||
|| !ht->test_function (key, the_pair->key))
|
||||
{
|
||||
++location;
|
||||
if (location == ht->size)
|
||||
location = 0;
|
||||
}
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
#define MAX(i, j) (((i) >= (j)) ? (i) : (j))

/* Regrow hash table HT as necessary and rehash all its live
   key-value pairs into the new slot array.  Tombstones (deleted
   entries) are discarded in the process, so this also serves to
   compact a table whose fullness came mostly from deletions. */
static void
grow_hash_table (struct hash_table *ht)
{
  int i;
  struct ht_pair *old_pairs = ht->pairs;
  int old_count = ht->count;	/* for assert() below */
  int old_size = ht->size;

  /* Normally, the idea is to double ht->size (and round it to next
     prime) on each regrow:

     ht->size = prime_size (ht->size * 2);

     But it is possible that the table has large fullness because of
     the many deleted entries.  If that is the case, we don't want to
     blindly grow the table; we just want to rehash it.  For that
     reason, we use ht->count as the relevant parameter.  MAX is used
     only because we don't want to actually shrink the table.  (But
     maybe that's wrong.)  */

  int needed_size = prime_size (ht->count * 2);
  ht->size = MAX (old_size, needed_size);

  /* Fresh, zeroed slot array: every entry starts empty. */
  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));

  /* Need to reset these two; hash_table_put will reinitialize them
     as it re-inserts each surviving pair. */
  ht->fullness = 0;
  ht->count = 0;
  for (i = 0; i < old_size; i++)
    {
      struct ht_pair *the_pair = old_pairs + i;
      /* Re-insert only live entries; empty slots and tombstones are
	 dropped. */
      if (!EMPTY_ENTRY_P (the_pair->key)
	  && !DELETED_ENTRY_P (the_pair->key))
	hash_table_put (ht, the_pair->key, the_pair->value);
    }
  /* Rehashing must preserve the number of live entries exactly. */
  assert (ht->count == old_count);
  free (old_pairs);
}
|
||||
|
||||
/* Store VALUE under KEY in hash table HT, replacing the value of an
   existing equal key.  The table is regrown once more than 75% of its
   slots have ever been occupied, keeping linear-probe chains short.
   KEY and VALUE are stored as-is (not copied), so they must outlive
   the table. */
void
hash_table_put (struct hash_table *ht, const void *key, void *value)
{
  int location = ht->hash_function (key) % ht->size;
  while (1)
    {
      struct ht_pair *the_pair = ht->pairs + location;
      if (EMPTY_ENTRY_P (the_pair->key))
	{
	  /* Brand-new slot: both occupancy counters grow. */
	  ++ht->fullness;
	  ++ht->count;
	just_insert:
	  the_pair->key = (void *)key; /* const? */
	  the_pair->value = value;
	  break;
	}
      else if (DELETED_ENTRY_P (the_pair->key))
	{
	  /* We're replacing a deleted entry, so ht->count gets
	     increased, but ht->fullness remains unchanged (the
	     tombstone already counted toward it). */
	  ++ht->count;
	  goto just_insert;
	}
      else if (ht->test_function (key, the_pair->key))
	{
	  /* We're replacing an existing entry, so ht->count and
	     ht->fullness remain unchanged. */
	  goto just_insert;
	}
      else
	{
	  /* Collision: linear probing, wrapping at the end. */
	  ++location;
	  if (location == ht->size)
	    location = 0;
	}
    }
  if (ht->fullness * 4 > ht->size * 3)
    /* When fullness exceeds 75% of size, regrow the table. */
    grow_hash_table (ht);
}
|
||||
|
||||
/* Remove KEY from HT. */
|
||||
|
||||
int
|
||||
hash_table_remove (struct hash_table *ht, const void *key)
|
||||
{
|
||||
int location = ht->hash_function (key) % ht->size;
|
||||
while (1)
|
||||
{
|
||||
struct ht_pair *the_pair = ht->pairs + location;
|
||||
if (EMPTY_ENTRY_P (the_pair->key))
|
||||
return 0;
|
||||
else if (DELETED_ENTRY_P (the_pair->key)
|
||||
|| !ht->test_function (key, the_pair->key))
|
||||
{
|
||||
++location;
|
||||
if (location == ht->size)
|
||||
location = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* We don't really remove an entry from the hash table: we
|
||||
just mark it as deleted. This is because there may be
|
||||
other entries located after this entry whose hash number
|
||||
points to a location before this entry. (Example: keys
|
||||
A, B and C have the same hash. If you were to really
|
||||
*delete* B from the table, C could no longer be found.)
|
||||
|
||||
As an optimization, it might be worthwhile to check
|
||||
whether the immediately preceding entry is empty and, if
|
||||
so, really delete the pair (set it to empty and decrease
|
||||
the fullness along with the count). I *think* it should
|
||||
be safe. */
|
||||
the_pair->key = ENTRY_DELETED;
|
||||
--ht->count;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
hash_table_clear (struct hash_table *ht)
|
||||
{
|
||||
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
|
||||
ht->fullness = 0;
|
||||
ht->count = 0;
|
||||
}
|
||||
|
||||
void
|
||||
hash_table_map (struct hash_table *ht,
|
||||
int (*mapfun) (void *, void *, void *),
|
||||
void *closure)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < ht->size; i++)
|
||||
{
|
||||
struct ht_pair *the_pair = ht->pairs + i;
|
||||
if (!EMPTY_ENTRY_P (the_pair->key)
|
||||
&& !DELETED_ENTRY_P (the_pair->key))
|
||||
if (mapfun (the_pair->key, the_pair->value, closure))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Support for hash tables whose keys are strings. */
|
||||
|
||||
/* Hash the NUL-terminated string SV.  This is the classic "hashpjw"
   function, supposedly from the Dragon Book, p. 436.  The accumulator
   is deliberately a 32-bit-style unsigned int, as in the original. */
unsigned long
string_hash (const void *sv)
{
  unsigned const char *p = (unsigned const char *) sv;
  unsigned int hash = 0;

  for (; *p; p++)
    {
      unsigned int high;

      hash = (hash << 4) + *p;
      /* Fold the top nibble back into the low bits so long strings
	 keep influencing the whole word. */
      high = hash & 0xf0000000;
      if (high != 0)
	hash = (hash ^ (high >> 24)) ^ high;
    }
  return hash;
}
|
||||
|
||||
/* Equality test callback for string-keyed hash tables: returns 1 when
   the two strings are equal, 0 otherwise (note the inversion of
   strcmp's convention). */
int
string_cmp (const void *s1, const void *s2)
{
  return strcmp ((const char *)s1, (const char *)s2) == 0 ? 1 : 0;
}
|
||||
|
||||
struct hash_table *
|
||||
make_string_hash_table (int initial_size)
|
||||
{
|
||||
return hash_table_new (initial_size, string_hash, string_cmp);
|
||||
}
|
||||
|
||||
|
||||
#ifdef STANDALONE
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int
|
||||
print_hash_table_mapper (const void *key, void *value, void *count)
|
||||
{
|
||||
++*(int *)count;
|
||||
printf ("%s: %s\n", (const char *)key, (char *)value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
print_hash (struct hash_table *sht)
|
||||
{
|
||||
int debug_count = 0;
|
||||
hash_table_map (sht, print_hash_table_mapper, &debug_count);
|
||||
assert (debug_count == sht->count);
|
||||
}
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
struct hash_table *ht = make_string_hash_table (0);
|
||||
char line[80];
|
||||
while ((fgets (line, sizeof (line), stdin)))
|
||||
{
|
||||
int len = strlen (line);
|
||||
if (len <= 1)
|
||||
continue;
|
||||
line[--len] = '\0';
|
||||
hash_table_put (ht, strdup (line), "here I am!");
|
||||
if (len % 2)
|
||||
hash_table_remove (ht, line);
|
||||
}
|
||||
print_hash (ht);
|
||||
#if 0
|
||||
printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#endif
|
50
src/hash.h
Normal file
50
src/hash.h
Normal file
@ -0,0 +1,50 @@
|
||||
/* Hash table declarations.
|
||||
Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
/* From XEmacs, and hence from Dragon book. */
|
||||
|
||||
#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */
|
||||
#define HASH2(a,b) (GOOD_HASH * (a) + (b))
|
||||
#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c))
|
||||
#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d))
|
||||
#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e))
|
||||
#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f))
|
||||
#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g))
|
||||
#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h))
|
||||
#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i))
|
||||
|
||||
struct hash_table;
|
||||
|
||||
struct hash_table *hash_table_new PARAMS ((int,
|
||||
unsigned long (*) (const void *),
|
||||
int (*) (const void *,
|
||||
const void *)));
|
||||
void hash_table_destroy PARAMS ((struct hash_table *));
|
||||
void *hash_table_get PARAMS ((struct hash_table *, const void *));
|
||||
int hash_table_exists PARAMS ((struct hash_table *, const void *));
|
||||
void hash_table_put PARAMS ((struct hash_table *, const void *, void *));
|
||||
int hash_table_remove PARAMS ((struct hash_table *, const void *));
|
||||
void hash_table_clear PARAMS ((struct hash_table *));
|
||||
void hash_table_map PARAMS ((struct hash_table *,
|
||||
int (*) (void *, void *, void *),
|
||||
void *));
|
||||
|
||||
unsigned long string_hash PARAMS ((const void *));
|
||||
int string_cmp PARAMS ((const void *, const void *));
|
||||
struct hash_table *make_string_hash_table PARAMS ((int));
|
@ -165,6 +165,14 @@ header_strdup (const char *header, void *closure)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Header-processing callback that just records presence: write 1 into
   the int pointed to by CLOSURE and report success.  The header text
   itself is ignored. */
int
header_exists (const char *header, void *closure)
{
  int *flag = (int *) closure;

  *flag = 1;
  return 1;
}
|
||||
|
||||
/* Skip LWS (linear white space), if present. Returns number of
|
||||
characters to skip. */
|
||||
int
|
||||
|
@ -31,5 +31,6 @@ int header_process PARAMS ((const char *, const char *,
|
||||
|
||||
int header_extract_number PARAMS ((const char *, void *));
|
||||
int header_strdup PARAMS ((const char *, void *));
|
||||
int header_exists PARAMS ((const char *, void *));
|
||||
|
||||
int skip_lws PARAMS ((const char *));
|
||||
|
342
src/host.c
342
src/host.c
@ -1,5 +1,5 @@
|
||||
/* Dealing with host names.
|
||||
Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
|
||||
Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
@ -48,35 +48,38 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "utils.h"
|
||||
#include "host.h"
|
||||
#include "url.h"
|
||||
#include "hash.h"
|
||||
|
||||
#ifndef errno
|
||||
extern int errno;
|
||||
#endif
|
||||
|
||||
/* Host list entry */
|
||||
struct host
|
||||
/* Mapping between all known hosts to their addresses (n.n.n.n). */
|
||||
struct hash_table *host_name_address_map;
|
||||
|
||||
/* Mapping between all known addresses (n.n.n.n) to their hosts. This
|
||||
is the inverse of host_name_address_map. These two tables share
|
||||
the strdup'ed strings. */
|
||||
struct hash_table *host_address_name_map;
|
||||
|
||||
/* Mapping between auxiliary (slave) and master host names. */
|
||||
struct hash_table *host_slave_master_map;
|
||||
|
||||
/* Utility function: return an xstrdup'ed copy of S converted to lower
   case.  The caller owns the returned string. */
static char *
xstrdup_lower (const char *s)
{
  char *result = xstrdup (s);
  char *q;

  for (q = result; *q; q++)
    *q = TOLOWER (*q);
  return result;
}
|
||||
|
||||
/* The same as gethostbyname, but supports internet addresses of the
|
||||
form `N.N.N.N'. */
|
||||
form `N.N.N.N'. On some systems gethostbyname() knows how to do
|
||||
this automatically. */
|
||||
struct hostent *
|
||||
ngethostbyname (const char *name)
|
||||
{
|
||||
@ -91,42 +94,51 @@ ngethostbyname (const char *name)
|
||||
return hp;
|
||||
}
|
||||
|
||||
/* Search for HOST in the linked list L, by hostname. Return the
|
||||
entry, if found, or NULL. The search is case-insensitive. */
|
||||
static struct host *
|
||||
search_host (struct host *l, const char *host)
|
||||
{
|
||||
for (; l; l = l->next)
|
||||
if (strcasecmp (l->hostname, host) == 0)
|
||||
return l;
|
||||
return NULL;
|
||||
}
|
||||
/* Add host name HOST with the address ADDR_TEXT ("n.n.n.n") to the
   cache.  Normally this means that the (HOST, ADDR_TEXT) pair will be
   added to host_name_address_map and to host_address_name_map.  (It
   is the caller's responsibility to make sure that HOST is not
   already in host_name_address_map.)

   If the ADDR_TEXT has already been seen and belongs to another host,
   HOST will be added to host_slave_master_map instead, mapping it to
   that earlier (canonical) name. */
static void
add_host_to_cache (const char *host, const char *addr_text)
{
  char *canonical_name = hash_table_get (host_address_name_map, addr_text);
  if (canonical_name)
    {
      DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
	       host, canonical_name));
      /* We've already dealt with that host under another name. */
      hash_table_put (host_slave_master_map,
		      xstrdup_lower (host),
		      xstrdup_lower (canonical_name));
    }
  else
    {
      /* This is really the first time we're dealing with that host.
	 The two maps share the same strdup'ed strings: name keyed by
	 address and address keyed by name. */
      char *h_copy = xstrdup_lower (host);
      char *a_copy = xstrdup (addr_text);
      DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy));
      hash_table_put (host_name_address_map, h_copy, a_copy);
      hash_table_put (host_address_name_map, a_copy, h_copy);
    }
}
|
||||
|
||||
/* Store the address of HOSTNAME, internet-style, to WHERE. First
|
||||
check for it in the host list, and (if not found), use
|
||||
ngethostbyname to get it.
|
||||
/* Store the address of HOSTNAME, internet-style (four octets in
|
||||
network order), to WHERE. First try to get the address from the
|
||||
cache; if it is not available, call the DNS functions and update
|
||||
the cache.
|
||||
|
||||
Return 1 on successful finding of the hostname, 0 otherwise. */
|
||||
int
|
||||
store_hostaddress (unsigned char *where, const char *hostname)
|
||||
{
|
||||
struct host *t;
|
||||
unsigned long addr;
|
||||
char *addr_text;
|
||||
char *canonical_name;
|
||||
struct hostent *hptr;
|
||||
struct in_addr in;
|
||||
char *inet_s;
|
||||
@ -134,178 +146,119 @@ store_hostaddress (unsigned char *where, const char *hostname)
|
||||
/* If the address is of the form d.d.d.d, there will be no trouble
|
||||
with it. */
|
||||
addr = (unsigned long)inet_addr (hostname);
|
||||
if ((int)addr == -1)
|
||||
{
|
||||
/* If it is not of that form, try to find it in the cache. */
|
||||
t = search_host (hlist, hostname);
|
||||
if (t)
|
||||
addr = (unsigned long)inet_addr (t->realname);
|
||||
}
|
||||
/* If we have the numeric address, just store it. */
|
||||
if ((int)addr != -1)
|
||||
{
|
||||
/* ADDR is in network byte order, meaning the code works on
|
||||
little and big endian 32-bit architectures without change.
|
||||
On big endian 64-bit architectures we need to be careful to
|
||||
copy the correct four bytes. */
|
||||
int offset = 0;
|
||||
/* ADDR is defined to be in network byte order, meaning the code
|
||||
works on little and big endian 32-bit architectures without
|
||||
change. On big endian 64-bit architectures we need to be
|
||||
careful to copy the correct four bytes. */
|
||||
int offset;
|
||||
have_addr:
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
offset = sizeof (unsigned long) - 4;
|
||||
#else
|
||||
offset = 0;
|
||||
#endif
|
||||
memcpy (where, (char *)&addr + offset, 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* By now we know that the address is not of the form d.d.d.d. Try
|
||||
to find it in our cache of host addresses. */
|
||||
addr_text = hash_table_get (host_name_address_map, hostname);
|
||||
if (addr_text)
|
||||
{
|
||||
DEBUGP (("Found %s in host_name_address_map: %s\n",
|
||||
hostname, addr_text));
|
||||
addr = (unsigned long)inet_addr (addr_text);
|
||||
goto have_addr;
|
||||
}
|
||||
|
||||
/* Maybe this host is known to us under another name. If so, we'll
|
||||
find it in host_slave_master_map, and use the master name to find
|
||||
its address in host_name_address_map. */
|
||||
canonical_name = hash_table_get (host_slave_master_map, hostname);
|
||||
if (canonical_name)
|
||||
{
|
||||
addr_text = hash_table_get (host_name_address_map, canonical_name);
|
||||
assert (addr_text != NULL);
|
||||
DEBUGP (("Found %s as slave of %s -> %s\n",
|
||||
hostname, canonical_name, addr_text));
|
||||
addr = (unsigned long)inet_addr (addr_text);
|
||||
goto have_addr;
|
||||
}
|
||||
|
||||
/* Since all else has failed, let's try gethostbyname(). Note that
|
||||
we use gethostbyname() rather than ngethostbyname(), because we
|
||||
*know* the address is not numerical. */
|
||||
already know that the address is not numerical. */
|
||||
hptr = gethostbyname (hostname);
|
||||
if (!hptr)
|
||||
return 0;
|
||||
/* Copy the address of the host to socket description. */
|
||||
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
|
||||
/* Now that we're here, we could as well cache the hostname for
|
||||
future use, as in realhost(). First, we have to look for it by
|
||||
address to know if it's already in the cache by another name. */
|
||||
assert (hptr->h_length == 4);
|
||||
|
||||
/* Now that we've gone through the truoble of calling
|
||||
gethostbyname(), we can store this valuable information to the
|
||||
cache. First, we have to look for it by address to know if it's
|
||||
already in the cache by another name. */
|
||||
/* Originally, we copied to in.s_addr, but it appears to be missing
|
||||
on some systems. */
|
||||
memcpy (&in, *hptr->h_addr_list, sizeof (in));
|
||||
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
|
||||
t = search_address (hlist, inet_s);
|
||||
if (t) /* Found in the list, as realname. */
|
||||
{
|
||||
/* Set the default, 0 quality. */
|
||||
hlist = add_hlist (hlist, hostname, inet_s, 0);
|
||||
return 1;
|
||||
}
|
||||
/* Since this is really the first time this host is encountered,
|
||||
set quality to 1. */
|
||||
hlist = add_hlist (hlist, hostname, inet_s, 1);
|
||||
inet_s = inet_ntoa (in);
|
||||
add_host_to_cache (hostname, inet_s);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Add a host to the host list. The list is sorted by addresses. For
|
||||
equal addresses, the entries with quality should bubble towards the
|
||||
beginning of the list. */
|
||||
static struct host *
|
||||
add_hlist (struct host *l, const char *nhost, const char *nreal, int quality)
|
||||
{
|
||||
struct host *t, *old, *beg;
|
||||
|
||||
/* The entry goes to the beginning of the list if the list is empty
|
||||
or the order requires it. */
|
||||
if (!l || (strcmp (nreal, l->realname) < 0))
|
||||
{
|
||||
t = (struct host *)xmalloc (sizeof (struct host));
|
||||
t->hostname = xstrdup (nhost);
|
||||
t->realname = xstrdup (nreal);
|
||||
t->quality = quality;
|
||||
t->next = l;
|
||||
return t;
|
||||
}
|
||||
|
||||
beg = l;
|
||||
/* Second two one-before-the-last element. */
|
||||
while (l->next)
|
||||
{
|
||||
int cmp;
|
||||
old = l;
|
||||
l = l->next;
|
||||
cmp = strcmp (nreal, l->realname);
|
||||
if (cmp >= 0)
|
||||
continue;
|
||||
/* If the next list element is greater than s, put s between the
|
||||
current and the next list element. */
|
||||
t = (struct host *)xmalloc (sizeof (struct host));
|
||||
old->next = t;
|
||||
t->next = l;
|
||||
t->hostname = xstrdup (nhost);
|
||||
t->realname = xstrdup (nreal);
|
||||
t->quality = quality;
|
||||
return beg;
|
||||
}
|
||||
t = (struct host *)xmalloc (sizeof (struct host));
|
||||
t->hostname = xstrdup (nhost);
|
||||
t->realname = xstrdup (nreal);
|
||||
t->quality = quality;
|
||||
/* Insert the new element after the last element. */
|
||||
l->next = t;
|
||||
t->next = NULL;
|
||||
return beg;
|
||||
}
|
||||
|
||||
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
|
||||
is referenced by more than one name, "real" name is considered to
|
||||
be the first one encountered in the past.
|
||||
|
||||
If the host cannot be found in the list of already dealt-with
|
||||
hosts, try with its INET address. If this fails too, add it to the
|
||||
list. The routine does not call gethostbyname twice for the same
|
||||
host if it can possibly avoid it. */
|
||||
be the first one encountered in the past. */
|
||||
char *
|
||||
realhost (const char *host)
|
||||
{
|
||||
struct host *l, *l_real;
|
||||
struct in_addr in;
|
||||
struct hostent *hptr;
|
||||
char *inet_s;
|
||||
char *master_name;
|
||||
|
||||
DEBUGP (("Checking for %s.\n", host));
|
||||
/* Look for the host, looking by the host name. */
|
||||
l = search_host (hlist, host);
|
||||
if (l && l->quality) /* Found it with quality */
|
||||
DEBUGP (("Checking for %s in host_name_address_map.\n", host));
|
||||
if (hash_table_exists (host_name_address_map, host))
|
||||
{
|
||||
DEBUGP (("%s was already used, by that name.\n", host));
|
||||
/* Here we return l->hostname, not host, because of the possible
|
||||
case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are
|
||||
the same, but we want the one that was first. */
|
||||
return xstrdup (l->hostname);
|
||||
DEBUGP (("Found; %s was already used, by that name.\n", host));
|
||||
return xstrdup_lower (host);
|
||||
}
|
||||
else if (!l) /* Not found, with or without quality */
|
||||
{
|
||||
/* The fact that gethostbyname will get called makes it
|
||||
necessary to store it to the list, to ensure that
|
||||
gethostbyname will not be called twice for the same string.
|
||||
However, the quality argument must be set appropriately.
|
||||
|
||||
Note that add_hlist must be called *after* the realname
|
||||
search, or the quality would be always set to 0 */
|
||||
DEBUGP (("This is the first time I hear about host %s by that name.\n",
|
||||
host));
|
||||
hptr = ngethostbyname (host);
|
||||
if (!hptr)
|
||||
return xstrdup (host);
|
||||
DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
|
||||
master_name = hash_table_get (host_slave_master_map, host);
|
||||
if (master_name)
|
||||
{
|
||||
has_master:
|
||||
DEBUGP (("Found; %s was already used, by the name %s.\n",
|
||||
host, master_name));
|
||||
return xstrdup (master_name);
|
||||
}
|
||||
|
||||
DEBUGP (("First time I hear about %s by that name; looking it up.\n",
|
||||
host));
|
||||
hptr = ngethostbyname (host);
|
||||
if (hptr)
|
||||
{
|
||||
char *inet_s;
|
||||
/* Originally, we copied to in.s_addr, but it appears to be
|
||||
missing on some systems. */
|
||||
missing on some systems. */
|
||||
memcpy (&in, *hptr->h_addr_list, sizeof (in));
|
||||
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
|
||||
}
|
||||
else /* Found, without quality */
|
||||
{
|
||||
/* This case happens when host is on the list,
|
||||
but not as first entry (the one with quality).
|
||||
Then we just get its INET address and pick
|
||||
up the first entry with quality. */
|
||||
DEBUGP (("We've dealt with host %s, but under the name %s.\n",
|
||||
host, l->realname));
|
||||
STRDUP_ALLOCA (inet_s, l->realname);
|
||||
inet_s = inet_ntoa (in);
|
||||
|
||||
add_host_to_cache (host, inet_s);
|
||||
|
||||
/* add_host_to_cache() can establish a slave-master mapping. */
|
||||
DEBUGP (("Checking again for %s in host_slave_master_map.\n", host));
|
||||
master_name = hash_table_get (host_slave_master_map, host);
|
||||
if (master_name)
|
||||
goto has_master;
|
||||
}
|
||||
|
||||
/* Now we certainly have the INET address. The following loop is
|
||||
guaranteed to pick either an entry with quality (because it is
|
||||
the first one), or none at all. */
|
||||
l_real = search_address (hlist, inet_s);
|
||||
if (l_real) /* Found in the list, as realname. */
|
||||
{
|
||||
if (!l)
|
||||
/* Set the default, 0 quality. */
|
||||
hlist = add_hlist (hlist, host, inet_s, 0);
|
||||
return xstrdup (l_real->hostname);
|
||||
}
|
||||
/* Since this is really the first time this host is encountered,
|
||||
set quality to 1. */
|
||||
hlist = add_hlist (hlist, host, inet_s, 1);
|
||||
return xstrdup (host);
|
||||
return xstrdup_lower (host);
|
||||
}
|
||||
|
||||
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
|
||||
@ -547,20 +500,23 @@ herrmsg (int error)
|
||||
return _("Unknown error");
|
||||
}
|
||||
|
||||
/* Clean the host list. This is a separate function, so we needn't
|
||||
export HLIST and its implementation. Ha! */
|
||||
void
|
||||
clean_hosts (void)
|
||||
{
|
||||
struct host *l = hlist;
|
||||
|
||||
while (l)
|
||||
{
|
||||
struct host *p = l->next;
|
||||
free (l->hostname);
|
||||
free (l->realname);
|
||||
free (l);
|
||||
l = p;
|
||||
}
|
||||
hlist = NULL;
|
||||
/* host_name_address_map and host_address_name_map share the
|
||||
strings. Because of that, calling free_keys_and_values once
|
||||
suffices for both. */
|
||||
free_keys_and_values (host_name_address_map);
|
||||
hash_table_destroy (host_name_address_map);
|
||||
hash_table_destroy (host_address_name_map);
|
||||
free_keys_and_values (host_slave_master_map);
|
||||
hash_table_destroy (host_slave_master_map);
|
||||
}
|
||||
|
||||
void
|
||||
host_init (void)
|
||||
{
|
||||
host_name_address_map = make_string_hash_table (0);
|
||||
host_address_name_map = make_string_hash_table (0);
|
||||
host_slave_master_map = make_string_hash_table (0);
|
||||
}
|
||||
|
856
src/html-parse.c
Normal file
856
src/html-parse.c
Normal file
@ -0,0 +1,856 @@
|
||||
/* HTML parser for Wget.
|
||||
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or (at
|
||||
your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
/* The only entry point to this module is map_html_tags(), which see. */
|
||||
|
||||
/* TODO:
|
||||
|
||||
- Allow hooks for callers to process contents outside tags. This
|
||||
is needed to implement handling <style> and <script>. The
|
||||
taginfo structure already carries the information about where the
|
||||
tags are, but this is not enough, because one would also want to
|
||||
skip the comments. (The funny thing is that for <style> and
|
||||
<script> you *don't* want to skip comments!)
|
||||
|
||||
- Create a test suite for regression testing. */
|
||||
|
||||
/* HISTORY:
|
||||
|
||||
This is the third HTML parser written for Wget. The first one was
|
||||
written some time during the Geturl 1.0 beta cycle, and was very
|
||||
inefficient and buggy. It also contained some very complex code to
|
||||
remember a list of parser states, because it was supposed to be
|
||||
reentrant. The idea was that several parsers would be running
|
||||
concurrently, and you'd have pass the function a unique ID string
|
||||
(for example, the URL) by which it found the relevant parser state
|
||||
and returned the next URL. Over-engineering at its best.
|
||||
|
||||
The second HTML parser was written for Wget 1.4 (the first version
|
||||
by the name `Wget'), and was a complete rewrite. Although the new
|
||||
parser behaved much better and made no claims of reentrancy, it
|
||||
still shared many of the fundamental flaws of the old version -- it
|
||||
only regarded HTML in terms tag-attribute pairs, where the
|
||||
attribute's value was a URL to be returned. Any other property of
|
||||
HTML, such as <base href=...>, or strange way to specify a URL,
|
||||
such as <meta http-equiv=Refresh content="0; URL=..."> had to be
|
||||
crudely hacked in -- and the caller had to be aware of these hacks.
|
||||
Like its predecessor, this parser did not support HTML comments.
|
||||
|
||||
After Wget 1.5.1 was released, I set out to write a third HTML
|
||||
parser. The objectives of the new parser were to: (1) provide a
|
||||
clean way to analyze HTML lexically, (2) separate interpretation of
|
||||
the markup from the parsing process, (3) be as correct as possible,
|
||||
e.g. correctly skipping comments and other SGML declarations, (4)
|
||||
understand the most common errors in markup and skip them or be
|
||||
relaxed towrds them, and (5) be reasonably efficient (no regexps,
|
||||
minimum copying and minimum or no heap allocation).
|
||||
|
||||
I believe this parser meets all of the above goals. It is
|
||||
reasonably well structured, and could be relatively easily
|
||||
separated from Wget and used elsewhere. While some of its
|
||||
intrinsic properties limit its value as a general-purpose HTML
|
||||
parser, I believe that, with minimum modifications, it could serve
|
||||
as a backend for one.
|
||||
|
||||
Due to time and other constraints, this parser was not integrated
|
||||
into Wget until the version ???. */
|
||||
|
||||
/* DESCRIPTION:
|
||||
|
||||
The single entry point of this parser is map_html_tags(), which
|
||||
works by calling a function you specify for each tag. The function
|
||||
gets called with the pointer to a structure describing the tag and
|
||||
its attributes. */
|
||||
|
||||
/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
|
||||
still need Wget headers to compile. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_STRING_H
|
||||
# include <string.h>
|
||||
#else
|
||||
# include <strings.h>
|
||||
#endif
|
||||
#include <assert.h>
|
||||
|
||||
#include "wget.h"
|
||||
#include "html-parse.h"
|
||||
|
||||
#ifdef STANDALONE
|
||||
# define xmalloc malloc
|
||||
# define xrealloc realloc
|
||||
#endif /* STANDALONE */
|
||||
|
||||
/* Pool support. For efficiency, map_html_tags() stores temporary
|
||||
string data to a single stack-allocated pool. If the pool proves
|
||||
too small, additional memory is allocated/resized with
|
||||
malloc()/realloc(). */
|
||||
|
||||
struct pool {
|
||||
char *contents; /* pointer to the contents. */
|
||||
int size; /* size of the pool. */
|
||||
int index; /* next unoccupied position in
|
||||
contents. */
|
||||
|
||||
int alloca_p; /* whether contents was allocated
|
||||
using alloca(). */
|
||||
char *orig_contents; /* orig_contents, allocated by
|
||||
alloca(). this is used by
|
||||
POOL_FREE to restore the pool to
|
||||
the "initial" state. */
|
||||
int orig_size;
|
||||
};
|
||||
|
||||
/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
|
||||
|
||||
#define POOL_INIT(pool, initial_size) do { \
|
||||
(pool).size = (initial_size); \
|
||||
(pool).contents = ALLOCA_ARRAY (char, (pool).size); \
|
||||
(pool).index = 0; \
|
||||
(pool).alloca_p = 1; \
|
||||
(pool).orig_contents = (pool).contents; \
|
||||
(pool).orig_size = (pool).size; \
|
||||
} while (0)
|
||||
|
||||
/* Grow the pool to accomodate at least SIZE new bytes. If the pool
|
||||
already has room to accomodate SIZE bytes of data, this is a no-op. */
|
||||
|
||||
#define POOL_GROW(pool, increase) do { \
|
||||
int PG_newsize = (pool).index + increase; \
|
||||
DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize, \
|
||||
(pool).alloca_p, char); \
|
||||
} while (0)
|
||||
|
||||
/* Append text in the range [beg, end) to POOL. No zero-termination
|
||||
is done. */
|
||||
|
||||
#define POOL_APPEND(pool, beg, end) do { \
|
||||
const char *PA_beg = beg; \
|
||||
int PA_size = end - PA_beg; \
|
||||
POOL_GROW (pool, PA_size); \
|
||||
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
|
||||
(pool).index += PA_size; \
|
||||
} while (0)
|
||||
|
||||
/* The same as the above, but with zero termination. */
|
||||
|
||||
#define POOL_APPEND_ZT(pool, beg, end) do { \
|
||||
const char *PA_beg = beg; \
|
||||
int PA_size = end - PA_beg; \
|
||||
POOL_GROW (pool, PA_size + 1); \
|
||||
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
|
||||
(pool).contents[(pool).index + PA_size] = '\0'; \
|
||||
(pool).index += PA_size + 1; \
|
||||
} while (0)
|
||||
|
||||
/* Forget old pool contents. The allocated memory is not freed. */
|
||||
#define POOL_REWIND(pool) pool.index = 0
|
||||
|
||||
/* Free heap-allocated memory for contents of POOL. This calls free()
|
||||
if the memory was allocated through malloc. It also restores
|
||||
`contents' and `size' to their original, pre-malloc values. That
|
||||
way after POOL_FREE, the pool is fully usable, just as if it were
|
||||
freshly initialized with POOL_INIT. */
|
||||
|
||||
#define POOL_FREE(pool) do { \
|
||||
if (!(pool).alloca_p) \
|
||||
free ((pool).contents); \
|
||||
(pool).contents = (pool).orig_contents; \
|
||||
(pool).size = (pool).orig_size; \
|
||||
(pool).index = 0; \
|
||||
(pool).alloca_p = 1; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define AP_DOWNCASE 1
|
||||
#define AP_PROCESS_ENTITIES 2
|
||||
#define AP_SKIP_BLANKS 4
|
||||
|
||||
/* Copy the text in the range [BEG, END) to POOL, optionally
|
||||
performing operations specified by FLAGS. FLAGS may be any
|
||||
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
|
||||
with the following meaning:
|
||||
|
||||
* AP_DOWNCASE -- downcase all the letters;
|
||||
|
||||
* AP_PROCESS_ENTITIES -- process the SGML entities and write out
|
||||
the decoded string. Recognized entities are <, >, &, ",
|
||||
  and the numerical entities.
|
||||
|
||||
* AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
|
||||
of text. */
|
||||
static void
|
||||
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
|
||||
{
|
||||
int old_index = pool->index;
|
||||
int size;
|
||||
|
||||
/* First, skip blanks if required. We must do this before entities
|
||||
are processed, so that blanks can still be inserted as, for
|
||||
instance, ` '. */
|
||||
if (flags & AP_SKIP_BLANKS)
|
||||
{
|
||||
while (beg < end && ISSPACE (*beg))
|
||||
++beg;
|
||||
while (end > beg && ISSPACE (end[-1]))
|
||||
--end;
|
||||
}
|
||||
size = end - beg;
|
||||
|
||||
if (flags & AP_PROCESS_ENTITIES)
|
||||
{
|
||||
/* Stack-allocate a copy of text, process entities and copy it
|
||||
to the pool. */
|
||||
char *local_copy = (char *)alloca (size + 1);
|
||||
const char *from = beg;
|
||||
char *to = local_copy;
|
||||
|
||||
while (from < end)
|
||||
{
|
||||
if (*from != '&')
|
||||
*to++ = *from++;
|
||||
else
|
||||
{
|
||||
const char *save = from;
|
||||
int remain;
|
||||
|
||||
if (++from == end) goto lose;
|
||||
remain = end - from;
|
||||
|
||||
if (*from == '#')
|
||||
{
|
||||
int numeric;
|
||||
++from;
|
||||
if (from == end || !ISDIGIT (*from)) goto lose;
|
||||
for (numeric = 0; from < end && ISDIGIT (*from); from++)
|
||||
numeric = 10 * numeric + (*from) - '0';
|
||||
if (from < end && ISALPHA (*from)) goto lose;
|
||||
numeric &= 0xff;
|
||||
*to++ = numeric;
|
||||
}
|
||||
#define FROB(x) (remain >= (sizeof (x) - 1) \
|
||||
&& !memcmp (from, x, sizeof (x) - 1) \
|
||||
&& (*(from + sizeof (x) - 1) == ';' \
|
||||
|| remain == sizeof (x) - 1 \
|
||||
|| !ISALNUM (*(from + sizeof (x) - 1))))
|
||||
else if (FROB ("lt"))
|
||||
*to++ = '<', from += 2;
|
||||
else if (FROB ("gt"))
|
||||
*to++ = '>', from += 2;
|
||||
else if (FROB ("amp"))
|
||||
*to++ = '&', from += 3;
|
||||
else if (FROB ("quot"))
|
||||
*to++ = '\"', from += 4;
|
||||
/* We don't implement the proposed "Added Latin 1"
|
||||
entities (except for nbsp), because it is unnecessary
|
||||
in the context of Wget, and would require hashing to
|
||||
work efficiently. */
|
||||
else if (FROB ("nbsp"))
|
||||
*to++ = 160, from += 4;
|
||||
else
|
||||
goto lose;
|
||||
#undef FROB
|
||||
/* If the entity was followed by `;', we step over the
|
||||
`;'. Otherwise, it was followed by either a
|
||||
non-alphanumeric or EOB, in which case we do nothing. */
|
||||
if (from < end && *from == ';')
|
||||
++from;
|
||||
continue;
|
||||
|
||||
lose:
|
||||
/* This was not an entity after all. Back out. */
|
||||
from = save;
|
||||
*to++ = *from++;
|
||||
}
|
||||
}
|
||||
*to++ = '\0';
|
||||
POOL_APPEND (*pool, local_copy, to);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Just copy the text to the pool. */
|
||||
POOL_APPEND_ZT (*pool, beg, end);
|
||||
}
|
||||
|
||||
if (flags & AP_DOWNCASE)
|
||||
{
|
||||
char *p = pool->contents + old_index;
|
||||
for (; *p; p++)
|
||||
*p = TOLOWER (*p);
|
||||
}
|
||||
}
|
||||
|
||||
/* Check whether the contents of [POS, POS+LENGTH) match any of the
|
||||
strings in the ARRAY. */
|
||||
static int
|
||||
array_allowed (const char **array, const char *beg, const char *end)
|
||||
{
|
||||
int length = end - beg;
|
||||
if (array)
|
||||
{
|
||||
for (; *array; array++)
|
||||
if (length >= strlen (*array)
|
||||
&& !strncasecmp (*array, beg, length))
|
||||
break;
|
||||
if (!*array)
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* RFC1866: name [of attribute or tag] consists of letters, digits,
|
||||
periods, or hyphens. We also allow _, for compatibility with
|
||||
brain-damaged generators. */
|
||||
#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
|
||||
|
||||
/* States while advancing through comments. */
|
||||
#define AC_S_DONE 0
|
||||
#define AC_S_BACKOUT 1
|
||||
#define AC_S_BANG 2
|
||||
#define AC_S_DEFAULT 3
|
||||
#define AC_S_DCLNAME 4
|
||||
#define AC_S_DASH1 5
|
||||
#define AC_S_DASH2 6
|
||||
#define AC_S_COMMENT 7
|
||||
#define AC_S_DASH3 8
|
||||
#define AC_S_DASH4 9
|
||||
#define AC_S_QUOTE1 10
|
||||
#define AC_S_IN_QUOTE 11
|
||||
#define AC_S_QUOTE2 12
|
||||
|
||||
#ifdef STANDALONE
|
||||
static int comment_backout_count;
|
||||
#endif
|
||||
|
||||
/* Advance over an SGML declaration (the <!...> forms you find in HTML
|
||||
documents). The function returns the location after the
|
||||
declaration. The reason we need this is that HTML comments are
|
||||
expressed as comments in so-called "empty declarations".
|
||||
|
||||
To recap: any SGML declaration may have comments associated with
|
||||
it, e.g.
|
||||
<!MY-DECL -- isn't this fun? -- foo bar>
|
||||
|
||||
An HTML comment is merely an empty declaration (<!>) with a comment
|
||||
attached, like this:
|
||||
<!-- some stuff here -->
|
||||
|
||||
Several comments may be embedded in one comment declaration:
|
||||
<!-- have -- -- fun -->
|
||||
|
||||
Whitespace is allowed between and after the comments, but not
|
||||
before the first comment.
|
||||
|
||||
Additionally, this function attempts to handle double quotes in
|
||||
SGML declarations correctly. */
|
||||
static const char *
|
||||
advance_declaration (const char *beg, const char *end)
|
||||
{
|
||||
const char *p = beg;
|
||||
char quote_char = '\0'; /* shut up, gcc! */
|
||||
char ch;
|
||||
int state = AC_S_BANG;
|
||||
|
||||
if (beg == end)
|
||||
return beg;
|
||||
ch = *p++;
|
||||
|
||||
/* It looked like a good idea to write this as a state machine, but
|
||||
now I wonder... */
|
||||
|
||||
while (state != AC_S_DONE && state != AC_S_BACKOUT)
|
||||
{
|
||||
if (p == end)
|
||||
state = AC_S_BACKOUT;
|
||||
switch (state)
|
||||
{
|
||||
case AC_S_DONE:
|
||||
case AC_S_BACKOUT:
|
||||
break;
|
||||
case AC_S_BANG:
|
||||
if (ch == '!')
|
||||
{
|
||||
ch = *p++;
|
||||
state = AC_S_DEFAULT;
|
||||
}
|
||||
else
|
||||
state = AC_S_BACKOUT;
|
||||
break;
|
||||
case AC_S_DEFAULT:
|
||||
switch (ch)
|
||||
{
|
||||
case '-':
|
||||
state = AC_S_DASH1;
|
||||
break;
|
||||
case ' ':
|
||||
case '\t':
|
||||
case '\r':
|
||||
case '\n':
|
||||
ch = *p++;
|
||||
break;
|
||||
case '>':
|
||||
state = AC_S_DONE;
|
||||
break;
|
||||
case '\'':
|
||||
case '\"':
|
||||
state = AC_S_QUOTE1;
|
||||
break;
|
||||
default:
|
||||
if (NAME_CHAR_P (ch))
|
||||
state = AC_S_DCLNAME;
|
||||
else
|
||||
state = AC_S_BACKOUT;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case AC_S_DCLNAME:
|
||||
if (NAME_CHAR_P (ch))
|
||||
ch = *p++;
|
||||
else if (ch == '-')
|
||||
state = AC_S_DASH1;
|
||||
else
|
||||
state = AC_S_DEFAULT;
|
||||
break;
|
||||
case AC_S_QUOTE1:
|
||||
assert (ch == '\'' || ch == '\"');
|
||||
quote_char = ch; /* cheating -- I really don't feel like
|
||||
introducing more different states for
|
||||
different quote characters. */
|
||||
ch = *p++;
|
||||
state = AC_S_IN_QUOTE;
|
||||
break;
|
||||
case AC_S_IN_QUOTE:
|
||||
if (ch == quote_char)
|
||||
state = AC_S_QUOTE2;
|
||||
else
|
||||
ch = *p++;
|
||||
break;
|
||||
case AC_S_QUOTE2:
|
||||
assert (ch == quote_char);
|
||||
ch = *p++;
|
||||
state = AC_S_DEFAULT;
|
||||
break;
|
||||
case AC_S_DASH1:
|
||||
assert (ch == '-');
|
||||
ch = *p++;
|
||||
state = AC_S_DASH2;
|
||||
break;
|
||||
case AC_S_DASH2:
|
||||
switch (ch)
|
||||
{
|
||||
case '-':
|
||||
ch = *p++;
|
||||
state = AC_S_COMMENT;
|
||||
break;
|
||||
default:
|
||||
state = AC_S_BACKOUT;
|
||||
}
|
||||
break;
|
||||
case AC_S_COMMENT:
|
||||
switch (ch)
|
||||
{
|
||||
case '-':
|
||||
state = AC_S_DASH3;
|
||||
break;
|
||||
default:
|
||||
ch = *p++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case AC_S_DASH3:
|
||||
assert (ch == '-');
|
||||
ch = *p++;
|
||||
state = AC_S_DASH4;
|
||||
break;
|
||||
case AC_S_DASH4:
|
||||
switch (ch)
|
||||
{
|
||||
case '-':
|
||||
ch = *p++;
|
||||
state = AC_S_DEFAULT;
|
||||
break;
|
||||
default:
|
||||
state = AC_S_COMMENT;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (state == AC_S_BACKOUT)
|
||||
{
|
||||
#ifdef STANDALONE
|
||||
++comment_backout_count;
|
||||
#endif
|
||||
return beg + 1;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Advance P (a char pointer), with the explicit intent of being able
|
||||
to read the next character. If this is not possible, go to finish. */
|
||||
|
||||
#define ADVANCE(p) do { \
|
||||
++p; \
|
||||
if (p >= end) \
|
||||
goto finish; \
|
||||
} while (0)
|
||||
|
||||
/* Skip whitespace, if any. */
|
||||
|
||||
#define SKIP_WS(p) do { \
|
||||
while (ISSPACE (*p)) { \
|
||||
ADVANCE (p); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* Skip non-whitespace, if any. */
|
||||
|
||||
#define SKIP_NON_WS(p) do { \
|
||||
while (!ISSPACE (*p)) { \
|
||||
ADVANCE (p); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef STANDALONE
|
||||
static int tag_backout_count;
|
||||
#endif
|
||||
|
||||
/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
|
||||
MAPFUN will be called with two arguments: pointer to an initialized
|
||||
struct taginfo, and CLOSURE.
|
||||
|
||||
ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
|
||||
be processed by this function. If it is NULL, all the tags are
|
||||
allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
|
||||
|
||||
(Obviously, the caller can filter out unwanted tags and attributes
|
||||
just as well, but this is just an optimization designed to avoid
|
||||
unnecessary copying for tags/attributes which the caller doesn't
|
||||
want to know about. These lists are searched linearly; therefore,
|
||||
if you're interested in a large number of tags or attributes, you'd
|
||||
better set these to NULL and filter them out yourself with a
|
||||
hashing process most appropriate for your application.) */
|
||||
|
||||
void
|
||||
map_html_tags (const char *text, int size,
|
||||
const char **allowed_tag_names,
|
||||
const char **allowed_attribute_names,
|
||||
void (*mapfun) (struct taginfo *, void *),
|
||||
void *closure)
|
||||
{
|
||||
const char *p = text;
|
||||
const char *end = text + size;
|
||||
|
||||
int attr_pair_count = 8;
|
||||
int attr_pair_alloca_p = 1;
|
||||
struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
|
||||
struct pool pool;
|
||||
|
||||
if (!size)
|
||||
return;
|
||||
|
||||
POOL_INIT (pool, 256);
|
||||
|
||||
{
|
||||
int nattrs, end_tag;
|
||||
const char *tag_name_begin, *tag_name_end;
|
||||
const char *tag_start_position;
|
||||
int uninteresting_tag;
|
||||
|
||||
look_for_tag:
|
||||
POOL_REWIND (pool);
|
||||
|
||||
nattrs = 0;
|
||||
end_tag = 0;
|
||||
|
||||
/* Find beginning of tag. We use memchr() instead of the usual
|
||||
looping with ADVANCE() for speed. */
|
||||
p = memchr (p, '<', end - p);
|
||||
if (!p)
|
||||
goto finish;
|
||||
|
||||
tag_start_position = p;
|
||||
ADVANCE (p);
|
||||
|
||||
/* Establish the type of the tag (start-tag, end-tag or
|
||||
declaration). */
|
||||
if (*p == '!')
|
||||
{
|
||||
/* This is an SGML declaration -- just skip it. */
|
||||
p = advance_declaration (p, end);
|
||||
if (p == end)
|
||||
goto finish;
|
||||
goto look_for_tag;
|
||||
}
|
||||
else if (*p == '/')
|
||||
{
|
||||
end_tag = 1;
|
||||
ADVANCE (p);
|
||||
}
|
||||
tag_name_begin = p;
|
||||
while (NAME_CHAR_P (*p))
|
||||
ADVANCE (p);
|
||||
if (p == tag_name_begin)
|
||||
goto look_for_tag;
|
||||
tag_name_end = p;
|
||||
SKIP_WS (p);
|
||||
if (end_tag && *p != '>')
|
||||
goto backout_tag;
|
||||
|
||||
if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
|
||||
/* We can't just say "goto look_for_tag" here because we need
|
||||
the loop below to properly advance over the tag's attributes. */
|
||||
uninteresting_tag = 1;
|
||||
else
|
||||
{
|
||||
uninteresting_tag = 0;
|
||||
convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
|
||||
}
|
||||
|
||||
/* Find the attributes. */
|
||||
while (1)
|
||||
{
|
||||
const char *attr_name_begin, *attr_name_end;
|
||||
const char *attr_value_begin, *attr_value_end;
|
||||
const char *attr_raw_value_begin, *attr_raw_value_end;
|
||||
int operation = AP_DOWNCASE; /* stupid compiler. */
|
||||
|
||||
SKIP_WS (p);
|
||||
|
||||
/* Check for end of tag definition. */
|
||||
if (*p == '>')
|
||||
break;
|
||||
|
||||
/* Establish bounds of attribute name. */
|
||||
attr_name_begin = p; /* <foo bar ...> */
|
||||
/* ^ */
|
||||
while (NAME_CHAR_P (*p))
|
||||
ADVANCE (p);
|
||||
attr_name_end = p; /* <foo bar ...> */
|
||||
/* ^ */
|
||||
if (attr_name_begin == attr_name_end)
|
||||
goto backout_tag;
|
||||
|
||||
/* Establish bounds of attribute value. */
|
||||
SKIP_WS (p);
|
||||
if (NAME_CHAR_P (*p) || *p == '>')
|
||||
{
|
||||
/* Minimized attribute syntax allows `=' to be omitted.
|
||||
For example, <UL COMPACT> is a valid shorthand for <UL
|
||||
COMPACT="compact">. Even if such attributes are not
|
||||
useful to Wget, we need to support them, so that the
|
||||
tags containing them can be parsed correctly. */
|
||||
attr_raw_value_begin = attr_value_begin = attr_name_begin;
|
||||
attr_raw_value_end = attr_value_end = attr_name_end;
|
||||
}
|
||||
else if (*p == '=')
|
||||
{
|
||||
ADVANCE (p);
|
||||
SKIP_WS (p);
|
||||
if (*p == '\"' || *p == '\'')
|
||||
{
|
||||
int newline_seen = 0;
|
||||
char quote_char = *p;
|
||||
attr_raw_value_begin = p;
|
||||
ADVANCE (p);
|
||||
attr_value_begin = p; /* <foo bar="baz"> */
|
||||
/* ^ */
|
||||
while (*p != quote_char)
|
||||
{
|
||||
if (!newline_seen && *p == '\n')
|
||||
{
|
||||
/* If a newline is seen within the quotes, it
|
||||
is most likely that someone forgot to close
|
||||
the quote. In that case, we back out to
|
||||
the value beginning, and terminate the tag
|
||||
at either `>' or the delimiter, whichever
|
||||
comes first. Such a tag terminated at `>'
|
||||
is discarded. */
|
||||
p = attr_value_begin;
|
||||
newline_seen = 1;
|
||||
continue;
|
||||
}
|
||||
else if (newline_seen && *p == '>')
|
||||
break;
|
||||
ADVANCE (p);
|
||||
}
|
||||
attr_value_end = p; /* <foo bar="baz"> */
|
||||
/* ^ */
|
||||
if (*p == quote_char)
|
||||
ADVANCE (p);
|
||||
else
|
||||
goto look_for_tag;
|
||||
attr_raw_value_end = p; /* <foo bar="baz"> */
|
||||
/* ^ */
|
||||
/* The AP_SKIP_BLANKS part is not entirely correct,
|
||||
because we don't want to skip blanks for all the
|
||||
attribute values. */
|
||||
operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
|
||||
}
|
||||
else
|
||||
{
|
||||
attr_value_begin = p; /* <foo bar=baz> */
|
||||
/* ^ */
|
||||
/* According to SGML, a name token should consist only
|
||||
of alphanumerics, . and -. However, this is often
|
||||
violated by, for instance, `%' in `width=75%'.
|
||||
We'll be liberal and allow just about anything as
|
||||
an attribute value. */
|
||||
while (!ISSPACE (*p) && *p != '>')
|
||||
ADVANCE (p);
|
||||
attr_value_end = p; /* <foo bar=baz qux=quix> */
|
||||
/* ^ */
|
||||
if (attr_value_begin == attr_value_end)
|
||||
/* <foo bar=> */
|
||||
/* ^ */
|
||||
goto backout_tag;
|
||||
attr_raw_value_begin = attr_value_begin;
|
||||
attr_raw_value_end = attr_value_end;
|
||||
operation = AP_PROCESS_ENTITIES;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* We skipped the whitespace and found something that is
|
||||
neither `=' nor the beginning of the next attribute's
|
||||
name. Back out. */
|
||||
goto backout_tag; /* <foo bar /... */
|
||||
/* ^ */
|
||||
}
|
||||
|
||||
/* If we're not interested in the tag, don't bother with any
|
||||
of the attributes. */
|
||||
if (uninteresting_tag)
|
||||
continue;
|
||||
|
||||
/* If we aren't interested in the attribute, skip it. We
|
||||
cannot do this test any sooner, because our text pointer
|
||||
needs to correctly advance over the attribute. */
|
||||
if (allowed_attribute_names
|
||||
&& !array_allowed (allowed_attribute_names, attr_name_begin,
|
||||
attr_name_end))
|
||||
continue;
|
||||
|
||||
DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
|
||||
attr_pair_alloca_p, struct attr_pair);
|
||||
|
||||
pairs[nattrs].name_pool_index = pool.index;
|
||||
convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
|
||||
|
||||
pairs[nattrs].value_pool_index = pool.index;
|
||||
convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
|
||||
pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
|
||||
pairs[nattrs].value_raw_size = (attr_raw_value_end
|
||||
- attr_raw_value_begin);
|
||||
++nattrs;
|
||||
}
|
||||
|
||||
if (uninteresting_tag)
|
||||
{
|
||||
ADVANCE (p);
|
||||
goto look_for_tag;
|
||||
}
|
||||
|
||||
/* By now, we have a valid tag with a name and zero or more
|
||||
attributes. Fill in the data and call the mapper function. */
|
||||
{
|
||||
int i;
|
||||
struct taginfo taginfo;
|
||||
|
||||
taginfo.name = pool.contents;
|
||||
taginfo.end_tag_p = end_tag;
|
||||
taginfo.nattrs = nattrs;
|
||||
/* We fill in the char pointers only now, when pool can no
|
||||
longer get realloc'ed. If we did that above, we could get
|
||||
hosed by reallocation. Obviously, after this point, the pool
|
||||
may no longer be grown. */
|
||||
for (i = 0; i < nattrs; i++)
|
||||
{
|
||||
pairs[i].name = pool.contents + pairs[i].name_pool_index;
|
||||
pairs[i].value = pool.contents + pairs[i].value_pool_index;
|
||||
}
|
||||
taginfo.attrs = pairs;
|
||||
taginfo.start_position = tag_start_position;
|
||||
taginfo.end_position = p + 1;
|
||||
/* Ta-dam! */
|
||||
(*mapfun) (&taginfo, closure);
|
||||
ADVANCE (p);
|
||||
}
|
||||
goto look_for_tag;
|
||||
|
||||
backout_tag:
|
||||
#ifdef STANDALONE
|
||||
++tag_backout_count;
|
||||
#endif
|
||||
/* The tag wasn't really a tag. Treat its contents as ordinary
|
||||
data characters. */
|
||||
p = tag_start_position + 1;
|
||||
goto look_for_tag;
|
||||
}
|
||||
|
||||
finish:
|
||||
POOL_FREE (pool);
|
||||
if (!attr_pair_alloca_p)
|
||||
free (pairs);
|
||||
}
|
||||
|
||||
#undef ADVANCE
|
||||
#undef SKIP_WS
|
||||
#undef SKIP_NON_WS
|
||||
|
||||
#ifdef STANDALONE
|
||||
static void
|
||||
test_mapper (struct taginfo *taginfo, void *arg)
|
||||
{
|
||||
int i;
|
||||
|
||||
printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
|
||||
for (i = 0; i < taginfo->nattrs; i++)
|
||||
printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
|
||||
putchar ('\n');
|
||||
++*(int *)arg;
|
||||
}
|
||||
|
||||
int main ()
|
||||
{
|
||||
int size = 256;
|
||||
char *x = (char *)xmalloc (size);
|
||||
int length = 0;
|
||||
int read_count;
|
||||
int tag_counter = 0;
|
||||
|
||||
while ((read_count = fread (x + length, 1, size - length, stdin)))
|
||||
{
|
||||
length += read_count;
|
||||
size <<= 1;
|
||||
x = (char *)xrealloc (x, size);
|
||||
}
|
||||
|
||||
map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
|
||||
printf ("TAGS: %d\n", tag_counter);
|
||||
printf ("Tag backouts: %d\n", tag_backout_count);
|
||||
printf ("Comment backouts: %d\n", comment_backout_count);
|
||||
return 0;
|
||||
}
|
||||
#endif /* STANDALONE */
|
44
src/html-parse.h
Normal file
44
src/html-parse.h
Normal file
@ -0,0 +1,44 @@
|
||||
/* Declarations for html-parse.c.
|
||||
Copyright (C) 1998 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
/* One name/value attribute pair extracted from an HTML tag.  NAME and
   VALUE point into the parser's string pool; the *_raw_* members point
   back into the original document text.  */
struct attr_pair {
  char *name;                   /* attribute name */
  char *value;                  /* attribute value */

  /* Where the value sits in the source document, quotes and all --
     needed when converting links in place.  */
  const char *value_raw_beginning;
  int value_raw_size;

  /* Internal bookkeeping for map_html_tags: offsets into the string
     pool, resolved into NAME/VALUE once the pool stops growing.  */
  int name_pool_index, value_pool_index;
};
|
||||
|
||||
/* Everything the parser knows about one parsed tag; an instance of
   this is handed to the mapper callback by map_html_tags.  */
struct taginfo {
  char *name;                   /* tag name */
  int end_tag_p;                /* non-zero for an end-tag (</foo>) */
  int nattrs;                   /* number of attributes */
  struct attr_pair *attrs;      /* the attributes themselves */

  const char *start_position;   /* where the tag begins in the source */
  const char *end_position;     /* where the tag ends in the source */
};
|
||||
|
||||
void map_html_tags PARAMS ((const char *, int, const char **, const char **,
|
||||
void (*) (struct taginfo *, void *), void *));
|
569
src/html-url.c
Normal file
569
src/html-url.c
Normal file
@ -0,0 +1,569 @@
|
||||
/* Collect URLs from HTML source.
|
||||
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of Wget.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STRING_H
|
||||
# include <string.h>
|
||||
#else
|
||||
# include <strings.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "wget.h"
|
||||
#include "html-parse.h"
|
||||
#include "url.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifndef errno
|
||||
extern int errno;
|
||||
#endif
|
||||
|
||||
enum tag_category { TC_LINK, TC_SPEC };
|
||||
|
||||
/* Here we try to categorize the known tags. Each tag has its ID and
|
||||
cetegory. Category TC_LINK means that one or more of its
|
||||
attributes contain links that should be retrieved. TC_SPEC means
|
||||
that the tag is specific in some way, and has to be handled
|
||||
specially. */
|
||||
static struct {
|
||||
const char *name;
|
||||
enum tag_category category;
|
||||
} known_tags[] = {
|
||||
#define TAG_A 0
|
||||
{ "a", TC_LINK },
|
||||
#define TAG_APPLET 1
|
||||
{ "applet", TC_LINK },
|
||||
#define TAG_AREA 2
|
||||
{ "area", TC_LINK },
|
||||
#define TAG_BASE 3
|
||||
{ "base", TC_SPEC },
|
||||
#define TAG_BGSOUND 4
|
||||
{ "bgsound", TC_LINK },
|
||||
#define TAG_BODY 5
|
||||
{ "body", TC_LINK },
|
||||
#define TAG_EMBED 6
|
||||
{ "embed", TC_LINK },
|
||||
#define TAG_FIG 7
|
||||
{ "fig", TC_LINK },
|
||||
#define TAG_FRAME 8
|
||||
{ "frame", TC_LINK },
|
||||
#define TAG_IFRAME 9
|
||||
{ "iframe", TC_LINK },
|
||||
#define TAG_IMG 10
|
||||
{ "img", TC_LINK },
|
||||
#define TAG_INPUT 11
|
||||
{ "input", TC_LINK },
|
||||
#define TAG_LAYER 12
|
||||
{ "layer", TC_LINK },
|
||||
#define TAG_LINK 13
|
||||
{ "link", TC_SPEC },
|
||||
#define TAG_META 14
|
||||
{ "meta", TC_SPEC },
|
||||
#define TAG_OVERLAY 15
|
||||
{ "overlay", TC_LINK },
|
||||
#define TAG_SCRIPT 16
|
||||
{ "script", TC_LINK },
|
||||
#define TAG_TABLE 17
|
||||
{ "table", TC_LINK },
|
||||
#define TAG_TD 18
|
||||
{ "td", TC_LINK },
|
||||
#define TAG_TH 19
|
||||
{ "th", TC_LINK }
|
||||
};
|
||||
|
||||
/* Flags for specific url-attr pairs handled through TC_LINK: */
|
||||
#define AF_EXTERNAL 1
|
||||
|
||||
/* For tags handled by TC_LINK: attributes that contain URLs to
|
||||
download. */
|
||||
static struct {
|
||||
int tagid;
|
||||
const char *attr_name;
|
||||
int flags;
|
||||
} url_tag_attr_map[] = {
|
||||
{ TAG_A, "href", AF_EXTERNAL },
|
||||
{ TAG_APPLET, "code", 0 },
|
||||
{ TAG_AREA, "href", AF_EXTERNAL },
|
||||
{ TAG_BGSOUND, "src", 0 },
|
||||
{ TAG_BODY, "background", 0 },
|
||||
{ TAG_EMBED, "src", 0 },
|
||||
{ TAG_FIG, "src", 0 },
|
||||
{ TAG_FRAME, "src", 0 },
|
||||
{ TAG_IFRAME, "src", 0 },
|
||||
{ TAG_IMG, "href", 0 },
|
||||
{ TAG_IMG, "lowsrc", 0 },
|
||||
{ TAG_IMG, "src", 0 },
|
||||
{ TAG_INPUT, "src", 0 },
|
||||
{ TAG_LAYER, "src", 0 },
|
||||
{ TAG_OVERLAY, "src", 0 },
|
||||
{ TAG_SCRIPT, "src", 0 },
|
||||
{ TAG_TABLE, "background", 0 },
|
||||
{ TAG_TD, "background", 0 },
|
||||
{ TAG_TH, "background", 0 }
|
||||
};
|
||||
|
||||
/* The lists of interesting tags and attributes are built dynamically,
|
||||
from the information above. However, some places in the code refer
|
||||
to the attributes not mentioned here. We add them manually. */
|
||||
/* Attributes the code inspects that cannot be derived from
   url_tag_attr_map; init_interesting merges these into the
   interesting-attribute list by hand.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by TAG_LINK */
  "http-equiv",                 /* used by TAG_META */
  "name",                       /* used by TAG_META */
  "content"                     /* used by TAG_META */
};

/* NULL-terminated lists handed to map_html_tags; built lazily by
   init_interesting on first use.  */
static const char **interesting_tags;
static const char **interesting_attributes;
|
||||
|
||||
void
|
||||
init_interesting (void)
|
||||
{
|
||||
/* Init the variables interesting_tags and interesting_attributes
|
||||
that are used by the HTML parser to know which tags and
|
||||
attributes we're interested in. We initialize this only once,
|
||||
for performance reasons.
|
||||
|
||||
Here we also make sure that what we put in interesting_tags
|
||||
matches the user's preferences as specified through --ignore-tags
|
||||
and --follow-tags. */
|
||||
|
||||
{
|
||||
int i, ind = 0;
|
||||
int size = ARRAY_SIZE (known_tags);
|
||||
interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
|
||||
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
const char *name = known_tags[i].name;
|
||||
|
||||
/* Normally here we could say:
|
||||
interesting_tags[i] = name;
|
||||
But we need to respect the settings of --ignore-tags and
|
||||
--follow-tags, so the code gets a bit harier. */
|
||||
|
||||
if (opt.ignore_tags)
|
||||
{
|
||||
/* --ignore-tags was specified. Do not match these
|
||||
specific tags. --ignore-tags takes precedence over
|
||||
--follow-tags, so we process --ignore first and fall
|
||||
through if there's no match. */
|
||||
int j, lose = 0;
|
||||
for (j = 0; opt.ignore_tags[j] != NULL; j++)
|
||||
/* Loop through all the tags this user doesn't care
|
||||
about. */
|
||||
if (strcasecmp(opt.ignore_tags[j], name) == EQ)
|
||||
{
|
||||
lose = 1;
|
||||
break;
|
||||
}
|
||||
if (lose)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (opt.follow_tags)
|
||||
{
|
||||
/* --follow-tags was specified. Only match these specific
|
||||
tags, so return FALSE if we don't match one of them. */
|
||||
int j, win = 0;
|
||||
for (j = 0; opt.follow_tags[j] != NULL; j++)
|
||||
/* Loop through all the tags this user cares about. */
|
||||
if (strcasecmp(opt.follow_tags[j], name) == EQ)
|
||||
{
|
||||
win = 1;
|
||||
break;
|
||||
}
|
||||
if (!win)
|
||||
continue; /* wasn't one of the explicitly
|
||||
desired tags */
|
||||
}
|
||||
|
||||
/* If we get to here, --follow-tags isn't being used or the
|
||||
tag is among the ones that are follwed, and --ignore-tags,
|
||||
if specified, didn't include this tag, so it's an
|
||||
"interesting" one. */
|
||||
interesting_tags[ind++] = name;
|
||||
}
|
||||
interesting_tags[ind] = NULL;
|
||||
}
|
||||
|
||||
/* The same for attributes, except we loop through url_tag_attr_map.
|
||||
Here we also need to make sure that the list of attributes is
|
||||
unique, and to include the attributes from additional_attributes. */
|
||||
{
|
||||
int i, ind;
|
||||
const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
|
||||
* sizeof (char *));
|
||||
/* First copy the "additional" attributes. */
|
||||
for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
|
||||
att[i] = additional_attributes[i];
|
||||
ind = i;
|
||||
att[ind] = NULL;
|
||||
for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
|
||||
{
|
||||
int j, seen = 0;
|
||||
const char *look_for = url_tag_attr_map[i].attr_name;
|
||||
for (j = 0; j < ind - 1; j++)
|
||||
if (!strcmp (att[j], look_for))
|
||||
{
|
||||
seen = 1;
|
||||
break;
|
||||
}
|
||||
if (!seen)
|
||||
{
|
||||
att = xrealloc (att, (ind + 2) * sizeof (*att));
|
||||
att[ind++] = look_for;
|
||||
att[ind] = NULL;
|
||||
}
|
||||
}
|
||||
interesting_attributes = att;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
find_tag (const char *tag_name)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* This is linear search; if the number of tags grow, we can switch
|
||||
to binary search. */
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE (known_tags); i++)
|
||||
{
|
||||
int cmp = strcasecmp (known_tags[i].name, tag_name);
|
||||
/* known_tags are sorted alphabetically, so we can
|
||||
micro-optimize. */
|
||||
if (cmp > 0)
|
||||
break;
|
||||
else if (cmp == 0)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Find the value of attribute named NAME in the taginfo TAG. If the
|
||||
attribute is not present, return NULL. If ATTRID is non-NULL, the
|
||||
exact identity of the attribute will be returned. */
|
||||
static char *
|
||||
find_attr (struct taginfo *tag, const char *name, int *attrid)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < tag->nattrs; i++)
|
||||
if (!strcasecmp (tag->attrs[i].name, name))
|
||||
{
|
||||
if (attrid)
|
||||
*attrid = i;
|
||||
return tag->attrs[i].value;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct collect_urls_closure {
|
||||
char *text; /* HTML text. */
|
||||
char *base; /* Base URI of the document, possibly
|
||||
changed through <base href=...>. */
|
||||
urlpos *head, *tail; /* List of URLs */
|
||||
const char *parent_base; /* Base of the current document. */
|
||||
const char *document_file; /* File name of this document. */
|
||||
int dash_p_leaf_HTML; /* Whether -p is specified, and this
|
||||
document is the "leaf" node of the
|
||||
HTML tree. */
|
||||
int nofollow; /* whether NOFOLLOW was specified in a
|
||||
<meta name=robots> tag. */
|
||||
};
|
||||
|
||||
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
|
||||
are the necessary context to store the position and size. */
|
||||
|
||||
static void
|
||||
handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
struct taginfo *tag, int attrid)
|
||||
{
|
||||
int no_proto = !has_proto (link_uri);
|
||||
urlpos *newel;
|
||||
|
||||
const char *base = closure->base ? closure->base : closure->parent_base;
|
||||
char *complete_uri;
|
||||
|
||||
char *fragment = strrchr (link_uri, '#');
|
||||
|
||||
if (fragment)
|
||||
{
|
||||
/* Nullify the fragment identifier, i.e. everything after the
|
||||
last occurrence of `#', inclusive. This copying is
|
||||
relatively inefficient, but it doesn't matter because
|
||||
fragment identifiers don't come up all that often. */
|
||||
int hashlen = fragment - link_uri;
|
||||
char *p = alloca (hashlen + 1);
|
||||
memcpy (p, link_uri, hashlen);
|
||||
p[hashlen] = '\0';
|
||||
link_uri = p;
|
||||
}
|
||||
|
||||
if (!base)
|
||||
{
|
||||
if (no_proto)
|
||||
{
|
||||
/* We have no base, and the link does not have a protocol or
|
||||
a host attached to it. Nothing we can do. */
|
||||
/* #### Should we print a warning here? Wget 1.5.x used to. */
|
||||
return;
|
||||
}
|
||||
else
|
||||
complete_uri = xstrdup (link_uri);
|
||||
}
|
||||
else
|
||||
complete_uri = url_concat (base, link_uri);
|
||||
|
||||
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
||||
closure->document_file, base ? base : "(null)",
|
||||
link_uri, complete_uri));
|
||||
|
||||
newel = (urlpos *)xmalloc (sizeof (urlpos));
|
||||
|
||||
memset (newel, 0, sizeof (*newel));
|
||||
newel->next = NULL;
|
||||
newel->url = complete_uri;
|
||||
newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
|
||||
newel->size = tag->attrs[attrid].value_raw_size;
|
||||
|
||||
/* A URL is relative if the host and protocol are not named, and the
|
||||
name does not start with `/'.
|
||||
#### This logic might need some rethinking. */
|
||||
if (no_proto && *link_uri != '/')
|
||||
newel->flags |= (URELATIVE | UNOPROTO);
|
||||
else if (no_proto)
|
||||
newel->flags |= UNOPROTO;
|
||||
|
||||
if (closure->tail)
|
||||
{
|
||||
closure->tail->next = newel;
|
||||
closure->tail = newel;
|
||||
}
|
||||
else
|
||||
closure->tail = closure->head = newel;
|
||||
}
|
||||
|
||||
/* #### Document what this does.
|
||||
#### It would be nice to split this into several functions. */
|
||||
|
||||
static void
|
||||
collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
{
|
||||
struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
|
||||
int tagid = find_tag (tag->name);
|
||||
assert (tagid != -1);
|
||||
|
||||
switch (known_tags[tagid].category)
|
||||
{
|
||||
case TC_LINK:
|
||||
{
|
||||
int i;
|
||||
int size = ARRAY_SIZE (url_tag_attr_map);
|
||||
for (i = 0; i < size; i++)
|
||||
if (url_tag_attr_map[i].tagid == tagid)
|
||||
break;
|
||||
/* We've found the index of url_tag_attr_map where the
|
||||
attributes of our tags begin. Now, look for every one of
|
||||
them, and handle it. */
|
||||
for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
|
||||
{
|
||||
char *attr_value;
|
||||
int id;
|
||||
if (closure->dash_p_leaf_HTML
|
||||
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
|
||||
/* If we're at a -p leaf node, we don't want to retrieve
|
||||
links to references we know are external, such as <a
|
||||
href=...>. */
|
||||
continue;
|
||||
|
||||
/* This find_attr() buried in a loop may seem inefficient
|
||||
(O(n^2)), but it's not, since the number of attributes
|
||||
(n) we loop over is extremely small. In the worst case
|
||||
of IMG with all its possible attributes, n^2 will be
|
||||
only 9. */
|
||||
attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
|
||||
if (attr_value)
|
||||
handle_link (closure, attr_value, tag, id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case TC_SPEC:
|
||||
switch (tagid)
|
||||
{
|
||||
case TAG_BASE:
|
||||
{
|
||||
char *newbase = find_attr (tag, "href", NULL);
|
||||
if (!newbase)
|
||||
break;
|
||||
if (closure->base)
|
||||
free (closure->base);
|
||||
if (closure->parent_base)
|
||||
closure->base = url_concat (closure->parent_base, newbase);
|
||||
else
|
||||
closure->base = xstrdup (newbase);
|
||||
}
|
||||
break;
|
||||
case TAG_LINK:
|
||||
{
|
||||
int id;
|
||||
char *rel = find_attr (tag, "rel", NULL);
|
||||
char *href = find_attr (tag, "href", &id);
|
||||
if (href)
|
||||
{
|
||||
/* In the normal case, all <link href=...> tags are
|
||||
fair game.
|
||||
|
||||
In the special case of when -p is active, however,
|
||||
and we're at a leaf node (relative to the -l
|
||||
max. depth) in the HTML document tree, the only
|
||||
<LINK> tag we'll follow is a <LINK REL=
|
||||
"stylesheet">, as it's necessary for displaying
|
||||
this document properly. We won't follow other
|
||||
<LINK> tags, like <LINK REL="home">, for instance,
|
||||
as they refer to external documents. */
|
||||
if (!closure->dash_p_leaf_HTML
|
||||
|| (rel && !strcasecmp (rel, "stylesheet")))
|
||||
handle_link (closure, href, tag, id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case TAG_META:
|
||||
/* Some pages use a META tag to specify that the page be
|
||||
refreshed by a new page after a given number of seconds.
|
||||
The general format for this is:
|
||||
|
||||
<meta http-equiv=Refresh content="NUMBER; URL=index2.html">
|
||||
|
||||
So we just need to skip past the "NUMBER; URL=" garbage
|
||||
to get to the URL. */
|
||||
{
|
||||
int id;
|
||||
char *name = find_attr (tag, "name", NULL);
|
||||
char *http_equiv = find_attr (tag, "http-equiv", &id);
|
||||
if (http_equiv && !strcasecmp (http_equiv, "refresh"))
|
||||
{
|
||||
char *refresh = find_attr (tag, "content", NULL);
|
||||
char *p = refresh;
|
||||
int offset;
|
||||
while (ISDIGIT (*p))
|
||||
++p;
|
||||
if (*p++ != ';')
|
||||
return;
|
||||
while (ISSPACE (*p))
|
||||
++p;
|
||||
if (!(TOUPPER (*p) == 'U'
|
||||
&& TOUPPER (*(p + 1)) == 'R'
|
||||
&& TOUPPER (*(p + 2)) == 'L'
|
||||
&& *(p + 3) == '='))
|
||||
return;
|
||||
p += 4;
|
||||
while (ISSPACE (*p))
|
||||
++p;
|
||||
offset = p - refresh;
|
||||
tag->attrs[id].value_raw_beginning += offset;
|
||||
tag->attrs[id].value_raw_size -= offset;
|
||||
handle_link (closure, p, tag, id);
|
||||
}
|
||||
else if (name && !strcasecmp (name, "robots"))
|
||||
{
|
||||
/* Handle stuff like:
|
||||
<meta name="robots" content="index,nofollow"> */
|
||||
char *content = find_attr (tag, "content", NULL);
|
||||
if (!content)
|
||||
return;
|
||||
if (!strcasecmp (content, "none"))
|
||||
closure->nofollow = 1;
|
||||
else
|
||||
{
|
||||
while (*content)
|
||||
{
|
||||
/* Find the next occurrence of ',' or the end of
|
||||
the string. */
|
||||
char *end = strchr (content, ',');
|
||||
if (end)
|
||||
++end;
|
||||
else
|
||||
end = content + strlen (content);
|
||||
if (!strncasecmp (content, "nofollow", end - content))
|
||||
closure->nofollow = 1;
|
||||
content = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* Category is TC_SPEC, but tag name is unhandled. This
|
||||
must not be. */
|
||||
abort ();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Scan FILE, retrieving links to HTML documents from it. Each link is
|
||||
|
||||
Similar to get_urls_file, but for HTML files. FILE is scanned as
|
||||
an HTML document. get_urls_html() constructs the URLs from the
|
||||
relative href-s.
|
||||
|
||||
If SILENT is non-zero, do not barf on baseless relative links. */
|
||||
urlpos *
|
||||
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
|
||||
int *meta_disallow_follow)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct collect_urls_closure closure;
|
||||
|
||||
/* Load the file. */
|
||||
fm = read_file (file);
|
||||
if (!fm)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
|
||||
|
||||
closure.text = fm->content;
|
||||
closure.head = closure.tail = NULL;
|
||||
closure.base = NULL;
|
||||
closure.parent_base = this_url ? this_url : opt.base_href;
|
||||
closure.document_file = file;
|
||||
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
|
||||
closure.nofollow = 0;
|
||||
|
||||
if (!interesting_tags)
|
||||
init_interesting ();
|
||||
|
||||
map_html_tags (fm->content, fm->length, interesting_tags,
|
||||
interesting_attributes, collect_tags_mapper, &closure);
|
||||
|
||||
DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
|
||||
if (meta_disallow_follow)
|
||||
*meta_disallow_follow = closure.nofollow;
|
||||
|
||||
FREE_MAYBE (closure.base);
|
||||
read_file_free (fm);
|
||||
return closure.head;
|
||||
}
|
203
src/http.c
203
src/http.c
@ -254,6 +254,85 @@ http_process_type (const char *hdr, void *arg)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Check whether the `Connection' header is set to "keep-alive". */
|
||||
/* Header callback: set the int flag pointed to by ARG when the
   `Connection' header value is "keep-alive" (case-insensitive).
   Always returns 1 so header processing continues.  */
static int
http_process_connection (const char *hdr, void *arg)
{
  int *keep_alive_flag = (int *)arg;
  if (strcasecmp (hdr, "Keep-Alive") == 0)
    *keep_alive_flag = 1;
  return 1;
}
|
||||
|
||||
/* Persistent connections (pc). */
|
||||
|
||||
static unsigned char pc_last_host[4];
|
||||
static unsigned short pc_last_port;
|
||||
static int pc_last_fd;
|
||||
|
||||
static void
|
||||
register_persistent (const char *host, unsigned short port, int fd)
|
||||
{
|
||||
if (!store_hostaddress (pc_last_host, host))
|
||||
return;
|
||||
pc_last_port = port;
|
||||
pc_last_fd = fd;
|
||||
}
|
||||
|
||||
static void
|
||||
invalidate_persistent (void)
|
||||
{
|
||||
pc_last_port = 0;
|
||||
}
|
||||
|
||||
static int
|
||||
persistent_available_p (const char *host, unsigned short port)
|
||||
{
|
||||
unsigned char this_host[4];
|
||||
if (port != pc_last_port)
|
||||
return 0;
|
||||
if (!store_hostaddress (this_host, host))
|
||||
return 0;
|
||||
if (memcmp (pc_last_host, this_host, 4))
|
||||
return 0;
|
||||
if (!test_socket_open (pc_last_fd))
|
||||
{
|
||||
invalidate_persistent ();
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The idea behind these two CLOSE macros is to distinguish between
|
||||
two cases: one when the job we've been doing is finished, and we
|
||||
want to close the connection and leave, and two when something is
|
||||
seriously wrong and we're closing the connection as part of
|
||||
cleanup.
|
||||
|
||||
In case of keep_alive, CLOSE_FINISH should leave the connection
|
||||
open, while CLOSE_INVALIDATE should still close it.
|
||||
|
||||
The semantic difference between the flags `keep_alive' and
|
||||
`reused_connection' is that keep_alive defines the state of HTTP:
|
||||
whether the connection *will* be preservable. reused_connection,
|
||||
on the other hand, reflects the present: whether the *current*
|
||||
connection is the result of preserving. */
|
||||
|
||||
#define CLOSE_FINISH(fd) do { \
|
||||
if (!keep_alive) \
|
||||
{ \
|
||||
CLOSE (fd); \
|
||||
if (reused_connection) \
|
||||
invalidate_persistent (); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CLOSE_INVALIDATE(fd) do { \
|
||||
CLOSE (fd); \
|
||||
if (reused_connection) \
|
||||
invalidate_persistent (); \
|
||||
} while (0)
|
||||
|
||||
|
||||
struct http_stat
|
||||
{
|
||||
@ -317,6 +396,8 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
|
||||
FILE *fp;
|
||||
int auth_tried_already;
|
||||
struct rbuf rbuf;
|
||||
int keep_alive, http_keep_alive_1, http_keep_alive_2;
|
||||
int reused_connection;
|
||||
|
||||
if (!(*dt & HEAD_ONLY))
|
||||
/* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
|
||||
@ -329,6 +410,9 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
|
||||
again:
|
||||
/* We need to come back here when the initial attempt to retrieve
|
||||
without authorization header fails. */
|
||||
keep_alive = 0;
|
||||
http_keep_alive_1 = http_keep_alive_2 = 0;
|
||||
reused_connection = 0;
|
||||
|
||||
/* Initialize certain elements of struct http_stat. */
|
||||
hs->len = 0L;
|
||||
@ -345,40 +429,49 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
|
||||
ou = u;
|
||||
|
||||
/* First: establish the connection. */
|
||||
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
|
||||
err = make_connection (&sock, u->host, u->port);
|
||||
switch (err)
|
||||
if (u->proxy || !persistent_available_p (u->host, u->port))
|
||||
{
|
||||
case HOSTERR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
|
||||
return HOSTERR;
|
||||
break;
|
||||
case CONSOCKERR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
|
||||
return CONSOCKERR;
|
||||
break;
|
||||
case CONREFUSED:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET,
|
||||
_("Connection to %s:%hu refused.\n"), u->host, u->port);
|
||||
CLOSE (sock);
|
||||
return CONREFUSED;
|
||||
case CONERROR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
|
||||
CLOSE (sock);
|
||||
return CONERROR;
|
||||
break;
|
||||
case NOCONERROR:
|
||||
/* Everything is fine! */
|
||||
logputs (LOG_VERBOSE, _("connected!\n"));
|
||||
break;
|
||||
default:
|
||||
abort ();
|
||||
break;
|
||||
} /* switch */
|
||||
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
|
||||
err = make_connection (&sock, u->host, u->port);
|
||||
switch (err)
|
||||
{
|
||||
case HOSTERR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
|
||||
return HOSTERR;
|
||||
break;
|
||||
case CONSOCKERR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
|
||||
return CONSOCKERR;
|
||||
break;
|
||||
case CONREFUSED:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET,
|
||||
_("Connection to %s:%hu refused.\n"), u->host, u->port);
|
||||
CLOSE (sock);
|
||||
return CONREFUSED;
|
||||
case CONERROR:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
|
||||
CLOSE (sock);
|
||||
return CONERROR;
|
||||
break;
|
||||
case NOCONERROR:
|
||||
/* Everything is fine! */
|
||||
logputs (LOG_VERBOSE, _("connected!\n"));
|
||||
break;
|
||||
default:
|
||||
abort ();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
logprintf (LOG_VERBOSE, _("Reusing connection to %s:%hu.\n"), u->host, u->port);
|
||||
sock = pc_last_fd;
|
||||
reused_connection = 1;
|
||||
}
|
||||
|
||||
if (u->proxy)
|
||||
path = u->proxy->url;
|
||||
@ -487,6 +580,7 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
|
||||
User-Agent: %s\r\n\
|
||||
Host: %s%s\r\n\
|
||||
Accept: %s\r\n\
|
||||
Connection: Keep-Alive\r\n\
|
||||
%s%s%s%s%s%s\r\n",
|
||||
command, path, useragent, remhost,
|
||||
host_port ? host_port : "",
|
||||
@ -505,8 +599,9 @@ Accept: %s\r\n\
|
||||
num_written = iwrite (sock, request, strlen (request));
|
||||
if (num_written < 0)
|
||||
{
|
||||
logputs (LOG_VERBOSE, _("Failed writing HTTP request.\n"));
|
||||
CLOSE (sock);
|
||||
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
|
||||
strerror (errno));
|
||||
CLOSE_INVALIDATE (sock);
|
||||
return WRITEFAILED;
|
||||
}
|
||||
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
|
||||
@ -553,7 +648,7 @@ Accept: %s\r\n\
|
||||
FREE_MAYBE (type);
|
||||
FREE_MAYBE (hs->newloc);
|
||||
FREE_MAYBE (all_headers);
|
||||
CLOSE (sock);
|
||||
CLOSE_INVALIDATE (sock);
|
||||
return HEOF;
|
||||
}
|
||||
else if (status == HG_ERROR)
|
||||
@ -565,7 +660,7 @@ Accept: %s\r\n\
|
||||
FREE_MAYBE (type);
|
||||
FREE_MAYBE (hs->newloc);
|
||||
FREE_MAYBE (all_headers);
|
||||
CLOSE (sock);
|
||||
CLOSE_INVALIDATE (sock);
|
||||
return HERR;
|
||||
}
|
||||
|
||||
@ -672,12 +767,32 @@ Accept: %s\r\n\
|
||||
goto done_header;
|
||||
}
|
||||
}
|
||||
/* Check for the `Keep-Alive' header. */
|
||||
if (!http_keep_alive_1)
|
||||
{
|
||||
if (header_process (hdr, "Keep-Alive", header_exists,
|
||||
&http_keep_alive_1))
|
||||
goto done_header;
|
||||
}
|
||||
/* Check for `Connection: Keep-Alive'. */
|
||||
if (!http_keep_alive_2)
|
||||
{
|
||||
if (header_process (hdr, "Connection", http_process_connection,
|
||||
&http_keep_alive_2))
|
||||
goto done_header;
|
||||
}
|
||||
done_header:
|
||||
free (hdr);
|
||||
}
|
||||
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
|
||||
if (contlen != -1
|
||||
&& (http_keep_alive_1 || http_keep_alive_2))
|
||||
keep_alive = 1;
|
||||
if (keep_alive && !reused_connection)
|
||||
register_persistent (u->host, u->port, sock);
|
||||
|
||||
if ((statcode == HTTP_STATUS_UNAUTHORIZED)
|
||||
&& authenticate_h)
|
||||
{
|
||||
@ -685,7 +800,7 @@ Accept: %s\r\n\
|
||||
FREE_MAYBE (type);
|
||||
type = NULL;
|
||||
FREEHSTAT (*hs);
|
||||
CLOSE (sock);
|
||||
CLOSE_FINISH (sock);
|
||||
if (auth_tried_already)
|
||||
{
|
||||
/* If we have tried it already, then there is not point
|
||||
@ -753,7 +868,7 @@ Accept: %s\r\n\
|
||||
FREE_MAYBE (type);
|
||||
FREE_MAYBE (hs->newloc);
|
||||
FREE_MAYBE (all_headers);
|
||||
CLOSE (sock);
|
||||
CLOSE_INVALIDATE (sock);
|
||||
return RANGEERR;
|
||||
}
|
||||
|
||||
@ -783,7 +898,7 @@ Accept: %s\r\n\
|
||||
_("Location: %s%s\n"),
|
||||
hs->newloc ? hs->newloc : _("unspecified"),
|
||||
hs->newloc ? _(" [following]") : "");
|
||||
CLOSE (sock);
|
||||
CLOSE_FINISH (sock);
|
||||
FREE_MAYBE (type);
|
||||
FREE_MAYBE (all_headers);
|
||||
return NEWLOCATION;
|
||||
@ -824,7 +939,7 @@ Accept: %s\r\n\
|
||||
hs->res = 0;
|
||||
FREE_MAYBE (type);
|
||||
FREE_MAYBE (all_headers);
|
||||
CLOSE (sock);
|
||||
CLOSE_FINISH (sock);
|
||||
return RETRFINISHED;
|
||||
}
|
||||
|
||||
@ -838,7 +953,7 @@ Accept: %s\r\n\
|
||||
if (!fp)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", u->local, strerror (errno));
|
||||
CLOSE (sock);
|
||||
CLOSE_FINISH (sock);
|
||||
FREE_MAYBE (all_headers);
|
||||
return FOPENERR;
|
||||
}
|
||||
@ -863,7 +978,7 @@ Accept: %s\r\n\
|
||||
/* Get the contents of the document. */
|
||||
hs->res = get_contents (sock, fp, &hs->len, hs->restval,
|
||||
(contlen != -1 ? contlen : 0),
|
||||
&rbuf);
|
||||
&rbuf, keep_alive);
|
||||
hs->dltime = elapsed_time ();
|
||||
{
|
||||
/* Close or flush the file. We have to be careful to check for
|
||||
@ -878,7 +993,7 @@ Accept: %s\r\n\
|
||||
hs->res = -2;
|
||||
}
|
||||
FREE_MAYBE (all_headers);
|
||||
CLOSE (sock);
|
||||
CLOSE_FINISH (sock);
|
||||
if (hs->res == -2)
|
||||
return FWRITEERR;
|
||||
return RETRFINISHED;
|
||||
|
15
src/main.c
15
src/main.c
@ -97,6 +97,20 @@ i18n_initialize (void)
|
||||
textdomain ("wget");
|
||||
#endif /* HAVE_NLS */
|
||||
}
|
||||
|
||||
/* It's kosher to declare these here because their interface _has_ to
|
||||
be void foo(void). */
|
||||
void url_init PARAMS ((void));
|
||||
void host_init PARAMS ((void));
|
||||
|
||||
/* This just calls the various initialization functions from the
|
||||
modules that need one-time initialization. */
|
||||
static void
|
||||
private_initialize (void)
|
||||
{
|
||||
url_init ();
|
||||
host_init ();
|
||||
}
|
||||
|
||||
/* Print the usage message. */
|
||||
static void
|
||||
@ -293,6 +307,7 @@ main (int argc, char *const *argv)
|
||||
};
|
||||
|
||||
i18n_initialize ();
|
||||
private_initialize ();
|
||||
|
||||
append_to_log = 0;
|
||||
|
||||
|
148
src/recur.c
148
src/recur.c
@ -42,21 +42,20 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "ftp.h"
|
||||
#include "fnmatch.h"
|
||||
#include "host.h"
|
||||
#include "hash.h"
|
||||
|
||||
extern char *version_string;
|
||||
|
||||
#define ROBOTS_FILENAME "robots.txt"
|
||||
|
||||
/* #### Many of these lists should really be hashtables! */
|
||||
|
||||
/* List of downloaded URLs. */
|
||||
static urlpos *urls_downloaded;
|
||||
static struct hash_table *dl_file_url_map;
|
||||
static struct hash_table *dl_url_file_map;
|
||||
|
||||
/* List of HTML URLs. */
|
||||
static slist *urls_html;
|
||||
|
||||
/* List of undesirable-to-load URLs. */
|
||||
static slist *ulist;
|
||||
static struct hash_table *undesirable_urls;
|
||||
|
||||
/* List of forbidden locations. */
|
||||
static char **forbidden = NULL;
|
||||
@ -84,14 +83,28 @@ static int robots_match PARAMS ((struct urlinfo *, char **));
|
||||
void
|
||||
recursive_cleanup (void)
|
||||
{
|
||||
free_slist (ulist);
|
||||
ulist = NULL;
|
||||
if (undesirable_urls)
|
||||
{
|
||||
string_set_free (undesirable_urls);
|
||||
undesirable_urls = NULL;
|
||||
}
|
||||
if (dl_file_url_map)
|
||||
{
|
||||
free_keys_and_values (dl_file_url_map);
|
||||
hash_table_destroy (dl_file_url_map);
|
||||
dl_file_url_map = NULL;
|
||||
}
|
||||
if (dl_url_file_map)
|
||||
{
|
||||
free_keys_and_values (dl_url_file_map);
|
||||
hash_table_destroy (dl_url_file_map);
|
||||
dl_url_file_map = NULL;
|
||||
}
|
||||
undesirable_urls = NULL;
|
||||
free_vec (forbidden);
|
||||
forbidden = NULL;
|
||||
free_slist (urls_html);
|
||||
slist_free (urls_html);
|
||||
urls_html = NULL;
|
||||
free_urlpos (urls_downloaded);
|
||||
urls_downloaded = NULL;
|
||||
FREE_MAYBE (base_dir);
|
||||
FREE_MAYBE (robots_host);
|
||||
first_time = 1;
|
||||
@ -117,6 +130,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
char *constr, *filename, *newloc;
|
||||
char *canon_this_url = NULL;
|
||||
int dt, inl, dash_p_leaf_HTML = FALSE;
|
||||
int meta_disallow_follow;
|
||||
int this_url_ftp; /* See below the explanation */
|
||||
uerr_t err;
|
||||
struct urlinfo *rurl;
|
||||
@ -132,17 +146,29 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
/* Cache the current URL in the list. */
|
||||
if (first_time)
|
||||
{
|
||||
ulist = add_slist (ulist, this_url, 0);
|
||||
urls_downloaded = NULL;
|
||||
/* These three operations need to be done only once per Wget
|
||||
run. They should probably be at a different location. */
|
||||
if (!undesirable_urls)
|
||||
undesirable_urls = make_string_hash_table (0);
|
||||
if (!dl_file_url_map)
|
||||
dl_file_url_map = make_string_hash_table (0);
|
||||
if (!dl_url_file_map)
|
||||
dl_url_file_map = make_string_hash_table (0);
|
||||
|
||||
hash_table_clear (undesirable_urls);
|
||||
string_set_add (undesirable_urls, this_url);
|
||||
hash_table_clear (dl_file_url_map);
|
||||
hash_table_clear (dl_url_file_map);
|
||||
urls_html = NULL;
|
||||
/* Enter this_url to the slist, in original and "enhanced" form. */
|
||||
/* Enter this_url to the hash table, in original and "enhanced" form. */
|
||||
u = newurl ();
|
||||
err = parseurl (this_url, u, 0);
|
||||
if (err == URLOK)
|
||||
{
|
||||
ulist = add_slist (ulist, u->url, 0);
|
||||
urls_downloaded = add_url (urls_downloaded, u->url, file);
|
||||
urls_html = add_slist (urls_html, file, NOSORT);
|
||||
string_set_add (undesirable_urls, u->url);
|
||||
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
|
||||
hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
|
||||
urls_html = slist_append (urls_html, file);
|
||||
if (opt.no_parent)
|
||||
base_dir = xstrdup (u->dir); /* Set the base dir. */
|
||||
/* Set the canonical this_url to be sent as referer. This
|
||||
@ -191,7 +217,15 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
|
||||
/* Get the URL-s from an HTML file: */
|
||||
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
|
||||
0, dash_p_leaf_HTML);
|
||||
dash_p_leaf_HTML, &meta_disallow_follow);
|
||||
|
||||
if (opt.use_robots && meta_disallow_follow)
|
||||
{
|
||||
/* The META tag says we are not to follow this file. Respect
|
||||
that. */
|
||||
free_urlpos (url_list);
|
||||
url_list = NULL;
|
||||
}
|
||||
|
||||
/* Decide what to do with each of the URLs. A URL will be loaded if
|
||||
it meets several requirements, discussed later. */
|
||||
@ -240,16 +274,16 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
the list. */
|
||||
|
||||
/* inl is set if the URL we are working on (constr) is stored in
|
||||
ulist. Using it is crucial to avoid the incessant calls to
|
||||
in_slist, which is quite slow. */
|
||||
inl = in_slist (ulist, constr);
|
||||
undesirable_urls. Using it is crucial to avoid unnecessary
|
||||
repeated continuous hits to the hash table. */
|
||||
inl = string_set_exists (undesirable_urls, constr);
|
||||
|
||||
/* If it is FTP, and FTP is not followed, chuck it out. */
|
||||
if (!inl)
|
||||
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
|
||||
{
|
||||
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
/* If it is absolute link and they are not followed, chuck it
|
||||
@ -258,7 +292,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
if (opt.relative_only && !(cur_url->flags & URELATIVE))
|
||||
{
|
||||
DEBUGP (("It doesn't really look like a relative link.\n"));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
/* If its domain is not to be accepted/looked-up, chuck it out. */
|
||||
@ -266,7 +300,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
if (!accept_domain (u))
|
||||
{
|
||||
DEBUGP (("I don't like the smell of that domain.\n"));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
/* Check for parent directory. */
|
||||
@ -286,7 +320,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
{
|
||||
/* Failing that too, kill the URL. */
|
||||
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
freeurl (ut, 1);
|
||||
@ -300,7 +334,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
if (!accdir (u->dir, ALLABS))
|
||||
{
|
||||
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
}
|
||||
@ -330,7 +364,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
{
|
||||
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
|
||||
constr, u->file));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
}
|
||||
@ -353,12 +387,12 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
}
|
||||
free (constr);
|
||||
constr = xstrdup (u->url);
|
||||
inl = in_slist (ulist, constr);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
|
||||
if (!opt.spanhost && this_url && !same_host (this_url, constr))
|
||||
{
|
||||
DEBUGP (("This is not the same hostname as the parent's.\n"));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
}
|
||||
@ -398,7 +432,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
{
|
||||
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
|
||||
ROBOTS_FILENAME));
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
inl = 1;
|
||||
}
|
||||
}
|
||||
@ -409,7 +443,7 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
{
|
||||
DEBUGP (("I've decided to load it -> "));
|
||||
/* Add it to the list of already-loaded URL-s. */
|
||||
ulist = add_slist (ulist, constr, 0);
|
||||
string_set_add (undesirable_urls, constr);
|
||||
/* Automatically followed FTPs will *not* be downloaded
|
||||
recursively. */
|
||||
if (u->proto == URLFTP)
|
||||
@ -439,10 +473,13 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
{
|
||||
if (dt & RETROKF)
|
||||
{
|
||||
urls_downloaded = add_url (urls_downloaded, constr, filename);
|
||||
hash_table_put (dl_file_url_map,
|
||||
xstrdup (filename), xstrdup (constr));
|
||||
hash_table_put (dl_url_file_map,
|
||||
xstrdup (constr), xstrdup (filename));
|
||||
/* If the URL is HTML, note it. */
|
||||
if (dt & TEXTHTML)
|
||||
urls_html = add_slist (urls_html, filename, NOSORT);
|
||||
urls_html = slist_append (urls_html, filename);
|
||||
}
|
||||
}
|
||||
/* If there was no error, and the type is text/html, parse
|
||||
@ -489,6 +526,10 @@ recursive_retrieve (const char *file, const char *this_url)
|
||||
/* Increment the pbuf for the appropriate size. */
|
||||
}
|
||||
if (opt.convert_links && !opt.delete_after)
|
||||
/* This is merely the first pass: the links that have been
|
||||
successfully downloaded are converted. In the second pass,
|
||||
convert_all_links() will also convert those links that have NOT
|
||||
been downloaded to their canonical form. */
|
||||
convert_links (file, url_list);
|
||||
/* Free the linked list of URL-s. */
|
||||
free_urlpos (url_list);
|
||||
@ -531,30 +572,37 @@ void
|
||||
convert_all_links (void)
|
||||
{
|
||||
uerr_t res;
|
||||
urlpos *l1, *l2, *urls;
|
||||
urlpos *l1, *urls;
|
||||
struct urlinfo *u;
|
||||
slist *html;
|
||||
urlpos *urlhtml;
|
||||
|
||||
for (html = urls_html; html; html = html->next)
|
||||
{
|
||||
int meta_disallow_follow;
|
||||
char *url;
|
||||
|
||||
DEBUGP (("Rescanning %s\n", html->string));
|
||||
/* Determine the URL of the HTML file. get_urls_html will need
|
||||
it. */
|
||||
for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
|
||||
if (!strcmp (urlhtml->local_name, html->string))
|
||||
break;
|
||||
if (urlhtml)
|
||||
DEBUGP (("It should correspond to %s.\n", urlhtml->url));
|
||||
url = hash_table_get (dl_file_url_map, html->string);
|
||||
if (url)
|
||||
DEBUGP (("It should correspond to %s.\n", url));
|
||||
else
|
||||
DEBUGP (("I cannot find the corresponding URL.\n"));
|
||||
/* Parse the HTML file... */
|
||||
urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
|
||||
FALSE);
|
||||
urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow);
|
||||
if (opt.use_robots && meta_disallow_follow)
|
||||
{
|
||||
/* The META tag says we are not to follow this file.
|
||||
Respect that. */
|
||||
free_urlpos (urls);
|
||||
urls = NULL;
|
||||
}
|
||||
if (!urls)
|
||||
continue;
|
||||
for (l1 = urls; l1; l1 = l1->next)
|
||||
{
|
||||
char *local_name;
|
||||
/* The URL must be in canonical form to be compared. */
|
||||
u = newurl ();
|
||||
res = parseurl (l1->url, u, 0);
|
||||
@ -565,22 +613,18 @@ convert_all_links (void)
|
||||
}
|
||||
/* We decide the direction of conversion according to whether
|
||||
a URL was downloaded. Downloaded URLs will be converted
|
||||
ABS2REL, whereas non-downloaded will be converted REL2ABS.
|
||||
Note: not yet implemented; only ABS2REL works. */
|
||||
for (l2 = urls_downloaded; l2; l2 = l2->next)
|
||||
if (!strcmp (l2->url, u->url))
|
||||
{
|
||||
DEBUGP (("%s flagged for conversion, local %s\n",
|
||||
l2->url, l2->local_name));
|
||||
break;
|
||||
}
|
||||
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
||||
local_name = hash_table_get (dl_url_file_map, u->url);
|
||||
if (local_name)
|
||||
DEBUGP (("%s flagged for conversion, local %s\n",
|
||||
u->url, local_name));
|
||||
/* Clear the flags. */
|
||||
l1->flags &= ~ (UABS2REL | UREL2ABS);
|
||||
/* Decide on the conversion direction. */
|
||||
if (l2)
|
||||
if (local_name)
|
||||
{
|
||||
l1->flags |= UABS2REL;
|
||||
l1->local_name = xstrdup (l2->local_name);
|
||||
l1->local_name = xstrdup (local_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
70
src/retr.c
70
src/retr.c
@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "ftp.h"
|
||||
#include "host.h"
|
||||
#include "connect.h"
|
||||
#include "hash.h"
|
||||
|
||||
#ifdef WINDOWS
|
||||
LARGE_INTEGER internal_time;
|
||||
@ -60,6 +61,8 @@ enum spflags { SP_NONE, SP_INIT, SP_FINISH };
|
||||
|
||||
static int show_progress PARAMS ((long, long, enum spflags));
|
||||
|
||||
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
|
||||
|
||||
/* Reads the contents of file descriptor FD, until it is closed, or a
|
||||
read error occurs. The data is read in 8K chunks, and stored to
|
||||
stream fp, which should have been open for writing. If BUF is
|
||||
@ -83,9 +86,9 @@ static int show_progress PARAMS ((long, long, enum spflags));
|
||||
from fd immediately, flush or discard the buffer. */
|
||||
int
|
||||
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
|
||||
struct rbuf *rbuf)
|
||||
struct rbuf *rbuf, int use_expected)
|
||||
{
|
||||
int res;
|
||||
int res = 0;
|
||||
static char c[8192];
|
||||
|
||||
*len = restval;
|
||||
@ -105,10 +108,17 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
|
||||
*len += res;
|
||||
}
|
||||
}
|
||||
/* Read from fd while there is available data. */
|
||||
do
|
||||
/* Read from fd while there is available data.
|
||||
|
||||
Normally, if expected is 0, it means that it is not known how
|
||||
much data is expected. However, if use_expected is specified,
|
||||
then expected being zero means exactly that. */
|
||||
while (!use_expected || (*len < expected))
|
||||
{
|
||||
res = iread (fd, c, sizeof (c));
|
||||
int amount_to_read = (use_expected
|
||||
? MIN (expected - *len, sizeof (c))
|
||||
: sizeof (c));
|
||||
res = iread (fd, c, amount_to_read);
|
||||
if (res > 0)
|
||||
{
|
||||
if (fwrite (c, sizeof (char), res, fp) < res)
|
||||
@ -120,7 +130,9 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
|
||||
}
|
||||
*len += res;
|
||||
}
|
||||
} while (res > 0);
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (res < -1)
|
||||
res = -1;
|
||||
if (opt.verbose)
|
||||
@ -323,7 +335,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
int local_use_proxy;
|
||||
char *mynewloc, *proxy;
|
||||
struct urlinfo *u;
|
||||
slist *redirections;
|
||||
struct hash_table *redirections = NULL;
|
||||
|
||||
/* If dt is NULL, just ignore it. */
|
||||
if (!dt)
|
||||
@ -334,8 +346,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
if (file)
|
||||
*file = NULL;
|
||||
|
||||
redirections = NULL;
|
||||
|
||||
u = newurl ();
|
||||
/* Parse the URL. */
|
||||
result = parseurl (url, u, 0);
|
||||
@ -343,7 +353,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
free (url);
|
||||
return result;
|
||||
}
|
||||
@ -379,7 +390,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
{
|
||||
logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
free (url);
|
||||
return PROXERR;
|
||||
}
|
||||
@ -392,7 +404,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
else
|
||||
logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
free (url);
|
||||
return PROXERR;
|
||||
}
|
||||
@ -454,7 +467,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result));
|
||||
freeurl (newloc_struct, 1);
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
free (url);
|
||||
free (mynewloc);
|
||||
return result;
|
||||
@ -466,34 +480,29 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
free (mynewloc);
|
||||
mynewloc = xstrdup (newloc_struct->url);
|
||||
|
||||
/* Check for redirection to back to itself. */
|
||||
if (!strcmp (u->url, newloc_struct->url))
|
||||
if (!redirections)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
|
||||
mynewloc);
|
||||
freeurl (newloc_struct, 1);
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
free (url);
|
||||
free (mynewloc);
|
||||
return WRONGCODE;
|
||||
redirections = make_string_hash_table (0);
|
||||
/* Add current URL immediately so we can detect it as soon
|
||||
as possible in case of a cycle. */
|
||||
string_set_add (redirections, u->url);
|
||||
}
|
||||
|
||||
/* The new location is OK. Let's check for redirection cycle by
|
||||
peeking through the history of redirections. */
|
||||
if (in_slist (redirections, newloc_struct->url))
|
||||
if (string_set_exists (redirections, newloc_struct->url))
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
|
||||
mynewloc);
|
||||
freeurl (newloc_struct, 1);
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
free (url);
|
||||
free (mynewloc);
|
||||
return WRONGCODE;
|
||||
}
|
||||
|
||||
redirections = add_slist (redirections, newloc_struct->url, NOSORT);
|
||||
string_set_add (redirections, newloc_struct->url);
|
||||
|
||||
free (url);
|
||||
url = mynewloc;
|
||||
@ -510,7 +519,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
*file = NULL;
|
||||
}
|
||||
freeurl (u, 1);
|
||||
free_slist (redirections);
|
||||
if (redirections)
|
||||
string_set_free (redirections);
|
||||
|
||||
if (newloc)
|
||||
*newloc = url;
|
||||
@ -531,9 +541,7 @@ retrieve_from_file (const char *file, int html, int *count)
|
||||
uerr_t status;
|
||||
urlpos *url_list, *cur_url;
|
||||
|
||||
/* If spider-mode is on, we do not want get_urls_html barfing
|
||||
errors on baseless links. */
|
||||
url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE)
|
||||
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
|
||||
: get_urls_file (file));
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
@ -22,7 +22,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
#include "rbuf.h"
|
||||
|
||||
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *));
|
||||
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *, int));
|
||||
|
||||
uerr_t retrieve_url PARAMS ((const char *, char **, char **,
|
||||
const char *, int *));
|
||||
|
615
src/url.c
615
src/url.c
@ -38,7 +38,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#include "utils.h"
|
||||
#include "url.h"
|
||||
#include "host.h"
|
||||
#include "html.h"
|
||||
|
||||
#ifndef errno
|
||||
extern int errno;
|
||||
@ -48,22 +47,12 @@ extern int errno;
|
||||
#define DEFAULT_HTTP_PORT 80
|
||||
#define DEFAULT_FTP_PORT 21
|
||||
|
||||
/* URL separator (for findurl) */
|
||||
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
|
||||
/* Table of Unsafe chars. This is intialized in
|
||||
init_unsafe_char_table. */
|
||||
|
||||
/* A list of unsafe characters for encoding, as per RFC1738. '@' and
|
||||
':' (not listed in RFC) were added because of user/password
|
||||
encoding. */
|
||||
static char unsafe_char_table[256];
|
||||
|
||||
#ifndef WINDOWS
|
||||
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
|
||||
#else /* WINDOWS */
|
||||
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
|
||||
#endif /* WINDOWS */
|
||||
|
||||
#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
|
||||
|| ((unsigned char)(c) > '~') /* ASCII 127 */ \
|
||||
|| strchr (URL_UNSAFE_CHARS, c))
|
||||
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
|
||||
|
||||
/* If S contains unsafe characters, free it and replace it with a
|
||||
version that doesn't. */
|
||||
@ -176,6 +165,34 @@ skip_url (const char *url)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Unsafe chars:
|
||||
- anything <= 32;
|
||||
- stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
|
||||
- @ and :, for user/password encoding.
|
||||
- everything over 127 (but we don't bother with recording those. */
|
||||
void
|
||||
init_unsafe_char_table (void)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < 256; i++)
|
||||
if (i < 32 || i >= 127
|
||||
|| i == '<'
|
||||
|| i == '>'
|
||||
|| i == '\"'
|
||||
|| i == '#'
|
||||
|| i == '%'
|
||||
|| i == '{'
|
||||
|| i == '}'
|
||||
|| i == '|'
|
||||
|| i == '\\'
|
||||
|| i == '^'
|
||||
|| i == '~'
|
||||
|| i == '['
|
||||
|| i == ']'
|
||||
|| i == '`')
|
||||
unsafe_char_table[i] = 1;
|
||||
}
|
||||
|
||||
/* Returns 1 if the string contains unsafe characters, 0 otherwise. */
|
||||
int
|
||||
contains_unsafe (const char *s)
|
||||
@ -296,7 +313,7 @@ skip_proto (const char *url)
|
||||
|
||||
/* Returns 1 if the URL begins with a protocol (supported or
|
||||
unsupported), 0 otherwise. */
|
||||
static int
|
||||
int
|
||||
has_proto (const char *url)
|
||||
{
|
||||
char **s;
|
||||
@ -765,297 +782,54 @@ url_equal (const char *url1, const char *url2)
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Find URL of format scheme:hostname[:port]/dir in a buffer. The
|
||||
buffer may contain pretty much anything; no errors are signaled. */
|
||||
static const char *
|
||||
findurl (const char *buf, int howmuch, int *count)
|
||||
{
|
||||
char **prot;
|
||||
const char *s1, *s2;
|
||||
|
||||
for (s1 = buf; howmuch; s1++, howmuch--)
|
||||
for (prot = protostrings; *prot; prot++)
|
||||
if (howmuch <= strlen (*prot))
|
||||
continue;
|
||||
else if (!strncasecmp (*prot, s1, strlen (*prot)))
|
||||
{
|
||||
for (s2 = s1, *count = 0;
|
||||
howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
|
||||
!strchr (URL_SEPARATOR, *s2);
|
||||
s2++, (*count)++, howmuch--);
|
||||
return s1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Scans the file for signs of URL-s. Returns a vector of pointers,
|
||||
each pointer representing a URL string. The file is *not* assumed
|
||||
to be HTML. */
|
||||
urlpos *
|
||||
get_urls_file (const char *file)
|
||||
{
|
||||
long nread;
|
||||
FILE *fp;
|
||||
char *buf;
|
||||
const char *pbuf;
|
||||
int size;
|
||||
urlpos *first, *current, *old;
|
||||
struct file_memory *fm;
|
||||
urlpos *head, *tail;
|
||||
const char *text, *text_end;
|
||||
|
||||
if (file && !HYPHENP (file))
|
||||
{
|
||||
fp = fopen (file, "rb");
|
||||
if (!fp)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
fp = stdin;
|
||||
/* Load the file. */
|
||||
load_file (fp, &buf, &nread);
|
||||
if (file && !HYPHENP (file))
|
||||
fclose (fp);
|
||||
DEBUGP (("Loaded %s (size %ld).\n", file, nread));
|
||||
first = current = NULL;
|
||||
/* Fill the linked list with URLs. */
|
||||
for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
|
||||
pbuf += size)
|
||||
fm = read_file (file);
|
||||
if (!fm)
|
||||
{
|
||||
/* Allocate the space. */
|
||||
old = current;
|
||||
current = (urlpos *)xmalloc (sizeof (urlpos));
|
||||
if (old)
|
||||
old->next = current;
|
||||
memset (current, 0, sizeof (*current));
|
||||
current->next = NULL;
|
||||
current->url = (char *)xmalloc (size + 1);
|
||||
memcpy (current->url, pbuf, size);
|
||||
current->url[size] = '\0';
|
||||
if (!first)
|
||||
first = current;
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
/* Free the buffer. */
|
||||
free (buf);
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
/* Similar to get_urls_file, but for HTML files. FILE is scanned as
|
||||
an HTML document using htmlfindurl(), which see. get_urls_html()
|
||||
constructs the HTML-s from the relative href-s.
|
||||
|
||||
If SILENT is non-zero, do not barf on baseless relative links. */
|
||||
urlpos *
|
||||
get_urls_html (const char *file, const char *this_url, int silent,
|
||||
int dash_p_leaf_HTML)
|
||||
{
|
||||
long nread;
|
||||
FILE *fp;
|
||||
char *orig_buf;
|
||||
const char *buf;
|
||||
int step, first_time;
|
||||
urlpos *first, *current, *old;
|
||||
|
||||
if (file && !HYPHENP (file))
|
||||
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
|
||||
head = tail = NULL;
|
||||
text = fm->content;
|
||||
text_end = fm->content + fm->length;
|
||||
while (text < text_end)
|
||||
{
|
||||
fp = fopen (file, "rb");
|
||||
if (!fp)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
fp = stdin;
|
||||
/* Load the file. */
|
||||
load_file (fp, &orig_buf, &nread);
|
||||
if (file && !HYPHENP (file))
|
||||
fclose (fp);
|
||||
DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
|
||||
first = current = NULL;
|
||||
first_time = 1;
|
||||
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
|
||||
for (buf = orig_buf;
|
||||
(buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
|
||||
dash_p_leaf_HTML));
|
||||
buf += step)
|
||||
{
|
||||
int i, no_proto;
|
||||
int size = step;
|
||||
const char *pbuf = buf;
|
||||
char *constr, *base;
|
||||
const char *cbase;
|
||||
char *needs_freeing, *url_data;
|
||||
|
||||
first_time = 0;
|
||||
|
||||
/* A frequent phenomenon that needs to be handled are pages
|
||||
generated by brain-damaged HTML generators, which refer to to
|
||||
URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
|
||||
any spaces at the beginning or at the end of the string.
|
||||
This is probably not strictly correct, but that's what the
|
||||
browsers do, so we may follow. May the authors of "WYSIWYG"
|
||||
HTML tools burn in hell for the damage they've inflicted! */
|
||||
while ((pbuf < buf + step) && ISSPACE (*pbuf))
|
||||
{
|
||||
++pbuf;
|
||||
--size;
|
||||
}
|
||||
while (size && ISSPACE (pbuf[size - 1]))
|
||||
--size;
|
||||
if (!size)
|
||||
break;
|
||||
|
||||
/* It would be nice if we could avoid allocating memory in this
|
||||
loop, but I don't see an easy way. To process the entities,
|
||||
we need to either copy the data, or change it destructively.
|
||||
I choose the former.
|
||||
|
||||
We have two pointers: needs_freeing and url_data, because the
|
||||
code below does thing like url_data += <something>, and we
|
||||
want to pass the original string to free(). */
|
||||
needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
|
||||
size = strlen (url_data);
|
||||
|
||||
for (i = 0; protostrings[i]; i++)
|
||||
{
|
||||
if (!strncasecmp (protostrings[i], url_data,
|
||||
MINVAL (strlen (protostrings[i]), size)))
|
||||
break;
|
||||
}
|
||||
/* Check for http:RELATIVE_URI. See below for details. */
|
||||
if (protostrings[i]
|
||||
&& !(strncasecmp (url_data, "http:", 5) == 0
|
||||
&& strncasecmp (url_data, "http://", 7) != 0))
|
||||
{
|
||||
no_proto = 0;
|
||||
}
|
||||
const char *line_beg = text;
|
||||
const char *line_end = memchr (text, '\n', text_end - text);
|
||||
if (!line_end)
|
||||
line_end = text_end;
|
||||
else
|
||||
++line_end;
|
||||
text = line_end;
|
||||
while (line_beg < line_end
|
||||
&& ISSPACE (*line_beg))
|
||||
++line_beg;
|
||||
while (line_end > line_beg + 1
|
||||
&& ISSPACE (*(line_end - 1)))
|
||||
--line_end;
|
||||
if (line_end > line_beg)
|
||||
{
|
||||
no_proto = 1;
|
||||
/* This is for extremely brain-damaged pages that refer to
|
||||
relative URI-s as <a href="http:URL">. Just strip off the
|
||||
silly leading "http:" (as well as any leading blanks
|
||||
before it). */
|
||||
if ((size > 5) && !strncasecmp ("http:", url_data, 5))
|
||||
url_data += 5, size -= 5;
|
||||
}
|
||||
if (!no_proto)
|
||||
{
|
||||
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
|
||||
{
|
||||
if (!strncasecmp (sup_protos[i].name, url_data,
|
||||
MINVAL (strlen (sup_protos[i].name), size)))
|
||||
break;
|
||||
}
|
||||
/* Do *not* accept a non-supported protocol. */
|
||||
if (i == ARRAY_SIZE (sup_protos))
|
||||
{
|
||||
free (needs_freeing);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (no_proto)
|
||||
{
|
||||
/* First, construct the base, which can be relative itself.
|
||||
|
||||
Criteria for creating the base are:
|
||||
1) html_base created by <base href="...">
|
||||
2) current URL
|
||||
3) base provided from the command line */
|
||||
cbase = html_base ();
|
||||
if (!cbase)
|
||||
cbase = this_url;
|
||||
if (!cbase)
|
||||
cbase = opt.base_href;
|
||||
if (!cbase) /* Error condition -- a baseless
|
||||
relative link. */
|
||||
{
|
||||
if (!opt.quiet && !silent)
|
||||
{
|
||||
/* Use malloc, not alloca because this is called in
|
||||
a loop. */
|
||||
char *temp = (char *)malloc (size + 1);
|
||||
strncpy (temp, url_data, size);
|
||||
temp[size] = '\0';
|
||||
logprintf (LOG_NOTQUIET,
|
||||
_("Error (%s): Link %s without a base provided.\n"),
|
||||
file, temp);
|
||||
free (temp);
|
||||
}
|
||||
free (needs_freeing);
|
||||
continue;
|
||||
}
|
||||
if (this_url)
|
||||
base = construct (this_url, cbase, strlen (cbase),
|
||||
!has_proto (cbase));
|
||||
urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
|
||||
memset (entry, 0, sizeof (*entry));
|
||||
entry->next = NULL;
|
||||
entry->url = strdupdelim (line_beg, line_end);
|
||||
if (!head)
|
||||
head = entry;
|
||||
else
|
||||
{
|
||||
/* Base must now be absolute, with host name and
|
||||
protocol. */
|
||||
if (!has_proto (cbase))
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("\
|
||||
Error (%s): Base %s relative, without referer URL.\n"),
|
||||
file, cbase);
|
||||
free (needs_freeing);
|
||||
continue;
|
||||
}
|
||||
base = xstrdup (cbase);
|
||||
}
|
||||
constr = construct (base, url_data, size, no_proto);
|
||||
free (base);
|
||||
tail->next = entry;
|
||||
tail = entry;
|
||||
}
|
||||
else /* has proto */
|
||||
{
|
||||
constr = (char *)xmalloc (size + 1);
|
||||
strncpy (constr, url_data, size);
|
||||
constr[size] = '\0';
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if (opt.debug)
|
||||
{
|
||||
char *tmp;
|
||||
const char *tmp2;
|
||||
|
||||
tmp2 = html_base ();
|
||||
/* Use malloc, not alloca because this is called in a loop. */
|
||||
tmp = (char *)xmalloc (size + 1);
|
||||
strncpy (tmp, url_data, size);
|
||||
tmp[size] = '\0';
|
||||
logprintf (LOG_ALWAYS,
|
||||
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
|
||||
file, this_url ? this_url : "(null)",
|
||||
tmp2 ? tmp2 : "(null)", tmp, constr);
|
||||
free (tmp);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Allocate the space. */
|
||||
old = current;
|
||||
current = (urlpos *)xmalloc (sizeof (urlpos));
|
||||
if (old)
|
||||
old->next = current;
|
||||
if (!first)
|
||||
first = current;
|
||||
/* Fill the values. */
|
||||
memset (current, 0, sizeof (*current));
|
||||
current->next = NULL;
|
||||
current->url = constr;
|
||||
current->size = step;
|
||||
current->pos = buf - orig_buf;
|
||||
/* A URL is relative if the host and protocol are not named,
|
||||
and the name does not start with `/'. */
|
||||
if (no_proto && *url_data != '/')
|
||||
current->flags |= (URELATIVE | UNOPROTO);
|
||||
else if (no_proto)
|
||||
current->flags |= UNOPROTO;
|
||||
free (needs_freeing);
|
||||
}
|
||||
free (orig_buf);
|
||||
|
||||
return first;
|
||||
read_file_free (fm);
|
||||
return head;
|
||||
}
|
||||
|
||||
/* Free the linked list of urlpos. */
|
||||
@ -1527,103 +1301,59 @@ no_proxy_match (const char *host, const char **no_proxy)
|
||||
return !sufmatch (no_proxy, host);
|
||||
}
|
||||
|
||||
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
|
||||
|
||||
/* Change the links in an HTML document. Accepts a structure that
|
||||
defines the positions of all the links. */
|
||||
void
|
||||
convert_links (const char *file, urlpos *l)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
FILE *fp;
|
||||
char *buf, *p, *p2;
|
||||
char *p;
|
||||
downloaded_file_t downloaded_file_return;
|
||||
long size;
|
||||
|
||||
{
|
||||
/* First we do a "dry run": go through the list L and see whether
|
||||
any URL needs to be converted in the first place. If not, just
|
||||
leave the file alone. */
|
||||
int count = 0;
|
||||
urlpos *dry = l;
|
||||
for (dry = l; dry; dry = dry->next)
|
||||
if (dry->flags & (UABS2REL | UREL2ABS))
|
||||
++count;
|
||||
if (!count)
|
||||
{
|
||||
logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"),
|
||||
file);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
|
||||
/* Read from the file.... */
|
||||
fp = fopen (file, "rb");
|
||||
if (!fp)
|
||||
|
||||
fm = read_file (file);
|
||||
if (!fm)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
||||
file, strerror (errno));
|
||||
return;
|
||||
}
|
||||
/* ...to a buffer. */
|
||||
load_file (fp, &buf, &size);
|
||||
fclose (fp);
|
||||
|
||||
downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
|
||||
|
||||
downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
|
||||
if (opt.backup_converted && downloaded_file_return)
|
||||
/* Rather than just writing over the original .html file with the converted
|
||||
version, save the former to *.orig. Note we only do this for files we've
|
||||
_successfully_ downloaded, so we don't clobber .orig files sitting around
|
||||
from previous invocations. */
|
||||
write_backup_file (file, downloaded_file_return);
|
||||
|
||||
/* Before opening the file for writing, unlink the file. This is
|
||||
important if the data in FM is mmaped. In such case, nulling the
|
||||
file, which is what fopen() below does, would make us read all
|
||||
zeroes from the mmaped region. */
|
||||
if (unlink (file) < 0 && errno != ENOENT)
|
||||
{
|
||||
/* Construct the backup filename as the original name plus ".orig". */
|
||||
size_t filename_len = strlen(file);
|
||||
char* filename_plus_orig_suffix;
|
||||
boolean already_wrote_backup_file = FALSE;
|
||||
slist* converted_file_ptr;
|
||||
static slist* converted_files = NULL;
|
||||
|
||||
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
|
||||
{
|
||||
/* Just write "orig" over "html". We need to do it this way because
|
||||
when we're checking to see if we've downloaded the file before (to
|
||||
see if we can skip downloading it), we don't know if it's a
|
||||
text/html file. Therefore we don't know yet at that stage that -E
|
||||
is going to cause us to tack on ".html", so we need to compare
|
||||
vs. the original URL plus ".orig", not the original URL plus
|
||||
".html.orig". */
|
||||
filename_plus_orig_suffix = xmalloc(filename_len + 1);
|
||||
strcpy(filename_plus_orig_suffix, file);
|
||||
strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
|
||||
}
|
||||
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
|
||||
{
|
||||
/* Append ".orig" to the name. */
|
||||
filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
|
||||
strcpy(filename_plus_orig_suffix, file);
|
||||
strcpy(filename_plus_orig_suffix + filename_len, ".orig");
|
||||
}
|
||||
|
||||
/* We can get called twice on the same URL thanks to the
|
||||
convert_all_links() call in main(). If we write the .orig file each
|
||||
time in such a case, it'll end up containing the first-pass conversion,
|
||||
not the original file. So, see if we've already been called on this
|
||||
file. */
|
||||
converted_file_ptr = converted_files;
|
||||
while (converted_file_ptr != NULL)
|
||||
if (strcmp(converted_file_ptr->string, file) == 0)
|
||||
{
|
||||
already_wrote_backup_file = TRUE;
|
||||
break;
|
||||
}
|
||||
else
|
||||
converted_file_ptr = converted_file_ptr->next;
|
||||
|
||||
if (!already_wrote_backup_file)
|
||||
{
|
||||
/* Rename <file> to <file>.orig before former gets written over. */
|
||||
if (rename(file, filename_plus_orig_suffix) != 0)
|
||||
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
|
||||
file, filename_plus_orig_suffix, strerror (errno));
|
||||
|
||||
/* Remember that we've already written a .orig backup for this file.
|
||||
Note that we never free this memory since we need it till the
|
||||
convert_all_links() call, which is one of the last things the
|
||||
program does before terminating. BTW, I'm not sure if it would be
|
||||
safe to just set 'converted_file_ptr->string' to 'file' below,
|
||||
rather than making a copy of the string... Another note is that I
|
||||
thought I could just add a field to the urlpos structure saying
|
||||
that we'd written a .orig file for this URL, but that didn't work,
|
||||
so I had to make this separate list. */
|
||||
converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
|
||||
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
|
||||
converted_file_ptr->next = converted_files;
|
||||
converted_files = converted_file_ptr;
|
||||
}
|
||||
|
||||
free(filename_plus_orig_suffix);
|
||||
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
|
||||
file, strerror (errno));
|
||||
read_file_free (fm);
|
||||
return;
|
||||
}
|
||||
/* Now open the file for writing. */
|
||||
fp = fopen (file, "wb");
|
||||
@ -1631,50 +1361,63 @@ convert_links (const char *file, urlpos *l)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
|
||||
file, strerror (errno));
|
||||
free (buf);
|
||||
read_file_free (fm);
|
||||
return;
|
||||
}
|
||||
/* Presumably we have to loop through multiple URLs here (even though we're
|
||||
only talking about a single local file) because of the -O option. */
|
||||
for (p = buf; l; l = l->next)
|
||||
/* Here we loop through all the URLs in file, replacing those of
|
||||
them that are downloaded with relative references. */
|
||||
p = fm->content;
|
||||
for (; l; l = l->next)
|
||||
{
|
||||
if (l->pos >= size)
|
||||
char *url_start = fm->content + l->pos;
|
||||
if (l->pos >= fm->length)
|
||||
{
|
||||
DEBUGP (("Something strange is going on. Please investigate."));
|
||||
break;
|
||||
}
|
||||
/* If the URL already is relative or it is not to be converted
|
||||
for some other reason (e.g. because of not having been
|
||||
downloaded in the first place), skip it. */
|
||||
if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
|
||||
/* If the URL is not to be converted, skip it. */
|
||||
if (!(l->flags & (UABS2REL | UREL2ABS)))
|
||||
{
|
||||
DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
|
||||
l->pos, l->flags));
|
||||
continue;
|
||||
}
|
||||
/* Else, reach the position of the offending URL, echoing
|
||||
everything up to it to the outfile. */
|
||||
for (p2 = buf + l->pos; p < p2; p++)
|
||||
putc (*p, fp);
|
||||
|
||||
/* Echo the file contents, up to the offending URL's opening
|
||||
quote, to the outfile. */
|
||||
fwrite (p, 1, url_start - p, fp);
|
||||
p = url_start;
|
||||
if (l->flags & UABS2REL)
|
||||
/* Convert absolute URL to relative. */
|
||||
{
|
||||
/* Convert absolute URL to relative. */
|
||||
char *newname = construct_relative (file, l->local_name);
|
||||
fprintf (fp, "%s", newname);
|
||||
putc (*p, fp); /* quoting char */
|
||||
fputs (newname, fp);
|
||||
p += l->size - 1;
|
||||
putc (*p, fp); /* close quote */
|
||||
++p;
|
||||
DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
|
||||
l->url, newname, l->pos, file));
|
||||
free (newname);
|
||||
}
|
||||
p += l->size;
|
||||
else if (l->flags & UREL2ABS)
|
||||
{
|
||||
/* Convert the link to absolute URL. */
|
||||
char *newlink = l->url;
|
||||
putc (*p, fp); /* quoting char */
|
||||
fputs (newlink, fp);
|
||||
p += l->size - 1;
|
||||
putc (*p, fp); /* close quote */
|
||||
++p;
|
||||
DEBUGP (("REL2ABS: <something> to %s at position %d in %s.\n",
|
||||
newlink, l->pos, file));
|
||||
}
|
||||
}
|
||||
/* Output the rest of the file. */
|
||||
if (p - buf < size)
|
||||
{
|
||||
for (p2 = buf + size; p < p2; p++)
|
||||
putc (*p, fp);
|
||||
}
|
||||
if (p - fm->content < fm->length)
|
||||
fwrite (p, 1, fm->length - (p - fm->content), fp);
|
||||
fclose (fp);
|
||||
free (buf);
|
||||
read_file_free (fm);
|
||||
logputs (LOG_VERBOSE, _("done.\n"));
|
||||
}
|
||||
|
||||
@ -1746,6 +1489,79 @@ add_url (urlpos *l, const char *url, const char *file)
|
||||
return t;
|
||||
}
|
||||
|
||||
static void
|
||||
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
||||
{
|
||||
/* Rather than just writing over the original .html file with the
|
||||
converted version, save the former to *.orig. Note we only do
|
||||
this for files we've _successfully_ downloaded, so we don't
|
||||
clobber .orig files sitting around from previous invocations. */
|
||||
|
||||
/* Construct the backup filename as the original name plus ".orig". */
|
||||
size_t filename_len = strlen(file);
|
||||
char* filename_plus_orig_suffix;
|
||||
boolean already_wrote_backup_file = FALSE;
|
||||
slist* converted_file_ptr;
|
||||
static slist* converted_files = NULL;
|
||||
|
||||
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
|
||||
{
|
||||
/* Just write "orig" over "html". We need to do it this way
|
||||
because when we're checking to see if we've downloaded the
|
||||
file before (to see if we can skip downloading it), we don't
|
||||
know if it's a text/html file. Therefore we don't know yet
|
||||
at that stage that -E is going to cause us to tack on
|
||||
".html", so we need to compare vs. the original URL plus
|
||||
".orig", not the original URL plus ".html.orig". */
|
||||
filename_plus_orig_suffix = alloca (filename_len + 1);
|
||||
strcpy(filename_plus_orig_suffix, file);
|
||||
strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
|
||||
}
|
||||
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
|
||||
{
|
||||
/* Append ".orig" to the name. */
|
||||
filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
|
||||
strcpy(filename_plus_orig_suffix, file);
|
||||
strcpy(filename_plus_orig_suffix + filename_len, ".orig");
|
||||
}
|
||||
|
||||
/* We can get called twice on the same URL thanks to the
|
||||
convert_all_links() call in main(). If we write the .orig file
|
||||
each time in such a case, it'll end up containing the first-pass
|
||||
conversion, not the original file. So, see if we've already been
|
||||
called on this file. */
|
||||
converted_file_ptr = converted_files;
|
||||
while (converted_file_ptr != NULL)
|
||||
if (strcmp(converted_file_ptr->string, file) == 0)
|
||||
{
|
||||
already_wrote_backup_file = TRUE;
|
||||
break;
|
||||
}
|
||||
else
|
||||
converted_file_ptr = converted_file_ptr->next;
|
||||
|
||||
if (!already_wrote_backup_file)
|
||||
{
|
||||
/* Rename <file> to <file>.orig before former gets written over. */
|
||||
if (rename(file, filename_plus_orig_suffix) != 0)
|
||||
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
|
||||
file, filename_plus_orig_suffix, strerror (errno));
|
||||
|
||||
/* Remember that we've already written a .orig backup for this file.
|
||||
Note that we never free this memory since we need it till the
|
||||
convert_all_links() call, which is one of the last things the
|
||||
program does before terminating. BTW, I'm not sure if it would be
|
||||
safe to just set 'converted_file_ptr->string' to 'file' below,
|
||||
rather than making a copy of the string... Another note is that I
|
||||
thought I could just add a field to the urlpos structure saying
|
||||
that we'd written a .orig file for this URL, but that didn't work,
|
||||
so I had to make this separate list. */
|
||||
converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
|
||||
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
|
||||
converted_file_ptr->next = converted_files;
|
||||
converted_files = converted_file_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remembers which files have been downloaded. In the standard case, should be
|
||||
called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
|
||||
@ -1798,3 +1614,10 @@ downloaded_file (downloaded_file_t mode, const char* file)
|
||||
return FILE_NOT_ALREADY_DOWNLOADED;
|
||||
}
|
||||
}
|
||||
|
||||
/* Initialization of static stuff. */
|
||||
void
|
||||
url_init (void)
|
||||
{
|
||||
init_unsafe_char_table ();
|
||||
}
|
||||
|
@ -88,6 +88,7 @@ struct urlinfo *newurl PARAMS ((void));
|
||||
void freeurl PARAMS ((struct urlinfo *, int));
|
||||
uerr_t urlproto PARAMS ((const char *));
|
||||
int skip_proto PARAMS ((const char *));
|
||||
int has_proto PARAMS ((const char *));
|
||||
int skip_uname PARAMS ((const char *));
|
||||
|
||||
uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int));
|
||||
@ -95,7 +96,7 @@ char *str_url PARAMS ((const struct urlinfo *, int));
|
||||
int url_equal PARAMS ((const char *, const char *));
|
||||
|
||||
urlpos *get_urls_file PARAMS ((const char *));
|
||||
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int));
|
||||
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
|
||||
void free_urlpos PARAMS ((urlpos *));
|
||||
|
||||
char *url_concat PARAMS ((const char *, const char *));
|
||||
|
301
src/utils.c
301
src/utils.c
@ -31,6 +31,9 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_MMAP
|
||||
# include <sys/mman.h>
|
||||
#endif
|
||||
#ifdef HAVE_PWD_H
|
||||
# include <pwd.h>
|
||||
#endif
|
||||
@ -45,11 +48,13 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#ifdef NeXT
|
||||
# include <libc.h> /* for access() */
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "wget.h"
|
||||
#include "utils.h"
|
||||
#include "fnmatch.h"
|
||||
#include "hash.h"
|
||||
|
||||
#ifndef errno
|
||||
extern int errno;
|
||||
@ -736,28 +741,149 @@ read_whole_line (FILE *fp)
|
||||
line = xrealloc (line, length + 1);
|
||||
return line;
|
||||
}
|
||||
|
||||
/* Read FILE into memory. A pointer to `struct file_memory' are
|
||||
returned; use struct element `content' to access file contents, and
|
||||
the element `length' to know the file length. `content' is *not*
|
||||
zero-terminated, and you should *not* read or write beyond the [0,
|
||||
length) range of characters.
|
||||
|
||||
/* Load file pointed to by FP to memory and return the malloc-ed
|
||||
buffer with the contents. *NREAD will contain the number of read
|
||||
bytes. The file is loaded in chunks, allocated exponentially,
|
||||
starting with FILE_BUFFER_SIZE bytes. */
|
||||
void
|
||||
load_file (FILE *fp, char **buf, long *nread)
|
||||
After you are done with the file contents, call read_file_free to
|
||||
release the memory.
|
||||
|
||||
Depending on the operating system and the type of file that is
|
||||
being read, read_file() either mmap's the file into memory, or
|
||||
reads the file into the core using read().
|
||||
|
||||
If file is named "-", fileno(stdin) is used for reading instead.
|
||||
If you want to read from a real file named "-", use "./-" instead. */
|
||||
|
||||
struct file_memory *
|
||||
read_file (const char *file)
|
||||
{
|
||||
long bufsize;
|
||||
int fd;
|
||||
struct file_memory *fm;
|
||||
long size;
|
||||
int inhibit_close = 0;
|
||||
|
||||
bufsize = 512;
|
||||
*nread = 0;
|
||||
*buf = NULL;
|
||||
while (!feof (fp) && !ferror (fp))
|
||||
/* Some magic in the finest tradition of Perl and its kin: if FILE
|
||||
is "-", just use stdin. */
|
||||
if (HYPHENP (file))
|
||||
{
|
||||
*buf = (char *)xrealloc (*buf, bufsize + *nread);
|
||||
*nread += fread (*buf + *nread, sizeof (char), bufsize, fp);
|
||||
bufsize <<= 1;
|
||||
fd = fileno (stdin);
|
||||
inhibit_close = 1;
|
||||
/* Note that we don't inhibit mmap() in this case. If stdin is
|
||||
redirected from a regular file, mmap() will still work. */
|
||||
}
|
||||
/* #### No indication of encountered error?? */
|
||||
else
|
||||
fd = open (file, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return NULL;
|
||||
fm = xmalloc (sizeof (struct file_memory));
|
||||
|
||||
#ifdef HAVE_MMAP
|
||||
{
|
||||
struct stat buf;
|
||||
if (fstat (fd, &buf) < 0)
|
||||
goto mmap_lose;
|
||||
fm->length = buf.st_size;
|
||||
/* NOTE: As far as I know, the callers of this function never
|
||||
modify the file text. Relying on this would enable us to
|
||||
specify PROT_READ and MAP_SHARED for a marginal gain in
|
||||
efficiency, but at some cost to generality. */
|
||||
fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE, fd, 0);
|
||||
if (fm->content == MAP_FAILED)
|
||||
goto mmap_lose;
|
||||
if (!inhibit_close)
|
||||
close (fd);
|
||||
|
||||
fm->mmap_p = 1;
|
||||
return fm;
|
||||
}
|
||||
|
||||
mmap_lose:
|
||||
/* The most common reason why mmap() fails is that FD does not point
|
||||
to a plain file. However, it's also possible that mmap() doesn't
|
||||
work for a particular type of file. Therefore, whenever mmap()
|
||||
fails, we just fall back to the regular method. */
|
||||
#endif /* HAVE_MMAP */
|
||||
|
||||
fm->length = 0;
|
||||
size = 512; /* number of bytes fm->contents can
|
||||
hold at any given time. */
|
||||
fm->content = xmalloc (size);
|
||||
while (1)
|
||||
{
|
||||
long nread;
|
||||
if (fm->length > size / 2)
|
||||
{
|
||||
/* #### I'm not sure whether the whole exponential-growth
|
||||
thing makes sense with kernel read. On Linux at least,
|
||||
read() refuses to read more than 4K from a file at a
|
||||
single chunk anyway. But other Unixes might optimize it
|
||||
better, and it doesn't *hurt* anything, so I'm leaving
|
||||
it. */
|
||||
|
||||
/* Normally, we grow SIZE exponentially to make the number
|
||||
of calls to read() and realloc() logarithmic in relation
|
||||
to file size. However, read() can read an amount of data
|
||||
smaller than requested, and it would be unreasonably to
|
||||
double SIZE every time *something* was read. Therefore,
|
||||
we double SIZE only when the length exceeds half of the
|
||||
entire allocated size. */
|
||||
size <<= 1;
|
||||
fm->content = xrealloc (fm->content, size);
|
||||
}
|
||||
nread = read (fd, fm->content + fm->length, size - fm->length);
|
||||
if (nread > 0)
|
||||
/* Successful read. */
|
||||
fm->length += nread;
|
||||
else if (nread < 0)
|
||||
/* Error. */
|
||||
goto lose;
|
||||
else
|
||||
/* EOF */
|
||||
break;
|
||||
}
|
||||
if (!inhibit_close)
|
||||
close (fd);
|
||||
if (size > fm->length && fm->length != 0)
|
||||
/* Due to exponential growth of fm->content, the allocated region
|
||||
might be much larger than what is actually needed. */
|
||||
fm->content = xrealloc (fm->content, fm->length);
|
||||
fm->mmap_p = 0;
|
||||
return fm;
|
||||
|
||||
lose:
|
||||
if (!inhibit_close)
|
||||
close (fd);
|
||||
free (fm->content);
|
||||
free (fm);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Release the resources held by FM. Specifically, this calls
|
||||
munmap() or free() on fm->content, depending whether mmap or
|
||||
malloc/read were used to read in the file. It also frees the
|
||||
memory needed to hold the FM structure itself. */
|
||||
|
||||
void
|
||||
read_file_free (struct file_memory *fm)
|
||||
{
|
||||
#ifdef HAVE_MMAP
|
||||
if (fm->mmap_p)
|
||||
{
|
||||
munmap (fm->content, fm->length);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
free (fm->content);
|
||||
}
|
||||
free (fm);
|
||||
}
|
||||
|
||||
/* Free the pointers in a NULL-terminated vector of pointers, then
|
||||
free the pointer itself. */
|
||||
void
|
||||
@ -801,97 +927,42 @@ merge_vecs (char **v1, char **v2)
|
||||
return v1;
|
||||
}
|
||||
|
||||
/* A set of simple-minded routines to store and search for strings in
|
||||
a linked list. You may add a string to the slist, and peek whether
|
||||
it's still in there at any time later. */
|
||||
/* A set of simple-minded routines to store strings in a linked list.
|
||||
This used to also be used for searching, but now we have hash
|
||||
tables for that. */
|
||||
|
||||
/* Add an element to the list. If flags is NOSORT, the list will not
|
||||
be sorted. */
|
||||
/* Append an element to the list. */
|
||||
slist *
|
||||
add_slist (slist *l, const char *s, int flags)
|
||||
slist_append (slist *l, const char *s)
|
||||
{
|
||||
slist *t, *old, *beg;
|
||||
int cmp;
|
||||
slist *newel = (slist *)xmalloc (sizeof (slist));
|
||||
slist *beg = l;
|
||||
|
||||
if (flags & NOSORT)
|
||||
{
|
||||
if (!l)
|
||||
{
|
||||
t = (slist *)xmalloc (sizeof (slist));
|
||||
t->string = xstrdup (s);
|
||||
t->next = NULL;
|
||||
return t;
|
||||
}
|
||||
beg = l;
|
||||
/* Find the last element. */
|
||||
while (l->next)
|
||||
l = l->next;
|
||||
t = (slist *)xmalloc (sizeof (slist));
|
||||
l->next = t;
|
||||
t->string = xstrdup (s);
|
||||
t->next = NULL;
|
||||
return beg;
|
||||
}
|
||||
/* Empty list or changing the first element. */
|
||||
if (!l || (cmp = strcmp (l->string, s)) > 0)
|
||||
{
|
||||
t = (slist *)xmalloc (sizeof (slist));
|
||||
t->string = xstrdup (s);
|
||||
t->next = l;
|
||||
return t;
|
||||
}
|
||||
newel->string = xstrdup (s);
|
||||
newel->next = NULL;
|
||||
|
||||
beg = l;
|
||||
if (cmp == 0)
|
||||
return beg;
|
||||
|
||||
/* Second two one-before-the-last element. */
|
||||
if (!l)
|
||||
return newel;
|
||||
/* Find the last element. */
|
||||
while (l->next)
|
||||
{
|
||||
old = l;
|
||||
l = l->next;
|
||||
cmp = strcmp (s, l->string);
|
||||
if (cmp == 0) /* no repeating in the list */
|
||||
return beg;
|
||||
else if (cmp > 0)
|
||||
continue;
|
||||
/* If the next list element is greater than s, put s between the
|
||||
current and the next list element. */
|
||||
t = (slist *)xmalloc (sizeof (slist));
|
||||
old->next = t;
|
||||
t->next = l;
|
||||
t->string = xstrdup (s);
|
||||
return beg;
|
||||
}
|
||||
t = (slist *)xmalloc (sizeof (slist));
|
||||
t->string = xstrdup (s);
|
||||
/* Insert the new element after the last element. */
|
||||
l->next = t;
|
||||
t->next = NULL;
|
||||
l = l->next;
|
||||
l->next = newel;
|
||||
return beg;
|
||||
}
|
||||
|
||||
/* Is there a specific entry in the list? */
|
||||
int
|
||||
in_slist (slist *l, const char *s)
|
||||
slist_contains (slist *l, const char *s)
|
||||
{
|
||||
int cmp;
|
||||
|
||||
while (l)
|
||||
{
|
||||
cmp = strcmp (l->string, s);
|
||||
if (cmp == 0)
|
||||
return 1;
|
||||
else if (cmp > 0) /* the list is ordered! */
|
||||
return 0;
|
||||
l = l->next;
|
||||
}
|
||||
for (; l; l = l->next)
|
||||
if (!strcmp (l->string, s))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Free the whole slist. */
|
||||
void
|
||||
free_slist (slist *l)
|
||||
slist_free (slist *l)
|
||||
{
|
||||
slist *n;
|
||||
|
||||
@ -903,6 +974,58 @@ free_slist (slist *l)
|
||||
l = n;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sometimes it's useful to create "sets" of strings, i.e. special
|
||||
hash tables where you want to store strings as keys and merely
|
||||
query for their existence. Here is a set of utility routines that
|
||||
makes that transparent. */
|
||||
|
||||
void
|
||||
string_set_add (struct hash_table *ht, const char *s)
|
||||
{
|
||||
/* We use "1" as value. It provides us a useful and clear arbitrary
|
||||
value, and it consumes no memory -- the pointers to the same
|
||||
string "1" will be shared by all the key-value pairs in the hash
|
||||
table. */
|
||||
hash_table_put (ht, xstrdup (s), "1");
|
||||
}
|
||||
|
||||
int
|
||||
string_set_exists (struct hash_table *ht, const char *s)
|
||||
{
|
||||
return hash_table_exists (ht, s);
|
||||
}
|
||||
|
||||
static int
|
||||
string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
|
||||
{
|
||||
free (key);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
string_set_free (struct hash_table *ht)
|
||||
{
|
||||
hash_table_map (ht, string_set_free_mapper, NULL);
|
||||
hash_table_destroy (ht);
|
||||
}
|
||||
|
||||
static int
|
||||
free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
|
||||
{
|
||||
free (key);
|
||||
free (value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Another utility function: call free() on all keys and values of HT. */
|
||||
|
||||
void
|
||||
free_keys_and_values (struct hash_table *ht)
|
||||
{
|
||||
hash_table_map (ht, free_keys_and_values_mapper, NULL);
|
||||
}
|
||||
|
||||
|
||||
/* Engine for legible and legible_long_long; this function works on
|
||||
strings. */
|
||||
|
27
src/utils.h
27
src/utils.h
@ -20,11 +20,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
/* Flags for slist. */
|
||||
enum {
|
||||
NOSORT = 1
|
||||
};
|
||||
|
||||
enum accd {
|
||||
ALLABS = 1
|
||||
};
|
||||
@ -36,6 +31,14 @@ typedef struct _slist
|
||||
struct _slist *next;
|
||||
} slist;
|
||||
|
||||
struct hash_table;
|
||||
|
||||
struct file_memory {
|
||||
char *content;
|
||||
long length;
|
||||
int mmap_p;
|
||||
};
|
||||
|
||||
char *time_str PARAMS ((time_t *));
|
||||
const char *uerrmsg PARAMS ((uerr_t));
|
||||
|
||||
@ -58,13 +61,19 @@ int accdir PARAMS ((const char *s, enum accd));
|
||||
char *suffix PARAMS ((const char *s));
|
||||
|
||||
char *read_whole_line PARAMS ((FILE *));
|
||||
void load_file PARAMS ((FILE *, char **, long *));
|
||||
struct file_memory *read_file PARAMS ((const char *));
|
||||
void read_file_free PARAMS ((struct file_memory *));
|
||||
|
||||
void free_vec PARAMS ((char **));
|
||||
char **merge_vecs PARAMS ((char **, char **));
|
||||
slist *add_slist PARAMS ((slist *, const char *, int));
|
||||
int in_slist PARAMS ((slist *, const char *));
|
||||
void free_slist PARAMS ((slist *));
|
||||
slist *slist_append PARAMS ((slist *, const char *));
|
||||
int slist_contains PARAMS ((slist *, const char *));
|
||||
void slist_free PARAMS ((slist *));
|
||||
|
||||
void string_set_add PARAMS ((struct hash_table *, const char *));
|
||||
int string_set_exists PARAMS ((struct hash_table *, const char *));
|
||||
void string_set_free PARAMS ((struct hash_table *));
|
||||
void free_keys_and_values PARAMS ((struct hash_table *));
|
||||
|
||||
char *legible PARAMS ((long));
|
||||
char *legible_very_long PARAMS ((VERY_LONG_TYPE));
|
||||
|
@ -71,7 +71,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||||
|
||||
/* Print X if debugging is enabled; a no-op otherwise. */
|
||||
#ifdef DEBUG
|
||||
# define DEBUGP(x) do { debug_logprintf x; } while (0)
|
||||
# define DEBUGP(x) do { if (opt.debug) { debug_logprintf x; } } while (0)
|
||||
#else /* not DEBUG */
|
||||
# define DEBUGP(x) DO_NOTHING
|
||||
#endif /* not DEBUG */
|
||||
|
Loading…
Reference in New Issue
Block a user