[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  Very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2000-11-19 12:50:10 -08:00
parent ccf31643ab
commit b0b1c815c1
39 changed files with 3518 additions and 901 deletions

View File

@ -1,3 +1,7 @@
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* configure.in: Test for MMAP.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* windows/config.h.ms: snprintf and vsnprintf exist under Windows.

12
TODO
View File

@ -49,15 +49,6 @@ changes.
* Make `-k' check for files that were downloaded in the past and convert links
to them in newly-downloaded documents.
* -k should convert relative references to absolute if not downloaded.
* -k should convert "hostless absolute" URLs, like <A HREF="/index.html">.
However, Brian McMahon <bm@iucr.org> wants the old incorrect behavior to still
be available as an option, as he depends on it to allow mirrors of his site to
send CGI queries to his original site, but still get graphics off of the
mirror site. Perhaps this would be better dealt with by adding an option to
tell -k not to convert certain URL patterns?
* Add option to clobber existing file names (no `.N' suffixes).
* Introduce a concept of "boolean" options. For instance, every
@ -85,9 +76,6 @@ changes.
* Allow size limit to files (perhaps with an option to download oversize files
up through the limit or not at all, to get more functionality than [u]limit).
* Recognize HTML comments correctly. Add more options for handling
bogus HTML found all over the 'net.
* Implement breadth-first retrieval.
* Download to .in* when mirroring.

350
configure vendored
View File

@ -2040,15 +2040,55 @@ EOF
fi
for ac_func in strdup strstr strcasecmp strncasecmp
for ac_hdr in unistd.h
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
echo "configure:2048: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2053 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
eval "ac_cv_header_$ac_safe=yes"
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_header_$ac_safe=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
cat >> confdefs.h <<EOF
#define $ac_tr_hdr 1
EOF
else
echo "$ac_t""no" 1>&6
fi
done
for ac_func in getpagesize
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2047: checking for $ac_func" >&5
echo "configure:2087: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2052 "configure"
#line 2092 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2071,7 +2111,233 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_func_$ac_func=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
cat >> confdefs.h <<EOF
#define $ac_tr_func 1
EOF
else
echo "$ac_t""no" 1>&6
fi
done
echo $ac_n "checking for working mmap""... $ac_c" 1>&6
echo "configure:2140: checking for working mmap" >&5
if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test "$cross_compiling" = yes; then
ac_cv_func_mmap_fixed_mapped=no
else
cat > conftest.$ac_ext <<EOF
#line 2148 "configure"
#include "confdefs.h"
/* Thanks to Mike Haertel and Jim Avera for this test.
Here is a matrix of mmap possibilities:
mmap private not fixed
mmap private fixed at somewhere currently unmapped
mmap private fixed at somewhere already mapped
mmap shared not fixed
mmap shared fixed at somewhere currently unmapped
mmap shared fixed at somewhere already mapped
For private mappings, we should verify that changes cannot be read()
back from the file, nor mmap's back from the file at a different
address. (There have been systems where private was not correctly
implemented like the infamous i386 svr4.0, and systems where the
VM page cache was not coherent with the filesystem buffer cache
like early versions of FreeBSD and possibly contemporary NetBSD.)
For shared mappings, we should conversely verify that changes get
propagated back to all the places they're supposed to be.
Grep wants private fixed already mapped.
The main things grep needs to know about mmap are:
* does it exist and is it safe to write into the mmap'd area
* how to use it (BSD variants) */
#include <sys/types.h>
#include <fcntl.h>
#include <sys/mman.h>
/* This mess was copied from the GNU getpagesize.h. */
#ifndef HAVE_GETPAGESIZE
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
/* Assume that all systems that can run configure have sys/param.h. */
# ifndef HAVE_SYS_PARAM_H
# define HAVE_SYS_PARAM_H 1
# endif
# ifdef _SC_PAGESIZE
# define getpagesize() sysconf(_SC_PAGESIZE)
# else /* no _SC_PAGESIZE */
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# ifdef EXEC_PAGESIZE
# define getpagesize() EXEC_PAGESIZE
# else /* no EXEC_PAGESIZE */
# ifdef NBPG
# define getpagesize() NBPG * CLSIZE
# ifndef CLSIZE
# define CLSIZE 1
# endif /* no CLSIZE */
# else /* no NBPG */
# ifdef NBPC
# define getpagesize() NBPC
# else /* no NBPC */
# ifdef PAGESIZE
# define getpagesize() PAGESIZE
# endif /* PAGESIZE */
# endif /* no NBPC */
# endif /* no NBPG */
# endif /* no EXEC_PAGESIZE */
# else /* no HAVE_SYS_PARAM_H */
# define getpagesize() 8192 /* punt totally */
# endif /* no HAVE_SYS_PARAM_H */
# endif /* no _SC_PAGESIZE */
#endif /* no HAVE_GETPAGESIZE */
#ifdef __cplusplus
extern "C" { void *malloc(unsigned); }
#else
char *malloc();
#endif
int
main()
{
char *data, *data2, *data3;
int i, pagesize;
int fd;
pagesize = getpagesize();
/*
* First, make a file with some known garbage in it.
*/
data = malloc(pagesize);
if (!data)
exit(1);
for (i = 0; i < pagesize; ++i)
*(data + i) = rand();
umask(0);
fd = creat("conftestmmap", 0600);
if (fd < 0)
exit(1);
if (write(fd, data, pagesize) != pagesize)
exit(1);
close(fd);
/*
* Next, try to mmap the file at a fixed address which
* already has something else allocated at it. If we can,
* also make sure that we see the same garbage.
*/
fd = open("conftestmmap", O_RDWR);
if (fd < 0)
exit(1);
data2 = malloc(2 * pagesize);
if (!data2)
exit(1);
data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1);
if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0L))
exit(1);
for (i = 0; i < pagesize; ++i)
if (*(data + i) != *(data2 + i))
exit(1);
/*
* Finally, make sure that changes to the mapped area
* do not percolate back to the file as seen by read().
* (This is a bug on some variants of i386 svr4.0.)
*/
for (i = 0; i < pagesize; ++i)
*(data2 + i) = *(data2 + i) + 1;
data3 = malloc(pagesize);
if (!data3)
exit(1);
if (read(fd, data3, pagesize) != pagesize)
exit(1);
for (i = 0; i < pagesize; ++i)
if (*(data + i) != *(data3 + i))
exit(1);
close(fd);
unlink("conftestmmap");
exit(0);
}
EOF
if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
ac_cv_func_mmap_fixed_mapped=yes
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -fr conftest*
ac_cv_func_mmap_fixed_mapped=no
fi
rm -fr conftest*
fi
fi
echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6
if test $ac_cv_func_mmap_fixed_mapped = yes; then
cat >> confdefs.h <<\EOF
#define HAVE_MMAP 1
EOF
fi
for ac_func in strdup strstr strcasecmp strncasecmp
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2313: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2318 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
#include <assert.h>
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
builtin and then its argument prototype would still apply. */
char $ac_func();
int main() {
/* The GNU C library defines this for functions which it implements
to always fail with ENOSYS. Some functions are actually named
something starting with __ and the normal name is an alias. */
#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
choke me
#else
$ac_func();
#endif
; return 0; }
EOF
if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2098,12 +2364,12 @@ done
for ac_func in gettimeofday mktime strptime
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2102: checking for $ac_func" >&5
echo "configure:2368: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2107 "configure"
#line 2373 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2126,7 +2392,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2130: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2153,12 +2419,12 @@ done
for ac_func in strerror snprintf vsnprintf select signal symlink access isatty
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2157: checking for $ac_func" >&5
echo "configure:2423: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2162 "configure"
#line 2428 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2181,7 +2447,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2185: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2208,12 +2474,12 @@ done
for ac_func in uname gethostname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2212: checking for $ac_func" >&5
echo "configure:2478: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2217 "configure"
#line 2483 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2236,7 +2502,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2264,12 +2530,12 @@ done
for ac_func in gethostbyname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2268: checking for $ac_func" >&5
echo "configure:2534: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2273 "configure"
#line 2539 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2292,7 +2558,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2314,7 +2580,7 @@ EOF
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
echo "configure:2318: checking for gethostbyname in -lnsl" >&5
echo "configure:2584: checking for gethostbyname in -lnsl" >&5
ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2322,7 +2588,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lnsl $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2326 "configure"
#line 2592 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2333,7 +2599,7 @@ int main() {
gethostbyname()
; return 0; }
EOF
if { (eval echo configure:2337: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2367,7 +2633,7 @@ done
echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6
echo "configure:2371: checking for socket in -lsocket" >&5
echo "configure:2637: checking for socket in -lsocket" >&5
ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2375,7 +2641,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lsocket $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2379 "configure"
#line 2645 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2386,7 +2652,7 @@ int main() {
socket()
; return 0; }
EOF
if { (eval echo configure:2390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2417,7 +2683,7 @@ fi
if test "x${with_socks}" = xyes
then
echo $ac_n "checking for main in -lresolv""... $ac_c" 1>&6
echo "configure:2421: checking for main in -lresolv" >&5
echo "configure:2687: checking for main in -lresolv" >&5
ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2425,14 +2691,14 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lresolv $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2429 "configure"
#line 2695 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
if { (eval echo configure:2436: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2460,7 +2726,7 @@ else
fi
echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6
echo "configure:2464: checking for Rconnect in -lsocks" >&5
echo "configure:2730: checking for Rconnect in -lsocks" >&5
ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2468,7 +2734,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lsocks $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2472 "configure"
#line 2738 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2479,7 +2745,7 @@ int main() {
Rconnect()
; return 0; }
EOF
if { (eval echo configure:2483: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2511,7 +2777,7 @@ fi
ALL_LINGUAS="cs de hr it no pl pt_BR ru"
echo $ac_n "checking whether NLS is requested""... $ac_c" 1>&6
echo "configure:2515: checking whether NLS is requested" >&5
echo "configure:2781: checking whether NLS is requested" >&5
# Check whether --enable-nls or --disable-nls was given.
if test "${enable_nls+set}" = set; then
enableval="$enable_nls"
@ -2528,7 +2794,7 @@ fi
# Extract the first word of "msgfmt", so it can be a program name with args.
set dummy msgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2532: checking for $ac_word" >&5
echo "configure:2798: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2562,7 +2828,7 @@ fi
# Extract the first word of "xgettext", so it can be a program name with args.
set dummy xgettext; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2566: checking for $ac_word" >&5
echo "configure:2832: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2597,7 +2863,7 @@ fi
# Extract the first word of "gmsgfmt", so it can be a program name with args.
set dummy gmsgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2601: checking for $ac_word" >&5
echo "configure:2867: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2647,17 +2913,17 @@ fi
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
echo "configure:2651: checking for $ac_hdr" >&5
echo "configure:2917: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2656 "configure"
#line 2922 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
@ -2687,12 +2953,12 @@ done
for ac_func in gettext
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2691: checking for $ac_func" >&5
echo "configure:2957: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2696 "configure"
#line 2962 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2715,7 +2981,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2737,7 +3003,7 @@ EOF
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gettext in -lintl""... $ac_c" 1>&6
echo "configure:2741: checking for gettext in -lintl" >&5
echo "configure:3007: checking for gettext in -lintl" >&5
ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2745,7 +3011,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lintl $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2749 "configure"
#line 3015 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2756,7 +3022,7 @@ int main() {
gettext()
; return 0; }
EOF
if { (eval echo configure:2760: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2824,7 +3090,7 @@ do
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2828: checking for $ac_word" >&5
echo "configure:3094: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else

View File

@ -160,6 +160,7 @@ dnl
dnl Checks for library functions.
dnl
AC_FUNC_ALLOCA
AC_FUNC_MMAP
AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp)
AC_CHECK_FUNCS(gettimeofday mktime strptime)
AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)

View File

@ -1,3 +1,8 @@
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
* wget.texi (Robots): Document that we now support the meta tag
exclusion.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* wget.texi: Use --- consistently.

View File

@ -2548,8 +2548,8 @@ this:
This is explained in some detail at
@url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}.
Unfortunately, Wget does not support this method of robot exclusion yet,
but it will be implemented in the next release.
Wget supports this method of robot exclusion in addition to the usual
@file{/robots.txt} exclusion.
@node Security Considerations, Contributors, Robots, Appendices
@section Security Considerations

BIN
po/cs.gmo

Binary file not shown.

BIN
po/de.gmo

Binary file not shown.

BIN
po/hr.gmo

Binary file not shown.

BIN
po/it.gmo

Binary file not shown.

BIN
po/no.gmo

Binary file not shown.

BIN
po/pl.gmo

Binary file not shown.

Binary file not shown.

BIN
po/ru.gmo

Binary file not shown.

View File

@ -1,3 +1,117 @@
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* retr.c (get_contents): If use_expected, make sure that the
appropriate amount of data is being read.
* http.c (gethttp): Check for both `Keep-Alive: ...' and
`Connection: Keep-Alive'.
* wget.h (DEBUGP): Call debug_logprintf only if opt.debug is
turned on.
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* http.c (connection_available_p): Use it.
* connect.c (test_socket_open): New function.
* http.c (gethttp): Support persistent connections. Based on the
ideas, and partly on code, by Sam Horrocks <sam@daemoninc.com>.
(register_persistent): New function.
(connection_available_p): Ditto.
(invalidate_connection): Ditto.
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Handle UREL2ABS case.
* recur.c (recursive_retrieve): Instead of the list
urls_downloaded, use hash tables dl_file_url_map and
dl_url_file_map.
(convert_all_links): Use them to retrieve data.
* host.c (clean_hosts): Free the hash tables.
* main.c (private_initialize): Call host_init().
* host.c (store_hostaddress): Use a saner, hash table-based data
model.
(realhost): Ditto.
(host_init): Initialize the hash tables.
2000-11-18 Hrvoje Niksic <hniksic@arsdigita.com>
* utils.c (slist_append): Eviscerate NOSORT. Hash tables are now
used for what the sorted slists used to be used for.
(slist_contains): Don't rely on the list being sorted.
(slist_append): Simplify the code.
* recur.c (recursive_cleanup): Use free_string_set.
* utils.c (string_set_add, string_set_exists, string_set_free):
New functions for easier freeing of hash tables whose keys are
strdup'ed strings.
* recur.c (recursive_retrieve): Use the hash table functions for
storing undesirable URLs.
* hash.c: New file.
2000-11-17 Hrvoje Niksic <hniksic@arsdigita.com>
* main.c (private_initialize): Call url_init.
(main): Call private_initialize.
* url.c (unsafe_char_table): New table.
(UNSAFE_CHAR): Use it.
(init_unsafe_char_table): New function.
(url_init): New function; call init_unsafe_char_table.
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
* html-url.c (handle_link): Handle HTML fragment identifiers.
* recur.c (recursive_retrieve): If norobot info is respected and
the file is specified not to be followed by robots, respect that.
* html-url.c (collect_tags_mapper): Handle <meta name=robots
content=X>. For us the important cases are where X is NONE or
where X contains NOFOLLOW.
(get_urls_html): Propagate that information to the caller.
2000-11-13 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Unlink the file we might be reading from
before writing to it.
(convert_links): Use alloca instead of malloc for
filename_plus_orig_suffix.
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (get_urls_file): Ditto.
(convert_links): Ditto.
* html-url.c (get_urls_html): Use read_file() instead of
load_file().
* utils.c (read_file): New function, instead of the old
load_file().
(read_file_free): Ditto.
* url.c (findurl): Search only for the supported protocols.
(convert_links): Use fwrite() when writing out a region of
characters.
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* ftp-ls.c: Move html_quote_string and ftp_index here.
* url.c: Remove get_urls_html, since that's now in html-url.c.
* html-url.c: New file.
* html-parse.c: New file.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* mswindows.h: Define snprintf and vsnprintf to _snprintf and

View File

@ -57,9 +57,10 @@ MD5_OBJ = @MD5_OBJ@
OPIE_OBJ = @OPIE_OBJ@
OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \
ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \
http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \
recur$o retr$o snprintf$o url$o utils$o version$o
ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \
html-parse$o html-url$o http$o init$o log$o main$o \
$(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \
url$o utils$o version$o
.SUFFIXES:
.SUFFIXES: .c .o ._c ._o
@ -133,26 +134,31 @@ TAGS: *.c *.h
# DO NOT DELETE THIS LINE -- make depend depends on it.
cmpt$o: config.h wget.h sysdep.h options.h
connect$o: config.h wget.h sysdep.h options.h connect.h host.h
fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h
ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h
ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h
ftp-opie$o: config.h wget.h sysdep.h options.h md5.h
ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h
getopt$o: wget.h sysdep.h options.h
headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h
host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h
html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h
http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h
init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h
log$o: config.h wget.h sysdep.h options.h utils.h
main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h
md5$o: wget.h sysdep.h options.h md5.h
mswindows$o: config.h winsock.h wget.h sysdep.h options.h url.h
netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h
rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h
recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h
retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h
url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h
utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h
cmpt$o: wget.h
connect$o: wget.h connect.h host.h
fnmatch$o: wget.h fnmatch.h
ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h
ftp-ls$o: wget.h utils.h ftp.h url.h
ftp-opie$o: wget.h md5.h
ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h
getopt$o: wget.h getopt.h
hash$o: wget.h utils.h hash.h
headers$o: wget.h connect.h rbuf.h headers.h
host$o: wget.h utils.h host.h url.h hash.h
html-parse$o: wget.h html-parse.h
html-url$o: wget.h html-parse.h url.h utils.h
html$o: wget.h url.h utils.h ftp.h
http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h
init$o: wget.h utils.h init.h host.h recur.h netrc.h
log$o: wget.h utils.h
main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h
md5$o: wget.h md5.h
mswindows$o: wget.h url.h
netrc$o: wget.h utils.h netrc.h init.h
rbuf$o: wget.h rbuf.h connect.h
recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h
retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h
snprintf$o:
url$o: wget.h utils.h url.h host.h
utils$o: wget.h utils.h fnmatch.h hash.h
version$o:

View File

@ -101,6 +101,9 @@ char *alloca ();
/* Define if you have the uname function. */
#undef HAVE_UNAME
/* Define if you have a working version of mmap. */
#undef HAVE_MMAP
/* Define if you have the gethostname function. */
#undef HAVE_GETHOSTNAME

View File

@ -107,6 +107,37 @@ make_connection (int *sock, char *hostname, unsigned short port)
return NOCONERROR;
}
/* Probe whether the connection on SOCK is still usable, i.e. has not
   been closed by the peer.  Returns 1 if the socket looks alive, 0
   otherwise.  Without select() we have no cheap way to tell, so we
   optimistically report the connection as open.  */
int
test_socket_open (int sock)
{
#ifdef HAVE_SELECT
  fd_set readfds;
  struct timeval timeout;

  /* Technique from Andrew Maholski's code in the Unix Socket FAQ:
     poll the descriptor for readability with a near-zero timeout.
     A live, idle connection has nothing to read, so select() times
     out; a closed connection reports EOF as "readable".  */
  FD_ZERO (&readfds);
  FD_SET (sock, &readfds);

  /* Wait one microsecond.  */
  timeout.tv_sec = 0;
  timeout.tv_usec = 1;

  /* Timeout (return value 0) means still connected; anything else
     (readable EOF or error) means the connection is gone.  */
  if (select (sock + 1, &readfds, NULL, NULL, &timeout) == 0)
    return 1;
  else
    return 0;
#else  /* not HAVE_SELECT */
  /* Without select, it's hard to know for sure.  */
  return 1;
#endif /* not HAVE_SELECT */
}
/* Bind the local port PORT. This does all the necessary work, which
is creating a socket, setting SO_REUSEADDR option on it, then
calling bind() and listen(). If *PORT is 0, a random port is

View File

@ -36,6 +36,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "wget.h"
#include "utils.h"
#include "ftp.h"
#include "url.h"
/* Converts symbolic permissions to number-style ones, e.g. string
rwxr-xr-x to 755. For now, it knows nothing of
@ -388,3 +389,175 @@ ftp_parse_ls (const char *file)
{
return ftp_parse_unix_ls (file);
}
/* Stuff for creating FTP index. */
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
entities, as per RFC1866:
`&' -> `&amp;'
`<' -> `&lt;'
`>' -> `&gt;'
`"' -> `&quot;'
No other entities are recognized or replaced. */
static char *
html_quote_string (const char *s)
{
const char *b = s;
char *p, *res;
int i;
/* Pass through the string, and count the new size. */
for (i = 0; *s; s++, i++)
{
if (*s == '&')
i += 4; /* `amp;' */
else if (*s == '<' || *s == '>')
i += 3; /* `lt;' and `gt;' */
else if (*s == '\"')
i += 5; /* `quot;' */
}
res = (char *)xmalloc (i + 1);
s = b;
for (p = res; *s; s++)
{
switch (*s)
{
case '&':
*p++ = '&';
*p++ = 'a';
*p++ = 'm';
*p++ = 'p';
*p++ = ';';
break;
case '<': case '>':
*p++ = '&';
*p++ = (*s == '<' ? 'l' : 'g');
*p++ = 't';
*p++ = ';';
break;
case '\"':
*p++ = '&';
*p++ = 'q';
*p++ = 'u';
*p++ = 'o';
*p++ = 't';
*p++ = ';';
break;
default:
*p++ = *s;
}
}
*p = '\0';
return res;
}
/* The function creates an HTML index containing references to given
directories and files on the appropriate host. The references are
FTP. */
/* Create an HTML index page listing the files and directories F (a
   linked list of `struct fileinfo', as produced by ftp_parse_ls())
   found on the FTP server described by U.  Output goes to FILE, or
   to opt.dfp when the user redirected all document output there.
   Each entry becomes an FTP link back to the server, so the index
   can be browsed.  Returns FTPOK on success, or FOPENERR if FILE
   cannot be opened for writing. */
uerr_t
ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
{
  FILE *fp;
  char *upwd;
  char *htclfile;		/* HTML-clean file name */

  /* Open our own output file only when documents are not being
     funneled into opt.dfp. */
  if (!opt.dfp)
    {
      fp = fopen (file, "wb");
      if (!fp)
	{
	  logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
	  return FOPENERR;
	}
    }
  else
    fp = opt.dfp;
  /* Build the `user[:password]@' prefix used in every link, with
     both components URL-cleaned; an empty string when the URL
     carried no user. */
  if (u->user)
    {
      char *tmpu, *tmpp;        /* temporary, clean user and passwd */

      tmpu = CLEANDUP (u->user);
      tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
      upwd = (char *)xmalloc (strlen (tmpu)
			      + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
      sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
      free (tmpu);
      FREE_MAYBE (tmpp);
    }
  else
    upwd = xstrdup ("");
  /* Page header: title repeats in <title> and <h1>. */
  fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
  fprintf (fp, "<html>\n<head>\n<title>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</h1>\n<hr>\n<pre>\n");
  /* One <pre> line per directory entry: timestamp, type, link. */
  while (f)
    {
      fprintf (fp, " ");
      if (f->tstamp != -1)
	{
	  /* #### Should we translate the months? */
	  static char *months[] = {
	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
	  };
	  /* NOTE(review): casts &f->tstamp to time_t * -- assumes the
	     tstamp field has a time_t-compatible representation;
	     confirm against struct fileinfo's declaration. */
	  struct tm *ptm = localtime ((time_t *)&f->tstamp);

	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
		   ptm->tm_mday);
	  /* An hour of zero is taken to mean "no time information". */
	  if (ptm->tm_hour)
	    fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
	  else
	    fprintf (fp, " ");
	}
      else
	fprintf (fp, _("time unknown "));
      switch (f->type)
	{
	case FT_PLAINFILE:
	  fprintf (fp, _("File "));
	  break;
	case FT_DIRECTORY:
	  fprintf (fp, _("Directory "));
	  break;
	case FT_SYMLINK:
	  fprintf (fp, _("Link "));
	  break;
	default:
	  fprintf (fp, _("Not sure "));
	  break;
	}
      /* Emit the link: ftp://[user[:pass]@]host:port/dir/file with
	 the file name HTML-quoted.  Directories get a trailing slash
	 in both the href and the link text. */
      htclfile = html_quote_string (f->name);
      fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
      if (*u->dir != '/')
	putc ('/', fp);
      fprintf (fp, "%s", u->dir);
      if (*u->dir)
	putc ('/', fp);
      fprintf (fp, "%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "\">%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "</a> ");
      /* Trailing annotation: size for plain files, target for links. */
      if (f->type == FT_PLAINFILE)
	fprintf (fp, _(" (%s bytes)"), legible (f->size));
      else if (f->type == FT_SYMLINK)
	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
      putc ('\n', fp);
      free (htclfile);
      f = f->next;
    }
  fprintf (fp, "</pre>\n</body>\n</html>\n");
  free (upwd);
  /* Close only a stream we opened ourselves; opt.dfp belongs to the
     caller, so merely flush it. */
  if (!opt.dfp)
    fclose (fp);
  else
    fflush (fp);
  return FTPOK;
}

View File

@ -40,7 +40,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "rbuf.h"
#include "retr.h"
#include "ftp.h"
#include "html.h"
#include "connect.h"
#include "host.h"
#include "fnmatch.h"
@ -722,7 +721,7 @@ Error in server response, closing control connection.\n"));
}
reset_timer ();
/* Get the contents of the document. */
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf);
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0);
con->dltime = elapsed_time ();
tms = time_str (NULL);
tmrate = rate (*len - restval, con->dltime);

View File

@ -92,4 +92,6 @@ typedef struct
struct fileinfo *ftp_parse_ls PARAMS ((const char *));
uerr_t ftp_loop PARAMS ((struct urlinfo *, int *));
uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *);
#endif /* FTP_H */

403
src/hash.c Normal file
View File

@ -0,0 +1,403 @@
/* Hash tables.
Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdlib.h>
#include <assert.h>
#include "wget.h"
#include "utils.h"
#include "hash.h"
#ifdef STANDALONE
# define xmalloc malloc
# define xrealloc realloc
#endif
/* This file implements simple hash tables based on linear probing.
The hash table stores key-value pairs in a contiguous array. Both
key and value are void pointers that the hash and test functions
know how to handle.
Although Knuth & co. recommend double hashing over linear probing,
we use the latter because it accesses array elements sequentially
in case of a collision, yielding in better cache behaviour and
ultimately in better speed. To avoid collision problems with
linear probing, we make sure that the table grows as soon as the
fullness/size ratio exceeds 75%. */
/* One slot of the table: the caller-supplied key and value pointers.
   The key field doubles as the slot's state marker -- see
   EMPTY_ENTRY_P and DELETED_ENTRY_P below. */
struct ht_pair {
  void *key;
  void *value;
};

struct hash_table {
  unsigned long (*hash_function) (const void *); /* key -> hash value */
  int (*test_function) (const void *, const void *); /* key equality test */
  int size;			/* size of the array */
  int fullness;			/* number of non-empty fields */
  int count;			/* number of non-empty, non-deleted
				   fields. */
  struct ht_pair *pairs;	/* the slot array itself */
};

/* Tombstone marker for removed entries.  Such slots still count
   towards fullness so probe chains stay intact until the table is
   regrown.  NOTE(review): relies on 0xdeadbeef never colliding with
   a real key pointer -- true in practice, but the integer-to-pointer
   conversion is implementation-defined. */
#define ENTRY_DELETED ((void *)0xdeadbeef)

#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL)
/* Find a prime near, but greater than or equal to, SIZE.  Falls back
   to returning SIZE itself for requests beyond the largest table
   entry.  The loop index is a size_t and the comparison against SIZE
   is made explicit, so no implicit signed/unsigned conversion
   warnings arise; note that the largest entries do not fit in an
   int, so such (impractically large) results are truncated exactly
   as before. */
int
prime_size (int size)
{
  static const unsigned long primes [] = {
    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
    1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
  };
  size_t i;

  for (i = 0; i < sizeof (primes) / sizeof (primes[0]); i++)
    if (primes[i] >= (unsigned long) size)
      return primes[i];
  /* huh? */
  return size;
}
/* Allocate and return a new hash table using HASH_FUNCTION to hash
   keys and TEST_FUNCTION to compare them.  INITIAL_SIZE is rounded
   up to a prime; pass 0 to start with the smallest table, which is
   regrown on demand. */
struct hash_table *
hash_table_new (int initial_size,
		unsigned long (*hash_function) (const void *),
		int (*test_function) (const void *, const void *))
{
  struct hash_table *ht;

  ht = (struct hash_table *)xmalloc (sizeof (struct hash_table));
  ht->hash_function = hash_function;
  ht->test_function = test_function;
  ht->fullness = 0;
  ht->count = 0;
  ht->size = prime_size (initial_size);
  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
  return ht;
}
/* Free the table HT itself.  The keys and values stored in it are
   NOT freed; they remain the caller's responsibility. */
void
hash_table_destroy (struct hash_table *ht)
{
  struct ht_pair *slots = ht->pairs;

  free (slots);
  free (ht);
}
/* Look up KEY in HT and return the associated value, or NULL when
   the key is absent.  Since NULL is also a legal stored value, use
   hash_table_exists() when you need to distinguish "missing" from
   "stored NULL". */
void *
hash_table_get (struct hash_table *ht, const void *key)
{
  int i = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* An empty slot terminates the probe chain: KEY is absent. */
      if (EMPTY_ENTRY_P (slot->key))
	return NULL;

      /* A live (non-tombstone) slot with a matching key is a hit. */
      if (!DELETED_ENTRY_P (slot->key)
	  && ht->test_function (key, slot->key))
	return slot->value;

      /* Otherwise continue the linear probe, wrapping around. */
      if (++i == ht->size)
	i = 0;
    }
}
/* Return 1 if KEY is present in HT, 0 otherwise.  Unlike
   hash_table_get(), this can tell a stored NULL value apart from a
   missing key. */
int
hash_table_exists (struct hash_table *ht, const void *key)
{
  int i = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* Empty slot: end of the probe chain, KEY is not here. */
      if (EMPTY_ENTRY_P (slot->key))
	return 0;

      /* Live slot with a matching key: found it. */
      if (!DELETED_ENTRY_P (slot->key)
	  && ht->test_function (key, slot->key))
	return 1;

      /* Keep probing linearly, wrapping at the end of the array. */
      if (++i == ht->size)
	i = 0;
    }
}
#define MAX(i, j) (((i) >= (j)) ? (i) : (j))

/* Resize HT as needed and rehash every live key-value pair into the
   new slot array.  Tombstones are dropped in the process, so a table
   bloated by deletions may simply be rehashed at its current size
   rather than enlarged -- the target size is derived from the live
   entry count, and MAX prevents actual shrinking. */
static void
grow_hash_table (struct hash_table *ht)
{
  struct ht_pair *old_slots = ht->pairs;
  int old_size = ht->size;
  int old_count = ht->count;	/* for the sanity check below */
  int wanted;
  int i;

  /* The usual policy would be to double the size and round up to a
     prime:
       ht->size = prime_size (ht->size * 2);
     but basing the target on ht->count (live entries) instead keeps
     a deletion-heavy table from growing blindly. */
  wanted = prime_size (ht->count * 2);
  ht->size = MAX (old_size, wanted);

  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));

  /* hash_table_put below rebuilds both counters as it reinserts. */
  ht->fullness = 0;
  ht->count = 0;

  for (i = 0; i < old_size; i++)
    {
      struct ht_pair *slot = old_slots + i;

      if (EMPTY_ENTRY_P (slot->key) || DELETED_ENTRY_P (slot->key))
	continue;
      hash_table_put (ht, slot->key, slot->value);
    }
  assert (ht->count == old_count);
  free (old_slots);
}
/* Put VALUE in the hash table HT under the key KEY.  This regrows the
   table if necessary.

   NOTE(review): when KEY compares equal to an already-stored key, the
   stored key *pointer* is overwritten with the new KEY and the old
   pointer is dropped -- if callers pass freshly allocated keys, the
   previous allocation is leaked; confirm callers' ownership
   expectations. */
void
hash_table_put (struct hash_table *ht, const void *key, void *value)
{
  int location = ht->hash_function (key) % ht->size;
  while (1)
    {
      struct ht_pair *the_pair = ht->pairs + location;
      if (EMPTY_ENTRY_P (the_pair->key))
	{
	  /* Fresh slot: both occupancy counters grow. */
	  ++ht->fullness;
	  ++ht->count;
	just_insert:
	  the_pair->key = (void *)key; /* const? */
	  the_pair->value = value;
	  break;
	}
      else if (DELETED_ENTRY_P (the_pair->key))
	{
	  /* We're replacing a deleted entry, so ht->count gets
	     increased, but ht->fullness remains unchanged. */
	  ++ht->count;
	  goto just_insert;
	}
      else if (ht->test_function (key, the_pair->key))
	{
	  /* We're replacing an existing entry, so ht->count and
	     ht->fullness remain unchanged. */
	  goto just_insert;
	}
      else
	{
	  /* Collision: continue the linear probe, wrapping around. */
	  ++location;
	  if (location == ht->size)
	    location = 0;
	}
    }
  if (ht->fullness * 4 > ht->size * 3)
    /* When fullness exceeds 75% of size, regrow the table. */
    grow_hash_table (ht);
}
/* Remove KEY from HT.  Returns 1 if the key was found and removed,
   0 if it was not in the table.  The key and value pointers are not
   freed -- they still belong to the caller. */
int
hash_table_remove (struct hash_table *ht, const void *key)
{
  int location = ht->hash_function (key) % ht->size;
  while (1)
    {
      struct ht_pair *the_pair = ht->pairs + location;
      if (EMPTY_ENTRY_P (the_pair->key))
	/* End of the probe chain: KEY is not in the table. */
	return 0;
      else if (DELETED_ENTRY_P (the_pair->key)
	       || !ht->test_function (key, the_pair->key))
	{
	  /* Not a match: continue the linear probe, wrapping. */
	  ++location;
	  if (location == ht->size)
	    location = 0;
	}
      else
	{
	  /* We don't really remove an entry from the hash table: we
	     just mark it as deleted.  This is because there may be
	     other entries located after this entry whose hash number
	     points to a location before this entry.  (Example: keys
	     A, B and C have the same hash.  If you were to really
	     *delete* B from the table, C could no longer be found.)

	     As an optimization, it might be worthwhile to check
	     whether the immediately preceding entry is empty and, if
	     so, really delete the pair (set it to empty and decrease
	     the fullness along with the count).  I *think* it should
	     be safe. */
	  the_pair->key = ENTRY_DELETED;
	  --ht->count;		/* fullness stays: the tombstone still
				   occupies its slot. */
	  return 1;
	}
    }
}
/* Empty HT: drop every entry (keys and values are not freed) and
   reset both occupancy counters.  The table keeps its current size. */
void
hash_table_clear (struct hash_table *ht)
{
  ht->fullness = 0;
  ht->count = 0;
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
}
/* Invoke MAPFUN (key, value, CLOSURE) for every live entry in HT.
   Iteration stops early as soon as MAPFUN returns non-zero. */
void
hash_table_map (struct hash_table *ht,
		int (*mapfun) (void *, void *, void *),
		void *closure)
{
  int i;

  for (i = 0; i < ht->size; i++)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* Skip empty slots and tombstones. */
      if (EMPTY_ENTRY_P (slot->key) || DELETED_ENTRY_P (slot->key))
	continue;
      if (mapfun (slot->key, slot->value, closure))
	return;
    }
}
/* Support for hash tables whose keys are strings. */

/* Hash a NUL-terminated string pointed to by SV.  This is the
   classic hashpjw-style function, supposedly from the Dragon Book,
   p. 436: shift in each byte, then fold the top nibble back in. */
unsigned long
string_hash (const void *sv)
{
  unsigned int hash = 0;
  const unsigned char *p;

  for (p = (const unsigned char *) sv; *p; p++)
    {
      unsigned int top;

      hash = (hash << 4) + *p;
      top = hash & 0xf0000000;
      if (top != 0)
	hash = (hash ^ (top >> 24)) ^ top;
    }
  return hash;
}
/* Equality predicate for string keys: returns 1 when S1 and S2 hold
   the same characters, 0 otherwise.  Note the sense is inverted
   relative to strcmp(), as required by the hash table's
   test_function contract. */
int
string_cmp (const void *s1, const void *s2)
{
  return strcmp ((const char *)s1, (const char *)s2) == 0;
}
/* Convenience constructor: a hash table keyed on C strings, wired up
   with string_hash/string_cmp.  An INITIAL_SIZE of 0 requests the
   smallest table, which grows on demand. */
struct hash_table *
make_string_hash_table (int initial_size)
{
  return hash_table_new (initial_size, string_hash, string_cmp);
}
#ifdef STANDALONE
/* Self-test driver: compile the file with -DSTANDALONE to get this
   main().  It reads lines from stdin, inserts each non-empty line as
   a key, removes odd-length ones again, then dumps the table. */
#include <stdio.h>
#include <string.h>

/* Print one entry and bump the counter behind COUNT.
   Fixed here: the parameter types now match the
   int (*)(void *, void *, void *) callback type that hash_table_map
   expects -- the original declared KEY as `const void *', making the
   function pointer passed below incompatible (a constraint violation
   rejected by modern compilers). */
int
print_hash_table_mapper (void *key, void *value, void *count)
{
  ++*(int *)count;
  printf ("%s: %s\n", (const char *)key, (char *)value);
  return 0;
}

/* Dump SHT and verify that the number of entries visited by the
   mapper agrees with the table's own count. */
void
print_hash (struct hash_table *sht)
{
  int debug_count = 0;
  hash_table_map (sht, print_hash_table_mapper, &debug_count);
  assert (debug_count == sht->count);
}

int
main (void)
{
  struct hash_table *ht = make_string_hash_table (0);
  char line[80];
  while ((fgets (line, sizeof (line), stdin)))
    {
      int len = strlen (line);
      if (len <= 1)
	continue;
      line[--len] = '\0';	/* strip the newline; a line longer
				   than the buffer loses its last
				   character instead */
      hash_table_put (ht, strdup (line), "here I am!");
      if (len % 2)
	hash_table_remove (ht, line);
    }
  print_hash (ht);
#if 0
  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
#endif
  return 0;
}
#endif

50
src/hash.h Normal file
View File

@ -0,0 +1,50 @@
/* Hash table declarations.
Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* From XEmacs, and hence from Dragon book. */

/* Helpers for combining two to nine hash values into one. */
#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */
#define HASH2(a,b) (GOOD_HASH * (a) + (b))
#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c))
#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d))
#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e))
#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f))
#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g))
#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h))
#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i))

/* The table type is opaque to clients; its layout lives in hash.c. */
struct hash_table;

/* Constructor: initial size (0 for smallest), hash function, and
   key-equality test.  Destructor frees the table but not the
   caller-owned keys/values. */
struct hash_table *hash_table_new PARAMS ((int,
					   unsigned long (*) (const void *),
					   int (*) (const void *,
						    const void *)));
void hash_table_destroy PARAMS ((struct hash_table *));

/* Lookup, insertion and removal.  hash_table_get returns NULL for a
   missing key; use hash_table_exists when NULL is a legal value. */
void *hash_table_get PARAMS ((struct hash_table *, const void *));
int hash_table_exists PARAMS ((struct hash_table *, const void *));
void hash_table_put PARAMS ((struct hash_table *, const void *, void *));
int hash_table_remove PARAMS ((struct hash_table *, const void *));
void hash_table_clear PARAMS ((struct hash_table *));

/* Iterate the callback over all live entries; iteration stops when
   the callback returns non-zero. */
void hash_table_map PARAMS ((struct hash_table *,
			     int (*) (void *, void *, void *),
			     void *));

/* Canned support for string-keyed tables. */
unsigned long string_hash PARAMS ((const void *));
int string_cmp PARAMS ((const void *, const void *));
struct hash_table *make_string_hash_table PARAMS ((int));

View File

@ -165,6 +165,14 @@ header_strdup (const char *header, void *closure)
return 1;
}
/* Header-processing callback that merely records, through CLOSURE
   (which must point to an int), that the header was present at all.
   The header text itself is ignored.  Always returns 1 (success). */
int
header_exists (const char *header, void *closure)
{
  int *seen = (int *)closure;

  *seen = 1;
  return 1;
}
/* Skip LWS (linear white space), if present. Returns number of
characters to skip. */
int

View File

@ -31,5 +31,6 @@ int header_process PARAMS ((const char *, const char *,
int header_extract_number PARAMS ((const char *, void *));
int header_strdup PARAMS ((const char *, void *));
int header_exists PARAMS ((const char *, void *));
int skip_lws PARAMS ((const char *));

View File

@ -1,5 +1,5 @@
/* Dealing with host names.
Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
@ -48,35 +48,38 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "utils.h"
#include "host.h"
#include "url.h"
#include "hash.h"
#ifndef errno
extern int errno;
#endif
/* Host list entry */
struct host
/* Mapping between all known hosts to their addresses (n.n.n.n). */
struct hash_table *host_name_address_map;
/* Mapping between all known addresses (n.n.n.n) to their hosts. This
is the inverse of host_name_address_map. These two tables share
the strdup'ed strings. */
struct hash_table *host_address_name_map;
/* Mapping between auxiliary (slave) and master host names. */
struct hash_table *host_slave_master_map;
/* Utility function: like xstrdup(), but also lowercases S. */
static char *
xstrdup_lower (const char *s)
{
/* Host's symbolical name, as encountered at the time of first
inclusion, e.g. "fly.cc.fer.hr". */
char *hostname;
/* Host's "real" name, i.e. its IP address, written out in ASCII
form of N.N.N.N, e.g. "161.53.70.130". */
char *realname;
/* More than one HOSTNAME can correspond to the same REALNAME. For
our purposes, the canonical name of the host is its HOSTNAME when
it was first encountered. This entry is said to have QUALITY. */
int quality;
/* Next entry in the list. */
struct host *next;
};
static struct host *hlist;
static struct host *add_hlist PARAMS ((struct host *, const char *,
const char *, int));
char *copy = xstrdup (s);
char *p = copy;
for (; *p; p++)
*p = TOLOWER (*p);
return copy;
}
/* The same as gethostbyname, but supports internet addresses of the
form `N.N.N.N'. */
form `N.N.N.N'. On some systems gethostbyname() knows how to do
this automatically. */
struct hostent *
ngethostbyname (const char *name)
{
@ -91,42 +94,51 @@ ngethostbyname (const char *name)
return hp;
}
/* Search for HOST in the linked list L, by hostname. Return the
entry, if found, or NULL. The search is case-insensitive. */
static struct host *
search_host (struct host *l, const char *host)
{
for (; l; l = l->next)
if (strcasecmp (l->hostname, host) == 0)
return l;
return NULL;
}
/* Add host name HOST with the address ADDR_TEXT to the cache.
Normally this means that the (HOST, ADDR_TEXT) pair will be to
host_name_address_map and to host_address_name_map. (It is the
caller's responsibility to make sure that HOST is not already in
host_name_address_map.)
/* Like search_host, but searches by address. */
static struct host *
search_address (struct host *l, const char *address)
If the ADDR_TEXT has already been seen and belongs to another host,
HOST will be added to host_slave_master_map instead. */
static void
add_host_to_cache (const char *host, const char *addr_text)
{
for (; l; l = l->next)
char *canonical_name = hash_table_get (host_address_name_map, addr_text);
if (canonical_name)
{
int cmp = strcmp (l->realname, address);
if (cmp == 0)
return l;
else if (cmp > 0)
return NULL;
DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
host, canonical_name));
/* We've already dealt with that host under another name. */
hash_table_put (host_slave_master_map,
xstrdup_lower (host),
xstrdup_lower (canonical_name));
}
else
{
/* This is really the first time we're dealing with that host. */
char *h_copy = xstrdup_lower (host);
char *a_copy = xstrdup (addr_text);
DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy));
hash_table_put (host_name_address_map, h_copy, a_copy);
hash_table_put (host_address_name_map, a_copy, h_copy);
}
return NULL;
}
/* Store the address of HOSTNAME, internet-style, to WHERE. First
check for it in the host list, and (if not found), use
ngethostbyname to get it.
/* Store the address of HOSTNAME, internet-style (four octets in
network order), to WHERE. First try to get the address from the
cache; if it is not available, call the DNS functions and update
the cache.
Return 1 on successful finding of the hostname, 0 otherwise. */
int
store_hostaddress (unsigned char *where, const char *hostname)
{
struct host *t;
unsigned long addr;
char *addr_text;
char *canonical_name;
struct hostent *hptr;
struct in_addr in;
char *inet_s;
@ -134,178 +146,119 @@ store_hostaddress (unsigned char *where, const char *hostname)
/* If the address is of the form d.d.d.d, there will be no trouble
with it. */
addr = (unsigned long)inet_addr (hostname);
if ((int)addr == -1)
{
/* If it is not of that form, try to find it in the cache. */
t = search_host (hlist, hostname);
if (t)
addr = (unsigned long)inet_addr (t->realname);
}
/* If we have the numeric address, just store it. */
if ((int)addr != -1)
{
/* ADDR is in network byte order, meaning the code works on
little and big endian 32-bit architectures without change.
On big endian 64-bit architectures we need to be careful to
copy the correct four bytes. */
int offset = 0;
/* ADDR is defined to be in network byte order, meaning the code
works on little and big endian 32-bit architectures without
change. On big endian 64-bit architectures we need to be
careful to copy the correct four bytes. */
int offset;
have_addr:
#ifdef WORDS_BIGENDIAN
offset = sizeof (unsigned long) - 4;
#else
offset = 0;
#endif
memcpy (where, (char *)&addr + offset, 4);
return 1;
}
/* By now we know that the address is not of the form d.d.d.d. Try
to find it in our cache of host addresses. */
addr_text = hash_table_get (host_name_address_map, hostname);
if (addr_text)
{
DEBUGP (("Found %s in host_name_address_map: %s\n",
hostname, addr_text));
addr = (unsigned long)inet_addr (addr_text);
goto have_addr;
}
/* Maybe this host is known to us under another name. If so, we'll
find it in host_slave_master_map, and use the master name to find
its address in host_name_address_map. */
canonical_name = hash_table_get (host_slave_master_map, hostname);
if (canonical_name)
{
addr_text = hash_table_get (host_name_address_map, canonical_name);
assert (addr_text != NULL);
DEBUGP (("Found %s as slave of %s -> %s\n",
hostname, canonical_name, addr_text));
addr = (unsigned long)inet_addr (addr_text);
goto have_addr;
}
/* Since all else has failed, let's try gethostbyname(). Note that
we use gethostbyname() rather than ngethostbyname(), because we
*know* the address is not numerical. */
already know that the address is not numerical. */
hptr = gethostbyname (hostname);
if (!hptr)
return 0;
/* Copy the address of the host to socket description. */
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
/* Now that we're here, we could as well cache the hostname for
future use, as in realhost(). First, we have to look for it by
address to know if it's already in the cache by another name. */
assert (hptr->h_length == 4);
/* Now that we've gone through the trouble of calling
gethostbyname(), we can store this valuable information to the
cache. First, we have to look for it by address to know if it's
already in the cache by another name. */
/* Originally, we copied to in.s_addr, but it appears to be missing
on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
t = search_address (hlist, inet_s);
if (t) /* Found in the list, as realname. */
{
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, hostname, inet_s, 0);
return 1;
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, hostname, inet_s, 1);
inet_s = inet_ntoa (in);
add_host_to_cache (hostname, inet_s);
return 1;
}
/* Add a host to the host list. The list is sorted by addresses. For
equal addresses, the entries with quality should bubble towards the
beginning of the list. */
static struct host *
add_hlist (struct host *l, const char *nhost, const char *nreal, int quality)
{
struct host *t, *old, *beg;
/* The entry goes to the beginning of the list if the list is empty
or the order requires it. */
if (!l || (strcmp (nreal, l->realname) < 0))
{
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
t->next = l;
return t;
}
beg = l;
/* Second two one-before-the-last element. */
while (l->next)
{
int cmp;
old = l;
l = l->next;
cmp = strcmp (nreal, l->realname);
if (cmp >= 0)
continue;
/* If the next list element is greater than s, put s between the
current and the next list element. */
t = (struct host *)xmalloc (sizeof (struct host));
old->next = t;
t->next = l;
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
return beg;
}
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
/* Insert the new element after the last element. */
l->next = t;
t->next = NULL;
return beg;
}
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
is referenced by more than one name, "real" name is considered to
be the first one encountered in the past.
If the host cannot be found in the list of already dealt-with
hosts, try with its INET address. If this fails too, add it to the
list. The routine does not call gethostbyname twice for the same
host if it can possibly avoid it. */
be the first one encountered in the past. */
char *
realhost (const char *host)
{
struct host *l, *l_real;
struct in_addr in;
struct hostent *hptr;
char *inet_s;
char *master_name;
DEBUGP (("Checking for %s.\n", host));
/* Look for the host, looking by the host name. */
l = search_host (hlist, host);
if (l && l->quality) /* Found it with quality */
DEBUGP (("Checking for %s in host_name_address_map.\n", host));
if (hash_table_exists (host_name_address_map, host))
{
DEBUGP (("%s was already used, by that name.\n", host));
/* Here we return l->hostname, not host, because of the possible
case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are
the same, but we want the one that was first. */
return xstrdup (l->hostname);
DEBUGP (("Found; %s was already used, by that name.\n", host));
return xstrdup_lower (host);
}
else if (!l) /* Not found, with or without quality */
{
/* The fact that gethostbyname will get called makes it
necessary to store it to the list, to ensure that
gethostbyname will not be called twice for the same string.
However, the quality argument must be set appropriately.
Note that add_hlist must be called *after* the realname
search, or the quality would be always set to 0 */
DEBUGP (("This is the first time I hear about host %s by that name.\n",
host));
hptr = ngethostbyname (host);
if (!hptr)
return xstrdup (host);
DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
master_name = hash_table_get (host_slave_master_map, host);
if (master_name)
{
has_master:
DEBUGP (("Found; %s was already used, by the name %s.\n",
host, master_name));
return xstrdup (master_name);
}
DEBUGP (("First time I hear about %s by that name; looking it up.\n",
host));
hptr = ngethostbyname (host);
if (hptr)
{
char *inet_s;
/* Originally, we copied to in.s_addr, but it appears to be
missing on some systems. */
missing on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
}
else /* Found, without quality */
{
/* This case happens when host is on the list,
but not as first entry (the one with quality).
Then we just get its INET address and pick
up the first entry with quality. */
DEBUGP (("We've dealt with host %s, but under the name %s.\n",
host, l->realname));
STRDUP_ALLOCA (inet_s, l->realname);
inet_s = inet_ntoa (in);
add_host_to_cache (host, inet_s);
/* add_host_to_cache() can establish a slave-master mapping. */
DEBUGP (("Checking again for %s in host_slave_master_map.\n", host));
master_name = hash_table_get (host_slave_master_map, host);
if (master_name)
goto has_master;
}
/* Now we certainly have the INET address. The following loop is
guaranteed to pick either an entry with quality (because it is
the first one), or none at all. */
l_real = search_address (hlist, inet_s);
if (l_real) /* Found in the list, as realname. */
{
if (!l)
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, host, inet_s, 0);
return xstrdup (l_real->hostname);
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, host, inet_s, 1);
return xstrdup (host);
return xstrdup_lower (host);
}
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
@ -547,20 +500,23 @@ herrmsg (int error)
return _("Unknown error");
}
/* Clean the host list. This is a separate function, so we needn't
export HLIST and its implementation. Ha! */
void
clean_hosts (void)
{
struct host *l = hlist;
while (l)
{
struct host *p = l->next;
free (l->hostname);
free (l->realname);
free (l);
l = p;
}
hlist = NULL;
/* host_name_address_map and host_address_name_map share the
strings. Because of that, calling free_keys_and_values once
suffices for both. */
free_keys_and_values (host_name_address_map);
hash_table_destroy (host_name_address_map);
hash_table_destroy (host_address_name_map);
free_keys_and_values (host_slave_master_map);
hash_table_destroy (host_slave_master_map);
}
/* Allocate the three host-lookup caches (name->address,
   address->name, and slave->master name maps).  Must be called once
   at startup, before any of the host lookup functions are used. */
void
host_init (void)
{
  host_name_address_map = make_string_hash_table (0);
  host_address_name_map = make_string_hash_table (0);
  host_slave_master_map = make_string_hash_table (0);
}

856
src/html-parse.c Normal file
View File

@ -0,0 +1,856 @@
/* HTML parser for Wget.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* The only entry point to this module is map_html_tags(), which see. */
/* TODO:
- Allow hooks for callers to process contents outside tags. This
is needed to implement handling <style> and <script>. The
taginfo structure already carries the information about where the
tags are, but this is not enough, because one would also want to
skip the comments. (The funny thing is that for <style> and
<script> you *don't* want to skip comments!)
- Create a test suite for regression testing. */
/* HISTORY:
This is the third HTML parser written for Wget. The first one was
written some time during the Geturl 1.0 beta cycle, and was very
inefficient and buggy. It also contained some very complex code to
remember a list of parser states, because it was supposed to be
reentrant. The idea was that several parsers would be running
concurrently, and you'd have to pass the function a unique ID string
(for example, the URL) by which it found the relevant parser state
and returned the next URL. Over-engineering at its best.
The second HTML parser was written for Wget 1.4 (the first version
by the name `Wget'), and was a complete rewrite. Although the new
parser behaved much better and made no claims of reentrancy, it
still shared many of the fundamental flaws of the old version -- it
only regarded HTML in terms of tag-attribute pairs, where the
attribute's value was a URL to be returned. Any other property of
HTML, such as <base href=...>, or strange way to specify a URL,
such as <meta http-equiv=Refresh content="0; URL=..."> had to be
crudely hacked in -- and the caller had to be aware of these hacks.
Like its predecessor, this parser did not support HTML comments.
After Wget 1.5.1 was released, I set out to write a third HTML
parser. The objectives of the new parser were to: (1) provide a
clean way to analyze HTML lexically, (2) separate interpretation of
the markup from the parsing process, (3) be as correct as possible,
e.g. correctly skipping comments and other SGML declarations, (4)
understand the most common errors in markup and skip them or be
relaxed towards them, and (5) be reasonably efficient (no regexps,
minimum copying and minimum or no heap allocation).
I believe this parser meets all of the above goals. It is
reasonably well structured, and could be relatively easily
separated from Wget and used elsewhere. While some of its
intrinsic properties limit its value as a general-purpose HTML
parser, I believe that, with minimum modifications, it could serve
as a backend for one.
Due to time and other constraints, this parser was not integrated
into Wget until the version ???. */
/* DESCRIPTION:
The single entry point of this parser is map_html_tags(), which
works by calling a function you specify for each tag. The function
gets called with the pointer to a structure describing the tag and
its attributes. */
/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
still need Wget headers to compile. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <assert.h>
#include "wget.h"
#include "html-parse.h"
#ifdef STANDALONE
# define xmalloc malloc
# define xrealloc realloc
#endif /* STANDALONE */
/* Pool support. For efficiency, map_html_tags() stores temporary
string data to a single stack-allocated pool. If the pool proves
too small, additional memory is allocated/resized with
malloc()/realloc(). */
/* A pool accumulates zero-terminated strings appended by
   convert_and_copy(); consumers later refer to them by their integer
   offset (`index' at append time) so that reallocation cannot
   invalidate the references.  */
struct pool {
  char *contents;		/* pointer to the contents. */
  int size;			/* size of the pool. */
  int index;			/* next unoccupied position in
				   contents. */

  int alloca_p;			/* whether contents was allocated
				   using alloca(). */
  char *orig_contents;		/* orig_contents, allocated by
				   alloca().  this is used by
				   POOL_FREE to restore the pool to
				   the "initial" state. */
  int orig_size;		/* size of the original alloca'ed
				   block; restored by POOL_FREE
				   together with orig_contents. */
};
/* Initialize the pool to hold INITIAL_SIZE bytes of storage.  The
   initial storage comes from alloca(), so the pool must be used
   within the frame that ran POOL_INIT.  */

#define POOL_INIT(pool, initial_size) do {		\
  (pool).size = (initial_size);				\
  (pool).contents = ALLOCA_ARRAY (char, (pool).size);	\
  (pool).index = 0;					\
  (pool).alloca_p = 1;					\
  (pool).orig_contents = (pool).contents;		\
  (pool).orig_size = (pool).size;			\
} while (0)

/* Grow the pool to accommodate at least INCREASE new bytes.  If the
   pool already has room to accommodate INCREASE bytes of data, this
   is a no-op.  (INCREASE is parenthesized on expansion so that
   callers may pass arbitrary expressions.)  */

#define POOL_GROW(pool, increase) do {					\
  int PG_newsize = (pool).index + (increase);				\
  DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,	\
			  (pool).alloca_p, char);			\
} while (0)

/* Append text in the range [BEG, END) to POOL.  No zero-termination
   is done.  */

#define POOL_APPEND(pool, beg, end) do {			\
  const char *PA_beg = (beg);					\
  int PA_size = (end) - PA_beg;					\
  POOL_GROW (pool, PA_size);					\
  memcpy ((pool).contents + (pool).index, PA_beg, PA_size);	\
  (pool).index += PA_size;					\
} while (0)

/* The same as the above, but with zero termination.  */

#define POOL_APPEND_ZT(pool, beg, end) do {			\
  const char *PA_beg = (beg);					\
  int PA_size = (end) - PA_beg;					\
  POOL_GROW (pool, PA_size + 1);				\
  memcpy ((pool).contents + (pool).index, PA_beg, PA_size);	\
  (pool).contents[(pool).index + PA_size] = '\0';		\
  (pool).index += PA_size + 1;					\
} while (0)

/* Forget old pool contents.  The allocated memory is not freed.
   (Wrapped in parens so the expansion is a single expression.)  */
#define POOL_REWIND(pool) ((pool).index = 0)

/* Free heap-allocated memory for contents of POOL.  This calls free()
   if the memory was allocated through malloc.  It also restores
   `contents' and `size' to their original, pre-malloc values.  That
   way after POOL_FREE, the pool is fully usable, just as if it were
   freshly initialized with POOL_INIT.  */

#define POOL_FREE(pool) do {			\
  if (!(pool).alloca_p)				\
    free ((pool).contents);			\
  (pool).contents = (pool).orig_contents;	\
  (pool).size = (pool).orig_size;		\
  (pool).index = 0;				\
  (pool).alloca_p = 1;				\
} while (0)
#define AP_DOWNCASE 1
#define AP_PROCESS_ENTITIES 2
#define AP_SKIP_BLANKS 4
/* Copy the text in the range [BEG, END) to POOL, optionally
performing operations specified by FLAGS. FLAGS may be any
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
with the following meaning:
* AP_DOWNCASE -- downcase all the letters;
* AP_PROCESS_ENTITIES -- process the SGML entities and write out
the decoded string. Recognized entities are &lt, &gt, &amp, &quot,
&nbsp and the numerical entities.
* AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
of text. */
/* Copy [BEG, END) into POOL as a zero-terminated string, applying
   the transformations requested in FLAGS (see the comment above for
   AP_DOWNCASE / AP_PROCESS_ENTITIES / AP_SKIP_BLANKS).  The caller
   records pool->index *before* the call to find the copied string.  */
static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
  int old_index = pool->index;
  int size;

  /* First, skip blanks if required.  We must do this before entities
     are processed, so that blanks can still be inserted as, for
     instance, `&#32;'.  */
  if (flags & AP_SKIP_BLANKS)
    {
      while (beg < end && ISSPACE (*beg))
	++beg;
      while (end > beg && ISSPACE (end[-1]))
	--end;
    }
  size = end - beg;

  if (flags & AP_PROCESS_ENTITIES)
    {
      /* Stack-allocate a copy of text, process entities and copy it
	 to the pool.  The decoded text is never longer than the
	 source, so SIZE + 1 bytes always suffice.  */
      char *local_copy = (char *)alloca (size + 1);
      const char *from = beg;
      char *to = local_copy;

      while (from < end)
	{
	  if (*from != '&')
	    *to++ = *from++;
	  else
	    {
	      /* SAVE lets us back out and emit the `&' literally if
		 what follows turns out not to be an entity.  */
	      const char *save = from;
	      int remain;

	      if (++from == end) goto lose;
	      remain = end - from;

	      if (*from == '#')
		{
		  /* Numeric entity, e.g. `&#32;'.  Only the low byte
		     is kept (values are truncated to 0..255).  */
		  int numeric;
		  ++from;
		  if (from == end || !ISDIGIT (*from)) goto lose;
		  for (numeric = 0; from < end && ISDIGIT (*from); from++)
		    numeric = 10 * numeric + (*from) - '0';
		  if (from < end && ISALPHA (*from)) goto lose;
		  numeric &= 0xff;
		  *to++ = numeric;
		}
/* FROB matches a named entity: the name must fit in the remaining
   text and be followed by `;', end-of-buffer, or a non-alphanumeric
   character (so `&ltx' is not mistaken for `&lt').  */
#define FROB(x) (remain >= (sizeof (x) - 1)		\
		 && !memcmp (from, x, sizeof (x) - 1)	\
		 && (*(from + sizeof (x) - 1) == ';'	\
		     || remain == sizeof (x) - 1	\
		     || !ISALNUM (*(from + sizeof (x) - 1))))
	      else if (FROB ("lt"))
		*to++ = '<', from += 2;
	      else if (FROB ("gt"))
		*to++ = '>', from += 2;
	      else if (FROB ("amp"))
		*to++ = '&', from += 3;
	      else if (FROB ("quot"))
		*to++ = '\"', from += 4;
	      /* We don't implement the proposed "Added Latin 1"
		 entities (except for nbsp), because it is unnecessary
		 in the context of Wget, and would require hashing to
		 work efficiently.  */
	      else if (FROB ("nbsp"))
		*to++ = 160, from += 4;
	      else
		goto lose;
#undef FROB
	      /* If the entity was followed by `;', we step over the
		 `;'.  Otherwise, it was followed by either a
		 non-alphanumeric or EOB, in which case we do nothing.	*/
	      if (from < end && *from == ';')
		++from;
	      continue;

	    lose:
	      /* This was not an entity after all.  Back out.  */
	      from = save;
	      *to++ = *from++;
	    }
	}
      /* TO now points one past the terminating NUL, so POOL_APPEND
	 copies the terminator as well.  */
      *to++ = '\0';
      POOL_APPEND (*pool, local_copy, to);
    }
  else
    {
      /* Just copy the text to the pool.  */
      POOL_APPEND_ZT (*pool, beg, end);
    }

  if (flags & AP_DOWNCASE)
    {
      char *p = pool->contents + old_index;
      for (; *p; p++)
	*p = TOLOWER (*p);
    }
}
/* Return non-zero if the token in [BEG, END) case-insensitively
   matches one of the strings in the NULL-terminated ARRAY.  A NULL
   ARRAY means "no filtering" and allows everything.  */
static int
array_allowed (const char **array, const char *beg, const char *end)
{
  int length = end - beg;
  const char **candidate;

  if (!array)
    return 1;			/* no restrictions in effect */

  for (candidate = array; *candidate; candidate++)
    if (length >= strlen (*candidate)
	&& !strncasecmp (*candidate, beg, length))
      return 1;			/* found a match */

  return 0;			/* exhausted the list */
}
/* RFC1866: name [of attribute or tag] consists of letters, digits,
periods, or hyphens. We also allow _, for compatibility with
brain-damaged generators. */
#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
/* States while advancing through comments. */
#define AC_S_DONE 0
#define AC_S_BACKOUT 1
#define AC_S_BANG 2
#define AC_S_DEFAULT 3
#define AC_S_DCLNAME 4
#define AC_S_DASH1 5
#define AC_S_DASH2 6
#define AC_S_COMMENT 7
#define AC_S_DASH3 8
#define AC_S_DASH4 9
#define AC_S_QUOTE1 10
#define AC_S_IN_QUOTE 11
#define AC_S_QUOTE2 12
#ifdef STANDALONE
static int comment_backout_count;
#endif
/* Advance over an SGML declaration (the <!...> forms you find in HTML
documents). The function returns the location after the
declaration. The reason we need this is that HTML comments are
expressed as comments in so-called "empty declarations".
To recap: any SGML declaration may have comments associated with
it, e.g.
<!MY-DECL -- isn't this fun? -- foo bar>
An HTML comment is merely an empty declaration (<!>) with a comment
attached, like this:
<!-- some stuff here -->
Several comments may be embedded in one comment declaration:
<!-- have -- -- fun -->
Whitespace is allowed between and after the comments, but not
before the first comment.
Additionally, this function attempts to handle double quotes in
SGML declarations correctly. */
static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';	/* shut up, gcc! */
  char ch;
  int state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...  */

  /* Loop invariant: CH holds the character most recently consumed
     and P points just past it.  Running out of input in any state
     forces AC_S_BACKOUT before the switch, so no case ever reads
     past END.  */
  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      if (p == end)
	state = AC_S_BACKOUT;
      switch (state)
	{
	case AC_S_DONE:
	case AC_S_BACKOUT:
	  break;
	case AC_S_BANG:
	  /* Expect the `!' that opens the declaration.  */
	  if (ch == '!')
	    {
	      ch = *p++;
	      state = AC_S_DEFAULT;
	    }
	  else
	    state = AC_S_BACKOUT;
	  break;
	case AC_S_DEFAULT:
	  /* Between tokens: dispatch on what comes next.  */
	  switch (ch)
	    {
	    case '-':
	      state = AC_S_DASH1;
	      break;
	    case ' ':
	    case '\t':
	    case '\r':
	    case '\n':
	      ch = *p++;
	      break;
	    case '>':
	      state = AC_S_DONE;
	      break;
	    case '\'':
	    case '\"':
	      state = AC_S_QUOTE1;
	      break;
	    default:
	      if (NAME_CHAR_P (ch))
		state = AC_S_DCLNAME;
	      else
		state = AC_S_BACKOUT;
	      break;
	    }
	  break;
	case AC_S_DCLNAME:
	  /* Inside a declaration name such as DOCTYPE.  */
	  if (NAME_CHAR_P (ch))
	    ch = *p++;
	  else if (ch == '-')
	    state = AC_S_DASH1;
	  else
	    state = AC_S_DEFAULT;
	  break;
	case AC_S_QUOTE1:
	  assert (ch == '\'' || ch == '\"');
	  quote_char = ch;	/* cheating -- I really don't feel like
				   introducing more different states for
				   different quote characters. */
	  ch = *p++;
	  state = AC_S_IN_QUOTE;
	  break;
	case AC_S_IN_QUOTE:
	  if (ch == quote_char)
	    state = AC_S_QUOTE2;
	  else
	    ch = *p++;
	  break;
	case AC_S_QUOTE2:
	  assert (ch == quote_char);
	  ch = *p++;
	  state = AC_S_DEFAULT;
	  break;
	case AC_S_DASH1:
	  /* States DASH1..DASH4 track the `--' pairs that open and
	     close an embedded SGML comment.  */
	  assert (ch == '-');
	  ch = *p++;
	  state = AC_S_DASH2;
	  break;
	case AC_S_DASH2:
	  switch (ch)
	    {
	    case '-':
	      ch = *p++;
	      state = AC_S_COMMENT;
	      break;
	    default:
	      state = AC_S_BACKOUT;
	    }
	  break;
	case AC_S_COMMENT:
	  switch (ch)
	    {
	    case '-':
	      state = AC_S_DASH3;
	      break;
	    default:
	      ch = *p++;
	      break;
	    }
	  break;
	case AC_S_DASH3:
	  assert (ch == '-');
	  ch = *p++;
	  state = AC_S_DASH4;
	  break;
	case AC_S_DASH4:
	  switch (ch)
	    {
	    case '-':
	      ch = *p++;
	      state = AC_S_DEFAULT;
	      break;
	    default:
	      /* A lone `-' inside the comment; keep scanning.  */
	      state = AC_S_COMMENT;
	      break;
	    }
	  break;
	}
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      /* Malformed declaration: treat the `<' as ordinary text and let
	 the caller resume scanning right after it.  */
      return beg + 1;
    }
  return p;
}
/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish. */
/* NOTE: these macros rely on a variable `end' (one past the last
   valid character) and a label `finish' being visible at the point
   of use; they are only intended for map_html_tags() below.  */

#define ADVANCE(p) do {	\
  ++p;			\
  if (p >= end)		\
    goto finish;	\
} while (0)

/* Skip whitespace, if any. */

#define SKIP_WS(p) do {		\
  while (ISSPACE (*p)) {	\
    ADVANCE (p);		\
  }				\
} while (0)

/* Skip non-whitespace, if any. */

#define SKIP_NON_WS(p) do {	\
  while (!ISSPACE (*p)) {	\
    ADVANCE (p);		\
  }				\
} while (0)
#ifdef STANDALONE
static int tag_backout_count;
#endif
/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
MAPFUN will be called with two arguments: pointer to an initialized
struct taginfo, and CLOSURE.
ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
be processed by this function. If it is NULL, all the tags are
allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
(Obviously, the caller can filter out unwanted tags and attributes
just as well, but this is just an optimization designed to avoid
unnecessary copying for tags/attributes which the caller doesn't
want to know about. These lists are searched linearly; therefore,
if you're interested in a large number of tags or attributes, you'd
better set these to NULL and filter them out yourself with a
hashing process most appropriate for your application.) */
void
map_html_tags (const char *text, int size,
	       const char **allowed_tag_names,
	       const char **allowed_attribute_names,
	       void (*mapfun) (struct taginfo *, void *),
	       void *closure)
{
  const char *p = text;
  const char *end = text + size;

  /* PAIRS holds the attributes of the tag currently being parsed;
     it starts on the stack and moves to the heap if a tag has more
     than eight attributes.  POOL holds the downcased/decoded name
     and value strings for the same tag.  */
  int attr_pair_count = 8;
  int attr_pair_alloca_p = 1;
  struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
  struct pool pool;

  if (!size)
    return;

  POOL_INIT (pool, 256);

  {
    int nattrs, end_tag;
    const char *tag_name_begin, *tag_name_end;
    const char *tag_start_position;
    int uninteresting_tag;

  look_for_tag:
    /* Per-tag state is reset here; pool memory is reused, not freed.  */
    POOL_REWIND (pool);

    nattrs = 0;
    end_tag = 0;

    /* Find beginning of tag.  We use memchr() instead of the usual
       looping with ADVANCE() for speed. */
    p = memchr (p, '<', end - p);
    if (!p)
      goto finish;

    tag_start_position = p;
    ADVANCE (p);

    /* Establish the type of the tag (start-tag, end-tag or
       declaration). */
    if (*p == '!')
      {
	/* This is an SGML declaration -- just skip it. */
	p = advance_declaration (p, end);
	if (p == end)
	  goto finish;
	goto look_for_tag;
      }
    else if (*p == '/')
      {
	end_tag = 1;
	ADVANCE (p);
      }
    tag_name_begin = p;
    while (NAME_CHAR_P (*p))
      ADVANCE (p);
    if (p == tag_name_begin)
      goto look_for_tag;
    tag_name_end = p;
    SKIP_WS (p);
    /* An end-tag may carry nothing but its name.  */
    if (end_tag && *p != '>')
      goto backout_tag;

    if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
      /* We can't just say "goto look_for_tag" here because we need
	 the loop below to properly advance over the tag's attributes.  */
      uninteresting_tag = 1;
    else
      {
	uninteresting_tag = 0;
	convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
      }

    /* Find the attributes. */
    while (1)
      {
	const char *attr_name_begin, *attr_name_end;
	const char *attr_value_begin, *attr_value_end;
	const char *attr_raw_value_begin, *attr_raw_value_end;
	int operation = AP_DOWNCASE;	/* stupid compiler. */

	SKIP_WS (p);

	/* Check for end of tag definition. */
	if (*p == '>')
	  break;

	/* Establish bounds of attribute name. */
	attr_name_begin = p;	/* <foo bar ...> */
				/*      ^        */
	while (NAME_CHAR_P (*p))
	  ADVANCE (p);
	attr_name_end = p;	/* <foo bar ...> */
				/*         ^     */
	if (attr_name_begin == attr_name_end)
	  goto backout_tag;

	/* Establish bounds of attribute value. */
	SKIP_WS (p);
	if (NAME_CHAR_P (*p) || *p == '>')
	  {
	    /* Minimized attribute syntax allows `=' to be omitted.
	       For example, <UL COMPACT> is a valid shorthand for <UL
	       COMPACT="compact">.  Even if such attributes are not
	       useful to Wget, we need to support them, so that the
	       tags containing them can be parsed correctly. */
	    attr_raw_value_begin = attr_value_begin = attr_name_begin;
	    attr_raw_value_end = attr_value_end = attr_name_end;
	  }
	else if (*p == '=')
	  {
	    ADVANCE (p);
	    SKIP_WS (p);
	    if (*p == '\"' || *p == '\'')
	      {
		/* Quoted attribute value.  */
		int newline_seen = 0;
		char quote_char = *p;
		attr_raw_value_begin = p;
		ADVANCE (p);
		attr_value_begin = p; /* <foo bar="baz"> */
				      /*           ^     */
		while (*p != quote_char)
		  {
		    if (!newline_seen && *p == '\n')
		      {
			/* If a newline is seen within the quotes, it
			   is most likely that someone forgot to close
			   the quote.  In that case, we back out to
			   the value beginning, and terminate the tag
			   at either `>' or the delimiter, whichever
			   comes first.  Such a tag terminated at `>'
			   is discarded.  */
			p = attr_value_begin;
			newline_seen = 1;
			continue;
		      }
		    else if (newline_seen && *p == '>')
		      break;
		    ADVANCE (p);
		  }
		attr_value_end = p; /* <foo bar="baz"> */
				    /*              ^  */
		if (*p == quote_char)
		  ADVANCE (p);
		else
		  goto look_for_tag;
		attr_raw_value_end = p;	/* <foo bar="baz"> */
					/*               ^ */
		/* The AP_SKIP_BLANKS part is not entirely correct,
		   because we don't want to skip blanks for all the
		   attribute values.  */
		operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
	      }
	    else
	      {
		/* Unquoted attribute value.  */
		attr_value_begin = p; /* <foo bar=baz> */
				      /*          ^    */
		/* According to SGML, a name token should consist only
		   of alphanumerics, . and -.  However, this is often
		   violated by, for instance, `%' in `width=75%'.
		   We'll be liberal and allow just about anything as
		   an attribute value.  */
		while (!ISSPACE (*p) && *p != '>')
		  ADVANCE (p);
		attr_value_end = p; /* <foo bar=baz qux=quix> */
				    /*             ^          */
		if (attr_value_begin == attr_value_end)
		  /* <foo bar=> */
		  /*          ^ */
		  goto backout_tag;
		attr_raw_value_begin = attr_value_begin;
		attr_raw_value_end = attr_value_end;
		operation = AP_PROCESS_ENTITIES;
	      }
	  }
	else
	  {
	    /* We skipped the whitespace and found something that is
	       neither `=' nor the beginning of the next attribute's
	       name.  Back out.  */
	    goto backout_tag;	/* <foo bar /... */
				/*          ^    */
	  }

	/* If we're not interested in the tag, don't bother with any
	   of the attributes.  */
	if (uninteresting_tag)
	  continue;

	/* If we aren't interested in the attribute, skip it.  We
	   cannot do this test any sooner, because our text pointer
	   needs to correctly advance over the attribute.  */
	if (allowed_attribute_names
	    && !array_allowed (allowed_attribute_names, attr_name_begin,
			       attr_name_end))
	  continue;

	DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
				attr_pair_alloca_p, struct attr_pair);
	/* Record pool *offsets* for now; the pool may still be
	   realloc'ed while later attributes are appended.  */
	pairs[nattrs].name_pool_index = pool.index;
	convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
	pairs[nattrs].value_pool_index = pool.index;
	convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
	pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
	pairs[nattrs].value_raw_size = (attr_raw_value_end
					- attr_raw_value_begin);
	++nattrs;
      }

    if (uninteresting_tag)
      {
	ADVANCE (p);
	goto look_for_tag;
      }

    /* By now, we have a valid tag with a name and zero or more
       attributes.  Fill in the data and call the mapper function.  */
    {
      int i;
      struct taginfo taginfo;

      taginfo.name = pool.contents;
      taginfo.end_tag_p = end_tag;
      taginfo.nattrs = nattrs;
      /* We fill in the char pointers only now, when pool can no
	 longer get realloc'ed.  If we did that above, we could get
	 hosed by reallocation.  Obviously, after this point, the pool
	 may no longer be grown.  */
      for (i = 0; i < nattrs; i++)
	{
	  pairs[i].name = pool.contents + pairs[i].name_pool_index;
	  pairs[i].value = pool.contents + pairs[i].value_pool_index;
	}
      taginfo.attrs = pairs;
      taginfo.start_position = tag_start_position;
      taginfo.end_position = p + 1;
      /* Ta-dam! */
      (*mapfun) (&taginfo, closure);
      ADVANCE (p);
    }
    goto look_for_tag;

  backout_tag:
#ifdef STANDALONE
    ++tag_backout_count;
#endif
    /* The tag wasn't really a tag.  Treat its contents as ordinary
       data characters. */
    p = tag_start_position + 1;
    goto look_for_tag;
  }

 finish:
  POOL_FREE (pool);
  if (!attr_pair_alloca_p)
    free (pairs);
}
#undef ADVANCE
#undef SKIP_WS
#undef SKIP_NON_WS
#ifdef STANDALONE
/* Callback for map_html_tags() used by the standalone driver: print
   one line per tag -- the name (prefixed with `/' for end-tags)
   followed by its name=value attribute pairs -- and increment the
   tag counter that ARG points to.  */
static void
test_mapper (struct taginfo *taginfo, void *arg)
{
  int attr_index;
  int *tag_count = (int *)arg;

  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
  for (attr_index = 0; attr_index < taginfo->nattrs; attr_index++)
    printf (" %s=%s", taginfo->attrs[attr_index].name,
	    taginfo->attrs[attr_index].value);
  putchar ('\n');
  ++*tag_count;
}
/* Standalone driver: slurp an HTML document from stdin into a
   dynamically grown buffer, run map_html_tags() over it with no tag
   or attribute filtering, and report tag and backout statistics.  */
int main ()
{
  int buffer_size = 256;
  char *buffer = (char *)xmalloc (buffer_size);
  int filled = 0;
  int chunk;
  int tag_counter = 0;

  /* Read until EOF, doubling the buffer each time it fills up.  */
  for (;;)
    {
      chunk = fread (buffer + filled, 1, buffer_size - filled, stdin);
      if (!chunk)
	break;
      filled += chunk;
      buffer_size <<= 1;
      buffer = (char *)xrealloc (buffer, buffer_size);
    }

  map_html_tags (buffer, filled, NULL, NULL, test_mapper, &tag_counter);

  printf ("TAGS: %d\n", tag_counter);
  printf ("Tag backouts: %d\n", tag_backout_count);
  printf ("Comment backouts: %d\n", comment_backout_count);
  return 0;
}
#endif /* STANDALONE */

44
src/html-parse.h Normal file
View File

@ -0,0 +1,44 @@
/* Declarations for html-parse.c.
Copyright (C) 1998 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
struct attr_pair {
  char *name;			/* attribute name */
  char *value;			/* attribute value */

  /* Needed for URL conversion; the places where the value begins and
     ends, including the quotes and everything.  These point into the
     caller's original text buffer, not into parser-owned storage.  */
  const char *value_raw_beginning;
  int value_raw_size;

  /* Used internally by map_html_tags. */
  int name_pool_index, value_pool_index;
};

struct taginfo {
  char *name;			/* tag name (downcased) */
  int end_tag_p;		/* whether this is an end-tag */
  int nattrs;			/* number of attributes */
  struct attr_pair *attrs;	/* attributes */
  const char *start_position;	/* start position of tag (the `<') */
  const char *end_position;	/* end position of tag (past the `>') */
};

/* NOTE: the taginfo passed to the callback, including its strings,
   is only valid for the duration of that callback invocation.  */
void map_html_tags PARAMS ((const char *, int, const char **, const char **,
			    void (*) (struct taginfo *, void *), void *));

569
src/html-url.c Normal file
View File

@ -0,0 +1,569 @@
/* Collect URLs from HTML source.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>
#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#ifndef errno
extern int errno;
#endif
enum tag_category { TC_LINK, TC_SPEC };

/* Here we try to categorize the known tags.  Each tag has its ID and
   category.  Category TC_LINK means that one or more of its
   attributes contain links that should be retrieved.  TC_SPEC means
   that the tag is specific in some way, and has to be handled
   specially. */
static struct {
  const char *name;
  enum tag_category category;
} known_tags[] = {
  /* INVARIANTS: this table must stay sorted alphabetically by name
     (find_tag's early-exit relies on it), and each TAG_* constant
     must equal the index of its entry.  */
#define TAG_A 0
  { "a", TC_LINK },
#define TAG_APPLET 1
  { "applet", TC_LINK },
#define TAG_AREA 2
  { "area", TC_LINK },
#define TAG_BASE 3
  { "base", TC_SPEC },
#define TAG_BGSOUND 4
  { "bgsound", TC_LINK },
#define TAG_BODY 5
  { "body", TC_LINK },
#define TAG_EMBED 6
  { "embed", TC_LINK },
#define TAG_FIG 7
  { "fig", TC_LINK },
#define TAG_FRAME 8
  { "frame", TC_LINK },
#define TAG_IFRAME 9
  { "iframe", TC_LINK },
#define TAG_IMG 10
  { "img", TC_LINK },
#define TAG_INPUT 11
  { "input", TC_LINK },
#define TAG_LAYER 12
  { "layer", TC_LINK },
#define TAG_LINK 13
  { "link", TC_SPEC },
#define TAG_META 14
  { "meta", TC_SPEC },
#define TAG_OVERLAY 15
  { "overlay", TC_LINK },
#define TAG_SCRIPT 16
  { "script", TC_LINK },
#define TAG_TABLE 17
  { "table", TC_LINK },
#define TAG_TD 18
  { "td", TC_LINK },
#define TAG_TH 19
  { "th", TC_LINK }
};
/* Flags for specific url-attr pairs handled through TC_LINK: */
#define AF_EXTERNAL 1

/* For tags handled by TC_LINK: attributes that contain URLs to
   download. */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} url_tag_attr_map[] = {
  /* Entries sharing a tagid must stay adjacent: collect_tags_mapper
     finds the first entry for a tag and then walks the contiguous
     run with the same tagid.  */
  { TAG_A, "href", AF_EXTERNAL },
  { TAG_APPLET, "code", 0 },
  { TAG_AREA, "href", AF_EXTERNAL },
  { TAG_BGSOUND, "src", 0 },
  { TAG_BODY, "background", 0 },
  { TAG_EMBED, "src", 0 },
  { TAG_FIG, "src", 0 },
  { TAG_FRAME, "src", 0 },
  { TAG_IFRAME, "src", 0 },
  /* NOTE(review): IMG has no `href' attribute in standard HTML --
     confirm whether this entry is intentional before removing it.  */
  { TAG_IMG, "href", 0 },
  { TAG_IMG, "lowsrc", 0 },
  { TAG_IMG, "src", 0 },
  { TAG_INPUT, "src", 0 },
  { TAG_LAYER, "src", 0 },
  { TAG_OVERLAY, "src", 0 },
  { TAG_SCRIPT, "src", 0 },
  { TAG_TABLE, "background", 0 },
  { TAG_TD, "background", 0 },
  { TAG_TH, "background", 0 }
};

/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually. */
static const char *additional_attributes[] = {
  "rel",			/* for TAG_LINK */
  "http-equiv",			/* for TAG_META */
  "name",			/* for TAG_META */
  "content"			/* for TAG_META */
};
static const char **interesting_tags;
static const char **interesting_attributes;
/* Build the interesting_tags and interesting_attributes arrays that
   are handed to map_html_tags() as its filter lists.  Called once;
   the arrays are heap-allocated and live for the rest of the run.
   interesting_tags honors --ignore-tags and --follow-tags;
   interesting_attributes is the union of additional_attributes and
   the attribute names in url_tag_attr_map, without duplicates.  */
void
init_interesting (void)
{
  /* Init the variables interesting_tags and interesting_attributes
     that are used by the HTML parser to know which tags and
     attributes we're interested in.  We initialize this only once,
     for performance reasons.

     Here we also make sure that what we put in interesting_tags
     matches the user's preferences as specified through --ignore-tags
     and --follow-tags.  */

  {
    int i, ind = 0;
    int size = ARRAY_SIZE (known_tags);
    interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));

    for (i = 0; i < size; i++)
      {
	const char *name = known_tags[i].name;

	/* Normally here we could say:
	     interesting_tags[i] = name;
	   But we need to respect the settings of --ignore-tags and
	   --follow-tags, so the code gets a bit hairier.  */

	if (opt.ignore_tags)
	  {
	    /* --ignore-tags was specified.  Do not match these
	       specific tags.  --ignore-tags takes precedence over
	       --follow-tags, so we process --ignore first and fall
	       through if there's no match. */
	    int j, lose = 0;
	    for (j = 0; opt.ignore_tags[j] != NULL; j++)
	      /* Loop through all the tags this user doesn't care
		 about. */
	      if (strcasecmp(opt.ignore_tags[j], name) == EQ)
		{
		  lose = 1;
		  break;
		}
	    if (lose)
	      continue;
	  }

	if (opt.follow_tags)
	  {
	    /* --follow-tags was specified.  Only match these specific
	       tags, so return FALSE if we don't match one of them.  */
	    int j, win = 0;
	    for (j = 0; opt.follow_tags[j] != NULL; j++)
	      /* Loop through all the tags this user cares about. */
	      if (strcasecmp(opt.follow_tags[j], name) == EQ)
		{
		  win = 1;
		  break;
		}
	    if (!win)
	      continue;		/* wasn't one of the explicitly
				   desired tags */
	  }

	/* If we get to here, --follow-tags isn't being used or the
	   tag is among the ones that are followed, and --ignore-tags,
	   if specified, didn't include this tag, so it's an
	   "interesting" one. */
	interesting_tags[ind++] = name;
      }
    interesting_tags[ind] = NULL;
  }

  /* The same for attributes, except we loop through url_tag_attr_map.
     Here we also need to make sure that the list of attributes is
     unique, and to include the attributes from additional_attributes.  */
  {
    int i, ind;
    const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
				* sizeof (char *));
    /* First copy the "additional" attributes. */
    for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
      att[i] = additional_attributes[i];
    ind = i;
    att[ind] = NULL;
    for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
      {
	int j, seen = 0;
	const char *look_for = url_tag_attr_map[i].attr_name;
	/* Scan ALL of the ind entries collected so far.  (The bound
	   used to be `j < ind - 1', which skipped the most recently
	   appended attribute and could let duplicates slip in.)  */
	for (j = 0; j < ind; j++)
	  if (!strcmp (att[j], look_for))
	    {
	      seen = 1;
	      break;
	    }
	if (!seen)
	  {
	    att = xrealloc (att, (ind + 2) * sizeof (*att));
	    att[ind++] = look_for;
	    att[ind] = NULL;
	  }
      }
    interesting_attributes = att;
  }
}
/* Return the index of TAG_NAME in the known_tags table, or -1 if it
   is not a tag we know about.  The comparison is case-insensitive.  */
static int
find_tag (const char *tag_name)
{
  int idx;

  /* A linear scan is fine for a table this small; since known_tags
     is kept sorted alphabetically, we can also stop as soon as we
     pass the spot where TAG_NAME would have appeared.  */
  for (idx = 0; idx < ARRAY_SIZE (known_tags); idx++)
    {
      int diff = strcasecmp (known_tags[idx].name, tag_name);
      if (diff == 0)
	return idx;
      if (diff > 0)
	break;
    }
  return -1;
}
/* Return the value of the attribute named NAME in TAG, or NULL when
   TAG carries no such attribute.  The name comparison is
   case-insensitive.  When ATTRID is non-NULL, the index of the
   matching attribute is stored through it.  */
static char *
find_attr (struct taginfo *tag, const char *name, int *attrid)
{
  int pos;

  for (pos = 0; pos < tag->nattrs; pos++)
    {
      if (strcasecmp (tag->attrs[pos].name, name))
	continue;
      if (attrid)
	*attrid = pos;
      return tag->attrs[pos].value;
    }
  return NULL;
}
/* Shared state threaded through collect_tags_mapper() via the CLOSURE
   argument of map_html_tags().  */
struct collect_urls_closure {
  char *text;			/* HTML text. */
  char *base;			/* Base URI of the document, possibly
				   changed through <base href=...>. */
  urlpos *head, *tail;		/* List of URLs */
  const char *parent_base;	/* Base of the current document. */
  const char *document_file;	/* File name of this document. */
  int dash_p_leaf_HTML;		/* Whether -p is specified, and this
				   document is the "leaf" node of the
				   HTML tree. */
  int nofollow;			/* whether NOFOLLOW was specified in a
				   <meta name=robots> tag. */
};
/* Resolve LINK_URI against the current base (stripping any #fragment
   first), wrap the result in a freshly allocated urlpos and append it
   to closure->tail.  TAG and ATTRID are the necessary context to
   store the position and size of the raw attribute value, which
   convert_links needs later.  If there is no base and LINK_URI has no
   protocol, the link is silently dropped.  */
static void
handle_link (struct collect_urls_closure *closure, const char *link_uri,
	     struct taginfo *tag, int attrid)
{
  int no_proto = !has_proto (link_uri);
  urlpos *newel;

  const char *base = closure->base ? closure->base : closure->parent_base;
  char *complete_uri;

  char *fragment = strrchr (link_uri, '#');

  if (fragment)
    {
      /* Nullify the fragment identifier, i.e. everything after the
	 last occurrence of `#', inclusive.  This copying is
	 relatively inefficient, but it doesn't matter because
	 fragment identifiers don't come up all that often.  */
      int hashlen = fragment - link_uri;
      char *p = alloca (hashlen + 1);
      memcpy (p, link_uri, hashlen);
      p[hashlen] = '\0';
      link_uri = p;
    }

  if (!base)
    {
      if (no_proto)
	{
	  /* We have no base, and the link does not have a protocol or
	     a host attached to it.  Nothing we can do.  */
	  /* #### Should we print a warning here?  Wget 1.5.x used to.  */
	  return;
	}
      else
	complete_uri = xstrdup (link_uri);
    }
  else
    complete_uri = url_concat (base, link_uri);

  DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
	   closure->document_file, base ? base : "(null)",
	   link_uri, complete_uri));

  newel = (urlpos *)xmalloc (sizeof (urlpos));

  memset (newel, 0, sizeof (*newel));
  newel->next = NULL;
  newel->url = complete_uri;
  /* POS/SIZE locate the raw attribute text inside closure->text.  */
  newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
  newel->size = tag->attrs[attrid].value_raw_size;

  /* A URL is relative if the host and protocol are not named, and the
     name does not start with `/'.
     #### This logic might need some rethinking.  */
  if (no_proto && *link_uri != '/')
    newel->flags |= (URELATIVE | UNOPROTO);
  else if (no_proto)
    newel->flags |= UNOPROTO;

  if (closure->tail)
    {
      closure->tail->next = newel;
      closure->tail = newel;
    }
  else
    closure->tail = closure->head = newel;
}
/* Mapper function, called by map_html_tags for each interesting tag
   found in the document.  ARG is really a `struct
   collect_urls_closure'.

   Tags in the TC_LINK category have each of their URL-carrying
   attributes resolved and appended to the closure's URL list via
   handle_link.  Tags in the TC_SPEC category get special-cased
   treatment: <base href> changes the base URL, <link href> is
   followed (subject to the -p leaf-node restriction), and <meta> is
   inspected both for the Refresh mechanism and for the robots
   "nofollow"/"none" directives.

   #### It would be nice to split this into several functions.  */
static void
collect_tags_mapper (struct taginfo *tag, void *arg)
{
  struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
  int tagid = find_tag (tag->name);
  assert (tagid != -1);

  switch (known_tags[tagid].category)
    {
    case TC_LINK:
      {
	int i;
	int size = ARRAY_SIZE (url_tag_attr_map);

	/* Skip to the first url_tag_attr_map entry describing this
	   tag; the entries for one tag are contiguous.  */
	for (i = 0; i < size; i++)
	  if (url_tag_attr_map[i].tagid == tagid)
	    break;

	/* We've found the index of url_tag_attr_map where the
	   attributes of our tag begin.  Now, look for every one of
	   them, and handle it.  */
	for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
	  {
	    char *attr_value;
	    int id;

	    if (closure->dash_p_leaf_HTML
		&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
	      /* If we're at a -p leaf node, we don't want to retrieve
		 links to references we know are external, such as <a
		 href=...>.  */
	      continue;

	    /* This find_attr() buried in a loop may seem inefficient
	       (O(n^2)), but it's not, since the number of attributes
	       (n) we loop over is extremely small.  In the worst case
	       of IMG with all its possible attributes, n^2 will be
	       only 9.  */
	    attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
	    if (attr_value)
	      handle_link (closure, attr_value, tag, id);
	  }
      }
      break;

    case TC_SPEC:
      switch (tagid)
	{
	case TAG_BASE:
	  {
	    /* <base href=...> redefines the base URL for the rest of
	       the document.  */
	    char *newbase = find_attr (tag, "href", NULL);
	    if (!newbase)
	      break;
	    if (closure->base)
	      free (closure->base);
	    if (closure->parent_base)
	      closure->base = url_concat (closure->parent_base, newbase);
	    else
	      closure->base = xstrdup (newbase);
	  }
	  break;
	case TAG_LINK:
	  {
	    int id;
	    char *rel = find_attr (tag, "rel", NULL);
	    char *href = find_attr (tag, "href", &id);
	    if (href)
	      {
		/* In the normal case, all <link href=...> tags are
		   fair game.

		   In the special case of when -p is active, however,
		   and we're at a leaf node (relative to the -l
		   max. depth) in the HTML document tree, the only
		   <LINK> tag we'll follow is a <LINK REL=
		   "stylesheet">, as it's necessary for displaying
		   this document properly.  We won't follow other
		   <LINK> tags, like <LINK REL="home">, for instance,
		   as they refer to external documents.  */
		if (!closure->dash_p_leaf_HTML
		    || (rel && !strcasecmp (rel, "stylesheet")))
		  handle_link (closure, href, tag, id);
	      }
	  }
	  break;
	case TAG_META:
	  /* Some pages use a META tag to specify that the page be
	     refreshed by a new page after a given number of seconds.
	     The general format for this is:
	       <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
	     So we just need to skip past the "NUMBER; URL=" garbage
	     to get to the URL.  */
	  {
	    int id;
	    char *name = find_attr (tag, "name", NULL);
	    char *http_equiv = find_attr (tag, "http-equiv", NULL);
	    if (http_equiv && !strcasecmp (http_equiv, "refresh"))
	      {
		/* Take ID from the "content" attribute -- it is that
		   attribute's raw position that is adjusted below and
		   handed to handle_link.  (The previous code took ID
		   from "http-equiv" and dereferenced a NULL pointer
		   when "content" was missing.)  */
		char *refresh = find_attr (tag, "content", &id);
		char *p;
		int offset;

		if (!refresh)
		  return;
		p = refresh;
		while (ISDIGIT (*p))
		  ++p;
		if (*p++ != ';')
		  return;
		while (ISSPACE (*p))
		  ++p;
		if (!(TOUPPER (*p) == 'U'
		      && TOUPPER (*(p + 1)) == 'R'
		      && TOUPPER (*(p + 2)) == 'L'
		      && *(p + 3) == '='))
		  return;
		p += 4;
		while (ISSPACE (*p))
		  ++p;
		/* Shrink the recorded raw attribute position/size so
		   that only the URL part of the value is considered
		   by later in-place link rewriting.  */
		offset = p - refresh;
		tag->attrs[id].value_raw_beginning += offset;
		tag->attrs[id].value_raw_size -= offset;
		handle_link (closure, p, tag, id);
	      }
	    else if (name && !strcasecmp (name, "robots"))
	      {
		/* Handle stuff like:
		   <meta name="robots" content="index,nofollow"> */
		char *content = find_attr (tag, "content", NULL);
		if (!content)
		  return;
		if (!strcasecmp (content, "none"))
		  closure->nofollow = 1;
		else
		  {
		    while (*content)
		      {
			char *end;
			/* Tolerate whitespace between tokens, as in
			   "index, nofollow".  */
			while (ISSPACE (*content))
			  ++content;
			/* Find the end of the current token: the next
			   `,' or the end of the string.  */
			end = strchr (content, ',');
			if (!end)
			  end = content + strlen (content);
			/* Match the token exactly.  The previous code
			   included the terminating comma in the
			   comparison length (so "nofollow,index"
			   never matched) and accepted any prefix of
			   "nofollow" (so a token "no" matched).  */
			if (end - content == 8
			    && !strncasecmp (content, "nofollow", 8))
			  closure->nofollow = 1;
			content = (*end == ',') ? end + 1 : end;
		      }
		  }
	      }
	  }
	  break;
	default:
	  /* Category is TC_SPEC, but tag name is unhandled.  This
	     must not be.  */
	  abort ();
	}
      break;
    }
}
/* Scan FILE as an HTML document and return the links found in it, as
   a chain of freshly allocated `urlpos' structures (the HTML-aware
   counterpart of get_urls_file).  Relative links are resolved
   against THIS_URL or, failing that, against opt.base_href or a
   <base href=...> tag found in the document itself.

   DASH_P_LEAF_HTML should be non-zero when this document is a leaf
   node (relative to the -l maximum depth) while -p is in effect; it
   suppresses links known to lead to external documents.

   If META_DISALLOW_FOLLOW is non-NULL, *META_DISALLOW_FOLLOW is set
   to whether a <meta name=robots> tag forbade following this
   document's links.  The caller owns the returned list.  */
urlpos *
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
	       int *meta_disallow_follow)
{
  struct file_memory *fm;
  struct collect_urls_closure closure;

  /* Load the file. */
  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  /* Prepare the closure that collect_tags_mapper will fill in.  */
  closure.text = fm->content;
  closure.head = closure.tail = NULL;
  closure.base = NULL;
  closure.parent_base = this_url ? this_url : opt.base_href;
  closure.document_file = file;
  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
  closure.nofollow = 0;

  if (!interesting_tags)
    init_interesting ();
  /* Walk the document, invoking collect_tags_mapper on each
     interesting tag.  */
  map_html_tags (fm->content, fm->length, interesting_tags,
		 interesting_attributes, collect_tags_mapper, &closure);

  DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = closure.nofollow;

  FREE_MAYBE (closure.base);
  read_file_free (fm);
  return closure.head;
}

View File

@ -254,6 +254,85 @@ http_process_type (const char *hdr, void *arg)
return 1;
}
/* Header callback: set *ARG (an int flag) to 1 if the value of the
   `Connection' header is "keep-alive", compared case-insensitively.
   Always returns 1 so that header processing continues.  */
static int
http_process_connection (const char *hdr, void *arg)
{
  int *flag = (int *)arg;
  const char *want = "keep-alive";
  const char *p = hdr;

  /* Case-insensitive match of HDR against "keep-alive".  */
  while (*want && tolower ((unsigned char) *p) == (unsigned char) *want)
    {
      ++p;
      ++want;
    }
  if (*want == '\0' && *p == '\0')
    *flag = 1;
  return 1;
}
/* Persistent connections (pc). */
static unsigned char pc_last_host[4];
static unsigned short pc_last_port;
static int pc_last_fd;
/* Remember FD as the cached persistent connection to HOST:PORT, so
   that a subsequent request to the same host and port can reuse it
   (see persistent_available_p).  If HOST's address cannot be
   resolved, the cache is left untouched.  */
static void
register_persistent (const char *host, unsigned short port, int fd)
{
  /* If a different connection is already cached, close it before
     overwriting the slot -- the previous code silently leaked that
     file descriptor.  */
  if (pc_last_port && pc_last_fd != fd)
    {
      CLOSE (pc_last_fd);
      pc_last_port = 0;		/* mark the slot as free */
    }
  if (!store_hostaddress (pc_last_host, host))
    return;
  pc_last_port = port;
  pc_last_fd = fd;
}
/* Forget the cached persistent connection.  A zero pc_last_port
   serves as the "nothing cached" marker, checked by
   persistent_available_p.  Note that this does not close
   pc_last_fd; callers are expected to have closed it already, or to
   have found the socket dead.  */
static void
invalidate_persistent (void)
{
  pc_last_port = 0;
}
/* Return non-zero if the cached persistent connection may be reused
   for HOST:PORT.  As a side effect, a cached connection whose socket
   turns out to be dead is invalidated.  */
static int
persistent_available_p (const char *host, unsigned short port)
{
  unsigned char addr[4];

  /* Cheap rejections first: different port, unresolvable host, or an
     address other than the cached one.  (Same evaluation order as
     before, so resolution is only attempted when the port matches.)  */
  if (port != pc_last_port
      || !store_hostaddress (addr, host)
      || memcmp (pc_last_host, addr, 4) != 0)
    return 0;

  /* The cached connection matches; make sure its socket is still
     alive before handing it out.  */
  if (test_socket_open (pc_last_fd))
    return 1;

  invalidate_persistent ();
  return 0;
}
/* The idea behind these two CLOSE macros is to distinguish between
two cases: one when the job we've been doing is finished, and we
want to close the connection and leave, and two when something is
seriously wrong and we're closing the connection as part of
cleanup.
In case of keep_alive, CLOSE_FINISH should leave the connection
open, while CLOSE_INVALIDATE should still close it.
The semantic difference between the flags `keep_alive' and
`reused_connection' is that keep_alive defines the state of HTTP:
whether the connection *will* be preservable. reused_connection,
on the other hand, reflects the present: whether the *current*
connection is the result of preserving. */
#define CLOSE_FINISH(fd) do { \
if (!keep_alive) \
{ \
CLOSE (fd); \
if (reused_connection) \
invalidate_persistent (); \
} \
} while (0)
#define CLOSE_INVALIDATE(fd) do { \
CLOSE (fd); \
if (reused_connection) \
invalidate_persistent (); \
} while (0)
struct http_stat
{
@ -317,6 +396,8 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
FILE *fp;
int auth_tried_already;
struct rbuf rbuf;
int keep_alive, http_keep_alive_1, http_keep_alive_2;
int reused_connection;
if (!(*dt & HEAD_ONLY))
/* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
@ -329,6 +410,9 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
again:
/* We need to come back here when the initial attempt to retrieve
without authorization header fails. */
keep_alive = 0;
http_keep_alive_1 = http_keep_alive_2 = 0;
reused_connection = 0;
/* Initialize certain elements of struct http_stat. */
hs->len = 0L;
@ -345,40 +429,49 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
ou = u;
/* First: establish the connection. */
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
err = make_connection (&sock, u->host, u->port);
switch (err)
if (u->proxy || !persistent_available_p (u->host, u->port))
{
case HOSTERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
return HOSTERR;
break;
case CONSOCKERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
return CONSOCKERR;
break;
case CONREFUSED:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET,
_("Connection to %s:%hu refused.\n"), u->host, u->port);
CLOSE (sock);
return CONREFUSED;
case CONERROR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
CLOSE (sock);
return CONERROR;
break;
case NOCONERROR:
/* Everything is fine! */
logputs (LOG_VERBOSE, _("connected!\n"));
break;
default:
abort ();
break;
} /* switch */
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
err = make_connection (&sock, u->host, u->port);
switch (err)
{
case HOSTERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
return HOSTERR;
break;
case CONSOCKERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
return CONSOCKERR;
break;
case CONREFUSED:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET,
_("Connection to %s:%hu refused.\n"), u->host, u->port);
CLOSE (sock);
return CONREFUSED;
case CONERROR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
CLOSE (sock);
return CONERROR;
break;
case NOCONERROR:
/* Everything is fine! */
logputs (LOG_VERBOSE, _("connected!\n"));
break;
default:
abort ();
break;
}
}
else
{
logprintf (LOG_VERBOSE, _("Reusing connection to %s:%hu.\n"), u->host, u->port);
sock = pc_last_fd;
reused_connection = 1;
}
if (u->proxy)
path = u->proxy->url;
@ -487,6 +580,7 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
User-Agent: %s\r\n\
Host: %s%s\r\n\
Accept: %s\r\n\
Connection: Keep-Alive\r\n\
%s%s%s%s%s%s\r\n",
command, path, useragent, remhost,
host_port ? host_port : "",
@ -505,8 +599,9 @@ Accept: %s\r\n\
num_written = iwrite (sock, request, strlen (request));
if (num_written < 0)
{
logputs (LOG_VERBOSE, _("Failed writing HTTP request.\n"));
CLOSE (sock);
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
strerror (errno));
CLOSE_INVALIDATE (sock);
return WRITEFAILED;
}
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
@ -553,7 +648,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return HEOF;
}
else if (status == HG_ERROR)
@ -565,7 +660,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return HERR;
}
@ -672,12 +767,32 @@ Accept: %s\r\n\
goto done_header;
}
}
/* Check for the `Keep-Alive' header. */
if (!http_keep_alive_1)
{
if (header_process (hdr, "Keep-Alive", header_exists,
&http_keep_alive_1))
goto done_header;
}
/* Check for `Connection: Keep-Alive'. */
if (!http_keep_alive_2)
{
if (header_process (hdr, "Connection", http_process_connection,
&http_keep_alive_2))
goto done_header;
}
done_header:
free (hdr);
}
logputs (LOG_VERBOSE, "\n");
if (contlen != -1
&& (http_keep_alive_1 || http_keep_alive_2))
keep_alive = 1;
if (keep_alive && !reused_connection)
register_persistent (u->host, u->port, sock);
if ((statcode == HTTP_STATUS_UNAUTHORIZED)
&& authenticate_h)
{
@ -685,7 +800,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
type = NULL;
FREEHSTAT (*hs);
CLOSE (sock);
CLOSE_FINISH (sock);
if (auth_tried_already)
{
/* If we have tried it already, then there is not point
@ -753,7 +868,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return RANGEERR;
}
@ -783,7 +898,7 @@ Accept: %s\r\n\
_("Location: %s%s\n"),
hs->newloc ? hs->newloc : _("unspecified"),
hs->newloc ? _(" [following]") : "");
CLOSE (sock);
CLOSE_FINISH (sock);
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
return NEWLOCATION;
@ -824,7 +939,7 @@ Accept: %s\r\n\
hs->res = 0;
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_FINISH (sock);
return RETRFINISHED;
}
@ -838,7 +953,7 @@ Accept: %s\r\n\
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", u->local, strerror (errno));
CLOSE (sock);
CLOSE_FINISH (sock);
FREE_MAYBE (all_headers);
return FOPENERR;
}
@ -863,7 +978,7 @@ Accept: %s\r\n\
/* Get the contents of the document. */
hs->res = get_contents (sock, fp, &hs->len, hs->restval,
(contlen != -1 ? contlen : 0),
&rbuf);
&rbuf, keep_alive);
hs->dltime = elapsed_time ();
{
/* Close or flush the file. We have to be careful to check for
@ -878,7 +993,7 @@ Accept: %s\r\n\
hs->res = -2;
}
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_FINISH (sock);
if (hs->res == -2)
return FWRITEERR;
return RETRFINISHED;

View File

@ -97,6 +97,20 @@ i18n_initialize (void)
textdomain ("wget");
#endif /* HAVE_NLS */
}
/* It's kosher to declare these here because their interface _has_ to
   be void foo(void). */
void url_init PARAMS ((void));
void host_init PARAMS ((void));

/* This just calls the various initialization functions from the
   modules that need one-time initialization.  Called once from
   main(), before option processing.  (url_init presumably builds
   url.c's unsafe-character table and host_init host.c's hash tables
   -- confirm in the respective modules.)  */
static void
private_initialize (void)
{
  url_init ();
  host_init ();
}
/* Print the usage message. */
static void
@ -293,6 +307,7 @@ main (int argc, char *const *argv)
};
i18n_initialize ();
private_initialize ();
append_to_log = 0;

View File

@ -42,21 +42,20 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "ftp.h"
#include "fnmatch.h"
#include "host.h"
#include "hash.h"
extern char *version_string;
#define ROBOTS_FILENAME "robots.txt"
/* #### Many of these lists should really be hashtables! */
/* List of downloaded URLs. */
static urlpos *urls_downloaded;
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
/* List of HTML URLs. */
static slist *urls_html;
/* List of undesirable-to-load URLs. */
static slist *ulist;
static struct hash_table *undesirable_urls;
/* List of forbidden locations. */
static char **forbidden = NULL;
@ -84,14 +83,28 @@ static int robots_match PARAMS ((struct urlinfo *, char **));
void
recursive_cleanup (void)
{
free_slist (ulist);
ulist = NULL;
if (undesirable_urls)
{
string_set_free (undesirable_urls);
undesirable_urls = NULL;
}
if (dl_file_url_map)
{
free_keys_and_values (dl_file_url_map);
hash_table_destroy (dl_file_url_map);
dl_file_url_map = NULL;
}
if (dl_url_file_map)
{
free_keys_and_values (dl_url_file_map);
hash_table_destroy (dl_url_file_map);
dl_url_file_map = NULL;
}
undesirable_urls = NULL;
free_vec (forbidden);
forbidden = NULL;
free_slist (urls_html);
slist_free (urls_html);
urls_html = NULL;
free_urlpos (urls_downloaded);
urls_downloaded = NULL;
FREE_MAYBE (base_dir);
FREE_MAYBE (robots_host);
first_time = 1;
@ -117,6 +130,7 @@ recursive_retrieve (const char *file, const char *this_url)
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
int dt, inl, dash_p_leaf_HTML = FALSE;
int meta_disallow_follow;
int this_url_ftp; /* See below the explanation */
uerr_t err;
struct urlinfo *rurl;
@ -132,17 +146,29 @@ recursive_retrieve (const char *file, const char *this_url)
/* Cache the current URL in the list. */
if (first_time)
{
ulist = add_slist (ulist, this_url, 0);
urls_downloaded = NULL;
/* These three operations need to be done only once per Wget
run. They should probably be at a different location. */
if (!undesirable_urls)
undesirable_urls = make_string_hash_table (0);
if (!dl_file_url_map)
dl_file_url_map = make_string_hash_table (0);
if (!dl_url_file_map)
dl_url_file_map = make_string_hash_table (0);
hash_table_clear (undesirable_urls);
string_set_add (undesirable_urls, this_url);
hash_table_clear (dl_file_url_map);
hash_table_clear (dl_url_file_map);
urls_html = NULL;
/* Enter this_url to the slist, in original and "enhanced" form. */
/* Enter this_url to the hash table, in original and "enhanced" form. */
u = newurl ();
err = parseurl (this_url, u, 0);
if (err == URLOK)
{
ulist = add_slist (ulist, u->url, 0);
urls_downloaded = add_url (urls_downloaded, u->url, file);
urls_html = add_slist (urls_html, file, NOSORT);
string_set_add (undesirable_urls, u->url);
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
urls_html = slist_append (urls_html, file);
if (opt.no_parent)
base_dir = xstrdup (u->dir); /* Set the base dir. */
/* Set the canonical this_url to be sent as referer. This
@ -191,7 +217,15 @@ recursive_retrieve (const char *file, const char *this_url)
/* Get the URL-s from an HTML file: */
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
0, dash_p_leaf_HTML);
dash_p_leaf_HTML, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
/* The META tag says we are not to follow this file. Respect
that. */
free_urlpos (url_list);
url_list = NULL;
}
/* Decide what to do with each of the URLs. A URL will be loaded if
it meets several requirements, discussed later. */
@ -240,16 +274,16 @@ recursive_retrieve (const char *file, const char *this_url)
the list. */
/* inl is set if the URL we are working on (constr) is stored in
ulist. Using it is crucial to avoid the incessant calls to
in_slist, which is quite slow. */
inl = in_slist (ulist, constr);
undesirable_urls. Using it is crucial to avoid unnecessary
repeated continuous hits to the hash table. */
inl = string_set_exists (undesirable_urls, constr);
/* If it is FTP, and FTP is not followed, chuck it out. */
if (!inl)
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
{
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If it is absolute link and they are not followed, chuck it
@ -258,7 +292,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (opt.relative_only && !(cur_url->flags & URELATIVE))
{
DEBUGP (("It doesn't really look like a relative link.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If its domain is not to be accepted/looked-up, chuck it out. */
@ -266,7 +300,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (!accept_domain (u))
{
DEBUGP (("I don't like the smell of that domain.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* Check for parent directory. */
@ -286,7 +320,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
/* Failing that too, kill the URL. */
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
freeurl (ut, 1);
@ -300,7 +334,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -330,7 +364,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
constr, u->file));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -353,12 +387,12 @@ recursive_retrieve (const char *file, const char *this_url)
}
free (constr);
constr = xstrdup (u->url);
inl = in_slist (ulist, constr);
string_set_add (undesirable_urls, constr);
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
if (!opt.spanhost && this_url && !same_host (this_url, constr))
{
DEBUGP (("This is not the same hostname as the parent's.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -398,7 +432,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
ROBOTS_FILENAME));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -409,7 +443,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URL-s. */
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
recursively. */
if (u->proto == URLFTP)
@ -439,10 +473,13 @@ recursive_retrieve (const char *file, const char *this_url)
{
if (dt & RETROKF)
{
urls_downloaded = add_url (urls_downloaded, constr, filename);
hash_table_put (dl_file_url_map,
xstrdup (filename), xstrdup (constr));
hash_table_put (dl_url_file_map,
xstrdup (constr), xstrdup (filename));
/* If the URL is HTML, note it. */
if (dt & TEXTHTML)
urls_html = add_slist (urls_html, filename, NOSORT);
urls_html = slist_append (urls_html, filename);
}
}
/* If there was no error, and the type is text/html, parse
@ -489,6 +526,10 @@ recursive_retrieve (const char *file, const char *this_url)
/* Increment the pbuf for the appropriate size. */
}
if (opt.convert_links && !opt.delete_after)
/* This is merely the first pass: the links that have been
successfully downloaded are converted. In the second pass,
convert_all_links() will also convert those links that have NOT
been downloaded to their canonical form. */
convert_links (file, url_list);
/* Free the linked list of URL-s. */
free_urlpos (url_list);
@ -531,30 +572,37 @@ void
convert_all_links (void)
{
uerr_t res;
urlpos *l1, *l2, *urls;
urlpos *l1, *urls;
struct urlinfo *u;
slist *html;
urlpos *urlhtml;
for (html = urls_html; html; html = html->next)
{
int meta_disallow_follow;
char *url;
DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file. get_urls_html will need
it. */
for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
if (!strcmp (urlhtml->local_name, html->string))
break;
if (urlhtml)
DEBUGP (("It should correspond to %s.\n", urlhtml->url));
url = hash_table_get (dl_file_url_map, html->string);
if (url)
DEBUGP (("It should correspond to %s.\n", url));
else
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
FALSE);
urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
/* The META tag says we are not to follow this file.
Respect that. */
free_urlpos (urls);
urls = NULL;
}
if (!urls)
continue;
for (l1 = urls; l1; l1 = l1->next)
{
char *local_name;
/* The URL must be in canonical form to be compared. */
u = newurl ();
res = parseurl (l1->url, u, 0);
@ -565,22 +613,18 @@ convert_all_links (void)
}
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS.
Note: not yet implemented; only ABS2REL works. */
for (l2 = urls_downloaded; l2; l2 = l2->next)
if (!strcmp (l2->url, u->url))
{
DEBUGP (("%s flagged for conversion, local %s\n",
l2->url, l2->local_name));
break;
}
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
local_name = hash_table_get (dl_url_file_map, u->url);
if (local_name)
DEBUGP (("%s flagged for conversion, local %s\n",
u->url, local_name));
/* Clear the flags. */
l1->flags &= ~ (UABS2REL | UREL2ABS);
/* Decide on the conversion direction. */
if (l2)
if (local_name)
{
l1->flags |= UABS2REL;
l1->local_name = xstrdup (l2->local_name);
l1->local_name = xstrdup (local_name);
}
else
{

View File

@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "ftp.h"
#include "host.h"
#include "connect.h"
#include "hash.h"
#ifdef WINDOWS
LARGE_INTEGER internal_time;
@ -60,6 +61,8 @@ enum spflags { SP_NONE, SP_INIT, SP_FINISH };
static int show_progress PARAMS ((long, long, enum spflags));
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
/* Reads the contents of file descriptor FD, until it is closed, or a
read error occurs. The data is read in 8K chunks, and stored to
stream fp, which should have been open for writing. If BUF is
@ -83,9 +86,9 @@ static int show_progress PARAMS ((long, long, enum spflags));
from fd immediately, flush or discard the buffer. */
int
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
struct rbuf *rbuf)
struct rbuf *rbuf, int use_expected)
{
int res;
int res = 0;
static char c[8192];
*len = restval;
@ -105,10 +108,17 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
*len += res;
}
}
/* Read from fd while there is available data. */
do
/* Read from fd while there is available data.
Normally, if expected is 0, it means that it is not known how
much data is expected. However, if use_expected is specified,
then expected being zero means exactly that. */
while (!use_expected || (*len < expected))
{
res = iread (fd, c, sizeof (c));
int amount_to_read = (use_expected
? MIN (expected - *len, sizeof (c))
: sizeof (c));
res = iread (fd, c, amount_to_read);
if (res > 0)
{
if (fwrite (c, sizeof (char), res, fp) < res)
@ -120,7 +130,9 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
}
*len += res;
}
} while (res > 0);
else
break;
}
if (res < -1)
res = -1;
if (opt.verbose)
@ -323,7 +335,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
int local_use_proxy;
char *mynewloc, *proxy;
struct urlinfo *u;
slist *redirections;
struct hash_table *redirections = NULL;
/* If dt is NULL, just ignore it. */
if (!dt)
@ -334,8 +346,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file)
*file = NULL;
redirections = NULL;
u = newurl ();
/* Parse the URL. */
result = parseurl (url, u, 0);
@ -343,7 +353,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return result;
}
@ -379,7 +390,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
{
logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return PROXERR;
}
@ -392,7 +404,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
else
logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return PROXERR;
}
@ -454,7 +467,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result));
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
free (mynewloc);
return result;
@ -466,34 +480,29 @@ retrieve_url (const char *origurl, char **file, char **newloc,
free (mynewloc);
mynewloc = xstrdup (newloc_struct->url);
/* Check for redirection to back to itself. */
if (!strcmp (u->url, newloc_struct->url))
if (!redirections)
{
logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
mynewloc);
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
free (url);
free (mynewloc);
return WRONGCODE;
redirections = make_string_hash_table (0);
/* Add current URL immediately so we can detect it as soon
as possible in case of a cycle. */
string_set_add (redirections, u->url);
}
/* The new location is OK. Let's check for redirection cycle by
peeking through the history of redirections. */
if (in_slist (redirections, newloc_struct->url))
if (string_set_exists (redirections, newloc_struct->url))
{
logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
mynewloc);
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
free (mynewloc);
return WRONGCODE;
}
redirections = add_slist (redirections, newloc_struct->url, NOSORT);
string_set_add (redirections, newloc_struct->url);
free (url);
url = mynewloc;
@ -510,7 +519,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
*file = NULL;
}
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
if (newloc)
*newloc = url;
@ -531,9 +541,7 @@ retrieve_from_file (const char *file, int html, int *count)
uerr_t status;
urlpos *url_list, *cur_url;
/* If spider-mode is on, we do not want get_urls_html barfing
errors on baseless links. */
url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE)
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */

View File

@ -22,7 +22,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "rbuf.h"
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *));
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *, int));
uerr_t retrieve_url PARAMS ((const char *, char **, char **,
const char *, int *));

615
src/url.c
View File

@ -38,7 +38,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "utils.h"
#include "url.h"
#include "host.h"
#include "html.h"
#ifndef errno
extern int errno;
@ -48,22 +47,12 @@ extern int errno;
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
/* URL separator (for findurl) */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
/* Table of unsafe chars.  This is initialized in
init_unsafe_char_table. */
/* A list of unsafe characters for encoding, as per RFC1738. '@' and
':' (not listed in RFC) were added because of user/password
encoding. */
static char unsafe_char_table[256];
#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */
#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
|| ((unsigned char)(c) > '~') /* ASCII 127 */ \
|| strchr (URL_UNSAFE_CHARS, c))
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
version that doesn't. */
@ -176,6 +165,34 @@ skip_url (const char *url)
return 0;
}
/* Build the lookup table consulted by UNSAFE_CHAR.  An octet is
   unsafe if it is:
   - a control character or space (anything <= 32);
   - anything at or above 127;
   - one of the characters RFC 1738 lists as unsafe
     ("<>\"#%{}|\\^~[]`");
   - `@' or `:', which Wget reserves for user/password encoding.
   Must be called once (from url_init) before UNSAFE_CHAR is used.  */
void
init_unsafe_char_table (void)
{
  int i;
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
	|| i == '<'
	|| i == '>'
	|| i == '\"'
	|| i == '#'
	|| i == '%'
	|| i == '{'
	|| i == '}'
	|| i == '|'
	|| i == '\\'
	|| i == '^'
	|| i == '~'
	|| i == '['
	|| i == ']'
	|| i == '`'
	/* The URL_UNSAFE_CHARS macro this table replaces, and the
	   comment above, both include `@' and `:', but they were
	   missing from the rewritten test -- restore them so
	   user/password encoding keeps working.  */
	|| i == '@'
	|| i == ':')
      unsafe_char_table[i] = 1;
}
/* Returns 1 if the string contains unsafe characters, 0 otherwise. */
int
contains_unsafe (const char *s)
@ -296,7 +313,7 @@ skip_proto (const char *url)
/* Returns 1 if the URL begins with a protocol (supported or
unsupported), 0 otherwise. */
static int
int
has_proto (const char *url)
{
char **s;
@ -765,297 +782,54 @@ url_equal (const char *url1, const char *url2)
return res;
}
/* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
   buffer may contain pretty much anything; no errors are signaled.

   BUF is the text to scan and HOWMUCH the number of bytes in it
   (the buffer need not be NUL-terminated).  On success, returns a
   pointer into BUF where the URL begins and stores the URL's length
   in *COUNT; returns NULL (leaving *COUNT untouched) if no known
   protocol prefix is found.  The URL is delimited by the first
   whitespace, non-printable-ASCII, or URL_SEPARATOR character.  */
static const char *
findurl (const char *buf, int howmuch, int *count)
{
  char **prot;
  const char *s1, *s2;

  /* Slide S1 over the buffer, trying every known protocol prefix at
     each position.  */
  for (s1 = buf; howmuch; s1++, howmuch--)
    for (prot = protostrings; *prot; prot++)
      if (howmuch <= strlen (*prot))
	/* Not enough bytes left for this prefix plus at least one
	   more character -- try the next protocol.  */
	continue;
      else if (!strncasecmp (*prot, s1, strlen (*prot)))
	{
	  /* Prefix matched; measure the URL's extent.  (HOWMUCH is
	     decremented here too, but that is harmless since we
	     return immediately.)  */
	  for (s2 = s1, *count = 0;
	       howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
	       !strchr (URL_SEPARATOR, *s2);
	       s2++, (*count)++, howmuch--);
	  return s1;
	}
  return NULL;
}
/* Scans the file for signs of URL-s. Returns a vector of pointers,
each pointer representing a URL string. The file is *not* assumed
to be HTML. */
urlpos *
get_urls_file (const char *file)
{
long nread;
FILE *fp;
char *buf;
const char *pbuf;
int size;
urlpos *first, *current, *old;
struct file_memory *fm;
urlpos *head, *tail;
const char *text, *text_end;
if (file && !HYPHENP (file))
{
fp = fopen (file, "rb");
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
}
else
fp = stdin;
/* Load the file. */
load_file (fp, &buf, &nread);
if (file && !HYPHENP (file))
fclose (fp);
DEBUGP (("Loaded %s (size %ld).\n", file, nread));
first = current = NULL;
/* Fill the linked list with URLs. */
for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
pbuf += size)
fm = read_file (file);
if (!fm)
{
/* Allocate the space. */
old = current;
current = (urlpos *)xmalloc (sizeof (urlpos));
if (old)
old->next = current;
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = (char *)xmalloc (size + 1);
memcpy (current->url, pbuf, size);
current->url[size] = '\0';
if (!first)
first = current;
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
/* Free the buffer. */
free (buf);
return first;
}
/* Similar to get_urls_file, but for HTML files. FILE is scanned as
an HTML document using htmlfindurl(), which see. get_urls_html()
constructs the HTML-s from the relative href-s.
If SILENT is non-zero, do not barf on baseless relative links. */
urlpos *
get_urls_html (const char *file, const char *this_url, int silent,
int dash_p_leaf_HTML)
{
long nread;
FILE *fp;
char *orig_buf;
const char *buf;
int step, first_time;
urlpos *first, *current, *old;
if (file && !HYPHENP (file))
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
head = tail = NULL;
text = fm->content;
text_end = fm->content + fm->length;
while (text < text_end)
{
fp = fopen (file, "rb");
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
}
else
fp = stdin;
/* Load the file. */
load_file (fp, &orig_buf, &nread);
if (file && !HYPHENP (file))
fclose (fp);
DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
first = current = NULL;
first_time = 1;
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
for (buf = orig_buf;
(buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
dash_p_leaf_HTML));
buf += step)
{
int i, no_proto;
int size = step;
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
char *needs_freeing, *url_data;
first_time = 0;
/* A frequent phenomenon that needs to be handled are pages
generated by brain-damaged HTML generators, which refer to
URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
any spaces at the beginning or at the end of the string.
This is probably not strictly correct, but that's what the
browsers do, so we may follow. May the authors of "WYSIWYG"
HTML tools burn in hell for the damage they've inflicted! */
while ((pbuf < buf + step) && ISSPACE (*pbuf))
{
++pbuf;
--size;
}
while (size && ISSPACE (pbuf[size - 1]))
--size;
if (!size)
break;
/* It would be nice if we could avoid allocating memory in this
loop, but I don't see an easy way. To process the entities,
we need to either copy the data, or change it destructively.
I choose the former.
We have two pointers: needs_freeing and url_data, because the
code below does thing like url_data += <something>, and we
want to pass the original string to free(). */
needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
size = strlen (url_data);
for (i = 0; protostrings[i]; i++)
{
if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
&& !(strncasecmp (url_data, "http:", 5) == 0
&& strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
const char *line_beg = text;
const char *line_end = memchr (text, '\n', text_end - text);
if (!line_end)
line_end = text_end;
else
++line_end;
text = line_end;
while (line_beg < line_end
&& ISSPACE (*line_beg))
++line_beg;
while (line_end > line_beg + 1
&& ISSPACE (*(line_end - 1)))
--line_end;
if (line_end > line_beg)
{
no_proto = 1;
/* This is for extremely brain-damaged pages that refer to
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
if ((size > 5) && !strncasecmp ("http:", url_data, 5))
url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
{
free (needs_freeing);
continue;
}
}
if (no_proto)
{
/* First, construct the base, which can be relative itself.
Criteria for creating the base are:
1) html_base created by <base href="...">
2) current URL
3) base provided from the command line */
cbase = html_base ();
if (!cbase)
cbase = this_url;
if (!cbase)
cbase = opt.base_href;
if (!cbase) /* Error condition -- a baseless
relative link. */
{
if (!opt.quiet && !silent)
{
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
free (needs_freeing);
continue;
}
if (this_url)
base = construct (this_url, cbase, strlen (cbase),
!has_proto (cbase));
urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
memset (entry, 0, sizeof (*entry));
entry->next = NULL;
entry->url = strdupdelim (line_beg, line_end);
if (!head)
head = entry;
else
{
/* Base must now be absolute, with host name and
protocol. */
if (!has_proto (cbase))
{
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
constr = construct (base, url_data, size, no_proto);
free (base);
tail->next = entry;
tail = entry;
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
if (opt.debug)
{
char *tmp;
const char *tmp2;
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
file, this_url ? this_url : "(null)",
tmp2 ? tmp2 : "(null)", tmp, constr);
free (tmp);
}
#endif
/* Allocate the space. */
old = current;
current = (urlpos *)xmalloc (sizeof (urlpos));
if (old)
old->next = current;
if (!first)
first = current;
/* Fill the values. */
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
current->size = step;
current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
free (needs_freeing);
}
free (orig_buf);
return first;
read_file_free (fm);
return head;
}
/* Free the linked list of urlpos. */
@ -1527,103 +1301,59 @@ no_proxy_match (const char *host, const char **no_proxy)
return !sufmatch (no_proxy, host);
}
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
/* Change the links in an HTML document. Accepts a structure that
defines the positions of all the links. */
void
convert_links (const char *file, urlpos *l)
{
struct file_memory *fm;
FILE *fp;
char *buf, *p, *p2;
char *p;
downloaded_file_t downloaded_file_return;
long size;
{
/* First we do a "dry run": go through the list L and see whether
any URL needs to be converted in the first place. If not, just
leave the file alone. */
int count = 0;
urlpos *dry = l;
for (dry = l; dry; dry = dry->next)
if (dry->flags & (UABS2REL | UREL2ABS))
++count;
if (!count)
{
logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"),
file);
return;
}
}
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* Read from the file.... */
fp = fopen (file, "rb");
if (!fp)
fm = read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
return;
}
/* ...to a buffer. */
load_file (fp, &buf, &size);
fclose (fp);
downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
if (opt.backup_converted && downloaded_file_return)
/* Rather than just writing over the original .html file with the converted
version, save the former to *.orig. Note we only do this for files we've
_successfully_ downloaded, so we don't clobber .orig files sitting around
from previous invocations. */
write_backup_file (file, downloaded_file_return);
/* Before opening the file for writing, unlink the file. This is
important if the data in FM is mmaped. In such case, nulling the
file, which is what fopen() below does, would make us read all
zeroes from the mmaped region. */
if (unlink (file) < 0 && errno != ENOENT)
{
/* Construct the backup filename as the original name plus ".orig". */
size_t filename_len = strlen(file);
char* filename_plus_orig_suffix;
boolean already_wrote_backup_file = FALSE;
slist* converted_file_ptr;
static slist* converted_files = NULL;
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
/* Just write "orig" over "html". We need to do it this way because
when we're checking to see if we've downloaded the file before (to
see if we can skip downloading it), we don't know if it's a
text/html file. Therefore we don't know yet at that stage that -E
is going to cause us to tack on ".html", so we need to compare
vs. the original URL plus ".orig", not the original URL plus
".html.orig". */
filename_plus_orig_suffix = xmalloc(filename_len + 1);
strcpy(filename_plus_orig_suffix, file);
strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
}
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
{
/* Append ".orig" to the name. */
filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
strcpy(filename_plus_orig_suffix, file);
strcpy(filename_plus_orig_suffix + filename_len, ".orig");
}
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file each
time in such a case, it'll end up containing the first-pass conversion,
not the original file. So, see if we've already been called on this
file. */
converted_file_ptr = converted_files;
while (converted_file_ptr != NULL)
if (strcmp(converted_file_ptr->string, file) == 0)
{
already_wrote_backup_file = TRUE;
break;
}
else
converted_file_ptr = converted_file_ptr->next;
if (!already_wrote_backup_file)
{
/* Rename <file> to <file>.orig before former gets written over. */
if (rename(file, filename_plus_orig_suffix) != 0)
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
file, filename_plus_orig_suffix, strerror (errno));
/* Remember that we've already written a .orig backup for this file.
Note that we never free this memory since we need it till the
convert_all_links() call, which is one of the last things the
program does before terminating. BTW, I'm not sure if it would be
safe to just set 'converted_file_ptr->string' to 'file' below,
rather than making a copy of the string... Another note is that I
thought I could just add a field to the urlpos structure saying
that we'd written a .orig file for this URL, but that didn't work,
so I had to make this separate list. */
converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
converted_file_ptr->next = converted_files;
converted_files = converted_file_ptr;
}
free(filename_plus_orig_suffix);
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
file, strerror (errno));
read_file_free (fm);
return;
}
/* Now open the file for writing. */
fp = fopen (file, "wb");
@ -1631,50 +1361,63 @@ convert_links (const char *file, urlpos *l)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
free (buf);
read_file_free (fm);
return;
}
/* Presumably we have to loop through multiple URLs here (even though we're
only talking about a single local file) because of the -O option. */
for (p = buf; l; l = l->next)
/* Here we loop through all the URLs in file, replacing those of
them that are downloaded with relative references. */
p = fm->content;
for (; l; l = l->next)
{
if (l->pos >= size)
char *url_start = fm->content + l->pos;
if (l->pos >= fm->length)
{
DEBUGP (("Something strange is going on. Please investigate."));
break;
}
/* If the URL already is relative or it is not to be converted
for some other reason (e.g. because of not having been
downloaded in the first place), skip it. */
if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
/* If the URL is not to be converted, skip it. */
if (!(l->flags & (UABS2REL | UREL2ABS)))
{
DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
l->pos, l->flags));
continue;
}
/* Else, reach the position of the offending URL, echoing
everything up to it to the outfile. */
for (p2 = buf + l->pos; p < p2; p++)
putc (*p, fp);
/* Echo the file contents, up to the offending URL's opening
quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
p = url_start;
if (l->flags & UABS2REL)
/* Convert absolute URL to relative. */
{
/* Convert absolute URL to relative. */
char *newname = construct_relative (file, l->local_name);
fprintf (fp, "%s", newname);
putc (*p, fp); /* quoting char */
fputs (newname, fp);
p += l->size - 1;
putc (*p, fp); /* close quote */
++p;
DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
l->url, newname, l->pos, file));
free (newname);
}
p += l->size;
else if (l->flags & UREL2ABS)
{
/* Convert the link to absolute URL. */
char *newlink = l->url;
putc (*p, fp); /* quoting char */
fputs (newlink, fp);
p += l->size - 1;
putc (*p, fp); /* close quote */
++p;
DEBUGP (("REL2ABS: <something> to %s at position %d in %s.\n",
newlink, l->pos, file));
}
}
/* Output the rest of the file. */
if (p - buf < size)
{
for (p2 = buf + size; p < p2; p++)
putc (*p, fp);
}
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
free (buf);
read_file_free (fm);
logputs (LOG_VERBOSE, _("done.\n"));
}
@ -1746,6 +1489,79 @@ add_url (urlpos *l, const char *url, const char *file)
return t;
}
/* Save FILE to FILE.orig (or, for -E-renamed files, FILE with its
   trailing "html" overwritten by "orig") before convert_links()
   rewrites it.  DOWNLOADED_FILE_RETURN tells us how FILE was
   obtained and therefore which backup name to use.  A static list of
   already-backed-up files guards against clobbering the backup when
   we are called twice for the same file.  */
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;   /* alloca()-ed; freed on return */
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;  /* files already backed up */

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      /* NOTE(review): this assumes FILE ends in ".html", i.e. that
         filename_len >= 4; a shorter name would make the second
         strcpy() write before the buffer -- confirm callers
         guarantee the ".html" suffix. */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
    }
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
    {
      /* Append ".orig" to the name.  sizeof(".orig") includes the
         terminating NUL. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
    }

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
      {
        already_wrote_backup_file = TRUE;
        break;
      }
    else
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
    {
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list. */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
    }
}
/* Remembers which files have been downloaded. In the standard case, should be
called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
@ -1798,3 +1614,10 @@ downloaded_file (downloaded_file_t mode, const char* file)
return FILE_NOT_ALREADY_DOWNLOADED;
}
}
/* Initialization of static stuff -- currently just the 256-entry
   unsafe-character lookup table used by UNSAFE_CHAR().  Intended to
   be run once at startup, before any URL processing. */
void
url_init (void)
{
  init_unsafe_char_table ();
}

View File

@ -88,6 +88,7 @@ struct urlinfo *newurl PARAMS ((void));
void freeurl PARAMS ((struct urlinfo *, int));
uerr_t urlproto PARAMS ((const char *));
int skip_proto PARAMS ((const char *));
int has_proto PARAMS ((const char *));
int skip_uname PARAMS ((const char *));
uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int));
@ -95,7 +96,7 @@ char *str_url PARAMS ((const struct urlinfo *, int));
int url_equal PARAMS ((const char *, const char *));
urlpos *get_urls_file PARAMS ((const char *));
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int));
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
void free_urlpos PARAMS ((urlpos *));
char *url_concat PARAMS ((const char *, const char *));

View File

@ -31,6 +31,9 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#ifdef HAVE_MMAP
# include <sys/mman.h>
#endif
#ifdef HAVE_PWD_H
# include <pwd.h>
#endif
@ -45,11 +48,13 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef NeXT
# include <libc.h> /* for access() */
#endif
#include <fcntl.h>
#include <assert.h>
#include "wget.h"
#include "utils.h"
#include "fnmatch.h"
#include "hash.h"
#ifndef errno
extern int errno;
@ -736,28 +741,149 @@ read_whole_line (FILE *fp)
line = xrealloc (line, length + 1);
return line;
}
/* Read FILE into memory. A pointer to `struct file_memory' are
returned; use struct element `content' to access file contents, and
the element `length' to know the file length. `content' is *not*
zero-terminated, and you should *not* read or write beyond the [0,
length) range of characters.
/* Load file pointed to by FP to memory and return the malloc-ed
buffer with the contents. *NREAD will contain the number of read
bytes. The file is loaded in chunks, allocated exponentially,
starting with FILE_BUFFER_SIZE bytes. */
void
load_file (FILE *fp, char **buf, long *nread)
After you are done with the file contents, call read_file_free to
release the memory.
Depending on the operating system and the type of file that is
being read, read_file() either mmap's the file into memory, or
reads the file into the core using read().
If file is named "-", fileno(stdin) is used for reading instead.
If you want to read from a real file named "-", use "./-" instead. */
struct file_memory *
read_file (const char *file)
{
long bufsize;
int fd;
struct file_memory *fm;
long size;
int inhibit_close = 0;
bufsize = 512;
*nread = 0;
*buf = NULL;
while (!feof (fp) && !ferror (fp))
/* Some magic in the finest tradition of Perl and its kin: if FILE
is "-", just use stdin. */
if (HYPHENP (file))
{
*buf = (char *)xrealloc (*buf, bufsize + *nread);
*nread += fread (*buf + *nread, sizeof (char), bufsize, fp);
bufsize <<= 1;
fd = fileno (stdin);
inhibit_close = 1;
/* Note that we don't inhibit mmap() in this case. If stdin is
redirected from a regular file, mmap() will still work. */
}
/* #### No indication of encountered error?? */
else
fd = open (file, O_RDONLY);
if (fd < 0)
return NULL;
fm = xmalloc (sizeof (struct file_memory));
#ifdef HAVE_MMAP
{
struct stat buf;
if (fstat (fd, &buf) < 0)
goto mmap_lose;
fm->length = buf.st_size;
/* NOTE: As far as I know, the callers of this function never
modify the file text. Relying on this would enable us to
specify PROT_READ and MAP_SHARED for a marginal gain in
efficiency, but at some cost to generality. */
fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
MAP_PRIVATE, fd, 0);
if (fm->content == MAP_FAILED)
goto mmap_lose;
if (!inhibit_close)
close (fd);
fm->mmap_p = 1;
return fm;
}
mmap_lose:
/* The most common reason why mmap() fails is that FD does not point
to a plain file. However, it's also possible that mmap() doesn't
work for a particular type of file. Therefore, whenever mmap()
fails, we just fall back to the regular method. */
#endif /* HAVE_MMAP */
fm->length = 0;
size = 512; /* number of bytes fm->contents can
hold at any given time. */
fm->content = xmalloc (size);
while (1)
{
long nread;
if (fm->length > size / 2)
{
/* #### I'm not sure whether the whole exponential-growth
thing makes sense with kernel read. On Linux at least,
read() refuses to read more than 4K from a file at a
single chunk anyway. But other Unixes might optimize it
better, and it doesn't *hurt* anything, so I'm leaving
it. */
/* Normally, we grow SIZE exponentially to make the number
of calls to read() and realloc() logarithmic in relation
to file size. However, read() can read an amount of data
smaller than requested, and it would be unreasonable to
double SIZE every time *something* was read. Therefore,
we double SIZE only when the length exceeds half of the
entire allocated size. */
size <<= 1;
fm->content = xrealloc (fm->content, size);
}
nread = read (fd, fm->content + fm->length, size - fm->length);
if (nread > 0)
/* Successful read. */
fm->length += nread;
else if (nread < 0)
/* Error. */
goto lose;
else
/* EOF */
break;
}
if (!inhibit_close)
close (fd);
if (size > fm->length && fm->length != 0)
/* Due to exponential growth of fm->content, the allocated region
might be much larger than what is actually needed. */
fm->content = xrealloc (fm->content, fm->length);
fm->mmap_p = 0;
return fm;
lose:
if (!inhibit_close)
close (fd);
free (fm->content);
free (fm);
return NULL;
}
/* Release the resources held by FM. Specifically, this calls
munmap() or free() on fm->content, depending whether mmap or
malloc/read were used to read in the file. It also frees the
memory needed to hold the FM structure itself. */
/* Dispose of FM, previously returned by read_file().  Depending on
   how the contents were obtained, either munmap() or free() is used
   on fm->content; the FM structure itself is then freed.  */
void
read_file_free (struct file_memory *fm)
{
#ifdef HAVE_MMAP
  if (fm->mmap_p)
    munmap (fm->content, fm->length);
  else
    free (fm->content);
#else
  free (fm->content);
#endif
  free (fm);
}
/* Free the pointers in a NULL-terminated vector of pointers, then
free the pointer itself. */
void
@ -801,97 +927,42 @@ merge_vecs (char **v1, char **v2)
return v1;
}
/* A set of simple-minded routines to store and search for strings in
a linked list. You may add a string to the slist, and peek whether
it's still in there at any time later. */
/* A set of simple-minded routines to store strings in a linked list.
This used to also be used for searching, but now we have hash
tables for that. */
/* Add an element to the list. If flags is NOSORT, the list will not
be sorted. */
/* Append an element to the list. */
slist *
add_slist (slist *l, const char *s, int flags)
slist_append (slist *l, const char *s)
{
slist *t, *old, *beg;
int cmp;
slist *newel = (slist *)xmalloc (sizeof (slist));
slist *beg = l;
if (flags & NOSORT)
{
if (!l)
{
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
t->next = NULL;
return t;
}
beg = l;
/* Find the last element. */
while (l->next)
l = l->next;
t = (slist *)xmalloc (sizeof (slist));
l->next = t;
t->string = xstrdup (s);
t->next = NULL;
return beg;
}
/* Empty list or changing the first element. */
if (!l || (cmp = strcmp (l->string, s)) > 0)
{
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
t->next = l;
return t;
}
newel->string = xstrdup (s);
newel->next = NULL;
beg = l;
if (cmp == 0)
return beg;
/* Second two one-before-the-last element. */
if (!l)
return newel;
/* Find the last element. */
while (l->next)
{
old = l;
l = l->next;
cmp = strcmp (s, l->string);
if (cmp == 0) /* no repeating in the list */
return beg;
else if (cmp > 0)
continue;
/* If the next list element is greater than s, put s between the
current and the next list element. */
t = (slist *)xmalloc (sizeof (slist));
old->next = t;
t->next = l;
t->string = xstrdup (s);
return beg;
}
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
/* Insert the new element after the last element. */
l->next = t;
t->next = NULL;
l = l->next;
l->next = newel;
return beg;
}
/* Is there a specific entry in the list? */
int
in_slist (slist *l, const char *s)
slist_contains (slist *l, const char *s)
{
int cmp;
while (l)
{
cmp = strcmp (l->string, s);
if (cmp == 0)
return 1;
else if (cmp > 0) /* the list is ordered! */
return 0;
l = l->next;
}
for (; l; l = l->next)
if (!strcmp (l->string, s))
return 1;
return 0;
}
/* Free the whole slist. */
void
free_slist (slist *l)
slist_free (slist *l)
{
slist *n;
@ -903,6 +974,58 @@ free_slist (slist *l)
l = n;
}
}
/* Sometimes it's useful to create "sets" of strings, i.e. special
hash tables where you want to store strings as keys and merely
query for their existence. Here is a set of utility routines that
makes that transparent. */
/* Add string S to the string set HT.  The key stored in the table is
   a heap copy made with xstrdup(), so the caller keeps ownership of
   S; the copy is released by string_set_free(). */
void
string_set_add (struct hash_table *ht, const char *s)
{
  /* We use "1" as value.  It provides us a useful and clear arbitrary
     value, and it consumes no memory -- the pointers to the same
     string "1" will be shared by all the key-value pairs in the hash
     table.  Being a string literal, it must never be free()d. */
  hash_table_put (ht, xstrdup (s), "1");
}
/* Return non-zero if string S is a member of the string set HT,
   zero otherwise.  Thin wrapper over hash_table_exists(). */
int
string_set_exists (struct hash_table *ht, const char *s)
{
  return hash_table_exists (ht, s);
}
/* hash_table_map() callback for string_set_free(): release the
   xstrdup()-ed key.  The value (the shared literal "1") is not
   freed.  Returns 0 -- presumably to tell hash_table_map() to keep
   iterating; confirm against the hash_table_map() contract. */
static int
string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
{
  free (key);
  return 0;
}
/* Release a string set HT: free every key (the heap-allocated string
   copies), then destroy the table itself. */
void
string_set_free (struct hash_table *ht)
{
  hash_table_map (ht, string_set_free_mapper, NULL);
  hash_table_destroy (ht);
}
/* hash_table_map() callback for free_keys_and_values(): free() both
   the key and the value of each pair.  Returns 0 -- presumably to
   continue iteration; confirm against the hash_table_map()
   contract. */
static int
free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
{
  free (key);
  free (value);
  return 0;
}
/* Another utility function: call free() on all keys and values of HT.
   Note that unlike string_set_free(), this does *not* destroy the
   table itself -- the caller is responsible for that. */
void
free_keys_and_values (struct hash_table *ht)
{
  hash_table_map (ht, free_keys_and_values_mapper, NULL);
}
/* Engine for legible and legible_long_long; this function works on
strings. */

View File

@ -20,11 +20,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifndef UTILS_H
#define UTILS_H
/* Flags for slist. */
enum {
NOSORT = 1
};
enum accd {
ALLABS = 1
};
@ -36,6 +31,14 @@ typedef struct _slist
struct _slist *next;
} slist;
struct hash_table;
struct file_memory {
char *content;
long length;
int mmap_p;
};
char *time_str PARAMS ((time_t *));
const char *uerrmsg PARAMS ((uerr_t));
@ -58,13 +61,19 @@ int accdir PARAMS ((const char *s, enum accd));
char *suffix PARAMS ((const char *s));
char *read_whole_line PARAMS ((FILE *));
void load_file PARAMS ((FILE *, char **, long *));
struct file_memory *read_file PARAMS ((const char *));
void read_file_free PARAMS ((struct file_memory *));
void free_vec PARAMS ((char **));
char **merge_vecs PARAMS ((char **, char **));
slist *add_slist PARAMS ((slist *, const char *, int));
int in_slist PARAMS ((slist *, const char *));
void free_slist PARAMS ((slist *));
slist *slist_append PARAMS ((slist *, const char *));
int slist_contains PARAMS ((slist *, const char *));
void slist_free PARAMS ((slist *));
void string_set_add PARAMS ((struct hash_table *, const char *));
int string_set_exists PARAMS ((struct hash_table *, const char *));
void string_set_free PARAMS ((struct hash_table *));
void free_keys_and_values PARAMS ((struct hash_table *));
char *legible PARAMS ((long));
char *legible_very_long PARAMS ((VERY_LONG_TYPE));

View File

@ -71,7 +71,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* Print X if debugging is enabled; a no-op otherwise. */
#ifdef DEBUG
# define DEBUGP(x) do { debug_logprintf x; } while (0)
# define DEBUGP(x) do { if (opt.debug) { debug_logprintf x; } } while (0)
#else /* not DEBUG */
# define DEBUGP(x) DO_NOTHING
#endif /* not DEBUG */