diff --git a/ChangeLog b/ChangeLog index 46905616..e873628f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com> + + * configure.in: Test for MMAP. + 2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com> * windows/config.h.ms: snprintf and vsnprintf exist under Windows. diff --git a/TODO b/TODO index d8319039..c0fecefc 100644 --- a/TODO +++ b/TODO @@ -49,15 +49,6 @@ changes. * Make `-k' check for files that were downloaded in the past and convert links to them in newly-downloaded documents. -* -k should convert relative references to absolute if not downloaded. - -* -k should convert "hostless absolute" URLs, like <A HREF="/index.html">. - However, Brian McMahon <bm@iucr.org> wants the old incorrect behavior to still - be available as an option, as he depends on it to allow mirrors of his site to - send CGI queries to his original site, but still get graphics off of the - mirror site. Perhaps this would be better dealt with by adding an option to - tell -k not to convert certain URL patterns? - * Add option to clobber existing file names (no `.N' suffixes). * Introduce a concept of "boolean" options. For instance, every @@ -85,9 +76,6 @@ changes. * Allow size limit to files (perhaps with an option to download oversize files up through the limit or not at all, to get more functionality than [u]limit. -* Recognize HTML comments correctly. Add more options for handling - bogus HTML found all over the 'net. - * Implement breadth-first retrieval. * Download to .in* when mirroring. diff --git a/configure b/configure index f7e130a2..de78c984 100755 --- a/configure +++ b/configure @@ -2040,15 +2040,55 @@ EOF fi -for ac_func in strdup strstr strcasecmp strncasecmp +for ac_hdr in unistd.h +do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... 
$ac_c" 1>&6 +echo "configure:2048: checking for $ac_hdr" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext <<EOF +#line 2053 "configure" +#include "confdefs.h" +#include <$ac_hdr> +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'` + cat >> confdefs.h <<EOF +#define $ac_tr_hdr 1 +EOF + +else + echo "$ac_t""no" 1>&6 +fi +done + +for ac_func in getpagesize do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2047: checking for $ac_func" >&5 +echo "configure:2087: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2052 "configure" +#line 2092 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. 
*/ @@ -2071,7 +2111,233 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <<EOF +#define $ac_tr_func 1 +EOF + +else + echo "$ac_t""no" 1>&6 +fi +done + +echo $ac_n "checking for working mmap""... $ac_c" 1>&6 +echo "configure:2140: checking for working mmap" >&5 +if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_func_mmap_fixed_mapped=no +else + cat > conftest.$ac_ext <<EOF +#line 2148 "configure" +#include "confdefs.h" + +/* Thanks to Mike Haertel and Jim Avera for this test. + Here is a matrix of mmap possibilities: + mmap private not fixed + mmap private fixed at somewhere currently unmapped + mmap private fixed at somewhere already mapped + mmap shared not fixed + mmap shared fixed at somewhere currently unmapped + mmap shared fixed at somewhere already mapped + For private mappings, we should verify that changes cannot be read() + back from the file, nor mmap's back from the file at a different + address. (There have been systems where private was not correctly + implemented like the infamous i386 svr4.0, and systems where the + VM page cache was not coherent with the filesystem buffer cache + like early versions of FreeBSD and possibly contemporary NetBSD.) 
+ For shared mappings, we should conversely verify that changes get + propagated back to all the places they're supposed to be. + + Grep wants private fixed already mapped. + The main things grep needs to know about mmap are: + * does it exist and is it safe to write into the mmap'd area + * how to use it (BSD variants) */ +#include <sys/types.h> +#include <fcntl.h> +#include <sys/mman.h> + +/* This mess was copied from the GNU getpagesize.h. */ +#ifndef HAVE_GETPAGESIZE +# ifdef HAVE_UNISTD_H +# include <unistd.h> +# endif + +/* Assume that all systems that can run configure have sys/param.h. */ +# ifndef HAVE_SYS_PARAM_H +# define HAVE_SYS_PARAM_H 1 +# endif + +# ifdef _SC_PAGESIZE +# define getpagesize() sysconf(_SC_PAGESIZE) +# else /* no _SC_PAGESIZE */ +# ifdef HAVE_SYS_PARAM_H +# include <sys/param.h> +# ifdef EXEC_PAGESIZE +# define getpagesize() EXEC_PAGESIZE +# else /* no EXEC_PAGESIZE */ +# ifdef NBPG +# define getpagesize() NBPG * CLSIZE +# ifndef CLSIZE +# define CLSIZE 1 +# endif /* no CLSIZE */ +# else /* no NBPG */ +# ifdef NBPC +# define getpagesize() NBPC +# else /* no NBPC */ +# ifdef PAGESIZE +# define getpagesize() PAGESIZE +# endif /* PAGESIZE */ +# endif /* no NBPC */ +# endif /* no NBPG */ +# endif /* no EXEC_PAGESIZE */ +# else /* no HAVE_SYS_PARAM_H */ +# define getpagesize() 8192 /* punt totally */ +# endif /* no HAVE_SYS_PARAM_H */ +# endif /* no _SC_PAGESIZE */ + +#endif /* no HAVE_GETPAGESIZE */ + +#ifdef __cplusplus +extern "C" { void *malloc(unsigned); } +#else +char *malloc(); +#endif + +int +main() +{ + char *data, *data2, *data3; + int i, pagesize; + int fd; + + pagesize = getpagesize(); + + /* + * First, make a file with some known garbage in it. 
+ */ + data = malloc(pagesize); + if (!data) + exit(1); + for (i = 0; i < pagesize; ++i) + *(data + i) = rand(); + umask(0); + fd = creat("conftestmmap", 0600); + if (fd < 0) + exit(1); + if (write(fd, data, pagesize) != pagesize) + exit(1); + close(fd); + + /* + * Next, try to mmap the file at a fixed address which + * already has something else allocated at it. If we can, + * also make sure that we see the same garbage. + */ + fd = open("conftestmmap", O_RDWR); + if (fd < 0) + exit(1); + data2 = malloc(2 * pagesize); + if (!data2) + exit(1); + data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1); + if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FIXED, fd, 0L)) + exit(1); + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data2 + i)) + exit(1); + + /* + * Finally, make sure that changes to the mapped area + * do not percolate back to the file as seen by read(). + * (This is a bug on some variants of i386 svr4.0.) + */ + for (i = 0; i < pagesize; ++i) + *(data2 + i) = *(data2 + i) + 1; + data3 = malloc(pagesize); + if (!data3) + exit(1); + if (read(fd, data3, pagesize) != pagesize) + exit(1); + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data3 + i)) + exit(1); + close(fd); + unlink("conftestmmap"); + exit(0); +} + +EOF +if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_func_mmap_fixed_mapped=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_func_mmap_fixed_mapped=no +fi +rm -fr conftest* +fi + +fi + +echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6 +if test $ac_cv_func_mmap_fixed_mapped = yes; then + cat >> confdefs.h <<\EOF +#define HAVE_MMAP 1 +EOF + +fi + +for ac_func in strdup strstr strcasecmp strncasecmp +do +echo $ac_n "checking for $ac_func""... 
$ac_c" 1>&6 +echo "configure:2313: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext <<EOF +#line 2318 "configure" +#include "confdefs.h" +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $ac_func(); below. */ +#include <assert.h> +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2098,12 +2364,12 @@ done for ac_func in gettimeofday mktime strptime do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2102: checking for $ac_func" >&5 +echo "configure:2368: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2107 "configure" +#line 2373 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. 
*/ @@ -2126,7 +2392,7 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2130: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2153,12 +2419,12 @@ done for ac_func in strerror snprintf vsnprintf select signal symlink access isatty do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2157: checking for $ac_func" >&5 +echo "configure:2423: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2162 "configure" +#line 2428 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. */ @@ -2181,7 +2447,7 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2185: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2208,12 +2474,12 @@ done for ac_func in uname gethostname do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2212: checking for $ac_func" >&5 +echo "configure:2478: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2217 "configure" +#line 2483 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. 
*/ @@ -2236,7 +2502,7 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2264,12 +2530,12 @@ done for ac_func in gethostbyname do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2268: checking for $ac_func" >&5 +echo "configure:2534: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2273 "configure" +#line 2539 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. */ @@ -2292,7 +2558,7 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2314,7 +2580,7 @@ EOF else echo "$ac_t""no" 1>&6 echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6 -echo "configure:2318: checking for gethostbyname in -lnsl" >&5 +echo "configure:2584: checking for gethostbyname in -lnsl" >&5 ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2322,7 +2588,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lnsl $LIBS" cat > conftest.$ac_ext <<EOF -#line 2326 "configure" +#line 2592 "configure" #include "confdefs.h" /* Override any gcc2 internal prototype to avoid an error. 
*/ /* We use char because int might match the return type of a gcc2 @@ -2333,7 +2599,7 @@ int main() { gethostbyname() ; return 0; } EOF -if { (eval echo configure:2337: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2367,7 +2633,7 @@ done echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6 -echo "configure:2371: checking for socket in -lsocket" >&5 +echo "configure:2637: checking for socket in -lsocket" >&5 ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2375,7 +2641,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lsocket $LIBS" cat > conftest.$ac_ext <<EOF -#line 2379 "configure" +#line 2645 "configure" #include "confdefs.h" /* Override any gcc2 internal prototype to avoid an error. */ /* We use char because int might match the return type of a gcc2 @@ -2386,7 +2652,7 @@ int main() { socket() ; return 0; } EOF -if { (eval echo configure:2390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2417,7 +2683,7 @@ fi if test "x${with_socks}" = xyes then echo $ac_n "checking for main in -lresolv""... 
$ac_c" 1>&6 -echo "configure:2421: checking for main in -lresolv" >&5 +echo "configure:2687: checking for main in -lresolv" >&5 ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2425,14 +2691,14 @@ else ac_save_LIBS="$LIBS" LIBS="-lresolv $LIBS" cat > conftest.$ac_ext <<EOF -#line 2429 "configure" +#line 2695 "configure" #include "confdefs.h" int main() { main() ; return 0; } EOF -if { (eval echo configure:2436: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2460,7 +2726,7 @@ else fi echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6 -echo "configure:2464: checking for Rconnect in -lsocks" >&5 +echo "configure:2730: checking for Rconnect in -lsocks" >&5 ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2468,7 +2734,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lsocks $LIBS" cat > conftest.$ac_ext <<EOF -#line 2472 "configure" +#line 2738 "configure" #include "confdefs.h" /* Override any gcc2 internal prototype to avoid an error. */ /* We use char because int might match the return type of a gcc2 @@ -2479,7 +2745,7 @@ int main() { Rconnect() ; return 0; } EOF -if { (eval echo configure:2483: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2511,7 +2777,7 @@ fi ALL_LINGUAS="cs de hr it no pl pt_BR ru" echo $ac_n "checking whether NLS is requested""... 
$ac_c" 1>&6 -echo "configure:2515: checking whether NLS is requested" >&5 +echo "configure:2781: checking whether NLS is requested" >&5 # Check whether --enable-nls or --disable-nls was given. if test "${enable_nls+set}" = set; then enableval="$enable_nls" @@ -2528,7 +2794,7 @@ fi # Extract the first word of "msgfmt", so it can be a program name with args. set dummy msgfmt; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2532: checking for $ac_word" >&5 +echo "configure:2798: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2562,7 +2828,7 @@ fi # Extract the first word of "xgettext", so it can be a program name with args. set dummy xgettext; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2566: checking for $ac_word" >&5 +echo "configure:2832: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2597,7 +2863,7 @@ fi # Extract the first word of "gmsgfmt", so it can be a program name with args. set dummy gmsgfmt; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2601: checking for $ac_word" >&5 +echo "configure:2867: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2647,17 +2913,17 @@ fi do ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` echo $ac_n "checking for $ac_hdr""... 
$ac_c" 1>&6 -echo "configure:2651: checking for $ac_hdr" >&5 +echo "configure:2917: checking for $ac_hdr" >&5 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2656 "configure" +#line 2922 "configure" #include "confdefs.h" #include <$ac_hdr> EOF ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" -{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` if test -z "$ac_err"; then rm -rf conftest* @@ -2687,12 +2953,12 @@ done for ac_func in gettext do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2691: checking for $ac_func" >&5 +echo "configure:2957: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <<EOF -#line 2696 "configure" +#line 2962 "configure" #include "confdefs.h" /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func(); below. */ @@ -2715,7 +2981,7 @@ $ac_func(); ; return 0; } EOF -if { (eval echo configure:2719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2737,7 +3003,7 @@ EOF else echo "$ac_t""no" 1>&6 echo $ac_n "checking for gettext in -lintl""... 
$ac_c" 1>&6 -echo "configure:2741: checking for gettext in -lintl" >&5 +echo "configure:3007: checking for gettext in -lintl" >&5 ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2745,7 +3011,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lintl $LIBS" cat > conftest.$ac_ext <<EOF -#line 2749 "configure" +#line 3015 "configure" #include "confdefs.h" /* Override any gcc2 internal prototype to avoid an error. */ /* We use char because int might match the return type of a gcc2 @@ -2756,7 +3022,7 @@ int main() { gettext() ; return 0; } EOF -if { (eval echo configure:2760: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2824,7 +3090,7 @@ do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2828: checking for $ac_word" >&5 +echo "configure:3094: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else diff --git a/configure.in b/configure.in index 474e5d57..4f4440b0 100644 --- a/configure.in +++ b/configure.in @@ -160,6 +160,7 @@ dnl dnl Checks for library functions. dnl AC_FUNC_ALLOCA +AC_FUNC_MMAP AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp) AC_CHECK_FUNCS(gettimeofday mktime strptime) AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty) diff --git a/doc/ChangeLog b/doc/ChangeLog index bddc4b5d..110102a6 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,8 @@ +2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com> + + * wget.texi (Robots): Document that we now support the meta tag + exclusion. 
+ 2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com> * wget.texi: Use --- consistently. diff --git a/doc/wget.texi b/doc/wget.texi index eb4d00c9..1accbb94 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -2548,8 +2548,8 @@ this: This is explained in some detail at @url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}. -Unfortunately, Wget does not support this method of robot exclusion yet, -but it will be implemented in the next release. +Wget supports this method of robot exclusion in addition to the usual +@file{/robots.txt} exclusion. @node Security Considerations, Contributors, Robots, Appendices @section Security Considerations diff --git a/po/cs.gmo b/po/cs.gmo index 5511ce5c..c276436c 100644 Binary files a/po/cs.gmo and b/po/cs.gmo differ diff --git a/po/de.gmo b/po/de.gmo index 6dc32778..63f01f82 100644 Binary files a/po/de.gmo and b/po/de.gmo differ diff --git a/po/hr.gmo b/po/hr.gmo index 039855ad..70c9d001 100644 Binary files a/po/hr.gmo and b/po/hr.gmo differ diff --git a/po/it.gmo b/po/it.gmo index 6e9269b1..f70efae4 100644 Binary files a/po/it.gmo and b/po/it.gmo differ diff --git a/po/no.gmo b/po/no.gmo index a150d4e0..c54d1a07 100644 Binary files a/po/no.gmo and b/po/no.gmo differ diff --git a/po/pl.gmo b/po/pl.gmo index 6308a0e4..96e50064 100644 Binary files a/po/pl.gmo and b/po/pl.gmo differ diff --git a/po/pt_BR.gmo b/po/pt_BR.gmo index 917f90f8..447bdf17 100644 Binary files a/po/pt_BR.gmo and b/po/pt_BR.gmo differ diff --git a/po/ru.gmo b/po/ru.gmo index 6019b4af..df7cd87d 100644 Binary files a/po/ru.gmo and b/po/ru.gmo differ diff --git a/src/ChangeLog b/src/ChangeLog index 29f99623..acbc0331 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,117 @@ +2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com> + + * retr.c (get_contents): If use_expected, make sure that the + appropriate amount of data is being read. + + * http.c (gethttp): Check for both `Keep-Alive: ...' and + `Connection: Keep-Alive'. 
+ + * wget.h (DEBUGP): Call debug_logprintf only if opt.debug is + turned on. + +2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com> + + * http.c (connection_available_p): Use it. + + * connect.c (test_socket_open): New function. + + * http.c (gethttp): Support persistent connections. Based on the + ideas, and partly on code, by Sam Horrocks <sam@daemoninc.com>. + (register_persistent): New function. + (connection_available_p): Ditto. + (invalidate_connection): Ditto. + +2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com> + + * url.c (convert_links): Handle UREL2ABS case. + + * recur.c (recursive_retrieve): Instead of the list + urls_downloaded, use hash tables dl_file_url_map and + dl_url_file_map. + (convert_all_links): Use them to retrieve data. + + * host.c (clean_hosts): Free the hash tables. + + * main.c (private_initialize): Call host_init(). + + * host.c (store_hostaddress): Use a saner, hash table-based data + model. + (realhost): Ditto. + (host_init): Initialize the hash tables. + +2000-11-18 Hrvoje Niksic <hniksic@arsdigita.com> + + * utils.c (slist_append): Eviscerate NOSORT. Hash tables are now + used for what the sorted slists used to be used for. + (slist_contains): Don't rely on the list being sorted. + (slist_append): Simplify the code. + + * recur.c (recursive_cleanup): Use free_string_set. + + * utils.c (string_set_add, string_set_exists, string_set_free): + New functions for easier freeing of hash tables whose keys are + strdup'ed strings. + + * recur.c (recursive_retrieve): Use the hash table functions for + storing undesirable URLs. + + * hash.c: New file. + +2000-11-17 Hrvoje Niksic <hniksic@arsdigita.com> + + * main.c (private_initialize): Call url_init. + (main): Call private_initialize. + + * url.c (unsafe_char_table): New table. + (UNSAFE_CHAR): Use it. + (init_unsafe_char_table): New function. + (url_init): New function; call init_unsafe_char_table. 
+ +2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com> + + * html-url.c (handle_link): Handle HTML fragment identifiers. + + * recur.c (recursive_retrieve): If norobot info is respected and + the file is specified not to be followed by robots, respect that. + + * html-url.c (collect_tags_mapper): Handle <meta name=robots + content=X>. For us the important cases are where X is NONE or + where X contains NOFOLLOW. + (get_urls_html): Propagate that information to the caller. + +2000-11-13 Hrvoje Niksic <hniksic@arsdigita.com> + + * url.c (convert_links): Unlink the file we might be reading from + before writing to it. + (convert_links): Use alloca instead of malloc for + filename_plus_orig_suffix. + +2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com> + + * url.c (get_urls_file): Ditto. + (convert_links): Ditto. + + * html-url.c (get_urls_html): Use read_file() instead of + load_file(). + + * utils.c (read_file): New function, instead of the old + load_file(). + (read_file_free): Ditto. + + * url.c (findurl): Search only for the supported protocols. + (convert_links): Use fwrite() when writing out a region of + characters. + +2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com> + + * ftp-ls.c: Move html_quote_string and ftp_index here. + + * url.c: Remove get_urls_html, since that's now in html-url.c. + + * html-url.c: New file. + + * html-parse.c: New file. 
+ 2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com> * mswindows.h: Define snprintf and vsnprintf to _snprintf and diff --git a/src/Makefile.in b/src/Makefile.in index e3b433b0..bfe9868d 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -57,9 +57,10 @@ MD5_OBJ = @MD5_OBJ@ OPIE_OBJ = @OPIE_OBJ@ OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \ - ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \ - http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \ - recur$o retr$o snprintf$o url$o utils$o version$o + ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \ + html-parse$o html-url$o http$o init$o log$o main$o \ + $(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \ + url$o utils$o version$o .SUFFIXES: .SUFFIXES: .c .o ._c ._o @@ -133,26 +134,31 @@ TAGS: *.c *.h # DO NOT DELETE THIS LINE -- make depend depends on it. -cmpt$o: config.h wget.h sysdep.h options.h -connect$o: config.h wget.h sysdep.h options.h connect.h host.h -fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h -ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h -ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h -ftp-opie$o: config.h wget.h sysdep.h options.h md5.h -ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h -getopt$o: wget.h sysdep.h options.h -headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h -host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h -html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h -http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h -init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h -log$o: config.h wget.h sysdep.h options.h utils.h -main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h -md5$o: wget.h sysdep.h options.h md5.h -mswindows$o: config.h 
winsock.h wget.h sysdep.h options.h url.h -netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h -rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h -recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h -retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h -url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h -utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h +cmpt$o: wget.h +connect$o: wget.h connect.h host.h +fnmatch$o: wget.h fnmatch.h +ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h +ftp-ls$o: wget.h utils.h ftp.h url.h +ftp-opie$o: wget.h md5.h +ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h +getopt$o: wget.h getopt.h +hash$o: wget.h utils.h hash.h +headers$o: wget.h connect.h rbuf.h headers.h +host$o: wget.h utils.h host.h url.h hash.h +html-parse$o: wget.h html-parse.h +html-url$o: wget.h html-parse.h url.h utils.h +html$o: wget.h url.h utils.h ftp.h +http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h +init$o: wget.h utils.h init.h host.h recur.h netrc.h +log$o: wget.h utils.h +main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h +md5$o: wget.h md5.h +mswindows$o: wget.h url.h +netrc$o: wget.h utils.h netrc.h init.h +rbuf$o: wget.h rbuf.h connect.h +recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h +retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h +snprintf$o: +url$o: wget.h utils.h url.h host.h +utils$o: wget.h utils.h fnmatch.h hash.h +version$o: diff --git a/src/config.h.in b/src/config.h.in index 2038acde..ed200e32 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -101,6 +101,9 @@ char *alloca (); /* Define if you have the uname function. */ #undef HAVE_UNAME +/* Define if you have a working version of mmap. 
*/ +#undef HAVE_MMAP + /* Define if you have the gethostname function. */ #undef HAVE_GETHOSTNAME diff --git a/src/connect.c b/src/connect.c index 28ce2043..feb2bb52 100644 --- a/src/connect.c +++ b/src/connect.c @@ -107,6 +107,37 @@ make_connection (int *sock, char *hostname, unsigned short port) return NOCONERROR; } +int +test_socket_open (int sock) +{ +#ifdef HAVE_SELECT + fd_set check_set; + struct timeval to; + + /* Check if we still have a valid (non-EOF) connection. From Andrew + * Maholski's code in the Unix Socket FAQ. */ + + FD_ZERO (&check_set); + FD_SET (sock, &check_set); + + /* Wait one microsecond */ + to.tv_sec = 0; + to.tv_usec = 1; + + /* If we get a timeout, then that means still connected */ + if (select (sock + 1, &check_set, NULL, NULL, &to) == 0) + { + /* Connection is valid (not EOF), so continue */ + return 1; + } + else + return 0; +#else + /* Without select, it's hard to know for sure. */ + return 1; +#endif +} + /* Bind the local port PORT. This does all the necessary work, which is creating a socket, setting SO_REUSEADDR option on it, then calling bind() and listen(). If *PORT is 0, a random port is diff --git a/src/ftp-ls.c b/src/ftp-ls.c index 16a7f7d6..884cf3d8 100644 --- a/src/ftp-ls.c +++ b/src/ftp-ls.c @@ -36,6 +36,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "wget.h" #include "utils.h" #include "ftp.h" +#include "url.h" /* Converts symbolic permissions to number-style ones, e.g. string rwxr-xr-x to 755. For now, it knows nothing of @@ -388,3 +389,175 @@ ftp_parse_ls (const char *file) { return ftp_parse_unix_ls (file); } + +/* Stuff for creating FTP index. */ + +/* The function returns the pointer to the malloc-ed quoted version of + string s. It will recognize and quote numeric and special graphic + entities, as per RFC1866: + + `&' -> `&amp;' + `<' -> `&lt;' + `>' -> `&gt;' + `"' -> `&quot;' + + No other entities are recognized or replaced. 
*/ +static char * +html_quote_string (const char *s) +{ + const char *b = s; + char *p, *res; + int i; + + /* Pass through the string, and count the new size. */ + for (i = 0; *s; s++, i++) + { + if (*s == '&') + i += 4; /* `amp;' */ + else if (*s == '<' || *s == '>') + i += 3; /* `lt;' and `gt;' */ + else if (*s == '\"') + i += 5; /* `quot;' */ + } + res = (char *)xmalloc (i + 1); + s = b; + for (p = res; *s; s++) + { + switch (*s) + { + case '&': + *p++ = '&'; + *p++ = 'a'; + *p++ = 'm'; + *p++ = 'p'; + *p++ = ';'; + break; + case '<': case '>': + *p++ = '&'; + *p++ = (*s == '<' ? 'l' : 'g'); + *p++ = 't'; + *p++ = ';'; + break; + case '\"': + *p++ = '&'; + *p++ = 'q'; + *p++ = 'u'; + *p++ = 'o'; + *p++ = 't'; + *p++ = ';'; + break; + default: + *p++ = *s; + } + } + *p = '\0'; + return res; +} + +/* The function creates an HTML index containing references to given + directories and files on the appropriate host. The references are + FTP. */ +uerr_t +ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f) +{ + FILE *fp; + char *upwd; + char *htclfile; /* HTML-clean file name */ + + if (!opt.dfp) + { + fp = fopen (file, "wb"); + if (!fp) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return FOPENERR; + } + } + else + fp = opt.dfp; + if (u->user) + { + char *tmpu, *tmpp; /* temporary, clean user and passwd */ + + tmpu = CLEANDUP (u->user); + tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL; + upwd = (char *)xmalloc (strlen (tmpu) + + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); + sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? 
tmpp : ""); + free (tmpu); + FREE_MAYBE (tmpp); + } + else + upwd = xstrdup (""); + fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n"); + fprintf (fp, "<html>\n<head>\n<title>"); + fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); + fprintf (fp, "</title>\n</head>\n<body>\n<h1>"); + fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); + fprintf (fp, "</h1>\n<hr>\n<pre>\n"); + while (f) + { + fprintf (fp, " "); + if (f->tstamp != -1) + { + /* #### Should we translate the months? */ + static char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + struct tm *ptm = localtime ((time_t *)&f->tstamp); + + fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon], + ptm->tm_mday); + if (ptm->tm_hour) + fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min); + else + fprintf (fp, " "); + } + else + fprintf (fp, _("time unknown ")); + switch (f->type) + { + case FT_PLAINFILE: + fprintf (fp, _("File ")); + break; + case FT_DIRECTORY: + fprintf (fp, _("Directory ")); + break; + case FT_SYMLINK: + fprintf (fp, _("Link ")); + break; + default: + fprintf (fp, _("Not sure ")); + break; + } + htclfile = html_quote_string (f->name); + fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port); + if (*u->dir != '/') + putc ('/', fp); + fprintf (fp, "%s", u->dir); + if (*u->dir) + putc ('/', fp); + fprintf (fp, "%s", htclfile); + if (f->type == FT_DIRECTORY) + putc ('/', fp); + fprintf (fp, "\">%s", htclfile); + if (f->type == FT_DIRECTORY) + putc ('/', fp); + fprintf (fp, "</a> "); + if (f->type == FT_PLAINFILE) + fprintf (fp, _(" (%s bytes)"), legible (f->size)); + else if (f->type == FT_SYMLINK) + fprintf (fp, "-> %s", f->linkto ? 
f->linkto : "(nil)"); + putc ('\n', fp); + free (htclfile); + f = f->next; + } + fprintf (fp, "</pre>\n</body>\n</html>\n"); + free (upwd); + if (!opt.dfp) + fclose (fp); + else + fflush (fp); + return FTPOK; +} diff --git a/src/ftp.c b/src/ftp.c index 4c26cf7d..aa283cf8 100644 --- a/src/ftp.c +++ b/src/ftp.c @@ -40,7 +40,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "rbuf.h" #include "retr.h" #include "ftp.h" -#include "html.h" #include "connect.h" #include "host.h" #include "fnmatch.h" @@ -722,7 +721,7 @@ Error in server response, closing control connection.\n")); } reset_timer (); /* Get the contents of the document. */ - res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf); + res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0); con->dltime = elapsed_time (); tms = time_str (NULL); tmrate = rate (*len - restval, con->dltime); diff --git a/src/ftp.h b/src/ftp.h index c2e6d44c..064e6354 100644 --- a/src/ftp.h +++ b/src/ftp.h @@ -92,4 +92,6 @@ typedef struct struct fileinfo *ftp_parse_ls PARAMS ((const char *)); uerr_t ftp_loop PARAMS ((struct urlinfo *, int *)); +uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *); + #endif /* FTP_H */ diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 00000000..e54fb33a --- /dev/null +++ b/src/hash.c @@ -0,0 +1,403 @@ +/* Hash tables. + Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <stdlib.h> +#include <assert.h> + +#include "wget.h" +#include "utils.h" + +#include "hash.h" + +#ifdef STANDALONE +# define xmalloc malloc +# define xrealloc realloc +#endif + +/* This file implements simple hash tables based on linear probing. + The hash table stores key-value pairs in a contiguous array. Both + key and value are void pointers that the hash and test functions + know how to handle. + + Although Knuth & co. recommend double hashing over linear probing, + we use the latter because it accesses array elements sequentially + in case of a collision, yielding better cache behaviour and + ultimately better speed. To avoid collision problems with + linear probing, we make sure that the table grows as soon as the + fullness/size ratio exceeds 75%. */ + +struct ht_pair { + void *key; + void *value; +}; + +struct hash_table { + unsigned long (*hash_function) (const void *); + int (*test_function) (const void *, const void *); + + int size; /* size of the array */ + int fullness; /* number of non-empty fields */ + int count; /* number of non-empty, non-deleted + fields. */ + + struct ht_pair *pairs; +}; + +#define ENTRY_DELETED ((void *)0xdeadbeef) + +#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED) +#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL) + +/* Find a prime near, but greater than or equal to SIZE. 
*/ + +int +prime_size (int size) +{ + static const unsigned long primes [] = { + 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031, + 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783, + 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941, + 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519, + 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301, + 10445899, 13579681, 17653589, 22949669, 29834603, 38784989, + 50420551, 65546729, 85210757, 110774011, 144006217, 187208107, + 243370577, 316381771, 411296309, 534685237, 695090819, 903618083, + 1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL + }; + int i; + for (i = 0; i < ARRAY_SIZE (primes); i++) + if (primes[i] >= size) + return primes[i]; + /* huh? */ + return size; +} + +/* Create a hash table of INITIAL_SIZE with hash function + HASH_FUNCTION and test function TEST_FUNCTION. If you wish to + start out with a "small" table which will be regrown as needed, + specify 0 as INITIAL_SIZE. */ + +struct hash_table * +hash_table_new (int initial_size, + unsigned long (*hash_function) (const void *), + int (*test_function) (const void *, const void *)) +{ + struct hash_table *ht + = (struct hash_table *)xmalloc (sizeof (struct hash_table)); + ht->hash_function = hash_function; + ht->test_function = test_function; + ht->size = prime_size (initial_size); + ht->fullness = 0; + ht->count = 0; + ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + return ht; +} + +/* Free the data associated with hash table HT. */ + +void +hash_table_destroy (struct hash_table *ht) +{ + free (ht->pairs); + free (ht); +} + +/* Get the value that corresponds to the key KEY in the hash table HT. + If no value is found, return NULL. 
Note that NULL is a legal value + for value; if you are storing NULLs in your hash table, you can use + hash_table_exists to be sure that a (possibly NULL) value exists in + the table. */ + +void * +hash_table_get (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return NULL; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + return the_pair->value; + } +} + +/* Return 1 if KEY exists in HT, 0 otherwise. */ + +int +hash_table_exists (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return 0; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + return 1; + } +} + +#define MAX(i, j) (((i) >= (j)) ? (i) : (j)) + +/* Grow hash table HT as necessary, and rehash all the key-value + pairs. */ + +static void +grow_hash_table (struct hash_table *ht) +{ + int i; + struct ht_pair *old_pairs = ht->pairs; + int old_count = ht->count; /* for assert() below */ + int old_size = ht->size; + + /* Normally, the idea is to double ht->size (and round it to next + prime) on each regrow: + + ht->size = prime_size (ht->size * 2); + + But it is possible that the table has large fullness because of + the many deleted entries. If that is the case, we don't want to + blindly grow the table; we just want to rehash it. For that + reason, we use ht->count as the relevant parameter. MAX is used + only because we don't want to actually shrink the table. (But + maybe that's wrong.) 
*/ + + int needed_size = prime_size (ht->count * 2); + ht->size = MAX (old_size, needed_size); + + ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + + /* Need to reset these two; hash_table_put will reinitialize them. */ + ht->fullness = 0; + ht->count = 0; + for (i = 0; i < old_size; i++) + { + struct ht_pair *the_pair = old_pairs + i; + if (!EMPTY_ENTRY_P (the_pair->key) + && !DELETED_ENTRY_P (the_pair->key)) + hash_table_put (ht, the_pair->key, the_pair->value); + } + assert (ht->count == old_count); + free (old_pairs); +} + +/* Put VALUE in the hash table HT under the key KEY. This regrows the + table if necessary. */ + +void +hash_table_put (struct hash_table *ht, const void *key, void *value) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + { + ++ht->fullness; + ++ht->count; + just_insert: + the_pair->key = (void *)key; /* const? */ + the_pair->value = value; + break; + } + else if (DELETED_ENTRY_P (the_pair->key)) + { + /* We're replacing a deleted entry, so ht->count gets + increased, but ht->fullness remains unchanged. */ + ++ht->count; + goto just_insert; + } + else if (ht->test_function (key, the_pair->key)) + { + /* We're replacing an existing entry, so ht->count and + ht->fullness remain unchanged. */ + goto just_insert; + } + else + { + ++location; + if (location == ht->size) + location = 0; + } + } + if (ht->fullness * 4 > ht->size * 3) + /* When fullness exceeds 75% of size, regrow the table. */ + grow_hash_table (ht); +} + +/* Remove KEY from HT. 
*/ + +int +hash_table_remove (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return 0; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + { + /* We don't really remove an entry from the hash table: we + just mark it as deleted. This is because there may be + other entries located after this entry whose hash number + points to a location before this entry. (Example: keys + A, B and C have the same hash. If you were to really + *delete* B from the table, C could no longer be found.) + + As an optimization, it might be worthwhile to check + whether the immediately preceding entry is empty and, if + so, really delete the pair (set it to empty and decrease + the fullness along with the count). I *think* it should + be safe. */ + the_pair->key = ENTRY_DELETED; + --ht->count; + return 1; + } + } +} + +void +hash_table_clear (struct hash_table *ht) +{ + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + ht->fullness = 0; + ht->count = 0; +} + +void +hash_table_map (struct hash_table *ht, + int (*mapfun) (void *, void *, void *), + void *closure) +{ + int i; + for (i = 0; i < ht->size; i++) + { + struct ht_pair *the_pair = ht->pairs + i; + if (!EMPTY_ENTRY_P (the_pair->key) + && !DELETED_ENTRY_P (the_pair->key)) + if (mapfun (the_pair->key, the_pair->value, closure)) + return; + } +} + +/* Support for hash tables whose keys are strings. */ + +/* supposedly from the Dragon Book P436. 
*/ +unsigned long +string_hash (const void *sv) +{ + unsigned int h = 0; + unsigned const char *x = (unsigned const char *) sv; + + while (*x) + { + unsigned int g; + h = (h << 4) + *x++; + if ((g = h & 0xf0000000) != 0) + h = (h ^ (g >> 24)) ^ g; + } + + return h; +} + +int +string_cmp (const void *s1, const void *s2) +{ + return !strcmp ((const char *)s1, (const char *)s2); +} + +struct hash_table * +make_string_hash_table (int initial_size) +{ + return hash_table_new (initial_size, string_hash, string_cmp); +} + + +#ifdef STANDALONE + +#include <stdio.h> +#include <string.h> + +int +print_hash_table_mapper (const void *key, void *value, void *count) +{ + ++*(int *)count; + printf ("%s: %s\n", (const char *)key, (char *)value); + return 0; +} + +void +print_hash (struct hash_table *sht) +{ + int debug_count = 0; + hash_table_map (sht, print_hash_table_mapper, &debug_count); + assert (debug_count == sht->count); +} + +int +main (void) +{ + struct hash_table *ht = make_string_hash_table (0); + char line[80]; + while ((fgets (line, sizeof (line), stdin))) + { + int len = strlen (line); + if (len <= 1) + continue; + line[--len] = '\0'; + hash_table_put (ht, strdup (line), "here I am!"); + if (len % 2) + hash_table_remove (ht, line); + } + print_hash (ht); +#if 0 + printf ("%d %d %d\n", ht->count, ht->fullness, ht->size); +#endif + return 0; +} +#endif diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 00000000..ab3136aa --- /dev/null +++ b/src/hash.h @@ -0,0 +1,50 @@ +/* Hash table declarations. + Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* From XEmacs, and hence from Dragon book. */ + +#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */ +#define HASH2(a,b) (GOOD_HASH * (a) + (b)) +#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c)) +#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d)) +#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e)) +#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f)) +#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g)) +#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h)) +#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i)) + +struct hash_table; + +struct hash_table *hash_table_new PARAMS ((int, + unsigned long (*) (const void *), + int (*) (const void *, + const void *))); +void hash_table_destroy PARAMS ((struct hash_table *)); +void *hash_table_get PARAMS ((struct hash_table *, const void *)); +int hash_table_exists PARAMS ((struct hash_table *, const void *)); +void hash_table_put PARAMS ((struct hash_table *, const void *, void *)); +int hash_table_remove PARAMS ((struct hash_table *, const void *)); +void hash_table_clear PARAMS ((struct hash_table *)); +void hash_table_map PARAMS ((struct hash_table *, + int (*) (void *, void *, void *), + void *)); + +unsigned long string_hash PARAMS ((const void *)); +int string_cmp PARAMS ((const void *, const void *)); +struct hash_table *make_string_hash_table PARAMS ((int)); diff --git a/src/headers.c b/src/headers.c index 6b1a670f..521073df 100644 --- a/src/headers.c +++ b/src/headers.c @@ 
-165,6 +165,14 @@ header_strdup (const char *header, void *closure) return 1; } +/* Write the value 1 into the integer pointed to by CLOSURE. */ +int +header_exists (const char *header, void *closure) +{ + *(int *)closure = 1; + return 1; +} + /* Skip LWS (linear white space), if present. Returns number of characters to skip. */ int diff --git a/src/headers.h b/src/headers.h index cc66e49b..5f85c6eb 100644 --- a/src/headers.h +++ b/src/headers.h @@ -31,5 +31,6 @@ int header_process PARAMS ((const char *, const char *, int header_extract_number PARAMS ((const char *, void *)); int header_strdup PARAMS ((const char *, void *)); +int header_exists PARAMS ((const char *, void *)); int skip_lws PARAMS ((const char *)); diff --git a/src/host.c b/src/host.c index 3fa1bb84..eeb4940d 100644 --- a/src/host.c +++ b/src/host.c @@ -1,5 +1,5 @@ /* Dealing with host names. - Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. This file is part of Wget. @@ -48,35 +48,38 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "utils.h" #include "host.h" #include "url.h" +#include "hash.h" #ifndef errno extern int errno; #endif -/* Host list entry */ -struct host +/* Mapping between all known hosts to their addresses (n.n.n.n). */ +struct hash_table *host_name_address_map; + +/* Mapping between all known addresses (n.n.n.n) to their hosts. This + is the inverse of host_name_address_map. These two tables share + the strdup'ed strings. */ +struct hash_table *host_address_name_map; + +/* Mapping between auxiliary (slave) and master host names. */ +struct hash_table *host_slave_master_map; + +/* Utility function: like xstrdup(), but also lowercases S. */ + +static char * +xstrdup_lower (const char *s) { - /* Host's symbolical name, as encountered at the time of first - inclusion, e.g. "fly.cc.fer.hr". */ - char *hostname; - /* Host's "real" name, i.e. 
its IP address, written out in ASCII - form of N.N.N.N, e.g. "161.53.70.130". */ - char *realname; - /* More than one HOSTNAME can correspond to the same REALNAME. For - our purposes, the canonical name of the host is its HOSTNAME when - it was first encountered. This entry is said to have QUALITY. */ - int quality; - /* Next entry in the list. */ - struct host *next; -}; - -static struct host *hlist; - -static struct host *add_hlist PARAMS ((struct host *, const char *, - const char *, int)); + char *copy = xstrdup (s); + char *p = copy; + for (; *p; p++) + *p = TOLOWER (*p); + return copy; +} /* The same as gethostbyname, but supports internet addresses of the - form `N.N.N.N'. */ + form `N.N.N.N'. On some systems gethostbyname() knows how to do + this automatically. */ struct hostent * ngethostbyname (const char *name) { @@ -91,42 +94,51 @@ ngethostbyname (const char *name) return hp; } -/* Search for HOST in the linked list L, by hostname. Return the - entry, if found, or NULL. The search is case-insensitive. */ -static struct host * -search_host (struct host *l, const char *host) -{ - for (; l; l = l->next) - if (strcasecmp (l->hostname, host) == 0) - return l; - return NULL; -} +/* Add host name HOST with the address ADDR_TEXT to the cache. + Normally this means that the (HOST, ADDR_TEXT) pair will be added to + host_name_address_map and to host_address_name_map. (It is the + caller's responsibility to make sure that HOST is not already in + host_name_address_map.) -/* Like search_host, but searches by address. */ -static struct host * -search_address (struct host *l, const char *address) + If the ADDR_TEXT has already been seen and belongs to another host, + HOST will be added to host_slave_master_map instead. 
*/ + +static void +add_host_to_cache (const char *host, const char *addr_text) { - for (; l; l = l->next) + char *canonical_name = hash_table_get (host_address_name_map, addr_text); + if (canonical_name) { - int cmp = strcmp (l->realname, address); - if (cmp == 0) - return l; - else if (cmp > 0) - return NULL; + DEBUGP (("Mapping %s to %s in host_slave_master_map.\n", + host, canonical_name)); + /* We've already dealt with that host under another name. */ + hash_table_put (host_slave_master_map, + xstrdup_lower (host), + xstrdup_lower (canonical_name)); + } + else + { + /* This is really the first time we're dealing with that host. */ + char *h_copy = xstrdup_lower (host); + char *a_copy = xstrdup (addr_text); + DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy)); + hash_table_put (host_name_address_map, h_copy, a_copy); + hash_table_put (host_address_name_map, a_copy, h_copy); } - return NULL; } -/* Store the address of HOSTNAME, internet-style, to WHERE. First - check for it in the host list, and (if not found), use - ngethostbyname to get it. +/* Store the address of HOSTNAME, internet-style (four octets in + network order), to WHERE. First try to get the address from the + cache; if it is not available, call the DNS functions and update + the cache. Return 1 on successful finding of the hostname, 0 otherwise. */ int store_hostaddress (unsigned char *where, const char *hostname) { - struct host *t; unsigned long addr; + char *addr_text; + char *canonical_name; struct hostent *hptr; struct in_addr in; char *inet_s; @@ -134,178 +146,119 @@ store_hostaddress (unsigned char *where, const char *hostname) /* If the address is of the form d.d.d.d, there will be no trouble with it. */ addr = (unsigned long)inet_addr (hostname); - if ((int)addr == -1) - { - /* If it is not of that form, try to find it in the cache. */ - t = search_host (hlist, hostname); - if (t) - addr = (unsigned long)inet_addr (t->realname); - } /* If we have the numeric address, just store it. 
*/ if ((int)addr != -1) { - /* ADDR is in network byte order, meaning the code works on - little and big endian 32-bit architectures without change. - On big endian 64-bit architectures we need to be careful to - copy the correct four bytes. */ - int offset = 0; + /* ADDR is defined to be in network byte order, meaning the code + works on little and big endian 32-bit architectures without + change. On big endian 64-bit architectures we need to be + careful to copy the correct four bytes. */ + int offset; + have_addr: #ifdef WORDS_BIGENDIAN offset = sizeof (unsigned long) - 4; +#else + offset = 0; #endif memcpy (where, (char *)&addr + offset, 4); return 1; } + + /* By now we know that the address is not of the form d.d.d.d. Try + to find it in our cache of host addresses. */ + addr_text = hash_table_get (host_name_address_map, hostname); + if (addr_text) + { + DEBUGP (("Found %s in host_name_address_map: %s\n", + hostname, addr_text)); + addr = (unsigned long)inet_addr (addr_text); + goto have_addr; + } + + /* Maybe this host is known to us under another name. If so, we'll + find it in host_slave_master_map, and use the master name to find + its address in host_name_address_map. */ + canonical_name = hash_table_get (host_slave_master_map, hostname); + if (canonical_name) + { + addr_text = hash_table_get (host_name_address_map, canonical_name); + assert (addr_text != NULL); + DEBUGP (("Found %s as slave of %s -> %s\n", + hostname, canonical_name, addr_text)); + addr = (unsigned long)inet_addr (addr_text); + goto have_addr; + } + /* Since all else has failed, let's try gethostbyname(). Note that we use gethostbyname() rather than ngethostbyname(), because we - *know* the address is not numerical. */ + already know that the address is not numerical. */ hptr = gethostbyname (hostname); if (!hptr) return 0; /* Copy the address of the host to socket description. 
*/ memcpy (where, hptr->h_addr_list[0], hptr->h_length); - /* Now that we're here, we could as well cache the hostname for - future use, as in realhost(). First, we have to look for it by - address to know if it's already in the cache by another name. */ + assert (hptr->h_length == 4); + /* Now that we've gone through the trouble of calling + gethostbyname(), we can store this valuable information to the + cache. First, we have to look for it by address to know if it's + already in the cache by another name. */ /* Originally, we copied to in.s_addr, but it appears to be missing on some systems. */ memcpy (&in, *hptr->h_addr_list, sizeof (in)); - STRDUP_ALLOCA (inet_s, inet_ntoa (in)); - t = search_address (hlist, inet_s); - if (t) /* Found in the list, as realname. */ - { - /* Set the default, 0 quality. */ - hlist = add_hlist (hlist, hostname, inet_s, 0); - return 1; - } - /* Since this is really the first time this host is encountered, - set quality to 1. */ - hlist = add_hlist (hlist, hostname, inet_s, 1); + inet_s = inet_ntoa (in); + add_host_to_cache (hostname, inet_s); return 1; } -/* Add a host to the host list. The list is sorted by addresses. For - equal addresses, the entries with quality should bubble towards the - beginning of the list. */ -static struct host * -add_hlist (struct host *l, const char *nhost, const char *nreal, int quality) -{ - struct host *t, *old, *beg; - - /* The entry goes to the beginning of the list if the list is empty - or the order requires it. */ - if (!l || (strcmp (nreal, l->realname) < 0)) - { - t = (struct host *)xmalloc (sizeof (struct host)); - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - t->next = l; - return t; - } - - beg = l; - /* Second two one-before-the-last element. 
*/ - while (l->next) - { - int cmp; - old = l; - l = l->next; - cmp = strcmp (nreal, l->realname); - if (cmp >= 0) - continue; - /* If the next list element is greater than s, put s between the - current and the next list element. */ - t = (struct host *)xmalloc (sizeof (struct host)); - old->next = t; - t->next = l; - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - return beg; - } - t = (struct host *)xmalloc (sizeof (struct host)); - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - /* Insert the new element after the last element. */ - l->next = t; - t->next = NULL; - return beg; -} - /* Determine the "real" name of HOST, as perceived by Wget. If HOST is referenced by more than one name, "real" name is considered to - be the first one encountered in the past. - - If the host cannot be found in the list of already dealt-with - hosts, try with its INET address. If this fails too, add it to the - list. The routine does not call gethostbyname twice for the same - host if it can possibly avoid it. */ + be the first one encountered in the past. */ char * realhost (const char *host) { - struct host *l, *l_real; struct in_addr in; struct hostent *hptr; - char *inet_s; + char *master_name; - DEBUGP (("Checking for %s.\n", host)); - /* Look for the host, looking by the host name. */ - l = search_host (hlist, host); - if (l && l->quality) /* Found it with quality */ + DEBUGP (("Checking for %s in host_name_address_map.\n", host)); + if (hash_table_exists (host_name_address_map, host)) { - DEBUGP (("%s was already used, by that name.\n", host)); - /* Here we return l->hostname, not host, because of the possible - case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are - the same, but we want the one that was first. 
*/ - return xstrdup (l->hostname); + DEBUGP (("Found; %s was already used, by that name.\n", host)); + return xstrdup_lower (host); } - else if (!l) /* Not found, with or without quality */ - { - /* The fact that gethostbyname will get called makes it - necessary to store it to the list, to ensure that - gethostbyname will not be called twice for the same string. - However, the quality argument must be set appropriately. - Note that add_hlist must be called *after* the realname - search, or the quality would be always set to 0 */ - DEBUGP (("This is the first time I hear about host %s by that name.\n", - host)); - hptr = ngethostbyname (host); - if (!hptr) - return xstrdup (host); + DEBUGP (("Checking for %s in host_slave_master_map.\n", host)); + master_name = hash_table_get (host_slave_master_map, host); + if (master_name) + { + has_master: + DEBUGP (("Found; %s was already used, by the name %s.\n", + host, master_name)); + return xstrdup (master_name); + } + + DEBUGP (("First time I hear about %s by that name; looking it up.\n", + host)); + hptr = ngethostbyname (host); + if (hptr) + { + char *inet_s; /* Originally, we copied to in.s_addr, but it appears to be - missing on some systems. */ + missing on some systems. */ memcpy (&in, *hptr->h_addr_list, sizeof (in)); - STRDUP_ALLOCA (inet_s, inet_ntoa (in)); - } - else /* Found, without quality */ - { - /* This case happens when host is on the list, - but not as first entry (the one with quality). - Then we just get its INET address and pick - up the first entry with quality. */ - DEBUGP (("We've dealt with host %s, but under the name %s.\n", - host, l->realname)); - STRDUP_ALLOCA (inet_s, l->realname); + inet_s = inet_ntoa (in); + + add_host_to_cache (host, inet_s); + + /* add_host_to_cache() can establish a slave-master mapping. 
*/ + DEBUGP (("Checking again for %s in host_slave_master_map.\n", host)); + master_name = hash_table_get (host_slave_master_map, host); + if (master_name) + goto has_master; } - /* Now we certainly have the INET address. The following loop is - guaranteed to pick either an entry with quality (because it is - the first one), or none at all. */ - l_real = search_address (hlist, inet_s); - if (l_real) /* Found in the list, as realname. */ - { - if (!l) - /* Set the default, 0 quality. */ - hlist = add_hlist (hlist, host, inet_s, 0); - return xstrdup (l_real->hostname); - } - /* Since this is really the first time this host is encountered, - set quality to 1. */ - hlist = add_hlist (hlist, host, inet_s, 1); - return xstrdup (host); + return xstrdup_lower (host); } /* Compare two hostnames (out of URL-s if the arguments are URL-s), @@ -547,20 +500,23 @@ herrmsg (int error) return _("Unknown error"); } -/* Clean the host list. This is a separate function, so we needn't - export HLIST and its implementation. Ha! */ void clean_hosts (void) { - struct host *l = hlist; - - while (l) - { - struct host *p = l->next; - free (l->hostname); - free (l->realname); - free (l); - l = p; - } - hlist = NULL; + /* host_name_address_map and host_address_name_map share the + strings. Because of that, calling free_keys_and_values once + suffices for both. */ + free_keys_and_values (host_name_address_map); + hash_table_destroy (host_name_address_map); + hash_table_destroy (host_address_name_map); + free_keys_and_values (host_slave_master_map); + hash_table_destroy (host_slave_master_map); +} + +void +host_init (void) +{ + host_name_address_map = make_string_hash_table (0); + host_address_name_map = make_string_hash_table (0); + host_slave_master_map = make_string_hash_table (0); } diff --git a/src/html-parse.c b/src/html-parse.c new file mode 100644 index 00000000..b5efa7f2 --- /dev/null +++ b/src/html-parse.c @@ -0,0 +1,856 @@ +/* HTML parser for Wget. 
+ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* The only entry point to this module is map_html_tags(), which see. */ + +/* TODO: + + - Allow hooks for callers to process contents outside tags. This + is needed to implement handling <style> and <script>. The + taginfo structure already carries the information about where the + tags are, but this is not enough, because one would also want to + skip the comments. (The funny thing is that for <style> and + <script> you *don't* want to skip comments!) + + - Create a test suite for regression testing. */ + +/* HISTORY: + + This is the third HTML parser written for Wget. The first one was + written some time during the Geturl 1.0 beta cycle, and was very + inefficient and buggy. It also contained some very complex code to + remember a list of parser states, because it was supposed to be + reentrant. The idea was that several parsers would be running + concurrently, and you'd have to pass the function a unique ID string + (for example, the URL) by which it found the relevant parser state + and returned the next URL. Over-engineering at its best. + + The second HTML parser was written for Wget 1.4 (the first version + by the name `Wget'), and was a complete rewrite.
Although the new + parser behaved much better and made no claims of reentrancy, it + still shared many of the fundamental flaws of the old version -- it + only regarded HTML in terms of tag-attribute pairs, where the + attribute's value was a URL to be returned. Any other property of + HTML, such as <base href=...>, or strange way to specify a URL, + such as <meta http-equiv=Refresh content="0; URL=..."> had to be + crudely hacked in -- and the caller had to be aware of these hacks. + Like its predecessor, this parser did not support HTML comments. + + After Wget 1.5.1 was released, I set out to write a third HTML + parser. The objectives of the new parser were to: (1) provide a + clean way to analyze HTML lexically, (2) separate interpretation of + the markup from the parsing process, (3) be as correct as possible, + e.g. correctly skipping comments and other SGML declarations, (4) + understand the most common errors in markup and skip them or be + relaxed towards them, and (5) be reasonably efficient (no regexps, + minimum copying and minimum or no heap allocation). + + I believe this parser meets all of the above goals. It is + reasonably well structured, and could be relatively easily + separated from Wget and used elsewhere. While some of its + intrinsic properties limit its value as a general-purpose HTML + parser, I believe that, with minimum modifications, it could serve + as a backend for one. + + Due to time and other constraints, this parser was not integrated + into Wget until the version ???. */ + +/* DESCRIPTION: + + The single entry point of this parser is map_html_tags(), which + works by calling a function you specify for each tag. The function + gets called with the pointer to a structure describing the tag and + its attributes. */ + +/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll + still need Wget headers to compile.
*/ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#ifdef HAVE_STRING_H +# include <string.h> +#else +# include <strings.h> +#endif +#include <assert.h> + +#include "wget.h" +#include "html-parse.h" + +#ifdef STANDALONE +# define xmalloc malloc +# define xrealloc realloc +#endif /* STANDALONE */ + +/* Pool support. For efficiency, map_html_tags() stores temporary + string data to a single stack-allocated pool. If the pool proves + too small, additional memory is allocated/resized with + malloc()/realloc(). */ + +struct pool { + char *contents; /* pointer to the contents. */ + int size; /* size of the pool. */ + int index; /* next unoccupied position in + contents. */ + + int alloca_p; /* whether contents was allocated + using alloca(). */ + char *orig_contents; /* orig_contents, allocated by + alloca(). this is used by + POOL_FREE to restore the pool to + the "initial" state. */ + int orig_size; +}; + +/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */ + +#define POOL_INIT(pool, initial_size) do { \ + (pool).size = (initial_size); \ + (pool).contents = ALLOCA_ARRAY (char, (pool).size); \ + (pool).index = 0; \ + (pool).alloca_p = 1; \ + (pool).orig_contents = (pool).contents; \ + (pool).orig_size = (pool).size; \ +} while (0) + +/* Grow the pool to accomodate at least SIZE new bytes. If the pool + already has room to accomodate SIZE bytes of data, this is a no-op. */ + +#define POOL_GROW(pool, increase) do { \ + int PG_newsize = (pool).index + increase; \ + DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize, \ + (pool).alloca_p, char); \ +} while (0) + +/* Append text in the range [beg, end) to POOL. No zero-termination + is done. 
*/ + +#define POOL_APPEND(pool, beg, end) do { \ + const char *PA_beg = beg; \ + int PA_size = end - PA_beg; \ + POOL_GROW (pool, PA_size); \ + memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \ + (pool).index += PA_size; \ +} while (0) + +/* The same as the above, but with zero termination. */ + +#define POOL_APPEND_ZT(pool, beg, end) do { \ + const char *PA_beg = beg; \ + int PA_size = end - PA_beg; \ + POOL_GROW (pool, PA_size + 1); \ + memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \ + (pool).contents[(pool).index + PA_size] = '\0'; \ + (pool).index += PA_size + 1; \ +} while (0) + +/* Forget old pool contents. The allocated memory is not freed. */ +#define POOL_REWIND(pool) pool.index = 0 + +/* Free heap-allocated memory for contents of POOL. This calls free() + if the memory was allocated through malloc. It also restores + `contents' and `size' to their original, pre-malloc values. That + way after POOL_FREE, the pool is fully usable, just as if it were + freshly initialized with POOL_INIT. */ + +#define POOL_FREE(pool) do { \ + if (!(pool).alloca_p) \ + free ((pool).contents); \ + (pool).contents = (pool).orig_contents; \ + (pool).size = (pool).orig_size; \ + (pool).index = 0; \ + (pool).alloca_p = 1; \ +} while (0) + + +#define AP_DOWNCASE 1 +#define AP_PROCESS_ENTITIES 2 +#define AP_SKIP_BLANKS 4 + +/* Copy the text in the range [BEG, END) to POOL, optionally + performing operations specified by FLAGS. FLAGS may be any + combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS + with the following meaning: + + * AP_DOWNCASE -- downcase all the letters; + + * AP_PROCESS_ENTITIES -- process the SGML entities and write out + the decoded string. Recognized entities are <, >, &, ", +   and the numerical entities. + + * AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end + of text. 
*/ +static void +convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) +{ + int old_index = pool->index; + int size; + + /* First, skip blanks if required. We must do this before entities + are processed, so that blanks can still be inserted as, for + instance, ` '. */ + if (flags & AP_SKIP_BLANKS) + { + while (beg < end && ISSPACE (*beg)) + ++beg; + while (end > beg && ISSPACE (end[-1])) + --end; + } + size = end - beg; + + if (flags & AP_PROCESS_ENTITIES) + { + /* Stack-allocate a copy of text, process entities and copy it + to the pool. */ + char *local_copy = (char *)alloca (size + 1); + const char *from = beg; + char *to = local_copy; + + while (from < end) + { + if (*from != '&') + *to++ = *from++; + else + { + const char *save = from; + int remain; + + if (++from == end) goto lose; + remain = end - from; + + if (*from == '#') + { + int numeric; + ++from; + if (from == end || !ISDIGIT (*from)) goto lose; + for (numeric = 0; from < end && ISDIGIT (*from); from++) + numeric = 10 * numeric + (*from) - '0'; + if (from < end && ISALPHA (*from)) goto lose; + numeric &= 0xff; + *to++ = numeric; + } +#define FROB(x) (remain >= (sizeof (x) - 1) \ + && !memcmp (from, x, sizeof (x) - 1) \ + && (*(from + sizeof (x) - 1) == ';' \ + || remain == sizeof (x) - 1 \ + || !ISALNUM (*(from + sizeof (x) - 1)))) + else if (FROB ("lt")) + *to++ = '<', from += 2; + else if (FROB ("gt")) + *to++ = '>', from += 2; + else if (FROB ("amp")) + *to++ = '&', from += 3; + else if (FROB ("quot")) + *to++ = '\"', from += 4; + /* We don't implement the proposed "Added Latin 1" + entities (except for nbsp), because it is unnecessary + in the context of Wget, and would require hashing to + work efficiently. */ + else if (FROB ("nbsp")) + *to++ = 160, from += 4; + else + goto lose; +#undef FROB + /* If the entity was followed by `;', we step over the + `;'. Otherwise, it was followed by either a + non-alphanumeric or EOB, in which case we do nothing. 
*/ + if (from < end && *from == ';') + ++from; + continue; + + lose: + /* This was not an entity after all. Back out. */ + from = save; + *to++ = *from++; + } + } + *to++ = '\0'; + POOL_APPEND (*pool, local_copy, to); + } + else + { + /* Just copy the text to the pool. */ + POOL_APPEND_ZT (*pool, beg, end); + } + + if (flags & AP_DOWNCASE) + { + char *p = pool->contents + old_index; + for (; *p; p++) + *p = TOLOWER (*p); + } +} + +/* Check whether the contents of [POS, POS+LENGTH) match any of the + strings in the ARRAY. */ +static int +array_allowed (const char **array, const char *beg, const char *end) +{ + int length = end - beg; + if (array) + { + for (; *array; array++) + if (length >= strlen (*array) + && !strncasecmp (*array, beg, length)) + break; + if (!*array) + return 0; + } + return 1; +} + +/* RFC1866: name [of attribute or tag] consists of letters, digits, + periods, or hyphens. We also allow _, for compatibility with + brain-damaged generators. */ +#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_') + +/* States while advancing through comments. */ +#define AC_S_DONE 0 +#define AC_S_BACKOUT 1 +#define AC_S_BANG 2 +#define AC_S_DEFAULT 3 +#define AC_S_DCLNAME 4 +#define AC_S_DASH1 5 +#define AC_S_DASH2 6 +#define AC_S_COMMENT 7 +#define AC_S_DASH3 8 +#define AC_S_DASH4 9 +#define AC_S_QUOTE1 10 +#define AC_S_IN_QUOTE 11 +#define AC_S_QUOTE2 12 + +#ifdef STANDALONE +static int comment_backout_count; +#endif + +/* Advance over an SGML declaration (the <!...> forms you find in HTML + documents). The function returns the location after the + declaration. The reason we need this is that HTML comments are + expressed as comments in so-called "empty declarations". + + To recap: any SGML declaration may have comments associated with + it, e.g. + <!MY-DECL -- isn't this fun? 
-- foo bar> + + An HTML comment is merely an empty declaration (<!>) with a comment + attached, like this: + <!-- some stuff here --> + + Several comments may be embedded in one comment declaration: + <!-- have -- -- fun --> + + Whitespace is allowed between and after the comments, but not + before the first comment. + + Additionally, this function attempts to handle double quotes in + SGML declarations correctly. */ +static const char * +advance_declaration (const char *beg, const char *end) +{ + const char *p = beg; + char quote_char = '\0'; /* shut up, gcc! */ + char ch; + int state = AC_S_BANG; + + if (beg == end) + return beg; + ch = *p++; + + /* It looked like a good idea to write this as a state machine, but + now I wonder... */ + + while (state != AC_S_DONE && state != AC_S_BACKOUT) + { + if (p == end) + state = AC_S_BACKOUT; + switch (state) + { + case AC_S_DONE: + case AC_S_BACKOUT: + break; + case AC_S_BANG: + if (ch == '!') + { + ch = *p++; + state = AC_S_DEFAULT; + } + else + state = AC_S_BACKOUT; + break; + case AC_S_DEFAULT: + switch (ch) + { + case '-': + state = AC_S_DASH1; + break; + case ' ': + case '\t': + case '\r': + case '\n': + ch = *p++; + break; + case '>': + state = AC_S_DONE; + break; + case '\'': + case '\"': + state = AC_S_QUOTE1; + break; + default: + if (NAME_CHAR_P (ch)) + state = AC_S_DCLNAME; + else + state = AC_S_BACKOUT; + break; + } + break; + case AC_S_DCLNAME: + if (NAME_CHAR_P (ch)) + ch = *p++; + else if (ch == '-') + state = AC_S_DASH1; + else + state = AC_S_DEFAULT; + break; + case AC_S_QUOTE1: + assert (ch == '\'' || ch == '\"'); + quote_char = ch; /* cheating -- I really don't feel like + introducing more different states for + different quote characters. 
*/ + ch = *p++; + state = AC_S_IN_QUOTE; + break; + case AC_S_IN_QUOTE: + if (ch == quote_char) + state = AC_S_QUOTE2; + else + ch = *p++; + break; + case AC_S_QUOTE2: + assert (ch == quote_char); + ch = *p++; + state = AC_S_DEFAULT; + break; + case AC_S_DASH1: + assert (ch == '-'); + ch = *p++; + state = AC_S_DASH2; + break; + case AC_S_DASH2: + switch (ch) + { + case '-': + ch = *p++; + state = AC_S_COMMENT; + break; + default: + state = AC_S_BACKOUT; + } + break; + case AC_S_COMMENT: + switch (ch) + { + case '-': + state = AC_S_DASH3; + break; + default: + ch = *p++; + break; + } + break; + case AC_S_DASH3: + assert (ch == '-'); + ch = *p++; + state = AC_S_DASH4; + break; + case AC_S_DASH4: + switch (ch) + { + case '-': + ch = *p++; + state = AC_S_DEFAULT; + break; + default: + state = AC_S_COMMENT; + break; + } + break; + } + } + + if (state == AC_S_BACKOUT) + { +#ifdef STANDALONE + ++comment_backout_count; +#endif + return beg + 1; + } + return p; +} + +/* Advance P (a char pointer), with the explicit intent of being able + to read the next character. If this is not possible, go to finish. */ + +#define ADVANCE(p) do { \ + ++p; \ + if (p >= end) \ + goto finish; \ +} while (0) + +/* Skip whitespace, if any. */ + +#define SKIP_WS(p) do { \ + while (ISSPACE (*p)) { \ + ADVANCE (p); \ + } \ +} while (0) + +/* Skip non-whitespace, if any. */ + +#define SKIP_NON_WS(p) do { \ + while (!ISSPACE (*p)) { \ + ADVANCE (p); \ + } \ +} while (0) + +#ifdef STANDALONE +static int tag_backout_count; +#endif + +/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long. + MAPFUN will be called with two arguments: pointer to an initialized + struct taginfo, and CLOSURE. + + ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to + be processed by this function. If it is NULL, all the tags are + allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES. 
+ + (Obviously, the caller can filter out unwanted tags and attributes + just as well, but this is just an optimization designed to avoid + unnecessary copying for tags/attributes which the caller doesn't + want to know about. These lists are searched linearly; therefore, + if you're interested in a large number of tags or attributes, you'd + better set these to NULL and filter them out yourself with a + hashing process most appropriate for your application.) */ + +void +map_html_tags (const char *text, int size, + const char **allowed_tag_names, + const char **allowed_attribute_names, + void (*mapfun) (struct taginfo *, void *), + void *closure) +{ + const char *p = text; + const char *end = text + size; + + int attr_pair_count = 8; + int attr_pair_alloca_p = 1; + struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count); + struct pool pool; + + if (!size) + return; + + POOL_INIT (pool, 256); + + { + int nattrs, end_tag; + const char *tag_name_begin, *tag_name_end; + const char *tag_start_position; + int uninteresting_tag; + + look_for_tag: + POOL_REWIND (pool); + + nattrs = 0; + end_tag = 0; + + /* Find beginning of tag. We use memchr() instead of the usual + looping with ADVANCE() for speed. */ + p = memchr (p, '<', end - p); + if (!p) + goto finish; + + tag_start_position = p; + ADVANCE (p); + + /* Establish the type of the tag (start-tag, end-tag or + declaration). */ + if (*p == '!') + { + /* This is an SGML declaration -- just skip it. 
*/ + p = advance_declaration (p, end); + if (p == end) + goto finish; + goto look_for_tag; + } + else if (*p == '/') + { + end_tag = 1; + ADVANCE (p); + } + tag_name_begin = p; + while (NAME_CHAR_P (*p)) + ADVANCE (p); + if (p == tag_name_begin) + goto look_for_tag; + tag_name_end = p; + SKIP_WS (p); + if (end_tag && *p != '>') + goto backout_tag; + + if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end)) + /* We can't just say "goto look_for_tag" here because we need + the loop below to properly advance over the tag's attributes. */ + uninteresting_tag = 1; + else + { + uninteresting_tag = 0; + convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE); + } + + /* Find the attributes. */ + while (1) + { + const char *attr_name_begin, *attr_name_end; + const char *attr_value_begin, *attr_value_end; + const char *attr_raw_value_begin, *attr_raw_value_end; + int operation = AP_DOWNCASE; /* stupid compiler. */ + + SKIP_WS (p); + + /* Check for end of tag definition. */ + if (*p == '>') + break; + + /* Establish bounds of attribute name. */ + attr_name_begin = p; /* <foo bar ...> */ + /* ^ */ + while (NAME_CHAR_P (*p)) + ADVANCE (p); + attr_name_end = p; /* <foo bar ...> */ + /* ^ */ + if (attr_name_begin == attr_name_end) + goto backout_tag; + + /* Establish bounds of attribute value. */ + SKIP_WS (p); + if (NAME_CHAR_P (*p) || *p == '>') + { + /* Minimized attribute syntax allows `=' to be omitted. + For example, <UL COMPACT> is a valid shorthand for <UL + COMPACT="compact">. Even if such attributes are not + useful to Wget, we need to support them, so that the + tags containing them can be parsed correctly. 
*/ + attr_raw_value_begin = attr_value_begin = attr_name_begin; + attr_raw_value_end = attr_value_end = attr_name_end; + } + else if (*p == '=') + { + ADVANCE (p); + SKIP_WS (p); + if (*p == '\"' || *p == '\'') + { + int newline_seen = 0; + char quote_char = *p; + attr_raw_value_begin = p; + ADVANCE (p); + attr_value_begin = p; /* <foo bar="baz"> */ + /* ^ */ + while (*p != quote_char) + { + if (!newline_seen && *p == '\n') + { + /* If a newline is seen within the quotes, it + is most likely that someone forgot to close + the quote. In that case, we back out to + the value beginning, and terminate the tag + at either `>' or the delimiter, whichever + comes first. Such a tag terminated at `>' + is discarded. */ + p = attr_value_begin; + newline_seen = 1; + continue; + } + else if (newline_seen && *p == '>') + break; + ADVANCE (p); + } + attr_value_end = p; /* <foo bar="baz"> */ + /* ^ */ + if (*p == quote_char) + ADVANCE (p); + else + goto look_for_tag; + attr_raw_value_end = p; /* <foo bar="baz"> */ + /* ^ */ + /* The AP_SKIP_BLANKS part is not entirely correct, + because we don't want to skip blanks for all the + attribute values. */ + operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS; + } + else + { + attr_value_begin = p; /* <foo bar=baz> */ + /* ^ */ + /* According to SGML, a name token should consist only + of alphanumerics, . and -. However, this is often + violated by, for instance, `%' in `width=75%'. + We'll be liberal and allow just about anything as + an attribute value. */ + while (!ISSPACE (*p) && *p != '>') + ADVANCE (p); + attr_value_end = p; /* <foo bar=baz qux=quix> */ + /* ^ */ + if (attr_value_begin == attr_value_end) + /* <foo bar=> */ + /* ^ */ + goto backout_tag; + attr_raw_value_begin = attr_value_begin; + attr_raw_value_end = attr_value_end; + operation = AP_PROCESS_ENTITIES; + } + } + else + { + /* We skipped the whitespace and found something that is + neither `=' nor the beginning of the next attribute's + name. Back out. 
*/ + goto backout_tag; /* <foo bar /... */ + /* ^ */ + } + + /* If we're not interested in the tag, don't bother with any + of the attributes. */ + if (uninteresting_tag) + continue; + + /* If we aren't interested in the attribute, skip it. We + cannot do this test any sooner, because our text pointer + needs to correctly advance over the attribute. */ + if (allowed_attribute_names + && !array_allowed (allowed_attribute_names, attr_name_begin, + attr_name_end)) + continue; + + DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1, + attr_pair_alloca_p, struct attr_pair); + + pairs[nattrs].name_pool_index = pool.index; + convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE); + + pairs[nattrs].value_pool_index = pool.index; + convert_and_copy (&pool, attr_value_begin, attr_value_end, operation); + pairs[nattrs].value_raw_beginning = attr_raw_value_begin; + pairs[nattrs].value_raw_size = (attr_raw_value_end + - attr_raw_value_begin); + ++nattrs; + } + + if (uninteresting_tag) + { + ADVANCE (p); + goto look_for_tag; + } + + /* By now, we have a valid tag with a name and zero or more + attributes. Fill in the data and call the mapper function. */ + { + int i; + struct taginfo taginfo; + + taginfo.name = pool.contents; + taginfo.end_tag_p = end_tag; + taginfo.nattrs = nattrs; + /* We fill in the char pointers only now, when pool can no + longer get realloc'ed. If we did that above, we could get + hosed by reallocation. Obviously, after this point, the pool + may no longer be grown. */ + for (i = 0; i < nattrs; i++) + { + pairs[i].name = pool.contents + pairs[i].name_pool_index; + pairs[i].value = pool.contents + pairs[i].value_pool_index; + } + taginfo.attrs = pairs; + taginfo.start_position = tag_start_position; + taginfo.end_position = p + 1; + /* Ta-dam! */ + (*mapfun) (&taginfo, closure); + ADVANCE (p); + } + goto look_for_tag; + + backout_tag: +#ifdef STANDALONE + ++tag_backout_count; +#endif + /* The tag wasn't really a tag. 
Treat its contents as ordinary + data characters. */ + p = tag_start_position + 1; + goto look_for_tag; + } + + finish: + POOL_FREE (pool); + if (!attr_pair_alloca_p) + free (pairs); +} + +#undef ADVANCE +#undef SKIP_WS +#undef SKIP_NON_WS + +#ifdef STANDALONE +static void +test_mapper (struct taginfo *taginfo, void *arg) +{ + int i; + + printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name); + for (i = 0; i < taginfo->nattrs; i++) + printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value); + putchar ('\n'); + ++*(int *)arg; +} + +int main () +{ + int size = 256; + char *x = (char *)xmalloc (size); + int length = 0; + int read_count; + int tag_counter = 0; + + while ((read_count = fread (x + length, 1, size - length, stdin))) + { + length += read_count; + size <<= 1; + x = (char *)xrealloc (x, size); + } + + map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter); + printf ("TAGS: %d\n", tag_counter); + printf ("Tag backouts: %d\n", tag_backout_count); + printf ("Comment backouts: %d\n", comment_backout_count); + return 0; +} +#endif /* STANDALONE */ diff --git a/src/html-parse.h b/src/html-parse.h new file mode 100644 index 00000000..5810a2af --- /dev/null +++ b/src/html-parse.h @@ -0,0 +1,44 @@ +/* Declarations for html-parse.c. + Copyright (C) 1998 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +struct attr_pair { + char *name; /* attribute name */ + char *value; /* attribute value */ + + /* Needed for URL conversion; the places where the value begins and + ends, including the quotes and everything. */ + const char *value_raw_beginning; + int value_raw_size; + + /* Used internally by map_html_tags. */ + int name_pool_index, value_pool_index; +}; + +struct taginfo { + char *name; /* tag name */ + int end_tag_p; /* whether this is an end-tag */ + int nattrs; /* number of attributes */ + struct attr_pair *attrs; /* attributes */ + + const char *start_position; /* start position of tag */ + const char *end_position; /* end position of tag */ +}; + +void map_html_tags PARAMS ((const char *, int, const char **, const char **, + void (*) (struct taginfo *, void *), void *)); diff --git a/src/html-url.c b/src/html-url.c new file mode 100644 index 00000000..0441b470 --- /dev/null +++ b/src/html-url.c @@ -0,0 +1,569 @@ +/* Collect URLs from HTML source. + Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +#include <config.h> + +#include <stdio.h> +#ifdef HAVE_STRING_H +# include <string.h> +#else +# include <strings.h> +#endif +#include <stdlib.h> +#include <ctype.h> +#include <errno.h> +#include <assert.h> + +#include "wget.h" +#include "html-parse.h" +#include "url.h" +#include "utils.h" + +#ifndef errno +extern int errno; +#endif + +enum tag_category { TC_LINK, TC_SPEC }; + +/* Here we try to categorize the known tags. Each tag has its ID and + cetegory. Category TC_LINK means that one or more of its + attributes contain links that should be retrieved. TC_SPEC means + that the tag is specific in some way, and has to be handled + specially. */ +static struct { + const char *name; + enum tag_category category; +} known_tags[] = { +#define TAG_A 0 + { "a", TC_LINK }, +#define TAG_APPLET 1 + { "applet", TC_LINK }, +#define TAG_AREA 2 + { "area", TC_LINK }, +#define TAG_BASE 3 + { "base", TC_SPEC }, +#define TAG_BGSOUND 4 + { "bgsound", TC_LINK }, +#define TAG_BODY 5 + { "body", TC_LINK }, +#define TAG_EMBED 6 + { "embed", TC_LINK }, +#define TAG_FIG 7 + { "fig", TC_LINK }, +#define TAG_FRAME 8 + { "frame", TC_LINK }, +#define TAG_IFRAME 9 + { "iframe", TC_LINK }, +#define TAG_IMG 10 + { "img", TC_LINK }, +#define TAG_INPUT 11 + { "input", TC_LINK }, +#define TAG_LAYER 12 + { "layer", TC_LINK }, +#define TAG_LINK 13 + { "link", TC_SPEC }, +#define TAG_META 14 + { "meta", TC_SPEC }, +#define TAG_OVERLAY 15 + { "overlay", TC_LINK }, +#define TAG_SCRIPT 16 + { "script", TC_LINK }, +#define TAG_TABLE 17 + { "table", TC_LINK }, +#define TAG_TD 18 + { "td", TC_LINK }, +#define TAG_TH 19 + { "th", TC_LINK } +}; + +/* Flags for specific url-attr pairs handled through TC_LINK: */ +#define AF_EXTERNAL 1 + +/* For tags handled by TC_LINK: attributes that contain URLs to + download. 
*/ +static struct { + int tagid; + const char *attr_name; + int flags; +} url_tag_attr_map[] = { + { TAG_A, "href", AF_EXTERNAL }, + { TAG_APPLET, "code", 0 }, + { TAG_AREA, "href", AF_EXTERNAL }, + { TAG_BGSOUND, "src", 0 }, + { TAG_BODY, "background", 0 }, + { TAG_EMBED, "src", 0 }, + { TAG_FIG, "src", 0 }, + { TAG_FRAME, "src", 0 }, + { TAG_IFRAME, "src", 0 }, + { TAG_IMG, "href", 0 }, + { TAG_IMG, "lowsrc", 0 }, + { TAG_IMG, "src", 0 }, + { TAG_INPUT, "src", 0 }, + { TAG_LAYER, "src", 0 }, + { TAG_OVERLAY, "src", 0 }, + { TAG_SCRIPT, "src", 0 }, + { TAG_TABLE, "background", 0 }, + { TAG_TD, "background", 0 }, + { TAG_TH, "background", 0 } +}; + +/* The lists of interesting tags and attributes are built dynamically, + from the information above. However, some places in the code refer + to the attributes not mentioned here. We add them manually. */ +static const char *additional_attributes[] = { + "rel", /* for TAG_LINK */ + "http-equiv", /* for TAG_META */ + "name", /* for TAG_META */ + "content" /* for TAG_META */ +}; + +static const char **interesting_tags; +static const char **interesting_attributes; + +void +init_interesting (void) +{ + /* Init the variables interesting_tags and interesting_attributes + that are used by the HTML parser to know which tags and + attributes we're interested in. We initialize this only once, + for performance reasons. + + Here we also make sure that what we put in interesting_tags + matches the user's preferences as specified through --ignore-tags + and --follow-tags. */ + + { + int i, ind = 0; + int size = ARRAY_SIZE (known_tags); + interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *)); + + for (i = 0; i < size; i++) + { + const char *name = known_tags[i].name; + + /* Normally here we could say: + interesting_tags[i] = name; + But we need to respect the settings of --ignore-tags and + --follow-tags, so the code gets a bit harier. */ + + if (opt.ignore_tags) + { + /* --ignore-tags was specified. 
Do not match these + specific tags. --ignore-tags takes precedence over + --follow-tags, so we process --ignore first and fall + through if there's no match. */ + int j, lose = 0; + for (j = 0; opt.ignore_tags[j] != NULL; j++) + /* Loop through all the tags this user doesn't care + about. */ + if (strcasecmp(opt.ignore_tags[j], name) == EQ) + { + lose = 1; + break; + } + if (lose) + continue; + } + + if (opt.follow_tags) + { + /* --follow-tags was specified. Only match these specific + tags, so return FALSE if we don't match one of them. */ + int j, win = 0; + for (j = 0; opt.follow_tags[j] != NULL; j++) + /* Loop through all the tags this user cares about. */ + if (strcasecmp(opt.follow_tags[j], name) == EQ) + { + win = 1; + break; + } + if (!win) + continue; /* wasn't one of the explicitly + desired tags */ + } + + /* If we get to here, --follow-tags isn't being used or the + tag is among the ones that are follwed, and --ignore-tags, + if specified, didn't include this tag, so it's an + "interesting" one. */ + interesting_tags[ind++] = name; + } + interesting_tags[ind] = NULL; + } + + /* The same for attributes, except we loop through url_tag_attr_map. + Here we also need to make sure that the list of attributes is + unique, and to include the attributes from additional_attributes. */ + { + int i, ind; + const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1) + * sizeof (char *)); + /* First copy the "additional" attributes. 
*/ + for (i = 0; i < ARRAY_SIZE (additional_attributes); i++) + att[i] = additional_attributes[i]; + ind = i; + att[ind] = NULL; + for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++) + { + int j, seen = 0; + const char *look_for = url_tag_attr_map[i].attr_name; + for (j = 0; j < ind; j++) /* att[0..ind-1] hold the names stored so far */ + if (!strcmp (att[j], look_for)) + { + seen = 1; + break; + } + if (!seen) + { + att = xrealloc (att, (ind + 2) * sizeof (*att)); + att[ind++] = look_for; + att[ind] = NULL; + } + } + interesting_attributes = att; + } +} + +static int +find_tag (const char *tag_name) +{ + int i; + + /* This is linear search; if the number of tags grows, we can switch + to binary search. */ + + for (i = 0; i < ARRAY_SIZE (known_tags); i++) + { + int cmp = strcasecmp (known_tags[i].name, tag_name); + /* known_tags are sorted alphabetically, so we can + micro-optimize. */ + if (cmp > 0) + break; + else if (cmp == 0) + return i; + } + return -1; +} + +/* Find the value of attribute named NAME in the taginfo TAG. If the + attribute is not present, return NULL. If ATTRID is non-NULL, the + exact identity of the attribute will be returned. */ +static char * +find_attr (struct taginfo *tag, const char *name, int *attrid) +{ + int i; + for (i = 0; i < tag->nattrs; i++) + if (!strcasecmp (tag->attrs[i].name, name)) + { + if (attrid) + *attrid = i; + return tag->attrs[i].value; + } + return NULL; +} + +struct collect_urls_closure { + char *text; /* HTML text. */ + char *base; /* Base URI of the document, possibly + changed through <base href=...>. */ + urlpos *head, *tail; /* List of URLs */ + const char *parent_base; /* Base of the current document. */ + const char *document_file; /* File name of this document. */ + int dash_p_leaf_HTML; /* Whether -p is specified, and this + document is the "leaf" node of the + HTML tree. */ + int nofollow; /* whether NOFOLLOW was specified in a + <meta name=robots> tag. */ +}; + +/* Resolve LINK_URI and append it to closure->tail.
TAG and ATTRID + are the necessary context to store the position and size. */ + +static void +handle_link (struct collect_urls_closure *closure, const char *link_uri, + struct taginfo *tag, int attrid) +{ + int no_proto = !has_proto (link_uri); + urlpos *newel; + + const char *base = closure->base ? closure->base : closure->parent_base; + char *complete_uri; + + char *fragment = strrchr (link_uri, '#'); + + if (fragment) + { + /* Nullify the fragment identifier, i.e. everything after the + last occurrence of `#', inclusive. This copying is + relatively inefficient, but it doesn't matter because + fragment identifiers don't come up all that often. */ + int hashlen = fragment - link_uri; + char *p = alloca (hashlen + 1); + memcpy (p, link_uri, hashlen); + p[hashlen] = '\0'; + link_uri = p; + } + + if (!base) + { + if (no_proto) + { + /* We have no base, and the link does not have a protocol or + a host attached to it. Nothing we can do. */ + /* #### Should we print a warning here? Wget 1.5.x used to. */ + return; + } + else + complete_uri = xstrdup (link_uri); + } + else + complete_uri = url_concat (base, link_uri); + + DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", + closure->document_file, base ? base : "(null)", + link_uri, complete_uri)); + + newel = (urlpos *)xmalloc (sizeof (urlpos)); + + memset (newel, 0, sizeof (*newel)); + newel->next = NULL; + newel->url = complete_uri; + newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text; + newel->size = tag->attrs[attrid].value_raw_size; + + /* A URL is relative if the host and protocol are not named, and the + name does not start with `/'. + #### This logic might need some rethinking. */ + if (no_proto && *link_uri != '/') + newel->flags |= (URELATIVE | UNOPROTO); + else if (no_proto) + newel->flags |= UNOPROTO; + + if (closure->tail) + { + closure->tail->next = newel; + closure->tail = newel; + } + else + closure->tail = closure->head = newel; +} + +/* #### Document what this does. 
+ #### It would be nice to split this into several functions. */ + +static void +collect_tags_mapper (struct taginfo *tag, void *arg) +{ + struct collect_urls_closure *closure = (struct collect_urls_closure *)arg; + int tagid = find_tag (tag->name); + assert (tagid != -1); + + switch (known_tags[tagid].category) + { + case TC_LINK: + { + int i; + int size = ARRAY_SIZE (url_tag_attr_map); + for (i = 0; i < size; i++) + if (url_tag_attr_map[i].tagid == tagid) + break; + /* We've found the index of url_tag_attr_map where the + attributes of our tags begin. Now, look for every one of + them, and handle it. */ + for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++) + { + char *attr_value; + int id; + if (closure->dash_p_leaf_HTML + && (url_tag_attr_map[i].flags & AF_EXTERNAL)) + /* If we're at a -p leaf node, we don't want to retrieve + links to references we know are external, such as <a + href=...>. */ + continue; + + /* This find_attr() buried in a loop may seem inefficient + (O(n^2)), but it's not, since the number of attributes + (n) we loop over is extremely small. In the worst case + of IMG with all its possible attributes, n^2 will be + only 9. */ + attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id); + if (attr_value) + handle_link (closure, attr_value, tag, id); + } + } + break; + case TC_SPEC: + switch (tagid) + { + case TAG_BASE: + { + char *newbase = find_attr (tag, "href", NULL); + if (!newbase) + break; + if (closure->base) + free (closure->base); + if (closure->parent_base) + closure->base = url_concat (closure->parent_base, newbase); + else + closure->base = xstrdup (newbase); + } + break; + case TAG_LINK: + { + int id; + char *rel = find_attr (tag, "rel", NULL); + char *href = find_attr (tag, "href", &id); + if (href) + { + /* In the normal case, all <link href=...> tags are + fair game. + + In the special case of when -p is active, however, + and we're at a leaf node (relative to the -l + max. 
depth) in the HTML document tree, the only + <LINK> tag we'll follow is a <LINK REL= + "stylesheet">, as it's necessary for displaying + this document properly. We won't follow other + <LINK> tags, like <LINK REL="home">, for instance, + as they refer to external documents. */ + if (!closure->dash_p_leaf_HTML + || (rel && !strcasecmp (rel, "stylesheet"))) + handle_link (closure, href, tag, id); + } + } + break; + case TAG_META: + /* Some pages use a META tag to specify that the page be + refreshed by a new page after a given number of seconds. + The general format for this is: + + <meta http-equiv=Refresh content="NUMBER; URL=index2.html"> + + So we just need to skip past the "NUMBER; URL=" garbage + to get to the URL. */ + { + int id; + char *name = find_attr (tag, "name", NULL); + char *http_equiv = find_attr (tag, "http-equiv", NULL); + if (http_equiv && !strcasecmp (http_equiv, "refresh")) + { + char *refresh = find_attr (tag, "content", &id); /* ID must name CONTENT: the URL lives there */ + char *p = refresh; + int offset; + while (p && ISDIGIT (*p)) + ++p; + if (!p || *p++ != ';') /* also bail out when CONTENT is missing */ + return; + while (ISSPACE (*p)) + ++p; + if (!(TOUPPER (*p) == 'U' + && TOUPPER (*(p + 1)) == 'R' + && TOUPPER (*(p + 2)) == 'L' + && *(p + 3) == '=')) + return; + p += 4; + while (ISSPACE (*p)) + ++p; + offset = p - refresh; + tag->attrs[id].value_raw_beginning += offset; + tag->attrs[id].value_raw_size -= offset; + handle_link (closure, p, tag, id); + } + else if (name && !strcasecmp (name, "robots")) + { + /* Handle stuff like: + <meta name="robots" content="index,nofollow"> */ + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + if (!strcasecmp (content, "none")) + closure->nofollow = 1; + else + { + while (*content) + { + /* Find the next occurrence of ',' or the end of + the string.
*/ + size_t toklen = strcspn (content, ", \t\r\n"); + /* Compare the whole token: neither the separator nor a mere + prefix such as "no" may count as a match. */ + if (toklen == 8 && !strncasecmp (content, "nofollow", 8)) + closure->nofollow = 1; + content += toklen; + while (*content == ',' || ISSPACE (*content)) + ++content; + } + } + } + } + break; + default: + /* Category is TC_SPEC, but tag name is unhandled. This + must not be. */ + abort (); + } + break; + } +} + +/* Scan FILE as an HTML document and return the list of links found + in it (NULL if FILE cannot be read or holds no links). Similar to + get_urls_file, but for HTML files. Relative links are resolved + against THIS_URL, or opt.base_href if THIS_URL is NULL, until a + <base href=...> overrides it. If DASH_P_LEAF_HTML is non-zero, links + known to be external are skipped. If META_DISALLOW_FOLLOW is + non-NULL, the <meta name=robots> nofollow flag is stored in it. */ +urlpos * +get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML, + int *meta_disallow_follow) +{ + struct file_memory *fm; + struct collect_urls_closure closure; + + /* Load the file. */ + fm = read_file (file); + if (!fm) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; + } + DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + + closure.text = fm->content; + closure.head = closure.tail = NULL; + closure.base = NULL; + closure.parent_base = this_url ? this_url : opt.base_href; + closure.document_file = file; + closure.dash_p_leaf_HTML = dash_p_leaf_HTML; + closure.nofollow = 0; + + if (!interesting_tags) + init_interesting (); + + map_html_tags (fm->content, fm->length, interesting_tags, + interesting_attributes, collect_tags_mapper, &closure); + + DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow)); + if (meta_disallow_follow) + *meta_disallow_follow = closure.nofollow; + + FREE_MAYBE (closure.base); + read_file_free (fm); + return closure.head; +} diff --git a/src/http.c b/src/http.c index 48c1bc5e..912f4923 100644 --- a/src/http.c +++ b/src/http.c @@ -254,6 +254,85 @@ http_process_type (const char *hdr, void *arg) return 1; } +/* Check whether the `Connection' header is set to "keep-alive".
*/ +static int +http_process_connection (const char *hdr, void *arg) +{ + int *flag = (int *)arg; + if (!strcasecmp (hdr, "Keep-Alive")) + *flag = 1; + return 1; +} + +/* Persistent connections (pc). */ + +static unsigned char pc_last_host[4]; +static unsigned short pc_last_port; +static int pc_last_fd; + +static void +register_persistent (const char *host, unsigned short port, int fd) +{ + if (!store_hostaddress (pc_last_host, host)) + return; + pc_last_port = port; + pc_last_fd = fd; +} + +static void +invalidate_persistent (void) +{ + pc_last_port = 0; +} + +static int +persistent_available_p (const char *host, unsigned short port) +{ + unsigned char this_host[4]; + if (port != pc_last_port) + return 0; + if (!store_hostaddress (this_host, host)) + return 0; + if (memcmp (pc_last_host, this_host, 4)) + return 0; + if (!test_socket_open (pc_last_fd)) + { + invalidate_persistent (); + return 0; + } + return 1; +} + +/* The idea behind these two CLOSE macros is to distinguish between + two cases: one when the job we've been doing is finished, and we + want to close the connection and leave, and two when something is + seriously wrong and we're closing the connection as part of + cleanup. + + In case of keep_alive, CLOSE_FINISH should leave the connection + open, while CLOSE_INVALIDATE should still close it. + + The semantic difference between the flags `keep_alive' and + `reused_connection' is that keep_alive defines the state of HTTP: + whether the connection *will* be preservable. reused_connection, + on the other hand, reflects the present: whether the *current* + connection is the result of preserving. 
*/ + +#define CLOSE_FINISH(fd) do { \ + if (!keep_alive) \ + { \ + CLOSE (fd); \ + if (reused_connection) \ + invalidate_persistent (); \ + } \ +} while (0) + +#define CLOSE_INVALIDATE(fd) do { \ + CLOSE (fd); \ + if (reused_connection || (fd) == pc_last_fd) /* also drop a registration made by this very call */ \ + invalidate_persistent (); \ +} while (0) + struct http_stat { @@ -317,6 +396,8 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt) FILE *fp; int auth_tried_already; struct rbuf rbuf; + int keep_alive, http_keep_alive_1, http_keep_alive_2; + int reused_connection; if (!(*dt & HEAD_ONLY)) /* If we're doing a GET on the URL, as opposed to just a HEAD, we need to @@ -329,6 +410,9 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt) again: /* We need to come back here when the initial attempt to retrieve without authorization header fails. */ + keep_alive = 0; + http_keep_alive_1 = http_keep_alive_2 = 0; + reused_connection = 0; /* Initialize certain elements of struct http_stat. */ hs->len = 0L; @@ -345,40 +429,49 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt) ou = u; /* First: establish the connection. */ - logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port); - err = make_connection (&sock, u->host, u->port); - switch (err) + if (u->proxy || !persistent_available_p (u->host, u->port)) { - case HOSTERR: - logputs (LOG_VERBOSE, "\n"); - logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno)); - return HOSTERR; - break; - case CONSOCKERR: - logputs (LOG_VERBOSE, "\n"); - logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno)); - return CONSOCKERR; - break; - case CONREFUSED: - logputs (LOG_VERBOSE, "\n"); - logprintf (LOG_NOTQUIET, - _("Connection to %s:%hu refused.\n"), u->host, u->port); - CLOSE (sock); - return CONREFUSED; - case CONERROR: - logputs (LOG_VERBOSE, "\n"); - logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno)); - CLOSE (sock); - return CONERROR; - break; - case NOCONERROR: - /* Everything is fine!
*/ - logputs (LOG_VERBOSE, _("connected!\n")); - break; - default: - abort (); - break; - } /* switch */ + logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port); + err = make_connection (&sock, u->host, u->port); + switch (err) + { + case HOSTERR: + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno)); + return HOSTERR; + break; + case CONSOCKERR: + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno)); + return CONSOCKERR; + break; + case CONREFUSED: + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, + _("Connection to %s:%hu refused.\n"), u->host, u->port); + CLOSE (sock); + return CONREFUSED; + case CONERROR: + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno)); + CLOSE (sock); + return CONERROR; + break; + case NOCONERROR: + /* Everything is fine! */ + logputs (LOG_VERBOSE, _("connected!\n")); + break; + default: + abort (); + break; + } + } + else + { + logprintf (LOG_VERBOSE, _("Reusing connection to %s:%hu.\n"), u->host, u->port); + sock = pc_last_fd; + reused_connection = 1; + } if (u->proxy) path = u->proxy->url; @@ -487,6 +580,7 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt) User-Agent: %s\r\n\ Host: %s%s\r\n\ Accept: %s\r\n\ +Connection: Keep-Alive\r\n\ %s%s%s%s%s%s\r\n", command, path, useragent, remhost, host_port ? host_port : "", @@ -505,8 +599,9 @@ Accept: %s\r\n\ num_written = iwrite (sock, request, strlen (request)); if (num_written < 0) { - logputs (LOG_VERBOSE, _("Failed writing HTTP request.\n")); - CLOSE (sock); + logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), + strerror (errno)); + CLOSE_INVALIDATE (sock); return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... 
"), @@ -553,7 +648,7 @@ Accept: %s\r\n\ FREE_MAYBE (type); FREE_MAYBE (hs->newloc); FREE_MAYBE (all_headers); - CLOSE (sock); + CLOSE_INVALIDATE (sock); return HEOF; } else if (status == HG_ERROR) @@ -565,7 +660,7 @@ Accept: %s\r\n\ FREE_MAYBE (type); FREE_MAYBE (hs->newloc); FREE_MAYBE (all_headers); - CLOSE (sock); + CLOSE_INVALIDATE (sock); return HERR; } @@ -672,12 +767,32 @@ Accept: %s\r\n\ goto done_header; } } + /* Check for the `Keep-Alive' header. */ + if (!http_keep_alive_1) + { + if (header_process (hdr, "Keep-Alive", header_exists, + &http_keep_alive_1)) + goto done_header; + } + /* Check for `Connection: Keep-Alive'. */ + if (!http_keep_alive_2) + { + if (header_process (hdr, "Connection", http_process_connection, + &http_keep_alive_2)) + goto done_header; + } done_header: free (hdr); } logputs (LOG_VERBOSE, "\n"); + if (contlen != -1 + && (http_keep_alive_1 || http_keep_alive_2)) + keep_alive = 1; + if (keep_alive && !reused_connection) + register_persistent (u->host, u->port, sock); + if ((statcode == HTTP_STATUS_UNAUTHORIZED) && authenticate_h) { @@ -685,7 +800,7 @@ Accept: %s\r\n\ FREE_MAYBE (type); type = NULL; FREEHSTAT (*hs); - CLOSE (sock); + CLOSE_FINISH (sock); if (auth_tried_already) { /* If we have tried it already, then there is not point @@ -753,7 +868,7 @@ Accept: %s\r\n\ FREE_MAYBE (type); FREE_MAYBE (hs->newloc); FREE_MAYBE (all_headers); - CLOSE (sock); + CLOSE_INVALIDATE (sock); return RANGEERR; } @@ -783,7 +898,7 @@ Accept: %s\r\n\ _("Location: %s%s\n"), hs->newloc ? hs->newloc : _("unspecified"), hs->newloc ? 
_(" [following]") : ""); - CLOSE (sock); + CLOSE_FINISH (sock); FREE_MAYBE (type); FREE_MAYBE (all_headers); return NEWLOCATION; @@ -824,7 +939,7 @@ Accept: %s\r\n\ hs->res = 0; FREE_MAYBE (type); FREE_MAYBE (all_headers); - CLOSE (sock); + CLOSE_FINISH (sock); return RETRFINISHED; } @@ -838,7 +953,7 @@ Accept: %s\r\n\ if (!fp) { logprintf (LOG_NOTQUIET, "%s: %s\n", u->local, strerror (errno)); - CLOSE (sock); + CLOSE_FINISH (sock); FREE_MAYBE (all_headers); return FOPENERR; } @@ -863,7 +978,7 @@ Accept: %s\r\n\ /* Get the contents of the document. */ hs->res = get_contents (sock, fp, &hs->len, hs->restval, (contlen != -1 ? contlen : 0), - &rbuf); + &rbuf, keep_alive); hs->dltime = elapsed_time (); { /* Close or flush the file. We have to be careful to check for @@ -878,7 +993,7 @@ Accept: %s\r\n\ hs->res = -2; } FREE_MAYBE (all_headers); - CLOSE (sock); + CLOSE_FINISH (sock); if (hs->res == -2) return FWRITEERR; return RETRFINISHED; diff --git a/src/main.c b/src/main.c index 300a82e4..ca4f7f23 100644 --- a/src/main.c +++ b/src/main.c @@ -97,6 +97,20 @@ i18n_initialize (void) textdomain ("wget"); #endif /* HAVE_NLS */ } + +/* It's kosher to declare these here because their interface _has_ to + be void foo(void). */ +void url_init PARAMS ((void)); +void host_init PARAMS ((void)); + +/* This just calls the various initialization functions from the + modules that need one-time initialization. */ +static void +private_initialize (void) +{ + url_init (); + host_init (); +} /* Print the usage message. */ static void @@ -293,6 +307,7 @@ main (int argc, char *const *argv) }; i18n_initialize (); + private_initialize (); append_to_log = 0; diff --git a/src/recur.c b/src/recur.c index 52cc8e12..695ce59d 100644 --- a/src/recur.c +++ b/src/recur.c @@ -42,21 +42,20 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #include "ftp.h" #include "fnmatch.h" #include "host.h" +#include "hash.h" extern char *version_string; #define ROBOTS_FILENAME "robots.txt" -/* #### Many of these lists should really be hashtables! */ - -/* List of downloaded URLs. */ -static urlpos *urls_downloaded; +static struct hash_table *dl_file_url_map; +static struct hash_table *dl_url_file_map; /* List of HTML URLs. */ static slist *urls_html; /* List of undesirable-to-load URLs. */ -static slist *ulist; +static struct hash_table *undesirable_urls; /* List of forbidden locations. */ static char **forbidden = NULL; @@ -84,14 +83,28 @@ static int robots_match PARAMS ((struct urlinfo *, char **)); void recursive_cleanup (void) { - free_slist (ulist); - ulist = NULL; + if (undesirable_urls) + { + string_set_free (undesirable_urls); + undesirable_urls = NULL; + } + if (dl_file_url_map) + { + free_keys_and_values (dl_file_url_map); + hash_table_destroy (dl_file_url_map); + dl_file_url_map = NULL; + } + if (dl_url_file_map) + { + free_keys_and_values (dl_url_file_map); + hash_table_destroy (dl_url_file_map); + dl_url_file_map = NULL; + } + undesirable_urls = NULL; free_vec (forbidden); forbidden = NULL; - free_slist (urls_html); + slist_free (urls_html); urls_html = NULL; - free_urlpos (urls_downloaded); - urls_downloaded = NULL; FREE_MAYBE (base_dir); FREE_MAYBE (robots_host); first_time = 1; @@ -117,6 +130,7 @@ recursive_retrieve (const char *file, const char *this_url) char *constr, *filename, *newloc; char *canon_this_url = NULL; int dt, inl, dash_p_leaf_HTML = FALSE; + int meta_disallow_follow; int this_url_ftp; /* See below the explanation */ uerr_t err; struct urlinfo *rurl; @@ -132,17 +146,29 @@ recursive_retrieve (const char *file, const char *this_url) /* Cache the current URL in the list. */ if (first_time) { - ulist = add_slist (ulist, this_url, 0); - urls_downloaded = NULL; + /* These three operations need to be done only once per Wget + run. They should probably be at a different location. 
*/ + if (!undesirable_urls) + undesirable_urls = make_string_hash_table (0); + if (!dl_file_url_map) + dl_file_url_map = make_string_hash_table (0); + if (!dl_url_file_map) + dl_url_file_map = make_string_hash_table (0); + + hash_table_clear (undesirable_urls); + string_set_add (undesirable_urls, this_url); + hash_table_clear (dl_file_url_map); + hash_table_clear (dl_url_file_map); urls_html = NULL; - /* Enter this_url to the slist, in original and "enhanced" form. */ + /* Enter this_url to the hash table, in original and "enhanced" form. */ u = newurl (); err = parseurl (this_url, u, 0); if (err == URLOK) { - ulist = add_slist (ulist, u->url, 0); - urls_downloaded = add_url (urls_downloaded, u->url, file); - urls_html = add_slist (urls_html, file, NOSORT); + string_set_add (undesirable_urls, u->url); + hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url)); + hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file)); + urls_html = slist_append (urls_html, file); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This @@ -191,7 +217,15 @@ recursive_retrieve (const char *file, const char *this_url) /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, - 0, dash_p_leaf_HTML); + dash_p_leaf_HTML, &meta_disallow_follow); + + if (opt.use_robots && meta_disallow_follow) + { + /* The META tag says we are not to follow this file. Respect + that. */ + free_urlpos (url_list); + url_list = NULL; + } /* Decide what to do with each of the URLs. A URL will be loaded if it meets several requirements, discussed later. */ @@ -240,16 +274,16 @@ recursive_retrieve (const char *file, const char *this_url) the list. */ /* inl is set if the URL we are working on (constr) is stored in - ulist. Using it is crucial to avoid the incessant calls to - in_slist, which is quite slow. */ - inl = in_slist (ulist, constr); + undesirable_urls. 
Using it is crucial to avoid unnecessary + repeated continuous hits to the hash table. */ + inl = string_set_exists (undesirable_urls, constr); /* If it is FTP, and FTP is not followed, chuck it out. */ if (!inl) if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) { DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* If it is absolute link and they are not followed, chuck it @@ -258,7 +292,7 @@ recursive_retrieve (const char *file, const char *this_url) if (opt.relative_only && !(cur_url->flags & URELATIVE)) { DEBUGP (("It doesn't really look like a relative link.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* If its domain is not to be accepted/looked-up, chuck it out. */ @@ -266,7 +300,7 @@ recursive_retrieve (const char *file, const char *this_url) if (!accept_domain (u)) { DEBUGP (("I don't like the smell of that domain.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* Check for parent directory. */ @@ -286,7 +320,7 @@ recursive_retrieve (const char *file, const char *this_url) { /* Failing that too, kill the URL. 
*/ DEBUGP (("Trying to escape parental guidance with no_parent on.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } freeurl (ut, 1); @@ -300,7 +334,7 @@ recursive_retrieve (const char *file, const char *this_url) if (!accdir (u->dir, ALLABS)) { DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir)); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } } @@ -330,7 +364,7 @@ recursive_retrieve (const char *file, const char *this_url) { DEBUGP (("%s (%s) does not match acc/rej rules.\n", constr, u->file)); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } } @@ -353,12 +387,12 @@ recursive_retrieve (const char *file, const char *this_url) } free (constr); constr = xstrdup (u->url); - inl = in_slist (ulist, constr); + inl = string_set_exists (undesirable_urls, constr); /* re-check: CONSTR was just rewritten from u->url */ if (!inl && !((u->proto == URLFTP) && !this_url_ftp)) if (!opt.spanhost && this_url && !same_host (this_url, constr)) { DEBUGP (("This is not the same hostname as the parent's.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } } @@ -398,7 +432,7 @@ recursive_retrieve (const char *file, const char *this_url) { DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, ROBOTS_FILENAME)); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } } @@ -409,7 +443,7 @@ recursive_retrieve (const char *file, const char *this_url) { DEBUGP (("I've decided to load it -> ")); /* Add it to the list of already-loaded URL-s. */ - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); /* Automatically followed FTPs will *not* be downloaded recursively.
*/ if (u->proto == URLFTP) @@ -439,10 +473,13 @@ recursive_retrieve (const char *file, const char *this_url) { if (dt & RETROKF) { - urls_downloaded = add_url (urls_downloaded, constr, filename); + hash_table_put (dl_file_url_map, + xstrdup (filename), xstrdup (constr)); + hash_table_put (dl_url_file_map, + xstrdup (constr), xstrdup (filename)); /* If the URL is HTML, note it. */ if (dt & TEXTHTML) - urls_html = add_slist (urls_html, filename, NOSORT); + urls_html = slist_append (urls_html, filename); } } /* If there was no error, and the type is text/html, parse @@ -489,6 +526,10 @@ recursive_retrieve (const char *file, const char *this_url) /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links && !opt.delete_after) + /* This is merely the first pass: the links that have been + successfully downloaded are converted. In the second pass, + convert_all_links() will also convert those links that have NOT + been downloaded to their canonical form. */ convert_links (file, url_list); /* Free the linked list of URL-s. */ free_urlpos (url_list); @@ -531,30 +572,37 @@ void convert_all_links (void) { uerr_t res; - urlpos *l1, *l2, *urls; + urlpos *l1, *urls; struct urlinfo *u; slist *html; - urlpos *urlhtml; for (html = urls_html; html; html = html->next) { + int meta_disallow_follow; + char *url; + DEBUGP (("Rescanning %s\n", html->string)); /* Determine the URL of the HTML file. get_urls_html will need it. */ - for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next) - if (!strcmp (urlhtml->local_name, html->string)) - break; - if (urlhtml) - DEBUGP (("It should correspond to %s.\n", urlhtml->url)); + url = hash_table_get (dl_file_url_map, html->string); + if (url) + DEBUGP (("It should correspond to %s.\n", url)); else DEBUGP (("I cannot find the corresponding URL.\n")); /* Parse the HTML file... */ - urls = get_urls_html (html->string, urlhtml ? 
urlhtml->url : NULL, 1, - FALSE); + urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow); + if (opt.use_robots && meta_disallow_follow) + { + /* The META tag says we are not to follow this file. + Respect that. */ + free_urlpos (urls); + urls = NULL; + } if (!urls) continue; for (l1 = urls; l1; l1 = l1->next) { + char *local_name; /* The URL must be in canonical form to be compared. */ u = newurl (); res = parseurl (l1->url, u, 0); @@ -565,22 +613,18 @@ convert_all_links (void) } /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. - Note: not yet implemented; only ABS2REL works. */ - for (l2 = urls_downloaded; l2; l2 = l2->next) - if (!strcmp (l2->url, u->url)) - { - DEBUGP (("%s flagged for conversion, local %s\n", - l2->url, l2->local_name)); - break; - } + ABS2REL, whereas non-downloaded will be converted REL2ABS. */ + local_name = hash_table_get (dl_url_file_map, u->url); + if (local_name) + DEBUGP (("%s flagged for conversion, local %s\n", + u->url, local_name)); /* Clear the flags. */ l1->flags &= ~ (UABS2REL | UREL2ABS); /* Decide on the conversion direction. */ - if (l2) + if (local_name) { l1->flags |= UABS2REL; - l1->local_name = xstrdup (l2->local_name); + l1->local_name = xstrdup (local_name); } else { diff --git a/src/retr.c b/src/retr.c index f60976d8..30f4556f 100644 --- a/src/retr.c +++ b/src/retr.c @@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "ftp.h" #include "host.h" #include "connect.h" +#include "hash.h" #ifdef WINDOWS LARGE_INTEGER internal_time; @@ -60,6 +61,8 @@ enum spflags { SP_NONE, SP_INIT, SP_FINISH }; static int show_progress PARAMS ((long, long, enum spflags)); +#define MIN(i, j) ((i) <= (j) ? (i) : (j)) + /* Reads the contents of file descriptor FD, until it is closed, or a read error occurs. 
The data is read in 8K chunks, and stored to stream fp, which should have been open for writing. If BUF is @@ -83,9 +86,9 @@ static int show_progress PARAMS ((long, long, enum spflags)); from fd immediately, flush or discard the buffer. */ int get_contents (int fd, FILE *fp, long *len, long restval, long expected, - struct rbuf *rbuf) + struct rbuf *rbuf, int use_expected) { - int res; + int res = 0; static char c[8192]; *len = restval; @@ -105,10 +108,17 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected, *len += res; } } - /* Read from fd while there is available data. */ - do + /* Read from fd while there is available data. + + Normally, if expected is 0, it means that it is not known how + much data is expected. However, if use_expected is specified, + then expected being zero means exactly that. */ + while (!use_expected || (*len < expected)) { - res = iread (fd, c, sizeof (c)); + int amount_to_read = (use_expected + ? MIN (expected - *len, sizeof (c)) + : sizeof (c)); + res = iread (fd, c, amount_to_read); if (res > 0) { if (fwrite (c, sizeof (char), res, fp) < res) @@ -120,7 +130,9 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected, } *len += res; } - } while (res > 0); + else + break; + } if (res < -1) res = -1; if (opt.verbose) @@ -323,7 +335,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, int local_use_proxy; char *mynewloc, *proxy; struct urlinfo *u; - slist *redirections; + struct hash_table *redirections = NULL; /* If dt is NULL, just ignore it. */ if (!dt) @@ -334,8 +346,6 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - redirections = NULL; - u = newurl (); /* Parse the URL. 
*/ result = parseurl (url, u, 0); @@ -343,7 +353,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result)); freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); free (url); return result; } @@ -379,7 +390,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, { logputs (LOG_NOTQUIET, _("Could not find proxy host.\n")); freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); free (url); return PROXERR; } @@ -392,7 +404,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, else logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy); freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); free (url); return PROXERR; } @@ -454,7 +467,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result)); freeurl (newloc_struct, 1); freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); free (url); free (mynewloc); return result; @@ -466,34 +480,29 @@ retrieve_url (const char *origurl, char **file, char **newloc, free (mynewloc); mynewloc = xstrdup (newloc_struct->url); - /* Check for redirection to back to itself. */ - if (!strcmp (u->url, newloc_struct->url)) + if (!redirections) { - logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"), - mynewloc); - freeurl (newloc_struct, 1); - freeurl (u, 1); - free_slist (redirections); - free (url); - free (mynewloc); - return WRONGCODE; + redirections = make_string_hash_table (0); + /* Add current URL immediately so we can detect it as soon + as possible in case of a cycle. */ + string_set_add (redirections, u->url); } /* The new location is OK. Let's check for redirection cycle by peeking through the history of redirections. 
*/ - if (in_slist (redirections, newloc_struct->url)) + if (string_set_exists (redirections, newloc_struct->url)) { logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"), mynewloc); freeurl (newloc_struct, 1); freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); free (url); free (mynewloc); return WRONGCODE; } - - redirections = add_slist (redirections, newloc_struct->url, NOSORT); + string_set_add (redirections, newloc_struct->url); free (url); url = mynewloc; @@ -510,7 +519,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, *file = NULL; } freeurl (u, 1); - free_slist (redirections); + if (redirections) + string_set_free (redirections); if (newloc) *newloc = url; @@ -531,9 +541,7 @@ retrieve_from_file (const char *file, int html, int *count) uerr_t status; urlpos *url_list, *cur_url; - /* If spider-mode is on, we do not want get_urls_html barfing - errors on baseless links. */ - url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE) + url_list = (html ? get_urls_html (file, NULL, FALSE, NULL) : get_urls_file (file)); status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ diff --git a/src/retr.h b/src/retr.h index 308eeedf..9866f246 100644 --- a/src/retr.h +++ b/src/retr.h @@ -22,7 +22,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "rbuf.h" -int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *)); +int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *, int)); uerr_t retrieve_url PARAMS ((const char *, char **, char **, const char *, int *)); diff --git a/src/url.c b/src/url.c index 44e7280d..c077f000 100644 --- a/src/url.c +++ b/src/url.c @@ -38,7 +38,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #include "utils.h" #include "url.h" #include "host.h" -#include "html.h" #ifndef errno extern int errno; @@ -48,22 +47,12 @@ extern int errno; #define DEFAULT_HTTP_PORT 80 #define DEFAULT_FTP_PORT 21 -/* URL separator (for findurl) */ -#define URL_SEPARATOR "!\"#'(),>`{}|<>" +/* Table of Unsafe chars. This is intialized in + init_unsafe_char_table. */ -/* A list of unsafe characters for encoding, as per RFC1738. '@' and - ':' (not listed in RFC) were added because of user/password - encoding. */ +static char unsafe_char_table[256]; -#ifndef WINDOWS -# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:" -#else /* WINDOWS */ -# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`" -#endif /* WINDOWS */ - -#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \ - || ((unsigned char)(c) > '~') /* ASCII 127 */ \ - || strchr (URL_UNSAFE_CHARS, c)) +#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)]) /* If S contains unsafe characters, free it and replace it with a version that doesn't. */ @@ -176,6 +165,34 @@ skip_url (const char *url) return 0; } +/* Unsafe chars: + - anything <= 32; + - stuff from rfc1738 ("<>\"#%{}|\\^~[]`"); + - @ and :, for user/password encoding. + - everything over 127 (but we don't bother with recording those. */ +void +init_unsafe_char_table (void) +{ + int i; + for (i = 0; i < 256; i++) + if (i < 32 || i >= 127 + || i == '<' + || i == '>' + || i == '\"' + || i == '#' + || i == '%' + || i == '{' + || i == '}' + || i == '|' + || i == '\\' + || i == '^' + || i == '~' + || i == '[' + || i == ']' + || i == '`') + unsafe_char_table[i] = 1; +} + /* Returns 1 if the string contains unsafe characters, 0 otherwise. */ int contains_unsafe (const char *s) @@ -296,7 +313,7 @@ skip_proto (const char *url) /* Returns 1 if the URL begins with a protocol (supported or unsupported), 0 otherwise. 
*/ -static int +int has_proto (const char *url) { char **s; @@ -765,297 +782,54 @@ url_equal (const char *url1, const char *url2) return res; } -/* Find URL of format scheme:hostname[:port]/dir in a buffer. The - buffer may contain pretty much anything; no errors are signaled. */ -static const char * -findurl (const char *buf, int howmuch, int *count) -{ - char **prot; - const char *s1, *s2; - - for (s1 = buf; howmuch; s1++, howmuch--) - for (prot = protostrings; *prot; prot++) - if (howmuch <= strlen (*prot)) - continue; - else if (!strncasecmp (*prot, s1, strlen (*prot))) - { - for (s2 = s1, *count = 0; - howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) && - !strchr (URL_SEPARATOR, *s2); - s2++, (*count)++, howmuch--); - return s1; - } - return NULL; -} - -/* Scans the file for signs of URL-s. Returns a vector of pointers, - each pointer representing a URL string. The file is *not* assumed - to be HTML. */ urlpos * get_urls_file (const char *file) { - long nread; - FILE *fp; - char *buf; - const char *pbuf; - int size; - urlpos *first, *current, *old; + struct file_memory *fm; + urlpos *head, *tail; + const char *text, *text_end; - if (file && !HYPHENP (file)) - { - fp = fopen (file, "rb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return NULL; - } - } - else - fp = stdin; /* Load the file. */ - load_file (fp, &buf, &nread); - if (file && !HYPHENP (file)) - fclose (fp); - DEBUGP (("Loaded %s (size %ld).\n", file, nread)); - first = current = NULL; - /* Fill the linked list with URLs. */ - for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size)); - pbuf += size) + fm = read_file (file); + if (!fm) { - /* Allocate the space. 
*/ - old = current; - current = (urlpos *)xmalloc (sizeof (urlpos)); - if (old) - old->next = current; - memset (current, 0, sizeof (*current)); - current->next = NULL; - current->url = (char *)xmalloc (size + 1); - memcpy (current->url, pbuf, size); - current->url[size] = '\0'; - if (!first) - first = current; + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; } - /* Free the buffer. */ - free (buf); - - return first; -} - -/* Similar to get_urls_file, but for HTML files. FILE is scanned as - an HTML document using htmlfindurl(), which see. get_urls_html() - constructs the HTML-s from the relative href-s. - - If SILENT is non-zero, do not barf on baseless relative links. */ -urlpos * -get_urls_html (const char *file, const char *this_url, int silent, - int dash_p_leaf_HTML) -{ - long nread; - FILE *fp; - char *orig_buf; - const char *buf; - int step, first_time; - urlpos *first, *current, *old; - - if (file && !HYPHENP (file)) + DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + head = tail = NULL; + text = fm->content; + text_end = fm->content + fm->length; + while (text < text_end) { - fp = fopen (file, "rb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return NULL; - } - } - else - fp = stdin; - /* Load the file. */ - load_file (fp, &orig_buf, &nread); - if (file && !HYPHENP (file)) - fclose (fp); - DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread)); - first = current = NULL; - first_time = 1; - /* Iterate over the URLs in BUF, picked by htmlfindurl(). 
*/ - for (buf = orig_buf; - (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time, - dash_p_leaf_HTML)); - buf += step) - { - int i, no_proto; - int size = step; - const char *pbuf = buf; - char *constr, *base; - const char *cbase; - char *needs_freeing, *url_data; - - first_time = 0; - - /* A frequent phenomenon that needs to be handled are pages - generated by brain-damaged HTML generators, which refer to to - URI-s as <a href="<spaces>URI<spaces>">. We simply ignore - any spaces at the beginning or at the end of the string. - This is probably not strictly correct, but that's what the - browsers do, so we may follow. May the authors of "WYSIWYG" - HTML tools burn in hell for the damage they've inflicted! */ - while ((pbuf < buf + step) && ISSPACE (*pbuf)) - { - ++pbuf; - --size; - } - while (size && ISSPACE (pbuf[size - 1])) - --size; - if (!size) - break; - - /* It would be nice if we could avoid allocating memory in this - loop, but I don't see an easy way. To process the entities, - we need to either copy the data, or change it destructively. - I choose the former. - - We have two pointers: needs_freeing and url_data, because the - code below does thing like url_data += <something>, and we - want to pass the original string to free(). */ - needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size); - size = strlen (url_data); - - for (i = 0; protostrings[i]; i++) - { - if (!strncasecmp (protostrings[i], url_data, - MINVAL (strlen (protostrings[i]), size))) - break; - } - /* Check for http:RELATIVE_URI. See below for details. 
*/ - if (protostrings[i] - && !(strncasecmp (url_data, "http:", 5) == 0 - && strncasecmp (url_data, "http://", 7) != 0)) - { - no_proto = 0; - } + const char *line_beg = text; + const char *line_end = memchr (text, '\n', text_end - text); + if (!line_end) + line_end = text_end; else + ++line_end; + text = line_end; + while (line_beg < line_end + && ISSPACE (*line_beg)) + ++line_beg; + while (line_end > line_beg + 1 + && ISSPACE (*(line_end - 1))) + --line_end; + if (line_end > line_beg) { - no_proto = 1; - /* This is for extremely brain-damaged pages that refer to - relative URI-s as <a href="http:URL">. Just strip off the - silly leading "http:" (as well as any leading blanks - before it). */ - if ((size > 5) && !strncasecmp ("http:", url_data, 5)) - url_data += 5, size -= 5; - } - if (!no_proto) - { - for (i = 0; i < ARRAY_SIZE (sup_protos); i++) - { - if (!strncasecmp (sup_protos[i].name, url_data, - MINVAL (strlen (sup_protos[i].name), size))) - break; - } - /* Do *not* accept a non-supported protocol. */ - if (i == ARRAY_SIZE (sup_protos)) - { - free (needs_freeing); - continue; - } - } - if (no_proto) - { - /* First, construct the base, which can be relative itself. - - Criteria for creating the base are: - 1) html_base created by <base href="..."> - 2) current URL - 3) base provided from the command line */ - cbase = html_base (); - if (!cbase) - cbase = this_url; - if (!cbase) - cbase = opt.base_href; - if (!cbase) /* Error condition -- a baseless - relative link. */ - { - if (!opt.quiet && !silent) - { - /* Use malloc, not alloca because this is called in - a loop. 
*/ - char *temp = (char *)malloc (size + 1); - strncpy (temp, url_data, size); - temp[size] = '\0'; - logprintf (LOG_NOTQUIET, - _("Error (%s): Link %s without a base provided.\n"), - file, temp); - free (temp); - } - free (needs_freeing); - continue; - } - if (this_url) - base = construct (this_url, cbase, strlen (cbase), - !has_proto (cbase)); + urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos)); + memset (entry, 0, sizeof (*entry)); + entry->next = NULL; + entry->url = strdupdelim (line_beg, line_end); + if (!head) + head = entry; else - { - /* Base must now be absolute, with host name and - protocol. */ - if (!has_proto (cbase)) - { - logprintf (LOG_NOTQUIET, _("\ -Error (%s): Base %s relative, without referer URL.\n"), - file, cbase); - free (needs_freeing); - continue; - } - base = xstrdup (cbase); - } - constr = construct (base, url_data, size, no_proto); - free (base); + tail->next = entry; + tail = entry; } - else /* has proto */ - { - constr = (char *)xmalloc (size + 1); - strncpy (constr, url_data, size); - constr[size] = '\0'; - } -#ifdef DEBUG - if (opt.debug) - { - char *tmp; - const char *tmp2; - - tmp2 = html_base (); - /* Use malloc, not alloca because this is called in a loop. */ - tmp = (char *)xmalloc (size + 1); - strncpy (tmp, url_data, size); - tmp[size] = '\0'; - logprintf (LOG_ALWAYS, - "file %s; this_url %s; base %s\nlink: %s; constr: %s\n", - file, this_url ? this_url : "(null)", - tmp2 ? tmp2 : "(null)", tmp, constr); - free (tmp); - } -#endif - - /* Allocate the space. */ - old = current; - current = (urlpos *)xmalloc (sizeof (urlpos)); - if (old) - old->next = current; - if (!first) - first = current; - /* Fill the values. */ - memset (current, 0, sizeof (*current)); - current->next = NULL; - current->url = constr; - current->size = step; - current->pos = buf - orig_buf; - /* A URL is relative if the host and protocol are not named, - and the name does not start with `/'. 
*/ - if (no_proto && *url_data != '/') - current->flags |= (URELATIVE | UNOPROTO); - else if (no_proto) - current->flags |= UNOPROTO; - free (needs_freeing); } - free (orig_buf); - - return first; + read_file_free (fm); + return head; } /* Free the linked list of urlpos. */ @@ -1527,103 +1301,59 @@ no_proxy_match (const char *host, const char **no_proxy) return !sufmatch (no_proxy, host); } +static void write_backup_file PARAMS ((const char *, downloaded_file_t)); + /* Change the links in an HTML document. Accepts a structure that defines the positions of all the links. */ void convert_links (const char *file, urlpos *l) { + struct file_memory *fm; FILE *fp; - char *buf, *p, *p2; + char *p; downloaded_file_t downloaded_file_return; - long size; + + { + /* First we do a "dry run": go through the list L and see whether + any URL needs to be converted in the first place. If not, just + leave the file alone. */ + int count = 0; + urlpos *dry = l; + for (dry = l; dry; dry = dry->next) + if (dry->flags & (UABS2REL | UREL2ABS)) + ++count; + if (!count) + { + logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"), + file); + return; + } + } logprintf (LOG_VERBOSE, _("Converting %s... "), file); - /* Read from the file.... */ - fp = fopen (file, "rb"); - if (!fp) + + fm = read_file (file); + if (!fm) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), file, strerror (errno)); return; } - /* ...to a buffer. */ - load_file (fp, &buf, &size); - fclose (fp); - - downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file); + downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file); if (opt.backup_converted && downloaded_file_return) - /* Rather than just writing over the original .html file with the converted - version, save the former to *.orig. Note we only do this for files we've - _successfully_ downloaded, so we don't clobber .orig files sitting around - from previous invocations. 
*/ + write_backup_file (file, downloaded_file_return); + + /* Before opening the file for writing, unlink the file. This is + important if the data in FM is mmaped. In such case, nulling the + file, which is what fopen() below does, would make us read all + zeroes from the mmaped region. */ + if (unlink (file) < 0 && errno != ENOENT) { - /* Construct the backup filename as the original name plus ".orig". */ - size_t filename_len = strlen(file); - char* filename_plus_orig_suffix; - boolean already_wrote_backup_file = FALSE; - slist* converted_file_ptr; - static slist* converted_files = NULL; - - if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) - { - /* Just write "orig" over "html". We need to do it this way because - when we're checking to see if we've downloaded the file before (to - see if we can skip downloading it), we don't know if it's a - text/html file. Therefore we don't know yet at that stage that -E - is going to cause us to tack on ".html", so we need to compare - vs. the original URL plus ".orig", not the original URL plus - ".html.orig". */ - filename_plus_orig_suffix = xmalloc(filename_len + 1); - strcpy(filename_plus_orig_suffix, file); - strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig"); - } - else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ - { - /* Append ".orig" to the name. */ - filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig")); - strcpy(filename_plus_orig_suffix, file); - strcpy(filename_plus_orig_suffix + filename_len, ".orig"); - } - - /* We can get called twice on the same URL thanks to the - convert_all_links() call in main(). If we write the .orig file each - time in such a case, it'll end up containing the first-pass conversion, - not the original file. So, see if we've already been called on this - file. 
*/ - converted_file_ptr = converted_files; - while (converted_file_ptr != NULL) - if (strcmp(converted_file_ptr->string, file) == 0) - { - already_wrote_backup_file = TRUE; - break; - } - else - converted_file_ptr = converted_file_ptr->next; - - if (!already_wrote_backup_file) - { - /* Rename <file> to <file>.orig before former gets written over. */ - if (rename(file, filename_plus_orig_suffix) != 0) - logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), - file, filename_plus_orig_suffix, strerror (errno)); - - /* Remember that we've already written a .orig backup for this file. - Note that we never free this memory since we need it till the - convert_all_links() call, which is one of the last things the - program does before terminating. BTW, I'm not sure if it would be - safe to just set 'converted_file_ptr->string' to 'file' below, - rather than making a copy of the string... Another note is that I - thought I could just add a field to the urlpos structure saying - that we'd written a .orig file for this URL, but that didn't work, - so I had to make this separate list. */ - converted_file_ptr = xmalloc(sizeof(*converted_file_ptr)); - converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */ - converted_file_ptr->next = converted_files; - converted_files = converted_file_ptr; - } - - free(filename_plus_orig_suffix); + logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"), + file, strerror (errno)); + read_file_free (fm); + return; } /* Now open the file for writing. */ fp = fopen (file, "wb"); @@ -1631,50 +1361,63 @@ convert_links (const char *file, urlpos *l) { logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), file, strerror (errno)); - free (buf); + read_file_free (fm); return; } - /* Presumably we have to loop through multiple URLs here (even though we're - only talking about a single local file) because of the -O option. 
*/ - for (p = buf; l; l = l->next) + /* Here we loop through all the URLs in file, replacing those of + them that are downloaded with relative references. */ + p = fm->content; + for (; l; l = l->next) { - if (l->pos >= size) + char *url_start = fm->content + l->pos; + if (l->pos >= fm->length) { DEBUGP (("Something strange is going on. Please investigate.")); break; } - /* If the URL already is relative or it is not to be converted - for some other reason (e.g. because of not having been - downloaded in the first place), skip it. */ - if ((l->flags & URELATIVE) || !(l->flags & UABS2REL)) + /* If the URL is not to be converted, skip it. */ + if (!(l->flags & (UABS2REL | UREL2ABS))) { DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url, l->pos, l->flags)); continue; } - /* Else, reach the position of the offending URL, echoing - everything up to it to the outfile. */ - for (p2 = buf + l->pos; p < p2; p++) - putc (*p, fp); + + /* Echo the file contents, up to the offending URL's opening + quote, to the outfile. */ + fwrite (p, 1, url_start - p, fp); + p = url_start; if (l->flags & UABS2REL) - /* Convert absolute URL to relative. */ { + /* Convert absolute URL to relative. */ char *newname = construct_relative (file, l->local_name); - fprintf (fp, "%s", newname); + putc (*p, fp); /* quoting char */ + fputs (newname, fp); + p += l->size - 1; + putc (*p, fp); /* close quote */ + ++p; DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n", l->url, newname, l->pos, file)); free (newname); } - p += l->size; + else if (l->flags & UREL2ABS) + { + /* Convert the link to absolute URL. */ + char *newlink = l->url; + putc (*p, fp); /* quoting char */ + fputs (newlink, fp); + p += l->size - 1; + putc (*p, fp); /* close quote */ + ++p; + DEBUGP (("REL2ABS: <something> to %s at position %d in %s.\n", + newlink, l->pos, file)); + } } /* Output the rest of the file. 
*/ - if (p - buf < size) - { - for (p2 = buf + size; p < p2; p++) - putc (*p, fp); - } + if (p - fm->content < fm->length) + fwrite (p, 1, fm->length - (p - fm->content), fp); fclose (fp); - free (buf); + read_file_free (fm); logputs (LOG_VERBOSE, _("done.\n")); } @@ -1746,6 +1489,79 @@ add_url (urlpos *l, const char *url, const char *file) return t; } +static void +write_backup_file (const char *file, downloaded_file_t downloaded_file_return) +{ + /* Rather than just writing over the original .html file with the + converted version, save the former to *.orig. Note we only do + this for files we've _successfully_ downloaded, so we don't + clobber .orig files sitting around from previous invocations. */ + + /* Construct the backup filename as the original name plus ".orig". */ + size_t filename_len = strlen(file); + char* filename_plus_orig_suffix; + boolean already_wrote_backup_file = FALSE; + slist* converted_file_ptr; + static slist* converted_files = NULL; + + if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) + { + /* Just write "orig" over "html". We need to do it this way + because when we're checking to see if we've downloaded the + file before (to see if we can skip downloading it), we don't + know if it's a text/html file. Therefore we don't know yet + at that stage that -E is going to cause us to tack on + ".html", so we need to compare vs. the original URL plus + ".orig", not the original URL plus ".html.orig". */ + filename_plus_orig_suffix = alloca (filename_len + 1); + strcpy(filename_plus_orig_suffix, file); + strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig"); + } + else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ + { + /* Append ".orig" to the name. 
*/ + filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig")); + strcpy(filename_plus_orig_suffix, file); + strcpy(filename_plus_orig_suffix + filename_len, ".orig"); + } + + /* We can get called twice on the same URL thanks to the + convert_all_links() call in main(). If we write the .orig file + each time in such a case, it'll end up containing the first-pass + conversion, not the original file. So, see if we've already been + called on this file. */ + converted_file_ptr = converted_files; + while (converted_file_ptr != NULL) + if (strcmp(converted_file_ptr->string, file) == 0) + { + already_wrote_backup_file = TRUE; + break; + } + else + converted_file_ptr = converted_file_ptr->next; + + if (!already_wrote_backup_file) + { + /* Rename <file> to <file>.orig before former gets written over. */ + if (rename(file, filename_plus_orig_suffix) != 0) + logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), + file, filename_plus_orig_suffix, strerror (errno)); + + /* Remember that we've already written a .orig backup for this file. + Note that we never free this memory since we need it till the + convert_all_links() call, which is one of the last things the + program does before terminating. BTW, I'm not sure if it would be + safe to just set 'converted_file_ptr->string' to 'file' below, + rather than making a copy of the string... Another note is that I + thought I could just add a field to the urlpos structure saying + that we'd written a .orig file for this URL, but that didn't work, + so I had to make this separate list. */ + converted_file_ptr = xmalloc(sizeof(*converted_file_ptr)); + converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */ + converted_file_ptr->next = converted_files; + converted_files = converted_file_ptr; + } +} /* Remembers which files have been downloaded. 
In the standard case, should be called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually @@ -1798,3 +1614,10 @@ downloaded_file (downloaded_file_t mode, const char* file) return FILE_NOT_ALREADY_DOWNLOADED; } } + +/* Initialization of static stuff. */ +void +url_init (void) +{ + init_unsafe_char_table (); +} diff --git a/src/url.h b/src/url.h index 0f55ec35..648193fa 100644 --- a/src/url.h +++ b/src/url.h @@ -88,6 +88,7 @@ struct urlinfo *newurl PARAMS ((void)); void freeurl PARAMS ((struct urlinfo *, int)); uerr_t urlproto PARAMS ((const char *)); int skip_proto PARAMS ((const char *)); +int has_proto PARAMS ((const char *)); int skip_uname PARAMS ((const char *)); uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int)); @@ -95,7 +96,7 @@ char *str_url PARAMS ((const struct urlinfo *, int)); int url_equal PARAMS ((const char *, const char *)); urlpos *get_urls_file PARAMS ((const char *)); -urlpos *get_urls_html PARAMS ((const char *, const char *, int, int)); +urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *)); void free_urlpos PARAMS ((urlpos *)); char *url_concat PARAMS ((const char *, const char *)); diff --git a/src/utils.c b/src/utils.c index 795ecb75..a6a08e2a 100644 --- a/src/utils.c +++ b/src/utils.c @@ -31,6 +31,9 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef HAVE_UNISTD_H # include <unistd.h> #endif +#ifdef HAVE_MMAP +# include <sys/mman.h> +#endif #ifdef HAVE_PWD_H # include <pwd.h> #endif @@ -45,11 +48,13 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef NeXT # include <libc.h> /* for access() */ #endif +#include <fcntl.h> #include <assert.h> #include "wget.h" #include "utils.h" #include "fnmatch.h" +#include "hash.h" #ifndef errno extern int errno; @@ -736,28 +741,149 @@ read_whole_line (FILE *fp) line = xrealloc (line, length + 1); return line; } + +/* Read FILE into memory. 
A pointer to `struct file_memory' are + returned; use struct element `content' to access file contents, and + the element `length' to know the file length. `content' is *not* + zero-terminated, and you should *not* read or write beyond the [0, + length) range of characters. -/* Load file pointed to by FP to memory and return the malloc-ed - buffer with the contents. *NREAD will contain the number of read - bytes. The file is loaded in chunks, allocated exponentially, - starting with FILE_BUFFER_SIZE bytes. */ -void -load_file (FILE *fp, char **buf, long *nread) + After you are done with the file contents, call read_file_free to + release the memory. + + Depending on the operating system and the type of file that is + being read, read_file() either mmap's the file into memory, or + reads the file into the core using read(). + + If file is named "-", fileno(stdin) is used for reading instead. + If you want to read from a real file named "-", use "./-" instead. */ + +struct file_memory * +read_file (const char *file) { - long bufsize; + int fd; + struct file_memory *fm; + long size; + int inhibit_close = 0; - bufsize = 512; - *nread = 0; - *buf = NULL; - while (!feof (fp) && !ferror (fp)) + /* Some magic in the finest tradition of Perl and its kin: if FILE + is "-", just use stdin. */ + if (HYPHENP (file)) { - *buf = (char *)xrealloc (*buf, bufsize + *nread); - *nread += fread (*buf + *nread, sizeof (char), bufsize, fp); - bufsize <<= 1; + fd = fileno (stdin); + inhibit_close = 1; + /* Note that we don't inhibit mmap() in this case. If stdin is + redirected from a regular file, mmap() will still work. */ } - /* #### No indication of encountered error?? */ + else + fd = open (file, O_RDONLY); + if (fd < 0) + return NULL; + fm = xmalloc (sizeof (struct file_memory)); + +#ifdef HAVE_MMAP + { + struct stat buf; + if (fstat (fd, &buf) < 0) + goto mmap_lose; + fm->length = buf.st_size; + /* NOTE: As far as I know, the callers of this function never + modify the file text. 
Relying on this would enable us to + specify PROT_READ and MAP_SHARED for a marginal gain in + efficiency, but at some cost to generality. */ + fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (fm->content == MAP_FAILED) + goto mmap_lose; + if (!inhibit_close) + close (fd); + + fm->mmap_p = 1; + return fm; + } + + mmap_lose: + /* The most common reason why mmap() fails is that FD does not point + to a plain file. However, it's also possible that mmap() doesn't + work for a particular type of file. Therefore, whenever mmap() + fails, we just fall back to the regular method. */ +#endif /* HAVE_MMAP */ + + fm->length = 0; + size = 512; /* number of bytes fm->contents can + hold at any given time. */ + fm->content = xmalloc (size); + while (1) + { + long nread; + if (fm->length > size / 2) + { + /* #### I'm not sure whether the whole exponential-growth + thing makes sense with kernel read. On Linux at least, + read() refuses to read more than 4K from a file at a + single chunk anyway. But other Unixes might optimize it + better, and it doesn't *hurt* anything, so I'm leaving + it. */ + + /* Normally, we grow SIZE exponentially to make the number + of calls to read() and realloc() logarithmic in relation + to file size. However, read() can read an amount of data + smaller than requested, and it would be unreasonably to + double SIZE every time *something* was read. Therefore, + we double SIZE only when the length exceeds half of the + entire allocated size. */ + size <<= 1; + fm->content = xrealloc (fm->content, size); + } + nread = read (fd, fm->content + fm->length, size - fm->length); + if (nread > 0) + /* Successful read. */ + fm->length += nread; + else if (nread < 0) + /* Error. */ + goto lose; + else + /* EOF */ + break; + } + if (!inhibit_close) + close (fd); + if (size > fm->length && fm->length != 0) + /* Due to exponential growth of fm->content, the allocated region + might be much larger than what is actually needed. 
*/ + fm->content = xrealloc (fm->content, fm->length); + fm->mmap_p = 0; + return fm; + + lose: + if (!inhibit_close) + close (fd); + free (fm->content); + free (fm); + return NULL; } +/* Release the resources held by FM. Specifically, this calls + munmap() or free() on fm->content, depending whether mmap or + malloc/read were used to read in the file. It also frees the + memory needed to hold the FM structure itself. */ + +void +read_file_free (struct file_memory *fm) +{ +#ifdef HAVE_MMAP + if (fm->mmap_p) + { + munmap (fm->content, fm->length); + } + else +#endif + { + free (fm->content); + } + free (fm); +} + /* Free the pointers in a NULL-terminated vector of pointers, then free the pointer itself. */ void @@ -801,97 +927,42 @@ merge_vecs (char **v1, char **v2) return v1; } -/* A set of simple-minded routines to store and search for strings in - a linked list. You may add a string to the slist, and peek whether - it's still in there at any time later. */ +/* A set of simple-minded routines to store strings in a linked list. + This used to also be used for searching, but now we have hash + tables for that. */ -/* Add an element to the list. If flags is NOSORT, the list will not - be sorted. */ +/* Append an element to the list. */ slist * -add_slist (slist *l, const char *s, int flags) +slist_append (slist *l, const char *s) { - slist *t, *old, *beg; - int cmp; + slist *newel = (slist *)xmalloc (sizeof (slist)); + slist *beg = l; - if (flags & NOSORT) - { - if (!l) - { - t = (slist *)xmalloc (sizeof (slist)); - t->string = xstrdup (s); - t->next = NULL; - return t; - } - beg = l; - /* Find the last element. */ - while (l->next) - l = l->next; - t = (slist *)xmalloc (sizeof (slist)); - l->next = t; - t->string = xstrdup (s); - t->next = NULL; - return beg; - } - /* Empty list or changing the first element. 
*/ - if (!l || (cmp = strcmp (l->string, s)) > 0) - { - t = (slist *)xmalloc (sizeof (slist)); - t->string = xstrdup (s); - t->next = l; - return t; - } + newel->string = xstrdup (s); + newel->next = NULL; - beg = l; - if (cmp == 0) - return beg; - - /* Second two one-before-the-last element. */ + if (!l) + return newel; + /* Find the last element. */ while (l->next) - { - old = l; - l = l->next; - cmp = strcmp (s, l->string); - if (cmp == 0) /* no repeating in the list */ - return beg; - else if (cmp > 0) - continue; - /* If the next list element is greater than s, put s between the - current and the next list element. */ - t = (slist *)xmalloc (sizeof (slist)); - old->next = t; - t->next = l; - t->string = xstrdup (s); - return beg; - } - t = (slist *)xmalloc (sizeof (slist)); - t->string = xstrdup (s); - /* Insert the new element after the last element. */ - l->next = t; - t->next = NULL; + l = l->next; + l->next = newel; return beg; } /* Is there a specific entry in the list? */ int -in_slist (slist *l, const char *s) +slist_contains (slist *l, const char *s) { - int cmp; - - while (l) - { - cmp = strcmp (l->string, s); - if (cmp == 0) - return 1; - else if (cmp > 0) /* the list is ordered! */ - return 0; - l = l->next; - } + for (; l; l = l->next) + if (!strcmp (l->string, s)) + return 1; return 0; } /* Free the whole slist. */ void -free_slist (slist *l) +slist_free (slist *l) { slist *n; @@ -903,6 +974,58 @@ free_slist (slist *l) l = n; } } + +/* Sometimes it's useful to create "sets" of strings, i.e. special + hash tables where you want to store strings as keys and merely + query for their existence. Here is a set of utility routines that + makes that transparent. */ + +void +string_set_add (struct hash_table *ht, const char *s) +{ + /* We use "1" as value. It provides us a useful and clear arbitrary + value, and it consumes no memory -- the pointers to the same + string "1" will be shared by all the key-value pairs in the hash + table. 
*/ + hash_table_put (ht, xstrdup (s), "1"); +} + +int +string_set_exists (struct hash_table *ht, const char *s) +{ + return hash_table_exists (ht, s); +} + +static int +string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored) +{ + free (key); + return 0; +} + +void +string_set_free (struct hash_table *ht) +{ + hash_table_map (ht, string_set_free_mapper, NULL); + hash_table_destroy (ht); +} + +static int +free_keys_and_values_mapper (void *key, void *value, void *arg_ignored) +{ + free (key); + free (value); + return 0; +} + +/* Another utility function: call free() on all keys and values of HT. */ + +void +free_keys_and_values (struct hash_table *ht) +{ + hash_table_map (ht, free_keys_and_values_mapper, NULL); +} + /* Engine for legible and legible_long_long; this function works on strings. */ diff --git a/src/utils.h b/src/utils.h index eb3c99cc..c9de38bb 100644 --- a/src/utils.h +++ b/src/utils.h @@ -20,11 +20,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef UTILS_H #define UTILS_H -/* Flags for slist. 
*/ -enum { - NOSORT = 1 -}; - enum accd { ALLABS = 1 }; @@ -36,6 +31,14 @@ typedef struct _slist struct _slist *next; } slist; +struct hash_table; + +struct file_memory { + char *content; + long length; + int mmap_p; +}; + char *time_str PARAMS ((time_t *)); const char *uerrmsg PARAMS ((uerr_t)); @@ -58,13 +61,19 @@ int accdir PARAMS ((const char *s, enum accd)); char *suffix PARAMS ((const char *s)); char *read_whole_line PARAMS ((FILE *)); -void load_file PARAMS ((FILE *, char **, long *)); +struct file_memory *read_file PARAMS ((const char *)); +void read_file_free PARAMS ((struct file_memory *)); void free_vec PARAMS ((char **)); char **merge_vecs PARAMS ((char **, char **)); -slist *add_slist PARAMS ((slist *, const char *, int)); -int in_slist PARAMS ((slist *, const char *)); -void free_slist PARAMS ((slist *)); +slist *slist_append PARAMS ((slist *, const char *)); +int slist_contains PARAMS ((slist *, const char *)); +void slist_free PARAMS ((slist *)); + +void string_set_add PARAMS ((struct hash_table *, const char *)); +int string_set_exists PARAMS ((struct hash_table *, const char *)); +void string_set_free PARAMS ((struct hash_table *)); +void free_keys_and_values PARAMS ((struct hash_table *)); char *legible PARAMS ((long)); char *legible_very_long PARAMS ((VERY_LONG_TYPE)); diff --git a/src/wget.h b/src/wget.h index 196f913c..718ee0b1 100644 --- a/src/wget.h +++ b/src/wget.h @@ -71,7 +71,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* Print X if debugging is enabled; a no-op otherwise. */ #ifdef DEBUG -# define DEBUGP(x) do { debug_logprintf x; } while (0) +# define DEBUGP(x) do { if (opt.debug) { debug_logprintf x; } } while (0) #else /* not DEBUG */ # define DEBUGP(x) DO_NOTHING #endif /* not DEBUG */