[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  Very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2000-11-19 12:50:10 -08:00
parent ccf31643ab
commit b0b1c815c1
39 changed files with 3518 additions and 901 deletions

View File

@ -1,3 +1,7 @@
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* configure.in: Test for MMAP.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* windows/config.h.ms: snprintf and vsnprintf exist under Windows.

12
TODO
View File

@ -49,15 +49,6 @@ changes.
* Make `-k' check for files that were downloaded in the past and convert links
to them in newly-downloaded documents.
* -k should convert relative references to absolute if not downloaded.
* -k should convert "hostless absolute" URLs, like <A HREF="/index.html">.
However, Brian McMahon <bm@iucr.org> wants the old incorrect behavior to still
be available as an option, as he depends on it to allow mirrors of his site to
send CGI queries to his original site, but still get graphics off of the
mirror site. Perhaps this would be better dealt with by adding an option to
tell -k not to convert certain URL patterns?
* Add option to clobber existing file names (no `.N' suffixes).
* Introduce a concept of "boolean" options. For instance, every
@ -85,9 +76,6 @@ changes.
* Allow size limit to files (perhaps with an option to download oversize files
up through the limit or not at all, to get more functionality than [u]limit).
* Recognize HTML comments correctly. Add more options for handling
bogus HTML found all over the 'net.
* Implement breadth-first retrieval.
* Download to .in* when mirroring.

350
configure vendored
View File

@ -2040,15 +2040,55 @@ EOF
fi
for ac_func in strdup strstr strcasecmp strncasecmp
for ac_hdr in unistd.h
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
echo "configure:2048: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2053 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
eval "ac_cv_header_$ac_safe=yes"
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_header_$ac_safe=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
cat >> confdefs.h <<EOF
#define $ac_tr_hdr 1
EOF
else
echo "$ac_t""no" 1>&6
fi
done
for ac_func in getpagesize
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2047: checking for $ac_func" >&5
echo "configure:2087: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2052 "configure"
#line 2092 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2071,7 +2111,233 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_func_$ac_func=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
cat >> confdefs.h <<EOF
#define $ac_tr_func 1
EOF
else
echo "$ac_t""no" 1>&6
fi
done
echo $ac_n "checking for working mmap""... $ac_c" 1>&6
echo "configure:2140: checking for working mmap" >&5
if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test "$cross_compiling" = yes; then
ac_cv_func_mmap_fixed_mapped=no
else
cat > conftest.$ac_ext <<EOF
#line 2148 "configure"
#include "confdefs.h"
/* Thanks to Mike Haertel and Jim Avera for this test.
Here is a matrix of mmap possibilities:
mmap private not fixed
mmap private fixed at somewhere currently unmapped
mmap private fixed at somewhere already mapped
mmap shared not fixed
mmap shared fixed at somewhere currently unmapped
mmap shared fixed at somewhere already mapped
For private mappings, we should verify that changes cannot be read()
back from the file, nor mmap's back from the file at a different
address. (There have been systems where private was not correctly
implemented like the infamous i386 svr4.0, and systems where the
VM page cache was not coherent with the filesystem buffer cache
like early versions of FreeBSD and possibly contemporary NetBSD.)
For shared mappings, we should conversely verify that changes get
propagated back to all the places they're supposed to be.
Grep wants private fixed already mapped.
The main things grep needs to know about mmap are:
* does it exist and is it safe to write into the mmap'd area
* how to use it (BSD variants) */
#include <sys/types.h>
#include <fcntl.h>
#include <sys/mman.h>
/* This mess was copied from the GNU getpagesize.h. */
#ifndef HAVE_GETPAGESIZE
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
/* Assume that all systems that can run configure have sys/param.h. */
# ifndef HAVE_SYS_PARAM_H
# define HAVE_SYS_PARAM_H 1
# endif
# ifdef _SC_PAGESIZE
# define getpagesize() sysconf(_SC_PAGESIZE)
# else /* no _SC_PAGESIZE */
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# ifdef EXEC_PAGESIZE
# define getpagesize() EXEC_PAGESIZE
# else /* no EXEC_PAGESIZE */
# ifdef NBPG
# define getpagesize() NBPG * CLSIZE
# ifndef CLSIZE
# define CLSIZE 1
# endif /* no CLSIZE */
# else /* no NBPG */
# ifdef NBPC
# define getpagesize() NBPC
# else /* no NBPC */
# ifdef PAGESIZE
# define getpagesize() PAGESIZE
# endif /* PAGESIZE */
# endif /* no NBPC */
# endif /* no NBPG */
# endif /* no EXEC_PAGESIZE */
# else /* no HAVE_SYS_PARAM_H */
# define getpagesize() 8192 /* punt totally */
# endif /* no HAVE_SYS_PARAM_H */
# endif /* no _SC_PAGESIZE */
#endif /* no HAVE_GETPAGESIZE */
#ifdef __cplusplus
extern "C" { void *malloc(unsigned); }
#else
char *malloc();
#endif
int
main()
{
char *data, *data2, *data3;
int i, pagesize;
int fd;
pagesize = getpagesize();
/*
* First, make a file with some known garbage in it.
*/
data = malloc(pagesize);
if (!data)
exit(1);
for (i = 0; i < pagesize; ++i)
*(data + i) = rand();
umask(0);
fd = creat("conftestmmap", 0600);
if (fd < 0)
exit(1);
if (write(fd, data, pagesize) != pagesize)
exit(1);
close(fd);
/*
* Next, try to mmap the file at a fixed address which
* already has something else allocated at it. If we can,
* also make sure that we see the same garbage.
*/
fd = open("conftestmmap", O_RDWR);
if (fd < 0)
exit(1);
data2 = malloc(2 * pagesize);
if (!data2)
exit(1);
data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1);
if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0L))
exit(1);
for (i = 0; i < pagesize; ++i)
if (*(data + i) != *(data2 + i))
exit(1);
/*
* Finally, make sure that changes to the mapped area
* do not percolate back to the file as seen by read().
* (This is a bug on some variants of i386 svr4.0.)
*/
for (i = 0; i < pagesize; ++i)
*(data2 + i) = *(data2 + i) + 1;
data3 = malloc(pagesize);
if (!data3)
exit(1);
if (read(fd, data3, pagesize) != pagesize)
exit(1);
for (i = 0; i < pagesize; ++i)
if (*(data + i) != *(data3 + i))
exit(1);
close(fd);
unlink("conftestmmap");
exit(0);
}
EOF
if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
ac_cv_func_mmap_fixed_mapped=yes
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -fr conftest*
ac_cv_func_mmap_fixed_mapped=no
fi
rm -fr conftest*
fi
fi
echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6
if test $ac_cv_func_mmap_fixed_mapped = yes; then
cat >> confdefs.h <<\EOF
#define HAVE_MMAP 1
EOF
fi
for ac_func in strdup strstr strcasecmp strncasecmp
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2313: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2318 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
#include <assert.h>
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
builtin and then its argument prototype would still apply. */
char $ac_func();
int main() {
/* The GNU C library defines this for functions which it implements
to always fail with ENOSYS. Some functions are actually named
something starting with __ and the normal name is an alias. */
#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
choke me
#else
$ac_func();
#endif
; return 0; }
EOF
if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2098,12 +2364,12 @@ done
for ac_func in gettimeofday mktime strptime
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2102: checking for $ac_func" >&5
echo "configure:2368: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2107 "configure"
#line 2373 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2126,7 +2392,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2130: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2153,12 +2419,12 @@ done
for ac_func in strerror snprintf vsnprintf select signal symlink access isatty
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2157: checking for $ac_func" >&5
echo "configure:2423: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2162 "configure"
#line 2428 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2181,7 +2447,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2185: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2208,12 +2474,12 @@ done
for ac_func in uname gethostname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2212: checking for $ac_func" >&5
echo "configure:2478: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2217 "configure"
#line 2483 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2236,7 +2502,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2264,12 +2530,12 @@ done
for ac_func in gethostbyname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2268: checking for $ac_func" >&5
echo "configure:2534: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2273 "configure"
#line 2539 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2292,7 +2558,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2314,7 +2580,7 @@ EOF
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
echo "configure:2318: checking for gethostbyname in -lnsl" >&5
echo "configure:2584: checking for gethostbyname in -lnsl" >&5
ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2322,7 +2588,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lnsl $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2326 "configure"
#line 2592 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2333,7 +2599,7 @@ int main() {
gethostbyname()
; return 0; }
EOF
if { (eval echo configure:2337: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2367,7 +2633,7 @@ done
echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6
echo "configure:2371: checking for socket in -lsocket" >&5
echo "configure:2637: checking for socket in -lsocket" >&5
ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2375,7 +2641,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lsocket $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2379 "configure"
#line 2645 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2386,7 +2652,7 @@ int main() {
socket()
; return 0; }
EOF
if { (eval echo configure:2390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2417,7 +2683,7 @@ fi
if test "x${with_socks}" = xyes
then
echo $ac_n "checking for main in -lresolv""... $ac_c" 1>&6
echo "configure:2421: checking for main in -lresolv" >&5
echo "configure:2687: checking for main in -lresolv" >&5
ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2425,14 +2691,14 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lresolv $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2429 "configure"
#line 2695 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
if { (eval echo configure:2436: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2460,7 +2726,7 @@ else
fi
echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6
echo "configure:2464: checking for Rconnect in -lsocks" >&5
echo "configure:2730: checking for Rconnect in -lsocks" >&5
ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2468,7 +2734,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lsocks $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2472 "configure"
#line 2738 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2479,7 +2745,7 @@ int main() {
Rconnect()
; return 0; }
EOF
if { (eval echo configure:2483: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2511,7 +2777,7 @@ fi
ALL_LINGUAS="cs de hr it no pl pt_BR ru"
echo $ac_n "checking whether NLS is requested""... $ac_c" 1>&6
echo "configure:2515: checking whether NLS is requested" >&5
echo "configure:2781: checking whether NLS is requested" >&5
# Check whether --enable-nls or --disable-nls was given.
if test "${enable_nls+set}" = set; then
enableval="$enable_nls"
@ -2528,7 +2794,7 @@ fi
# Extract the first word of "msgfmt", so it can be a program name with args.
set dummy msgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2532: checking for $ac_word" >&5
echo "configure:2798: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2562,7 +2828,7 @@ fi
# Extract the first word of "xgettext", so it can be a program name with args.
set dummy xgettext; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2566: checking for $ac_word" >&5
echo "configure:2832: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2597,7 +2863,7 @@ fi
# Extract the first word of "gmsgfmt", so it can be a program name with args.
set dummy gmsgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2601: checking for $ac_word" >&5
echo "configure:2867: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@ -2647,17 +2913,17 @@ fi
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
echo "configure:2651: checking for $ac_hdr" >&5
echo "configure:2917: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2656 "configure"
#line 2922 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
@ -2687,12 +2953,12 @@ done
for ac_func in gettext
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:2691: checking for $ac_func" >&5
echo "configure:2957: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
#line 2696 "configure"
#line 2962 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@ -2715,7 +2981,7 @@ $ac_func();
; return 0; }
EOF
if { (eval echo configure:2719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@ -2737,7 +3003,7 @@ EOF
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gettext in -lintl""... $ac_c" 1>&6
echo "configure:2741: checking for gettext in -lintl" >&5
echo "configure:3007: checking for gettext in -lintl" >&5
ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
@ -2745,7 +3011,7 @@ else
ac_save_LIBS="$LIBS"
LIBS="-lintl $LIBS"
cat > conftest.$ac_ext <<EOF
#line 2749 "configure"
#line 3015 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
@ -2756,7 +3022,7 @@ int main() {
gettext()
; return 0; }
EOF
if { (eval echo configure:2760: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
@ -2824,7 +3090,7 @@ do
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:2828: checking for $ac_word" >&5
echo "configure:3094: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else

View File

@ -160,6 +160,7 @@ dnl
dnl Checks for library functions.
dnl
AC_FUNC_ALLOCA
AC_FUNC_MMAP
AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp)
AC_CHECK_FUNCS(gettimeofday mktime strptime)
AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)

View File

@ -1,3 +1,8 @@
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
* wget.texi (Robots): Document that we now support the meta tag
exclusion.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* wget.texi: Use --- consistently.

View File

@ -2548,8 +2548,8 @@ this:
This is explained in some detail at
@url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}.
Unfortunately, Wget does not support this method of robot exclusion yet,
but it will be implemented in the next release.
Wget supports this method of robot exclusion in addition to the usual
@file{/robots.txt} exclusion.
@node Security Considerations, Contributors, Robots, Appendices
@section Security Considerations

BIN
po/cs.gmo

Binary file not shown.

BIN
po/de.gmo

Binary file not shown.

BIN
po/hr.gmo

Binary file not shown.

BIN
po/it.gmo

Binary file not shown.

BIN
po/no.gmo

Binary file not shown.

BIN
po/pl.gmo

Binary file not shown.

Binary file not shown.

BIN
po/ru.gmo

Binary file not shown.

View File

@ -1,3 +1,117 @@
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* retr.c (get_contents): If use_expected, make sure that the
appropriate amount of data is being read.
* http.c (gethttp): Check for both `Keep-Alive: ...' and
`Connection: Keep-Alive'.
* wget.h (DEBUGP): Call debug_logprintf only if opt.debug is
turned on.
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* http.c (connection_available_p): Use it.
* connect.c (test_socket_open): New function.
* http.c (gethttp): Support persistent connections. Based on the
ideas, and partly on code, by Sam Horrocks <sam@daemoninc.com>.
(register_persistent): New function.
(connection_available_p): Ditto.
(invalidate_connection): Ditto.
2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Handle UREL2ABS case.
* recur.c (recursive_retrieve): Instead of the list
urls_downloaded, use hash tables dl_file_url_map and
dl_url_file_map.
(convert_all_links): Use them to retrieve data.
* host.c (clean_hosts): Free the hash tables.
* main.c (private_initialize): Call host_init().
* host.c (store_hostaddress): Use a saner, hash table-based data
model.
(realhost): Ditto.
(host_init): Initialize the hash tables.
2000-11-18 Hrvoje Niksic <hniksic@arsdigita.com>
* utils.c (slist_append): Eviscerate NOSORT. Hash tables are now
used for what the sorted slists used to be used for.
(slist_contains): Don't rely on the list being sorted.
(slist_append): Simplify the code.
* recur.c (recursive_cleanup): Use free_string_set.
* utils.c (string_set_add, string_set_exists, string_set_free):
New functions for easier freeing of hash tables whose keys are
strdup'ed strings.
* recur.c (recursive_retrieve): Use the hash table functions for
storing undesirable URLs.
* hash.c: New file.
2000-11-17 Hrvoje Niksic <hniksic@arsdigita.com>
* main.c (private_initialize): Call url_init.
(main): Call private_initialize.
* url.c (unsafe_char_table): New table.
(UNSAFE_CHAR): Use it.
(init_unsafe_char_table): New function.
(url_init): New function; call init_unsafe_char_table.
2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
* html-url.c (handle_link): Handle HTML fragment identifiers.
* recur.c (recursive_retrieve): If norobot info is respected and
the file is specified not to be followed by robots, respect that.
* html-url.c (collect_tags_mapper): Handle <meta name=robots
content=X>. For us the important cases are where X is NONE or
where X contains NOFOLLOW.
(get_urls_html): Propagate that information to the caller.
2000-11-13 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Unlink the file we might be reading from
before writing to it.
(convert_links): Use alloca instead of malloc for
filename_plus_orig_suffix.
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (get_urls_file): Ditto.
(convert_links): Ditto.
* html-url.c (get_urls_html): Use read_file() instead of
load_file().
* utils.c (read_file): New function, instead of the old
load_file().
(read_file_free): Ditto.
* url.c (findurl): Search only for the supported protocols.
(convert_links): Use fwrite() when writing out a region of
characters.
2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
* ftp-ls.c: Move html_quote_string and ftp_index here.
* url.c: Remove get_urls_html, since that's now in html-url.c.
* html-url.c: New file.
* html-parse.c: New file.
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* mswindows.h: Define snprintf and vsnprintf to _snprintf and

View File

@ -57,9 +57,10 @@ MD5_OBJ = @MD5_OBJ@
OPIE_OBJ = @OPIE_OBJ@
OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \
ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \
http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \
recur$o retr$o snprintf$o url$o utils$o version$o
ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \
html-parse$o html-url$o http$o init$o log$o main$o \
$(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \
url$o utils$o version$o
.SUFFIXES:
.SUFFIXES: .c .o ._c ._o
@ -133,26 +134,31 @@ TAGS: *.c *.h
# DO NOT DELETE THIS LINE -- make depend depends on it.
cmpt$o: config.h wget.h sysdep.h options.h
connect$o: config.h wget.h sysdep.h options.h connect.h host.h
fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h
ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h
ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h
ftp-opie$o: config.h wget.h sysdep.h options.h md5.h
ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h
getopt$o: wget.h sysdep.h options.h
headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h
host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h
html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h
http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h
init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h
log$o: config.h wget.h sysdep.h options.h utils.h
main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h
md5$o: wget.h sysdep.h options.h md5.h
mswindows$o: config.h winsock.h wget.h sysdep.h options.h url.h
netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h
rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h
recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h
retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h
url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h
utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h
cmpt$o: wget.h
connect$o: wget.h connect.h host.h
fnmatch$o: wget.h fnmatch.h
ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h
ftp-ls$o: wget.h utils.h ftp.h url.h
ftp-opie$o: wget.h md5.h
ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h
getopt$o: wget.h getopt.h
hash$o: wget.h utils.h hash.h
headers$o: wget.h connect.h rbuf.h headers.h
host$o: wget.h utils.h host.h url.h hash.h
html-parse$o: wget.h html-parse.h
html-url$o: wget.h html-parse.h url.h utils.h
html$o: wget.h url.h utils.h ftp.h
http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h
init$o: wget.h utils.h init.h host.h recur.h netrc.h
log$o: wget.h utils.h
main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h
md5$o: wget.h md5.h
mswindows$o: wget.h url.h
netrc$o: wget.h utils.h netrc.h init.h
rbuf$o: wget.h rbuf.h connect.h
recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h
retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h
snprintf$o:
url$o: wget.h utils.h url.h host.h
utils$o: wget.h utils.h fnmatch.h hash.h
version$o:

View File

@ -101,6 +101,9 @@ char *alloca ();
/* Define if you have the uname function. */
#undef HAVE_UNAME
/* Define if you have a working version of mmap. */
#undef HAVE_MMAP
/* Define if you have the gethostname function. */
#undef HAVE_GETHOSTNAME

View File

@ -107,6 +107,37 @@ make_connection (int *sock, char *hostname, unsigned short port)
return NOCONERROR;
}
/* Probe whether the connection on SOCK is still usable, i.e. has not
   been closed by the peer.  Returns 1 if the socket looks alive, 0
   otherwise.  Without select() we have no cheap way to tell, so we
   optimistically report the connection as open.  */
int
test_socket_open (int sock)
{
#ifdef HAVE_SELECT
  fd_set readfds;
  struct timeval timeout;

  /* Technique from Andrew Maholski's code in the Unix Socket FAQ:
     poll the descriptor for readability with a near-zero timeout.
     A live, idle connection has nothing to read, so select() times
     out; a closed connection reports EOF as "readable".  */
  FD_ZERO (&readfds);
  FD_SET (sock, &readfds);

  /* Wait one microsecond.  */
  timeout.tv_sec = 0;
  timeout.tv_usec = 1;

  /* Timeout (return value 0) means still connected; anything else
     (readable EOF or error) means the connection is gone.  */
  if (select (sock + 1, &readfds, NULL, NULL, &timeout) == 0)
    return 1;
  else
    return 0;
#else  /* not HAVE_SELECT */
  /* Without select, it's hard to know for sure.  */
  return 1;
#endif /* not HAVE_SELECT */
}
/* Bind the local port PORT. This does all the necessary work, which
is creating a socket, setting SO_REUSEADDR option on it, then
calling bind() and listen(). If *PORT is 0, a random port is

View File

@ -36,6 +36,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "wget.h"
#include "utils.h"
#include "ftp.h"
#include "url.h"
/* Converts symbolic permissions to number-style ones, e.g. string
rwxr-xr-x to 755. For now, it knows nothing of
@ -388,3 +389,175 @@ ftp_parse_ls (const char *file)
{
return ftp_parse_unix_ls (file);
}
/* Stuff for creating FTP index. */
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
entities, as per RFC1866:
`&' -> `&amp;'
`<' -> `&lt;'
`>' -> `&gt;'
`"' -> `&quot;'
No other entities are recognized or replaced. */
static char *
html_quote_string (const char *s)
{
const char *b = s;
char *p, *res;
int i;
/* Pass through the string, and count the new size. */
for (i = 0; *s; s++, i++)
{
if (*s == '&')
i += 4; /* `amp;' */
else if (*s == '<' || *s == '>')
i += 3; /* `lt;' and `gt;' */
else if (*s == '\"')
i += 5; /* `quot;' */
}
res = (char *)xmalloc (i + 1);
s = b;
for (p = res; *s; s++)
{
switch (*s)
{
case '&':
*p++ = '&';
*p++ = 'a';
*p++ = 'm';
*p++ = 'p';
*p++ = ';';
break;
case '<': case '>':
*p++ = '&';
*p++ = (*s == '<' ? 'l' : 'g');
*p++ = 't';
*p++ = ';';
break;
case '\"':
*p++ = '&';
*p++ = 'q';
*p++ = 'u';
*p++ = 'o';
*p++ = 't';
*p++ = ';';
break;
default:
*p++ = *s;
}
}
*p = '\0';
return res;
}
/* The function creates an HTML index containing references to given
directories and files on the appropriate host. The references are
FTP. */
/* Create an HTML index page listing the files and directories F (a
   linked list of `struct fileinfo', as produced by ftp_parse_ls())
   found on the FTP server described by U.  Output goes to FILE, or
   to opt.dfp when the user redirected all document output there.
   Each entry becomes an FTP link back to the server, so the index
   can be browsed.  Returns FTPOK on success, or FOPENERR if FILE
   cannot be opened for writing. */
uerr_t
ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
{
  FILE *fp;
  char *upwd;
  char *htclfile;		/* HTML-clean file name */

  /* Open our own output file only when documents are not being
     funneled into opt.dfp. */
  if (!opt.dfp)
    {
      fp = fopen (file, "wb");
      if (!fp)
	{
	  logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
	  return FOPENERR;
	}
    }
  else
    fp = opt.dfp;
  /* Build the `user[:password]@' prefix used in every link, with
     both components URL-cleaned; an empty string when the URL
     carried no user. */
  if (u->user)
    {
      char *tmpu, *tmpp;        /* temporary, clean user and passwd */

      tmpu = CLEANDUP (u->user);
      tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
      upwd = (char *)xmalloc (strlen (tmpu)
			      + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
      sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
      free (tmpu);
      FREE_MAYBE (tmpp);
    }
  else
    upwd = xstrdup ("");
  /* Page header: title repeats in <title> and <h1>. */
  fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
  fprintf (fp, "<html>\n<head>\n<title>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</h1>\n<hr>\n<pre>\n");
  /* One <pre> line per directory entry: timestamp, type, link. */
  while (f)
    {
      fprintf (fp, " ");
      if (f->tstamp != -1)
	{
	  /* #### Should we translate the months? */
	  static char *months[] = {
	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
	  };
	  /* NOTE(review): casts &f->tstamp to time_t * -- assumes the
	     tstamp field has a time_t-compatible representation;
	     confirm against struct fileinfo's declaration. */
	  struct tm *ptm = localtime ((time_t *)&f->tstamp);

	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
		   ptm->tm_mday);
	  /* An hour of zero is taken to mean "no time information". */
	  if (ptm->tm_hour)
	    fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
	  else
	    fprintf (fp, " ");
	}
      else
	fprintf (fp, _("time unknown "));
      switch (f->type)
	{
	case FT_PLAINFILE:
	  fprintf (fp, _("File "));
	  break;
	case FT_DIRECTORY:
	  fprintf (fp, _("Directory "));
	  break;
	case FT_SYMLINK:
	  fprintf (fp, _("Link "));
	  break;
	default:
	  fprintf (fp, _("Not sure "));
	  break;
	}
      /* Emit the link: ftp://[user[:pass]@]host:port/dir/file with
	 the file name HTML-quoted.  Directories get a trailing slash
	 in both the href and the link text. */
      htclfile = html_quote_string (f->name);
      fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
      if (*u->dir != '/')
	putc ('/', fp);
      fprintf (fp, "%s", u->dir);
      if (*u->dir)
	putc ('/', fp);
      fprintf (fp, "%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "\">%s", htclfile);
      if (f->type == FT_DIRECTORY)
	putc ('/', fp);
      fprintf (fp, "</a> ");
      /* Trailing annotation: size for plain files, target for links. */
      if (f->type == FT_PLAINFILE)
	fprintf (fp, _(" (%s bytes)"), legible (f->size));
      else if (f->type == FT_SYMLINK)
	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
      putc ('\n', fp);
      free (htclfile);
      f = f->next;
    }
  fprintf (fp, "</pre>\n</body>\n</html>\n");
  free (upwd);
  /* Close only a stream we opened ourselves; opt.dfp belongs to the
     caller, so merely flush it. */
  if (!opt.dfp)
    fclose (fp);
  else
    fflush (fp);
  return FTPOK;
}

View File

@ -40,7 +40,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "rbuf.h"
#include "retr.h"
#include "ftp.h"
#include "html.h"
#include "connect.h"
#include "host.h"
#include "fnmatch.h"
@ -722,7 +721,7 @@ Error in server response, closing control connection.\n"));
}
reset_timer ();
/* Get the contents of the document. */
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf);
res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0);
con->dltime = elapsed_time ();
tms = time_str (NULL);
tmrate = rate (*len - restval, con->dltime);

View File

@ -92,4 +92,6 @@ typedef struct
struct fileinfo *ftp_parse_ls PARAMS ((const char *));
uerr_t ftp_loop PARAMS ((struct urlinfo *, int *));
uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *);
#endif /* FTP_H */

403
src/hash.c Normal file
View File

@ -0,0 +1,403 @@
/* Hash tables.
Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdlib.h>
#include <assert.h>
#include "wget.h"
#include "utils.h"
#include "hash.h"
#ifdef STANDALONE
# define xmalloc malloc
# define xrealloc realloc
#endif
/* This file implements simple hash tables based on linear probing.
The hash table stores key-value pairs in a contiguous array. Both
key and value are void pointers that the hash and test functions
know how to handle.
Although Knuth & co. recommend double hashing over linear probing,
we use the latter because it accesses array elements sequentially
in case of a collision, yielding in better cache behaviour and
ultimately in better speed. To avoid collision problems with
linear probing, we make sure that the table grows as soon as the
fullness/size ratio exceeds 75%. */
/* One slot of the table: the caller-supplied key and value pointers.
   The key field doubles as the slot's state marker -- see
   EMPTY_ENTRY_P and DELETED_ENTRY_P below. */
struct ht_pair {
  void *key;
  void *value;
};

struct hash_table {
  unsigned long (*hash_function) (const void *); /* key -> hash value */
  int (*test_function) (const void *, const void *); /* key equality test */
  int size;			/* size of the array */
  int fullness;			/* number of non-empty fields */
  int count;			/* number of non-empty, non-deleted
				   fields. */
  struct ht_pair *pairs;	/* the slot array itself */
};

/* Tombstone marker for removed entries.  Such slots still count
   towards fullness so probe chains stay intact until the table is
   regrown.  NOTE(review): relies on 0xdeadbeef never colliding with
   a real key pointer -- true in practice, but the integer-to-pointer
   conversion is implementation-defined. */
#define ENTRY_DELETED ((void *)0xdeadbeef)

#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL)
/* Find a prime near, but greater than or equal to, SIZE.  Falls back
   to returning SIZE itself for requests beyond the largest table
   entry.  The loop index is a size_t and the comparison against SIZE
   is made explicit, so no implicit signed/unsigned conversion
   warnings arise; note that the largest entries do not fit in an
   int, so such (impractically large) results are truncated exactly
   as before. */
int
prime_size (int size)
{
  static const unsigned long primes [] = {
    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
    1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
  };
  size_t i;

  for (i = 0; i < sizeof (primes) / sizeof (primes[0]); i++)
    if (primes[i] >= (unsigned long) size)
      return primes[i];
  /* huh? */
  return size;
}
/* Allocate and return a new hash table using HASH_FUNCTION to hash
   keys and TEST_FUNCTION to compare them.  INITIAL_SIZE is rounded
   up to a prime; pass 0 to start with the smallest table, which is
   regrown on demand. */
struct hash_table *
hash_table_new (int initial_size,
		unsigned long (*hash_function) (const void *),
		int (*test_function) (const void *, const void *))
{
  struct hash_table *ht;

  ht = (struct hash_table *)xmalloc (sizeof (struct hash_table));
  ht->hash_function = hash_function;
  ht->test_function = test_function;
  ht->fullness = 0;
  ht->count = 0;
  ht->size = prime_size (initial_size);
  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
  return ht;
}
/* Free the table HT itself.  The keys and values stored in it are
   NOT freed; they remain the caller's responsibility. */
void
hash_table_destroy (struct hash_table *ht)
{
  struct ht_pair *slots = ht->pairs;

  free (slots);
  free (ht);
}
/* Look up KEY in HT and return the associated value, or NULL when
   the key is absent.  Since NULL is also a legal stored value, use
   hash_table_exists() when you need to distinguish "missing" from
   "stored NULL". */
void *
hash_table_get (struct hash_table *ht, const void *key)
{
  int i = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* An empty slot terminates the probe chain: KEY is absent. */
      if (EMPTY_ENTRY_P (slot->key))
	return NULL;

      /* A live (non-tombstone) slot with a matching key is a hit. */
      if (!DELETED_ENTRY_P (slot->key)
	  && ht->test_function (key, slot->key))
	return slot->value;

      /* Otherwise continue the linear probe, wrapping around. */
      if (++i == ht->size)
	i = 0;
    }
}
/* Return 1 if KEY is present in HT, 0 otherwise.  Unlike
   hash_table_get(), this can tell a stored NULL value apart from a
   missing key. */
int
hash_table_exists (struct hash_table *ht, const void *key)
{
  int i = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* Empty slot: end of the probe chain, KEY is not here. */
      if (EMPTY_ENTRY_P (slot->key))
	return 0;

      /* Live slot with a matching key: found it. */
      if (!DELETED_ENTRY_P (slot->key)
	  && ht->test_function (key, slot->key))
	return 1;

      /* Keep probing linearly, wrapping at the end of the array. */
      if (++i == ht->size)
	i = 0;
    }
}
#define MAX(i, j) (((i) >= (j)) ? (i) : (j))

/* Resize HT as needed and rehash every live key-value pair into the
   new slot array.  Tombstones are dropped in the process, so a table
   bloated by deletions may simply be rehashed at its current size
   rather than enlarged -- the target size is derived from the live
   entry count, and MAX prevents actual shrinking. */
static void
grow_hash_table (struct hash_table *ht)
{
  struct ht_pair *old_slots = ht->pairs;
  int old_size = ht->size;
  int old_count = ht->count;	/* for the sanity check below */
  int wanted;
  int i;

  /* The usual policy would be to double the size and round up to a
     prime:
       ht->size = prime_size (ht->size * 2);
     but basing the target on ht->count (live entries) instead keeps
     a deletion-heavy table from growing blindly. */
  wanted = prime_size (ht->count * 2);
  ht->size = MAX (old_size, wanted);

  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));

  /* hash_table_put below rebuilds both counters as it reinserts. */
  ht->fullness = 0;
  ht->count = 0;

  for (i = 0; i < old_size; i++)
    {
      struct ht_pair *slot = old_slots + i;

      if (EMPTY_ENTRY_P (slot->key) || DELETED_ENTRY_P (slot->key))
	continue;
      hash_table_put (ht, slot->key, slot->value);
    }
  assert (ht->count == old_count);
  free (old_slots);
}
/* Put VALUE in the hash table HT under the key KEY.  This regrows the
   table if necessary.

   NOTE(review): when KEY compares equal to an already-stored key, the
   stored key *pointer* is overwritten with the new KEY and the old
   pointer is dropped -- if callers pass freshly allocated keys, the
   previous allocation is leaked; confirm callers' ownership
   expectations. */
void
hash_table_put (struct hash_table *ht, const void *key, void *value)
{
  int location = ht->hash_function (key) % ht->size;
  while (1)
    {
      struct ht_pair *the_pair = ht->pairs + location;
      if (EMPTY_ENTRY_P (the_pair->key))
	{
	  /* Fresh slot: both occupancy counters grow. */
	  ++ht->fullness;
	  ++ht->count;
	just_insert:
	  the_pair->key = (void *)key; /* const? */
	  the_pair->value = value;
	  break;
	}
      else if (DELETED_ENTRY_P (the_pair->key))
	{
	  /* We're replacing a deleted entry, so ht->count gets
	     increased, but ht->fullness remains unchanged. */
	  ++ht->count;
	  goto just_insert;
	}
      else if (ht->test_function (key, the_pair->key))
	{
	  /* We're replacing an existing entry, so ht->count and
	     ht->fullness remain unchanged. */
	  goto just_insert;
	}
      else
	{
	  /* Collision: continue the linear probe, wrapping around. */
	  ++location;
	  if (location == ht->size)
	    location = 0;
	}
    }
  if (ht->fullness * 4 > ht->size * 3)
    /* When fullness exceeds 75% of size, regrow the table. */
    grow_hash_table (ht);
}
/* Remove KEY from HT.  Returns 1 if the key was found and removed,
   0 if it was not in the table.  The key and value pointers are not
   freed -- they still belong to the caller. */
int
hash_table_remove (struct hash_table *ht, const void *key)
{
  int location = ht->hash_function (key) % ht->size;
  while (1)
    {
      struct ht_pair *the_pair = ht->pairs + location;
      if (EMPTY_ENTRY_P (the_pair->key))
	/* End of the probe chain: KEY is not in the table. */
	return 0;
      else if (DELETED_ENTRY_P (the_pair->key)
	       || !ht->test_function (key, the_pair->key))
	{
	  /* Not a match: continue the linear probe, wrapping. */
	  ++location;
	  if (location == ht->size)
	    location = 0;
	}
      else
	{
	  /* We don't really remove an entry from the hash table: we
	     just mark it as deleted.  This is because there may be
	     other entries located after this entry whose hash number
	     points to a location before this entry.  (Example: keys
	     A, B and C have the same hash.  If you were to really
	     *delete* B from the table, C could no longer be found.)

	     As an optimization, it might be worthwhile to check
	     whether the immediately preceding entry is empty and, if
	     so, really delete the pair (set it to empty and decrease
	     the fullness along with the count).  I *think* it should
	     be safe. */
	  the_pair->key = ENTRY_DELETED;
	  --ht->count;		/* fullness stays: the tombstone still
				   occupies its slot. */
	  return 1;
	}
    }
}
/* Empty HT: drop every entry (keys and values are not freed) and
   reset both occupancy counters.  The table keeps its current size. */
void
hash_table_clear (struct hash_table *ht)
{
  ht->fullness = 0;
  ht->count = 0;
  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
}
/* Invoke MAPFUN (key, value, CLOSURE) for every live entry in HT.
   Iteration stops early as soon as MAPFUN returns non-zero. */
void
hash_table_map (struct hash_table *ht,
		int (*mapfun) (void *, void *, void *),
		void *closure)
{
  int i;

  for (i = 0; i < ht->size; i++)
    {
      struct ht_pair *slot = ht->pairs + i;

      /* Skip empty slots and tombstones. */
      if (EMPTY_ENTRY_P (slot->key) || DELETED_ENTRY_P (slot->key))
	continue;
      if (mapfun (slot->key, slot->value, closure))
	return;
    }
}
/* Support for hash tables whose keys are strings. */

/* Hash a NUL-terminated string pointed to by SV.  This is the
   classic hashpjw-style function, supposedly from the Dragon Book,
   p. 436: shift in each byte, then fold the top nibble back in. */
unsigned long
string_hash (const void *sv)
{
  unsigned int hash = 0;
  const unsigned char *p;

  for (p = (const unsigned char *) sv; *p; p++)
    {
      unsigned int top;

      hash = (hash << 4) + *p;
      top = hash & 0xf0000000;
      if (top != 0)
	hash = (hash ^ (top >> 24)) ^ top;
    }
  return hash;
}
/* Equality predicate for string keys: returns 1 when S1 and S2 hold
   the same characters, 0 otherwise.  Note the sense is inverted
   relative to strcmp(), as required by the hash table's
   test_function contract. */
int
string_cmp (const void *s1, const void *s2)
{
  return strcmp ((const char *)s1, (const char *)s2) == 0;
}
/* Convenience constructor: a hash table keyed on C strings, wired up
   with string_hash/string_cmp.  An INITIAL_SIZE of 0 requests the
   smallest table, which grows on demand. */
struct hash_table *
make_string_hash_table (int initial_size)
{
  return hash_table_new (initial_size, string_hash, string_cmp);
}
#ifdef STANDALONE
/* Self-test driver: compile the file with -DSTANDALONE to get this
   main().  It reads lines from stdin, inserts each non-empty line as
   a key, removes odd-length ones again, then dumps the table. */
#include <stdio.h>
#include <string.h>

/* Print one entry and bump the counter behind COUNT.
   Fixed here: the parameter types now match the
   int (*)(void *, void *, void *) callback type that hash_table_map
   expects -- the original declared KEY as `const void *', making the
   function pointer passed below incompatible (a constraint violation
   rejected by modern compilers). */
int
print_hash_table_mapper (void *key, void *value, void *count)
{
  ++*(int *)count;
  printf ("%s: %s\n", (const char *)key, (char *)value);
  return 0;
}

/* Dump SHT and verify that the number of entries visited by the
   mapper agrees with the table's own count. */
void
print_hash (struct hash_table *sht)
{
  int debug_count = 0;
  hash_table_map (sht, print_hash_table_mapper, &debug_count);
  assert (debug_count == sht->count);
}

int
main (void)
{
  struct hash_table *ht = make_string_hash_table (0);
  char line[80];
  while ((fgets (line, sizeof (line), stdin)))
    {
      int len = strlen (line);
      if (len <= 1)
	continue;
      line[--len] = '\0';	/* strip the newline; a line longer
				   than the buffer loses its last
				   character instead */
      hash_table_put (ht, strdup (line), "here I am!");
      if (len % 2)
	hash_table_remove (ht, line);
    }
  print_hash (ht);
#if 0
  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
#endif
  return 0;
}
#endif

50
src/hash.h Normal file
View File

@ -0,0 +1,50 @@
/* Hash table declarations.
Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* From XEmacs, and hence from Dragon book. */

/* Helpers for combining two to nine hash values into one. */
#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */
#define HASH2(a,b) (GOOD_HASH * (a) + (b))
#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c))
#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d))
#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e))
#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f))
#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g))
#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h))
#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i))

/* The table type is opaque to clients; its layout lives in hash.c. */
struct hash_table;

/* Constructor: initial size (0 for smallest), hash function, and
   key-equality test.  Destructor frees the table but not the
   caller-owned keys/values. */
struct hash_table *hash_table_new PARAMS ((int,
					   unsigned long (*) (const void *),
					   int (*) (const void *,
						    const void *)));
void hash_table_destroy PARAMS ((struct hash_table *));

/* Lookup, insertion and removal.  hash_table_get returns NULL for a
   missing key; use hash_table_exists when NULL is a legal value. */
void *hash_table_get PARAMS ((struct hash_table *, const void *));
int hash_table_exists PARAMS ((struct hash_table *, const void *));
void hash_table_put PARAMS ((struct hash_table *, const void *, void *));
int hash_table_remove PARAMS ((struct hash_table *, const void *));
void hash_table_clear PARAMS ((struct hash_table *));

/* Iterate the callback over all live entries; iteration stops when
   the callback returns non-zero. */
void hash_table_map PARAMS ((struct hash_table *,
			     int (*) (void *, void *, void *),
			     void *));

/* Canned support for string-keyed tables. */
unsigned long string_hash PARAMS ((const void *));
int string_cmp PARAMS ((const void *, const void *));
struct hash_table *make_string_hash_table PARAMS ((int));

View File

@ -165,6 +165,14 @@ header_strdup (const char *header, void *closure)
return 1;
}
/* Header-processing callback that merely records, through CLOSURE
   (which must point to an int), that the header was present at all.
   The header text itself is ignored.  Always returns 1 (success). */
int
header_exists (const char *header, void *closure)
{
  int *seen = (int *)closure;

  *seen = 1;
  return 1;
}
/* Skip LWS (linear white space), if present. Returns number of
characters to skip. */
int

View File

@ -31,5 +31,6 @@ int header_process PARAMS ((const char *, const char *,
int header_extract_number PARAMS ((const char *, void *));
int header_strdup PARAMS ((const char *, void *));
int header_exists PARAMS ((const char *, void *));
int skip_lws PARAMS ((const char *));

View File

@ -1,5 +1,5 @@
/* Dealing with host names.
Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
@ -48,35 +48,38 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "utils.h"
#include "host.h"
#include "url.h"
#include "hash.h"
#ifndef errno
extern int errno;
#endif
/* Host list entry */
struct host
/* Mapping between all known hosts to their addresses (n.n.n.n). */
struct hash_table *host_name_address_map;
/* Mapping between all known addresses (n.n.n.n) to their hosts. This
is the inverse of host_name_address_map. These two tables share
the strdup'ed strings. */
struct hash_table *host_address_name_map;
/* Mapping between auxiliary (slave) and master host names. */
struct hash_table *host_slave_master_map;
/* Utility function: like xstrdup(), but also lowercases S. */
static char *
xstrdup_lower (const char *s)
{
/* Host's symbolical name, as encountered at the time of first
inclusion, e.g. "fly.cc.fer.hr". */
char *hostname;
/* Host's "real" name, i.e. its IP address, written out in ASCII
form of N.N.N.N, e.g. "161.53.70.130". */
char *realname;
/* More than one HOSTNAME can correspond to the same REALNAME. For
our purposes, the canonical name of the host is its HOSTNAME when
it was first encountered. This entry is said to have QUALITY. */
int quality;
/* Next entry in the list. */
struct host *next;
};
static struct host *hlist;
static struct host *add_hlist PARAMS ((struct host *, const char *,
const char *, int));
char *copy = xstrdup (s);
char *p = copy;
for (; *p; p++)
*p = TOLOWER (*p);
return copy;
}
/* The same as gethostbyname, but supports internet addresses of the
form `N.N.N.N'. */
form `N.N.N.N'. On some systems gethostbyname() knows how to do
this automatically. */
struct hostent *
ngethostbyname (const char *name)
{
@ -91,42 +94,51 @@ ngethostbyname (const char *name)
return hp;
}
/* Search for HOST in the linked list L, by hostname. Return the
entry, if found, or NULL. The search is case-insensitive. */
static struct host *
search_host (struct host *l, const char *host)
{
for (; l; l = l->next)
if (strcasecmp (l->hostname, host) == 0)
return l;
return NULL;
}
/* Add host name HOST with the address ADDR_TEXT to the cache.
Normally this means that the (HOST, ADDR_TEXT) pair will be to
host_name_address_map and to host_address_name_map. (It is the
caller's responsibility to make sure that HOST is not already in
host_name_address_map.)
/* Like search_host, but searches by address. */
static struct host *
search_address (struct host *l, const char *address)
If the ADDR_TEXT has already been seen and belongs to another host,
HOST will be added to host_slave_master_map instead. */
static void
add_host_to_cache (const char *host, const char *addr_text)
{
for (; l; l = l->next)
char *canonical_name = hash_table_get (host_address_name_map, addr_text);
if (canonical_name)
{
int cmp = strcmp (l->realname, address);
if (cmp == 0)
return l;
else if (cmp > 0)
return NULL;
DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
host, canonical_name));
/* We've already dealt with that host under another name. */
hash_table_put (host_slave_master_map,
xstrdup_lower (host),
xstrdup_lower (canonical_name));
}
else
{
/* This is really the first time we're dealing with that host. */
char *h_copy = xstrdup_lower (host);
char *a_copy = xstrdup (addr_text);
DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy));
hash_table_put (host_name_address_map, h_copy, a_copy);
hash_table_put (host_address_name_map, a_copy, h_copy);
}
return NULL;
}
/* Store the address of HOSTNAME, internet-style, to WHERE. First
check for it in the host list, and (if not found), use
ngethostbyname to get it.
/* Store the address of HOSTNAME, internet-style (four octets in
network order), to WHERE. First try to get the address from the
cache; if it is not available, call the DNS functions and update
the cache.
Return 1 on successful finding of the hostname, 0 otherwise. */
int
store_hostaddress (unsigned char *where, const char *hostname)
{
struct host *t;
unsigned long addr;
char *addr_text;
char *canonical_name;
struct hostent *hptr;
struct in_addr in;
char *inet_s;
@ -134,178 +146,119 @@ store_hostaddress (unsigned char *where, const char *hostname)
/* If the address is of the form d.d.d.d, there will be no trouble
with it. */
addr = (unsigned long)inet_addr (hostname);
if ((int)addr == -1)
{
/* If it is not of that form, try to find it in the cache. */
t = search_host (hlist, hostname);
if (t)
addr = (unsigned long)inet_addr (t->realname);
}
/* If we have the numeric address, just store it. */
if ((int)addr != -1)
{
/* ADDR is in network byte order, meaning the code works on
little and big endian 32-bit architectures without change.
On big endian 64-bit architectures we need to be careful to
copy the correct four bytes. */
int offset = 0;
/* ADDR is defined to be in network byte order, meaning the code
works on little and big endian 32-bit architectures without
change. On big endian 64-bit architectures we need to be
careful to copy the correct four bytes. */
int offset;
have_addr:
#ifdef WORDS_BIGENDIAN
offset = sizeof (unsigned long) - 4;
#else
offset = 0;
#endif
memcpy (where, (char *)&addr + offset, 4);
return 1;
}
/* By now we know that the address is not of the form d.d.d.d. Try
to find it in our cache of host addresses. */
addr_text = hash_table_get (host_name_address_map, hostname);
if (addr_text)
{
DEBUGP (("Found %s in host_name_address_map: %s\n",
hostname, addr_text));
addr = (unsigned long)inet_addr (addr_text);
goto have_addr;
}
/* Maybe this host is known to us under another name. If so, we'll
find it in host_slave_master_map, and use the master name to find
its address in host_name_address_map. */
canonical_name = hash_table_get (host_slave_master_map, hostname);
if (canonical_name)
{
addr_text = hash_table_get (host_name_address_map, canonical_name);
assert (addr_text != NULL);
DEBUGP (("Found %s as slave of %s -> %s\n",
hostname, canonical_name, addr_text));
addr = (unsigned long)inet_addr (addr_text);
goto have_addr;
}
/* Since all else has failed, let's try gethostbyname(). Note that
we use gethostbyname() rather than ngethostbyname(), because we
*know* the address is not numerical. */
already know that the address is not numerical. */
hptr = gethostbyname (hostname);
if (!hptr)
return 0;
/* Copy the address of the host to socket description. */
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
/* Now that we're here, we could as well cache the hostname for
future use, as in realhost(). First, we have to look for it by
address to know if it's already in the cache by another name. */
assert (hptr->h_length == 4);
/* Now that we've gone through the trouble of calling
gethostbyname(), we can store this valuable information to the
cache. First, we have to look for it by address to know if it's
already in the cache by another name. */
/* Originally, we copied to in.s_addr, but it appears to be missing
on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
t = search_address (hlist, inet_s);
if (t) /* Found in the list, as realname. */
{
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, hostname, inet_s, 0);
return 1;
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, hostname, inet_s, 1);
inet_s = inet_ntoa (in);
add_host_to_cache (hostname, inet_s);
return 1;
}
/* Add a host to the host list. The list is sorted by addresses. For
equal addresses, the entries with quality should bubble towards the
beginning of the list. */
static struct host *
add_hlist (struct host *l, const char *nhost, const char *nreal, int quality)
{
struct host *t, *old, *beg;
/* The entry goes to the beginning of the list if the list is empty
or the order requires it. */
if (!l || (strcmp (nreal, l->realname) < 0))
{
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
t->next = l;
return t;
}
beg = l;
/* Second two one-before-the-last element. */
while (l->next)
{
int cmp;
old = l;
l = l->next;
cmp = strcmp (nreal, l->realname);
if (cmp >= 0)
continue;
/* If the next list element is greater than s, put s between the
current and the next list element. */
t = (struct host *)xmalloc (sizeof (struct host));
old->next = t;
t->next = l;
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
return beg;
}
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
/* Insert the new element after the last element. */
l->next = t;
t->next = NULL;
return beg;
}
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
is referenced by more than one name, "real" name is considered to
be the first one encountered in the past.
If the host cannot be found in the list of already dealt-with
hosts, try with its INET address. If this fails too, add it to the
list. The routine does not call gethostbyname twice for the same
host if it can possibly avoid it. */
be the first one encountered in the past. */
char *
realhost (const char *host)
{
struct host *l, *l_real;
struct in_addr in;
struct hostent *hptr;
char *inet_s;
char *master_name;
DEBUGP (("Checking for %s.\n", host));
/* Look for the host, looking by the host name. */
l = search_host (hlist, host);
if (l && l->quality) /* Found it with quality */
DEBUGP (("Checking for %s in host_name_address_map.\n", host));
if (hash_table_exists (host_name_address_map, host))
{
DEBUGP (("%s was already used, by that name.\n", host));
/* Here we return l->hostname, not host, because of the possible
case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are
the same, but we want the one that was first. */
return xstrdup (l->hostname);
DEBUGP (("Found; %s was already used, by that name.\n", host));
return xstrdup_lower (host);
}
else if (!l) /* Not found, with or without quality */
{
/* The fact that gethostbyname will get called makes it
necessary to store it to the list, to ensure that
gethostbyname will not be called twice for the same string.
However, the quality argument must be set appropriately.
Note that add_hlist must be called *after* the realname
search, or the quality would be always set to 0 */
DEBUGP (("This is the first time I hear about host %s by that name.\n",
host));
hptr = ngethostbyname (host);
if (!hptr)
return xstrdup (host);
DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
master_name = hash_table_get (host_slave_master_map, host);
if (master_name)
{
has_master:
DEBUGP (("Found; %s was already used, by the name %s.\n",
host, master_name));
return xstrdup (master_name);
}
DEBUGP (("First time I hear about %s by that name; looking it up.\n",
host));
hptr = ngethostbyname (host);
if (hptr)
{
char *inet_s;
/* Originally, we copied to in.s_addr, but it appears to be
missing on some systems. */
missing on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
}
else /* Found, without quality */
{
/* This case happens when host is on the list,
but not as first entry (the one with quality).
Then we just get its INET address and pick
up the first entry with quality. */
DEBUGP (("We've dealt with host %s, but under the name %s.\n",
host, l->realname));
STRDUP_ALLOCA (inet_s, l->realname);
inet_s = inet_ntoa (in);
add_host_to_cache (host, inet_s);
/* add_host_to_cache() can establish a slave-master mapping. */
DEBUGP (("Checking again for %s in host_slave_master_map.\n", host));
master_name = hash_table_get (host_slave_master_map, host);
if (master_name)
goto has_master;
}
/* Now we certainly have the INET address. The following loop is
guaranteed to pick either an entry with quality (because it is
the first one), or none at all. */
l_real = search_address (hlist, inet_s);
if (l_real) /* Found in the list, as realname. */
{
if (!l)
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, host, inet_s, 0);
return xstrdup (l_real->hostname);
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, host, inet_s, 1);
return xstrdup (host);
return xstrdup_lower (host);
}
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
@ -547,20 +500,23 @@ herrmsg (int error)
return _("Unknown error");
}
/* Clean the host list. This is a separate function, so we needn't
export HLIST and its implementation. Ha! */
void
clean_hosts (void)
{
struct host *l = hlist;
while (l)
{
struct host *p = l->next;
free (l->hostname);
free (l->realname);
free (l);
l = p;
}
hlist = NULL;
/* host_name_address_map and host_address_name_map share the
strings. Because of that, calling free_keys_and_values once
suffices for both. */
free_keys_and_values (host_name_address_map);
hash_table_destroy (host_name_address_map);
hash_table_destroy (host_address_name_map);
free_keys_and_values (host_slave_master_map);
hash_table_destroy (host_slave_master_map);
}
/* Allocate the three host-lookup caches (name->address,
   address->name, and slave->master name maps).  Must be called once
   at startup, before any of the host lookup functions are used. */
void
host_init (void)
{
  host_name_address_map = make_string_hash_table (0);
  host_address_name_map = make_string_hash_table (0);
  host_slave_master_map = make_string_hash_table (0);
}

856
src/html-parse.c Normal file
View File

@ -0,0 +1,856 @@
/* HTML parser for Wget.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* The only entry point to this module is map_html_tags(), which see. */
/* TODO:
- Allow hooks for callers to process contents outside tags. This
is needed to implement handling <style> and <script>. The
taginfo structure already carries the information about where the
tags are, but this is not enough, because one would also want to
skip the comments. (The funny thing is that for <style> and
<script> you *don't* want to skip comments!)
- Create a test suite for regression testing. */
/* HISTORY:
This is the third HTML parser written for Wget. The first one was
written some time during the Geturl 1.0 beta cycle, and was very
inefficient and buggy. It also contained some very complex code to
remember a list of parser states, because it was supposed to be
reentrant. The idea was that several parsers would be running
concurrently, and you'd have to pass the function a unique ID string
(for example, the URL) by which it found the relevant parser state
and returned the next URL. Over-engineering at its best.
The second HTML parser was written for Wget 1.4 (the first version
by the name `Wget'), and was a complete rewrite. Although the new
parser behaved much better and made no claims of reentrancy, it
still shared many of the fundamental flaws of the old version -- it
only regarded HTML in terms of tag-attribute pairs, where the
attribute's value was a URL to be returned. Any other property of
HTML, such as <base href=...>, or strange way to specify a URL,
such as <meta http-equiv=Refresh content="0; URL=..."> had to be
crudely hacked in -- and the caller had to be aware of these hacks.
Like its predecessor, this parser did not support HTML comments.
After Wget 1.5.1 was released, I set out to write a third HTML
parser. The objectives of the new parser were to: (1) provide a
clean way to analyze HTML lexically, (2) separate interpretation of
the markup from the parsing process, (3) be as correct as possible,
e.g. correctly skipping comments and other SGML declarations, (4)
understand the most common errors in markup and skip them or be
relaxed towards them, and (5) be reasonably efficient (no regexps,
minimum copying and minimum or no heap allocation).
I believe this parser meets all of the above goals. It is
reasonably well structured, and could be relatively easily
separated from Wget and used elsewhere. While some of its
intrinsic properties limit its value as a general-purpose HTML
parser, I believe that, with minimum modifications, it could serve
as a backend for one.
Due to time and other constraints, this parser was not integrated
into Wget until the version ???. */
/* DESCRIPTION:
The single entry point of this parser is map_html_tags(), which
works by calling a function you specify for each tag. The function
gets called with the pointer to a structure describing the tag and
its attributes. */
/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
still need Wget headers to compile. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <assert.h>
#include "wget.h"
#include "html-parse.h"
#ifdef STANDALONE
# define xmalloc malloc
# define xrealloc realloc
#endif /* STANDALONE */
/* Pool support. For efficiency, map_html_tags() stores temporary
string data to a single stack-allocated pool. If the pool proves
too small, additional memory is allocated/resized with
malloc()/realloc(). */
/* A pool accumulates zero-terminated strings appended by
   convert_and_copy(); consumers later refer to them by their integer
   offset (`index' at append time) so that reallocation cannot
   invalidate the references.  */
struct pool {
  char *contents;		/* pointer to the contents. */
  int size;			/* size of the pool. */
  int index;			/* next unoccupied position in
				   contents. */

  int alloca_p;			/* whether contents was allocated
				   using alloca(). */
  char *orig_contents;		/* orig_contents, allocated by
				   alloca().  this is used by
				   POOL_FREE to restore the pool to
				   the "initial" state. */
  int orig_size;		/* size of the original alloca'ed
				   block; restored by POOL_FREE
				   together with orig_contents. */
};
/* Initialize the pool to hold INITIAL_SIZE bytes of storage.  The
   initial storage comes from alloca(), so the pool must be used
   within the frame that ran POOL_INIT.  */

#define POOL_INIT(pool, initial_size) do {		\
  (pool).size = (initial_size);				\
  (pool).contents = ALLOCA_ARRAY (char, (pool).size);	\
  (pool).index = 0;					\
  (pool).alloca_p = 1;					\
  (pool).orig_contents = (pool).contents;		\
  (pool).orig_size = (pool).size;			\
} while (0)

/* Grow the pool to accommodate at least INCREASE new bytes.  If the
   pool already has room to accommodate INCREASE bytes of data, this
   is a no-op.  (INCREASE is parenthesized on expansion so that
   callers may pass arbitrary expressions.)  */

#define POOL_GROW(pool, increase) do {					\
  int PG_newsize = (pool).index + (increase);				\
  DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize,	\
			  (pool).alloca_p, char);			\
} while (0)

/* Append text in the range [BEG, END) to POOL.  No zero-termination
   is done.  */

#define POOL_APPEND(pool, beg, end) do {			\
  const char *PA_beg = (beg);					\
  int PA_size = (end) - PA_beg;					\
  POOL_GROW (pool, PA_size);					\
  memcpy ((pool).contents + (pool).index, PA_beg, PA_size);	\
  (pool).index += PA_size;					\
} while (0)

/* The same as the above, but with zero termination.  */

#define POOL_APPEND_ZT(pool, beg, end) do {			\
  const char *PA_beg = (beg);					\
  int PA_size = (end) - PA_beg;					\
  POOL_GROW (pool, PA_size + 1);				\
  memcpy ((pool).contents + (pool).index, PA_beg, PA_size);	\
  (pool).contents[(pool).index + PA_size] = '\0';		\
  (pool).index += PA_size + 1;					\
} while (0)

/* Forget old pool contents.  The allocated memory is not freed.
   (Wrapped in parens so the expansion is a single expression.)  */
#define POOL_REWIND(pool) ((pool).index = 0)

/* Free heap-allocated memory for contents of POOL.  This calls free()
   if the memory was allocated through malloc.  It also restores
   `contents' and `size' to their original, pre-malloc values.  That
   way after POOL_FREE, the pool is fully usable, just as if it were
   freshly initialized with POOL_INIT.  */

#define POOL_FREE(pool) do {			\
  if (!(pool).alloca_p)				\
    free ((pool).contents);			\
  (pool).contents = (pool).orig_contents;	\
  (pool).size = (pool).orig_size;		\
  (pool).index = 0;				\
  (pool).alloca_p = 1;				\
} while (0)
#define AP_DOWNCASE 1
#define AP_PROCESS_ENTITIES 2
#define AP_SKIP_BLANKS 4
/* Copy the text in the range [BEG, END) to POOL, optionally
performing operations specified by FLAGS. FLAGS may be any
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
with the following meaning:
* AP_DOWNCASE -- downcase all the letters;
* AP_PROCESS_ENTITIES -- process the SGML entities and write out
the decoded string. Recognized entities are &lt, &gt, &amp, &quot,
&nbsp and the numerical entities.
* AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
of text. */
/* Copy [BEG, END) into POOL as a zero-terminated string, applying
   the transformations requested in FLAGS (see the comment above for
   AP_DOWNCASE / AP_PROCESS_ENTITIES / AP_SKIP_BLANKS).  The caller
   records pool->index *before* the call to find the copied string.  */
static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
  int old_index = pool->index;
  int size;

  /* First, skip blanks if required.  We must do this before entities
     are processed, so that blanks can still be inserted as, for
     instance, `&#32;'.  */
  if (flags & AP_SKIP_BLANKS)
    {
      while (beg < end && ISSPACE (*beg))
	++beg;
      while (end > beg && ISSPACE (end[-1]))
	--end;
    }
  size = end - beg;

  if (flags & AP_PROCESS_ENTITIES)
    {
      /* Stack-allocate a copy of text, process entities and copy it
	 to the pool.  The decoded text is never longer than the
	 source, so SIZE + 1 bytes always suffice.  */
      char *local_copy = (char *)alloca (size + 1);
      const char *from = beg;
      char *to = local_copy;

      while (from < end)
	{
	  if (*from != '&')
	    *to++ = *from++;
	  else
	    {
	      /* SAVE lets us back out and emit the `&' literally if
		 what follows turns out not to be an entity.  */
	      const char *save = from;
	      int remain;

	      if (++from == end) goto lose;
	      remain = end - from;

	      if (*from == '#')
		{
		  /* Numeric entity, e.g. `&#32;'.  Only the low byte
		     is kept (values are truncated to 0..255).  */
		  int numeric;
		  ++from;
		  if (from == end || !ISDIGIT (*from)) goto lose;
		  for (numeric = 0; from < end && ISDIGIT (*from); from++)
		    numeric = 10 * numeric + (*from) - '0';
		  if (from < end && ISALPHA (*from)) goto lose;
		  numeric &= 0xff;
		  *to++ = numeric;
		}
/* FROB matches a named entity: the name must fit in the remaining
   text and be followed by `;', end-of-buffer, or a non-alphanumeric
   character (so `&ltx' is not mistaken for `&lt').  */
#define FROB(x) (remain >= (sizeof (x) - 1)		\
		 && !memcmp (from, x, sizeof (x) - 1)	\
		 && (*(from + sizeof (x) - 1) == ';'	\
		     || remain == sizeof (x) - 1	\
		     || !ISALNUM (*(from + sizeof (x) - 1))))
	      else if (FROB ("lt"))
		*to++ = '<', from += 2;
	      else if (FROB ("gt"))
		*to++ = '>', from += 2;
	      else if (FROB ("amp"))
		*to++ = '&', from += 3;
	      else if (FROB ("quot"))
		*to++ = '\"', from += 4;
	      /* We don't implement the proposed "Added Latin 1"
		 entities (except for nbsp), because it is unnecessary
		 in the context of Wget, and would require hashing to
		 work efficiently.  */
	      else if (FROB ("nbsp"))
		*to++ = 160, from += 4;
	      else
		goto lose;
#undef FROB
	      /* If the entity was followed by `;', we step over the
		 `;'.  Otherwise, it was followed by either a
		 non-alphanumeric or EOB, in which case we do nothing.	*/
	      if (from < end && *from == ';')
		++from;
	      continue;

	    lose:
	      /* This was not an entity after all.  Back out.  */
	      from = save;
	      *to++ = *from++;
	    }
	}
      /* TO now points one past the terminating NUL, so POOL_APPEND
	 copies the terminator as well.  */
      *to++ = '\0';
      POOL_APPEND (*pool, local_copy, to);
    }
  else
    {
      /* Just copy the text to the pool.  */
      POOL_APPEND_ZT (*pool, beg, end);
    }

  if (flags & AP_DOWNCASE)
    {
      char *p = pool->contents + old_index;
      for (; *p; p++)
	*p = TOLOWER (*p);
    }
}
/* Return non-zero if the token in [BEG, END) case-insensitively
   matches one of the strings in the NULL-terminated ARRAY.  A NULL
   ARRAY means "no filtering" and allows everything.  */
static int
array_allowed (const char **array, const char *beg, const char *end)
{
  int length = end - beg;
  const char **candidate;

  if (!array)
    return 1;			/* no restrictions in effect */

  for (candidate = array; *candidate; candidate++)
    if (length >= strlen (*candidate)
	&& !strncasecmp (*candidate, beg, length))
      return 1;			/* found a match */

  return 0;			/* exhausted the list */
}
/* RFC1866: name [of attribute or tag] consists of letters, digits,
periods, or hyphens. We also allow _, for compatibility with
brain-damaged generators. */
#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
/* States while advancing through comments. */
#define AC_S_DONE 0
#define AC_S_BACKOUT 1
#define AC_S_BANG 2
#define AC_S_DEFAULT 3
#define AC_S_DCLNAME 4
#define AC_S_DASH1 5
#define AC_S_DASH2 6
#define AC_S_COMMENT 7
#define AC_S_DASH3 8
#define AC_S_DASH4 9
#define AC_S_QUOTE1 10
#define AC_S_IN_QUOTE 11
#define AC_S_QUOTE2 12
#ifdef STANDALONE
static int comment_backout_count;
#endif
/* Advance over an SGML declaration (the <!...> forms you find in HTML
documents). The function returns the location after the
declaration. The reason we need this is that HTML comments are
expressed as comments in so-called "empty declarations".
To recap: any SGML declaration may have comments associated with
it, e.g.
<!MY-DECL -- isn't this fun? -- foo bar>
An HTML comment is merely an empty declaration (<!>) with a comment
attached, like this:
<!-- some stuff here -->
Several comments may be embedded in one comment declaration:
<!-- have -- -- fun -->
Whitespace is allowed between and after the comments, but not
before the first comment.
Additionally, this function attempts to handle double quotes in
SGML declarations correctly. */
static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';	/* shut up, gcc! */
  char ch;
  int state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...  */

  /* Loop invariant: CH holds the character most recently consumed
     and P points just past it.  Running out of input in any state
     forces AC_S_BACKOUT before the switch, so no case ever reads
     past END.  */
  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      if (p == end)
	state = AC_S_BACKOUT;
      switch (state)
	{
	case AC_S_DONE:
	case AC_S_BACKOUT:
	  break;
	case AC_S_BANG:
	  /* Expect the `!' that opens the declaration.  */
	  if (ch == '!')
	    {
	      ch = *p++;
	      state = AC_S_DEFAULT;
	    }
	  else
	    state = AC_S_BACKOUT;
	  break;
	case AC_S_DEFAULT:
	  /* Between tokens: dispatch on what comes next.  */
	  switch (ch)
	    {
	    case '-':
	      state = AC_S_DASH1;
	      break;
	    case ' ':
	    case '\t':
	    case '\r':
	    case '\n':
	      ch = *p++;
	      break;
	    case '>':
	      state = AC_S_DONE;
	      break;
	    case '\'':
	    case '\"':
	      state = AC_S_QUOTE1;
	      break;
	    default:
	      if (NAME_CHAR_P (ch))
		state = AC_S_DCLNAME;
	      else
		state = AC_S_BACKOUT;
	      break;
	    }
	  break;
	case AC_S_DCLNAME:
	  /* Inside a declaration name such as DOCTYPE.  */
	  if (NAME_CHAR_P (ch))
	    ch = *p++;
	  else if (ch == '-')
	    state = AC_S_DASH1;
	  else
	    state = AC_S_DEFAULT;
	  break;
	case AC_S_QUOTE1:
	  assert (ch == '\'' || ch == '\"');
	  quote_char = ch;	/* cheating -- I really don't feel like
				   introducing more different states for
				   different quote characters. */
	  ch = *p++;
	  state = AC_S_IN_QUOTE;
	  break;
	case AC_S_IN_QUOTE:
	  if (ch == quote_char)
	    state = AC_S_QUOTE2;
	  else
	    ch = *p++;
	  break;
	case AC_S_QUOTE2:
	  assert (ch == quote_char);
	  ch = *p++;
	  state = AC_S_DEFAULT;
	  break;
	case AC_S_DASH1:
	  /* States DASH1..DASH4 track the `--' pairs that open and
	     close an embedded SGML comment.  */
	  assert (ch == '-');
	  ch = *p++;
	  state = AC_S_DASH2;
	  break;
	case AC_S_DASH2:
	  switch (ch)
	    {
	    case '-':
	      ch = *p++;
	      state = AC_S_COMMENT;
	      break;
	    default:
	      state = AC_S_BACKOUT;
	    }
	  break;
	case AC_S_COMMENT:
	  switch (ch)
	    {
	    case '-':
	      state = AC_S_DASH3;
	      break;
	    default:
	      ch = *p++;
	      break;
	    }
	  break;
	case AC_S_DASH3:
	  assert (ch == '-');
	  ch = *p++;
	  state = AC_S_DASH4;
	  break;
	case AC_S_DASH4:
	  switch (ch)
	    {
	    case '-':
	      ch = *p++;
	      state = AC_S_DEFAULT;
	      break;
	    default:
	      /* A lone `-' inside the comment; keep scanning.  */
	      state = AC_S_COMMENT;
	      break;
	    }
	  break;
	}
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      /* Malformed declaration: treat the `<' as ordinary text and let
	 the caller resume scanning right after it.  */
      return beg + 1;
    }
  return p;
}
/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish. */
/* NOTE: these macros rely on a variable `end' (one past the last
   valid character) and a label `finish' being visible at the point
   of use; they are only intended for map_html_tags() below.  */

#define ADVANCE(p) do {	\
  ++p;			\
  if (p >= end)		\
    goto finish;	\
} while (0)

/* Skip whitespace, if any. */

#define SKIP_WS(p) do {		\
  while (ISSPACE (*p)) {	\
    ADVANCE (p);		\
  }				\
} while (0)

/* Skip non-whitespace, if any. */

#define SKIP_NON_WS(p) do {	\
  while (!ISSPACE (*p)) {	\
    ADVANCE (p);		\
  }				\
} while (0)
#ifdef STANDALONE
static int tag_backout_count;
#endif
/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
MAPFUN will be called with two arguments: pointer to an initialized
struct taginfo, and CLOSURE.
ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
be processed by this function. If it is NULL, all the tags are
allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
(Obviously, the caller can filter out unwanted tags and attributes
just as well, but this is just an optimization designed to avoid
unnecessary copying for tags/attributes which the caller doesn't
want to know about. These lists are searched linearly; therefore,
if you're interested in a large number of tags or attributes, you'd
better set these to NULL and filter them out yourself with a
hashing process most appropriate for your application.) */
void
map_html_tags (const char *text, int size,
	       const char **allowed_tag_names,
	       const char **allowed_attribute_names,
	       void (*mapfun) (struct taginfo *, void *),
	       void *closure)
{
  const char *p = text;
  const char *end = text + size;

  /* PAIRS holds the attributes of the tag currently being parsed;
     it starts on the stack and moves to the heap if a tag has more
     than eight attributes.  POOL holds the downcased/decoded name
     and value strings for the same tag.  */
  int attr_pair_count = 8;
  int attr_pair_alloca_p = 1;
  struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
  struct pool pool;

  if (!size)
    return;

  POOL_INIT (pool, 256);

  {
    int nattrs, end_tag;
    const char *tag_name_begin, *tag_name_end;
    const char *tag_start_position;
    int uninteresting_tag;

  look_for_tag:
    /* Per-tag state is reset here; pool memory is reused, not freed.  */
    POOL_REWIND (pool);

    nattrs = 0;
    end_tag = 0;

    /* Find beginning of tag.  We use memchr() instead of the usual
       looping with ADVANCE() for speed. */
    p = memchr (p, '<', end - p);
    if (!p)
      goto finish;

    tag_start_position = p;
    ADVANCE (p);

    /* Establish the type of the tag (start-tag, end-tag or
       declaration). */
    if (*p == '!')
      {
	/* This is an SGML declaration -- just skip it. */
	p = advance_declaration (p, end);
	if (p == end)
	  goto finish;
	goto look_for_tag;
      }
    else if (*p == '/')
      {
	end_tag = 1;
	ADVANCE (p);
      }
    tag_name_begin = p;
    while (NAME_CHAR_P (*p))
      ADVANCE (p);
    if (p == tag_name_begin)
      goto look_for_tag;
    tag_name_end = p;
    SKIP_WS (p);
    /* An end-tag may carry nothing but its name.  */
    if (end_tag && *p != '>')
      goto backout_tag;

    if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
      /* We can't just say "goto look_for_tag" here because we need
	 the loop below to properly advance over the tag's attributes.  */
      uninteresting_tag = 1;
    else
      {
	uninteresting_tag = 0;
	convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
      }

    /* Find the attributes. */
    while (1)
      {
	const char *attr_name_begin, *attr_name_end;
	const char *attr_value_begin, *attr_value_end;
	const char *attr_raw_value_begin, *attr_raw_value_end;
	int operation = AP_DOWNCASE;	/* stupid compiler. */

	SKIP_WS (p);

	/* Check for end of tag definition. */
	if (*p == '>')
	  break;

	/* Establish bounds of attribute name. */
	attr_name_begin = p;	/* <foo bar ...> */
				/*      ^        */
	while (NAME_CHAR_P (*p))
	  ADVANCE (p);
	attr_name_end = p;	/* <foo bar ...> */
				/*         ^     */
	if (attr_name_begin == attr_name_end)
	  goto backout_tag;

	/* Establish bounds of attribute value. */
	SKIP_WS (p);
	if (NAME_CHAR_P (*p) || *p == '>')
	  {
	    /* Minimized attribute syntax allows `=' to be omitted.
	       For example, <UL COMPACT> is a valid shorthand for <UL
	       COMPACT="compact">.  Even if such attributes are not
	       useful to Wget, we need to support them, so that the
	       tags containing them can be parsed correctly. */
	    attr_raw_value_begin = attr_value_begin = attr_name_begin;
	    attr_raw_value_end = attr_value_end = attr_name_end;
	  }
	else if (*p == '=')
	  {
	    ADVANCE (p);
	    SKIP_WS (p);
	    if (*p == '\"' || *p == '\'')
	      {
		/* Quoted attribute value.  */
		int newline_seen = 0;
		char quote_char = *p;
		attr_raw_value_begin = p;
		ADVANCE (p);
		attr_value_begin = p; /* <foo bar="baz"> */
				      /*           ^     */
		while (*p != quote_char)
		  {
		    if (!newline_seen && *p == '\n')
		      {
			/* If a newline is seen within the quotes, it
			   is most likely that someone forgot to close
			   the quote.  In that case, we back out to
			   the value beginning, and terminate the tag
			   at either `>' or the delimiter, whichever
			   comes first.  Such a tag terminated at `>'
			   is discarded.  */
			p = attr_value_begin;
			newline_seen = 1;
			continue;
		      }
		    else if (newline_seen && *p == '>')
		      break;
		    ADVANCE (p);
		  }
		attr_value_end = p; /* <foo bar="baz"> */
				    /*              ^  */
		if (*p == quote_char)
		  ADVANCE (p);
		else
		  goto look_for_tag;
		attr_raw_value_end = p;	/* <foo bar="baz"> */
					/*               ^ */
		/* The AP_SKIP_BLANKS part is not entirely correct,
		   because we don't want to skip blanks for all the
		   attribute values.  */
		operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
	      }
	    else
	      {
		/* Unquoted attribute value.  */
		attr_value_begin = p; /* <foo bar=baz> */
				      /*          ^    */
		/* According to SGML, a name token should consist only
		   of alphanumerics, . and -.  However, this is often
		   violated by, for instance, `%' in `width=75%'.
		   We'll be liberal and allow just about anything as
		   an attribute value.  */
		while (!ISSPACE (*p) && *p != '>')
		  ADVANCE (p);
		attr_value_end = p; /* <foo bar=baz qux=quix> */
				    /*             ^          */
		if (attr_value_begin == attr_value_end)
		  /* <foo bar=> */
		  /*          ^ */
		  goto backout_tag;
		attr_raw_value_begin = attr_value_begin;
		attr_raw_value_end = attr_value_end;
		operation = AP_PROCESS_ENTITIES;
	      }
	  }
	else
	  {
	    /* We skipped the whitespace and found something that is
	       neither `=' nor the beginning of the next attribute's
	       name.  Back out.  */
	    goto backout_tag;	/* <foo bar /... */
				/*          ^    */
	  }

	/* If we're not interested in the tag, don't bother with any
	   of the attributes.  */
	if (uninteresting_tag)
	  continue;

	/* If we aren't interested in the attribute, skip it.  We
	   cannot do this test any sooner, because our text pointer
	   needs to correctly advance over the attribute.  */
	if (allowed_attribute_names
	    && !array_allowed (allowed_attribute_names, attr_name_begin,
			       attr_name_end))
	  continue;

	DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
				attr_pair_alloca_p, struct attr_pair);
	/* Record pool *offsets* for now; the pool may still be
	   realloc'ed while later attributes are appended.  */
	pairs[nattrs].name_pool_index = pool.index;
	convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
	pairs[nattrs].value_pool_index = pool.index;
	convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
	pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
	pairs[nattrs].value_raw_size = (attr_raw_value_end
					- attr_raw_value_begin);
	++nattrs;
      }

    if (uninteresting_tag)
      {
	ADVANCE (p);
	goto look_for_tag;
      }

    /* By now, we have a valid tag with a name and zero or more
       attributes.  Fill in the data and call the mapper function.  */
    {
      int i;
      struct taginfo taginfo;

      taginfo.name = pool.contents;
      taginfo.end_tag_p = end_tag;
      taginfo.nattrs = nattrs;
      /* We fill in the char pointers only now, when pool can no
	 longer get realloc'ed.  If we did that above, we could get
	 hosed by reallocation.  Obviously, after this point, the pool
	 may no longer be grown.  */
      for (i = 0; i < nattrs; i++)
	{
	  pairs[i].name = pool.contents + pairs[i].name_pool_index;
	  pairs[i].value = pool.contents + pairs[i].value_pool_index;
	}
      taginfo.attrs = pairs;
      taginfo.start_position = tag_start_position;
      taginfo.end_position = p + 1;
      /* Ta-dam! */
      (*mapfun) (&taginfo, closure);
      ADVANCE (p);
    }
    goto look_for_tag;

  backout_tag:
#ifdef STANDALONE
    ++tag_backout_count;
#endif
    /* The tag wasn't really a tag.  Treat its contents as ordinary
       data characters. */
    p = tag_start_position + 1;
    goto look_for_tag;
  }

 finish:
  POOL_FREE (pool);
  if (!attr_pair_alloca_p)
    free (pairs);
}
#undef ADVANCE
#undef SKIP_WS
#undef SKIP_NON_WS
#ifdef STANDALONE
/* Callback for map_html_tags() used by the standalone driver: print
   one line per tag -- the name (prefixed with `/' for end-tags)
   followed by its name=value attribute pairs -- and increment the
   tag counter that ARG points to.  */
static void
test_mapper (struct taginfo *taginfo, void *arg)
{
  int attr_index;
  int *tag_count = (int *)arg;

  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
  for (attr_index = 0; attr_index < taginfo->nattrs; attr_index++)
    printf (" %s=%s", taginfo->attrs[attr_index].name,
	    taginfo->attrs[attr_index].value);
  putchar ('\n');
  ++*tag_count;
}
/* Standalone driver: slurp an HTML document from stdin into a
   dynamically grown buffer, run map_html_tags() over it with no tag
   or attribute filtering, and report tag and backout statistics.  */
int main ()
{
  int buffer_size = 256;
  char *buffer = (char *)xmalloc (buffer_size);
  int filled = 0;
  int chunk;
  int tag_counter = 0;

  /* Read until EOF, doubling the buffer each time it fills up.  */
  for (;;)
    {
      chunk = fread (buffer + filled, 1, buffer_size - filled, stdin);
      if (!chunk)
	break;
      filled += chunk;
      buffer_size <<= 1;
      buffer = (char *)xrealloc (buffer, buffer_size);
    }

  map_html_tags (buffer, filled, NULL, NULL, test_mapper, &tag_counter);

  printf ("TAGS: %d\n", tag_counter);
  printf ("Tag backouts: %d\n", tag_backout_count);
  printf ("Comment backouts: %d\n", comment_backout_count);
  return 0;
}
#endif /* STANDALONE */

44
src/html-parse.h Normal file
View File

@ -0,0 +1,44 @@
/* Declarations for html-parse.c.
Copyright (C) 1998 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
struct attr_pair {
  char *name;			/* attribute name */
  char *value;			/* attribute value */

  /* Needed for URL conversion; the places where the value begins and
     ends, including the quotes and everything.  These point into the
     caller's original text buffer, not into parser-owned storage.  */
  const char *value_raw_beginning;
  int value_raw_size;

  /* Used internally by map_html_tags. */
  int name_pool_index, value_pool_index;
};

struct taginfo {
  char *name;			/* tag name (downcased) */
  int end_tag_p;		/* whether this is an end-tag */
  int nattrs;			/* number of attributes */
  struct attr_pair *attrs;	/* attributes */
  const char *start_position;	/* start position of tag (the `<') */
  const char *end_position;	/* end position of tag (past the `>') */
};

/* NOTE: the taginfo passed to the callback, including its strings,
   is only valid for the duration of that callback invocation.  */
void map_html_tags PARAMS ((const char *, int, const char **, const char **,
			    void (*) (struct taginfo *, void *), void *));

569
src/html-url.c Normal file
View File

@ -0,0 +1,569 @@
/* Collect URLs from HTML source.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>
#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#ifndef errno
extern int errno;
#endif
enum tag_category { TC_LINK, TC_SPEC };

/* Here we try to categorize the known tags.  Each tag has its ID and
   category.  Category TC_LINK means that one or more of its
   attributes contain links that should be retrieved.  TC_SPEC means
   that the tag is specific in some way, and has to be handled
   specially. */
static struct {
  const char *name;
  enum tag_category category;
} known_tags[] = {
  /* INVARIANTS: this table must stay sorted alphabetically by name
     (find_tag's early-exit relies on it), and each TAG_* constant
     must equal the index of its entry.  */
#define TAG_A 0
  { "a", TC_LINK },
#define TAG_APPLET 1
  { "applet", TC_LINK },
#define TAG_AREA 2
  { "area", TC_LINK },
#define TAG_BASE 3
  { "base", TC_SPEC },
#define TAG_BGSOUND 4
  { "bgsound", TC_LINK },
#define TAG_BODY 5
  { "body", TC_LINK },
#define TAG_EMBED 6
  { "embed", TC_LINK },
#define TAG_FIG 7
  { "fig", TC_LINK },
#define TAG_FRAME 8
  { "frame", TC_LINK },
#define TAG_IFRAME 9
  { "iframe", TC_LINK },
#define TAG_IMG 10
  { "img", TC_LINK },
#define TAG_INPUT 11
  { "input", TC_LINK },
#define TAG_LAYER 12
  { "layer", TC_LINK },
#define TAG_LINK 13
  { "link", TC_SPEC },
#define TAG_META 14
  { "meta", TC_SPEC },
#define TAG_OVERLAY 15
  { "overlay", TC_LINK },
#define TAG_SCRIPT 16
  { "script", TC_LINK },
#define TAG_TABLE 17
  { "table", TC_LINK },
#define TAG_TD 18
  { "td", TC_LINK },
#define TAG_TH 19
  { "th", TC_LINK }
};
/* Flags for specific url-attr pairs handled through TC_LINK: */
#define AF_EXTERNAL 1

/* For tags handled by TC_LINK: attributes that contain URLs to
   download. */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} url_tag_attr_map[] = {
  /* Entries sharing a tagid must stay adjacent: collect_tags_mapper
     finds the first entry for a tag and then walks the contiguous
     run with the same tagid.  */
  { TAG_A, "href", AF_EXTERNAL },
  { TAG_APPLET, "code", 0 },
  { TAG_AREA, "href", AF_EXTERNAL },
  { TAG_BGSOUND, "src", 0 },
  { TAG_BODY, "background", 0 },
  { TAG_EMBED, "src", 0 },
  { TAG_FIG, "src", 0 },
  { TAG_FRAME, "src", 0 },
  { TAG_IFRAME, "src", 0 },
  /* NOTE(review): IMG has no `href' attribute in standard HTML --
     confirm whether this entry is intentional before removing it.  */
  { TAG_IMG, "href", 0 },
  { TAG_IMG, "lowsrc", 0 },
  { TAG_IMG, "src", 0 },
  { TAG_INPUT, "src", 0 },
  { TAG_LAYER, "src", 0 },
  { TAG_OVERLAY, "src", 0 },
  { TAG_SCRIPT, "src", 0 },
  { TAG_TABLE, "background", 0 },
  { TAG_TD, "background", 0 },
  { TAG_TH, "background", 0 }
};

/* The lists of interesting tags and attributes are built dynamically,
   from the information above.  However, some places in the code refer
   to the attributes not mentioned here.  We add them manually. */
static const char *additional_attributes[] = {
  "rel",			/* for TAG_LINK */
  "http-equiv",			/* for TAG_META */
  "name",			/* for TAG_META */
  "content"			/* for TAG_META */
};
static const char **interesting_tags;
static const char **interesting_attributes;
/* Build the interesting_tags and interesting_attributes arrays that
   are handed to map_html_tags() as its filter lists.  Called once;
   the arrays are heap-allocated and live for the rest of the run.
   interesting_tags honors --ignore-tags and --follow-tags;
   interesting_attributes is the union of additional_attributes and
   the attribute names in url_tag_attr_map, without duplicates.  */
void
init_interesting (void)
{
  /* Init the variables interesting_tags and interesting_attributes
     that are used by the HTML parser to know which tags and
     attributes we're interested in.  We initialize this only once,
     for performance reasons.

     Here we also make sure that what we put in interesting_tags
     matches the user's preferences as specified through --ignore-tags
     and --follow-tags.  */

  {
    int i, ind = 0;
    int size = ARRAY_SIZE (known_tags);
    interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));

    for (i = 0; i < size; i++)
      {
	const char *name = known_tags[i].name;

	/* Normally here we could say:
	     interesting_tags[i] = name;
	   But we need to respect the settings of --ignore-tags and
	   --follow-tags, so the code gets a bit hairier.  */

	if (opt.ignore_tags)
	  {
	    /* --ignore-tags was specified.  Do not match these
	       specific tags.  --ignore-tags takes precedence over
	       --follow-tags, so we process --ignore first and fall
	       through if there's no match. */
	    int j, lose = 0;
	    for (j = 0; opt.ignore_tags[j] != NULL; j++)
	      /* Loop through all the tags this user doesn't care
		 about. */
	      if (strcasecmp(opt.ignore_tags[j], name) == EQ)
		{
		  lose = 1;
		  break;
		}
	    if (lose)
	      continue;
	  }

	if (opt.follow_tags)
	  {
	    /* --follow-tags was specified.  Only match these specific
	       tags, so return FALSE if we don't match one of them.  */
	    int j, win = 0;
	    for (j = 0; opt.follow_tags[j] != NULL; j++)
	      /* Loop through all the tags this user cares about. */
	      if (strcasecmp(opt.follow_tags[j], name) == EQ)
		{
		  win = 1;
		  break;
		}
	    if (!win)
	      continue;		/* wasn't one of the explicitly
				   desired tags */
	  }

	/* If we get to here, --follow-tags isn't being used or the
	   tag is among the ones that are followed, and --ignore-tags,
	   if specified, didn't include this tag, so it's an
	   "interesting" one. */
	interesting_tags[ind++] = name;
      }
    interesting_tags[ind] = NULL;
  }

  /* The same for attributes, except we loop through url_tag_attr_map.
     Here we also need to make sure that the list of attributes is
     unique, and to include the attributes from additional_attributes.  */
  {
    int i, ind;
    const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
				* sizeof (char *));
    /* First copy the "additional" attributes. */
    for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
      att[i] = additional_attributes[i];
    ind = i;
    att[ind] = NULL;
    for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
      {
	int j, seen = 0;
	const char *look_for = url_tag_attr_map[i].attr_name;
	/* Scan ALL of the ind entries collected so far.  (The bound
	   used to be `j < ind - 1', which skipped the most recently
	   appended attribute and could let duplicates slip in.)  */
	for (j = 0; j < ind; j++)
	  if (!strcmp (att[j], look_for))
	    {
	      seen = 1;
	      break;
	    }
	if (!seen)
	  {
	    att = xrealloc (att, (ind + 2) * sizeof (*att));
	    att[ind++] = look_for;
	    att[ind] = NULL;
	  }
      }
    interesting_attributes = att;
  }
}
/* Return the index of TAG_NAME in the known_tags table, or -1 if it
   is not a tag we know about.  The comparison is case-insensitive.  */
static int
find_tag (const char *tag_name)
{
  int idx;

  /* A linear scan is fine for a table this small; since known_tags
     is kept sorted alphabetically, we can also stop as soon as we
     pass the spot where TAG_NAME would have appeared.  */
  for (idx = 0; idx < ARRAY_SIZE (known_tags); idx++)
    {
      int diff = strcasecmp (known_tags[idx].name, tag_name);
      if (diff == 0)
	return idx;
      if (diff > 0)
	break;
    }
  return -1;
}
/* Return the value of the attribute named NAME in TAG, or NULL when
   TAG carries no such attribute.  The name comparison is
   case-insensitive.  When ATTRID is non-NULL, the index of the
   matching attribute is stored through it.  */
static char *
find_attr (struct taginfo *tag, const char *name, int *attrid)
{
  int pos;

  for (pos = 0; pos < tag->nattrs; pos++)
    {
      if (strcasecmp (tag->attrs[pos].name, name))
	continue;
      if (attrid)
	*attrid = pos;
      return tag->attrs[pos].value;
    }
  return NULL;
}
/* Shared state threaded through collect_tags_mapper() via the CLOSURE
   argument of map_html_tags().  */
struct collect_urls_closure {
  char *text;			/* HTML text. */
  char *base;			/* Base URI of the document, possibly
				   changed through <base href=...>. */
  urlpos *head, *tail;		/* List of URLs */
  const char *parent_base;	/* Base of the current document. */
  const char *document_file;	/* File name of this document. */
  int dash_p_leaf_HTML;		/* Whether -p is specified, and this
				   document is the "leaf" node of the
				   HTML tree. */
  int nofollow;			/* whether NOFOLLOW was specified in a
				   <meta name=robots> tag. */
};
/* Resolve LINK_URI against the current base (stripping any #fragment
   first), wrap the result in a freshly allocated urlpos and append it
   to closure->tail.  TAG and ATTRID are the necessary context to
   store the position and size of the raw attribute value, which
   convert_links needs later.  If there is no base and LINK_URI has no
   protocol, the link is silently dropped.  */
static void
handle_link (struct collect_urls_closure *closure, const char *link_uri,
	     struct taginfo *tag, int attrid)
{
  int no_proto = !has_proto (link_uri);
  urlpos *newel;

  const char *base = closure->base ? closure->base : closure->parent_base;
  char *complete_uri;

  char *fragment = strrchr (link_uri, '#');

  if (fragment)
    {
      /* Nullify the fragment identifier, i.e. everything after the
	 last occurrence of `#', inclusive.  This copying is
	 relatively inefficient, but it doesn't matter because
	 fragment identifiers don't come up all that often.  */
      int hashlen = fragment - link_uri;
      char *p = alloca (hashlen + 1);
      memcpy (p, link_uri, hashlen);
      p[hashlen] = '\0';
      link_uri = p;
    }

  if (!base)
    {
      if (no_proto)
	{
	  /* We have no base, and the link does not have a protocol or
	     a host attached to it.  Nothing we can do.  */
	  /* #### Should we print a warning here?  Wget 1.5.x used to.  */
	  return;
	}
      else
	complete_uri = xstrdup (link_uri);
    }
  else
    complete_uri = url_concat (base, link_uri);

  DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
	   closure->document_file, base ? base : "(null)",
	   link_uri, complete_uri));

  newel = (urlpos *)xmalloc (sizeof (urlpos));

  memset (newel, 0, sizeof (*newel));
  newel->next = NULL;
  newel->url = complete_uri;
  /* POS/SIZE locate the raw attribute text inside closure->text.  */
  newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
  newel->size = tag->attrs[attrid].value_raw_size;

  /* A URL is relative if the host and protocol are not named, and the
     name does not start with `/'.
     #### This logic might need some rethinking.  */
  if (no_proto && *link_uri != '/')
    newel->flags |= (URELATIVE | UNOPROTO);
  else if (no_proto)
    newel->flags |= UNOPROTO;

  if (closure->tail)
    {
      closure->tail->next = newel;
      closure->tail = newel;
    }
  else
    closure->tail = closure->head = newel;
}
/* Mapper function, called by map_html_tags for each interesting tag
   found in the document.  ARG is really a `struct
   collect_urls_closure'.

   Tags in the TC_LINK category have each of their URL-carrying
   attributes resolved and appended to the closure's URL list via
   handle_link.  Tags in the TC_SPEC category get special-cased
   treatment: <base href> changes the base URL, <link href> is
   followed (subject to the -p leaf-node restriction), and <meta> is
   inspected both for the Refresh mechanism and for the robots
   "nofollow"/"none" directives.

   #### It would be nice to split this into several functions.  */
static void
collect_tags_mapper (struct taginfo *tag, void *arg)
{
  struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
  int tagid = find_tag (tag->name);
  assert (tagid != -1);

  switch (known_tags[tagid].category)
    {
    case TC_LINK:
      {
	int i;
	int size = ARRAY_SIZE (url_tag_attr_map);

	/* Skip to the first url_tag_attr_map entry describing this
	   tag; the entries for one tag are contiguous.  */
	for (i = 0; i < size; i++)
	  if (url_tag_attr_map[i].tagid == tagid)
	    break;

	/* We've found the index of url_tag_attr_map where the
	   attributes of our tag begin.  Now, look for every one of
	   them, and handle it.  */
	for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
	  {
	    char *attr_value;
	    int id;

	    if (closure->dash_p_leaf_HTML
		&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
	      /* If we're at a -p leaf node, we don't want to retrieve
		 links to references we know are external, such as <a
		 href=...>.  */
	      continue;

	    /* This find_attr() buried in a loop may seem inefficient
	       (O(n^2)), but it's not, since the number of attributes
	       (n) we loop over is extremely small.  In the worst case
	       of IMG with all its possible attributes, n^2 will be
	       only 9.  */
	    attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
	    if (attr_value)
	      handle_link (closure, attr_value, tag, id);
	  }
      }
      break;

    case TC_SPEC:
      switch (tagid)
	{
	case TAG_BASE:
	  {
	    /* <base href=...> redefines the base URL for the rest of
	       the document.  */
	    char *newbase = find_attr (tag, "href", NULL);
	    if (!newbase)
	      break;
	    if (closure->base)
	      free (closure->base);
	    if (closure->parent_base)
	      closure->base = url_concat (closure->parent_base, newbase);
	    else
	      closure->base = xstrdup (newbase);
	  }
	  break;
	case TAG_LINK:
	  {
	    int id;
	    char *rel = find_attr (tag, "rel", NULL);
	    char *href = find_attr (tag, "href", &id);
	    if (href)
	      {
		/* In the normal case, all <link href=...> tags are
		   fair game.

		   In the special case of when -p is active, however,
		   and we're at a leaf node (relative to the -l
		   max. depth) in the HTML document tree, the only
		   <LINK> tag we'll follow is a <LINK REL=
		   "stylesheet">, as it's necessary for displaying
		   this document properly.  We won't follow other
		   <LINK> tags, like <LINK REL="home">, for instance,
		   as they refer to external documents.  */
		if (!closure->dash_p_leaf_HTML
		    || (rel && !strcasecmp (rel, "stylesheet")))
		  handle_link (closure, href, tag, id);
	      }
	  }
	  break;
	case TAG_META:
	  /* Some pages use a META tag to specify that the page be
	     refreshed by a new page after a given number of seconds.
	     The general format for this is:
	       <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
	     So we just need to skip past the "NUMBER; URL=" garbage
	     to get to the URL.  */
	  {
	    int id;
	    char *name = find_attr (tag, "name", NULL);
	    char *http_equiv = find_attr (tag, "http-equiv", NULL);
	    if (http_equiv && !strcasecmp (http_equiv, "refresh"))
	      {
		/* Take ID from the "content" attribute -- it is that
		   attribute's raw position that is adjusted below and
		   handed to handle_link.  (The previous code took ID
		   from "http-equiv" and dereferenced a NULL pointer
		   when "content" was missing.)  */
		char *refresh = find_attr (tag, "content", &id);
		char *p;
		int offset;

		if (!refresh)
		  return;
		p = refresh;
		while (ISDIGIT (*p))
		  ++p;
		if (*p++ != ';')
		  return;
		while (ISSPACE (*p))
		  ++p;
		if (!(TOUPPER (*p) == 'U'
		      && TOUPPER (*(p + 1)) == 'R'
		      && TOUPPER (*(p + 2)) == 'L'
		      && *(p + 3) == '='))
		  return;
		p += 4;
		while (ISSPACE (*p))
		  ++p;
		/* Shrink the recorded raw attribute position/size so
		   that only the URL part of the value is considered
		   by later in-place link rewriting.  */
		offset = p - refresh;
		tag->attrs[id].value_raw_beginning += offset;
		tag->attrs[id].value_raw_size -= offset;
		handle_link (closure, p, tag, id);
	      }
	    else if (name && !strcasecmp (name, "robots"))
	      {
		/* Handle stuff like:
		   <meta name="robots" content="index,nofollow"> */
		char *content = find_attr (tag, "content", NULL);
		if (!content)
		  return;
		if (!strcasecmp (content, "none"))
		  closure->nofollow = 1;
		else
		  {
		    while (*content)
		      {
			char *end;
			/* Tolerate whitespace between tokens, as in
			   "index, nofollow".  */
			while (ISSPACE (*content))
			  ++content;
			/* Find the end of the current token: the next
			   `,' or the end of the string.  */
			end = strchr (content, ',');
			if (!end)
			  end = content + strlen (content);
			/* Match the token exactly.  The previous code
			   included the terminating comma in the
			   comparison length (so "nofollow,index"
			   never matched) and accepted any prefix of
			   "nofollow" (so a token "no" matched).  */
			if (end - content == 8
			    && !strncasecmp (content, "nofollow", 8))
			  closure->nofollow = 1;
			content = (*end == ',') ? end + 1 : end;
		      }
		  }
	      }
	  }
	  break;
	default:
	  /* Category is TC_SPEC, but tag name is unhandled.  This
	     must not be.  */
	  abort ();
	}
      break;
    }
}
/* Scan FILE as an HTML document and return the links found in it, as
   a chain of freshly allocated `urlpos' structures (the HTML-aware
   counterpart of get_urls_file).  Relative links are resolved
   against THIS_URL or, failing that, against opt.base_href or a
   <base href=...> tag found in the document itself.

   DASH_P_LEAF_HTML should be non-zero when this document is a leaf
   node (relative to the -l maximum depth) while -p is in effect; it
   suppresses links known to lead to external documents.

   If META_DISALLOW_FOLLOW is non-NULL, *META_DISALLOW_FOLLOW is set
   to whether a <meta name=robots> tag forbade following this
   document's links.  The caller owns the returned list.  */
urlpos *
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
	       int *meta_disallow_follow)
{
  struct file_memory *fm;
  struct collect_urls_closure closure;

  /* Load the file. */
  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  /* Prepare the closure that collect_tags_mapper will fill in.  */
  closure.text = fm->content;
  closure.head = closure.tail = NULL;
  closure.base = NULL;
  closure.parent_base = this_url ? this_url : opt.base_href;
  closure.document_file = file;
  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
  closure.nofollow = 0;

  if (!interesting_tags)
    init_interesting ();
  /* Walk the document, invoking collect_tags_mapper on each
     interesting tag.  */
  map_html_tags (fm->content, fm->length, interesting_tags,
		 interesting_attributes, collect_tags_mapper, &closure);

  DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = closure.nofollow;

  FREE_MAYBE (closure.base);
  read_file_free (fm);
  return closure.head;
}

View File

@ -254,6 +254,85 @@ http_process_type (const char *hdr, void *arg)
return 1;
}
/* Header callback: set *ARG (an int flag) to 1 if the value of the
   `Connection' header is "keep-alive", compared case-insensitively.
   Always returns 1 so that header processing continues.  */
static int
http_process_connection (const char *hdr, void *arg)
{
  int *flag = (int *)arg;
  const char *want = "keep-alive";
  const char *p = hdr;

  /* Case-insensitive match of HDR against "keep-alive".  */
  while (*want && tolower ((unsigned char) *p) == (unsigned char) *want)
    {
      ++p;
      ++want;
    }
  if (*want == '\0' && *p == '\0')
    *flag = 1;
  return 1;
}
/* Persistent connections (pc). */
static unsigned char pc_last_host[4];
static unsigned short pc_last_port;
static int pc_last_fd;
/* Remember FD as the cached persistent connection to HOST:PORT, so
   that a subsequent request to the same host and port can reuse it
   (see persistent_available_p).  If HOST's address cannot be
   resolved, the cache is left untouched.  */
static void
register_persistent (const char *host, unsigned short port, int fd)
{
  /* If a different connection is already cached, close it before
     overwriting the slot -- the previous code silently leaked that
     file descriptor.  */
  if (pc_last_port && pc_last_fd != fd)
    {
      CLOSE (pc_last_fd);
      pc_last_port = 0;		/* mark the slot as free */
    }
  if (!store_hostaddress (pc_last_host, host))
    return;
  pc_last_port = port;
  pc_last_fd = fd;
}
/* Forget the cached persistent connection.  A zero pc_last_port
   serves as the "nothing cached" marker, checked by
   persistent_available_p.  Note that this does not close
   pc_last_fd; callers are expected to have closed it already, or to
   have found the socket dead.  */
static void
invalidate_persistent (void)
{
  pc_last_port = 0;
}
/* Return non-zero if the cached persistent connection may be reused
   for HOST:PORT.  As a side effect, a cached connection whose socket
   turns out to be dead is invalidated.  */
static int
persistent_available_p (const char *host, unsigned short port)
{
  unsigned char addr[4];

  /* Cheap rejections first: different port, unresolvable host, or an
     address other than the cached one.  (Same evaluation order as
     before, so resolution is only attempted when the port matches.)  */
  if (port != pc_last_port
      || !store_hostaddress (addr, host)
      || memcmp (pc_last_host, addr, 4) != 0)
    return 0;

  /* The cached connection matches; make sure its socket is still
     alive before handing it out.  */
  if (test_socket_open (pc_last_fd))
    return 1;

  invalidate_persistent ();
  return 0;
}
/* The idea behind these two CLOSE macros is to distinguish between
two cases: one when the job we've been doing is finished, and we
want to close the connection and leave, and two when something is
seriously wrong and we're closing the connection as part of
cleanup.
In case of keep_alive, CLOSE_FINISH should leave the connection
open, while CLOSE_INVALIDATE should still close it.
The semantic difference between the flags `keep_alive' and
`reused_connection' is that keep_alive defines the state of HTTP:
whether the connection *will* be preservable. reused_connection,
on the other hand, reflects the present: whether the *current*
connection is the result of preserving. */
#define CLOSE_FINISH(fd) do { \
if (!keep_alive) \
{ \
CLOSE (fd); \
if (reused_connection) \
invalidate_persistent (); \
} \
} while (0)
#define CLOSE_INVALIDATE(fd) do { \
CLOSE (fd); \
if (reused_connection) \
invalidate_persistent (); \
} while (0)
struct http_stat
{
@ -317,6 +396,8 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
FILE *fp;
int auth_tried_already;
struct rbuf rbuf;
int keep_alive, http_keep_alive_1, http_keep_alive_2;
int reused_connection;
if (!(*dt & HEAD_ONLY))
/* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
@ -329,6 +410,9 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
again:
/* We need to come back here when the initial attempt to retrieve
without authorization header fails. */
keep_alive = 0;
http_keep_alive_1 = http_keep_alive_2 = 0;
reused_connection = 0;
/* Initialize certain elements of struct http_stat. */
hs->len = 0L;
@ -345,40 +429,49 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
ou = u;
/* First: establish the connection. */
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
err = make_connection (&sock, u->host, u->port);
switch (err)
if (u->proxy || !persistent_available_p (u->host, u->port))
{
case HOSTERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
return HOSTERR;
break;
case CONSOCKERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
return CONSOCKERR;
break;
case CONREFUSED:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET,
_("Connection to %s:%hu refused.\n"), u->host, u->port);
CLOSE (sock);
return CONREFUSED;
case CONERROR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
CLOSE (sock);
return CONERROR;
break;
case NOCONERROR:
/* Everything is fine! */
logputs (LOG_VERBOSE, _("connected!\n"));
break;
default:
abort ();
break;
} /* switch */
logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
err = make_connection (&sock, u->host, u->port);
switch (err)
{
case HOSTERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
return HOSTERR;
break;
case CONSOCKERR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
return CONSOCKERR;
break;
case CONREFUSED:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET,
_("Connection to %s:%hu refused.\n"), u->host, u->port);
CLOSE (sock);
return CONREFUSED;
case CONERROR:
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
CLOSE (sock);
return CONERROR;
break;
case NOCONERROR:
/* Everything is fine! */
logputs (LOG_VERBOSE, _("connected!\n"));
break;
default:
abort ();
break;
}
}
else
{
logprintf (LOG_VERBOSE, _("Reusing connection to %s:%hu.\n"), u->host, u->port);
sock = pc_last_fd;
reused_connection = 1;
}
if (u->proxy)
path = u->proxy->url;
@ -487,6 +580,7 @@ gethttp (struct urlinfo *u, struct http_stat *hs, int *dt)
User-Agent: %s\r\n\
Host: %s%s\r\n\
Accept: %s\r\n\
Connection: Keep-Alive\r\n\
%s%s%s%s%s%s\r\n",
command, path, useragent, remhost,
host_port ? host_port : "",
@ -505,8 +599,9 @@ Accept: %s\r\n\
num_written = iwrite (sock, request, strlen (request));
if (num_written < 0)
{
logputs (LOG_VERBOSE, _("Failed writing HTTP request.\n"));
CLOSE (sock);
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
strerror (errno));
CLOSE_INVALIDATE (sock);
return WRITEFAILED;
}
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
@ -553,7 +648,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return HEOF;
}
else if (status == HG_ERROR)
@ -565,7 +660,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return HERR;
}
@ -672,12 +767,32 @@ Accept: %s\r\n\
goto done_header;
}
}
/* Check for the `Keep-Alive' header. */
if (!http_keep_alive_1)
{
if (header_process (hdr, "Keep-Alive", header_exists,
&http_keep_alive_1))
goto done_header;
}
/* Check for `Connection: Keep-Alive'. */
if (!http_keep_alive_2)
{
if (header_process (hdr, "Connection", http_process_connection,
&http_keep_alive_2))
goto done_header;
}
done_header:
free (hdr);
}
logputs (LOG_VERBOSE, "\n");
if (contlen != -1
&& (http_keep_alive_1 || http_keep_alive_2))
keep_alive = 1;
if (keep_alive && !reused_connection)
register_persistent (u->host, u->port, sock);
if ((statcode == HTTP_STATUS_UNAUTHORIZED)
&& authenticate_h)
{
@ -685,7 +800,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
type = NULL;
FREEHSTAT (*hs);
CLOSE (sock);
CLOSE_FINISH (sock);
if (auth_tried_already)
{
/* If we have tried it already, then there is not point
@ -753,7 +868,7 @@ Accept: %s\r\n\
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_INVALIDATE (sock);
return RANGEERR;
}
@ -783,7 +898,7 @@ Accept: %s\r\n\
_("Location: %s%s\n"),
hs->newloc ? hs->newloc : _("unspecified"),
hs->newloc ? _(" [following]") : "");
CLOSE (sock);
CLOSE_FINISH (sock);
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
return NEWLOCATION;
@ -824,7 +939,7 @@ Accept: %s\r\n\
hs->res = 0;
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_FINISH (sock);
return RETRFINISHED;
}
@ -838,7 +953,7 @@ Accept: %s\r\n\
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", u->local, strerror (errno));
CLOSE (sock);
CLOSE_FINISH (sock);
FREE_MAYBE (all_headers);
return FOPENERR;
}
@ -863,7 +978,7 @@ Accept: %s\r\n\
/* Get the contents of the document. */
hs->res = get_contents (sock, fp, &hs->len, hs->restval,
(contlen != -1 ? contlen : 0),
&rbuf);
&rbuf, keep_alive);
hs->dltime = elapsed_time ();
{
/* Close or flush the file. We have to be careful to check for
@ -878,7 +993,7 @@ Accept: %s\r\n\
hs->res = -2;
}
FREE_MAYBE (all_headers);
CLOSE (sock);
CLOSE_FINISH (sock);
if (hs->res == -2)
return FWRITEERR;
return RETRFINISHED;

View File

@ -97,6 +97,20 @@ i18n_initialize (void)
textdomain ("wget");
#endif /* HAVE_NLS */
}
/* It's kosher to declare these here because their interface _has_ to
   be void foo(void). */
void url_init PARAMS ((void));
void host_init PARAMS ((void));

/* This just calls the various initialization functions from the
   modules that need one-time initialization.  Called once from
   main(), before option processing.  (url_init presumably builds
   url.c's unsafe-character table and host_init host.c's hash tables
   -- confirm in the respective modules.)  */
static void
private_initialize (void)
{
  url_init ();
  host_init ();
}
/* Print the usage message. */
static void
@ -293,6 +307,7 @@ main (int argc, char *const *argv)
};
i18n_initialize ();
private_initialize ();
append_to_log = 0;

View File

@ -42,21 +42,20 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "ftp.h"
#include "fnmatch.h"
#include "host.h"
#include "hash.h"
extern char *version_string;
#define ROBOTS_FILENAME "robots.txt"
/* #### Many of these lists should really be hashtables! */
/* List of downloaded URLs. */
static urlpos *urls_downloaded;
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;
/* List of HTML URLs. */
static slist *urls_html;
/* List of undesirable-to-load URLs. */
static slist *ulist;
static struct hash_table *undesirable_urls;
/* List of forbidden locations. */
static char **forbidden = NULL;
@ -84,14 +83,28 @@ static int robots_match PARAMS ((struct urlinfo *, char **));
void
recursive_cleanup (void)
{
free_slist (ulist);
ulist = NULL;
if (undesirable_urls)
{
string_set_free (undesirable_urls);
undesirable_urls = NULL;
}
if (dl_file_url_map)
{
free_keys_and_values (dl_file_url_map);
hash_table_destroy (dl_file_url_map);
dl_file_url_map = NULL;
}
if (dl_url_file_map)
{
free_keys_and_values (dl_url_file_map);
hash_table_destroy (dl_url_file_map);
dl_url_file_map = NULL;
}
undesirable_urls = NULL;
free_vec (forbidden);
forbidden = NULL;
free_slist (urls_html);
slist_free (urls_html);
urls_html = NULL;
free_urlpos (urls_downloaded);
urls_downloaded = NULL;
FREE_MAYBE (base_dir);
FREE_MAYBE (robots_host);
first_time = 1;
@ -117,6 +130,7 @@ recursive_retrieve (const char *file, const char *this_url)
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
int dt, inl, dash_p_leaf_HTML = FALSE;
int meta_disallow_follow;
int this_url_ftp; /* See below the explanation */
uerr_t err;
struct urlinfo *rurl;
@ -132,17 +146,29 @@ recursive_retrieve (const char *file, const char *this_url)
/* Cache the current URL in the list. */
if (first_time)
{
ulist = add_slist (ulist, this_url, 0);
urls_downloaded = NULL;
/* These three operations need to be done only once per Wget
run. They should probably be at a different location. */
if (!undesirable_urls)
undesirable_urls = make_string_hash_table (0);
if (!dl_file_url_map)
dl_file_url_map = make_string_hash_table (0);
if (!dl_url_file_map)
dl_url_file_map = make_string_hash_table (0);
hash_table_clear (undesirable_urls);
string_set_add (undesirable_urls, this_url);
hash_table_clear (dl_file_url_map);
hash_table_clear (dl_url_file_map);
urls_html = NULL;
/* Enter this_url to the slist, in original and "enhanced" form. */
/* Enter this_url to the hash table, in original and "enhanced" form. */
u = newurl ();
err = parseurl (this_url, u, 0);
if (err == URLOK)
{
ulist = add_slist (ulist, u->url, 0);
urls_downloaded = add_url (urls_downloaded, u->url, file);
urls_html = add_slist (urls_html, file, NOSORT);
string_set_add (undesirable_urls, u->url);
hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
urls_html = slist_append (urls_html, file);
if (opt.no_parent)
base_dir = xstrdup (u->dir); /* Set the base dir. */
/* Set the canonical this_url to be sent as referer. This
@ -191,7 +217,15 @@ recursive_retrieve (const char *file, const char *this_url)
/* Get the URL-s from an HTML file: */
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
0, dash_p_leaf_HTML);
dash_p_leaf_HTML, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
/* The META tag says we are not to follow this file. Respect
that. */
free_urlpos (url_list);
url_list = NULL;
}
/* Decide what to do with each of the URLs. A URL will be loaded if
it meets several requirements, discussed later. */
@ -240,16 +274,16 @@ recursive_retrieve (const char *file, const char *this_url)
the list. */
/* inl is set if the URL we are working on (constr) is stored in
ulist. Using it is crucial to avoid the incessant calls to
in_slist, which is quite slow. */
inl = in_slist (ulist, constr);
undesirable_urls. Using it is crucial to avoid unnecessary
repeated continuous hits to the hash table. */
inl = string_set_exists (undesirable_urls, constr);
/* If it is FTP, and FTP is not followed, chuck it out. */
if (!inl)
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
{
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If it is absolute link and they are not followed, chuck it
@ -258,7 +292,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (opt.relative_only && !(cur_url->flags & URELATIVE))
{
DEBUGP (("It doesn't really look like a relative link.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If its domain is not to be accepted/looked-up, chuck it out. */
@ -266,7 +300,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (!accept_domain (u))
{
DEBUGP (("I don't like the smell of that domain.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
/* Check for parent directory. */
@ -286,7 +320,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
/* Failing that too, kill the URL. */
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
freeurl (ut, 1);
@ -300,7 +334,7 @@ recursive_retrieve (const char *file, const char *this_url)
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -330,7 +364,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
constr, u->file));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -353,12 +387,12 @@ recursive_retrieve (const char *file, const char *this_url)
}
free (constr);
constr = xstrdup (u->url);
inl = in_slist (ulist, constr);
string_set_add (undesirable_urls, constr);
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
if (!opt.spanhost && this_url && !same_host (this_url, constr))
{
DEBUGP (("This is not the same hostname as the parent's.\n"));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -398,7 +432,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
ROBOTS_FILENAME));
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
inl = 1;
}
}
@ -409,7 +443,7 @@ recursive_retrieve (const char *file, const char *this_url)
{
DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URL-s. */
ulist = add_slist (ulist, constr, 0);
string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
recursively. */
if (u->proto == URLFTP)
@ -439,10 +473,13 @@ recursive_retrieve (const char *file, const char *this_url)
{
if (dt & RETROKF)
{
urls_downloaded = add_url (urls_downloaded, constr, filename);
hash_table_put (dl_file_url_map,
xstrdup (filename), xstrdup (constr));
hash_table_put (dl_url_file_map,
xstrdup (constr), xstrdup (filename));
/* If the URL is HTML, note it. */
if (dt & TEXTHTML)
urls_html = add_slist (urls_html, filename, NOSORT);
urls_html = slist_append (urls_html, filename);
}
}
/* If there was no error, and the type is text/html, parse
@ -489,6 +526,10 @@ recursive_retrieve (const char *file, const char *this_url)
/* Increment the pbuf for the appropriate size. */
}
if (opt.convert_links && !opt.delete_after)
/* This is merely the first pass: the links that have been
successfully downloaded are converted. In the second pass,
convert_all_links() will also convert those links that have NOT
been downloaded to their canonical form. */
convert_links (file, url_list);
/* Free the linked list of URL-s. */
free_urlpos (url_list);
@ -531,30 +572,37 @@ void
convert_all_links (void)
{
uerr_t res;
urlpos *l1, *l2, *urls;
urlpos *l1, *urls;
struct urlinfo *u;
slist *html;
urlpos *urlhtml;
for (html = urls_html; html; html = html->next)
{
int meta_disallow_follow;
char *url;
DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file. get_urls_html will need
it. */
for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
if (!strcmp (urlhtml->local_name, html->string))
break;
if (urlhtml)
DEBUGP (("It should correspond to %s.\n", urlhtml->url));
url = hash_table_get (dl_file_url_map, html->string);
if (url)
DEBUGP (("It should correspond to %s.\n", url));
else
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
FALSE);
urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
/* The META tag says we are not to follow this file.
Respect that. */
free_urlpos (urls);
urls = NULL;
}
if (!urls)
continue;
for (l1 = urls; l1; l1 = l1->next)
{
char *local_name;
/* The URL must be in canonical form to be compared. */
u = newurl ();
res = parseurl (l1->url, u, 0);
@ -565,22 +613,18 @@ convert_all_links (void)
}
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS.
Note: not yet implemented; only ABS2REL works. */
for (l2 = urls_downloaded; l2; l2 = l2->next)
if (!strcmp (l2->url, u->url))
{
DEBUGP (("%s flagged for conversion, local %s\n",
l2->url, l2->local_name));
break;
}
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
local_name = hash_table_get (dl_url_file_map, u->url);
if (local_name)
DEBUGP (("%s flagged for conversion, local %s\n",
u->url, local_name));
/* Clear the flags. */
l1->flags &= ~ (UABS2REL | UREL2ABS);
/* Decide on the conversion direction. */
if (l2)
if (local_name)
{
l1->flags |= UABS2REL;
l1->local_name = xstrdup (l2->local_name);
l1->local_name = xstrdup (local_name);
}
else
{

View File

@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "ftp.h"
#include "host.h"
#include "connect.h"
#include "hash.h"
#ifdef WINDOWS
LARGE_INTEGER internal_time;
@ -60,6 +61,8 @@ enum spflags { SP_NONE, SP_INIT, SP_FINISH };
static int show_progress PARAMS ((long, long, enum spflags));
#define MIN(i, j) ((i) <= (j) ? (i) : (j))
/* Reads the contents of file descriptor FD, until it is closed, or a
read error occurs. The data is read in 8K chunks, and stored to
stream fp, which should have been open for writing. If BUF is
@ -83,9 +86,9 @@ static int show_progress PARAMS ((long, long, enum spflags));
from fd immediately, flush or discard the buffer. */
int
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
struct rbuf *rbuf)
struct rbuf *rbuf, int use_expected)
{
int res;
int res = 0;
static char c[8192];
*len = restval;
@ -105,10 +108,17 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
*len += res;
}
}
/* Read from fd while there is available data. */
do
/* Read from fd while there is available data.
Normally, if expected is 0, it means that it is not known how
much data is expected. However, if use_expected is specified,
then expected being zero means exactly that. */
while (!use_expected || (*len < expected))
{
res = iread (fd, c, sizeof (c));
int amount_to_read = (use_expected
? MIN (expected - *len, sizeof (c))
: sizeof (c));
res = iread (fd, c, amount_to_read);
if (res > 0)
{
if (fwrite (c, sizeof (char), res, fp) < res)
@ -120,7 +130,9 @@ get_contents (int fd, FILE *fp, long *len, long restval, long expected,
}
*len += res;
}
} while (res > 0);
else
break;
}
if (res < -1)
res = -1;
if (opt.verbose)
@ -323,7 +335,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
int local_use_proxy;
char *mynewloc, *proxy;
struct urlinfo *u;
slist *redirections;
struct hash_table *redirections = NULL;
/* If dt is NULL, just ignore it. */
if (!dt)
@ -334,8 +346,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file)
*file = NULL;
redirections = NULL;
u = newurl ();
/* Parse the URL. */
result = parseurl (url, u, 0);
@ -343,7 +353,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return result;
}
@ -379,7 +390,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
{
logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return PROXERR;
}
@ -392,7 +404,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
else
logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
return PROXERR;
}
@ -454,7 +467,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result));
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
free (mynewloc);
return result;
@ -466,34 +480,29 @@ retrieve_url (const char *origurl, char **file, char **newloc,
free (mynewloc);
mynewloc = xstrdup (newloc_struct->url);
/* Check for redirection to back to itself. */
if (!strcmp (u->url, newloc_struct->url))
if (!redirections)
{
logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
mynewloc);
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
free (url);
free (mynewloc);
return WRONGCODE;
redirections = make_string_hash_table (0);
/* Add current URL immediately so we can detect it as soon
as possible in case of a cycle. */
string_set_add (redirections, u->url);
}
/* The new location is OK. Let's check for redirection cycle by
peeking through the history of redirections. */
if (in_slist (redirections, newloc_struct->url))
if (string_set_exists (redirections, newloc_struct->url))
{
logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
mynewloc);
freeurl (newloc_struct, 1);
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
free (url);
free (mynewloc);
return WRONGCODE;
}
redirections = add_slist (redirections, newloc_struct->url, NOSORT);
string_set_add (redirections, newloc_struct->url);
free (url);
url = mynewloc;
@ -510,7 +519,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
*file = NULL;
}
freeurl (u, 1);
free_slist (redirections);
if (redirections)
string_set_free (redirections);
if (newloc)
*newloc = url;
@ -531,9 +541,7 @@ retrieve_from_file (const char *file, int html, int *count)
uerr_t status;
urlpos *url_list, *cur_url;
/* If spider-mode is on, we do not want get_urls_html barfing
errors on baseless links. */
url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE)
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */

View File

@ -22,7 +22,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "rbuf.h"
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *));
int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *, int));
uerr_t retrieve_url PARAMS ((const char *, char **, char **,
const char *, int *));

615
src/url.c
View File

@ -38,7 +38,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "utils.h"
#include "url.h"
#include "host.h"
#include "html.h"
#ifndef errno
extern int errno;
@ -48,22 +47,12 @@ extern int errno;
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
/* URL separator (for findurl) */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
/* Table of unsafe chars.  This is initialized in
init_unsafe_char_table. */
/* A list of unsafe characters for encoding, as per RFC1738. '@' and
':' (not listed in RFC) were added because of user/password
encoding. */
static char unsafe_char_table[256];
#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */
#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
|| ((unsigned char)(c) > '~') /* ASCII 127 */ \
|| strchr (URL_UNSAFE_CHARS, c))
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
version that doesn't. */
@ -176,6 +165,34 @@ skip_url (const char *url)
return 0;
}
/* Build the lookup table consulted by UNSAFE_CHAR.  An octet is
   unsafe if it is:
   - a control character or space (anything <= 32);
   - anything at or above 127;
   - one of the characters RFC 1738 lists as unsafe
     ("<>\"#%{}|\\^~[]`");
   - `@' or `:', which Wget reserves for user/password encoding.
   Must be called once (from url_init) before UNSAFE_CHAR is used.  */
void
init_unsafe_char_table (void)
{
  int i;
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
	|| i == '<'
	|| i == '>'
	|| i == '\"'
	|| i == '#'
	|| i == '%'
	|| i == '{'
	|| i == '}'
	|| i == '|'
	|| i == '\\'
	|| i == '^'
	|| i == '~'
	|| i == '['
	|| i == ']'
	|| i == '`'
	/* The URL_UNSAFE_CHARS macro this table replaces, and the
	   comment above, both include `@' and `:', but they were
	   missing from the rewritten test -- restore them so
	   user/password encoding keeps working.  */
	|| i == '@'
	|| i == ':')
      unsafe_char_table[i] = 1;
}
/* Returns 1 if the string contains unsafe characters, 0 otherwise. */
int
contains_unsafe (const char *s)
@ -296,7 +313,7 @@ skip_proto (const char *url)
/* Returns 1 if the URL begins with a protocol (supported or
unsupported), 0 otherwise. */
static int
int
has_proto (const char *url)
{
char **s;
@ -765,297 +782,54 @@ url_equal (const char *url1, const char *url2)
return res;
}
/* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
   buffer may contain pretty much anything; no errors are signaled.

   BUF is the text to scan and HOWMUCH the number of bytes in it
   (the buffer need not be NUL-terminated).  On success, returns a
   pointer into BUF where the URL begins and stores the URL's length
   in *COUNT; returns NULL (leaving *COUNT untouched) if no known
   protocol prefix is found.  The URL is delimited by the first
   whitespace, non-printable-ASCII, or URL_SEPARATOR character.  */
static const char *
findurl (const char *buf, int howmuch, int *count)
{
  char **prot;
  const char *s1, *s2;

  /* Slide S1 over the buffer, trying every known protocol prefix at
     each position.  */
  for (s1 = buf; howmuch; s1++, howmuch--)
    for (prot = protostrings; *prot; prot++)
      if (howmuch <= strlen (*prot))
	/* Not enough bytes left for this prefix plus at least one
	   more character -- try the next protocol.  */
	continue;
      else if (!strncasecmp (*prot, s1, strlen (*prot)))
	{
	  /* Prefix matched; measure the URL's extent.  (HOWMUCH is
	     decremented here too, but that is harmless since we
	     return immediately.)  */
	  for (s2 = s1, *count = 0;
	       howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
	       !strchr (URL_SEPARATOR, *s2);
	       s2++, (*count)++, howmuch--);
	  return s1;
	}
  return NULL;
}
/* Scans the file for signs of URL-s. Returns a vector of pointers,
each pointer representing a URL string. The file is *not* assumed
to be HTML. */
urlpos *
get_urls_file (const char *file)
{
long nread;
FILE *fp;
char *buf;
const char *pbuf;
int size;
urlpos *first, *current, *old;
struct file_memory *fm;
urlpos *head, *tail;
const char *text, *text_end;
if (file && !HYPHENP (file))
{
fp = fopen (file, "rb");
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
}
else
fp = stdin;
/* Load the file. */
load_file (fp, &buf, &nread);
if (file && !HYPHENP (file))
fclose (fp);
DEBUGP (("Loaded %s (size %ld).\n", file, nread));
first = current = NULL;
/* Fill the linked list with URLs. */
for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
pbuf += size)
fm = read_file (file);
if (!fm)
{
/* Allocate the space. */
old = current;
current = (urlpos *)xmalloc (sizeof (urlpos));
if (old)
old->next = current;
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = (char *)xmalloc (size + 1);
memcpy (current->url, pbuf, size);
current->url[size] = '\0';
if (!first)
first = current;
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
/* Free the buffer. */
free (buf);
return first;
}
/* Similar to get_urls_file, but for HTML files. FILE is scanned as
an HTML document using htmlfindurl(), which see. get_urls_html()
constructs the HTML-s from the relative href-s.
If SILENT is non-zero, do not barf on baseless relative links. */
urlpos *
get_urls_html (const char *file, const char *this_url, int silent,
int dash_p_leaf_HTML)
{
long nread;
FILE *fp;
char *orig_buf;
const char *buf;
int step, first_time;
urlpos *first, *current, *old;
if (file && !HYPHENP (file))
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
head = tail = NULL;
text = fm->content;
text_end = fm->content + fm->length;
while (text < text_end)
{
fp = fopen (file, "rb");
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
}
else
fp = stdin;
/* Load the file. */
load_file (fp, &orig_buf, &nread);
if (file && !HYPHENP (file))
fclose (fp);
DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
first = current = NULL;
first_time = 1;
/* Iterate over the URLs in BUF, picked by htmlfindurl(). */
for (buf = orig_buf;
(buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
dash_p_leaf_HTML));
buf += step)
{
int i, no_proto;
int size = step;
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
char *needs_freeing, *url_data;
first_time = 0;
/* A frequent phenomenon that needs to be handled are pages
generated by brain-damaged HTML generators, which refer to
URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
any spaces at the beginning or at the end of the string.
This is probably not strictly correct, but that's what the
browsers do, so we may follow. May the authors of "WYSIWYG"
HTML tools burn in hell for the damage they've inflicted! */
while ((pbuf < buf + step) && ISSPACE (*pbuf))
{
++pbuf;
--size;
}
while (size && ISSPACE (pbuf[size - 1]))
--size;
if (!size)
break;
/* It would be nice if we could avoid allocating memory in this
loop, but I don't see an easy way. To process the entities,
we need to either copy the data, or change it destructively.
I choose the former.
We have two pointers: needs_freeing and url_data, because the
code below does thing like url_data += <something>, and we
want to pass the original string to free(). */
needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
size = strlen (url_data);
for (i = 0; protostrings[i]; i++)
{
if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
&& !(strncasecmp (url_data, "http:", 5) == 0
&& strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
const char *line_beg = text;
const char *line_end = memchr (text, '\n', text_end - text);
if (!line_end)
line_end = text_end;
else
++line_end;
text = line_end;
while (line_beg < line_end
&& ISSPACE (*line_beg))
++line_beg;
while (line_end > line_beg + 1
&& ISSPACE (*(line_end - 1)))
--line_end;
if (line_end > line_beg)
{
no_proto = 1;
/* This is for extremely brain-damaged pages that refer to
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
if ((size > 5) && !strncasecmp ("http:", url_data, 5))
url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
{
free (needs_freeing);
continue;
}
}
if (no_proto)
{
/* First, construct the base, which can be relative itself.
Criteria for creating the base are:
1) html_base created by <base href="...">
2) current URL
3) base provided from the command line */
cbase = html_base ();
if (!cbase)
cbase = this_url;
if (!cbase)
cbase = opt.base_href;
if (!cbase) /* Error condition -- a baseless
relative link. */
{
if (!opt.quiet && !silent)
{
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
free (needs_freeing);
continue;
}
if (this_url)
base = construct (this_url, cbase, strlen (cbase),
!has_proto (cbase));
urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
memset (entry, 0, sizeof (*entry));
entry->next = NULL;
entry->url = strdupdelim (line_beg, line_end);
if (!head)
head = entry;
else
{
/* Base must now be absolute, with host name and
protocol. */
if (!has_proto (cbase))
{
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
constr = construct (base, url_data, size, no_proto);
free (base);
tail->next = entry;
tail = entry;
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
if (opt.debug)
{
char *tmp;
const char *tmp2;
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
file, this_url ? this_url : "(null)",
tmp2 ? tmp2 : "(null)", tmp, constr);
free (tmp);
}
#endif
/* Allocate the space. */
old = current;
current = (urlpos *)xmalloc (sizeof (urlpos));
if (old)
old->next = current;
if (!first)
first = current;
/* Fill the values. */
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
current->size = step;
current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
free (needs_freeing);
}
free (orig_buf);
return first;
read_file_free (fm);
return head;
}
/* Free the linked list of urlpos. */
@ -1527,103 +1301,59 @@ no_proxy_match (const char *host, const char **no_proxy)
return !sufmatch (no_proxy, host);
}
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
/* Change the links in an HTML document. Accepts a structure that
defines the positions of all the links. */
void
convert_links (const char *file, urlpos *l)
{
struct file_memory *fm;
FILE *fp;
char *buf, *p, *p2;
char *p;
downloaded_file_t downloaded_file_return;
long size;
{
/* First we do a "dry run": go through the list L and see whether
any URL needs to be converted in the first place. If not, just
leave the file alone. */
int count = 0;
urlpos *dry = l;
for (dry = l; dry; dry = dry->next)
if (dry->flags & (UABS2REL | UREL2ABS))
++count;
if (!count)
{
logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"),
file);
return;
}
}
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* Read from the file.... */
fp = fopen (file, "rb");
if (!fp)
fm = read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
return;
}
/* ...to a buffer. */
load_file (fp, &buf, &size);
fclose (fp);
downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
if (opt.backup_converted && downloaded_file_return)
/* Rather than just writing over the original .html file with the converted
version, save the former to *.orig. Note we only do this for files we've
_successfully_ downloaded, so we don't clobber .orig files sitting around
from previous invocations. */
write_backup_file (file, downloaded_file_return);
/* Before opening the file for writing, unlink the file. This is
important if the data in FM is mmaped. In such case, nulling the
file, which is what fopen() below does, would make us read all
zeroes from the mmaped region. */
if (unlink (file) < 0 && errno != ENOENT)
{
/* Construct the backup filename as the original name plus ".orig". */
size_t filename_len = strlen(file);
char* filename_plus_orig_suffix;
boolean already_wrote_backup_file = FALSE;
slist* converted_file_ptr;
static slist* converted_files = NULL;
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
/* Just write "orig" over "html". We need to do it this way because
when we're checking to see if we've downloaded the file before (to
see if we can skip downloading it), we don't know if it's a
text/html file. Therefore we don't know yet at that stage that -E
is going to cause us to tack on ".html", so we need to compare
vs. the original URL plus ".orig", not the original URL plus
".html.orig". */
filename_plus_orig_suffix = xmalloc(filename_len + 1);
strcpy(filename_plus_orig_suffix, file);
strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
}
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
{
/* Append ".orig" to the name. */
filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
strcpy(filename_plus_orig_suffix, file);
strcpy(filename_plus_orig_suffix + filename_len, ".orig");
}
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file each
time in such a case, it'll end up containing the first-pass conversion,
not the original file. So, see if we've already been called on this
file. */
converted_file_ptr = converted_files;
while (converted_file_ptr != NULL)
if (strcmp(converted_file_ptr->string, file) == 0)
{
already_wrote_backup_file = TRUE;
break;
}
else
converted_file_ptr = converted_file_ptr->next;
if (!already_wrote_backup_file)
{
/* Rename <file> to <file>.orig before former gets written over. */
if (rename(file, filename_plus_orig_suffix) != 0)
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
file, filename_plus_orig_suffix, strerror (errno));
/* Remember that we've already written a .orig backup for this file.
Note that we never free this memory since we need it till the
convert_all_links() call, which is one of the last things the
program does before terminating. BTW, I'm not sure if it would be
safe to just set 'converted_file_ptr->string' to 'file' below,
rather than making a copy of the string... Another note is that I
thought I could just add a field to the urlpos structure saying
that we'd written a .orig file for this URL, but that didn't work,
so I had to make this separate list. */
converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
converted_file_ptr->next = converted_files;
converted_files = converted_file_ptr;
}
free(filename_plus_orig_suffix);
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
file, strerror (errno));
read_file_free (fm);
return;
}
/* Now open the file for writing. */
fp = fopen (file, "wb");
@ -1631,50 +1361,63 @@ convert_links (const char *file, urlpos *l)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
free (buf);
read_file_free (fm);
return;
}
/* Presumably we have to loop through multiple URLs here (even though we're
only talking about a single local file) because of the -O option. */
for (p = buf; l; l = l->next)
/* Here we loop through all the URLs in file, replacing those of
them that are downloaded with relative references. */
p = fm->content;
for (; l; l = l->next)
{
if (l->pos >= size)
char *url_start = fm->content + l->pos;
if (l->pos >= fm->length)
{
DEBUGP (("Something strange is going on. Please investigate."));
break;
}
/* If the URL already is relative or it is not to be converted
for some other reason (e.g. because of not having been
downloaded in the first place), skip it. */
if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
/* If the URL is not to be converted, skip it. */
if (!(l->flags & (UABS2REL | UREL2ABS)))
{
DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
l->pos, l->flags));
continue;
}
/* Else, reach the position of the offending URL, echoing
everything up to it to the outfile. */
for (p2 = buf + l->pos; p < p2; p++)
putc (*p, fp);
/* Echo the file contents, up to the offending URL's opening
quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
p = url_start;
if (l->flags & UABS2REL)
/* Convert absolute URL to relative. */
{
/* Convert absolute URL to relative. */
char *newname = construct_relative (file, l->local_name);
fprintf (fp, "%s", newname);
putc (*p, fp); /* quoting char */
fputs (newname, fp);
p += l->size - 1;
putc (*p, fp); /* close quote */
++p;
DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
l->url, newname, l->pos, file));
free (newname);
}
p += l->size;
else if (l->flags & UREL2ABS)
{
/* Convert the link to absolute URL. */
char *newlink = l->url;
putc (*p, fp); /* quoting char */
fputs (newlink, fp);
p += l->size - 1;
putc (*p, fp); /* close quote */
++p;
DEBUGP (("REL2ABS: <something> to %s at position %d in %s.\n",
newlink, l->pos, file));
}
}
/* Output the rest of the file. */
if (p - buf < size)
{
for (p2 = buf + size; p < p2; p++)
putc (*p, fp);
}
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
free (buf);
read_file_free (fm);
logputs (LOG_VERBOSE, _("done.\n"));
}
@ -1746,6 +1489,79 @@ add_url (urlpos *l, const char *url, const char *file)
return t;
}
/* Save FILE to FILE.orig (or, for -E-renamed files, FILE with its
   trailing "html" overwritten by "orig") before convert_links()
   rewrites it.  DOWNLOADED_FILE_RETURN tells us how FILE was
   obtained and therefore which backup name to use.  A static list of
   already-backed-up files guards against clobbering the backup when
   we are called twice for the same file.  */
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;   /* alloca()-ed; freed on return */
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;  /* files already backed up */

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      /* NOTE(review): this assumes FILE ends in ".html", i.e. that
         filename_len >= 4; a shorter name would make the second
         strcpy() write before the buffer -- confirm callers
         guarantee the ".html" suffix. */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
    }
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
    {
      /* Append ".orig" to the name.  sizeof(".orig") includes the
         terminating NUL. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
    }

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
      {
        already_wrote_backup_file = TRUE;
        break;
      }
    else
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
    {
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list. */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
    }
}
/* Remembers which files have been downloaded. In the standard case, should be
called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
@ -1798,3 +1614,10 @@ downloaded_file (downloaded_file_t mode, const char* file)
return FILE_NOT_ALREADY_DOWNLOADED;
}
}
/* Initialization of static stuff -- currently just the 256-entry
   unsafe-character lookup table used by UNSAFE_CHAR().  Intended to
   be run once at startup, before any URL processing. */
void
url_init (void)
{
  init_unsafe_char_table ();
}

View File

@ -88,6 +88,7 @@ struct urlinfo *newurl PARAMS ((void));
void freeurl PARAMS ((struct urlinfo *, int));
uerr_t urlproto PARAMS ((const char *));
int skip_proto PARAMS ((const char *));
int has_proto PARAMS ((const char *));
int skip_uname PARAMS ((const char *));
uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int));
@ -95,7 +96,7 @@ char *str_url PARAMS ((const struct urlinfo *, int));
int url_equal PARAMS ((const char *, const char *));
urlpos *get_urls_file PARAMS ((const char *));
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int));
urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
void free_urlpos PARAMS ((urlpos *));
char *url_concat PARAMS ((const char *, const char *));

View File

@ -31,6 +31,9 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#ifdef HAVE_MMAP
# include <sys/mman.h>
#endif
#ifdef HAVE_PWD_H
# include <pwd.h>
#endif
@ -45,11 +48,13 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef NeXT
# include <libc.h> /* for access() */
#endif
#include <fcntl.h>
#include <assert.h>
#include "wget.h"
#include "utils.h"
#include "fnmatch.h"
#include "hash.h"
#ifndef errno
extern int errno;
@ -736,28 +741,149 @@ read_whole_line (FILE *fp)
line = xrealloc (line, length + 1);
return line;
}
/* Read FILE into memory. A pointer to `struct file_memory' are
returned; use struct element `content' to access file contents, and
the element `length' to know the file length. `content' is *not*
zero-terminated, and you should *not* read or write beyond the [0,
length) range of characters.
/* Load file pointed to by FP to memory and return the malloc-ed
buffer with the contents. *NREAD will contain the number of read
bytes. The file is loaded in chunks, allocated exponentially,
starting with FILE_BUFFER_SIZE bytes. */
void
load_file (FILE *fp, char **buf, long *nread)
After you are done with the file contents, call read_file_free to
release the memory.
Depending on the operating system and the type of file that is
being read, read_file() either mmap's the file into memory, or
reads the file into the core using read().
If file is named "-", fileno(stdin) is used for reading instead.
If you want to read from a real file named "-", use "./-" instead. */
struct file_memory *
read_file (const char *file)
{
long bufsize;
int fd;
struct file_memory *fm;
long size;
int inhibit_close = 0;
bufsize = 512;
*nread = 0;
*buf = NULL;
while (!feof (fp) && !ferror (fp))
/* Some magic in the finest tradition of Perl and its kin: if FILE
is "-", just use stdin. */
if (HYPHENP (file))
{
*buf = (char *)xrealloc (*buf, bufsize + *nread);
*nread += fread (*buf + *nread, sizeof (char), bufsize, fp);
bufsize <<= 1;
fd = fileno (stdin);
inhibit_close = 1;
/* Note that we don't inhibit mmap() in this case. If stdin is
redirected from a regular file, mmap() will still work. */
}
/* #### No indication of encountered error?? */
else
fd = open (file, O_RDONLY);
if (fd < 0)
return NULL;
fm = xmalloc (sizeof (struct file_memory));
#ifdef HAVE_MMAP
{
struct stat buf;
if (fstat (fd, &buf) < 0)
goto mmap_lose;
fm->length = buf.st_size;
/* NOTE: As far as I know, the callers of this function never
modify the file text. Relying on this would enable us to
specify PROT_READ and MAP_SHARED for a marginal gain in
efficiency, but at some cost to generality. */
fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
MAP_PRIVATE, fd, 0);
if (fm->content == MAP_FAILED)
goto mmap_lose;
if (!inhibit_close)
close (fd);
fm->mmap_p = 1;
return fm;
}
mmap_lose:
/* The most common reason why mmap() fails is that FD does not point
to a plain file. However, it's also possible that mmap() doesn't
work for a particular type of file. Therefore, whenever mmap()
fails, we just fall back to the regular method. */
#endif /* HAVE_MMAP */
fm->length = 0;
size = 512; /* number of bytes fm->contents can
hold at any given time. */
fm->content = xmalloc (size);
while (1)
{
long nread;
if (fm->length > size / 2)
{
/* #### I'm not sure whether the whole exponential-growth
thing makes sense with kernel read. On Linux at least,
read() refuses to read more than 4K from a file at a
single chunk anyway. But other Unixes might optimize it
better, and it doesn't *hurt* anything, so I'm leaving
it. */
/* Normally, we grow SIZE exponentially to make the number
of calls to read() and realloc() logarithmic in relation
to file size. However, read() can read an amount of data
smaller than requested, and it would be unreasonable to
double SIZE every time *something* was read. Therefore,
we double SIZE only when the length exceeds half of the
entire allocated size. */
size <<= 1;
fm->content = xrealloc (fm->content, size);
}
nread = read (fd, fm->content + fm->length, size - fm->length);
if (nread > 0)
/* Successful read. */
fm->length += nread;
else if (nread < 0)
/* Error. */
goto lose;
else
/* EOF */
break;
}
if (!inhibit_close)
close (fd);
if (size > fm->length && fm->length != 0)
/* Due to exponential growth of fm->content, the allocated region
might be much larger than what is actually needed. */
fm->content = xrealloc (fm->content, fm->length);
fm->mmap_p = 0;
return fm;
lose:
if (!inhibit_close)
close (fd);
free (fm->content);
free (fm);
return NULL;
}
/* Release the resources held by FM. Specifically, this calls
munmap() or free() on fm->content, depending whether mmap or
malloc/read were used to read in the file. It also frees the
memory needed to hold the FM structure itself. */
/* Dispose of FM, previously returned by read_file().  Depending on
   how the contents were obtained, either munmap() or free() is used
   on fm->content; the FM structure itself is then freed.  */
void
read_file_free (struct file_memory *fm)
{
#ifdef HAVE_MMAP
  if (fm->mmap_p)
    munmap (fm->content, fm->length);
  else
    free (fm->content);
#else
  free (fm->content);
#endif
  free (fm);
}
/* Free the pointers in a NULL-terminated vector of pointers, then
free the pointer itself. */
void
@ -801,97 +927,42 @@ merge_vecs (char **v1, char **v2)
return v1;
}
/* A set of simple-minded routines to store and search for strings in
a linked list. You may add a string to the slist, and peek whether
it's still in there at any time later. */
/* A set of simple-minded routines to store strings in a linked list.
This used to also be used for searching, but now we have hash
tables for that. */
/* Add an element to the list. If flags is NOSORT, the list will not
be sorted. */
/* Append an element to the list. */
slist *
add_slist (slist *l, const char *s, int flags)
slist_append (slist *l, const char *s)
{
slist *t, *old, *beg;
int cmp;
slist *newel = (slist *)xmalloc (sizeof (slist));
slist *beg = l;
if (flags & NOSORT)
{
if (!l)
{
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
t->next = NULL;
return t;
}
beg = l;
/* Find the last element. */
while (l->next)
l = l->next;
t = (slist *)xmalloc (sizeof (slist));
l->next = t;
t->string = xstrdup (s);
t->next = NULL;
return beg;
}
/* Empty list or changing the first element. */
if (!l || (cmp = strcmp (l->string, s)) > 0)
{
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
t->next = l;
return t;
}
newel->string = xstrdup (s);
newel->next = NULL;
beg = l;
if (cmp == 0)
return beg;
/* Second two one-before-the-last element. */
if (!l)
return newel;
/* Find the last element. */
while (l->next)
{
old = l;
l = l->next;
cmp = strcmp (s, l->string);
if (cmp == 0) /* no repeating in the list */
return beg;
else if (cmp > 0)
continue;
/* If the next list element is greater than s, put s between the
current and the next list element. */
t = (slist *)xmalloc (sizeof (slist));
old->next = t;
t->next = l;
t->string = xstrdup (s);
return beg;
}
t = (slist *)xmalloc (sizeof (slist));
t->string = xstrdup (s);
/* Insert the new element after the last element. */
l->next = t;
t->next = NULL;
l = l->next;
l->next = newel;
return beg;
}
/* Is there a specific entry in the list? */
int
in_slist (slist *l, const char *s)
slist_contains (slist *l, const char *s)
{
int cmp;
while (l)
{
cmp = strcmp (l->string, s);
if (cmp == 0)
return 1;
else if (cmp > 0) /* the list is ordered! */
return 0;
l = l->next;
}
for (; l; l = l->next)
if (!strcmp (l->string, s))
return 1;
return 0;
}
/* Free the whole slist. */
void
free_slist (slist *l)
slist_free (slist *l)
{
slist *n;
@ -903,6 +974,58 @@ free_slist (slist *l)
l = n;
}
}
/* Sometimes it's useful to create "sets" of strings, i.e. special
hash tables where you want to store strings as keys and merely
query for their existence. Here is a set of utility routines that
makes that transparent. */
/* Add string S to the string set HT.  The key stored in the table is
   a heap copy made with xstrdup(), so the caller keeps ownership of
   S; the copy is released by string_set_free(). */
void
string_set_add (struct hash_table *ht, const char *s)
{
  /* We use "1" as value.  It provides us a useful and clear arbitrary
     value, and it consumes no memory -- the pointers to the same
     string "1" will be shared by all the key-value pairs in the hash
     table.  Being a string literal, it must never be free()d. */
  hash_table_put (ht, xstrdup (s), "1");
}
/* Return non-zero if string S is a member of the string set HT,
   zero otherwise.  Thin wrapper over hash_table_exists(). */
int
string_set_exists (struct hash_table *ht, const char *s)
{
  return hash_table_exists (ht, s);
}
/* hash_table_map() callback for string_set_free(): release the
   xstrdup()-ed key.  The value (the shared literal "1") is not
   freed.  Returns 0 -- presumably to tell hash_table_map() to keep
   iterating; confirm against the hash_table_map() contract. */
static int
string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
{
  free (key);
  return 0;
}
/* Release a string set HT: free every key (the heap-allocated string
   copies), then destroy the table itself. */
void
string_set_free (struct hash_table *ht)
{
  hash_table_map (ht, string_set_free_mapper, NULL);
  hash_table_destroy (ht);
}
/* hash_table_map() callback for free_keys_and_values(): free() both
   the key and the value of each pair.  Returns 0 -- presumably to
   continue iteration; confirm against the hash_table_map()
   contract. */
static int
free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
{
  free (key);
  free (value);
  return 0;
}
/* Another utility function: call free() on all keys and values of HT.
   Note that unlike string_set_free(), this does *not* destroy the
   table itself -- the caller is responsible for that. */
void
free_keys_and_values (struct hash_table *ht)
{
  hash_table_map (ht, free_keys_and_values_mapper, NULL);
}
/* Engine for legible and legible_long_long; this function works on
strings. */

View File

@ -20,11 +20,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifndef UTILS_H
#define UTILS_H
/* Flags for slist. */
enum {
NOSORT = 1
};
enum accd {
ALLABS = 1
};
@ -36,6 +31,14 @@ typedef struct _slist
struct _slist *next;
} slist;
struct hash_table;
struct file_memory {
char *content;
long length;
int mmap_p;
};
char *time_str PARAMS ((time_t *));
const char *uerrmsg PARAMS ((uerr_t));
@ -58,13 +61,19 @@ int accdir PARAMS ((const char *s, enum accd));
char *suffix PARAMS ((const char *s));
char *read_whole_line PARAMS ((FILE *));
void load_file PARAMS ((FILE *, char **, long *));
struct file_memory *read_file PARAMS ((const char *));
void read_file_free PARAMS ((struct file_memory *));
void free_vec PARAMS ((char **));
char **merge_vecs PARAMS ((char **, char **));
slist *add_slist PARAMS ((slist *, const char *, int));
int in_slist PARAMS ((slist *, const char *));
void free_slist PARAMS ((slist *));
slist *slist_append PARAMS ((slist *, const char *));
int slist_contains PARAMS ((slist *, const char *));
void slist_free PARAMS ((slist *));
void string_set_add PARAMS ((struct hash_table *, const char *));
int string_set_exists PARAMS ((struct hash_table *, const char *));
void string_set_free PARAMS ((struct hash_table *));
void free_keys_and_values PARAMS ((struct hash_table *));
char *legible PARAMS ((long));
char *legible_very_long PARAMS ((VERY_LONG_TYPE));

View File

@ -71,7 +71,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* Print X if debugging is enabled; a no-op otherwise. */
#ifdef DEBUG
# define DEBUGP(x) do { debug_logprintf x; } while (0)
# define DEBUGP(x) do { if (opt.debug) { debug_logprintf x; } } while (0)
#else /* not DEBUG */
# define DEBUGP(x) DO_NOTHING
#endif /* not DEBUG */