mirror of
https://github.com/mirror/wget.git
synced 2025-01-10 20:30:09 +08:00
b0b1c815c1
- use mmap() to read whole files in core instead of allocating memory and read'ing it. - use a new, more general, HTML parser (html-parse.c) and interface to it from Wget (html-url.c). - respect <meta name=robots content=nofollow> (easy with the new HTML parser). - use hash tables instead of linked lists in places where the lists were used to facilitate mappings. - rewrite the code in host.c to be more readable and faster (hash tables instead of home-grown lists.) - make convert_links properly convert partial URLs to complete ones for those URLs that have *not* been downloaded. - use HTTP persistent connections where available. very simple-minded, caches the last connection to the server. Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
404 lines
11 KiB
C
404 lines
11 KiB
C
/* Hash tables.
|
||
Copyright (C) 2000 Free Software Foundation, Inc.
|
||
|
||
This file is part of Wget.
|
||
|
||
This program is free software; you can redistribute it and/or modify
|
||
it under the terms of the GNU General Public License as published by
|
||
the Free Software Foundation; either version 2 of the License, or
|
||
(at your option) any later version.
|
||
|
||
This program is distributed in the hope that it will be useful,
|
||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
GNU General Public License for more details.
|
||
|
||
You should have received a copy of the GNU General Public License
|
||
along with this program; if not, write to the Free Software
|
||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
||
|
||
#ifdef HAVE_CONFIG_H
|
||
# include <config.h>
|
||
#endif
|
||
|
||
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
|
||
|
||
#include "wget.h"
|
||
#include "utils.h"
|
||
|
||
#include "hash.h"
|
||
|
||
#ifdef STANDALONE
|
||
# define xmalloc malloc
|
||
# define xrealloc realloc
|
||
#endif
|
||
|
||
/* This file implements simple hash tables based on linear probing.
   The hash table stores key-value pairs in a contiguous array.  Both
   key and value are void pointers that the hash and test functions
   know how to handle.

   Although Knuth & co. recommend double hashing over linear probing,
   we use the latter because it accesses array elements sequentially
   in case of a collision, yielding better cache behaviour and
   ultimately better speed.  To avoid collision problems with
   linear probing, we make sure that the table grows as soon as the
   fullness/size ratio exceeds 75%.  */
|
||
|
||
/* One slot of the table.  KEY is NULL while the slot has never been
   used and ENTRY_DELETED once its pair has been removed; otherwise
   KEY and VALUE are the pointers that were handed to hash_table_put.  */
struct ht_pair
{
  void *key;
  void *value;
};
|
||
|
||
/* The table proper: the two callbacks that give keys their meaning,
   the bookkeeping counters, and the slot array itself.  */
struct hash_table
{
  /* Maps a key to its hash number.  */
  unsigned long (*hash_function) (const void *);
  /* Returns non-zero when the two keys are considered equal.  */
  int (*test_function) (const void *, const void *);

  int size;                     /* number of slots in PAIRS */
  int fullness;                 /* slots that are not empty, i.e. live
                                   or marked as deleted */
  int count;                    /* slots that hold a live pair */

  struct ht_pair *pairs;        /* the slot array */
};
|
||
|
||
/* Sentinel stored in the key field of a slot whose pair was removed.  */
#define ENTRY_DELETED ((void *) 0xdeadbeef)

/* Slot-state predicates; P is the key field of a slot.  */
#define DELETED_ENTRY_P(p) (ENTRY_DELETED == (p))
#define EMPTY_ENTRY_P(p) (NULL == (p))
|
||
|
||
/* Find a prime near, but greater than or equal to, SIZE.

   A SIZE of zero or less yields the smallest prime in the table.  If
   no tabulated prime that is representable as an int is large enough,
   SIZE itself is returned unchanged.  */

int
prime_size (int size)
{
  static const unsigned long primes[] = {
    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
    1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
  };
  const size_t nprimes = sizeof primes / sizeof primes[0];
  size_t i;

  for (i = 0; i < nprimes; i++)
    {
      /* The table is sorted, so once an entry no longer fits in the
         int return type none of the following ones will either.  */
      if (primes[i] > (unsigned long) INT_MAX)
        break;
      /* Test the sign first: converting a negative SIZE to unsigned
         would turn it into a huge value that matches no prime.  */
      if (size <= 0 || primes[i] >= (unsigned long) size)
        return (int) primes[i];
    }

  /* Nothing suitable in the table; fall back to the request itself.  */
  return size;
}
|
||
|
||
/* Create a hash table of INITIAL_SIZE with hash function
|
||
HASH_FUNCTION and test function TEST_FUNCTION. If you wish to
|
||
start out with a "small" table which will be regrown as needed,
|
||
specify 0 as INITIAL_SIZE. */
|
||
|
||
struct hash_table *
|
||
hash_table_new (int initial_size,
|
||
unsigned long (*hash_function) (const void *),
|
||
int (*test_function) (const void *, const void *))
|
||
{
|
||
struct hash_table *ht
|
||
= (struct hash_table *)xmalloc (sizeof (struct hash_table));
|
||
ht->hash_function = hash_function;
|
||
ht->test_function = test_function;
|
||
ht->size = prime_size (initial_size);
|
||
ht->fullness = 0;
|
||
ht->count = 0;
|
||
ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
|
||
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
|
||
return ht;
|
||
}
|
||
|
||
/* Free the data associated with hash table HT. */
|
||
|
||
void
|
||
hash_table_destroy (struct hash_table *ht)
|
||
{
|
||
free (ht->pairs);
|
||
free (ht);
|
||
}
|
||
|
||
/* Get the value that corresponds to the key KEY in the hash table HT.
|
||
If no value is found, return NULL. Note that NULL is a legal value
|
||
for value; if you are storing NULLs in your hash table, you can use
|
||
hash_table_exists to be sure that a (possibly NULL) value exists in
|
||
the table. */
|
||
|
||
void *
|
||
hash_table_get (struct hash_table *ht, const void *key)
|
||
{
|
||
int location = ht->hash_function (key) % ht->size;
|
||
while (1)
|
||
{
|
||
struct ht_pair *the_pair = ht->pairs + location;
|
||
if (EMPTY_ENTRY_P (the_pair->key))
|
||
return NULL;
|
||
else if (DELETED_ENTRY_P (the_pair->key)
|
||
|| !ht->test_function (key, the_pair->key))
|
||
{
|
||
++location;
|
||
if (location == ht->size)
|
||
location = 0;
|
||
}
|
||
else
|
||
return the_pair->value;
|
||
}
|
||
}
|
||
|
||
/* Return 1 if KEY exists in HT, 0 otherwise. */
|
||
|
||
int
|
||
hash_table_exists (struct hash_table *ht, const void *key)
|
||
{
|
||
int location = ht->hash_function (key) % ht->size;
|
||
while (1)
|
||
{
|
||
struct ht_pair *the_pair = ht->pairs + location;
|
||
if (EMPTY_ENTRY_P (the_pair->key))
|
||
return 0;
|
||
else if (DELETED_ENTRY_P (the_pair->key)
|
||
|| !ht->test_function (key, the_pair->key))
|
||
{
|
||
++location;
|
||
if (location == ht->size)
|
||
location = 0;
|
||
}
|
||
else
|
||
return 1;
|
||
}
|
||
}
|
||
|
||
/* The larger of A and B.  Each argument may be evaluated twice.  */
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
|
||
|
||
/* Grow hash table HT as necessary, and rehash all the key-value
|
||
pairs. */
|
||
|
||
static void
|
||
grow_hash_table (struct hash_table *ht)
|
||
{
|
||
int i;
|
||
struct ht_pair *old_pairs = ht->pairs;
|
||
int old_count = ht->count; /* for assert() below */
|
||
int old_size = ht->size;
|
||
|
||
/* Normally, the idea is to double ht->size (and round it to next
|
||
prime) on each regrow:
|
||
|
||
ht->size = prime_size (ht->size * 2);
|
||
|
||
But it is possible that the table has large fullness because of
|
||
the many deleted entries. If that is the case, we don't want to
|
||
blindly grow the table; we just want to rehash it. For that
|
||
reason, we use ht->count as the relevant parameter. MAX is used
|
||
only because we don't want to actually shrink the table. (But
|
||
maybe that's wrong.) */
|
||
|
||
int needed_size = prime_size (ht->count * 2);
|
||
ht->size = MAX (old_size, needed_size);
|
||
|
||
ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
|
||
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
|
||
|
||
/* Need to reset these two; hash_table_put will reinitialize them. */
|
||
ht->fullness = 0;
|
||
ht->count = 0;
|
||
for (i = 0; i < old_size; i++)
|
||
{
|
||
struct ht_pair *the_pair = old_pairs + i;
|
||
if (!EMPTY_ENTRY_P (the_pair->key)
|
||
&& !DELETED_ENTRY_P (the_pair->key))
|
||
hash_table_put (ht, the_pair->key, the_pair->value);
|
||
}
|
||
assert (ht->count == old_count);
|
||
free (old_pairs);
|
||
}
|
||
|
||
/* Put VALUE in the hash table HT under the key KEY. This regrows the
|
||
table if necessary. */
|
||
|
||
void
|
||
hash_table_put (struct hash_table *ht, const void *key, void *value)
|
||
{
|
||
int location = ht->hash_function (key) % ht->size;
|
||
while (1)
|
||
{
|
||
struct ht_pair *the_pair = ht->pairs + location;
|
||
if (EMPTY_ENTRY_P (the_pair->key))
|
||
{
|
||
++ht->fullness;
|
||
++ht->count;
|
||
just_insert:
|
||
the_pair->key = (void *)key; /* const? */
|
||
the_pair->value = value;
|
||
break;
|
||
}
|
||
else if (DELETED_ENTRY_P (the_pair->key))
|
||
{
|
||
/* We're replacing a deleteed entry, so ht->count gets
|
||
increased, but ht->fullness remains unchanged. */
|
||
++ht->count;
|
||
goto just_insert;
|
||
}
|
||
else if (ht->test_function (key, the_pair->key))
|
||
{
|
||
/* We're replacing an existing entry, so ht->count and
|
||
ht->fullness remain unchanged. */
|
||
goto just_insert;
|
||
}
|
||
else
|
||
{
|
||
++location;
|
||
if (location == ht->size)
|
||
location = 0;
|
||
}
|
||
}
|
||
if (ht->fullness * 4 > ht->size * 3)
|
||
/* When fullness exceeds 75% of size, regrow the table. */
|
||
grow_hash_table (ht);
|
||
}
|
||
|
||
/* Remove KEY from HT. */
|
||
|
||
int
|
||
hash_table_remove (struct hash_table *ht, const void *key)
|
||
{
|
||
int location = ht->hash_function (key) % ht->size;
|
||
while (1)
|
||
{
|
||
struct ht_pair *the_pair = ht->pairs + location;
|
||
if (EMPTY_ENTRY_P (the_pair->key))
|
||
return 0;
|
||
else if (DELETED_ENTRY_P (the_pair->key)
|
||
|| !ht->test_function (key, the_pair->key))
|
||
{
|
||
++location;
|
||
if (location == ht->size)
|
||
location = 0;
|
||
}
|
||
else
|
||
{
|
||
/* We don't really remove an entry from the hash table: we
|
||
just mark it as deleted. This is because there may be
|
||
other entries located after this entry whose hash number
|
||
points to a location before this entry. (Example: keys
|
||
A, B and C have the same hash. If you were to really
|
||
*delete* B from the table, C could no longer be found.)
|
||
|
||
As an optimization, it might be worthwhile to check
|
||
whether the immediately preceding entry is empty and, if
|
||
so, really delete the pair (set it to empty and decrease
|
||
the fullness along with the count). I *think* it should
|
||
be safe. */
|
||
the_pair->key = ENTRY_DELETED;
|
||
--ht->count;
|
||
return 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
void
|
||
hash_table_clear (struct hash_table *ht)
|
||
{
|
||
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
|
||
ht->fullness = 0;
|
||
ht->count = 0;
|
||
}
|
||
|
||
void
|
||
hash_table_map (struct hash_table *ht,
|
||
int (*mapfun) (void *, void *, void *),
|
||
void *closure)
|
||
{
|
||
int i;
|
||
for (i = 0; i < ht->size; i++)
|
||
{
|
||
struct ht_pair *the_pair = ht->pairs + i;
|
||
if (!EMPTY_ENTRY_P (the_pair->key)
|
||
&& !DELETED_ENTRY_P (the_pair->key))
|
||
if (mapfun (the_pair->key, the_pair->value, closure))
|
||
return;
|
||
}
|
||
}
|
||
|
||
/* Support for hash tables whose keys are strings.  */

/* Hash the NUL-terminated string SV.  Supposedly from the Dragon
   Book, p. 436: each byte is shifted into the hash four bits at a
   time, and whenever the top four bits become set they are folded
   back into the low part and cleared.  */
unsigned long
string_hash (const void *sv)
{
  const unsigned char *p;
  unsigned int hash = 0;

  for (p = (const unsigned char *) sv; *p != '\0'; p++)
    {
      unsigned int top;

      hash = (hash << 4) + *p;
      top = hash & 0xf0000000;
      if (top != 0)
        hash = (hash ^ (top >> 24)) ^ top;
    }

  return hash;
}
|
||
|
||
/* Key test function for string keys: return 1 when the strings S1 and
   S2 have the same contents, 0 otherwise.  */
int
string_cmp (const void *s1, const void *s2)
{
  const char *left = (const char *) s1;
  const char *right = (const char *) s2;

  return strcmp (left, right) == 0;
}
|
||
|
||
struct hash_table *
|
||
make_string_hash_table (int initial_size)
|
||
{
|
||
return hash_table_new (initial_size, string_hash, string_cmp);
|
||
}
|
||
|
||
|
||
#ifdef STANDALONE
|
||
|
||
#include <stdio.h>
|
||
#include <string.h>
|
||
|
||
/* Mapper used by print_hash: print the pair KEY/VALUE, both taken to
   be strings, as "key: value" and bump the int counter COUNT points
   to.  Always returns 0.  */
int
print_hash_table_mapper (const void *key, void *value, void *count)
{
  int *counter = (int *) count;
  const char *name = (const char *) key;
  char *text = (char *) value;

  *counter += 1;
  printf ("%s: %s\n", name, text);
  return 0;
}
|
||
|
||
void
|
||
print_hash (struct hash_table *sht)
|
||
{
|
||
int debug_count = 0;
|
||
hash_table_map (sht, print_hash_table_mapper, &debug_count);
|
||
assert (debug_count == sht->count);
|
||
}
|
||
|
||
/* Test driver: read lines from standard input, store each non-empty
   line as a key, immediately remove again the keys whose length is
   odd, and finally print what is left in the table.  */
int
main (void)
{
  struct hash_table *ht = make_string_hash_table (0);
  char line[80];
  while ((fgets (line, sizeof (line), stdin)))
    {
      char *key;
      int len;

      /* fgets leaves the newline in place only when the whole line
         fit in the buffer, so strip it only if it is really there
         instead of blindly chopping the last character.  */
      line[strcspn (line, "\n")] = '\0';
      len = strlen (line);
      if (len == 0)
        continue;

      key = strdup (line);
      if (!key)
        {
          fprintf (stderr, "out of memory\n");
          return 1;
        }
      hash_table_put (ht, key, "here I am!");

      /* The pair just stored carries KEY as its key pointer, so once
         it has been removed nothing refers to KEY any more.  */
      if (len % 2 && hash_table_remove (ht, line))
        free (key);
    }
  print_hash (ht);
#if 0
  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
#endif
  return 0;
}
|
||
#endif
|