wget/src/hash.c
hniksic b0b1c815c1 [svn] A bunch of new features:
- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.
2000-11-19 12:50:10 -08:00

404 lines
11 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Hash tables.
Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "wget.h"
#include "utils.h"
#include "hash.h"
#ifdef STANDALONE
# define xmalloc malloc
# define xrealloc realloc
#endif
/* This file implements simple hash tables based on linear probing.
The hash table stores key-value pairs in a contiguous array. Both
key and value are void pointers that the hash and test functions
know how to handle.
Although Knuth & co. recommend double hashing over linear probing,
we use the latter because it accesses array elements sequentially
in case of a collision, yielding better cache behaviour and
ultimately better speed. To avoid collision problems with
linear probing, we make sure that the table grows as soon as the
fullness/size ratio exceeds 75%. */
/* One slot of the table's array.  The key field doubles as the slot
   state: NULL marks an empty slot and ENTRY_DELETED marks a slot whose
   entry was removed; any other pointer is a live key.  */
struct ht_pair {
void *key; /* key pointer, or one of the two markers described above */
void *value; /* value stored under the key; NULL is a legal value */
};
/* The hash table proper.  FULLNESS counts every slot that is not
   empty, including slots marked as deleted, while COUNT counts only
   the slots that currently hold a live entry.  */
struct hash_table {
unsigned long (*hash_function) (const void *); /* maps a key to its hash value */
int (*test_function) (const void *, const void *); /* returns non-zero when two keys match */
int size; /* number of slots in the PAIRS array */
int fullness; /* number of non-empty slots (live or deleted) */
int count; /* number of non-empty, non-deleted
slots, i.e. live entries */
struct ht_pair *pairs; /* the slot array itself */
};
/* Marker stored in a slot's key field once its entry has been removed.
   It is an arbitrary non-NULL address, so a real key that happened to
   equal it would be mistaken for a deleted slot.  */
#define ENTRY_DELETED ((void *)0xdeadbeef)
/* Non-zero if PTR is the deleted-slot marker.  */
#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
/* Non-zero if PTR is NULL, which is how an empty slot is represented.  */
#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL)
/* Return the smallest prime from a built-in table that is greater than
   or equal to SIZE.  A negative SIZE is treated as 0, so the result is
   never smaller than the first table entry (19).  Table entries that
   do not fit in an int are never returned; if no usable entry is large
   enough, SIZE itself is returned unchanged.  */
int
prime_size (int size)
{
  static const unsigned long primes[] = {
    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
    1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
  };
  const size_t nprimes = sizeof primes / sizeof primes[0];
  unsigned long wanted;
  size_t i;

  /* Comparing a negative int with an unsigned long would convert it to
     a huge value and skip the whole table, so clamp it first.  */
  wanted = size < 0 ? 0UL : (unsigned long) size;

  for (i = 0; i < nprimes; i++)
    {
      /* The table is ascending: once an entry no longer fits in an int,
         none of the following ones do either.  */
      if (primes[i] > (unsigned long) INT_MAX)
        break;
      if (primes[i] >= wanted)
        return (int) primes[i];
    }

  /* SIZE is larger than every usable prime; hand it back as is.  */
  return size;
}
/* Allocate and return an empty hash table that hashes keys with
   HASH_FUNCTION and compares them with TEST_FUNCTION (non-zero result
   means the keys match).  The slot array gets prime_size (INITIAL_SIZE)
   entries; pass 0 as INITIAL_SIZE to start with the smallest table and
   let it be regrown as needed.  */
struct hash_table *
hash_table_new (int initial_size,
                unsigned long (*hash_function) (const void *),
                int (*test_function) (const void *, const void *))
{
  struct hash_table *table = xmalloc (sizeof *table);
  size_t nbytes;

  table->hash_function = hash_function;
  table->test_function = test_function;
  table->count = 0;
  table->fullness = 0;
  table->size = prime_size (initial_size);

  /* Every slot starts out empty, i.e. all zeroes.  */
  nbytes = table->size * sizeof (struct ht_pair);
  table->pairs = xmalloc (nbytes);
  memset (table->pairs, '\0', nbytes);

  return table;
}
/* Release the slot array of HT and then HT itself.  The keys and
   values the slots point to are not freed here.  */
void
hash_table_destroy (struct hash_table *ht)
{
  struct ht_pair *slots = ht->pairs;

  free (slots);
  free (ht);
}
/* Look KEY up in HT and return the value stored under it, or NULL when
   KEY is not present.  Because NULL may also be stored as a value, a
   NULL result is ambiguous; use hash_table_exists to tell a stored
   NULL apart from a missing key.  */
void *
hash_table_get (struct hash_table *ht, const void *key)
{
  int slot = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *entry = &ht->pairs[slot];

      /* An empty slot ends the probe sequence: KEY is absent.  */
      if (EMPTY_ENTRY_P (entry->key))
        return NULL;

      /* Deleted slots are skipped without consulting test_function.  */
      if (!DELETED_ENTRY_P (entry->key)
          && ht->test_function (key, entry->key))
        return entry->value;

      /* Step to the next slot, wrapping around at the end.  */
      if (++slot == ht->size)
        slot = 0;
    }
}
/* Report whether HT holds an entry for KEY: 1 if it does, 0 if it does
   not.  Unlike hash_table_get, the answer does not depend on the
   stored value, so entries whose value is NULL are detected too.  */
int
hash_table_exists (struct hash_table *ht, const void *key)
{
  int slot = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *entry = &ht->pairs[slot];

      /* An empty slot ends the probe sequence: KEY is absent.  */
      if (EMPTY_ENTRY_P (entry->key))
        return 0;

      /* Deleted slots are skipped without consulting test_function.  */
      if (!DELETED_ENTRY_P (entry->key)
          && ht->test_function (key, entry->key))
        return 1;

      /* Step to the next slot, wrapping around at the end.  */
      if (++slot == ht->size)
        slot = 0;
    }
}
/* Evaluate to the larger of I and J.  One of the arguments is
   evaluated twice, so do not pass expressions with side effects.  */
#define MAX(i, j) (((i) >= (j)) ? (i) : (j))
/* Grow hash table HT as necessary, and rehash all the key-value
pairs. */
static void
grow_hash_table (struct hash_table *ht)
{
int i;
struct ht_pair *old_pairs = ht->pairs;
int old_count = ht->count; /* for assert() below */
int old_size = ht->size;
/* Normally, the idea is to double ht->size (and round it to next
prime) on each regrow:
ht->size = prime_size (ht->size * 2);
But it is possible that the table has large fullness because of
the many deleted entries. If that is the case, we don't want to
blindly grow the table; we just want to rehash it. For that
reason, we use ht->count as the relevant parameter. MAX is used
only because we don't want to actually shrink the table. (But
maybe that's wrong.) */
int needed_size = prime_size (ht->count * 2);
ht->size = MAX (old_size, needed_size);
ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
/* Need to reset these two; hash_table_put will reinitialize them. */
ht->fullness = 0;
ht->count = 0;
for (i = 0; i < old_size; i++)
{
struct ht_pair *the_pair = old_pairs + i;
if (!EMPTY_ENTRY_P (the_pair->key)
&& !DELETED_ENTRY_P (the_pair->key))
hash_table_put (ht, the_pair->key, the_pair->value);
}
assert (ht->count == old_count);
free (old_pairs);
}
/* Store VALUE in the hash table HT under the key KEY.

   If an entry matching KEY (according to HT's test function) already
   exists, both its key pointer and its value are overwritten.
   Otherwise a new entry is created, reusing the first deleted slot met
   along the probe sequence when there is one.  The table is regrown
   afterwards if more than 75% of its slots are non-empty.

   A deleted slot must not end the search: KEY may still be stored
   further along the probe sequence, and inserting into the deleted
   slot right away would leave two entries for the same key.  */
void
hash_table_put (struct hash_table *ht, const void *key, void *value)
{
  int location = ht->hash_function (key) % ht->size;
  struct ht_pair *reusable = NULL;	/* first deleted slot seen, if any */
  struct ht_pair *target;

  for (;;)
    {
      struct ht_pair *the_pair = ht->pairs + location;

      if (EMPTY_ENTRY_P (the_pair->key))
        {
          /* End of the probe sequence: KEY is not in the table.  */
          if (reusable != NULL)
            {
              /* We're replacing a deleted entry, so ht->count gets
                 increased, but ht->fullness remains unchanged.  */
              target = reusable;
              ++ht->count;
            }
          else
            {
              target = the_pair;
              ++ht->fullness;
              ++ht->count;
            }
          break;
        }

      if (DELETED_ENTRY_P (the_pair->key))
        {
          /* Remember the slot, but keep looking for KEY itself.  */
          if (reusable == NULL)
            reusable = the_pair;
        }
      else if (ht->test_function (key, the_pair->key))
        {
          /* We're replacing an existing entry, so ht->count and
             ht->fullness remain unchanged.  */
          target = the_pair;
          break;
        }

      /* Step to the next slot, wrapping around at the end.  */
      if (++location == ht->size)
        location = 0;
    }

  /* The slots store plain void pointers, hence the cast.  */
  target->key = (void *) key;
  target->value = value;

  if (ht->fullness * 4 > ht->size * 3)
    /* When fullness exceeds 75% of size, regrow the table.  */
    grow_hash_table (ht);
}
/* Remove the entry for KEY from HT.  Return 1 if an entry was found
   and removed, 0 if KEY was not in the table.

   The slot is not actually emptied; it is only marked as deleted.
   Emptying it could cut the probe sequence of other entries that
   collided with this one and were stored in later slots: if keys A, B
   and C share a hash value and B's slot were emptied, a search for C
   would stop at that slot and fail.  (A possible optimization, not
   done here, is to really empty the slot when the slot that follows it
   in probe order is empty.)  */
int
hash_table_remove (struct hash_table *ht, const void *key)
{
  int slot = ht->hash_function (key) % ht->size;

  for (;;)
    {
      struct ht_pair *entry = &ht->pairs[slot];

      /* An empty slot ends the probe sequence: nothing to remove.  */
      if (EMPTY_ENTRY_P (entry->key))
        return 0;

      if (!DELETED_ENTRY_P (entry->key)
          && ht->test_function (key, entry->key))
        {
          /* The slot stays non-empty, so only the live count drops;
             fullness is left alone.  */
          entry->key = ENTRY_DELETED;
          --ht->count;
          return 1;
        }

      /* Step to the next slot, wrapping around at the end.  */
      if (++slot == ht->size)
        slot = 0;
    }
}
/* Empty HT: reset both counters and zero every slot, which also wipes
   the deleted-slot markers.  The slot array keeps its current size,
   and the keys and values it pointed to are not freed.  */
void
hash_table_clear (struct hash_table *ht)
{
  const size_t nbytes = ht->size * sizeof (struct ht_pair);

  ht->count = 0;
  ht->fullness = 0;
  memset (ht->pairs, '\0', nbytes);
}
/* Call MAPFUN (key, value, CLOSURE) for every live entry of HT, going
   through the slots in array order.  Empty and deleted slots are
   skipped.  The walk stops early as soon as MAPFUN returns non-zero.  */
void
hash_table_map (struct hash_table *ht,
                int (*mapfun) (void *, void *, void *),
                void *closure)
{
  int slot;

  for (slot = 0; slot < ht->size; slot++)
    {
      struct ht_pair *entry = &ht->pairs[slot];

      if (EMPTY_ENTRY_P (entry->key) || DELETED_ENTRY_P (entry->key))
        continue;
      if (mapfun (entry->key, entry->value, closure))
        return;
    }
}
/* Support for hash tables whose keys are strings. */
/* Hash function for string keys: SV must point to a NUL-terminated
   string.  Each byte is shifted into the hash four bits at a time, and
   whenever the top four bits of a 32-bit value become non-zero they
   are folded back into the low part and cleared.  Supposedly taken
   from the Dragon Book, p. 436.  */
unsigned long
string_hash (const void *sv)
{
  const unsigned char *p;
  unsigned int hash = 0;

  for (p = (const unsigned char *) sv; *p != '\0'; p++)
    {
      unsigned int top;

      hash = (hash << 4) + *p;
      top = hash & 0xf0000000;
      if (top != 0)
        hash = (hash ^ (top >> 24)) ^ top;
    }

  return hash;
}
/* Test function for string keys: return 1 if the NUL-terminated
   strings S1 and S2 are equal, 0 otherwise.  Note that this is the
   opposite of strcmp's convention.  */
int
string_cmp (const void *s1, const void *s2)
{
  const char *lhs = (const char *) s1;
  const char *rhs = (const char *) s2;

  return strcmp (lhs, rhs) == 0;
}
/* Create a hash table whose keys are NUL-terminated strings, hashed
   with string_hash and compared with string_cmp.  INITIAL_SIZE is
   passed on to hash_table_new unchanged.  */
struct hash_table *
make_string_hash_table (int initial_size)
{
  struct hash_table *table
    = hash_table_new (initial_size, string_hash, string_cmp);

  return table;
}
#ifdef STANDALONE
#include <stdio.h>
#include <string.h>
/* Callback for hash_table_map: print one "KEY: VALUE" line, treating
   both as strings, and increment the int that COUNT points to.  Always
   returns 0 so that the walk over the table continues.

   The parameter types match the callback type hash_table_map expects,
   int (*) (void *, void *, void *); declaring KEY as const void * would
   make the function pointer incompatible with that type.  */
int
print_hash_table_mapper (void *key, void *value, void *count)
{
  int *counter = (int *) count;

  ++*counter;
  printf ("%s: %s\n", (const char *) key, (char *) value);
  return 0;
}
/* Print every live entry of SHT through print_hash_table_mapper and
   assert that the number of entries printed equals SHT's own count.  */
void
print_hash (struct hash_table *sht)
{
  int printed = 0;

  hash_table_map (sht, print_hash_table_mapper, &printed);
  assert (sht->count == printed);
}
/* Standalone test driver.  Reads lines from stdin and stores a copy of
   each non-empty line as a key; lines of odd length are removed again
   right away.  Finally prints what is left in the table.  Returns 0 on
   success, 1 if a key could not be duplicated.  */
int
main (void)
{
  struct hash_table *ht = make_string_hash_table (0);
  char line[80];

  while (fgets (line, sizeof (line), stdin))
    {
      size_t len;
      char *key;

      /* fgets keeps the newline only when the whole line fit into the
         buffer, so cut the line at the newline if there is one instead
         of chopping off the last character unconditionally.  */
      line[strcspn (line, "\n")] = '\0';
      len = strlen (line);
      if (len == 0)
        continue;

      key = strdup (line);
      if (key == NULL)
        {
          fprintf (stderr, "out of memory\n");
          return 1;
        }

      hash_table_put (ht, key, "here I am!");
      if (len % 2)
        hash_table_remove (ht, line);
    }

  print_hash (ht);
#if 0
  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
#endif
  return 0;
}
#endif