[project @ 2005-01-11 16:04:08 by simonmar]
[packages/old-time.git] / Data / HashTable.hs
1 {-# OPTIONS_GHC -fno-implicit-prelude #-}
2
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : Data.HashTable
6 -- Copyright : (c) The University of Glasgow 2003
7 -- License : BSD-style (see the file libraries/base/LICENSE)
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : provisional
11 -- Portability : portable
12 --
13 -- An implementation of extensible hash tables, as described in
14 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
15 -- pp. 446--457. The implementation is also derived from the one
16 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
17 --
18 -----------------------------------------------------------------------------
19
20 module Data.HashTable (
21 -- * Basic hash table operations
22 HashTable, new, insert, delete, lookup, update,
23 -- * Converting to and from lists
24 fromList, toList,
25 -- * Hash functions
26 -- $hash_functions
27 hashInt, hashString,
28 prime,
29 -- * Diagnostics
30 longestChain
31 ) where
32
33 -- This module is imported by Data.Dynamic, which is pretty low down in the
34 -- module hierarchy, so don't import "high-level" modules
35
36 #ifdef __GLASGOW_HASKELL__
37 import GHC.Base
38 #else
39 import Prelude hiding ( lookup )
40 #endif
41 import Data.Tuple ( fst )
42 import Data.Bits
43 import Data.Maybe
44 import Data.List ( maximumBy, filter, length, concat, foldl )
45 import Data.Int ( Int32 )
46
47 #if defined(__GLASGOW_HASKELL__)
48 import GHC.Num
49 import GHC.Real ( Integral(..), fromIntegral )
50
51 import GHC.IOBase ( IO, IOArray, newIOArray, readIOArray, writeIOArray,
52 unsafeReadIOArray, unsafeWriteIOArray,
53 IORef, newIORef, readIORef, writeIORef )
54 import GHC.Err ( undefined )
55 #else
56 import Data.Char ( ord )
57 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
58 # if defined(__HUGS__)
59 import Hugs.IOArray ( IOArray, newIOArray, readIOArray, writeIOArray,
60 unsafeReadIOArray, unsafeWriteIOArray )
61 # elif defined(__NHC__)
62 import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray)
63 # endif
64 #endif
65 import Control.Monad ( when, mapM, sequence_ )
66
67
68 -----------------------------------------------------------------------
69 myReadArray :: IOArray Int32 a -> Int32 -> IO a
70 myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
71 #if defined(DEBUG) || defined(__NHC__)
72 myReadArray = readIOArray
73 myWriteArray = writeIOArray
74 #else
75 myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
76 myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
77 #endif
78
79 -- | A hash table mapping keys of type @key@ to values of type @val@.
80 --
81 -- The implementation will grow the hash table as necessary, trying to
82 -- maintain a reasonable average load per bucket in the table.
83 --
84 newtype HashTable key val = HashTable (IORef (HT key val))
85 -- TODO: the IORef should really be an MVar.
86
87 data HT key val
88 = HT {
89 split :: !Int32, -- Next bucket to split when expanding
90 max_bucket :: !Int32, -- Max bucket of smaller table
91 mask1 :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
92 mask2 :: !Int32, -- Mask for doing the mod of h_2 (larger table)
93 kcount :: !Int32, -- Number of keys
94 bcount :: !Int32, -- Number of buckets
95 dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
96 hash_fn :: key -> Int32,
97 cmp :: key -> key -> Bool
98 }
99
100 {-
101 ALTERNATIVE IMPLEMENTATION:
102
103 This works out slightly slower, because there's a tradeoff between
104 allocating a complete new HT structure each time a modification is
105 made (in the version above), and allocating new Int32s each time one
106 of them is modified, as below. Using FastMutInt instead of IORef
107 Int32 helps, but yields an implementation which has about the same
108 performance as the version above (and is more complex).
109
110 data HashTable key val
111 = HashTable {
112 split :: !(IORef Int32), -- Next bucket to split when expanding
113 max_bucket :: !(IORef Int32), -- Max bucket of smaller table
114 mask1 :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
115 mask2 :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
116 kcount :: !(IORef Int32), -- Number of keys
117 bcount :: !(IORef Int32), -- Number of buckets
118 dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
119 hash_fn :: key -> Int32,
120 cmp :: key -> key -> Bool
121 }
122 -}
123
124
125 -- -----------------------------------------------------------------------------
126 -- Sample hash functions
127
128 -- $hash_functions
129 --
130 -- This implementation of hash tables uses the low-order /n/ bits of the hash
131 -- value for a key, where /n/ varies as the hash table grows. A good hash
132 -- function therefore will give an even distribution regardless of /n/.
133 --
134 -- If your keyspace is integrals such that the low-order bits between
135 -- keys are highly variable, then you could get away with using 'id'
136 -- as the hash function.
137 --
138 -- We provide some sample hash functions for 'Int' and 'String' below.
139
140 -- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
141 -- where P is a suitable prime (currently 1500007). Should give
142 -- reasonable results for most distributions of 'Int' values, except
143 -- when the keys are all multiples of the prime!
144 --
145 hashInt :: Int -> Int32
146 hashInt = (`rem` prime) . fromIntegral
147
148 -- | A sample hash function for 'String's. The implementation is:
149 --
150 -- > hashString = fromIntegral . foldr f 0
151 -- > where f c m = ord c + (m * 128) `rem` 1500007
152 --
153 -- which seems to give reasonable results.
154 --
155 hashString :: String -> Int32
156 hashString = fromIntegral . foldl f 0
157 where f m c = ord c + (m * 128) `rem` fromIntegral prime
158
159 -- | A prime larger than the maximum hash table size
160 prime :: Int32
161 prime = 1500007
162
163 -- -----------------------------------------------------------------------------
164 -- Parameters
165
166 sEGMENT_SIZE = 1024 :: Int32 -- Size of a single hash table segment
167 sEGMENT_SHIFT = 10 :: Int -- derived
168 sEGMENT_MASK = 0x3ff :: Int32 -- derived
169
170 dIR_SIZE = 1024 :: Int32 -- Size of the segment directory
171 -- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
172
173 hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
174
175 -- -----------------------------------------------------------------------------
176 -- Creating a new hash table
177
178 -- | Creates a new hash table. The following property should hold for the @eq@
179 -- and @hash@ functions passed to 'new':
180 --
181 -- > eq A B => hash A == hash B
182 --
183 new
184 :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
185 -> (key -> Int32) -- ^ @hash@: A hash function on keys
186 -> IO (HashTable key val) -- ^ Returns: an empty hash table
187
188 new cmp hash_fn = do
189 -- make a new hash table with a single, empty, segment
190 dir <- newIOArray (0,dIR_SIZE-1) undefined
191 segment <- newIOArray (0,sEGMENT_SIZE-1) []
192 myWriteArray dir 0 segment
193
194 let
195 split = 0
196 max = sEGMENT_SIZE
197 mask1 = (sEGMENT_SIZE - 1)
198 mask2 = (2 * sEGMENT_SIZE - 1)
199 kcount = 0
200 bcount = sEGMENT_SIZE
201
202 ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
203 kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
204 }
205
206 table <- newIORef ht
207 return (HashTable table)
208
209 -- -----------------------------------------------------------------------------
210 -- Inserting a key\/value pair into the hash table
211
212 -- | Inserts an key\/value mapping into the hash table.
213 --
214 -- Note that 'insert' doesn't remove the old entry from the table -
215 -- the behaviour is like an association list, where 'lookup' returns
216 -- the most-recently-inserted mapping for a key in the table. The
217 -- reason for this is to keep 'insert' as efficient as possible. If
218 -- you need to update a mapping, then we provide 'update'.
219 --
220 insert :: HashTable key val -> key -> val -> IO ()
221
222 insert (HashTable ref) key val = do
223 table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
224 let table1 = table{ kcount = k+1 }
225 table2 <-
226 if (k > hLOAD * b)
227 then expandHashTable table1
228 else return table1
229 writeIORef ref table2
230 (segment_index,segment_offset) <- tableLocation table2 key
231 segment <- myReadArray dir segment_index
232 bucket <- myReadArray segment segment_offset
233 myWriteArray segment segment_offset ((key,val):bucket)
234 return ()
235
236 bucketIndex :: HT key val -> key -> IO Int32
237 bucketIndex HT{ hash_fn=hash_fn,
238 split=split,
239 mask1=mask1,
240 mask2=mask2 } key = do
241 let
242 h = fromIntegral (hash_fn key)
243 small_bucket = h .&. mask1
244 large_bucket = h .&. mask2
245 --
246 if small_bucket < split
247 then return large_bucket
248 else return small_bucket
249
250 tableLocation :: HT key val -> key -> IO (Int32,Int32)
251 tableLocation table key = do
252 bucket_index <- bucketIndex table key
253 let
254 segment_index = bucket_index `shiftR` sEGMENT_SHIFT
255 segment_offset = bucket_index .&. sEGMENT_MASK
256 --
257 return (segment_index,segment_offset)
258
259 expandHashTable :: HT key val -> IO (HT key val)
260 expandHashTable
261 table@HT{ dir=dir,
262 split=split,
263 max_bucket=max,
264 bcount=bcount,
265 mask2=mask2 } = do
266 let
267 oldsegment = split `shiftR` sEGMENT_SHIFT
268 oldindex = split .&. sEGMENT_MASK
269
270 newbucket = max + split
271 newsegment = newbucket `shiftR` sEGMENT_SHIFT
272 newindex = newbucket .&. sEGMENT_MASK
273 --
274 if newsegment >= dIR_SIZE -- make sure we don't overflow the table.
275 then return table
276 else do
277 --
278 when (newindex == 0) $
279 do segment <- newIOArray (0,sEGMENT_SIZE-1) []
280 writeIOArray dir newsegment segment
281 -- doesn't happen very often, so we might as well use a safe
282 -- array index here.
283 --
284 let table' =
285 if (split+1) < max
286 then table{ split = split+1,
287 bcount = bcount+1 }
288 -- we've expanded all the buckets in this table, so start from
289 -- the beginning again.
290 else table{ split = 0,
291 bcount = bcount+1,
292 max_bucket = max * 2,
293 mask1 = mask2,
294 mask2 = mask2 `shiftL` 1 .|. 1 }
295 let
296 split_bucket old new [] = do
297 segment <- myReadArray dir oldsegment
298 myWriteArray segment oldindex old
299 segment <- myReadArray dir newsegment
300 myWriteArray segment newindex new
301 split_bucket old new ((k,v):xs) = do
302 h <- bucketIndex table' k
303 if h == newbucket
304 then split_bucket old ((k,v):new) xs
305 else split_bucket ((k,v):old) new xs
306 --
307 segment <- myReadArray dir oldsegment
308 bucket <- myReadArray segment oldindex
309 split_bucket [] [] bucket
310 return table'
311
312 -- -----------------------------------------------------------------------------
313 -- Deleting a mapping from the hash table
314
315 -- | Remove an entry from the hash table.
316 delete :: HashTable key val -> key -> IO ()
317
318 delete (HashTable ref) key = do
319 table@HT{ dir=dir, cmp=cmp } <- readIORef ref
320 (segment_index,segment_offset) <- tableLocation table key
321 segment <- myReadArray dir segment_index
322 bucket <- myReadArray segment segment_offset
323 myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
324 return ()
325
326 -- -----------------------------------------------------------------------------
327 -- Deleting a mapping from the hash table
328
329 -- | Updates an entry in the hash table, returning 'True' if there was
330 -- already an entry for this key, or 'False' otherwise. After 'update'
331 -- there will always be exactly one entry for the given key in the table.
332 --
333 -- 'insert' is more efficient than 'update' if you don't care about
334 -- multiple entries, or you know for sure that multiple entries can't
335 -- occur. However, 'update' is more efficient than 'delete' followed
336 -- by 'insert'.
337 update :: HashTable key val -> key -> val -> IO Bool
338
339 update (HashTable ref) key val = do
340 table@HT{ kcount=k, bcount=b, dir=dir, cmp=cmp } <- readIORef ref
341 let table1 = table{ kcount = k+1 }
342 -- optimistically expand the table
343 table2 <-
344 if (k > hLOAD * b)
345 then expandHashTable table1
346 else return table1
347 writeIORef ref table2
348 (segment_index,segment_offset) <- tableLocation table2 key
349 segment <- myReadArray dir segment_index
350 bucket <- myReadArray segment segment_offset
351 let
352 (deleted,bucket') = foldr filt (0::Int32,[]) bucket
353 filt pair@(k,v) (deleted,bucket)
354 | key `cmp` k = (deleted+1, bucket)
355 | otherwise = (deleted, pair:bucket)
356 -- in
357 myWriteArray segment segment_offset ((key,val):bucket')
358 -- update the table load, taking into account the number of
359 -- items we just deleted.
360 writeIORef ref table2{ kcount = kcount table2 - deleted }
361 return (deleted /= 0)
362
363 -- -----------------------------------------------------------------------------
364 -- Looking up an entry in the hash table
365
366 -- | Looks up the value of a key in the hash table.
367 lookup :: HashTable key val -> key -> IO (Maybe val)
368
369 lookup (HashTable ref) key = do
370 table@HT{ dir=dir, cmp=cmp } <- readIORef ref
371 (segment_index,segment_offset) <- tableLocation table key
372 segment <- myReadArray dir segment_index
373 bucket <- myReadArray segment segment_offset
374 case [ val | (key',val) <- bucket, cmp key key' ] of
375 [] -> return Nothing
376 (v:_) -> return (Just v)
377
378 -- -----------------------------------------------------------------------------
379 -- Converting to/from lists
380
381 -- | Convert a list of key\/value pairs into a hash table. Equality on keys
382 -- is taken from the Eq instance for the key type.
383 --
384 fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
385 fromList hash_fn list = do
386 table <- new (==) hash_fn
387 sequence_ [ insert table k v | (k,v) <- list ]
388 return table
389
390 -- | Converts a hash table to a list of key\/value pairs.
391 --
392 toList :: HashTable key val -> IO [(key,val)]
393 toList (HashTable ref) = do
394 HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
395 --
396 let
397 max_segment = (max + split - 1) `quot` sEGMENT_SIZE
398 --
399 segments <- mapM (segmentContents dir) [0 .. max_segment]
400 return (concat segments)
401 where
402 segmentContents dir seg_index = do
403 segment <- myReadArray dir seg_index
404 bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
405 return (concat bs)
406
407 -- -----------------------------------------------------------------------------
408 -- Diagnostics
409
410 -- | This function is useful for determining whether your hash function
411 -- is working well for your data set. It returns the longest chain
412 -- of key\/value pairs in the hash table for which all the keys hash to
413 -- the same bucket. If this chain is particularly long (say, longer
414 -- than 10 elements), then it might be a good idea to try a different
415 -- hash function.
416 --
417 longestChain :: HashTable key val -> IO [(key,val)]
418 longestChain (HashTable ref) = do
419 HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
420 --
421 let
422 max_segment = (max + split - 1) `quot` sEGMENT_SIZE
423 --
424 --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
425 segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
426 return (maximumBy lengthCmp segments)
427 where
428 segmentMaxChainLength dir seg_index = do
429 segment <- myReadArray dir seg_index
430 bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
431 return (maximumBy lengthCmp bs)
432
433 lengthCmp x y = length x `compare` length y