9fe78991d742603b421517495078e729ebf34619
[packages/base.git] / Data / HashTable.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3 {-# OPTIONS_GHC -funbox-strict-fields -fno-warn-name-shadowing #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : Data.HashTable
8 -- Copyright : (c) The University of Glasgow 2003
9 -- License : BSD-style (see the file libraries/base/LICENSE)
10 --
11 -- Maintainer : libraries@haskell.org
12 -- Stability : provisional
13 -- Portability : portable
14 --
15 -- An implementation of extensible hash tables, as described in
16 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
17 -- pp. 446--457. The implementation is also derived from the one
18 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
19 --
20 -----------------------------------------------------------------------------
21
22 module Data.HashTable (
23 -- * Basic hash table operations
24 HashTable, new, newHint, insert, delete, lookup, update,
25 -- * Converting to and from lists
26 fromList, toList,
27 -- * Hash functions
28 -- $hash_functions
29 hashInt, hashString,
30 prime,
31 -- * Diagnostics
32 longestChain
33 ) where
34
35 -- This module is imported by Data.Dynamic, which is pretty low down in the
36 -- module hierarchy, so don't import "high-level" modules
37
38 #ifdef __GLASGOW_HASKELL__
39 import GHC.Base
40 #else
41 import Prelude hiding ( lookup )
42 #endif
43 import Data.Tuple ( fst )
44 import Data.Bits
45 import Data.Maybe
46 import Data.List ( maximumBy, length, concat, foldl', partition )
47 import Data.Int ( Int32 )
48
49 #if defined(__GLASGOW_HASKELL__)
50 import GHC.Num
51 import GHC.Real ( fromIntegral )
52 import GHC.Show ( Show(..) )
53 import GHC.Int ( Int64 )
54
55 import GHC.IO
56 import GHC.IOArray
57 import GHC.IORef
58 #else
59 import Data.Char ( ord )
60 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
61 import System.IO.Unsafe ( unsafePerformIO )
62 import Data.Int ( Int64 )
63 # if defined(__HUGS__)
64 import Hugs.IOArray ( IOArray, newIOArray,
65 unsafeReadIOArray, unsafeWriteIOArray )
66 # elif defined(__NHC__)
67 import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
68 # endif
69 #endif
70 import Control.Monad ( mapM, mapM_, sequence_ )
71
72
73 -----------------------------------------------------------------------
74
-- | Compile-time switch for the statistics-gathering instrumentation
-- below.  When 'False' (the shipped default), 'instrument' is a no-op
-- and the instrumentation code should be optimized away entirely.
iNSTRUMENTED :: Bool
iNSTRUMENTED = False
77
78 -----------------------------------------------------------------------
79
-- Array primitives used for the bucket table.  Normal GHC/Hugs builds
-- use the unchecked (unsafe) reads and writes, since every index is
-- produced by masking with 'bmask' and is therefore in range; DEBUG
-- and NHC builds fall back to bounds-checked access.
readHTArray :: HTArray a -> Int32 -> IO a
writeMutArray :: MutArray a -> Int32 -> a -> IO ()
newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
newMutArray = newIOArray
type MutArray a = IOArray Int32 a
type HTArray a = MutArray a
#if defined(DEBUG) || defined(__NHC__)
readHTArray = readIOArray
writeMutArray = writeIOArray
#else
readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif
93
-- | A mutable hash table in the 'IO' monad, mapping @key@s to @val@s.
data HashTable key val = HashTable {
                                     cmp :: !(key -> key -> Bool),  -- key equality predicate
                                     hash_fn :: !(key -> Int32),    -- key hash function
                                     tab :: !(IORef (HT key val))   -- mutable table state
                                   }
-- TODO: the IORef should really be an MVar.
100
-- Internal table state.  Table sizes are powers of two, so 'bmask' is
-- always size-1 (all low bits set) and bucket selection is a mask.
data HT key val
  = HT {
        kcount :: !Int32,                   -- Total number of keys.
        bmask :: !Int32,                    -- Bucket mask (table size - 1).
        buckets :: !(HTArray [(key,val)])   -- Bucket chains, newest entry first.
       }
107
108 -- ------------------------------------------------------------
109 -- Instrumentation for performance tuning
110
111 -- This ought to be roundly ignored after optimization when
112 -- iNSTRUMENTED=False.
113
-- | Strict version of 'Data.IORef.modifyIORef': the new value is
-- forced to WHNF before it is written back, so no chain of thunks can
-- accumulate inside the reference.
modifyIORef :: IORef a -> (a -> a) -> IO ()
modifyIORef ref f = do
    old <- readIORef ref
    let new = f old
    new `seq` writeIORef ref new
119
-- Accumulated usage statistics, updated only when iNSTRUMENTED is True.
data HashData = HD {
        tables :: !Integer,       -- tables created
        insertions :: !Integer,   -- total insertions across all tables
        lookups :: !Integer,      -- total lookups across all tables
        totBuckets :: !Integer,   -- total buckets ever allocated
        maxEntries :: !Int32,     -- largest key count seen in one table
        maxChain :: !Int,         -- longest bucket chain seen
        maxBuckets :: !Int32      -- largest bucket array seen
   } deriving (Eq, Show)
129
-- Global statistics cell shared by every table.  The NOINLINE pragma is
-- essential: the unsafePerformIO-created IORef must be a single shared
-- top-level value, not re-created at each use site.
{-# NOINLINE hashData #-}
hashData :: IORef HashData
hashData = unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
                                           totBuckets=0, maxEntries=0,
                                           maxChain=0, maxBuckets=tABLE_MIN } ))
135
-- | Apply a statistics-updating function to the global 'hashData'
-- cell, but only when instrumentation is compiled in; otherwise do
-- nothing at all.
instrument :: (HashData -> HashData) -> IO ()
instrument f =
    if iNSTRUMENTED
       then modifyIORef hashData f
       else return ()
139
-- | Record the creation of a fresh, minimum-sized table.
recordNew :: IO ()
recordNew = instrument bump
  where bump hd@HD{ tables=t, totBuckets=b } =
            hd{ tables = t + 1, totBuckets = b + fromIntegral tABLE_MIN }
144
-- | Record @i@ insertions into a table now holding @sz@ keys, where
-- @bkt@ is the bucket chain that was just extended.
recordIns :: Int32 -> Int32 -> [a] -> IO ()
recordIns i sz bkt = instrument bump
  where bump hd@HD{ insertions=ins, maxEntries=me, maxChain=mc } =
            hd{ insertions = ins + fromIntegral i,
                maxEntries = me `max` sz,
                maxChain   = mc `max` length bkt }
150
-- | Record a table growing from @older@ to @newer@ buckets.
recordResize :: Int32 -> Int32 -> IO ()
recordResize older newer = instrument bump
  where bump hd@HD{ totBuckets=tot, maxBuckets=mb } =
            hd{ totBuckets = tot + fromIntegral (newer - older),
                maxBuckets = mb `max` newer }
156
-- | Record a single lookup.
recordLookup :: IO ()
recordLookup = instrument bump
  where bump hd@HD{ lookups=n } = hd{ lookups = n + 1 }
160
161 -- stats :: IO String
162 -- stats = fmap show $ readIORef hashData
163
164 -- ----------------------------------------------------------------------------
165 -- Sample hash functions
166
167 -- $hash_functions
168 --
169 -- This implementation of hash tables uses the low-order /n/ bits of the hash
170 -- value for a key, where /n/ varies as the hash table grows. A good hash
171 -- function therefore will give an even distribution regardless of /n/.
172 --
173 -- If your keyspace is integrals such that the low-order bits between
174 -- keys are highly variable, then you could get away with using 'fromIntegral'
175 -- as the hash function.
176 --
177 -- We provide some sample hash functions for 'Int' and 'String' below.
178
-- Multiplicative-hashing constant derived from the golden ratio.
golden :: Int32
golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
-- Whereas the above works well and contains no hash duplications for
-- [-32767..65536]
185
-- Core 32-bit mixing step: take the high word of the product with the
-- golden-ratio constant, then add the input back in.
hashInt32 :: Int32 -> Int32
hashInt32 n = n + mulHi n golden
188
-- | A sample (and useful) hash function for Int and Int32,
-- implemented by extracting the uppermost 32 bits of the 64-bit
-- result of multiplying by a 33-bit constant.  The constant is from
-- Knuth, derived from the golden ratio:
--
-- > golden = round ((sqrt 5 - 1) * 2^32)
--
-- We get good key uniqueness on small inputs
-- (a problem with previous versions):
-- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
--
hashInt :: Int -> Int32
hashInt = hashInt32 . fromIntegral
202
-- Upper 32 bits of the full 64-bit product of two 32-bit operands.
mulHi :: Int32 -> Int32 -> Int32
mulHi x y = fromIntegral (wide `shiftR` 32)
  where
    wide :: Int64
    wide = fromIntegral x * fromIntegral y
208
209 -- | A sample hash function for Strings. We keep multiplying by the
210 -- golden ratio and adding. The implementation is:
211 --
212 -- > hashString = foldl' f golden
213 -- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
214 -- > magic = 0xdeadbeef
215 --
216 -- Where hashInt32 works just as hashInt shown above.
217 --
218 -- Knuth argues that repeated multiplication by the golden ratio
219 -- will minimize gaps in the hash space, and thus it's a good choice
220 -- for combining together multiple keys to form one.
221 --
222 -- Here we know that individual characters c are often small, and this
223 -- produces frequent collisions if we use ord c alone. A
224 -- particular problem are the shorter low ASCII and ISO-8859-1
225 -- character strings. We pre-multiply by a magic twiddle factor to
226 -- obtain a good distribution. In fact, given the following test:
227 --
228 -- > testp :: Int32 -> Int
229 -- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
230 -- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
231 -- > hs = foldl' f golden
232 -- > f m c = fromIntegral (ord c) * k + hashInt32 m
233 -- > n = 100000
234 --
235 -- We discover that testp magic = 0.
236
-- (Contract and rationale are described in the haddock comment above.)
hashString :: String -> Int32
hashString = foldl' step golden
  where
    magic = 0xdeadbeef
    step acc c = fromIntegral (ord c) * magic + hashInt32 acc
241
-- | A prime larger than the maximum hash table size
-- ('tABLE_MAX' = 2^25 = 33554432).
prime :: Int32
prime = 33554467
245
246 -- -----------------------------------------------------------------------------
247 -- Parameters
248
tABLE_MAX :: Int32
tABLE_MAX = 32 * 1024 * 1024 -- Maximum size of hash table
tABLE_MIN :: Int32
tABLE_MIN = 8                -- Minimum (initial) size of hash table

hLOAD :: Int32
hLOAD = 7 -- Maximum average load of a single hash bucket

hYSTERESIS :: Int32
hYSTERESIS = 64 -- entries to ignore in load computation

{- Hysteresis favors long association-list-like behavior for small tables. -}
261
262 -- -----------------------------------------------------------------------------
263 -- Creating a new hash table
264
-- | Creates a new hash table.  The following property should hold for the @eq@
-- and @hash@ functions passed to 'new':
--
-- >   eq A B  =>  hash A == hash B
--
new
  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table

new cmpr hash = do
  recordNew
  -- make a new hash table with a single, empty, segment
  let mask = tABLE_MIN - 1
  bkts <- newMutArray (0, mask) []
  ref  <- newIORef HT{ buckets = bkts, kcount = 0, bmask = mask }
  return HashTable{ tab = ref, hash_fn = hash, cmp = cmpr }
287
288 {-
289 bitTwiddleSameAs takes as arguments positive Int32s less than maxBound/2 and
290 returns the smallest power of 2 that is greater than or equal to the
291 argument.
292 http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
293 -}
-- Smear the highest set bit of (v0 - 1) into every lower position,
-- then add one: the classic round-up-to-power-of-two trick described
-- in the comment block above.
bitTwiddleSameAs :: Int32 -> Int32
bitTwiddleSameAs v0 = 1 + foldl' smear (v0 - 1) [1, 2, 4, 8, 16]
  where smear v s = v .|. (v `shiftR` s)
303
304 {-
305 powerOver takes as arguments Int32s and returns the smallest power of 2
306 that is greater than or equal to the argument if that power of 2 is
307 within [tABLE_MIN,tABLE_MAX]
308 -}
-- Clamp a requested size into [tABLE_MIN, tABLE_MAX], rounding up to
-- a power of two inside that range.
powerOver :: Int32 -> Int32
powerOver n
  | n <= tABLE_MIN = tABLE_MIN
  | n >= tABLE_MAX = tABLE_MAX
  | otherwise      = bitTwiddleSameAs n
316
-- | Creates a new hash table with the given minimum size.
newHint
  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
  -> Int                     -- ^ @minSize@: initial table size
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table

newHint cmpr hash minSize = do
  recordNew
  -- make a new hash table with a single, empty, segment.
  --
  -- BUG FIX: 'bmask' must be size-1 (all low bits set), exactly as in
  -- 'new' where bmask = tABLE_MIN-1.  'powerOver' returns the table
  -- *size* (a power of two, e.g. 8); using it directly as the mask
  -- gives a single-high-bit mask (e.g. 8 instead of 7), so
  -- 'bucketIndex' maps every key to one of only two buckets, and after
  -- the first 'expandHashTable' (newmask = 2*mask+1 is then
  -- non-contiguous) entries are filed into buckets that 'lookup'
  -- never reads.
  let mask = powerOver (fromIntegral minSize) - 1
  bkts <- newMutArray (0, mask) []

  let
    kcnt = 0
    ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }

  table <- newIORef ht
  return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
336
337 -- -----------------------------------------------------------------------------
338 -- Inserting a key\/value pair into the hash table
339
-- | Inserts a key\/value mapping into the hash table.
--
-- Note that 'insert' doesn't remove the old entry from the table -
-- the behaviour is like an association list, where 'lookup' returns
-- the most-recently-inserted mapping for a key in the table.  The
-- reason for this is to keep 'insert' as efficient as possible.  If
-- you need to update a mapping, then we provide 'update'.
--
insert :: HashTable key val -> key -> val -> IO ()
insert ht key val = updatingBucket CanInsert prepend ht key
  where prepend bucket = ((key, val) : bucket, 1, ())
352
353
354 -- ------------------------------------------------------------
355 -- The core of the implementation is lurking down here, in findBucket,
356 -- updatingBucket, and expandHashTable.
357
-- Should the table grow?  True when the key count (less the
-- hYSTERESIS allowance) exceeds hLOAD keys per bucket.
tooBig :: Int32 -> Int32 -> Bool
tooBig k b = k-hYSTERESIS > hLOAD * b
360
361 -- index of bucket within table.
-- Index of the bucket a hash value selects: just the low-order bits
-- of the hash (the mask is always size-1).
bucketIndex :: Int32 -> Int32 -> Int32
bucketIndex mask h = mask .&. h
364
365 -- find the bucket in which the key belongs.
366 -- returns (key equality, bucket index, bucket)
367 --
368 -- This rather grab-bag approach gives enough power to do pretty much
369 -- any bucket-finding thing you might want to do. We rely on inlining
370 -- to throw away the stuff we don't want. I'm proud to say that this
371 -- plus updatingBucket below reduce most of the other definitions to a
372 -- few lines of code, while actually speeding up the hashtable
373 -- implementation when compared with a version which does everything
374 -- from scratch.
{-# INLINE findBucket #-}
-- Locate the bucket a key belongs to.  Returns the whole table state,
-- the bucket's index, and the bucket's chain; each caller relies on
-- inlining to discard whichever parts it does not use.
findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
findBucket HashTable{ tab=ref, hash_fn=hash } key = do
    table@HT{ buckets=bs, bmask=msk } <- readIORef ref
    let ix = bucketIndex msk (hash key)
    chain <- readHTArray bs ix
    return (table, ix, chain)
382
-- Whether updatingBucket's bucket function may add entries, and thus
-- whether the table may need to grow afterwards.
data Inserts = CanInsert
             | Can'tInsert
             deriving (Eq)
386
387 -- updatingBucket is the real workhorse of all single-element table
388 -- updates. It takes a hashtable and a key, along with a function
389 -- describing what to do with the bucket in which that key belongs. A
390 -- flag indicates whether this function may perform table insertions.
391 -- The function returns the new contents of the bucket, the number of
392 -- bucket entries inserted (negative if entries were deleted), and a
393 -- value which becomes the return value for the function as a whole.
394 -- The table sizing is enforced here, calling out to expandSubTable as
395 -- necessary.
396
397 -- This function is intended to be inlined and specialized for every
398 -- calling context (eg every provided bucketFn).
{-# INLINE updatingBucket #-}

-- (See the comment block above for the overall contract.)  Order here
-- matters: the bucket array is updated in place first, then the table
-- record (with the new key count, possibly expanded) is written back
-- to the IORef.
updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
                  HashTable key val -> key ->
                  IO a
updatingBucket canEnlarge bucketFn
               ht@HashTable{ tab=ref, hash_fn=hash } key = do
  (table@HT{ kcount=k, buckets=bkts, bmask=b },
   indx, bckt) <- findBucket ht key
  -- Apply the caller's transformation to the chain; 'inserts' is the
  -- net change in entry count (negative for deletions).
  (bckt', inserts, result) <- return $ bucketFn bckt
  let k' = k + inserts
      table1 = table { kcount=k' }
  writeMutArray bkts indx bckt'
  -- Consider growing only on a net insertion, and only when the
  -- caller said insertions are possible.
  table2 <- if canEnlarge == CanInsert && inserts > 0 then do
               recordIns inserts k' bckt'
               if tooBig k' b
                  then expandHashTable hash table1
                  else return table1
            else return table1
  writeIORef ref table2
  return result
420
-- Double the number of buckets, redistributing every entry between its
-- old bucket and the corresponding new one.  Returns the table
-- unchanged (apart from the caller's kcount update) once tABLE_MAX is
-- reached.
expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
  let
    oldsize = mask + 1
    newmask = mask + mask + 1
  recordResize oldsize (newmask+1)
  if newmask > tABLE_MAX-1
     then return table
     else do
       newbkts <- newMutArray (0,newmask) []

       let
         -- Under the doubled mask, an entry from old bucket i belongs
         -- either at index i again or at index i + oldsize (one extra
         -- hash bit decides which).
         splitBucket oldindex = do
           bucket <- readHTArray bkts oldindex
           let (oldb,newb) =
                 partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
           writeMutArray newbkts oldindex oldb
           writeMutArray newbkts (oldindex + oldsize) newb
       mapM_ splitBucket [0..mask]

       return ( table{ buckets=newbkts, bmask=newmask } )
444
445 -- -----------------------------------------------------------------------------
446 -- Deleting a mapping from the hash table
447
448 -- Remove a key from a bucket
-- Remove from a bucket chain every pair whose key satisfies @del@,
-- also returning the (non-positive) change in entry count.
deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
deleteBucket _ [] = ([], 0, ())
deleteBucket del (entry@(k,_):rest) =
    let (rest', removed, _) = deleteBucket del rest
    in if del k
          then let removed' = removed - 1
               in removed' `seq` (rest', removed', ())
          else (entry : rest', removed, ())
456
-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()
delete ht@HashTable{ cmp=eq } key = updatingBucket Can'tInsert dropMatches ht key
  where dropMatches = deleteBucket (eq key)
462
463 -- -----------------------------------------------------------------------------
464 -- Updating a mapping in the hash table
465
-- | Updates an entry in the hash table, returning 'True' if there was
-- already an entry for this key, or 'False' otherwise.  After 'update'
-- there will always be exactly one entry for the given key in the table.
--
-- 'insert' is more efficient than 'update' if you don't care about
-- multiple entries, or you know for sure that multiple entries can't
-- occur.  However, 'update' is more efficient than 'delete' followed
-- by 'insert'.
update :: HashTable key val -> key -> val -> IO Bool

update ht@HashTable{ cmp=eq } key val = updatingBucket CanInsert replace ht key
  where
    -- Drop any existing entries for the key, then prepend the new one.
    -- 'removed' is non-positive, so the net count change is 1+removed,
    -- and removed /= 0 says whether the key was already present.
    replace bucket =
        let (bucket', removed, _) = deleteBucket (eq key) bucket
        in ((key, val) : bucket', 1 + removed, removed /= 0)
481
482 -- -----------------------------------------------------------------------------
483 -- Looking up an entry in the hash table
484
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup ht@HashTable{ cmp=eq } key = do
    recordLookup
    (_, _, bucket) <- findBucket ht key
    -- Chains hold the newest entry first, so the first match wins;
    -- foldr with a lazy second argument stops at the first hit.
    let firstHit (k, v) rest = if eq key k then Just v else rest
    return (foldr firstHit Nothing bucket)
494
495 -- -----------------------------------------------------------------------------
496 -- Converting to/from lists
497
-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash assocs = do
    table <- new (==) hash
    mapM_ (\(k, v) -> insert table k v) assocs
    return table
506
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
toList ht = mapReduce id concat ht
511
{-# INLINE mapReduce #-}
-- Map a function over every bucket chain and combine the per-bucket
-- results into one value.
mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
mapReduce perBucket combine HashTable{ tab=ref } = do
    HT{ buckets=bs, bmask=msk } <- readIORef ref
    pieces <- mapM (fmap perBucket . readHTArray bs) [0..msk]
    return (combine pieces)
517
518 -- -----------------------------------------------------------------------------
519 -- Diagnostics
520
-- | This function is useful for determining whether your hash
-- function is working well for your data set.  It returns the longest
-- chain of key\/value pairs in the hash table for which all the keys
-- hash to the same bucket.  If this chain is particularly long (say,
-- longer than 14 elements or so), then it might be a good idea to try
-- a different hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain = mapReduce id (maximumBy cmpLen)
  where
    -- Compare two chains by length without computing either length:
    -- walk both in lock-step until one runs out.
    cmpLen (_:as) (_:bs) = cmpLen as bs
    cmpLen []     []     = EQ
    cmpLen []     _      = LT
    cmpLen _      []     = GT