Use explicit language extensions & remove extension fields from base.cabal
[ghc.git] / libraries / base / Data / HashTable.hs
1 {-# LANGUAGE CPP, NoImplicitPrelude #-}
2 {-# OPTIONS_GHC -funbox-strict-fields -fno-warn-name-shadowing #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Data.HashTable
7 -- Copyright : (c) The University of Glasgow 2003
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : portable
13 --
14 -- An implementation of extensible hash tables, as described in
15 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
16 -- pp. 446--457. The implementation is also derived from the one
17 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
18 --
19 -----------------------------------------------------------------------------
20
21 module Data.HashTable (
22 -- * Basic hash table operations
23 HashTable, new, newHint, insert, delete, lookup, update,
24 -- * Converting to and from lists
25 fromList, toList,
26 -- * Hash functions
27 -- $hash_functions
28 hashInt, hashString,
29 prime,
30 -- * Diagnostics
31 longestChain
32 ) where
33
34 -- This module is imported by Data.Dynamic, which is pretty low down in the
35 -- module hierarchy, so don't import "high-level" modules
36
37 #ifdef __GLASGOW_HASKELL__
38 import GHC.Base
39 #else
40 import Prelude hiding ( lookup )
41 #endif
42 import Data.Tuple ( fst )
43 import Data.Bits
44 import Data.Maybe
45 import Data.List ( maximumBy, length, concat, foldl', partition )
46 import Data.Int ( Int32 )
47
48 #if defined(__GLASGOW_HASKELL__)
49 import GHC.Num
50 import GHC.Real ( fromIntegral )
51 import GHC.Show ( Show(..) )
52 import GHC.Int ( Int64 )
53
54 import GHC.IO
55 import GHC.IOArray
56 import GHC.IORef
57 #else
58 import Data.Char ( ord )
59 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
60 import System.IO.Unsafe ( unsafePerformIO )
61 import Data.Int ( Int64 )
62 # if defined(__HUGS__)
63 import Hugs.IOArray ( IOArray, newIOArray,
64 unsafeReadIOArray, unsafeWriteIOArray )
65 # elif defined(__NHC__)
66 import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
67 # endif
68 #endif
69 import Control.Monad ( mapM, mapM_, sequence_ )
70
71
72 -----------------------------------------------------------------------
73
74 iNSTRUMENTED :: Bool
75 iNSTRUMENTED = False
76
77 -----------------------------------------------------------------------
78
79 readHTArray :: HTArray a -> Int32 -> IO a
80 writeMutArray :: MutArray a -> Int32 -> a -> IO ()
81 newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
82 newMutArray = newIOArray
83 type MutArray a = IOArray Int32 a
84 type HTArray a = MutArray a
85 #if defined(DEBUG) || defined(__NHC__)
86 readHTArray = readIOArray
87 writeMutArray = writeIOArray
88 #else
89 readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
90 writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
91 #endif
92
93 data HashTable key val = HashTable {
94 cmp :: !(key -> key -> Bool),
95 hash_fn :: !(key -> Int32),
96 tab :: !(IORef (HT key val))
97 }
98 -- TODO: the IORef should really be an MVar.
99
100 data HT key val
101 = HT {
102 kcount :: !Int32, -- Total number of keys.
103 bmask :: !Int32,
104 buckets :: !(HTArray [(key,val)])
105 }
106
107 -- ------------------------------------------------------------
108 -- Instrumentation for performance tuning
109
110 -- This ought to be roundly ignored after optimization when
111 -- iNSTRUMENTED=False.
112
113 -- STRICT version of modifyIORef!
114 modifyIORef :: IORef a -> (a -> a) -> IO ()
115 modifyIORef r f = do
116 v <- readIORef r
117 let z = f v in z `seq` writeIORef r z
118
119 data HashData = HD {
120 tables :: !Integer,
121 insertions :: !Integer,
122 lookups :: !Integer,
123 totBuckets :: !Integer,
124 maxEntries :: !Int32,
125 maxChain :: !Int,
126 maxBuckets :: !Int32
127 } deriving (Eq, Show)
128
129 {-# NOINLINE hashData #-}
130 hashData :: IORef HashData
131 hashData = unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
132 totBuckets=0, maxEntries=0,
133 maxChain=0, maxBuckets=tABLE_MIN } ))
134
135 instrument :: (HashData -> HashData) -> IO ()
136 instrument i | iNSTRUMENTED = modifyIORef hashData i
137 | otherwise = return ()
138
139 recordNew :: IO ()
140 recordNew = instrument rec
141 where rec hd@HD{ tables=t, totBuckets=b } =
142 hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }
143
144 recordIns :: Int32 -> Int32 -> [a] -> IO ()
145 recordIns i sz bkt = instrument rec
146 where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
147 hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
148 maxChain=mc `max` length bkt }
149
150 recordResize :: Int32 -> Int32 -> IO ()
151 recordResize older newer = instrument rec
152 where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
153 hd{ totBuckets=b+fromIntegral (newer-older),
154 maxBuckets=mx `max` newer }
155
156 recordLookup :: IO ()
157 recordLookup = instrument lkup
158 where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }
159
160 -- stats :: IO String
161 -- stats = fmap show $ readIORef hashData
162
163 -- ----------------------------------------------------------------------------
164 -- Sample hash functions
165
166 -- $hash_functions
167 --
168 -- This implementation of hash tables uses the low-order /n/ bits of the hash
169 -- value for a key, where /n/ varies as the hash table grows. A good hash
170 -- function therefore will give an even distribution regardless of /n/.
171 --
172 -- If your keyspace is integrals such that the low-order bits between
173 -- keys are highly variable, then you could get away with using 'fromIntegral'
174 -- as the hash function.
175 --
176 -- We provide some sample hash functions for 'Int' and 'String' below.
177
178 golden :: Int32
179 golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
180 -- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
181 -- but that has bad mulHi properties (even adding 2^32 to get its inverse)
182 -- Whereas the above works well and contains no hash duplications for
183 -- [-32767..65536]
184
185 hashInt32 :: Int32 -> Int32
186 hashInt32 x = mulHi x golden + x
187
188 -- | A sample (and useful) hash function for Int and Int32,
189 -- implemented by extracting the uppermost 32 bits of the 64-bit
190 -- result of multiplying by a 33-bit constant. The constant is from
191 -- Knuth, derived from the golden ratio:
192 --
193 -- > golden = round ((sqrt 5 - 1) * 2^32)
194 --
195 -- We get good key uniqueness on small inputs
196 -- (a problem with previous versions):
197 -- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
198 --
199 hashInt :: Int -> Int32
200 hashInt x = hashInt32 (fromIntegral x)
201
202 -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
203 mulHi :: Int32 -> Int32 -> Int32
204 mulHi a b = fromIntegral (r `shiftR` 32)
205 where r :: Int64
206 r = fromIntegral a * fromIntegral b
207
208 -- | A sample hash function for Strings. We keep multiplying by the
209 -- golden ratio and adding. The implementation is:
210 --
211 -- > hashString = foldl' f golden
212 -- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
213 -- > magic = 0xdeadbeef
214 --
215 -- Where hashInt32 works just as hashInt shown above.
216 --
217 -- Knuth argues that repeated multiplication by the golden ratio
218 -- will minimize gaps in the hash space, and thus it's a good choice
219 -- for combining together multiple keys to form one.
220 --
221 -- Here we know that individual characters c are often small, and this
222 -- produces frequent collisions if we use ord c alone. A
223 -- particular problem are the shorter low ASCII and ISO-8859-1
224 -- character strings. We pre-multiply by a magic twiddle factor to
225 -- obtain a good distribution. In fact, given the following test:
226 --
227 -- > testp :: Int32 -> Int
228 -- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
229 -- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
230 -- > hs = foldl' f golden
231 -- > f m c = fromIntegral (ord c) * k + hashInt32 m
232 -- > n = 100000
233 --
234 -- We discover that testp magic = 0.
235
236 hashString :: String -> Int32
237 hashString = foldl' f golden
238 where f m c = fromIntegral (ord c) * magic + hashInt32 m
239 magic = 0xdeadbeef
240
241 -- | A prime larger than the maximum hash table size
242 prime :: Int32
243 prime = 33554467
244
245 -- -----------------------------------------------------------------------------
246 -- Parameters
247
248 tABLE_MAX :: Int32
249 tABLE_MAX = 32 * 1024 * 1024 -- Maximum size of hash table
250 tABLE_MIN :: Int32
251 tABLE_MIN = 8
252
253 hLOAD :: Int32
254 hLOAD = 7 -- Maximum average load of a single hash bucket
255
256 hYSTERESIS :: Int32
257 hYSTERESIS = 64 -- entries to ignore in load computation
258
259 {- Hysteresis favors long association-list-like behavior for small tables. -}
260
261 -- -----------------------------------------------------------------------------
262 -- Creating a new hash table
263
264 -- | Creates a new hash table. The following property should hold for the @eq@
265 -- and @hash@ functions passed to 'new':
266 --
267 -- > eq A B => hash A == hash B
268 --
269 new
270 :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
271 -> (key -> Int32) -- ^ @hash@: A hash function on keys
272 -> IO (HashTable key val) -- ^ Returns: an empty hash table
273
274 new cmpr hash = do
275 recordNew
276 -- make a new hash table with a single, empty, segment
277 let mask = tABLE_MIN-1
278 bkts <- newMutArray (0,mask) []
279
280 let
281 kcnt = 0
282 ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }
283
284 table <- newIORef ht
285 return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
286
287 {-
288 bitTwiddleSameAs takes as arguments positive Int32s less than maxBound/2 and
289 returns the smallest power of 2 that is greater than or equal to the
290 argument.
291 http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
292 -}
293 bitTwiddleSameAs :: Int32 -> Int32
294 bitTwiddleSameAs v0 =
295 let v1 = v0-1
296 v2 = v1 .|. (v1`shiftR`1)
297 v3 = v2 .|. (v2`shiftR`2)
298 v4 = v3 .|. (v3`shiftR`4)
299 v5 = v4 .|. (v4`shiftR`8)
300 v6 = v5 .|. (v5`shiftR`16)
301 in v6+1
302
303 {-
304 powerOver takes as arguments Int32s and returns the smallest power of 2
305 that is greater than or equal to the argument if that power of 2 is
306 within [tABLE_MIN,tABLE_MAX]
307 -}
308 powerOver :: Int32 -> Int32
309 powerOver n =
310 if n <= tABLE_MIN
311 then tABLE_MIN
312 else if n >= tABLE_MAX
313 then tABLE_MAX
314 else bitTwiddleSameAs n
315
316 -- | Creates a new hash table with the given minimum size.
317 newHint
318 :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
319 -> (key -> Int32) -- ^ @hash@: A hash function on keys
320 -> Int -- ^ @minSize@: initial table size
321 -> IO (HashTable key val) -- ^ Returns: an empty hash table
322
323 newHint cmpr hash minSize = do
324 recordNew
325 -- make a new hash table with a single, empty, segment
326 let mask = powerOver $ fromIntegral minSize
327 bkts <- newMutArray (0,mask) []
328
329 let
330 kcnt = 0
331 ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }
332
333 table <- newIORef ht
334 return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
335
336 -- -----------------------------------------------------------------------------
337 -- Inserting a key\/value pair into the hash table
338
339 -- | Inserts a key\/value mapping into the hash table.
340 --
341 -- Note that 'insert' doesn't remove the old entry from the table -
342 -- the behaviour is like an association list, where 'lookup' returns
343 -- the most-recently-inserted mapping for a key in the table. The
344 -- reason for this is to keep 'insert' as efficient as possible. If
345 -- you need to update a mapping, then we provide 'update'.
346 --
347 insert :: HashTable key val -> key -> val -> IO ()
348
349 insert ht key val =
350 updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key
351
352
353 -- ------------------------------------------------------------
354 -- The core of the implementation is lurking down here, in findBucket,
355 -- updatingBucket, and expandHashTable.
356
357 tooBig :: Int32 -> Int32 -> Bool
358 tooBig k b = k-hYSTERESIS > hLOAD * b
359
360 -- index of bucket within table.
361 bucketIndex :: Int32 -> Int32 -> Int32
362 bucketIndex mask h = h .&. mask
363
364 -- find the bucket in which the key belongs.
365 -- returns (key equality, bucket index, bucket)
366 --
367 -- This rather grab-bag approach gives enough power to do pretty much
368 -- any bucket-finding thing you might want to do. We rely on inlining
369 -- to throw away the stuff we don't want. I'm proud to say that this
370 -- plus updatingBucket below reduce most of the other definitions to a
371 -- few lines of code, while actually speeding up the hashtable
372 -- implementation when compared with a version which does everything
373 -- from scratch.
374 {-# INLINE findBucket #-}
375 findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
376 findBucket HashTable{ tab=ref, hash_fn=hash} key = do
377 table@HT{ buckets=bkts, bmask=b } <- readIORef ref
378 let indx = bucketIndex b (hash key)
379 bucket <- readHTArray bkts indx
380 return (table, indx, bucket)
381
382 data Inserts = CanInsert
383 | Can'tInsert
384 deriving (Eq)
385
386 -- updatingBucket is the real workhorse of all single-element table
387 -- updates. It takes a hashtable and a key, along with a function
388 -- describing what to do with the bucket in which that key belongs. A
389 -- flag indicates whether this function may perform table insertions.
390 -- The function returns the new contents of the bucket, the number of
391 -- bucket entries inserted (negative if entries were deleted), and a
392 -- value which becomes the return value for the function as a whole.
393 -- The table sizing is enforced here, calling out to expandSubTable as
394 -- necessary.
395
396 -- This function is intended to be inlined and specialized for every
397 -- calling context (eg every provided bucketFn).
398 {-# INLINE updatingBucket #-}
399
400 updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
401 HashTable key val -> key ->
402 IO a
403 updatingBucket canEnlarge bucketFn
404 ht@HashTable{ tab=ref, hash_fn=hash } key = do
405 (table@HT{ kcount=k, buckets=bkts, bmask=b },
406 indx, bckt) <- findBucket ht key
407 (bckt', inserts, result) <- return $ bucketFn bckt
408 let k' = k + inserts
409 table1 = table { kcount=k' }
410 writeMutArray bkts indx bckt'
411 table2 <- if canEnlarge == CanInsert && inserts > 0 then do
412 recordIns inserts k' bckt'
413 if tooBig k' b
414 then expandHashTable hash table1
415 else return table1
416 else return table1
417 writeIORef ref table2
418 return result
419
420 expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
421 expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
422 let
423 oldsize = mask + 1
424 newmask = mask + mask + 1
425 recordResize oldsize (newmask+1)
426 --
427 if newmask > tABLE_MAX-1
428 then return table
429 else do
430 --
431 newbkts <- newMutArray (0,newmask) []
432
433 let
434 splitBucket oldindex = do
435 bucket <- readHTArray bkts oldindex
436 let (oldb,newb) =
437 partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
438 writeMutArray newbkts oldindex oldb
439 writeMutArray newbkts (oldindex + oldsize) newb
440 mapM_ splitBucket [0..mask]
441
442 return ( table{ buckets=newbkts, bmask=newmask } )
443
444 -- -----------------------------------------------------------------------------
445 -- Deleting a mapping from the hash table
446
447 -- Remove a key from a bucket
448 deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
449 deleteBucket _ [] = ([],0,())
450 deleteBucket del (pair@(k,_):bucket) =
451 case deleteBucket del bucket of
452 (bucket', dels, _) | del k -> dels' `seq` (bucket', dels', ())
453 | otherwise -> (pair:bucket', dels, ())
454 where dels' = dels - 1
455
456 -- | Remove an entry from the hash table.
457 delete :: HashTable key val -> key -> IO ()
458
459 delete ht@HashTable{ cmp=eq } key =
460 updatingBucket Can'tInsert (deleteBucket (eq key)) ht key
461
462 -- -----------------------------------------------------------------------------
463 -- Updating a mapping in the hash table
464
465 -- | Updates an entry in the hash table, returning 'True' if there was
466 -- already an entry for this key, or 'False' otherwise. After 'update'
467 -- there will always be exactly one entry for the given key in the table.
468 --
469 -- 'insert' is more efficient than 'update' if you don't care about
470 -- multiple entries, or you know for sure that multiple entries can't
471 -- occur. However, 'update' is more efficient than 'delete' followed
472 -- by 'insert'.
473 update :: HashTable key val -> key -> val -> IO Bool
474
475 update ht@HashTable{ cmp=eq } key val =
476 updatingBucket CanInsert
477 (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
478 in ((key,val):bucket', 1+dels, dels/=0))
479 ht key
480
481 -- -----------------------------------------------------------------------------
482 -- Looking up an entry in the hash table
483
484 -- | Looks up the value of a key in the hash table.
485 lookup :: HashTable key val -> key -> IO (Maybe val)
486
487 lookup ht@HashTable{ cmp=eq } key = do
488 recordLookup
489 (_, _, bucket) <- findBucket ht key
490 let firstHit (k,v) r | eq key k = Just v
491 | otherwise = r
492 return (foldr firstHit Nothing bucket)
493
494 -- -----------------------------------------------------------------------------
495 -- Converting to/from lists
496
497 -- | Convert a list of key\/value pairs into a hash table. Equality on keys
498 -- is taken from the Eq instance for the key type.
499 --
500 fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
501 fromList hash list = do
502 table <- new (==) hash
503 sequence_ [ insert table k v | (k,v) <- list ]
504 return table
505
506 -- | Converts a hash table to a list of key\/value pairs.
507 --
508 toList :: HashTable key val -> IO [(key,val)]
509 toList = mapReduce id concat
510
511 {-# INLINE mapReduce #-}
512 mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
513 mapReduce m r HashTable{ tab=ref } = do
514 HT{ buckets=bckts, bmask=b } <- readIORef ref
515 fmap r (mapM (fmap m . readHTArray bckts) [0..b])
516
517 -- -----------------------------------------------------------------------------
518 -- Diagnostics
519
520 -- | This function is useful for determining whether your hash
521 -- function is working well for your data set. It returns the longest
522 -- chain of key\/value pairs in the hash table for which all the keys
523 -- hash to the same bucket. If this chain is particularly long (say,
524 -- longer than 14 elements or so), then it might be a good idea to try
525 -- a different hash function.
526 --
527 longestChain :: HashTable key val -> IO [(key,val)]
528 longestChain = mapReduce id (maximumBy lengthCmp)
529 where lengthCmp (_:x)(_:y) = lengthCmp x y
530 lengthCmp [] [] = EQ
531 lengthCmp [] _ = LT
532 lengthCmp _ [] = GT