Make sure the threaded threadDelay sleeps at least as long as it is asked to
[packages/old-time.git] / Data / HashTable.hs
1 {-# OPTIONS_GHC -fno-implicit-prelude #-}
2
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : Data.HashTable
6 -- Copyright : (c) The University of Glasgow 2003
7 -- License : BSD-style (see the file libraries/base/LICENSE)
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : provisional
11 -- Portability : portable
12 --
13 -- An implementation of extensible hash tables, as described in
14 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
15 -- pp. 446--457. The implementation is also derived from the one
16 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
17 --
18 -----------------------------------------------------------------------------
19
20 module Data.HashTable (
21 -- * Basic hash table operations
22 HashTable, new, insert, delete, lookup, update,
23 -- * Converting to and from lists
24 fromList, toList,
25 -- * Hash functions
26 -- $hash_functions
27 hashInt, hashString,
28 prime,
29 -- * Diagnostics
30 longestChain
31 ) where
32
33 -- This module is imported by Data.Dynamic, which is pretty low down in the
34 -- module hierarchy, so don't import "high-level" modules
35
36 #ifdef __GLASGOW_HASKELL__
37 import GHC.Base
38 #else
39 import Prelude hiding ( lookup )
40 #endif
41 import Data.Tuple ( fst )
42 import Data.Bits
43 import Data.Maybe
44 import Data.List ( maximumBy, length, concat, foldl', partition )
45 import Data.Int ( Int32 )
46
47 #if defined(__GLASGOW_HASKELL__)
48 import GHC.Num
49 import GHC.Real ( fromIntegral )
50 import GHC.Show ( Show(..) )
51 import GHC.Int ( Int64 )
52
53 import GHC.IOBase ( IO, IOArray, newIOArray,
54 unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO,
55 IORef, newIORef, readIORef, writeIORef )
56 #else
57 import Data.Char ( ord )
58 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
59 import System.IO.Unsafe ( unsafePerformIO )
60 import Data.Int ( Int64 )
61 # if defined(__HUGS__)
62 import Hugs.IOArray ( IOArray, newIOArray,
63 unsafeReadIOArray, unsafeWriteIOArray )
64 # elif defined(__NHC__)
65 import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
66 # endif
67 #endif
68 import Control.Monad ( mapM, mapM_, sequence_ )
69
70
71 -----------------------------------------------------------------------
72
73 iNSTRUMENTED :: Bool
74 iNSTRUMENTED = False
75
76 -----------------------------------------------------------------------
77
78 readHTArray :: HTArray a -> Int32 -> IO a
79 writeMutArray :: MutArray a -> Int32 -> a -> IO ()
80 freezeArray :: MutArray a -> IO (HTArray a)
81 thawArray :: HTArray a -> IO (MutArray a)
82 newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
83 #if defined(DEBUG) || defined(__NHC__)
84 type MutArray a = IOArray Int32 a
85 type HTArray a = MutArray a
86 newMutArray = newIOArray
87 readHTArray = readIOArray
88 writeMutArray = writeIOArray
89 freezeArray = return
90 thawArray = return
91 #else
92 type MutArray a = IOArray Int32 a
93 type HTArray a = MutArray a -- Array Int32 a
94 newMutArray = newIOArray
95 readHTArray arr i = readMutArray arr i -- return $! (unsafeAt arr (fromIntegral i))
96 readMutArray :: MutArray a -> Int32 -> IO a
97 readMutArray arr i = unsafeReadIOArray arr (fromIntegral i)
98 writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
99 freezeArray = return -- unsafeFreeze
100 thawArray = return -- unsafeThaw
101 #endif
102
103 data HashTable key val = HashTable {
104 cmp :: !(key -> key -> Bool),
105 hash_fn :: !(key -> Int32),
106 tab :: !(IORef (HT key val))
107 }
108 -- TODO: the IORef should really be an MVar.
109
110 data HT key val
111 = HT {
112 kcount :: !Int32, -- Total number of keys.
113 bmask :: !Int32,
114 buckets :: !(HTArray [(key,val)])
115 }
116
117 -- ------------------------------------------------------------
118 -- Instrumentation for performance tuning
119
120 -- This ought to be roundly ignored after optimization when
121 -- iNSTRUMENTED=False.
122
123 -- STRICT version of modifyIORef!
124 modifyIORef :: IORef a -> (a -> a) -> IO ()
125 modifyIORef r f = do
126 v <- readIORef r
127 let z = f v in z `seq` writeIORef r z
128
129 data HashData = HD {
130 tables :: !Integer,
131 insertions :: !Integer,
132 lookups :: !Integer,
133 totBuckets :: !Integer,
134 maxEntries :: !Int32,
135 maxChain :: !Int,
136 maxBuckets :: !Int32
137 } deriving (Eq, Show)
138
139 {-# NOINLINE hashData #-}
140 hashData :: IORef HashData
141 hashData = unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
142 totBuckets=0, maxEntries=0,
143 maxChain=0, maxBuckets=tABLE_MIN } ))
144
145 instrument :: (HashData -> HashData) -> IO ()
146 instrument i | iNSTRUMENTED = modifyIORef hashData i
147 | otherwise = return ()
148
149 recordNew :: IO ()
150 recordNew = instrument rec
151 where rec hd@HD{ tables=t, totBuckets=b } =
152 hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }
153
154 recordIns :: Int32 -> Int32 -> [a] -> IO ()
155 recordIns i sz bkt = instrument rec
156 where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
157 hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
158 maxChain=mc `max` length bkt }
159
160 recordResize :: Int32 -> Int32 -> IO ()
161 recordResize older newer = instrument rec
162 where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
163 hd{ totBuckets=b+fromIntegral (newer-older),
164 maxBuckets=mx `max` newer }
165
166 recordLookup :: IO ()
167 recordLookup = instrument lkup
168 where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }
169
170 -- stats :: IO String
171 -- stats = fmap show $ readIORef hashData
172
173 -- -----------------------------------------------------------------------------
174 -- Sample hash functions
175
176 -- $hash_functions
177 --
178 -- This implementation of hash tables uses the low-order /n/ bits of the hash
179 -- value for a key, where /n/ varies as the hash table grows. A good hash
180 -- function therefore will give an even distribution regardless of /n/.
181 --
182 -- If your keyspace is integrals such that the low-order bits between
183 -- keys are highly variable, then you could get away with using 'id'
184 -- as the hash function.
185 --
186 -- We provide some sample hash functions for 'Int' and 'String' below.
187
188 golden :: Int32
189 golden = -1640531527
190
191 -- | A sample (and useful) hash function for Int and Int32,
192 -- implemented by extracting the uppermost 32 bits of the 64-bit
193 -- result of multiplying by a 32-bit constant. The constant is from
194 -- Knuth, derived from the golden ratio:
195 --
196 -- > golden = round ((sqrt 5 - 1) * 2^31) :: Int
197 hashInt :: Int -> Int32
198 hashInt x = mulHi (fromIntegral x) golden
199
200 -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
201 mulHi :: Int32 -> Int32 -> Int32
202 mulHi a b = fromIntegral (r `shiftR` 32)
203 where r :: Int64
204 r = fromIntegral a * fromIntegral b :: Int64
205
206 -- | A sample hash function for Strings. We keep multiplying by the
207 -- golden ratio and adding. The implementation is:
208 --
209 -- > hashString = foldl' f 0
210 -- > where f m c = fromIntegral (ord c) + mulHi m golden
211 --
212 -- Note that this has not been extensively tested for reasonability,
213 -- but Knuth argues that repeated multiplication by the golden ratio
214 -- will minimize gaps in the hash space.
215 hashString :: String -> Int32
216 hashString = foldl' f 0
217 where f m c = fromIntegral (ord c) + mulHi m golden
218
219 -- | A prime larger than the maximum hash table size
220 prime :: Int32
221 prime = 33554467
222
223 -- -----------------------------------------------------------------------------
224 -- Parameters
225
226 tABLE_MAX :: Int32
227 tABLE_MAX = 32 * 1024 * 1024 -- Maximum size of hash table
228 tABLE_MIN :: Int32
229 tABLE_MIN = 8
230
231 hLOAD :: Int32
232 hLOAD = 7 -- Maximum average load of a single hash bucket
233
234 hYSTERESIS :: Int32
235 hYSTERESIS = 64 -- entries to ignore in load computation
236
237 {- Hysteresis favors long association-list-like behavior for small tables. -}
238
239 -- -----------------------------------------------------------------------------
240 -- Creating a new hash table
241
242 -- | Creates a new hash table. The following property should hold for the @eq@
243 -- and @hash@ functions passed to 'new':
244 --
245 -- > eq A B => hash A == hash B
246 --
247 new
248 :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
249 -> (key -> Int32) -- ^ @hash@: A hash function on keys
250 -> IO (HashTable key val) -- ^ Returns: an empty hash table
251
252 new cmpr hash = do
253 recordNew
254 -- make a new hash table with a single, empty, segment
255 let mask = tABLE_MIN-1
256 bkts' <- newMutArray (0,mask) []
257 bkts <- freezeArray bkts'
258
259 let
260 kcnt = 0
261 ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }
262
263 table <- newIORef ht
264 return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
265
266 -- -----------------------------------------------------------------------------
267 -- Inserting a key\/value pair into the hash table
268
269 -- | Inserts a key\/value mapping into the hash table.
270 --
271 -- Note that 'insert' doesn't remove the old entry from the table -
272 -- the behaviour is like an association list, where 'lookup' returns
273 -- the most-recently-inserted mapping for a key in the table. The
274 -- reason for this is to keep 'insert' as efficient as possible. If
275 -- you need to update a mapping, then we provide 'update'.
276 --
277 insert :: HashTable key val -> key -> val -> IO ()
278
279 insert ht key val =
280 updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key
281
282
283 -- ------------------------------------------------------------
284 -- The core of the implementation is lurking down here, in findBucket,
285 -- updatingBucket, and expandHashTable.
286
287 tooBig :: Int32 -> Int32 -> Bool
288 tooBig k b = k-hYSTERESIS > hLOAD * b
289
290 -- index of bucket within table.
291 bucketIndex :: Int32 -> Int32 -> Int32
292 bucketIndex mask h = h .&. mask
293
294 -- find the bucket in which the key belongs.
295 -- returns (key equality, bucket index, bucket)
296 --
297 -- This rather grab-bag approach gives enough power to do pretty much
298 -- any bucket-finding thing you might want to do. We rely on inlining
299 -- to throw away the stuff we don't want. I'm proud to say that this
300 -- plus updatingBucket below reduce most of the other definitions to a
301 -- few lines of code, while actually speeding up the hashtable
302 -- implementation when compared with a version which does everything
303 -- from scratch.
304 {-# INLINE findBucket #-}
305 findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
306 findBucket HashTable{ tab=ref, hash_fn=hash} key = do
307 table@HT{ buckets=bkts, bmask=b } <- readIORef ref
308 let indx = bucketIndex b (hash key)
309 bucket <- readHTArray bkts indx
310 return (table, indx, bucket)
311
312 data Inserts = CanInsert
313 | Can'tInsert
314 deriving (Eq)
315
316 -- updatingBucket is the real workhorse of all single-element table
317 -- updates. It takes a hashtable and a key, along with a function
318 -- describing what to do with the bucket in which that key belongs. A
319 -- flag indicates whether this function may perform table insertions.
320 -- The function returns the new contents of the bucket, the number of
321 -- bucket entries inserted (negative if entries were deleted), and a
322 -- value which becomes the return value for the function as a whole.
323 -- The table sizing is enforced here, calling out to expandSubTable as
324 -- necessary.
325
326 -- This function is intended to be inlined and specialized for every
327 -- calling context (eg every provided bucketFn).
328 {-# INLINE updatingBucket #-}
329
330 updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
331 HashTable key val -> key ->
332 IO a
333 updatingBucket canEnlarge bucketFn
334 ht@HashTable{ tab=ref, hash_fn=hash } key = do
335 (table@HT{ kcount=k, buckets=bkts, bmask=b },
336 indx, bckt) <- findBucket ht key
337 (bckt', inserts, result) <- return $ bucketFn bckt
338 let k' = k + inserts
339 table1 = table { kcount=k' }
340 bkts' <- thawArray bkts
341 writeMutArray bkts' indx bckt'
342 freezeArray bkts'
343 table2 <- if canEnlarge == CanInsert && inserts > 0 then do
344 recordIns inserts k' bckt'
345 if tooBig k' b
346 then expandHashTable hash table1
347 else return table1
348 else return table1
349 writeIORef ref table2
350 return result
351
352 expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
353 expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
354 let
355 oldsize = mask + 1
356 newmask = mask + mask + 1
357 recordResize oldsize (newmask+1)
358 --
359 if newmask > tABLE_MAX-1
360 then return table
361 else do
362 --
363 newbkts' <- newMutArray (0,newmask) []
364
365 let
366 splitBucket oldindex = do
367 bucket <- readHTArray bkts oldindex
368 let (oldb,newb) =
369 partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
370 writeMutArray newbkts' oldindex oldb
371 writeMutArray newbkts' (oldindex + oldsize) newb
372 mapM_ splitBucket [0..mask]
373
374 newbkts <- freezeArray newbkts'
375
376 return ( table{ buckets=newbkts, bmask=newmask } )
377
378 -- -----------------------------------------------------------------------------
379 -- Deleting a mapping from the hash table
380
381 -- Remove a key from a bucket
382 deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
383 deleteBucket _ [] = ([],0,())
384 deleteBucket del (pair@(k,_):bucket) =
385 case deleteBucket del bucket of
386 (bucket', dels, _) | del k -> dels' `seq` (bucket', dels', ())
387 | otherwise -> (pair:bucket', dels, ())
388 where dels' = dels - 1
389
390 -- | Remove an entry from the hash table.
391 delete :: HashTable key val -> key -> IO ()
392
393 delete ht@HashTable{ cmp=eq } key =
394 updatingBucket Can'tInsert (deleteBucket (eq key)) ht key
395
396 -- -----------------------------------------------------------------------------
397 -- Updating a mapping in the hash table
398
399 -- | Updates an entry in the hash table, returning 'True' if there was
400 -- already an entry for this key, or 'False' otherwise. After 'update'
401 -- there will always be exactly one entry for the given key in the table.
402 --
403 -- 'insert' is more efficient than 'update' if you don't care about
404 -- multiple entries, or you know for sure that multiple entries can't
405 -- occur. However, 'update' is more efficient than 'delete' followed
406 -- by 'insert'.
407 update :: HashTable key val -> key -> val -> IO Bool
408
409 update ht@HashTable{ cmp=eq } key val =
410 updatingBucket CanInsert
411 (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
412 in ((key,val):bucket', 1+dels, dels/=0))
413 ht key
414
415 -- -----------------------------------------------------------------------------
416 -- Looking up an entry in the hash table
417
418 -- | Looks up the value of a key in the hash table.
419 lookup :: HashTable key val -> key -> IO (Maybe val)
420
421 lookup ht@HashTable{ cmp=eq } key = do
422 recordLookup
423 (_, _, bucket) <- findBucket ht key
424 let firstHit (k,v) r | eq key k = Just v
425 | otherwise = r
426 return (foldr firstHit Nothing bucket)
427
428 -- -----------------------------------------------------------------------------
429 -- Converting to/from lists
430
431 -- | Convert a list of key\/value pairs into a hash table. Equality on keys
432 -- is taken from the Eq instance for the key type.
433 --
434 fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
435 fromList hash list = do
436 table <- new (==) hash
437 sequence_ [ insert table k v | (k,v) <- list ]
438 return table
439
440 -- | Converts a hash table to a list of key\/value pairs.
441 --
442 toList :: HashTable key val -> IO [(key,val)]
443 toList = mapReduce id concat
444
445 {-# INLINE mapReduce #-}
446 mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
447 mapReduce m r HashTable{ tab=ref } = do
448 HT{ buckets=bckts, bmask=b } <- readIORef ref
449 fmap r (mapM (fmap m . readHTArray bckts) [0..b])
450
451 -- -----------------------------------------------------------------------------
452 -- Diagnostics
453
454 -- | This function is useful for determining whether your hash
455 -- function is working well for your data set. It returns the longest
456 -- chain of key\/value pairs in the hash table for which all the keys
457 -- hash to the same bucket. If this chain is particularly long (say,
458 -- longer than 14 elements or so), then it might be a good idea to try
459 -- a different hash function.
460 --
461 longestChain :: HashTable key val -> IO [(key,val)]
462 longestChain = mapReduce id (maximumBy lengthCmp)
463 where lengthCmp (_:x)(_:y) = lengthCmp x y
464 lengthCmp [] [] = EQ
465 lengthCmp [] _ = LT
466 lengthCmp _ [] = GT