Wibble in docs for new ForeignPtr functionsn
[packages/random.git] / Data / ByteString / Char8.hs
1 {-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
2 --
3 -- Module : Data.ByteString.Char8
4 -- Copyright : (c) Don Stewart 2006
5 -- License : BSD-style
6 --
7 -- Maintainer : dons@cse.unsw.edu.au
8 -- Stability : experimental
9 -- Portability : portable (tested with GHC>=6.4.1 and Hugs 2005)
10 --
11
12 --
13 -- | Manipulate 'ByteString's using 'Char' operations. All Chars will be
14 -- truncated to 8 bits. It can be expected that these functions will run
15 -- at identical speeds to their 'Word8' equivalents in "Data.ByteString".
16 --
17 -- More specifically these byte strings are taken to be in the
18 -- subset of Unicode covered by code points 0-255. This covers
19 -- Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls.
20 --
21 -- See:
22 --
23 -- * <http://www.unicode.org/charts/>
24 --
25 -- * <http://www.unicode.org/charts/PDF/U0000.pdf>
26 --
27 -- * <http://www.unicode.org/charts/PDF/U0080.pdf>
28 --
29 -- This module is intended to be imported @qualified@, to avoid name
30 -- clashes with "Prelude" functions. eg.
31 --
32 -- > import qualified Data.ByteString.Char8 as B
33 --
34
35 module Data.ByteString.Char8 (
36
37 -- * The @ByteString@ type
38 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
39
40 -- * Introducing and eliminating 'ByteString's
41 empty, -- :: ByteString
42 singleton, -- :: Char -> ByteString
43 pack, -- :: String -> ByteString
44 unpack, -- :: ByteString -> String
45
46 -- * Basic interface
47 cons, -- :: Char -> ByteString -> ByteString
48 snoc, -- :: ByteString -> Char -> ByteString
49 append, -- :: ByteString -> ByteString -> ByteString
50 head, -- :: ByteString -> Char
51 last, -- :: ByteString -> Char
52 tail, -- :: ByteString -> ByteString
53 init, -- :: ByteString -> ByteString
54 null, -- :: ByteString -> Bool
55 length, -- :: ByteString -> Int
56
57 -- * Transformating ByteStrings
58 map, -- :: (Char -> Char) -> ByteString -> ByteString
59 reverse, -- :: ByteString -> ByteString
60 intersperse, -- :: Char -> ByteString -> ByteString
61 transpose, -- :: [ByteString] -> [ByteString]
62
63 -- * Reducing 'ByteString's (folds)
64 foldl, -- :: (a -> Char -> a) -> a -> ByteString -> a
65 foldl', -- :: (a -> Char -> a) -> a -> ByteString -> a
66 foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char
67 foldl1', -- :: (Char -> Char -> Char) -> ByteString -> Char
68 foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a
69 foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char
70
71 -- ** Special folds
72 concat, -- :: [ByteString] -> ByteString
73 concatMap, -- :: (Char -> ByteString) -> ByteString -> ByteString
74 any, -- :: (Char -> Bool) -> ByteString -> Bool
75 all, -- :: (Char -> Bool) -> ByteString -> Bool
76 maximum, -- :: ByteString -> Char
77 minimum, -- :: ByteString -> Char
78
79 -- * Building ByteStrings
80 -- ** Scans
81 scanl, -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
82 scanl1, -- :: (Char -> Char -> Char) -> ByteString -> ByteString
83 -- scanr, -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
84 -- scanr1, -- :: (Char -> Char -> Char) -> ByteString -> ByteString
85
86 -- ** Accumulating maps
87 -- mapAccumL, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString)
88 -- mapAccumR, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString)
89 mapIndexed, -- :: (Int -> Char -> Char) -> ByteString -> ByteString
90
91 -- * Generating and unfolding ByteStrings
92 replicate, -- :: Int -> Char -> ByteString
93 unfoldr, -- :: (a -> Maybe (Char, a)) -> a -> ByteString
94 unfoldrN, -- :: Int -> (a -> Maybe (Char, a)) -> a -> (ByteString, Maybe a)
95
96 -- * Substrings
97
98 -- ** Breaking strings
99 take, -- :: Int -> ByteString -> ByteString
100 drop, -- :: Int -> ByteString -> ByteString
101 splitAt, -- :: Int -> ByteString -> (ByteString, ByteString)
102 takeWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
103 dropWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
104 span, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
105 spanEnd, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
106 break, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
107 group, -- :: ByteString -> [ByteString]
108 groupBy, -- :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
109 inits, -- :: ByteString -> [ByteString]
110 tails, -- :: ByteString -> [ByteString]
111
112 -- ** Breaking and dropping on specific Chars
113 breakChar, -- :: Char -> ByteString -> (ByteString, ByteString)
114 spanChar, -- :: Char -> ByteString -> (ByteString, ByteString)
115 breakSpace, -- :: ByteString -> (ByteString,ByteString)
116 dropSpace, -- :: ByteString -> ByteString
117 dropSpaceEnd, -- :: ByteString -> ByteString
118
119 -- ** Breaking into many substrings
120 split, -- :: Char -> ByteString -> [ByteString]
121 splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString]
122 tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString]
123
124 -- ** Breaking into lines and words
125 lines, -- :: ByteString -> [ByteString]
126 words, -- :: ByteString -> [ByteString]
127 unlines, -- :: [ByteString] -> ByteString
128 unwords, -- :: ByteString -> [ByteString]
129
130 lines', -- :: ByteString -> [ByteString]
131 unlines', -- :: [ByteString] -> ByteString
132 linesCRLF', -- :: ByteString -> [ByteString]
133 unlinesCRLF', -- :: [ByteString] -> ByteString
134 words', -- :: ByteString -> [ByteString]
135 unwords', -- :: ByteString -> [ByteString]
136
137 lineIndices, -- :: ByteString -> [Int]
138 betweenLines, -- :: ByteString -> ByteString -> ByteString -> Maybe (ByteString)
139
140 -- ** Joining strings
141 join, -- :: ByteString -> [ByteString] -> ByteString
142 joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString
143
144
145 -- ** Searching for substrings
146 isPrefixOf, -- :: ByteString -> ByteString -> Bool
147 isSuffixOf, -- :: ByteString -> ByteString -> Bool
148 isSubstringOf, -- :: ByteString -> ByteString -> Bool
149 findSubstring, -- :: ByteString -> ByteString -> Maybe Int
150 findSubstrings, -- :: ByteString -> ByteString -> [Int]
151
152 -- * Searching ByteStrings
153
154 -- ** Searching by equality
155 elem, -- :: Char -> ByteString -> Bool
156 notElem, -- :: Char -> ByteString -> Bool
157 filterChar, -- :: Char -> ByteString -> ByteString
158 filterNotChar, -- :: Char -> ByteString -> ByteString
159
160 -- ** Searching with a predicate
161 find, -- :: (Char -> Bool) -> ByteString -> Maybe Char
162 filter, -- :: (Char -> Bool) -> ByteString -> ByteString
163 -- partition -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
164
165 -- * Indexing ByteStrings
166 index, -- :: ByteString -> Int -> Char
167 elemIndex, -- :: Char -> ByteString -> Maybe Int
168 elemIndices, -- :: Char -> ByteString -> [Int]
169 elemIndexEnd, -- :: Char -> ByteString -> Maybe Int
170 findIndex, -- :: (Char -> Bool) -> ByteString -> Maybe Int
171 findIndices, -- :: (Char -> Bool) -> ByteString -> [Int]
172 count, -- :: Char -> ByteString -> Int
173
174 -- * Zipping and unzipping ByteStrings
175 zip, -- :: ByteString -> ByteString -> [(Char,Char)]
176 zipWith, -- :: (Char -> Char -> c) -> ByteString -> ByteString -> [c]
177 unzip, -- :: [(Char,Char)] -> (ByteString,ByteString)
178
179 -- * Ordered ByteStrings
180 sort, -- :: ByteString -> ByteString
181
182 -- * Unchecked access
183 unsafeHead, -- :: ByteString -> Char
184 unsafeTail, -- :: ByteString -> ByteString
185 unsafeIndex, -- :: ByteString -> Int -> Char
186 w2c, -- :: Word8 -> Char
187 c2w, -- :: Char -> Word8
188
189 -- * Reading from ByteStrings
190 readInt, -- :: ByteString -> Maybe Int
191 unsafeReadInt, -- :: ByteString -> Maybe Int
192
193 -- * Copying ByteStrings
194 copy, -- :: ByteString -> ByteString
195
196 -- * I\/O with @ByteString@s
197
198 -- ** Standard input and output
199
200 #if defined(__GLASGOW_HASKELL__)
201 getLine, -- :: IO ByteString
202 #endif
203 getContents, -- :: IO ByteString
204 putStr, -- :: ByteString -> IO ()
205 putStrLn, -- :: ByteString -> IO ()
206
207 -- ** Files
208 readFile, -- :: FilePath -> IO ByteString
209 writeFile, -- :: FilePath -> ByteString -> IO ()
210 -- mmapFile, -- :: FilePath -> IO ByteString
211
212 -- ** I\/O with Handles
213 #if defined(__GLASGOW_HASKELL__)
214 getArgs, -- :: IO [ByteString]
215 hGetLine, -- :: Handle -> IO ByteString
216 hGetNonBlocking, -- :: Handle -> Int -> IO ByteString
217 #endif
218 hGetContents, -- :: Handle -> IO ByteString
219 hGet, -- :: Handle -> Int -> IO ByteString
220 hPut, -- :: Handle -> ByteString -> IO ()
221
222 #if defined(__GLASGOW_HASKELL__)
223 -- * Low level construction
224 -- | For constructors from foreign language types see "Data.ByteString"
225 packAddress, -- :: Addr# -> ByteString
226 unsafePackAddress, -- :: Int -> Addr# -> ByteString
227 #endif
228
229 -- * Utilities (needed for array fusion)
230 #if defined(__GLASGOW_HASKELL__)
231 unpackList,
232 #endif
233 noAL, NoAL, loopArr, loopAcc, loopSndAcc,
234 loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL,
235 filter', map'
236
237 ) where
238
239 import qualified Prelude as P
240 import Prelude hiding (reverse,head,tail,last,init,null
241 ,length,map,lines,foldl,foldr,unlines
242 ,concat,any,take,drop,splitAt,takeWhile
243 ,dropWhile,span,break,elem,filter,unwords
244 ,words,maximum,minimum,all,concatMap,scanl,scanl1
245 ,foldl1,foldr1,readFile,writeFile,replicate
246 ,getContents,getLine,putStr,putStrLn
247 ,zip,zipWith,unzip,notElem)
248
249 import qualified Data.ByteString as B
250
251 -- Listy functions transparently exported
252 import Data.ByteString (ByteString(..)
253 ,empty,null,length,tail,init,append
254 ,inits,tails,reverse,transpose
255 ,concat,take,drop,splitAt,join
256 ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
257 ,findSubstrings,unsafeTail,copy,group
258
259 ,getContents, putStr, putStrLn
260 ,readFile, {-mmapFile,-} writeFile
261 ,hGetContents, hGet, hPut
262 #if defined(__GLASGOW_HASKELL__)
263 ,getLine, getArgs, hGetLine, hGetNonBlocking
264 ,packAddress, unsafePackAddress
265 ,unpackList
266 #endif
267 ,noAL, NoAL, loopArr, loopAcc, loopSndAcc
268 ,loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL
269 ,useAsCString, unsafeUseAsCString
270 )
271
272 import Data.Char
273
274 import qualified Data.List as List (intersperse)
275
276 import Foreign
277 import Foreign.C.Types (CLong)
278 import Foreign.Marshal.Utils (with)
279
280 #if defined(__GLASGOW_HASKELL__)
281 import GHC.Base (Char(..),unsafeChr,unpackCString#,unsafeCoerce#)
282 import GHC.IOBase (IO(..),stToIO)
283 import GHC.Prim (Addr#,writeWord8OffAddr#,realWorld#,plusAddr#)
284 import GHC.Ptr (Ptr(..))
285 import GHC.ST (ST(..))
286 #endif
287
288 #define STRICT1(f) f a | a `seq` False = undefined
289 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
290 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
291
292 ------------------------------------------------------------------------
293
294 -- | /O(1)/ Convert a 'Char' into a 'ByteString'
295 singleton :: Char -> ByteString
296 singleton = B.singleton . c2w
297 {-# INLINE singleton #-}
298
299 -- | /O(n)/ Convert a 'String' into a 'ByteString'
300 --
301 -- For applications with large numbers of string literals, pack can be a
302 -- bottleneck. In such cases, consider using packAddress (GHC only).
303 pack :: String -> ByteString
304 #if !defined(__GLASGOW_HASKELL__)
305
306 pack str = B.create (P.length str) $ \p -> go p str
307 where go _ [] = return ()
308 go p (x:xs) = poke p (c2w x) >> go (p `plusPtr` 1) xs
309
310 #else /* hack away */
311
312 pack str = B.create (P.length str) $ \(Ptr p) -> stToIO (go p str)
313 where
314 go :: Addr# -> [Char] -> ST a ()
315 go _ [] = return ()
316 go p (C# c:cs) = writeByte p (unsafeCoerce# c) >> go (p `plusAddr#` 1#) cs
317
318 writeByte p c = ST $ \s# ->
319 case writeWord8OffAddr# p 0# c s# of s2# -> (# s2#, () #)
320 {-# INLINE writeByte #-}
321
322 {-# RULES
323 "pack/packAddress" forall s# .
324 pack (unpackCString# s#) = B.packAddress s#
325 #-}
326
327 #endif
328
329 {-# INLINE pack #-}
330
331 -- | /O(n)/ Converts a 'ByteString' to a 'String'.
332 unpack :: ByteString -> [Char]
333 unpack = B.unpackWith w2c
334 {-# INLINE unpack #-}
335
336 -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
337 -- complexity, as it requires a memcpy.
338 cons :: Char -> ByteString -> ByteString
339 cons = B.cons . c2w
340 {-# INLINE cons #-}
341
342 -- | /O(n)/ Append a Char to the end of a 'ByteString'. Similar to
343 -- 'cons', this function performs a memcpy.
344 snoc :: ByteString -> Char -> ByteString
345 snoc p = B.snoc p . c2w
346 {-# INLINE snoc #-}
347
348 -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
349 head :: ByteString -> Char
350 head = w2c . B.head
351 {-# INLINE head #-}
352
353 -- | /O(1)/ Extract the last element of a packed string, which must be non-empty.
354 last :: ByteString -> Char
355 last = w2c . B.last
356 {-# INLINE last #-}
357
358 -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each element of @xs@
359 map :: (Char -> Char) -> ByteString -> ByteString
360 map f = B.map (c2w . f . w2c)
361 {-# INLINE map #-}
362
363 -- | /O(n)/ The 'intersperse' function takes a Char and a 'ByteString'
364 -- and \`intersperses\' that Char between the elements of the
365 -- 'ByteString'. It is analogous to the intersperse function on Lists.
366 intersperse :: Char -> ByteString -> ByteString
367 intersperse = B.intersperse . c2w
368 {-# INLINE intersperse #-}
369
370 -- | 'foldl', applied to a binary operator, a starting value (typically
371 -- the left-identity of the operator), and a ByteString, reduces the
372 -- ByteString using the binary operator, from left to right.
373 foldl :: (a -> Char -> a) -> a -> ByteString -> a
374 foldl f = B.foldl (\a c -> f a (w2c c))
375 {-# INLINE foldl #-}
376
377 -- | 'foldl\'' is like foldl, but strict in the accumulator.
378 foldl' :: (a -> Char -> a) -> a -> ByteString -> a
379 foldl' f = B.foldl' (\a c -> f a (w2c c))
380 {-# INLINE foldl' #-}
381
382 -- | 'foldr', applied to a binary operator, a starting value
383 -- (typically the right-identity of the operator), and a packed string,
384 -- reduces the packed string using the binary operator, from right to left.
385 foldr :: (Char -> a -> a) -> a -> ByteString -> a
386 foldr f = B.foldr (\c a -> f (w2c c) a)
387 {-# INLINE foldr #-}
388
389 -- | 'foldl1' is a variant of 'foldl' that has no starting value
390 -- argument, and thus must be applied to non-empty 'ByteStrings'.
391 foldl1 :: (Char -> Char -> Char) -> ByteString -> Char
392 foldl1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
393 {-# INLINE foldl1 #-}
394
395 -- | A strict version of 'foldl1'
396 foldl1' :: (Char -> Char -> Char) -> ByteString -> Char
397 foldl1' f ps = w2c (B.foldl1' (\x y -> c2w (f (w2c x) (w2c y))) ps)
398 {-# INLINE foldl1' #-}
399
400 -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
401 -- and thus must be applied to non-empty 'ByteString's
402 foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
403 foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
404 {-# INLINE foldr1 #-}
405
406 -- | Map a function over a 'ByteString' and concatenate the results
407 concatMap :: (Char -> ByteString) -> ByteString -> ByteString
408 concatMap f = B.concatMap (f . w2c)
409 {-# INLINE concatMap #-}
410
411 -- | Applied to a predicate and a ByteString, 'any' determines if
412 -- any element of the 'ByteString' satisfies the predicate.
413 any :: (Char -> Bool) -> ByteString -> Bool
414 any f = B.any (f . w2c)
415 {-# INLINE any #-}
416
417 -- | Applied to a predicate and a 'ByteString', 'all' determines if
418 -- all elements of the 'ByteString' satisfy the predicate.
419 all :: (Char -> Bool) -> ByteString -> Bool
420 all f = B.all (f . w2c)
421 {-# INLINE all #-}
422
423 -- | 'maximum' returns the maximum value from a 'ByteString'
424 maximum :: ByteString -> Char
425 maximum = w2c . B.maximum
426 {-# INLINE maximum #-}
427
428 -- | 'minimum' returns the minimum value from a 'ByteString'
429 minimum :: ByteString -> Char
430 minimum = w2c . B.minimum
431 {-# INLINE minimum #-}
432
433 -- | /O(n)/ map Char functions, provided with the index at each position
434 mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString
435 mapIndexed f = B.mapIndexed (\i c -> c2w (f i (w2c c)))
436 {-# INLINE mapIndexed #-}
437
438 -- | 'scanl' is similar to 'foldl', but returns a list of successive
439 -- reduced values from the left:
440 --
441 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
442 --
443 -- Note that
444 --
445 -- > last (scanl f z xs) == foldl f z xs.
446 scanl :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
447 scanl f z = B.scanl (\a b -> c2w (f (w2c a) (w2c b))) (c2w z)
448
449 -- | 'scanl1' is a variant of 'scanl' that has no starting value argument:
450 --
451 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
452 scanl1 :: (Char -> Char -> Char) -> ByteString -> ByteString
453 scanl1 f = B.scanl1 (\a b -> c2w (f (w2c a) (w2c b)))
454
455 -- | /O(n)/ 'replicate' @n x@ is a ByteString of length @n@ with @x@
456 -- the value of every element. The following holds:
457 --
458 -- > replicate w c = unfoldr w (\u -> Just (u,u)) c
459 --
460 -- This implemenation uses @memset(3)@
461 replicate :: Int -> Char -> ByteString
462 replicate w = B.replicate w . c2w
463 {-# INLINE replicate #-}
464
465 -- | /O(n)/, where /n/ is the length of the result. The 'unfoldr'
466 -- function is analogous to the List \'unfoldr\'. 'unfoldr' builds a
467 -- ByteString from a seed value. The function takes the element and
468 -- returns 'Nothing' if it is done producing the ByteString or returns
469 -- 'Just' @(a,b)@, in which case, @a@ is the next character in the string,
470 -- and @b@ is the seed value for further production.
471 --
472 -- Examples:
473 --
474 -- > unfoldr (\x -> if x <= '9' then Just (x, succ x) else Nothing) '0' == "0123456789"
475 unfoldr :: (a -> Maybe (Char, a)) -> a -> ByteString
476 unfoldr f x0 = B.unfoldr (fmap k . f) x0
477 where k (i, j) = (c2w i, j)
478
479 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a ByteString from a seed
480 -- value. However, the length of the result is limited by the first
481 -- argument to 'unfoldrN'. This function is more efficient than 'unfoldr'
482 -- when the maximum length of the result is known.
483 --
484 -- The following equation relates 'unfoldrN' and 'unfoldr':
485 --
486 -- > unfoldrN n f s == take n (unfoldr f s)
487 unfoldrN :: Int -> (a -> Maybe (Char, a)) -> a -> (ByteString, Maybe a)
488 unfoldrN n f w = B.unfoldrN n ((k `fmap`) . f) w
489 where k (i,j) = (c2w i, j)
490 {-# INLINE unfoldrN #-}
491
492 -- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
493 -- returns the longest prefix (possibly empty) of @xs@ of elements that
494 -- satisfy @p@.
495 takeWhile :: (Char -> Bool) -> ByteString -> ByteString
496 takeWhile f = B.takeWhile (f . w2c)
497 {-# INLINE takeWhile #-}
498
499 -- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
500 dropWhile :: (Char -> Bool) -> ByteString -> ByteString
501 dropWhile f = B.dropWhile (f . w2c)
502 {-# INLINE dropWhile #-}
503
504 -- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
505 break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
506 break f = B.break (f . w2c)
507 {-# INLINE break #-}
508
509 -- | 'span' @p xs@ breaks the ByteString into two segments. It is
510 -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
511 span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
512 span f = B.span (f . w2c)
513 {-# INLINE span #-}
514
515 -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
516 -- We have
517 --
518 -- > spanEnd (not.isSpace) "x y z" == ("x y ","z")
519 --
520 -- and
521 --
522 -- > spanEnd (not . isSpace) ps
523 -- > ==
524 -- > let (x,y) = span (not.isSpace) (reverse ps) in (reverse y, reverse x)
525 --
526 spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
527 spanEnd f = B.spanEnd (f . w2c)
528 {-# INLINE spanEnd #-}
529
530 -- | 'breakChar' breaks its ByteString argument at the first occurence
531 -- of the specified Char. It is more efficient than 'break' as it is
532 -- implemented with @memchr(3)@. I.e.
533 --
534 -- > break (=='c') "abcd" == breakChar 'c' "abcd"
535 --
536 breakChar :: Char -> ByteString -> (ByteString, ByteString)
537 breakChar = B.breakByte . c2w
538 {-# INLINE breakChar #-}
539
540 -- | 'spanChar' breaks its ByteString argument at the first
541 -- occurence of a Char other than its argument. It is more efficient
542 -- than 'span (==)'
543 --
544 -- > span (=='c') "abcd" == spanByte 'c' "abcd"
545 --
546 spanChar :: Char -> ByteString -> (ByteString, ByteString)
547 spanChar = B.spanByte . c2w
548 {-# INLINE spanChar #-}
549
550 -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
551 -- argument, consuming the delimiter. I.e.
552 --
553 -- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
554 -- > split 'a' "aXaXaXa" == ["","X","X","X"]
555 -- > split 'x' "x" == ["",""]
556 --
557 -- and
558 --
559 -- > join [c] . split c == id
560 -- > split == splitWith . (==)
561 --
562 -- As for all splitting functions in this library, this function does
563 -- not copy the substrings, it just constructs new 'ByteStrings' that
564 -- are slices of the original.
565 --
566 split :: Char -> ByteString -> [ByteString]
567 split = B.split . c2w
568 {-# INLINE split #-}
569
570 -- | /O(n)/ Splits a 'ByteString' into components delimited by
571 -- separators, where the predicate returns True for a separator element.
572 -- The resulting components do not contain the separators. Two adjacent
573 -- separators result in an empty component in the output. eg.
574 --
575 -- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
576 --
577 splitWith :: (Char -> Bool) -> ByteString -> [ByteString]
578 splitWith f = B.splitWith (f . w2c)
579 {-# INLINE splitWith #-}
580 -- the inline makes a big difference here.
581
582 -- | Like 'splitWith', except that sequences of adjacent separators are
583 -- treated as a single separator. eg.
584 --
585 -- > tokens (=='a') "aabbaca" == ["bb","c"]
586 --
587 tokens :: (Char -> Bool) -> ByteString -> [ByteString]
588 tokens f = B.tokens (f . w2c)
589 {-# INLINE tokens #-}
590
591 -- | The 'groupBy' function is the non-overloaded version of 'group'.
592 groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
593 groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b))
594
595 -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
596 -- char. Around 4 times faster than the generalised join.
597 --
598 joinWithChar :: Char -> ByteString -> ByteString -> ByteString
599 joinWithChar = B.joinWithByte . c2w
600 {-# INLINE joinWithChar #-}
601
602 -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
603 index :: ByteString -> Int -> Char
604 index = (w2c .) . B.index
605 {-# INLINE index #-}
606
607 -- | /O(n)/ The 'elemIndex' function returns the index of the first
608 -- element in the given 'ByteString' which is equal (by memchr) to the
609 -- query element, or 'Nothing' if there is no such element.
610 elemIndex :: Char -> ByteString -> Maybe Int
611 elemIndex = B.elemIndex . c2w
612 {-# INLINE elemIndex #-}
613
614 -- | /O(n)/ The 'elemIndexEnd' function returns the last index of the
615 -- element in the given 'ByteString' which is equal to the query
616 -- element, or 'Nothing' if there is no such element. The following
617 -- holds:
618 --
619 -- > elemIndexEnd c xs ==
620 -- > (-) (length xs - 1) `fmap` elemIndex c (reverse xs)
621 --
622 elemIndexEnd :: Char -> ByteString -> Maybe Int
623 elemIndexEnd = B.elemIndexEnd . c2w
624 {-# INLINE elemIndexEnd #-}
625
626 -- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
627 -- the indices of all elements equal to the query element, in ascending order.
628 elemIndices :: Char -> ByteString -> [Int]
629 elemIndices = B.elemIndices . c2w
630 {-# INLINE elemIndices #-}
631
632 -- | The 'findIndex' function takes a predicate and a 'ByteString' and
633 -- returns the index of the first element in the ByteString satisfying the predicate.
634 findIndex :: (Char -> Bool) -> ByteString -> Maybe Int
635 findIndex f = B.findIndex (f . w2c)
636 {-# INLINE findIndex #-}
637
638 -- | The 'findIndices' function extends 'findIndex', by returning the
639 -- indices of all elements satisfying the predicate, in ascending order.
640 findIndices :: (Char -> Bool) -> ByteString -> [Int]
641 findIndices f = B.findIndices (f . w2c)
642
643 -- | count returns the number of times its argument appears in the ByteString
644 --
645 -- > count = length . elemIndices
646 --
647 -- Also
648 --
649 -- > count '\n' == length . lines
650 --
651 -- But more efficiently than using length on the intermediate list.
652 count :: Char -> ByteString -> Int
653 count c = B.count (c2w c)
654
655 -- | /O(n)/ 'elem' is the 'ByteString' membership predicate. This
656 -- implementation uses @memchr(3)@.
657 elem :: Char -> ByteString -> Bool
658 elem c = B.elem (c2w c)
659 {-# INLINE elem #-}
660
661 -- | /O(n)/ 'notElem' is the inverse of 'elem'
662 notElem :: Char -> ByteString -> Bool
663 notElem c = B.notElem (c2w c)
664 {-# INLINE notElem #-}
665
666 -- | /O(n)/ 'filter', applied to a predicate and a ByteString,
667 -- returns a ByteString containing those characters that satisfy the
668 -- predicate.
669 filter :: (Char -> Bool) -> ByteString -> ByteString
670 filter f = B.filter (f . w2c)
671 {-# INLINE filter #-}
672
673 -- | /O(n)/ The 'find' function takes a predicate and a ByteString,
674 -- and returns the first element in matching the predicate, or 'Nothing'
675 -- if there is no such element.
676 find :: (Char -> Bool) -> ByteString -> Maybe Char
677 find f ps = w2c `fmap` B.find (f . w2c) ps
678 {-# INLINE find #-}
679
680 -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
681 -- case of filtering a single Char. It is more efficient to use
682 -- filterChar in this case.
683 --
684 -- > filterChar == filter . (==)
685 --
686 -- filterChar is around 10x faster, and uses much less space, than its
687 -- filter equivalent
688 --
689 filterChar :: Char -> ByteString -> ByteString
690 filterChar c = B.filterByte (c2w c)
691 {-# INLINE filterChar #-}
692
693 -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
694 -- case of filtering a single Char out of a list. It is more efficient
695 -- to use /filterNotChar/ in this case.
696 --
697 -- > filterNotChar == filter . (/=)
698 --
699 -- filterNotChar is around 3x faster, and uses much less space, than its
700 -- filter equivalent
701 --
702 filterNotChar :: Char -> ByteString -> ByteString
703 filterNotChar c = B.filterNotByte (c2w c)
704 {-# INLINE filterNotChar #-}
705
706 -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
707 -- corresponding pairs of Chars. If one input ByteString is short,
708 -- excess elements of the longer ByteString are discarded. This is
709 -- equivalent to a pair of 'unpack' operations, and so space
710 -- usage may be large for multi-megabyte ByteStrings
711 zip :: ByteString -> ByteString -> [(Char,Char)]
712 zip ps qs
713 | B.null ps || B.null qs = []
714 | otherwise = (unsafeHead ps, unsafeHead qs) : zip (B.unsafeTail ps) (B.unsafeTail qs)
715
716 -- | 'zipWith' generalises 'zip' by zipping with the function given as
717 -- the first argument, instead of a tupling function. For example,
718 -- @'zipWith' (+)@ is applied to two ByteStrings to produce the list
719 -- of corresponding sums.
720 zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a]
721 zipWith f = B.zipWith ((. w2c) . f . w2c)
722
723 -- | 'unzip' transforms a list of pairs of Chars into a pair of
724 -- ByteStrings. Note that this performs two 'pack' operations.
725 unzip :: [(Char,Char)] -> (ByteString,ByteString)
726 unzip ls = (pack (P.map fst ls), pack (P.map snd ls))
727 {-# INLINE unzip #-}
728
729 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits
730 -- the check for the empty case, which is good for performance, but
731 -- there is an obligation on the programmer to provide a proof that the
732 -- ByteString is non-empty.
733 unsafeHead :: ByteString -> Char
734 unsafeHead = w2c . B.unsafeHead
735 {-# INLINE unsafeHead #-}
736
737 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a Char.
738 -- This omits the bounds check, which means there is an accompanying
739 -- obligation on the programmer to ensure the bounds are checked in some
740 -- other way.
741 unsafeIndex :: ByteString -> Int -> Char
742 unsafeIndex = (w2c .) . B.unsafeIndex
743 {-# INLINE unsafeIndex #-}
744
745 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
746 w2c :: Word8 -> Char
747 #if !defined(__GLASGOW_HASKELL__)
748 w2c = chr . fromIntegral
749 #else
750 w2c = unsafeChr . fromIntegral
751 #endif
752 {-# INLINE w2c #-}
753
754 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
755 -- silently truncates to 8 bits Chars > '\255'. It is provided as
756 -- convenience for ByteString construction.
757 c2w :: Char -> Word8
758 c2w = fromIntegral . ord
759 {-# INLINE c2w #-}
760
761 -- ---------------------------------------------------------------------
762 -- Things that depend on the encoding
763
764 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
765 -- broken at the first whitespace byte. I.e.
766 --
767 -- > break isSpace == breakSpace
768 --
769 breakSpace :: ByteString -> (ByteString,ByteString)
770 breakSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
771 i <- firstspace (p `plusPtr` s) 0 l
772 return $ case () of {_
773 | i == 0 -> (empty, PS x s l)
774 | i == l -> (PS x s l, empty)
775 | otherwise -> (PS x s i, PS x (s+i) (l-i))
776 }
777 {-# INLINE breakSpace #-}
778
779 firstspace :: Ptr Word8 -> Int -> Int -> IO Int
780 STRICT3(firstspace)
781 firstspace ptr n m
782 | n >= m = return n
783 | otherwise = do w <- peekByteOff ptr n
784 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
785
786 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
787 -- white space Chars removed from the front. It is more efficient than
788 -- calling dropWhile for removing whitespace. I.e.
789 --
790 -- > dropWhile isSpace == dropSpace
791 --
792 dropSpace :: ByteString -> ByteString
793 dropSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
794 i <- firstnonspace (p `plusPtr` s) 0 l
795 return $ if i == l then empty else PS x (s+i) (l-i)
796 {-# INLINE dropSpace #-}
797
798 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
799 STRICT3(firstnonspace)
800 firstnonspace ptr n m
801 | n >= m = return n
802 | otherwise = do w <- peekElemOff ptr n
803 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
804
805 -- | 'dropSpaceEnd' efficiently returns the 'ByteString' argument with
806 -- white space removed from the end. I.e.
807 --
808 -- > reverse . (dropWhile isSpace) . reverse == dropSpaceEnd
809 --
810 -- but it is more efficient than using multiple reverses.
811 --
812 dropSpaceEnd :: ByteString -> ByteString
813 dropSpaceEnd (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
814 i <- lastnonspace (p `plusPtr` s) (l-1)
815 return $ if i == (-1) then empty else PS x s (i+1)
816 {-# INLINE dropSpaceEnd #-}
817
818 lastnonspace :: Ptr Word8 -> Int -> IO Int
819 STRICT2(lastnonspace)
820 lastnonspace ptr n
821 | n < 0 = return n
822 | otherwise = do w <- peekElemOff ptr n
823 if isSpaceWord8 w then lastnonspace ptr (n-1) else return n
824
825 -- | 'lines' breaks a ByteString up into a list of ByteStrings at
826 -- newline Chars. The resulting strings do not contain newlines.
827 --
828 lines :: ByteString -> [ByteString]
829 lines ps
830 | null ps = []
831 | otherwise = case search ps of
832 Nothing -> [ps]
833 Just n -> take n ps : lines (drop (n+1) ps)
834 where search = elemIndex '\n'
835 {-# INLINE lines #-}
836
837 {-# Bogus rule, wrong if there's not \n at end of line
838
839 "length.lines/count"
840 P.length . lines = count '\n'
841
842 #-}
843
844 {-
845 -- Just as fast, but more complex. Should be much faster, I thought.
846 lines :: ByteString -> [ByteString]
847 lines (PS _ _ 0) = []
848 lines (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
849 let ptr = p `plusPtr` s
850
851 STRICT1(loop)
852 loop n = do
853 let q = memchr (ptr `plusPtr` n) 0x0a (fromIntegral (l-n))
854 if q == nullPtr
855 then return [PS x (s+n) (l-n)]
856 else do let i = q `minusPtr` ptr
857 ls <- loop (i+1)
858 return $! PS x (s+n) (i-n) : ls
859 loop 0
860 -}
861
862 -- | 'unlines' is an inverse operation to 'lines'. It joins lines,
863 -- after appending a terminating newline to each.
864 unlines :: [ByteString] -> ByteString
865 unlines [] = empty
866 unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space
867 where nl = singleton '\n'
868
869 -- | 'words' breaks a ByteString up into a list of words, which
870 -- were delimited by Chars representing white space. And
871 --
872 -- > tokens isSpace = words
873 --
874 words :: ByteString -> [ByteString]
875 words = B.tokens isSpaceWord8
876 {-# INLINE words #-}
877
878 -- | The 'unwords' function is analogous to the 'unlines' function, on words.
879 unwords :: [ByteString] -> ByteString
880 unwords = join (singleton ' ')
881 {-# INLINE unwords #-}
882
883 -- | /O(n)/ Indicies of newlines. Shorthand for
884 --
885 -- > elemIndices '\n'
886 --
887 lineIndices :: ByteString -> [Int]
888 lineIndices = elemIndices '\n'
889 {-# INLINE lineIndices #-}
890
891 -- | 'lines\'' behaves like 'lines', in that it breaks a ByteString on
892 -- newline Chars. However, unlike the Prelude functions, 'lines\'' and
893 -- 'unlines\'' correctly reconstruct lines that are missing terminating
894 -- newlines characters. I.e.
895 --
896 -- > unlines (lines "a\nb\nc") == "a\nb\nc\n"
897 -- > unlines' (lines' "a\nb\nc") == "a\nb\nc"
898 --
899 -- Note that this means:
900 --
901 -- > lines "a\nb\nc\n" == ["a","b","c"]
902 -- > lines' "a\nb\nc\n" == ["a","b","c",""]
903 --
904 lines' :: ByteString -> [ByteString]
905 lines' ps = ps `seq` case elemIndex '\n' ps of
906 Nothing -> [ps]
907 Just n -> take n ps : lines' (drop (n+1) ps)
908
909 -- | 'linesCRLF\'' behaves like 'lines\'', but breaks on (\\cr?\\lf)
910 linesCRLF' :: ByteString -> [ByteString]
911 linesCRLF' ps = ps `seq` case elemIndex '\n' ps of
912 Nothing -> [ps]
913 Just 0 -> empty : linesCRLF' (drop 1 ps)
914 Just n -> let k = if ps `unsafeIndex` (n-1) == '\r' then n-1 else n
915 in take k ps : linesCRLF' (drop (n+1) ps)
916
917 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
918 -- retores lines that do not have terminating newlines (see the
919 -- description for 'lines\'').
920 --
921 unlines' :: [ByteString] -> ByteString
922 unlines' ss = concat $ intersperse_newlines ss
923 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
924 intersperse_newlines s = s
925 newline = singleton '\n'
926
927 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
928 -- retores lines that do not have terminating newlines (see the
929 -- description for 'lines\''). Uses CRLF instead of LF.
930 --
931 unlinesCRLF' :: [ByteString] -> ByteString
932 unlinesCRLF' ss = concat $ intersperse_newlines ss
933 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
934 intersperse_newlines s = s
935 newline = pack "\r\n"
936
937 -- | 'words\'' behaves like 'words', with the exception that it produces
938 -- output on ByteStrings with trailing whitespace that can be
939 -- correctly inverted by 'unwords'. I.e.
940 --
941 -- > words "a b c " == ["a","b","c"]
942 -- > words' "a b c " == ["a","b","c",""]
943 --
944 -- > unwords $ words "a b c " == "a b c"
945 -- > unwords $ words' "a b c " == "a b c "
946 --
947 words' :: ByteString -> [ByteString]
948 words' = B.splitWith isSpaceWord8
949
950 -- | 'unwords\'' behaves like 'unwords'. It is provided for consistency
951 -- with the other invertable words and lines functions.
952 unwords' :: [ByteString] -> ByteString
953 unwords' = unwords
954
955 -- | 'betweenLines' returns the ByteString between the two lines given,
956 -- or Nothing if they do not appear. The returned string is the first
957 -- and shortest string such that the line before it is the given first
958 -- line, and the line after it is the given second line.
959 betweenLines :: ByteString -- ^ First line to look for
960 -> ByteString -- ^ Second line to look for
961 -> ByteString -- ^ 'ByteString' to look in
962 -> Maybe (ByteString)
963
964 betweenLines start end ps =
965 case P.break (start ==) (lines ps) of
966 (_, _:rest@(PS ps1 s1 _:_)) ->
967 case P.break (end ==) rest of
968 (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1)
969 _ -> Nothing
970 _ -> Nothing
971
972 -- ---------------------------------------------------------------------
973 -- Reading from ByteStrings
974
975 -- | readInt skips any whitespace at the beginning of its argument, and
976 -- reads an Int from the beginning of the ByteString. If there is no
977 -- integer at the beginning of the string, it returns Nothing, otherwise
978 -- it just returns the int read, and the rest of the string.
979 readInt :: ByteString -> Maybe (Int, ByteString)
980 readInt p@(PS x s l) = inlinePerformIO $ useAsCString p $ \cstr ->
981 with (castPtr cstr) $ \endpp -> do
982 val <- c_strtol (castPtr cstr) endpp 0
983 skipped <- (`minusPtr` cstr) `fmap` peek endpp
984 return $ if skipped == 0
985 then Nothing
986 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
987
988 -- | unsafeReadInt is like readInt, but requires a null terminated
989 -- ByteString. It avoids a copy if this is the case. It returns the Int
990 -- read, if any, and the rest of the string.
991 unsafeReadInt :: ByteString -> Maybe (Int, ByteString)
992 unsafeReadInt p@(PS x s l) = inlinePerformIO $ unsafeUseAsCString p $ \cstr ->
993 with (castPtr cstr) $ \endpp -> do
994 val <- c_strtol (castPtr cstr) endpp 0
995 skipped <- (`minusPtr` cstr) `fmap` peek endpp
996 return $ if skipped == 0
997 then Nothing
998 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
999
1000 foreign import ccall unsafe "stdlib.h strtol" c_strtol
1001 :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong
1002
1003 {-
1004 --
1005 -- not quite there yet
1006 --
1007 readInt :: ByteString -> Maybe (Int, ByteString)
1008 readInt = go 0
1009 where
1010 STRICT2(go)
1011 go i ps
1012 | B.null ps = Nothing
1013 | x == '-' = neg 0 xs
1014 | otherwise = pos (parse x) xs
1015 where (x, xs) = (ps `unsafeIndex` 0, unsafeTail ps)
1016
1017 STRICT2(neg)
1018 neg n qs | isSpace x = return $ Just ((i-n),xs)
1019 | otherwise = neg (parse x + (10 * n)) xs
1020 where (x, xs) = (qs `unsafeIndex` 0, unsafeTail qs)
1021
1022 STRICT2(pos)
1023 pos n qs | isSpace x = go (i+n) xs
1024 | otherwise = pos (parse x + (10 * n)) xs
1025 where (x, xs) = (qs `unsafeIndexWord8` 0, unsafeTail qs)
1026
1027 parse w = fromIntegral (w - 48) :: Int
1028 {-# INLINE parse #-}
1029 -}
1030
1031 -- ---------------------------------------------------------------------
1032 -- Internals
1033
1034 -- Just like inlinePerformIO, but we inline it. Big performance gains as
1035 -- it exposes lots of things to further inlining
1036 --
1037 {-# INLINE inlinePerformIO #-}
1038 inlinePerformIO :: IO a -> a
1039 #if defined(__GLASGOW_HASKELL__)
1040 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
1041 #else
1042 inlinePerformIO = unsafePerformIO
1043 #endif
1044
1045 -- Selects white-space characters in the Latin-1 range
1046 -- ordered by frequency
1047 -- Idea from Ketil
1048 isSpaceWord8 :: Word8 -> Bool
1049 isSpaceWord8 w = case w of
1050 0x20 -> True -- SPACE
1051 0x0A -> True -- LF, \n
1052 0x09 -> True -- HT, \t
1053 0x0C -> True -- FF, \f
1054 0x0D -> True -- CR, \r
1055 0x0B -> True -- VT, \v
1056 0xA0 -> True -- spotted by QC..
1057 _ -> False
1058 {-# INLINE isSpaceWord8 #-}
1059
1060 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
1061 -- slightly faster for one-shot cases.
1062 map' :: (Char -> Char) -> ByteString -> ByteString
1063 map' f = B.map' (c2w . f . w2c)
1064
1065 -- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be
1066 -- around 2x faster for some one-shot applications.
1067 filter' :: (Char -> Bool) -> ByteString -> ByteString
1068 filter' f = B.filter' (f . w2c)