Merge pull request #125 from Jubobs/master
[packages/text.git] / Data / Text / Lazy.hs
1 {-# OPTIONS_GHC -fno-warn-orphans #-}
2 {-# LANGUAGE BangPatterns, MagicHash, CPP #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text.Lazy
12 -- Copyright : (c) 2009, 2010, 2012 Bryan O'Sullivan
13 --
14 -- License : BSD-style
15 -- Maintainer : bos@serpentine.com
16 -- Stability : experimental
17 -- Portability : GHC
18 --
19 -- A time and space-efficient implementation of Unicode text using
20 -- lists of packed arrays.
21 --
22 -- /Note/: Read below the synopsis for important notes on the use of
23 -- this module.
24 --
25 -- The representation used by this module is suitable for high
26 -- performance use and for streaming large quantities of data. It
27 -- provides a means to manipulate a large body of text without
28 -- requiring that the entire content be resident in memory.
29 --
30 -- Some operations, such as 'concat', 'append', 'reverse' and 'cons',
31 -- have better time complexity than their "Data.Text" equivalents, due
32 -- to the underlying representation being a list of chunks. For other
33 -- operations, lazy 'Text's are usually within a few percent of strict
34 -- ones, but often with better heap usage if used in a streaming
35 -- fashion. For data larger than available memory, or if you have
36 -- tight memory constraints, this module will be the only option.
37 --
38 -- This module is intended to be imported @qualified@, to avoid name
-- clashes with "Prelude" functions, e.g.
40 --
41 -- > import qualified Data.Text.Lazy as L
42
43 module Data.Text.Lazy
44 (
45 -- * Fusion
46 -- $fusion
47
48 -- * Acceptable data
49 -- $replacement
50
51 -- * Types
52 Text
53
54 -- * Creation and elimination
55 , pack
56 , unpack
57 , singleton
58 , empty
59 , fromChunks
60 , toChunks
61 , toStrict
62 , fromStrict
63 , foldrChunks
64 , foldlChunks
65
66 -- * Basic interface
67 , cons
68 , snoc
69 , append
70 , uncons
71 , head
72 , last
73 , tail
74 , init
75 , null
76 , length
77 , compareLength
78
79 -- * Transformations
80 , map
81 , intercalate
82 , intersperse
83 , transpose
84 , reverse
85 , replace
86
87 -- ** Case conversion
88 -- $case
89 , toCaseFold
90 , toLower
91 , toUpper
92 , toTitle
93
94 -- ** Justification
95 , justifyLeft
96 , justifyRight
97 , center
98
99 -- * Folds
100 , foldl
101 , foldl'
102 , foldl1
103 , foldl1'
104 , foldr
105 , foldr1
106
107 -- ** Special folds
108 , concat
109 , concatMap
110 , any
111 , all
112 , maximum
113 , minimum
114
115 -- * Construction
116
117 -- ** Scans
118 , scanl
119 , scanl1
120 , scanr
121 , scanr1
122
123 -- ** Accumulating maps
124 , mapAccumL
125 , mapAccumR
126
127 -- ** Generation and unfolding
128 , repeat
129 , replicate
130 , cycle
131 , iterate
132 , unfoldr
133 , unfoldrN
134
135 -- * Substrings
136
137 -- ** Breaking strings
138 , take
139 , takeEnd
140 , drop
141 , dropEnd
142 , takeWhile
143 , takeWhileEnd
144 , dropWhile
145 , dropWhileEnd
146 , dropAround
147 , strip
148 , stripStart
149 , stripEnd
150 , splitAt
151 , span
152 , breakOn
153 , breakOnEnd
154 , break
155 , group
156 , groupBy
157 , inits
158 , tails
159
160 -- ** Breaking into many substrings
161 -- $split
162 , splitOn
163 , split
164 , chunksOf
165 -- , breakSubstring
166
167 -- ** Breaking into lines and words
168 , lines
169 , words
170 , unlines
171 , unwords
172
173 -- * Predicates
174 , isPrefixOf
175 , isSuffixOf
176 , isInfixOf
177
178 -- ** View patterns
179 , stripPrefix
180 , stripSuffix
181 , commonPrefixes
182
183 -- * Searching
184 , filter
185 , find
186 , breakOnAll
187 , partition
188
189 -- , findSubstring
190
191 -- * Indexing
192 , index
193 , count
194
195 -- * Zipping and unzipping
196 , zip
197 , zipWith
198
199 -- -* Ordered text
200 -- , sort
201 ) where
202
203 import Prelude (Char, Bool(..), Maybe(..), String,
204 Eq(..), Ord(..), Ordering(..), Read(..), Show(..),
205 (&&), (||), (+), (-), (.), ($), (++),
206 error, flip, fmap, fromIntegral, not, otherwise, quot)
207 import qualified Prelude as P
208 #if defined(HAVE_DEEPSEQ)
209 import Control.DeepSeq (NFData(..))
210 #endif
211 import Data.Int (Int64)
212 import qualified Data.List as L
213 import Data.Char (isSpace)
214 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
215 Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
216 import Data.Binary (Binary(get, put))
217 import Data.Monoid (Monoid(..))
218 #if MIN_VERSION_base(4,9,0)
219 import Data.Semigroup (Semigroup(..))
220 #endif
221 import Data.String (IsString(..))
222 import qualified Data.Text as T
223 import qualified Data.Text.Internal as T
224 import qualified Data.Text.Internal.Fusion.Common as S
225 import qualified Data.Text.Unsafe as T
226 import qualified Data.Text.Internal.Lazy.Fusion as S
227 import Data.Text.Internal.Fusion.Types (PairS(..))
228 import Data.Text.Internal.Lazy.Fusion (stream, unstream)
229 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldlChunks,
230 foldrChunks, smallChunkSize)
231 import Data.Text.Internal (firstf, safe, text)
232 import Data.Text.Lazy.Encoding (decodeUtf8', encodeUtf8)
233 import qualified Data.Text.Internal.Functions as F
234 import Data.Text.Internal.Lazy.Search (indices)
235 #if __GLASGOW_HASKELL__ >= 702
236 import qualified GHC.CString as GHC
237 #else
238 import qualified GHC.Base as GHC
239 #endif
240 #if __GLASGOW_HASKELL__ >= 708
241 import qualified GHC.Exts as Exts
242 #endif
243 import GHC.Prim (Addr#)
244 #if MIN_VERSION_base(4,7,0)
245 import Text.Printf (PrintfArg, formatArg, formatString)
246 #endif
247
248 -- $fusion
249 --
250 -- Most of the functions in this module are subject to /fusion/,
251 -- meaning that a pipeline of such functions will usually allocate at
252 -- most one 'Text' value.
253 --
254 -- As an example, consider the following pipeline:
255 --
256 -- > import Data.Text.Lazy as T
257 -- > import Data.Text.Lazy.Encoding as E
258 -- > import Data.ByteString.Lazy (ByteString)
259 -- >
260 -- > countChars :: ByteString -> Int
261 -- > countChars = T.length . T.toUpper . E.decodeUtf8
262 --
263 -- From the type signatures involved, this looks like it should
264 -- allocate one 'ByteString' value, and two 'Text' values. However,
265 -- when a module is compiled with optimisation enabled under GHC, the
266 -- two intermediate 'Text' values will be optimised away, and the
267 -- function will be compiled down to a single loop over the source
268 -- 'ByteString'.
269 --
270 -- Functions that can be fused by the compiler are documented with the
271 -- phrase \"Subject to fusion\".
272
273 -- $replacement
274 --
275 -- A 'Text' value is a sequence of Unicode scalar values, as defined
276 -- in §3.9, definition D76 of the Unicode 5.2 standard:
277 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35>. As
278 -- such, a 'Text' cannot contain values in the range U+D800 to U+DFFF
279 -- inclusive. Haskell implementations admit all Unicode code points
280 -- (&#xa7;3.4, definition D10) as 'Char' values, including code points
281 -- from this invalid range. This means that there are some 'Char'
282 -- values that are not valid Unicode scalar values, and the functions
283 -- in this module must handle those cases.
284 --
285 -- Within this module, many functions construct a 'Text' from one or
286 -- more 'Char' values. Those functions will substitute 'Char' values
287 -- that are not valid Unicode scalar values with the replacement
288 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
289 -- inspection and replacement are documented with the phrase
290 -- \"Performs replacement on invalid scalar values\".
291 --
292 -- (One reason for this policy of replacement is that internally, a
293 -- 'Text' value is represented as packed UTF-16 data. Values in the
294 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
295 -- code points, and so cannot be represented. The functions replace
296 -- invalid scalar values, instead of dropping them, as a security
297 -- measure. For details, see Unicode Technical Report 36, &#xa7;3.5:
298 -- <http://unicode.org/reports/tr36#Deletion_of_Noncharacters>)
299
-- | Chunk-boundary-insensitive equality for lazy 'Text'.  When the
-- leading chunks differ in length (in UTF-16 code units), the longer
-- chunk is split so its prefix can be compared against the shorter
-- one, and the unconsumed suffix is pushed back onto that side before
-- recursing.
equal :: Text -> Text -> Bool
equal Empty Empty = True
equal Empty _ = False
equal _ Empty = False
equal (Chunk a as) (Chunk b bs) =
    case compare lenA lenB of
      LT -> a == (T.takeWord16 lenA b) &&
            as `equal` Chunk (T.dropWord16 lenA b) bs
      EQ -> a == b && as `equal` bs
      GT -> T.takeWord16 lenB a == b &&
            Chunk (T.dropWord16 lenB a) as `equal` bs
  -- lengths are in Word16 units (the internal UTF-16 representation),
  -- not characters, matching take/dropWord16 above.
  where lenA = T.lengthWord16 a
        lenB = T.lengthWord16 b
313
-- Equality is independent of how the text is divided into chunks;
-- see 'equal'.
instance Eq Text where
    (==) = equal
    {-# INLINE (==) #-}

-- Ordering is lexicographic by character; see 'compareText'.
instance Ord Text where
    compare = compareText
320
-- | Lexicographic comparison of two lazy 'Text's, character by
-- character, independent of chunk boundaries.
compareText :: Text -> Text -> Ordering
compareText Empty Empty = EQ
compareText Empty _ = LT
compareText _ Empty = GT
compareText (Chunk a0 as) (Chunk b0 bs) = outer a0 b0
  where
    -- Walk both leading chunks in lockstep using raw Word16 offsets
    -- (i, j); 'T.iter' decodes one full character at each position and
    -- reports how many code units it consumed (di, dj).
    outer ta@(T.Text arrA offA lenA) tb@(T.Text arrB offB lenB) = go 0 0
      where
        go !i !j
          -- One chunk exhausted: drop its consumed prefix from the
          -- other side (via the internal T.Text constructor) and
          -- recurse on the remaining chunk lists.
          | i >= lenA = compareText as (chunk (T.Text arrB (offB+j) (lenB-j)) bs)
          | j >= lenB = compareText (chunk (T.Text arrA (offA+i) (lenA-i)) as) bs
          | a < b = LT
          | a > b = GT
          | otherwise = go (i+di) (j+dj)
          where T.Iter a di = T.iter ta i
                T.Iter b dj = T.iter tb j
337
-- Shown via the 'String' form of the text.
instance Show Text where
    showsPrec d t = showsPrec d (unpack t)

-- Read as a 'String', then packed.
instance Read Text where
    readsPrec d s = L.map (\(parsed, rest) -> (pack parsed, rest)) (readsPrec d s)
343
#if MIN_VERSION_base(4,9,0)
-- Semigroup orphan instances for older GHCs are provided by
-- the 'semigroups' package.

instance Semigroup Text where
    (<>) = append
#endif

instance Monoid Text where
    mempty = empty
#if MIN_VERSION_base(4,9,0)
    mappend = (<>) -- future-proof definition
#else
    mappend = append
#endif
    -- 'concat' avoids the quadratic behaviour of repeated 'mappend'.
    mconcat = concat
360
-- Enables @OverloadedStrings@ literals for lazy 'Text'.
instance IsString Text where
    fromString = pack

#if __GLASGOW_HASKELL__ >= 708
-- Enables @OverloadedLists@ literals; items are 'Char's.
instance Exts.IsList Text where
    type Item Text = Char
    fromList = pack
    toList = unpack
#endif

#if defined(HAVE_DEEPSEQ)
-- Forcing the chunk spine suffices: each strict chunk is already
-- fully evaluated once in WHNF.
instance NFData Text where
    rnf Empty = ()
    rnf (Chunk _ ts) = rnf ts
#endif
376
-- Serialised via the UTF-8 encoding of the text; decoding failures
-- are surfaced through the Get monad's 'fail'.
instance Binary Text where
    put = put . encodeUtf8
    get = do
      bytes <- get
      P.either (P.fail . P.show) P.return (decodeUtf8' bytes)
384
-- | This instance preserves data abstraction at the cost of inefficiency.
-- We omit reflection services for the sake of data abstraction.
--
-- This instance was created by copying the updated behavior of
-- @"Data.Text".@'Data.Text.Text'
instance Data Text where
  -- Present the text as if built by @pack :: String -> Text@.
  gfoldl f z txt = z pack `f` (unpack txt)
  toConstr _ = packConstr
  gunfold k z c = case constrIndex c of
    -- 'packConstr' is the sole constructor, so index 1 is the only
    -- valid case.
    1 -> k (z pack)
    _ -> error "Data.Text.Lazy.Text.gunfold"
  dataTypeOf _ = textDataType
397
#if MIN_VERSION_base(4,7,0)
-- | Only defined for @base-4.7.0.0@ and later
instance PrintfArg Text where
  formatArg txt = formatString $ unpack txt
#endif

-- | The virtual @pack@ constructor used by the 'Data' instance above.
packConstr :: Constr
packConstr = mkConstr textDataType "pack" [] Prefix

-- | 'DataType' for the 'Data' instance; exposes only 'packConstr'.
textDataType :: DataType
textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr]
409
-- | /O(n)/ Convert a 'String' into a 'Text'.
--
-- Subject to fusion.  Performs replacement on invalid scalar values.
--
-- NOTE(review): keep the body in exactly this
-- @unstream . S.streamList . L.map safe@ shape — the \"TEXT literal\"
-- rewrite rules below match this expression once 'pack' inlines at
-- phase 1.
pack :: String -> Text
pack = unstream . S.streamList . L.map safe
{-# INLINE [1] pack #-}
416
-- | /O(n)/ Convert a 'Text' into a 'String'.
-- Subject to fusion.
--
-- The phase-1 INLINE exposes the @stream t@ call to the fusion
-- framework's rewrite rules.
unpack :: Text -> String
unpack t = S.unstreamList (stream t)
{-# INLINE [1] unpack #-}
422
-- | /O(n)/ Convert a literal string into a Text.
--
-- Exists so the rewrite rules below can turn the fused form of a
-- string literal back into direct unpacking; NOINLINE keeps it
-- recognisable as a rule target.
unpackCString# :: Addr# -> Text
unpackCString# addr# = unstream (S.streamCString# addr#)
{-# NOINLINE unpackCString# #-}

{-# RULES "TEXT literal" forall a.
    unstream (S.streamList (L.map safe (GHC.unpackCString# a)))
      = unpackCString# a #-}

{-# RULES "TEXT literal UTF8" forall a.
    unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a)))
      = unpackCString# a #-}

{-# RULES "LAZY TEXT empty literal"
    unstream (S.streamList (L.map safe []))
      = Empty #-}

-- Fixed: this rule previously reused the name "LAZY TEXT empty
-- literal" from the rule above.  GHC rule names should be unique so
-- that diagnostics (-ddump-rule-firings) and rule management behave
-- predictably; the singleton-literal rule gets its own name.
{-# RULES "LAZY TEXT singleton literal" forall a.
    unstream (S.streamList (L.map safe [a]))
      = Chunk (T.singleton a) Empty #-}
443
-- | /O(1)/ Convert a character into a Text.  Subject to fusion.
-- Performs replacement on invalid scalar values.
singleton :: Char -> Text
singleton c = Chunk (T.singleton c) Empty
{-# INLINE [1] singleton #-}

-- Fuse at early phases, fall back to the direct definition at phase 1
-- if no fusion happened.
{-# RULES
"LAZY TEXT singleton -> fused" [~1] forall c.
    singleton c = unstream (S.singleton c)
"LAZY TEXT singleton -> unfused" [1] forall c.
    unstream (S.singleton c) = singleton c
 #-}
456
-- | /O(c)/ Convert a list of strict 'T.Text's into a lazy 'Text'.
-- Empty chunks are dropped by 'chunk'.
fromChunks :: [T.Text] -> Text
fromChunks = L.foldr chunk Empty

-- | /O(n)/ Convert a lazy 'Text' into a list of strict 'T.Text's.
toChunks :: Text -> [T.Text]
toChunks = foldrChunks (:) []
464
-- | /O(n)/ Convert a lazy 'Text' into a strict 'T.Text'.
--
-- NOTE(review): the phase-[1] INLINE suggests rewrite rules elsewhere
-- target @toStrict@/@fromStrict@ round-trips — keep the pragmas.
toStrict :: Text -> T.Text
toStrict t = T.concat (toChunks t)
{-# INLINE [1] toStrict #-}

-- | /O(c)/ Convert a strict 'T.Text' into a lazy 'Text'.
-- An empty strict text becomes 'Empty' (via 'chunk').
fromStrict :: T.Text -> Text
fromStrict t = chunk t Empty
{-# INLINE [1] fromStrict #-}
474
475 -- -----------------------------------------------------------------------------
476 -- * Basic functions
477
-- | /O(1)/ Adds a character to the front of a 'Text'.  Unlike the
-- strict "Data.Text" version, this prepends a new single-character
-- chunk and copies nothing.  Subject to fusion.
cons :: Char -> Text -> Text
cons c t = Chunk (T.singleton c) t
{-# INLINE [1] cons #-}

infixr 5 `cons`

{-# RULES
"LAZY TEXT cons -> fused" [~1] forall c t.
    cons c t = unstream (S.cons c (stream t))
"LAZY TEXT cons -> unfused" [1] forall c t.
    unstream (S.cons c (stream t)) = cons c t
 #-}
493
-- | /O(n\/c)/ Adds a character to the end of a 'Text'.  The chunk
-- spine is rebuilt (the underlying chunk arrays are shared, not
-- copied), unless fused.  Subject to fusion.
snoc :: Text -> Char -> Text
snoc t c = foldrChunks Chunk (singleton c) t
{-# INLINE [1] snoc #-}

{-# RULES
"LAZY TEXT snoc -> fused" [~1] forall t c.
    snoc t c = unstream (S.snoc (stream t) c)
"LAZY TEXT snoc -> unfused" [1] forall t c.
    unstream (S.snoc (stream t) c) = snoc t c
 #-}
506
-- | /O(n\/c)/ Appends one 'Text' to another.  Subject to fusion.
-- Only the first argument's chunk spine is rebuilt; its arrays and
-- the whole second argument are shared.
append :: Text -> Text -> Text
append xs ys = foldrChunks Chunk ys xs
{-# INLINE [1] append #-}

{-# RULES
"LAZY TEXT append -> fused" [~1] forall t1 t2.
    append t1 t2 = unstream (S.append (stream t1) (stream t2))
"LAZY TEXT append -> unfused" [1] forall t1 t2.
    unstream (S.append (stream t1) (stream t2)) = append t1 t2
 #-}
518
-- | /O(1)/ Returns the first character and rest of a 'Text', or
-- 'Nothing' if empty.  Subject to fusion.
uncons :: Text -> Maybe (Char, Text)
uncons Empty = Nothing
uncons (Chunk t ts) = Just (T.unsafeHead t, rest)
  where
    -- When the leading chunk held exactly one character, drop it
    -- entirely; otherwise keep its (unsafely computed) tail.  Kept
    -- lazy inside the tuple so the head can be inspected cheaply.
    rest | T.compareLength t 1 == EQ = ts
         | otherwise = Chunk (T.unsafeTail t) ts
{-# INLINE uncons #-}
527
-- | /O(1)/ Returns the first character of a 'Text', which must be
-- non-empty.  Subject to fusion.
head :: Text -> Char
head = S.head . stream
{-# INLINE head #-}
533
-- | /O(1)/ Returns all characters after the head of a 'Text', which
-- must be non-empty.  Subject to fusion.
tail :: Text -> Text
tail (Chunk t ts) = chunk (T.tail t) ts
tail Empty = emptyError "tail"
{-# INLINE [1] tail #-}

{-# RULES
"LAZY TEXT tail -> fused" [~1] forall t.
    tail t = unstream (S.tail (stream t))
"LAZY TEXT tail -> unfused" [1] forall t.
    unstream (S.tail (stream t)) = tail t
 #-}
547
-- | /O(1)/ Returns all but the last character of a 'Text', which must
-- be non-empty.  Subject to fusion.
--
-- (The /O(1)/ cost is amortised/lazy: the spine is rebuilt on demand
-- and only the final chunk is actually trimmed.)
init :: Text -> Text
init (Chunk t0 ts0) = go t0 ts0
  where go t (Chunk t' ts) = Chunk t (go t' ts)
        go t Empty = chunk (T.init t) Empty
init Empty = emptyError "init"
{-# INLINE [1] init #-}

{-# RULES
"LAZY TEXT init -> fused" [~1] forall t.
    init t = unstream (S.init (stream t))
"LAZY TEXT init -> unfused" [1] forall t.
    unstream (S.init (stream t)) = init t
 #-}
563
-- | /O(1)/ Tests whether a 'Text' is empty or not.  Subject to
-- fusion.  (The 'Empty'/'Chunk' invariant guarantees chunks are
-- non-empty, so matching the constructor suffices.)
null :: Text -> Bool
null Empty = True
null _ = False
{-# INLINE [1] null #-}

{-# RULES
"LAZY TEXT null -> fused" [~1] forall t.
    null t = S.null (stream t)
"LAZY TEXT null -> unfused" [1] forall t.
    S.null (stream t) = null t
 #-}
577
-- | /O(1)/ Tests whether a 'Text' contains exactly one character.
-- Subject to fusion.
isSingleton :: Text -> Bool
isSingleton t = S.isSingleton (stream t)
{-# INLINE isSingleton #-}
583
-- | /O(1)/ Returns the last character of a 'Text', which must be
-- non-empty.  Subject to fusion.
--
-- (Walks the chunk spine — /O(c)/ in the chunk count — then takes the
-- last character of the final chunk.)
last :: Text -> Char
last Empty = emptyError "last"
last (Chunk t ts) = go t ts
  where go _ (Chunk t' ts') = go t' ts'
        go t' Empty = T.last t'
{-# INLINE [1] last #-}

{-# RULES
"LAZY TEXT last -> fused" [~1] forall t.
    last t = S.last (stream t)
"LAZY TEXT last -> unfused" [1] forall t.
    S.last (stream t) = last t
 #-}
599
-- | /O(n)/ Returns the number of characters in a 'Text'.
-- Subject to fusion.  Sums the per-chunk character counts.
length :: Text -> Int64
length = foldlChunks go 0
  where go l t = l + fromIntegral (T.length t)
{-# INLINE [1] length #-}

{-# RULES
"LAZY TEXT length -> fused" [~1] forall t.
    length t = S.length (stream t)
"LAZY TEXT length -> unfused" [1] forall t.
    S.length (stream t) = length t
 #-}
613
-- | /O(n)/ Compare the count of characters in a 'Text' to a number.
-- Subject to fusion.
--
-- This function gives the same answer as comparing against the result
-- of 'length', but can short circuit if the count of characters is
-- greater than the number, and hence be more efficient.
compareLength :: Text -> Int64 -> Ordering
compareLength t n = S.compareLengthI (stream t) n
{-# INLINE [1] compareLength #-}

-- We don't apply those otherwise appealing length-to-compareLength
-- rewrite rules here, because they can change the strictness
-- properties of code.
627
-- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
-- each element of @t@.  Subject to fusion.  Performs replacement on
-- invalid scalar values ('safe' is composed onto @f@).
map :: (Char -> Char) -> Text -> Text
map f t = unstream (S.map (safe . f) (stream t))
{-# INLINE [1] map #-}
634
-- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
-- 'Text's and concatenates the list after interspersing the first
-- argument between each element of the list.
intercalate :: Text -> [Text] -> Text
intercalate sep pieces = concat (F.intersperse sep pieces)
{-# INLINE intercalate #-}

-- | /O(n)/ The 'intersperse' function takes a character and places it
-- between the characters of a 'Text'.  Subject to fusion.  Performs
-- replacement on invalid scalar values.
intersperse :: Char -> Text -> Text
intersperse c = unstream . S.intersperse (safe c) . stream
{-# INLINE intersperse #-}
648
-- | /O(n)/ Left-justify a string to the given length, using the
-- specified fill character on the right.  Subject to fusion.  Performs
-- replacement on invalid scalar values.
--
-- Examples:
--
-- > justifyLeft 7 'x' "foo"    == "fooxxxx"
-- > justifyLeft 3 'x' "foobar" == "foobar"
justifyLeft :: Int64 -> Char -> Text -> Text
justifyLeft k c t
    | len >= k  = t
    | otherwise = t `append` replicateChar (k-len) c
  where len = length t
{-# INLINE [1] justifyLeft #-}

{-# RULES
"LAZY TEXT justifyLeft -> fused" [~1] forall k c t.
    justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
"LAZY TEXT justifyLeft -> unfused" [1] forall k c t.
    unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
 #-}
670
-- | /O(n)/ Right-justify a string to the given length, using the
-- specified fill character on the left.  Performs replacement on
-- invalid scalar values.
--
-- Examples:
--
-- > justifyRight 7 'x' "bar"    == "xxxxbar"
-- > justifyRight 3 'x' "foobar" == "foobar"
justifyRight :: Int64 -> Char -> Text -> Text
justifyRight k c t =
    if width >= k
      then t
      else replicateChar (k - width) c `append` t
  where width = length t
{-# INLINE justifyRight #-}
685
-- | /O(n)/ Center a string to the given length, using the specified
-- fill character on either side.  Performs replacement on invalid
-- scalar values.
--
-- Examples:
--
-- > center 8 'x' "HS" == "xxxHSxxx"
center :: Int64 -> Char -> Text -> Text
center k c t
    | k <= len  = t
    | otherwise = pad leftLen `append` t `append` pad rightLen
  where
    len      = length t
    pad m    = replicateChar m c
    total    = k - len
    -- When the padding is odd, the extra character goes on the left.
    rightLen = total `quot` 2
    leftLen  = total - rightLen
{-# INLINE center #-}
702
-- | /O(n)/ The 'transpose' function transposes the rows and columns
-- of its 'Text' argument.  Note that this function uses 'pack',
-- 'unpack', and the list version of transpose, and is thus not very
-- efficient.
transpose :: [Text] -> [Text]
transpose ts = L.map (\row -> Chunk (T.pack row) Empty) rows
  where rows = L.transpose (L.map unpack ts)
-- TODO: make this fast
711
-- | /O(n)/ 'reverse' @t@ returns the elements of @t@ in reverse order.
-- Each chunk is reversed in place and the chunk order is flipped by
-- accumulating left-to-right.
reverse :: Text -> Text
reverse = foldlChunks (\acc c -> Chunk (T.reverse c) acc) Empty
717
-- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
-- @haystack@ with @replacement@.
--
-- This function behaves as though it was defined as follows:
--
-- @
-- replace needle replacement haystack =
--   'intercalate' replacement ('splitOn' needle haystack)
-- @
--
-- As this suggests, each occurrence is replaced exactly once.  So if
-- @needle@ occurs in @replacement@, that occurrence will /not/ itself
-- be replaced recursively:
--
-- > replace "oo" "foo" "oo" == "foo"
--
-- In cases where several instances of @needle@ overlap, only the
-- first one will be replaced:
--
-- > replace "ofo" "bar" "ofofo" == "barfo"
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
replace :: Text
        -- ^ @needle@ to search for.  If this string is empty, an
        -- error will occur.
        -> Text
        -- ^ @replacement@ to replace @needle@ with.
        -> Text
        -- ^ @haystack@ in which to search.
        -> Text
replace needle rep haystack = intercalate rep (splitOn needle haystack)
{-# INLINE replace #-}
751
752 -- ----------------------------------------------------------------------------
753 -- ** Case conversions (folds)
754
755 -- $case
756 --
757 -- With Unicode text, it is incorrect to use combinators like @map
758 -- toUpper@ to case convert each character of a string individually.
759 -- Instead, use the whole-string case conversion functions from this
760 -- module. For correctness in different writing systems, these
761 -- functions may map one input character to two or three output
762 -- characters.
763
-- | /O(n)/ Convert a string to folded case.  Subject to fusion.
--
-- This function is mainly useful for performing caseless (or case
-- insensitive) string comparisons.
--
-- A string @x@ is a caseless match for a string @y@ if and only if:
--
-- @toCaseFold x == toCaseFold y@
--
-- The result string may be longer than the input string, and may
-- differ from applying 'toLower' to the input string.  For instance,
-- the Armenian small ligature men now (U+FB13) is case folded to the
-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
-- case folded to the Greek small letter mu (U+03BC) instead of
-- itself.
toCaseFold :: Text -> Text
toCaseFold t = unstream (S.toCaseFold (stream t))
{-# INLINE [0] toCaseFold #-}
782
-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion.  Subject to fusion.
--
-- The result string may be longer than the input string.  For
-- instance, the Latin capital letter I with dot above (U+0130) maps
-- to the sequence Latin small letter i (U+0069) followed by combining
-- dot above (U+0307).
toLower :: Text -> Text
toLower = unstream . S.toLower . stream
{-# INLINE toLower #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion.  Subject to fusion.
--
-- The result string may be longer than the input string.  For
-- instance, the German eszett (U+00DF) maps to the two-letter
-- sequence SS.
toUpper :: Text -> Text
toUpper = unstream . S.toUpper . stream
{-# INLINE toUpper #-}

-- | /O(n)/ Convert a string to title case, using simple case
-- conversion.  Subject to fusion.
--
-- The first letter of the input is converted to title case, as is
-- every subsequent letter that immediately follows a non-letter.
-- Every letter that immediately follows another letter is converted
-- to lower case.
--
-- The result string may be longer than the input string.  For example,
-- the Latin small ligature &#xfb02; (U+FB02) is converted to the
-- sequence Latin capital letter F (U+0046) followed by Latin small
-- letter l (U+006C).
--
-- /Note/: this function does not take language or culture specific
-- rules into account.  For instance, in English, different style
-- guides disagree on whether the book name \"The Hill of the Red
-- Fox\" is correctly title cased&#x2014;but this function will
-- capitalize /every/ word.
toTitle :: Text -> Text
toTitle = unstream . S.toTitle . stream
{-# INLINE toTitle #-}
826
-- | /O(n)/ 'foldl', applied to a binary operator, a starting value
-- (typically the left-identity of the operator), and a 'Text',
-- reduces the 'Text' using the binary operator, from left to right.
-- Subject to fusion.
foldl :: (a -> Char -> a) -> a -> Text -> a
foldl step z = S.foldl step z . stream
{-# INLINE foldl #-}

-- | /O(n)/ A strict version of 'foldl'.
-- Subject to fusion.
foldl' :: (a -> Char -> a) -> a -> Text -> a
foldl' step z = S.foldl' step z . stream
{-# INLINE foldl' #-}

-- | /O(n)/ A variant of 'foldl' that has no starting value argument,
-- and thus must be applied to a non-empty 'Text'.  Subject to fusion.
foldl1 :: (Char -> Char -> Char) -> Text -> Char
foldl1 step = S.foldl1 step . stream
{-# INLINE foldl1 #-}

-- | /O(n)/ A strict version of 'foldl1'.  Subject to fusion.
foldl1' :: (Char -> Char -> Char) -> Text -> Char
foldl1' step = S.foldl1' step . stream
{-# INLINE foldl1' #-}

-- | /O(n)/ 'foldr', applied to a binary operator, a starting value
-- (typically the right-identity of the operator), and a 'Text',
-- reduces the 'Text' using the binary operator, from right to left.
-- Subject to fusion.
foldr :: (Char -> a -> a) -> a -> Text -> a
foldr step z = S.foldr step z . stream
{-# INLINE foldr #-}

-- | /O(n)/ A variant of 'foldr' that has no starting value argument,
-- and thus must be applied to a non-empty 'Text'.  Subject to
-- fusion.
foldr1 :: (Char -> Char -> Char) -> Text -> Char
foldr1 step = S.foldr1 step . stream
{-# INLINE foldr1 #-}
866
-- | /O(n)/ Concatenate a list of 'Text's.
concat :: [Text] -> Text
concat = fromList'
  where
    -- Start concatenating from the next list element.
    fromList' []       = Empty
    fromList' (t : ts) = prepend t ts
    -- Splice the chunks of the first text in front of the
    -- concatenation of the remaining texts.
    prepend Empty ts        = fromList' ts
    prepend (Chunk c cs) ts = Chunk c (prepend cs ts)
{-# INLINE concat #-}
876
-- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
-- concatenate the results.
concatMap :: (Char -> Text) -> Text -> Text
concatMap f t = concat (foldr (\c pieces -> f c : pieces) [] t)
{-# INLINE concatMap #-}
882
-- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
-- 'Text' @t@ satisfies the predicate @p@.  Subject to fusion.
any :: (Char -> Bool) -> Text -> Bool
any p = S.any p . stream
{-# INLINE any #-}

-- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
-- 'Text' @t@ satisfy the predicate @p@.  Subject to fusion.
all :: (Char -> Bool) -> Text -> Bool
all p = S.all p . stream
{-# INLINE all #-}

-- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
-- must be non-empty.  Subject to fusion.
maximum :: Text -> Char
maximum = S.maximum . stream
{-# INLINE maximum #-}

-- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
-- must be non-empty.  Subject to fusion.
minimum :: Text -> Char
minimum = S.minimum . stream
{-# INLINE minimum #-}
906
-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
-- successive reduced values from the left.  Subject to fusion.
-- Performs replacement on invalid scalar values.
--
-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
--
-- Note that
--
-- > last (scanl f z xs) == foldl f z xs.
scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
scanl f z0 = unstream . S.scanl step z0 . stream
  where
    -- Scrub each produced character so invalid scalar values are
    -- replaced.
    step a = safe . f a
{-# INLINE scanl #-}
920
-- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
-- value argument.  Subject to fusion.  Performs replacement on
-- invalid scalar values.
--
-- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
scanl1 :: (Char -> Char -> Char) -> Text -> Text
scanl1 f t =
    case uncons t of
      Nothing         -> empty
      Just (c0, rest) -> scanl f c0 rest
{-# INLINE scanl1 #-}
931
-- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'.  Performs
-- replacement on invalid scalar values.
--
-- > scanr f v == reverse . scanl (flip f) v . reverse
scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
scanr f v = reverse . scanl flipped v . reverse
  where
    -- The operator is flipped because we scan the reversed text.
    flipped a b = safe (f b a)

-- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
-- value argument.  Performs replacement on invalid scalar values.
scanr1 :: (Char -> Char -> Char) -> Text -> Text
scanr1 f t =
    if null t
      then empty
      else scanr f (last t) (init t)
945
-- | /O(n)/ Like a combination of 'map' and 'foldl''.  Applies a
-- function to each element of a 'Text', passing an accumulating
-- parameter from left to right, and returns a final 'Text'.  Performs
-- replacement on invalid scalar values.
mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
mapAccumL f = go
  where
    -- The result pair is built with lazy where-bindings so the output
    -- 'Text' can be consumed incrementally even though the accumulator
    -- threads left-to-right through every chunk.
    go z (Chunk c cs) = (z'', Chunk c' cs')
      where (z', c') = T.mapAccumL f z c
            (z'', cs') = go z' cs
    go z Empty = (z, Empty)
{-# INLINE mapAccumL #-}
958
-- | The 'mapAccumR' function behaves like a combination of 'map' and
-- a strict 'foldr'; it applies a function to each element of a
-- 'Text', passing an accumulating parameter from right to left, and
-- returning a final value of this accumulator together with the new
-- 'Text'.  Performs replacement on invalid scalar values.
mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
mapAccumR f = go
  where
    -- Note the laziness knot: the accumulator @z'@ fed to this chunk
    -- comes from the recursive call on the tail, so the traversal is
    -- right-to-left.  Do not reorder or strictify these bindings.
    go z (Chunk c cs) = (z'', Chunk c' cs')
      where (z'', c') = T.mapAccumR f z' c
            (z', cs') = go z cs
    go z Empty = (z, Empty)
{-# INLINE mapAccumR #-}
972
973 -- | @'repeat' x@ is an infinite 'Text', with @x@ the value of every
974 -- element.
repeat :: Char -> Text
-- Tie the knot: a single shared chunk of @c@s whose tail refers back
-- to itself, so the infinite 'Text' occupies constant space.
repeat c = let t = Chunk (T.replicate smallChunkSize (T.singleton c)) t
           in t
978
979 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
980 -- @t@ repeated @n@ times.
replicate :: Int64 -> Text -> Text
replicate n t
    | null t || n <= 0 = empty
    -- A one-character input is delegated to the fusible 'replicateChar'
    -- (see also the rewrite rule below this definition).
    | isSingleton t    = replicateChar n (head t)
    -- Otherwise the chunks of @t@ are shared @n@ times, not copied.
    | otherwise        = concat (rep 0)
    where rep !i | i >= n    = []
                 | otherwise = t : rep (i+1)
{-# INLINE [1] replicate #-}
989
990 -- | 'cycle' ties a finite, non-empty 'Text' into a circular one, or
991 -- equivalently, the infinite repetition of the original 'Text'.
cycle :: Text -> Text
cycle Empty = emptyError "cycle"
-- Tie the knot: the final tail of the chunk list points back at its
-- own head, sharing the original chunks rather than copying them.
cycle t = let t' = foldrChunks Chunk t' t
          in t'
996
997 -- | @'iterate' f x@ returns an infinite 'Text' of repeated applications
998 -- of @f@ to @x@:
999 --
1000 -- > iterate f x == [x, f x, f (f x), ...]
iterate :: (Char -> Char) -> Char -> Text
-- Build the infinite text one single-character chunk at a time,
-- applying @f@ once more for each successive chunk.
iterate f c0 = go c0
  where go c = Chunk (T.singleton c) (go (f c))
1004
1005 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
1006 -- value of every element. Subject to fusion.
replicateChar :: Int64 -> Char -> Text
-- 'safe' replaces an invalid scalar value before streaming.
replicateChar n c = unstream (S.replicateCharI n (safe c))
{-# INLINE replicateChar #-}
1010
1011 {-# RULES
1012 "LAZY TEXT replicate/singleton -> replicateChar" [~1] forall n c.
1013 replicate n (singleton c) = replicateChar n c
1014 #-}
1015
1016 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
1017 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
1018 -- 'Text' from a seed value. The function takes the element and
1019 -- returns 'Nothing' if it is done producing the 'Text', otherwise
1020 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
1021 -- string, and @b@ is the seed value for further production. Performs
1022 -- replacement on invalid scalar values.
unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
-- 'firstf safe' sanitises each produced character before it is
-- streamed into the result.
unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
{-# INLINE unfoldr #-}
1026
1027 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
1028 -- value. However, the length of the result should be limited by the
1029 -- first argument to 'unfoldrN'. This function is more efficient than
1030 -- 'unfoldr' when the maximum length of the result is known and
1031 -- correct, otherwise its performance is similar to 'unfoldr'.
1032 -- Performs replacement on invalid scalar values.
unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Text
-- As 'unfoldr', but the stream version is told the expected maximum
-- length @n@; each produced character is sanitised via 'firstf safe'.
unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
{-# INLINE unfoldrN #-}
1036
1037 -- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
1038 -- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
1039 -- the length of the Text. Subject to fusion.
take :: Int64 -> Text -> Text
take i _ | i <= 0 = Empty
take i t0 = take' i t0
  where take' 0 _      = Empty
        take' _ Empty  = Empty
        take' n (Chunk t ts)
            -- The cut falls inside this chunk: slice it and stop.
            | n < len   = Chunk (T.take (fromIntegral n) t) Empty
            -- Keep the whole chunk (shared, not copied) and recurse.
            | otherwise = Chunk t (take' (n - len) ts)
            where len = fromIntegral (T.length t)
{-# INLINE [1] take #-}
1050
1051 {-# RULES
1052 "LAZY TEXT take -> fused" [~1] forall n t.
1053 take n t = unstream (S.take n (stream t))
1054 "LAZY TEXT take -> unfused" [1] forall n t.
1055 unstream (S.take n (stream t)) = take n t
1056 #-}
1057
1058 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after
1059 -- taking @n@ characters from the end of @t@.
1060 --
1061 -- Examples:
1062 --
1063 -- > takeEnd 3 "foobar" == "bar"
takeEnd :: Int64 -> Text -> Text
takeEnd n t0
    -- Walk the chunk list from the end (via reverse), accumulating
    -- whole chunks until @n@ characters have been collected.
    | n <= 0    = empty
    | otherwise = takeChunk n empty . L.reverse . toChunks $ t0
  where takeChunk _ acc [] = acc
        takeChunk i acc (t:ts)
          -- The boundary falls inside this chunk: keep its suffix.
          | i <= l    = chunk (T.takeEnd (fromIntegral i) t) acc
          | otherwise = takeChunk (i-l) (Chunk t acc) ts
          where l = fromIntegral (T.length t)
1073
1074 -- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
1075 -- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
1076 -- is greater than the length of the 'Text'. Subject to fusion.
drop :: Int64 -> Text -> Text
drop i t0
    | i <= 0    = t0
    | otherwise = drop' i t0
  where drop' 0 ts           = ts
        drop' _ Empty        = Empty
        drop' n (Chunk t ts)
            -- The cut falls inside this chunk: keep its suffix and
            -- share the remaining chunks unchanged.
            | n < len   = Chunk (T.drop (fromIntegral n) t) ts
            | otherwise = drop' (n - len) ts
            where len = fromIntegral (T.length t)
{-# INLINE [1] drop #-}
1088
1089 {-# RULES
1090 "LAZY TEXT drop -> fused" [~1] forall n t.
1091 drop n t = unstream (S.drop n (stream t))
1092 "LAZY TEXT drop -> unfused" [1] forall n t.
1093 unstream (S.drop n (stream t)) = drop n t
1094 #-}
1095
1096 -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
1097 -- dropping @n@ characters from the end of @t@.
1098 --
1099 -- Examples:
1100 --
1101 -- > dropEnd 3 "foobar" == "foo"
dropEnd :: Int64 -> Text -> Text
dropEnd n t0
    | n <= 0    = t0
    -- Process the chunk list from the end (via reverse), discarding
    -- whole chunks until @n@ characters have been dropped.
    | otherwise = dropChunk n . L.reverse . toChunks $ t0
  where dropChunk _ [] = empty
        dropChunk m (t:ts)
          | m >= l    = dropChunk (m-l) ts
          -- The boundary falls inside this chunk: trim its end, then
          -- rebuild the text from the (still reversed) kept chunks.
          | otherwise = fromChunks . L.reverse $
                        T.dropEnd (fromIntegral m) t : ts
          where l = fromIntegral (T.length t)
1112
1113 -- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16'
1114 -- values dropped, or the empty 'Text' if @n@ is greater than the
1115 -- number of 'Word16' values present.
dropWords :: Int64 -> Text -> Text
dropWords i t0
    | i <= 0    = t0
    | otherwise = drop' i t0
  where drop' 0 ts    = ts
        drop' _ Empty = Empty
        -- Counts 'Word16' code units, not characters: the slice is
        -- taken by adjusting the chunk's offset/length into its
        -- shared array.  'chunk' discards a now-empty head slice.
        drop' n (Chunk (T.Text arr off len) ts)
            | n < len'  = chunk (text arr (off+n') (len-n')) ts
            | otherwise = drop' (n - len') ts
            where len' = fromIntegral len
                  n' = fromIntegral n
1127
1128 -- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
1129 -- returns the longest prefix (possibly empty) of elements that
1130 -- satisfy @p@. Subject to fusion.
takeWhile :: (Char -> Bool) -> Text -> Text
takeWhile p t0 = takeWhile' t0
  where takeWhile' Empty = Empty
        takeWhile' (Chunk t ts) =
          -- Search this chunk for the first failing character.
          case T.findIndex (not . p) t of
            -- Failure inside the chunk: keep only its prefix (or
            -- nothing at all if the very first character fails).
            Just n | n > 0     -> Chunk (T.take n t) Empty
                   | otherwise -> Empty
            -- Whole chunk satisfies @p@: share it and continue.
            Nothing            -> Chunk t (takeWhile' ts)
{-# INLINE [1] takeWhile #-}
1140
1141 {-# RULES
1142 "LAZY TEXT takeWhile -> fused" [~1] forall p t.
1143 takeWhile p t = unstream (S.takeWhile p (stream t))
1144 "LAZY TEXT takeWhile -> unfused" [1] forall p t.
1145 unstream (S.takeWhile p (stream t)) = takeWhile p t
1146 #-}
1147 -- | /O(n)/ 'takeWhileEnd', applied to a predicate @p@ and a 'Text',
1148 -- returns the longest suffix (possibly empty) of elements that
1149 -- satisfy @p@.
1150 -- Examples:
1151 --
1152 -- > takeWhileEnd (=='o') "foo" == "oo"
takeWhileEnd :: (Char -> Bool) -> Text -> Text
-- Walk the chunks from the end of the text towards the front,
-- accumulating fully-kept chunks until one is only partially kept
-- (or rejected entirely), at which point the scan stops.
takeWhileEnd p = takeChunk empty . L.reverse . toChunks
  where takeChunk acc []     = acc
        takeChunk acc (t:ts) = if T.length t' < T.length t
                               -- t' may be empty here (predicate fails
                               -- at the chunk boundary), so use the
                               -- 'chunk' smart constructor: the lazy
                               -- 'Text' invariant forbids a 'Chunk'
                               -- holding an empty strict chunk.
                               then chunk t' acc
                               -- Here t' is all of t, and chunks
                               -- produced by 'toChunks' are non-empty,
                               -- so the raw 'Chunk' is safe.
                               else takeChunk (Chunk t' acc) ts
          where t' = T.takeWhileEnd p t
{-# INLINE takeWhileEnd #-}
1161
1162 -- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
1163 -- 'takeWhile' @p@ @t@. Subject to fusion.
dropWhile :: (Char -> Bool) -> Text -> Text
dropWhile p t0 = dropWhile' t0
  where dropWhile' Empty = Empty
        dropWhile' (Chunk t ts) =
          case T.findIndex (not . p) t of
            -- First failing character is in this chunk: keep the
            -- chunk from that point on, sharing the tail unchanged.
            Just n  -> Chunk (T.drop n t) ts
            -- Entire chunk satisfies @p@: discard it and continue.
            Nothing -> dropWhile' ts
{-# INLINE [1] dropWhile #-}
1172
1173 {-# RULES
1174 "LAZY TEXT dropWhile -> fused" [~1] forall p t.
1175 dropWhile p t = unstream (S.dropWhile p (stream t))
1176 "LAZY TEXT dropWhile -> unfused" [1] forall p t.
1177 unstream (S.dropWhile p (stream t)) = dropWhile p t
1178 #-}
1179 -- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
1180 -- dropping characters that fail the predicate @p@ from the end of
1181 -- @t@.
1182 -- Examples:
1183 --
1184 -- > dropWhileEnd (=='.') "foo..." == "foo"
dropWhileEnd :: (Char -> Bool) -> Text -> Text
dropWhileEnd p = go
  where go Empty = Empty
        -- Last chunk: trim it directly, dropping it entirely if
        -- nothing remains.
        go (Chunk t Empty) = if T.null t'
                             then Empty
                             else Chunk t' Empty
            where t' = T.dropWhileEnd p t
        -- Interior chunk: only when everything after it is dropped
        -- does this chunk itself need trimming.
        go (Chunk t ts) = case go ts of
                            Empty -> go (Chunk t Empty)
                            ts' -> Chunk t ts'
{-# INLINE dropWhileEnd #-}
1196
1197 -- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
1198 -- dropping characters that fail the predicate @p@ from both the
1199 -- beginning and end of @t@. Subject to fusion.
dropAround :: (Char -> Bool) -> Text -> Text
-- Trim the end first, then the front.
dropAround p t = dropWhile p (dropWhileEnd p t)
{-# INLINE [1] dropAround #-}
1203
1204 -- | /O(n)/ Remove leading white space from a string. Equivalent to:
1205 --
1206 -- > dropWhile isSpace
stripStart :: Text -> Text
-- Leading whitespace is exactly a 'dropWhile' of space characters.
stripStart t = dropWhile isSpace t
{-# INLINE [1] stripStart #-}
1210
1211 -- | /O(n)/ Remove trailing white space from a string. Equivalent to:
1212 --
1213 -- > dropWhileEnd isSpace
stripEnd :: Text -> Text
-- Trailing whitespace is exactly a 'dropWhileEnd' of space characters.
stripEnd t = dropWhileEnd isSpace t
{-# INLINE [1] stripEnd #-}
1217
1218 -- | /O(n)/ Remove leading and trailing white space from a string.
1219 -- Equivalent to:
1220 --
1221 -- > dropAround isSpace
strip :: Text -> Text
-- Strip whitespace from both ends in one pass of 'dropAround'.
strip t = dropAround isSpace t
{-# INLINE [1] strip #-}
1225
1226 -- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
1227 -- prefix of @t@ of length @n@, and whose second is the remainder of
1228 -- the string. It is equivalent to @('take' n t, 'drop' n t)@.
splitAt :: Int64 -> Text -> (Text, Text)
splitAt = loop
  where loop _ Empty      = (empty, empty)
        loop n t | n <= 0 = (empty, t)
        loop n (Chunk t ts)
             -- Split point falls inside this chunk: split the strict
             -- chunk and share the tail chunks on the right.
             | n < len   = let (t',t'') = T.splitAt (fromIntegral n) t
                           in (Chunk t' Empty, Chunk t'' ts)
             -- Whole chunk goes to the left half (produced lazily);
             -- recurse on the remainder.
             | otherwise = let (ts',ts'') = loop (n - len) ts
                           in (Chunk t ts', ts'')
             where len = fromIntegral (T.length t)
1239
1240 -- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first
1241 -- element is a prefix of @t@ whose chunks contain @n@ 'Word16'
1242 -- values, and whose second is the remainder of the string.
splitAtWord :: Int64 -> Text -> PairS Text Text
splitAtWord _ Empty = empty :*: empty
splitAtWord x (Chunk c@(T.Text arr off len) cs)
    -- @x@ counts 'Word16' code units, so it is compared directly
    -- against the chunk's array length.
    | y >= len  = let h :*: t = splitAtWord (x-fromIntegral len) cs
                  in  Chunk c h :*: t
    -- Split inside this chunk by re-slicing the shared array; 'chunk'
    -- discards a slice that ends up empty.
    | otherwise = chunk (text arr off y) empty :*:
                  chunk (text arr (off+y) (len-y)) cs
    where y = fromIntegral x
1251
1252 -- | /O(n+m)/ Find the first instance of @needle@ (which must be
1253 -- non-'null') in @haystack@. The first element of the returned tuple
1254 -- is the prefix of @haystack@ before @needle@ is matched. The second
1255 -- is the remainder of @haystack@, starting with the match.
1256 --
1257 -- Examples:
1258 --
1259 -- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
1260 -- > breakOn "/" "foobar" ==> ("foobar", "")
1261 --
1262 -- Laws:
1263 --
1264 -- > append prefix match == haystack
1265 -- > where (prefix, match) = breakOn needle haystack
1266 --
1267 -- If you need to break a string by a substring repeatedly (e.g. you
1268 -- want to break on every instance of a substring), use 'breakOnAll'
1269 -- instead, as it has lower startup overhead.
1270 --
1271 -- This function is strict in its first argument, and lazy in its
1272 -- second.
1273 --
1274 -- In (unlikely) bad cases, this function's time complexity degrades
1275 -- towards /O(n*m)/.
breakOn :: Text -> Text -> (Text, Text)
breakOn pat src
    | null pat  = emptyError "breakOn"
    | otherwise = case indices pat src of
                    [] -> (src, empty)
                    -- 'indices' yields offsets in 'Word16' units, so
                    -- split with the word-based splitter; only the
                    -- first match matters here.
                    (x:_) -> let h :*: t = splitAtWord x src
                             in  (h, t)
1283
1284 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the string.
1285 --
1286 -- The first element of the returned tuple is the prefix of @haystack@
1287 -- up to and including the last match of @needle@. The second is the
1288 -- remainder of @haystack@, following the match.
1289 --
1290 -- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
breakOnEnd :: Text -> Text -> (Text, Text)
-- Search from the front of the reversed strings, then flip the two
-- halves back around and swap them.
breakOnEnd pat src = (reverse suf, reverse pre)
  where (pre, suf) = breakOn (reverse pat) (reverse src)
{-# INLINE breakOnEnd #-}
1295
1296 -- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
1297 -- @haystack@. Each element of the returned list consists of a pair:
1298 --
1299 -- * The entire string prior to the /k/th match (i.e. the prefix)
1300 --
1301 -- * The /k/th match, followed by the remainder of the string
1302 --
1303 -- Examples:
1304 --
1305 -- > breakOnAll "::" ""
1306 -- > ==> []
1307 -- > breakOnAll "/" "a/b/c/"
1308 -- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
1309 --
1310 -- This function is strict in its first argument, and lazy in its
1311 -- second.
1312 --
1313 -- In (unlikely) bad cases, this function's time complexity degrades
1314 -- towards /O(n*m)/.
1315 --
1316 -- The @needle@ parameter may not be empty.
breakOnAll :: Text              -- ^ @needle@ to search for
           -> Text              -- ^ @haystack@ in which to search
           -> [(Text, Text)]
breakOnAll pat src
    | null pat  = emptyError "breakOnAll"
    | otherwise = go 0 empty src (indices pat src)
  where
    -- n: word offset consumed so far; p: prefix accumulated so far;
    -- s: remaining haystack; (x:xs): match offsets still to emit.
    go !n p s (x:xs) = let h :*: t = splitAtWord (x-n) s
                           h'      = append p h
                       in (h',t) : go x h' t xs
    go _  _ _ _      = []
1328
1329 -- | /O(n)/ 'break' is like 'span', but the prefix returned is over
1330 -- elements that fail the predicate @p@.
break :: (Char -> Bool) -> Text -> (Text, Text)
break p t0 = break' t0
  where break' Empty = (empty, empty)
        break' c@(Chunk t ts) =
          case T.findIndex p t of
            -- No match in this chunk: it belongs wholly to the
            -- (lazily produced) prefix.
            Nothing -> let (ts', ts'') = break' ts
                       in (Chunk t ts', ts'')
            -- Match found: split this chunk, sharing the tail on the
            -- right-hand side.
            Just n | n == 0    -> (Empty, c)
                   | otherwise -> let (a,b) = T.splitAt n t
                                  in (Chunk a Empty, Chunk b ts)
1341
1342 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
1343 -- a pair whose first element is the longest prefix (possibly empty)
1344 -- of @t@ of elements that satisfy @p@, and whose second is the
1345 -- remainder of the list.
span :: (Char -> Bool) -> Text -> (Text, Text)
-- A span is simply a break on the complemented predicate.
span p t = break (\c -> not (p c)) t
{-# INLINE span #-}
1349
1350 -- | The 'group' function takes a 'Text' and returns a list of 'Text's
1351 -- such that the concatenation of the result is equal to the argument.
1352 -- Moreover, each sublist in the result contains only equal elements.
1353 -- For example,
1354 --
1355 -- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
1356 --
1357 -- It is a special case of 'groupBy', which allows the programmer to
1358 -- supply their own equality test.
group :: Text -> [Text]
-- Specialise 'groupBy' to character equality.
group = groupBy (==)
{-# INLINE group #-}
1362
1363 -- | The 'groupBy' function is the non-overloaded version of 'group'.
groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
groupBy _  Empty        = []
-- The uses of unsafeHead/unsafeTail rely on the lazy 'Text'
-- invariant that every 'Chunk' holds a non-empty strict chunk.
groupBy eq (Chunk t ts) = cons x ys : groupBy eq zs
                          where (ys,zs) = span (eq x) xs
                                x  = T.unsafeHead t
                                xs = chunk (T.unsafeTail t) ts
1370
1371 -- | /O(n)/ Return all initial segments of the given 'Text',
1372 -- shortest first.
inits :: Text -> [Text]
inits = (Empty :) . inits'
  where inits' Empty = []
        -- First the proper prefixes lying within the head chunk
        -- (L.tail skips the empty prefix, emitted once above), then
        -- the full head chunk prepended to every prefix of the rest.
        inits' (Chunk t ts) = L.map (\t' -> Chunk t' Empty) (L.tail (T.inits t))
                           ++ L.map (Chunk t) (inits' ts)
1378
1379 -- | /O(n)/ Return all final segments of the given 'Text', longest
1380 -- first.
tails :: Text -> [Text]
tails Empty = Empty : []
tails ts@(Chunk t ts')
    -- Advance one character at a time; once the head chunk is down
    -- to a single character the next tail starts at the next chunk
    -- (never leaving an empty chunk behind).
    | T.length t == 1 = ts : tails ts'
    | otherwise       = ts : tails (Chunk (T.unsafeTail t) ts')
1386
1387 -- $split
1388 --
1389 -- Splitting functions in this library do not perform character-wise
1390 -- copies to create substrings; they just construct new 'Text's that
1391 -- are slices of the original.
1392
1393 -- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
1394 -- argument (which cannot be an empty string), consuming the
1395 -- delimiter. An empty delimiter is invalid, and will cause an error
1396 -- to be raised.
1397 --
1398 -- Examples:
1399 --
1400 -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
1401 -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
1402 -- > splitOn "x" "x" == ["",""]
1403 --
1404 -- and
1405 --
1406 -- > intercalate s . splitOn s == id
1407 -- > splitOn (singleton c) == split (==c)
1408 --
1409 -- (Note: the string @s@ to split on above cannot be empty.)
1410 --
1411 -- This function is strict in its first argument, and lazy in its
1412 -- second.
1413 --
1414 -- In (unlikely) bad cases, this function's time complexity degrades
1415 -- towards /O(n*m)/.
splitOn :: Text
        -- ^ String to split on. If this string is empty, an error
        -- will occur.
        -> Text
        -- ^ Input text.
        -> [Text]
splitOn pat src
    | null pat        = emptyError "splitOn"
    -- Single-character patterns avoid the substring-search machinery.
    | isSingleton pat = split (== head pat) src
    | otherwise       = go 0 (indices pat src) src
  where
    go  _ []     cs = [cs]
    -- i: word offset consumed so far; x: next match offset from
    -- 'indices' (both in 'Word16' units).
    go !i (x:xs) cs = let h :*: t = splitAtWord (x-i) cs
                      in  h : go (x+l) xs (dropWords l t)
    -- l: length of the pattern in 'Word16' units, summed over its
    -- chunks' array lengths.
    l = foldlChunks (\a (T.Text _ _ b) -> a + fromIntegral b) 0 pat
{-# INLINE [1] splitOn #-}
1432
1433 {-# RULES
1434 "LAZY TEXT splitOn/singleton -> split/==" [~1] forall c t.
1435 splitOn (singleton c) t = split (==c) t
1436 #-}
1437
1438 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
1439 -- where the predicate returns True for a separator element. The
1440 -- resulting components do not contain the separators. Two adjacent
1441 -- separators result in an empty component in the output. eg.
1442 --
1443 -- > split (=='a') "aabbaca" == ["","","bb","c",""]
1444 -- > split (=='a') [] == [""]
split :: (Char -> Bool) -> Text -> [Text]
split _ Empty = [Empty]
split p (Chunk t0 ts0) = comb [] (T.split p t0) ts0
  -- acc collects (reversed) pieces of the component being built; the
  -- last piece of each chunk's split may continue into the next
  -- chunk, so it is carried over rather than emitted immediately.
  where comb acc (s:[]) Empty        = revChunks (s:acc) : []
        comb acc (s:[]) (Chunk t ts) = comb (s:acc) (T.split p t) ts
        comb acc (s:ss) ts           = revChunks (s:acc) : comb [] ss ts
        -- T.split is assumed never to return an empty list.
        comb _   []     _            = impossibleError "split"
{-# INLINE split #-}
1453
1454 -- | /O(n)/ Splits a 'Text' into components of length @k@. The last
1455 -- element may be shorter than the other chunks, depending on the
1456 -- length of the input. Examples:
1457 --
1458 -- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
1459 -- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
chunksOf :: Int64 -> Text -> [Text]
chunksOf k = go
  where
    -- Keep peeling off @k@ characters until nothing remains.
    go t | null piece = []
         | otherwise  = piece : go rest
      where (piece, rest) = splitAt k t
{-# INLINE chunksOf #-}
1467
1468 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
1469 -- newline 'Char's. The resulting strings do not contain newlines.
lines :: Text -> [Text]
lines Empty = []
-- Split off one line, then recurse past the newline character (if
-- the break actually found one).
lines t = let (l, rest) = break ((==) '\n') t
          in l : if null rest
                 then []
                 else lines (tail rest)
1475
1476 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
1477 -- representing white space.
words :: Text -> [Text]
-- Splitting on every space character yields empty components for
-- runs of whitespace; filter them out.
words = L.filter (not . null) . split isSpace
{-# INLINE words #-}
1481
1482 -- | /O(n)/ Joins lines, after appending a terminating newline to
1483 -- each.
unlines :: [Text] -> Text
-- Append a newline to every line, then concatenate.
unlines = concat . L.map (`snoc` '\n')
{-# INLINE unlines #-}
1487
1488 -- | /O(n)/ Joins words using single space characters.
unwords :: [Text] -> Text
-- Interleave a single space between the words.
unwords = intercalate (singleton ' ')
{-# INLINE unwords #-}
1492
1493 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
1494 -- 'True' iff the first is a prefix of the second. Subject to fusion.
isPrefixOf :: Text -> Text -> Bool
isPrefixOf Empty _ = True
isPrefixOf _ Empty = False
isPrefixOf (Chunk x xs) (Chunk y ys)
    -- Chunk boundaries of the two texts need not line up, so compare
    -- the common-length prefix of the head chunks and push the
    -- leftover slice back onto the longer side before recursing.
    | lx == ly  = x == y  && isPrefixOf xs ys
    | lx <  ly  = x == yh && isPrefixOf xs (Chunk yt ys)
    | otherwise = xh == y && isPrefixOf (Chunk xt xs) ys
  where (xh,xt) = T.splitAt ly x
        (yh,yt) = T.splitAt lx y
        lx = T.length x
        ly = T.length y
{-# INLINE [1] isPrefixOf #-}
1507
1508 {-# RULES
1509 "LAZY TEXT isPrefixOf -> fused" [~1] forall s t.
1510 isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
1511 "LAZY TEXT isPrefixOf -> unfused" [1] forall s t.
1512 S.isPrefixOf (stream s) (stream t) = isPrefixOf s t
1513 #-}
1514
1515 -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
1516 -- 'True' iff the first is a suffix of the second.
isSuffixOf :: Text -> Text -> Bool
-- NOTE: reversing forces both arguments in full, so this is not
-- suitable for streaming use.
isSuffixOf x y = reverse x `isPrefixOf` reverse y
{-# INLINE isSuffixOf #-}
-- TODO: a better implementation
1521
1522 -- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
1523 -- 'True' iff the first is contained, wholly and intact, anywhere
1524 -- within the second.
1525 --
1526 -- This function is strict in its first argument, and lazy in its
1527 -- second.
1528 --
1529 -- In (unlikely) bad cases, this function's time complexity degrades
1530 -- towards /O(n*m)/.
isInfixOf :: Text -> Text -> Bool
isInfixOf needle haystack
    | null needle        = True
    -- Single-character needles avoid the substring-search machinery
    -- (see also the rewrite rule below this definition).
    | isSingleton needle = S.elem (head needle) . S.stream $ haystack
    | otherwise          = not . L.null . indices needle $ haystack
{-# INLINE [1] isInfixOf #-}
1537
1538 {-# RULES
1539 "LAZY TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
1540 isInfixOf (singleton n) h = S.elem n (S.stream h)
1541 #-}
1542
1543 -------------------------------------------------------------------------------
1544 -- * View patterns
1545
1546 -- | /O(n)/ Return the suffix of the second string if its prefix
1547 -- matches the entire first string.
1548 --
1549 -- Examples:
1550 --
1551 -- > stripPrefix "foo" "foobar" == Just "bar"
1552 -- > stripPrefix "" "baz" == Just "baz"
1553 -- > stripPrefix "foo" "quux" == Nothing
1554 --
1555 -- This is particularly useful with the @ViewPatterns@ extension to
1556 -- GHC, as follows:
1557 --
1558 -- > {-# LANGUAGE ViewPatterns #-}
1559 -- > import Data.Text.Lazy as T
1560 -- >
1561 -- > fnordLength :: Text -> Int
1562 -- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
1563 -- > fnordLength _ = -1
stripPrefix :: Text -> Text -> Maybe Text
stripPrefix p t
    | null p    = Just t
    | otherwise = case commonPrefixes p t of
                    -- A null leftover for @p@ means the whole prefix
                    -- matched; @r@ is the remainder of @t@.
                    Just (_,c,r) | null c -> Just r
                    _                     -> Nothing
1570
1571 -- | /O(n)/ Find the longest non-empty common prefix of two strings
1572 -- and return it, along with the suffixes of each string at which they
1573 -- no longer match.
1574 --
1575 -- If the strings do not have a common prefix or either one is empty,
1576 -- this function returns 'Nothing'.
1577 --
1578 -- Examples:
1579 --
1580 -- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
1581 -- > commonPrefixes "veeble" "fetzer" == Nothing
1582 -- > commonPrefixes "" "baz" == Nothing
commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
commonPrefixes Empty _ = Nothing
commonPrefixes _ Empty = Nothing
commonPrefixes a0 b0   = Just (go a0 b0 [])
  where
    -- ps accumulates the strict common-prefix pieces in reverse.
    go t0@(Chunk x xs) t1@(Chunk y ys) ps
        = case T.commonPrefixes x y of
            Just (p,a,b)
              -- One side's head chunk was consumed entirely: push the
              -- other side's leftover back and keep matching.
              | T.null a  -> go xs (chunk b ys) (p:ps)
              | T.null b  -> go (chunk a xs) ys (p:ps)
              -- Both sides have a leftover: the match ends here.
              | otherwise -> (fromChunks (L.reverse (p:ps)),chunk a xs, chunk b ys)
            Nothing       -> (fromChunks (L.reverse ps),t0,t1)
    -- At least one side is exhausted.
    go t0 t1 ps = (fromChunks (L.reverse ps),t0,t1)
1596
1597 -- | /O(n)/ Return the prefix of the second string if its suffix
1598 -- matches the entire first string.
1599 --
1600 -- Examples:
1601 --
1602 -- > stripSuffix "bar" "foobar" == Just "foo"
1603 -- > stripSuffix "" "baz" == Just "baz"
1604 -- > stripSuffix "foo" "quux" == Nothing
1605 --
1606 -- This is particularly useful with the @ViewPatterns@ extension to
1607 -- GHC, as follows:
1608 --
1609 -- > {-# LANGUAGE ViewPatterns #-}
1610 -- > import Data.Text.Lazy as T
1611 -- >
1612 -- > quuxLength :: Text -> Int
1613 -- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
1614 -- > quuxLength _ = -1
stripSuffix :: Text -> Text -> Maybe Text
-- Strip the reversed suffix from the front of the reversed string,
-- then un-reverse whatever remains.
stripSuffix p t = fmap reverse (stripPrefix (reverse p) (reverse t))
1617
1618 -- | /O(n)/ 'filter', applied to a predicate and a 'Text',
1619 -- returns a 'Text' containing those characters that satisfy the
1620 -- predicate.
filter :: (Char -> Bool) -> Text -> Text
-- Implemented via stream fusion.
filter p t = unstream (S.filter p (stream t))
{-# INLINE filter #-}
1624
1625 -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
1626 -- returns the first element in matching the predicate, or 'Nothing'
1627 -- if there is no such element.
find :: (Char -> Bool) -> Text -> Maybe Char
-- Implemented via stream fusion; stops at the first match.
find p t = S.findBy p (stream t)
{-# INLINE find #-}
1631
1632 -- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
1633 -- and returns the pair of 'Text's with elements which do and do not
1634 -- satisfy the predicate, respectively; i.e.
1635 --
1636 -- > partition p t == (filter p t, filter (not . p) t)
partition :: (Char -> Bool) -> Text -> (Text, Text)
-- Two independent passes over the input, one per component.
partition p t = (filter p t, filter (not . p) t)
{-# INLINE partition #-}
1640
1641 -- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
index :: Text -> Int64 -> Char
-- Delegates to the stream version; out-of-range behaviour is that of
-- 'S.index'.
index t n = S.index (stream t) n
{-# INLINE index #-}
1645
1646 -- | /O(n+m)/ The 'count' function returns the number of times the
1647 -- query string appears in the given 'Text'. An empty query string is
1648 -- invalid, and will cause an error to be raised.
1649 --
1650 -- In (unlikely) bad cases, this function's time complexity degrades
1651 -- towards /O(n*m)/.
count :: Text -> Text -> Int64
count pat src
    | null pat  = emptyError "count"
    | otherwise = go 0 (indices pat src)
  -- Count the match offsets with a strict accumulator.
  where go !n []     = n
        go !n (_:xs) = go (n+1) xs
{-# INLINE [1] count #-}
1659
1660 {-# RULES
1661 "LAZY TEXT count/singleton -> countChar" [~1] forall c t.
1662 count (singleton c) t = countChar c t
1663 #-}
1664
1665 -- | /O(n)/ The 'countChar' function returns the number of times the
1666 -- query element appears in the given 'Text'. Subject to fusion.
countChar :: Char -> Text -> Int64
-- Implemented via stream fusion.
countChar c t = S.countChar c (stream t)
1669
1670 -- | /O(n)/ 'zip' takes two 'Text's and returns a list of
1671 -- corresponding pairs of bytes. If one input 'Text' is short,
1672 -- excess elements of the longer 'Text' are discarded. This is
1673 -- equivalent to a pair of 'unpack' operations.
zip :: Text -> Text -> [(Char,Char)]
-- Stream both texts and pair them up; stops at the shorter input.
zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
{-# INLINE [0] zip #-}
1677
1678 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
1679 -- given as the first argument, instead of a tupling function.
1680 -- Performs replacement on invalid scalar values.
zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
    -- 'safe' replaces any invalid scalar value produced by @f@.
    where g a b = safe (f a b)
{-# INLINE [0] zipWith #-}
1685
-- | Build a lazy 'Text' from a reversed list of strict chunks,
-- dropping any empty chunks via the 'chunk' smart constructor.
revChunks :: [T.Text] -> Text
revChunks cs = L.foldl' (\acc c -> chunk c acc) Empty cs
1688
-- | Raise an error reporting an invalid empty input to @fun@.
emptyError :: String -> a
emptyError fun = P.error ("Data.Text.Lazy." ++ fun ++ ": empty input")
1691
-- | Raise an error for a case the caller has argued is unreachable.
impossibleError :: String -> a
impossibleError fun = P.error ("Data.Text.Lazy." ++ fun ++ ": impossible case")