Further improve docs for replace
[packages/text.git] / Data / Text / Lazy.hs
1 {-# OPTIONS_GHC -fno-warn-orphans #-}
2 {-# LANGUAGE BangPatterns, MagicHash, CPP #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text.Lazy
12 -- Copyright : (c) 2009, 2010, 2012 Bryan O'Sullivan
13 --
14 -- License : BSD-style
15 -- Maintainer : bos@serpentine.com
16 -- Stability : experimental
17 -- Portability : GHC
18 --
19 -- A time and space-efficient implementation of Unicode text using
20 -- lists of packed arrays.
21 --
22 -- /Note/: Read below the synopsis for important notes on the use of
23 -- this module.
24 --
25 -- The representation used by this module is suitable for high
26 -- performance use and for streaming large quantities of data. It
27 -- provides a means to manipulate a large body of text without
28 -- requiring that the entire content be resident in memory.
29 --
30 -- Some operations, such as 'concat', 'append', 'reverse' and 'cons',
31 -- have better time complexity than their "Data.Text" equivalents, due
32 -- to the underlying representation being a list of chunks. For other
33 -- operations, lazy 'Text's are usually within a few percent of strict
34 -- ones, but often with better heap usage if used in a streaming
35 -- fashion. For data larger than available memory, or if you have
36 -- tight memory constraints, this module will be the only option.
37 --
38 -- This module is intended to be imported @qualified@, to avoid name
39 -- clashes with "Prelude" functions. eg.
40 --
41 -- > import qualified Data.Text.Lazy as L
42
43 module Data.Text.Lazy
44 (
45 -- * Fusion
46 -- $fusion
47
48 -- * Acceptable data
49 -- $replacement
50
51 -- * Types
52 Text
53
54 -- * Creation and elimination
55 , pack
56 , unpack
57 , singleton
58 , empty
59 , fromChunks
60 , toChunks
61 , toStrict
62 , fromStrict
63 , foldrChunks
64 , foldlChunks
65
66 -- * Basic interface
67 , cons
68 , snoc
69 , append
70 , uncons
71 , head
72 , last
73 , tail
74 , init
75 , null
76 , length
77 , compareLength
78
79 -- * Transformations
80 , map
81 , intercalate
82 , intersperse
83 , transpose
84 , reverse
85 , replace
86
87 -- ** Case conversion
88 -- $case
89 , toCaseFold
90 , toLower
91 , toUpper
92 , toTitle
93
94 -- ** Justification
95 , justifyLeft
96 , justifyRight
97 , center
98
99 -- * Folds
100 , foldl
101 , foldl'
102 , foldl1
103 , foldl1'
104 , foldr
105 , foldr1
106
107 -- ** Special folds
108 , concat
109 , concatMap
110 , any
111 , all
112 , maximum
113 , minimum
114
115 -- * Construction
116
117 -- ** Scans
118 , scanl
119 , scanl1
120 , scanr
121 , scanr1
122
123 -- ** Accumulating maps
124 , mapAccumL
125 , mapAccumR
126
127 -- ** Generation and unfolding
128 , replicate
129 , unfoldr
130 , unfoldrN
131
132 -- * Substrings
133
134 -- ** Breaking strings
135 , take
136 , takeEnd
137 , drop
138 , dropEnd
139 , takeWhile
140 , dropWhile
141 , dropWhileEnd
142 , dropAround
143 , strip
144 , stripStart
145 , stripEnd
146 , splitAt
147 , span
148 , breakOn
149 , breakOnEnd
150 , break
151 , group
152 , groupBy
153 , inits
154 , tails
155
156 -- ** Breaking into many substrings
157 -- $split
158 , splitOn
159 , split
160 , chunksOf
161 -- , breakSubstring
162
163 -- ** Breaking into lines and words
164 , lines
165 , words
166 , unlines
167 , unwords
168
169 -- * Predicates
170 , isPrefixOf
171 , isSuffixOf
172 , isInfixOf
173
174 -- ** View patterns
175 , stripPrefix
176 , stripSuffix
177 , commonPrefixes
178
179 -- * Searching
180 , filter
181 , find
182 , breakOnAll
183 , partition
184
185 -- , findSubstring
186
187 -- * Indexing
188 , index
189 , count
190
191 -- * Zipping and unzipping
192 , zip
193 , zipWith
194
195 -- -* Ordered text
196 -- , sort
197 ) where
198
199 import Prelude (Char, Bool(..), Maybe(..), String,
200 Eq(..), Ord(..), Ordering(..), Read(..), Show(..),
201 (&&), (||), (+), (-), (.), ($), (++),
202 error, flip, fmap, fromIntegral, not, otherwise, quot)
203 import qualified Prelude as P
204 #if defined(HAVE_DEEPSEQ)
205 import Control.DeepSeq (NFData(..))
206 #endif
207 import Data.Int (Int64)
208 import qualified Data.List as L
209 import Data.Char (isSpace)
210 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf))
211 import Data.Data (mkNoRepType)
212 import Data.Monoid (Monoid(..))
213 import Data.String (IsString(..))
214 import qualified Data.Text as T
215 import qualified Data.Text.Internal as T
216 import qualified Data.Text.Internal.Fusion.Common as S
217 import qualified Data.Text.Unsafe as T
218 import qualified Data.Text.Internal.Lazy.Fusion as S
219 import Data.Text.Internal.Fusion.Types (PairS(..))
220 import Data.Text.Internal.Lazy.Fusion (stream, unstream)
221 import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldlChunks, foldrChunks)
222 import Data.Text.Internal (firstf, safe, text)
223 import qualified Data.Text.Internal.Functions as F
224 import Data.Text.Internal.Lazy.Search (indices)
225 #if __GLASGOW_HASKELL__ >= 702
226 import qualified GHC.CString as GHC
227 #else
228 import qualified GHC.Base as GHC
229 #endif
230 #if __GLASGOW_HASKELL__ >= 708
231 import qualified GHC.Exts as Exts
232 #endif
233 import GHC.Prim (Addr#)
234
235 -- $fusion
236 --
237 -- Most of the functions in this module are subject to /fusion/,
238 -- meaning that a pipeline of such functions will usually allocate at
239 -- most one 'Text' value.
240 --
241 -- As an example, consider the following pipeline:
242 --
243 -- > import Data.Text.Lazy as T
244 -- > import Data.Text.Lazy.Encoding as E
245 -- > import Data.ByteString.Lazy (ByteString)
246 -- >
247 -- > countChars :: ByteString -> Int
248 -- > countChars = T.length . T.toUpper . E.decodeUtf8
249 --
250 -- From the type signatures involved, this looks like it should
251 -- allocate one 'ByteString' value, and two 'Text' values. However,
252 -- when a module is compiled with optimisation enabled under GHC, the
253 -- two intermediate 'Text' values will be optimised away, and the
254 -- function will be compiled down to a single loop over the source
255 -- 'ByteString'.
256 --
257 -- Functions that can be fused by the compiler are documented with the
258 -- phrase \"Subject to fusion\".
259
260 -- $replacement
261 --
262 -- A 'Text' value is a sequence of Unicode scalar values, as defined
263 -- in §3.9, definition D76 of the Unicode 5.2 standard:
264 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35>. As
265 -- such, a 'Text' cannot contain values in the range U+D800 to U+DFFF
266 -- inclusive. Haskell implementations admit all Unicode code points
267 -- (&#xa7;3.4, definition D10) as 'Char' values, including code points
268 -- from this invalid range. This means that there are some 'Char'
269 -- values that are not valid Unicode scalar values, and the functions
270 -- in this module must handle those cases.
271 --
272 -- Within this module, many functions construct a 'Text' from one or
273 -- more 'Char' values. Those functions will substitute 'Char' values
274 -- that are not valid Unicode scalar values with the replacement
275 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
276 -- inspection and replacement are documented with the phrase
277 -- \"Performs replacement on invalid scalar values\".
278 --
279 -- (One reason for this policy of replacement is that internally, a
280 -- 'Text' value is represented as packed UTF-16 data. Values in the
281 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
282 -- code points, and so cannot be represented. The functions replace
283 -- invalid scalar values, instead of dropping them, as a security
284 -- measure. For details, see Unicode Technical Report 36, &#xa7;3.5:
285 -- <http://unicode.org/reports/tr36#Deletion_of_Noncharacters>)
286
-- Chunk-boundary-agnostic equality.  Two lazy 'Text's containing the
-- same characters may carve them into different chunk lists, so the
-- head chunks are compared on their common Word16-length prefix and
-- the remainder of the longer chunk is pushed back onto its tail for
-- the recursive comparison.
287 equal :: Text -> Text -> Bool
288 equal Empty Empty = True
289 equal Empty _ = False
290 equal _ Empty = False
291 equal (Chunk a as) (Chunk b bs) =
292 case compare lenA lenB of
293 LT -> a == (T.takeWord16 lenA b) &&
294 as `equal` Chunk (T.dropWord16 lenA b) bs
295 EQ -> a == b && as `equal` bs
296 GT -> T.takeWord16 lenB a == b &&
297 Chunk (T.dropWord16 lenB a) as `equal` bs
-- lenA/lenB are Word16 (code unit) lengths, not character counts;
-- splitting on code-unit indices is safe here because equal texts
-- have identical code-unit sequences.
298 where lenA = T.lengthWord16 a
299 lenB = T.lengthWord16 b
300
301 instance Eq Text where
302 (==) = equal
303 {-# INLINE (==) #-}
304
305 instance Ord Text where
306 compare = compareText
307
-- Ordering that, like 'equal', is insensitive to chunk boundaries:
-- 'go' walks both head chunks code point by code point via 'T.iter'
-- (di/dj are the code-unit widths of the decoded characters), and
-- when either chunk is exhausted the unread suffix of the other is
-- re-wrapped with 'chunk' and compared against the remaining tails.
308 compareText :: Text -> Text -> Ordering
309 compareText Empty Empty = EQ
310 compareText Empty _ = LT
311 compareText _ Empty = GT
312 compareText (Chunk a0 as) (Chunk b0 bs) = outer a0 b0
313 where
314 outer ta@(T.Text arrA offA lenA) tb@(T.Text arrB offB lenB) = go 0 0
315 where
316 go !i !j
317 | i >= lenA = compareText as (chunk (T.Text arrB (offB+j) (lenB-j)) bs)
318 | j >= lenB = compareText (chunk (T.Text arrA (offA+i) (lenA-i)) as) bs
319 | a < b = LT
320 | a > b = GT
321 | otherwise = go (i+di) (j+dj)
322 where T.Iter a di = T.iter ta i
323 T.Iter b dj = T.iter tb j
324
-- 'Show' and 'Read' round-trip through 'String' via 'unpack'/'pack',
-- so they render and parse exactly like the String instances.
325 instance Show Text where
326 showsPrec p ps r = showsPrec p (unpack ps) r
327
328 instance Read Text where
329 readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
330
331 instance Monoid Text where
332 mempty = empty
333 mappend = append
334 mconcat = concat
335
336 instance IsString Text where
337 fromString = pack
338
339 #if __GLASGOW_HASKELL__ >= 708
-- Enables list literals for 'Text' under OverloadedLists (GHC >= 7.8).
340 instance Exts.IsList Text where
341 type Item Text = Char
342 fromList = pack
343 toList = unpack
344 #endif
345
346 #if defined(HAVE_DEEPSEQ)
-- Forcing the chunk spine suffices: each strict chunk's array is
-- already fully evaluated by construction.
347 instance NFData Text where
348 rnf Empty = ()
349 rnf (Chunk _ ts) = rnf ts
350 #endif
351
-- Opaque 'Data' instance: 'Text' is presented via a virtual
-- pack/unpack view rather than its real constructors, so generic
-- traversals see the characters but cannot rebuild invalid chunks.
-- 'toConstr'/'gunfold' are deliberately unsupported.
352 instance Data Text where
353 gfoldl f z txt = z pack `f` (unpack txt)
354 toConstr _ = error "Data.Text.Lazy.Text.toConstr"
355 gunfold _ _ = error "Data.Text.Lazy.Text.gunfold"
356 dataTypeOf _ = mkNoRepType "Data.Text.Lazy.Text"
357
358 -- | /O(n)/ Convert a 'String' into a 'Text'.
359 --
360 -- Subject to fusion. Performs replacement on invalid scalar values.
361 pack :: String -> Text
-- 'safe' replaces surrogate code points with U+FFFD before streaming.
362 pack = unstream . S.streamList . L.map safe
363 {-# INLINE [1] pack #-}
364
365 -- | /O(n)/ Convert a 'Text' into a 'String'.
366 -- Subject to fusion.
367 unpack :: Text -> String
368 unpack t = S.unstreamList (stream t)
369 {-# INLINE [1] unpack #-}
370
371 -- | /O(n)/ Convert a literal string into a Text.
-- NOINLINE keeps this function intact so the RULES below can target
-- it; inlining would defeat the literal-rewriting optimisation.
372 unpackCString# :: Addr# -> Text
373 unpackCString# addr# = unstream (S.streamCString# addr#)
374 {-# NOINLINE unpackCString# #-}
375
-- The rules below recognise the form GHC desugars string literals
-- into (after 'pack' inlines) and rewrite it to decode the literal's
-- bytes directly, or to Empty / a singleton chunk for trivial cases.
376 {-# RULES "TEXT literal" forall a.
377 unstream (S.streamList (L.map safe (GHC.unpackCString# a)))
378 = unpackCString# a #-}
379
380 {-# RULES "TEXT literal UTF8" forall a.
381 unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a)))
382 = unpackCString# a #-}
383
384 {-# RULES "LAZY TEXT empty literal"
385 unstream (S.streamList (L.map safe []))
386 = Empty #-}
387
388 {-# RULES "LAZY TEXT empty literal" forall a.
389 unstream (S.streamList (L.map safe [a]))
390 = Chunk (T.singleton a) Empty #-}
391
392 -- | /O(1)/ Convert a character into a Text. Subject to fusion.
393 -- Performs replacement on invalid scalar values.
394 singleton :: Char -> Text
395 singleton c = Chunk (T.singleton c) Empty
396 {-# INLINE [1] singleton #-}
397
-- Standard fused/unfused rule pair: before phase 1 rewrite to the
-- stream form so it can fuse with neighbours; at phase 1 rewrite any
-- leftover unfused stream form back to the direct implementation.
398 {-# RULES
399 "LAZY TEXT singleton -> fused" [~1] forall c.
400 singleton c = unstream (S.singleton c)
401 "LAZY TEXT singleton -> unfused" [1] forall c.
402 unstream (S.singleton c) = singleton c
403 #-}
404
-- | /O(c)/ Convert a list of strict 'T.Text's into a lazy 'Text'.
-- Chunks are consed via the 'chunk' smart constructor (which
-- presumably normalises empty chunks away -- defined in
-- "Data.Text.Internal.Lazy"; confirm there).
fromChunks :: [T.Text] -> Text
fromChunks = L.foldr chunk Empty

-- | /O(n)/ Convert a lazy 'Text' into a list of strict 'T.Text's.
toChunks :: Text -> [T.Text]
toChunks = foldrChunks (:) []
412
413 -- | /O(n)/ Convert a lazy 'Text' into a strict 'T.Text'.
-- Copies all chunk data into one contiguous array, and forces the
-- entire lazy spine in the process.
414 toStrict :: Text -> T.Text
415 toStrict t = T.concat (toChunks t)
416 {-# INLINE [1] toStrict #-}
417
418 -- | /O(c)/ Convert a strict 'T.Text' into a lazy 'Text'.
-- No copying: the strict text's array is shared as a single chunk.
419 fromStrict :: T.Text -> Text
420 fromStrict t = chunk t Empty
421 {-# INLINE [1] fromStrict #-}
422
423 -- -----------------------------------------------------------------------------
424 -- * Basic functions
425
426 -- | /O(1)/ Adds a character to the front of a 'Text'. Unlike the
427 -- strict version, no array copying is needed: the character simply
428 -- becomes a new singleton chunk in front. Subject to fusion.
429 cons :: Char -> Text -> Text
430 cons c t = Chunk (T.singleton c) t
431 {-# INLINE [1] cons #-}
432
433 infixr 5 `cons`
434
435 {-# RULES
436 "LAZY TEXT cons -> fused" [~1] forall c t.
437 cons c t = unstream (S.cons c (stream t))
438 "LAZY TEXT cons -> unfused" [1] forall c t.
439 unstream (S.cons c (stream t)) = cons c t
440 #-}
441
442 -- | /O(n\/c)/ Adds a character to the end of a 'Text'. The chunk
443 -- spine is rebuilt (chunk arrays are shared), unless fused. Subject to fusion.
444 snoc :: Text -> Char -> Text
445 snoc t c = foldrChunks Chunk (singleton c) t
446 {-# INLINE [1] snoc #-}
447
448 {-# RULES
449 "LAZY TEXT snoc -> fused" [~1] forall t c.
450 snoc t c = unstream (S.snoc (stream t) c)
451 "LAZY TEXT snoc -> unfused" [1] forall t c.
452 unstream (S.snoc (stream t) c) = snoc t c
453 #-}
454
455 -- | /O(n\/c)/ Appends one 'Text' to another. Subject to fusion.
-- Only the chunk spine of @xs@ is copied; its arrays are shared, and
-- @ys@ is reused as-is.
456 append :: Text -> Text -> Text
457 append xs ys = foldrChunks Chunk ys xs
458 {-# INLINE [1] append #-}
459
460 {-# RULES
461 "LAZY TEXT append -> fused" [~1] forall t1 t2.
462 append t1 t2 = unstream (S.append (stream t1) (stream t2))
463 "LAZY TEXT append -> unfused" [1] forall t1 t2.
464 unstream (S.append (stream t1) (stream t2)) = append t1 t2
465 #-}
466
467 -- | /O(1)/ Returns the first character and rest of a 'Text', or
468 -- 'Nothing' if empty. Subject to fusion.
469 uncons :: Text -> Maybe (Char, Text)
470 uncons Empty = Nothing
-- unsafeHead is fine here: a Chunk is never empty by invariant.
471 uncons (Chunk t ts) = Just (T.unsafeHead t, ts')
-- When the head chunk held exactly one character, drop the whole
-- chunk rather than leaving an empty one (preserves the
-- no-empty-chunk invariant).
472 where ts' | T.compareLength t 1 == EQ = ts
473 | otherwise = Chunk (T.unsafeTail t) ts
474 {-# INLINE uncons #-}
475
476 -- | /O(1)/ Returns the first character of a 'Text', which must be
477 -- non-empty. Subject to fusion.
478 head :: Text -> Char
479 head t = S.head (stream t)
480 {-# INLINE head #-}
481
482 -- | /O(1)/ Returns all characters after the head of a 'Text', which
483 -- must be non-empty. Subject to fusion.
484 tail :: Text -> Text
-- 'chunk' (smart constructor) drops the head chunk if T.tail empties it.
485 tail (Chunk t ts) = chunk (T.tail t) ts
486 tail Empty = emptyError "tail"
487 {-# INLINE [1] tail #-}
488
489 {-# RULES
490 "LAZY TEXT tail -> fused" [~1] forall t.
491 tail t = unstream (S.tail (stream t))
492 "LAZY TEXT tail -> unfused" [1] forall t.
493 unstream (S.tail (stream t)) = tail t
494 #-}
495
496 -- | /O(n\/c)/ Returns all but the last character of a 'Text', which must
497 -- be non-empty. The chunk spine is walked and rebuilt to reach the
498 -- final chunk (arrays are shared). Subject to fusion.
498 init :: Text -> Text
499 init (Chunk t0 ts0) = go t0 ts0
500 where go t (Chunk t' ts) = Chunk t (go t' ts)
501 go t Empty = chunk (T.init t) Empty
502 init Empty = emptyError "init"
503 {-# INLINE [1] init #-}
504
505 {-# RULES
506 "LAZY TEXT init -> fused" [~1] forall t.
507 init t = unstream (S.init (stream t))
508 "LAZY TEXT init -> unfused" [1] forall t.
509 unstream (S.init (stream t)) = init t
510 #-}
511
512 -- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
513 -- fusion.
-- Relies on the invariant that chunks are never empty, so a
-- non-Empty spine always contains at least one character.
514 null :: Text -> Bool
515 null Empty = True
516 null _ = False
517 {-# INLINE [1] null #-}
518
519 {-# RULES
520 "LAZY TEXT null -> fused" [~1] forall t.
521 null t = S.null (stream t)
522 "LAZY TEXT null -> unfused" [1] forall t.
523 S.null (stream t) = null t
524 #-}
525
526 -- | /O(1)/ Tests whether a 'Text' contains exactly one character.
527 -- Subject to fusion.
-- Not exported from this module; used internally by 'replicate' to
-- dispatch to the cheaper 'replicateChar'.
528 isSingleton :: Text -> Bool
529 isSingleton = S.isSingleton . stream
530 {-# INLINE isSingleton #-}
531
532 -- | /O(n\/c)/ Returns the last character of a 'Text', which must be
533 -- non-empty. Walks the chunk list to its final chunk. Subject to fusion.
534 last :: Text -> Char
535 last Empty = emptyError "last"
536 last (Chunk t ts) = go t ts
537 where go _ (Chunk t' ts') = go t' ts'
538 go t' Empty = T.last t'
539 {-# INLINE [1] last #-}
540
541 {-# RULES
542 "LAZY TEXT last -> fused" [~1] forall t.
543 last t = S.last (stream t)
544 "LAZY TEXT last -> unfused" [1] forall t.
545 S.last (stream t) = last t
546 #-}
547
548 -- | /O(n)/ Returns the number of characters in a 'Text'.
549 -- Subject to fusion.
-- Sums the strict chunk lengths; forces the whole spine.
550 length :: Text -> Int64
551 length = foldlChunks go 0
552 where go l t = l + fromIntegral (T.length t)
553 {-# INLINE [1] length #-}
554
555 {-# RULES
556 "LAZY TEXT length -> fused" [~1] forall t.
557 length t = S.length (stream t)
558 "LAZY TEXT length -> unfused" [1] forall t.
559 S.length (stream t) = length t
560 #-}
561
562 -- | /O(n)/ Compare the count of characters in a 'Text' to a number.
563 -- Subject to fusion.
564 --
565 -- This function gives the same answer as comparing against the result
566 -- of 'length', but can short circuit if the count of characters is
567 -- greater than the number, and hence be more efficient.
568 compareLength :: Text -> Int64 -> Ordering
569 compareLength t n = S.compareLengthI (stream t) n
570 {-# INLINE [1] compareLength #-}
571
572 -- We don't apply those otherwise appealing length-to-compareLength
573 -- rewrite rules here, because they can change the strictness
574 -- properties of code.
575
576 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
577 -- each element of @t@. Subject to fusion. Performs replacement on
578 -- invalid scalar values.
579 map :: (Char -> Char) -> Text -> Text
-- @safe . f@: replacement is applied to f's *result*, so a function
-- producing a surrogate code point yields U+FFFD instead.
580 map f t = unstream (S.map (safe . f) (stream t))
581 {-# INLINE [1] map #-}
582
583 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
584 -- 'Text's and concatenates the list after interspersing the first
585 -- argument between each element of the list.
586 intercalate :: Text -> [Text] -> Text
587 intercalate t = concat . (F.intersperse t)
588 {-# INLINE intercalate #-}
589
590 -- | /O(n)/ The 'intersperse' function takes a character and places it
591 -- between the characters of a 'Text'. Subject to fusion. Performs
592 -- replacement on invalid scalar values.
593 intersperse :: Char -> Text -> Text
594 intersperse c t = unstream (S.intersperse (safe c) (stream t))
595 {-# INLINE intersperse #-}
596
597 -- | /O(n)/ Left-justify a string to the given length, using the
598 -- specified fill character on the right. Subject to fusion. Performs
599 -- replacement on invalid scalar values.
600 --
601 -- Examples:
602 --
603 -- > justifyLeft 7 'x' "foo" == "fooxxxx"
604 -- > justifyLeft 3 'x' "foobar" == "foobar"
605 justifyLeft :: Int64 -> Char -> Text -> Text
606 justifyLeft k c t
607 | len >= k = t
608 | otherwise = t `append` replicateChar (k-len) c
609 where len = length t
610 {-# INLINE [1] justifyLeft #-}
611
612 {-# RULES
613 "LAZY TEXT justifyLeft -> fused" [~1] forall k c t.
614 justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
615 "LAZY TEXT justifyLeft -> unfused" [1] forall k c t.
616 unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
617 #-}
618
619 -- | /O(n)/ Right-justify a string to the given length, using the
620 -- specified fill character on the left. Performs replacement on
621 -- invalid scalar values.
622 --
623 -- Examples:
624 --
625 -- > justifyRight 7 'x' "bar" == "xxxxbar"
626 -- > justifyRight 3 'x' "foobar" == "foobar"
-- Unlike 'justifyLeft', there is no fusion rule for this function
-- (plain INLINE, no phase-controlled RULES pair).
627 justifyRight :: Int64 -> Char -> Text -> Text
628 justifyRight k c t
629 | len >= k = t
630 | otherwise = replicateChar (k-len) c `append` t
631 where len = length t
632 {-# INLINE justifyRight #-}
633
-- | /O(n)/ Center a string to the given length, using the specified
-- fill character on either side.  Performs replacement on invalid
-- scalar values.
--
-- When the amount of padding is odd, the extra fill character goes on
-- the left.
--
-- Examples:
--
-- > center 8 'x' "HS" == "xxxHSxxx"
center :: Int64 -> Char -> Text -> Text
center k c t
  | n >= k    = t
  | otherwise = leftPad `append` t `append` rightPad
  where
    n        = length t
    missing  = k - n
    right    = missing `quot` 2
    left     = missing - right
    leftPad  = replicateChar left c
    rightPad = replicateChar right c
{-# INLINE center #-}
650
-- | /O(n)/ The 'transpose' function transposes the rows and columns
-- of its 'Text' argument.  Note that this function round-trips each
-- row through 'unpack', the list 'L.transpose' and 'T.pack', and is
-- thus not very efficient.
transpose :: [Text] -> [Text]
transpose ts = L.map rebuild (L.transpose (L.map unpack ts))
  where
    -- Each transposed row becomes a single-chunk lazy text.
    rebuild s = Chunk (T.pack s) Empty
-- TODO: make this fast

-- | /O(n)/ 'reverse' @t@ returns the elements of @t@ in reverse order.
-- Each chunk is reversed with 'T.reverse' while the chunk list itself
-- is reversed by accumulation.
reverse :: Text -> Text
reverse = go Empty
  where
    go acc Empty        = acc
    go acc (Chunk t ts) = go (Chunk (T.reverse t) acc) ts
665
666 -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
667 -- @haystack@ with @replacement@.
668 --
669 -- This function behaves as though it was defined as follows:
670 --
671 -- @
672 -- replace needle replacement haystack =
673 -- 'intercalate' replacement ('splitOn' needle haystack)
674 -- @
675 --
676 -- As this suggests, each occurrence is replaced exactly once. So if
677 -- @needle@ occurs in @replacement@, that occurrence will /not/ itself
678 -- be replaced recursively:
679 --
680 -- > replace "oo" "foo" "oo" == "foo"
681 --
682 -- In cases where several instances of @needle@ overlap, only the
683 -- first one will be replaced:
684 --
685 -- > replace "ofo" "bar" "ofofo" == "barfo"
686 --
687 -- In (unlikely) bad cases, this function's time complexity degrades
688 -- towards /O(n*m)/.
-- NOTE(review): an empty @needle@ has no meaningful result here — the
-- behaviour is whatever 'splitOn' (defined later in this file) does
-- with an empty pattern; presumably an error. Confirm and document
-- that precondition in the haddock above.
689 replace :: Text -- ^ Text to search for
690 -> Text -- ^ Replacement text
691 -> Text -- ^ Input text
692 -> Text
693 replace s d = intercalate d . splitOn s
694 {-# INLINE replace #-}
695
696 -- ----------------------------------------------------------------------------
697 -- ** Case conversions (folds)
698
699 -- $case
700 --
701 -- With Unicode text, it is incorrect to use combinators like @map
702 -- toUpper@ to case convert each character of a string individually.
703 -- Instead, use the whole-string case conversion functions from this
704 -- module. For correctness in different writing systems, these
705 -- functions may map one input character to two or three output
706 -- characters.
707
708 -- | /O(n)/ Convert a string to folded case. Subject to fusion.
709 --
710 -- This function is mainly useful for performing caseless (or case
711 -- insensitive) string comparisons.
712 --
713 -- A string @x@ is a caseless match for a string @y@ if and only if:
714 --
715 -- @toCaseFold x == toCaseFold y@
716 --
717 -- The result string may be longer than the input string, and may
718 -- differ from applying 'toLower' to the input string. For instance,
719 -- the Armenian small ligature men now (U+FB13) is case folded to the
720 -- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
721 -- case folded to the Greek small letter mu (U+03BC) instead of
722 -- itself.
723 toCaseFold :: Text -> Text
724 toCaseFold t = unstream (S.toCaseFold (stream t))
725 {-# INLINE [0] toCaseFold #-}
726
727 -- | /O(n)/ Convert a string to lower case, using simple case
728 -- conversion. Subject to fusion.
729 --
730 -- The result string may be longer than the input string. For
731 -- instance, the Latin capital letter I with dot above (U+0130) maps
732 -- to the sequence Latin small letter i (U+0069) followed by combining
733 -- dot above (U+0307).
734 toLower :: Text -> Text
735 toLower t = unstream (S.toLower (stream t))
736 {-# INLINE toLower #-}
737
738 -- | /O(n)/ Convert a string to upper case, using simple case
739 -- conversion. Subject to fusion.
740 --
741 -- The result string may be longer than the input string. For
742 -- instance, the German eszett (U+00DF) maps to the two-letter
743 -- sequence SS.
744 toUpper :: Text -> Text
745 toUpper t = unstream (S.toUpper (stream t))
746 {-# INLINE toUpper #-}
747
748
749 -- | /O(n)/ Convert a string to title case, using simple case
750 -- conversion. Subject to fusion.
751 --
752 -- The first letter of the input is converted to title case, as is
753 -- every subsequent letter that immediately follows a non-letter.
754 -- Every letter that immediately follows another letter is converted
755 -- to lower case.
756 --
757 -- The result string may be longer than the input string. For example,
758 -- the Latin small ligature &#xfb02; (U+FB02) is converted to the
759 -- sequence Latin capital letter F (U+0046) followed by Latin small
760 -- letter l (U+006C).
761 --
762 -- /Note/: this function does not take language or culture specific
763 -- rules into account. For instance, in English, different style
764 -- guides disagree on whether the book name \"The Hill of the Red
765 -- Fox\" is correctly title cased&#x2014;but this function will
766 -- capitalize /every/ word.
-- All three conversions delegate to whole-stream transformers from
-- "Data.Text.Internal.Fusion.Common", per the note in $case above.
767 toTitle :: Text -> Text
768 toTitle t = unstream (S.toTitle (stream t))
769 {-# INLINE toTitle #-}
770
771 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
772 -- (typically the left-identity of the operator), and a 'Text',
773 -- reduces the 'Text' using the binary operator, from left to right.
774 -- Subject to fusion.
-- NOTE: like list 'L.foldl', the lazy accumulator can build thunks;
-- prefer 'foldl'' for strict accumulation.
775 foldl :: (a -> Char -> a) -> a -> Text -> a
776 foldl f z t = S.foldl f z (stream t)
777 {-# INLINE foldl #-}
778
779 -- | /O(n)/ A strict version of 'foldl'.
780 -- Subject to fusion.
781 foldl' :: (a -> Char -> a) -> a -> Text -> a
782 foldl' f z t = S.foldl' f z (stream t)
783 {-# INLINE foldl' #-}
784
785 -- | /O(n)/ A variant of 'foldl' that has no starting value argument,
786 -- and thus must be applied to a non-empty 'Text'. Subject to fusion.
787 foldl1 :: (Char -> Char -> Char) -> Text -> Char
788 foldl1 f t = S.foldl1 f (stream t)
789 {-# INLINE foldl1 #-}
790
791 -- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
792 foldl1' :: (Char -> Char -> Char) -> Text -> Char
793 foldl1' f t = S.foldl1' f (stream t)
794 {-# INLINE foldl1' #-}
795
796 -- | /O(n)/ 'foldr', applied to a binary operator, a starting value
797 -- (typically the right-identity of the operator), and a 'Text',
798 -- reduces the 'Text' using the binary operator, from right to left.
799 -- Subject to fusion.
800 foldr :: (Char -> a -> a) -> a -> Text -> a
801 foldr f z t = S.foldr f z (stream t)
802 {-# INLINE foldr #-}
803
804 -- | /O(n)/ A variant of 'foldr' that has no starting value argument,
805 -- and thus must be applied to a non-empty 'Text'. Subject to
806 -- fusion.
807 foldr1 :: (Char -> Char -> Char) -> Text -> Char
808 foldr1 f t = S.foldr1 f (stream t)
809 {-# INLINE foldr1 #-}
810
811 -- | /O(n)/ Concatenate a list of 'Text's.
-- Mutually recursive 'go'/'to' splice the chunk spines of successive
-- texts together directly, sharing all chunk arrays and avoiding the
-- quadratic behaviour of repeated 'append'.
812 concat :: [Text] -> Text
813 concat = to
814 where
815 go Empty css = to css
816 go (Chunk c cs) css = Chunk c (go cs css)
817 to [] = Empty
818 to (cs:css) = go cs css
819 {-# INLINE concat #-}
820
821 -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
822 -- concatenate the results.
823 concatMap :: (Char -> Text) -> Text -> Text
824 concatMap f = concat . foldr ((:) . f) []
825 {-# INLINE concatMap #-}
826
827 -- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
828 -- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
829 any :: (Char -> Bool) -> Text -> Bool
830 any p t = S.any p (stream t)
831 {-# INLINE any #-}
832
833 -- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
834 -- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
835 all :: (Char -> Bool) -> Text -> Bool
836 all p t = S.all p (stream t)
837 {-# INLINE all #-}
838
839 -- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
840 -- must be non-empty. Subject to fusion.
841 maximum :: Text -> Char
842 maximum t = S.maximum (stream t)
843 {-# INLINE maximum #-}
844
845 -- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
846 -- must be non-empty. Subject to fusion.
847 minimum :: Text -> Char
848 minimum t = S.minimum (stream t)
849 {-# INLINE minimum #-}
850
851 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
852 -- successive reduced values from the left. Subject to fusion.
853 -- Performs replacement on invalid scalar values.
854 --
855 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
856 --
857 -- Note that
858 --
859 -- > last (scanl f z xs) == foldl f z xs.
860 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
861 scanl f z t = unstream (S.scanl g z (stream t))
862 where g a b = safe (f a b)
863 {-# INLINE scanl #-}
864
865 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
866 -- value argument. Subject to fusion. Performs replacement on
867 -- invalid scalar values.
868 --
869 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
870 scanl1 :: (Char -> Char -> Char) -> Text -> Text
871 scanl1 f t0 = case uncons t0 of
872 Nothing -> empty
873 Just (t,ts) -> scanl f t ts
874 {-# INLINE scanl1 #-}
875
876 -- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
877 -- replacement on invalid scalar values.
878 --
879 -- > scanr f v == reverse . scanl (flip f) v . reverse
-- Implemented literally as the double-reverse equation above, with
-- the operator's arguments flipped inside 'g'; this forces the whole
-- input, so unlike 'scanl' it is not subject to fusion.
880 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
881 scanr f v = reverse . scanl g v . reverse
882 where g a b = safe (f b a)
883
884 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
885 -- value argument. Performs replacement on invalid scalar values.
886 scanr1 :: (Char -> Char -> Char) -> Text -> Text
887 scanr1 f t | null t = empty
888 | otherwise = scanr f (last t) (init t)
889
890 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
891 -- function to each element of a 'Text', passing an accumulating
892 -- parameter from left to right, and returns a final 'Text'. Performs
893 -- replacement on invalid scalar values.
894 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
895 mapAccumL f = go
896 where
-- z' is the accumulator after the current chunk; z'' after the tail.
897 go z (Chunk c cs) = (z'', Chunk c' cs')
898 where (z', c') = T.mapAccumL f z c
899 (z'', cs') = go z' cs
900 go z Empty = (z, Empty)
901 {-# INLINE mapAccumL #-}
902
903 -- | The 'mapAccumR' function behaves like a combination of 'map' and
904 -- a strict 'foldr'; it applies a function to each element of a
905 -- 'Text', passing an accumulating parameter from right to left, and
906 -- returning a final value of this accumulator together with the new
907 -- 'Text'. Performs replacement on invalid scalar values.
908 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
909 mapAccumR f = go
910 where
-- Mirror image of mapAccumL: the tail is processed first (yielding
-- z'), then the current chunk is folded right-to-left from z'.
911 go z (Chunk c cs) = (z'', Chunk c' cs')
912 where (z'', c') = T.mapAccumR f z' c
913 (z', cs') = go z cs
914 go z Empty = (z, Empty)
915 {-# INLINE mapAccumR #-}
916
917 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
918 -- @t@ repeated @n@ times.
919 replicate :: Int64 -> Text -> Text
920 replicate n t
921 | null t || n <= 0 = empty
-- Single-character input dispatches to the cheaper fused primitive.
922 | isSingleton t = replicateChar n (head t)
-- 'rep' builds a list of n references to the *same* t; 'concat' then
-- splices the spines, so chunk arrays are shared across repetitions.
923 | otherwise = concat (rep 0)
924 where rep !i | i >= n = []
925 | otherwise = t : rep (i+1)
926 {-# INLINE [1] replicate #-}
927
928 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
929 -- value of every element. Subject to fusion.
930 replicateChar :: Int64 -> Char -> Text
931 replicateChar n c = unstream (S.replicateCharI n (safe c))
932 {-# INLINE replicateChar #-}
933
934 {-# RULES
935 "LAZY TEXT replicate/singleton -> replicateChar" [~1] forall n c.
936 replicate n (singleton c) = replicateChar n c
937 #-}
938
939 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
940 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
941 -- 'Text' from a seed value. The function takes the element and
942 -- returns 'Nothing' if it is done producing the 'Text', otherwise
943 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
944 -- string, and @b@ is the seed value for further production. Performs
945 -- replacement on invalid scalar values.
946 unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
-- 'firstf safe' applies replacement to the produced character only,
-- leaving the seed untouched.
947 unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
948 {-# INLINE unfoldr #-}
949
950 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
951 -- value. However, the length of the result should be limited by the
952 -- first argument to 'unfoldrN'. This function is more efficient than
953 -- 'unfoldr' when the maximum length of the result is known and
954 -- correct, otherwise its performance is similar to 'unfoldr'.
955 -- Performs replacement on invalid scalar values.
956 unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Text
957 unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
958 {-# INLINE unfoldrN #-}
959
-- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
-- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
-- the length of the Text. Subject to fusion.
take :: Int64 -> Text -> Text
take i _ | i <= 0 = Empty
take i t0         = take' i t0
        -- Walk the chunk list, keeping whole chunks (shared, not
        -- copied) until the count falls inside a chunk, which is then
        -- split.  Lazy in the tail, so it works on unbounded input.
  where take' 0 _            = Empty
        take' _ Empty        = Empty
        take' n (Chunk t ts)
            | n < len   = Chunk (T.take (fromIntegral n) t) Empty
            | otherwise = Chunk t (take' (n - len) ts)
            where len = fromIntegral (T.length t)
{-# INLINE [1] take #-}

-- Fuse in early phases ([~1]); fall back to the chunk-level
-- implementation in phase 1 if fusion did not eliminate the stream.
{-# RULES
"LAZY TEXT take -> fused" [~1] forall n t.
    take n t = unstream (S.take n (stream t))
"LAZY TEXT take -> unfused" [1] forall n t.
    unstream (S.take n (stream t)) = take n t
  #-}
980
-- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after
-- taking @n@ characters from the end of @t@.
--
-- Examples:
--
-- > takeEnd 3 "foobar" == "bar"
takeEnd :: Int64 -> Text -> Text
takeEnd n t0
    | n <= 0    = empty
    -- Work over the reversed chunk list so counting starts from the
    -- end; whole chunks are re-consed (shared) onto the accumulator
    -- until the count falls inside a chunk, which is then split.
    | otherwise = takeChunk n empty . L.reverse . toChunks $ t0
  where takeChunk _ acc [] = acc
        takeChunk i acc (t:ts)
          | i <= l    = chunk (T.takeEnd (fromIntegral i) t) acc
          | otherwise = takeChunk (i-l) (Chunk t acc) ts
          where l = fromIntegral (T.length t)
996
-- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
-- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
-- is greater than the length of the 'Text'. Subject to fusion.
drop :: Int64 -> Text -> Text
drop i t0
    | i <= 0    = t0
    | otherwise = drop' i t0
        -- Skip whole chunks until the count falls inside one; the
        -- remainder of the chunk list is reused as-is (no copying).
  where drop' 0 ts           = ts
        drop' _ Empty        = Empty
        drop' n (Chunk t ts)
            | n < len   = Chunk (T.drop (fromIntegral n) t) ts
            | otherwise = drop' (n - len) ts
            where len = fromIntegral (T.length t)
{-# INLINE [1] drop #-}

-- Fuse in early phases ([~1]); revert to the chunk-level version in
-- phase 1 if the stream form survives unfused.
{-# RULES
"LAZY TEXT drop -> fused" [~1] forall n t.
    drop n t = unstream (S.drop n (stream t))
"LAZY TEXT drop -> unfused" [1] forall n t.
    unstream (S.drop n (stream t)) = drop n t
  #-}
1018
-- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
-- dropping @n@ characters from the end of @t@.
--
-- Examples:
--
-- > dropEnd 3 "foobar" == "foo"
dropEnd :: Int64 -> Text -> Text
dropEnd n t0
    | n <= 0    = t0
    -- Discard whole chunks from the reversed chunk list until the
    -- count falls inside a chunk; trim that chunk, then re-reverse to
    -- restore the original order.
    | otherwise = dropChunk n . L.reverse . toChunks $ t0
  where dropChunk _ [] = empty
        dropChunk m (t:ts)
          | m >= l    = dropChunk (m-l) ts
          | otherwise = fromChunks . L.reverse $
                        T.dropEnd (fromIntegral m) t : ts
          where l = fromIntegral (T.length t)
1035
-- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16'
-- values dropped, or the empty 'Text' if @n@ is greater than the
-- number of 'Word16' values present.
--
-- Internal helper: counts UTF-16 code units, not characters.
-- NOTE(review): dropping inside a surrogate pair would yield invalid
-- text, so @n@ must land on a character boundary — callers appear to
-- pass counts derived from whole-pattern lengths; confirm at call
-- sites.
dropWords :: Int64 -> Text -> Text
dropWords i t0
    | i <= 0    = t0
    | otherwise = drop' i t0
  where drop' 0 ts           = ts
        drop' _ Empty        = Empty
        -- Slice the chunk's underlying array by adjusting offset and
        -- length; 'chunk' drops the head if the slice is empty.
        drop' n (Chunk (T.Text arr off len) ts)
            | n < len'  = chunk (text arr (off+n') (len-n')) ts
            | otherwise = drop' (n - len') ts
            where len' = fromIntegral len
                  n'   = fromIntegral n
1050
-- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
-- returns the longest prefix (possibly empty) of elements that
-- satisfy @p@. Subject to fusion.
takeWhile :: (Char -> Bool) -> Text -> Text
takeWhile p t0 = takeWhile' t0
  where takeWhile' Empty        = Empty
        takeWhile' (Chunk t ts) =
          -- Find the first failing character in this chunk: if there
          -- is one, keep only the part before it and stop; otherwise
          -- keep the whole chunk and continue lazily into the tail.
          case T.findIndex (not . p) t of
            Just n | n > 0     -> Chunk (T.take n t) Empty
                   | otherwise -> Empty
            Nothing            -> Chunk t (takeWhile' ts)
{-# INLINE [1] takeWhile #-}

-- Fuse in early phases ([~1]); undo in phase 1 if fusion failed.
{-# RULES
"LAZY TEXT takeWhile -> fused" [~1] forall p t.
    takeWhile p t = unstream (S.takeWhile p (stream t))
"LAZY TEXT takeWhile -> unfused" [1] forall p t.
    unstream (S.takeWhile p (stream t)) = takeWhile p t
  #-}
1070
-- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
-- 'takeWhile' @p@ @t@. Subject to fusion.
dropWhile :: (Char -> Bool) -> Text -> Text
dropWhile p t0 = dropWhile' t0
  where dropWhile' Empty        = Empty
        dropWhile' (Chunk t ts) =
          -- First failing character in this chunk marks the start of
          -- the suffix; a chunk with no failing character is dropped
          -- entirely and the search continues in the tail.
          case T.findIndex (not . p) t of
            Just n  -> Chunk (T.drop n t) ts
            Nothing -> dropWhile' ts
{-# INLINE [1] dropWhile #-}

-- Fuse in early phases ([~1]); undo in phase 1 if fusion failed.
{-# RULES
"LAZY TEXT dropWhile -> fused" [~1] forall p t.
    dropWhile p t = unstream (S.dropWhile p (stream t))
"LAZY TEXT dropWhile -> unfused" [1] forall p t.
    unstream (S.dropWhile p (stream t)) = dropWhile p t
  #-}
-- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
-- dropping characters that fail the predicate @p@ from the end of
-- @t@.
--
-- Examples:
--
-- > dropWhileEnd (=='.') "foo..." == "foo"
dropWhileEnd :: (Char -> Bool) -> Text -> Text
dropWhileEnd p = go
  where go Empty = Empty
        -- Last chunk: trim it directly with the strict version.
        go (Chunk t Empty) = if T.null t'
                             then Empty
                             else Chunk t' Empty
            where t' = T.dropWhileEnd p t
        -- Interior chunk: if the entire tail was dropped, this chunk
        -- becomes the last one and must itself be trimmed.
        go (Chunk t ts) = case go ts of
                            Empty -> go (Chunk t Empty)
                            ts' -> Chunk t ts'
{-# INLINE dropWhileEnd #-}
1105
-- | /O(n)/ 'dropAround' @p@ @t@ strips characters failing the
-- predicate @p@ from both the beginning and the end of @t@.
-- Subject to fusion.
dropAround :: (Char -> Bool) -> Text -> Text
dropAround p t = dropWhile p (dropWhileEnd p t)
{-# INLINE [1] dropAround #-}
1112
-- | /O(n)/ Strip leading white space from a string.  Equivalent to:
--
-- > dropWhile isSpace
stripStart :: Text -> Text
stripStart t = dropWhile isSpace t
{-# INLINE [1] stripStart #-}
1119
-- | /O(n)/ Strip trailing white space from a string.  Equivalent to:
--
-- > dropWhileEnd isSpace
stripEnd :: Text -> Text
stripEnd t = dropWhileEnd isSpace t
{-# INLINE [1] stripEnd #-}
1126
-- | /O(n)/ Strip white space from both ends of a string.
-- Equivalent to:
--
-- > dropAround isSpace
strip :: Text -> Text
strip t = dropAround isSpace t
{-# INLINE [1] strip #-}
1134
-- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
-- prefix of @t@ of length @n@, and whose second is the remainder of
-- the string. It is equivalent to @('take' n t, 'drop' n t)@.
splitAt :: Int64 -> Text -> (Text, Text)
splitAt = loop
  where loop _ Empty      = (empty, empty)
        loop n t | n <= 0 = (empty, t)
        -- Whole chunks go into the prefix (shared, not copied) until
        -- the count falls inside a chunk, which is split once.
        loop n (Chunk t ts)
             | n < len   = let (t',t'') = T.splitAt (fromIntegral n) t
                           in (Chunk t' Empty, Chunk t'' ts)
             | otherwise = let (ts',ts'') = loop (n - len) ts
                           in (Chunk t ts', ts'')
             where len = fromIntegral (T.length t)
1148
-- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first
-- element is a prefix of @t@ whose chunks contain @n@ 'Word16'
-- values, and whose second is the remainder of the string.
--
-- Internal helper: counts UTF-16 code units, not characters, so @n@
-- must land on a character boundary (as the indices produced by the
-- search functions do).
splitAtWord :: Int64 -> Text -> PairS Text Text
splitAtWord _ Empty = empty :*: empty
splitAtWord x (Chunk c@(T.Text arr off len) cs)
    | y >= len  = let h :*: t = splitAtWord (x-fromIntegral len) cs
                  in  Chunk c h :*: t
    -- Split inside this chunk by slicing the shared array in two.
    | otherwise = chunk (text arr off y) empty :*:
                  chunk (text arr (off+y) (len-y)) cs
    -- NOTE(review): y narrows Int64 to the strict chunk's Int index;
    -- safe here because the branch above guarantees x < len.
    where y = fromIntegral x
1160
-- | /O(n+m)/ Find the first instance of @needle@ (which must be
-- non-'null') in @haystack@.  The first element of the returned tuple
-- is the prefix of @haystack@ before @needle@ is matched.  The second
-- is the remainder of @haystack@, starting with the match.
--
-- Examples:
--
-- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
-- > breakOn "/" "foobar"   ==> ("foobar", "")
--
-- Laws:
--
-- > append prefix match == haystack
-- >   where (prefix, match) = breakOn needle haystack
--
-- If you need to break a string by a substring repeatedly (e.g. you
-- want to break on every instance of a substring), use 'breakOnAll'
-- instead, as it has lower startup overhead.
--
-- This function is strict in its first argument, and lazy in its
-- second.
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
breakOn :: Text -> Text -> (Text, Text)
breakOn pat src
    | null pat  = emptyError "breakOn"
    -- 'indices' yields match positions in Word16 units, which is what
    -- 'splitAtWord' expects; only the first match is forced.
    | otherwise = case indices pat src of
                    []    -> (src, empty)
                    (x:_) -> let h :*: t = splitAtWord x src
                             in  (h, t)
1192
-- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
-- string.
--
-- The first element of the returned tuple is the prefix of @haystack@
-- up to and including the last match of @needle@.  The second is the
-- remainder of @haystack@, following the match.
--
-- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
breakOnEnd :: Text -> Text -> (Text, Text)
-- Reduce to 'breakOn' by reversing both strings, then reverse (and
-- swap) the halves of the result.
breakOnEnd pat src = (reverse after, reverse before)
  where (before, after) = breakOn (reverse pat) (reverse src)
{-# INLINE breakOnEnd #-}
1204
-- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
-- @haystack@.  Each element of the returned list consists of a pair:
--
-- * The entire string prior to the /k/th match (i.e. the prefix)
--
-- * The /k/th match, followed by the remainder of the string
--
-- Examples:
--
-- > breakOnAll "::" ""
-- > ==> []
-- > breakOnAll "/" "a/b/c/"
-- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
--
-- This function is strict in its first argument, and lazy in its
-- second.
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
--
-- The @needle@ parameter may not be empty.
breakOnAll :: Text              -- ^ @needle@ to search for
           -> Text              -- ^ @haystack@ in which to search
           -> [(Text, Text)]
breakOnAll pat src
    | null pat  = emptyError "breakOnAll"
    | otherwise = go 0 empty src (indices pat src)
  where
    -- n: Word16 offset already consumed; p: prefix built so far;
    -- s: unconsumed suffix.  Each step splits s at the next match
    -- (offsets from 'indices' are absolute, hence x-n) and extends
    -- the running prefix.
    go !n p s (x:xs) = let h :*: t = splitAtWord (x-n) s
                           h'      = append p h
                       in (h',t) : go x h' t xs
    go _  _ _ _      = []
1237
-- | /O(n)/ 'break' is like 'span', but the prefix returned is over
-- elements that fail the predicate @p@.
break :: (Char -> Bool) -> Text -> (Text, Text)
break p t0 = break' t0
  where break' Empty          = (empty, empty)
        break' c@(Chunk t ts) =
          -- First satisfying character in this chunk marks the split
          -- point; chunks with no such character go wholly into the
          -- prefix and the search continues lazily in the tail.
          case T.findIndex p t of
            Nothing      -> let (ts', ts'') = break' ts
                            in (Chunk t ts', ts'')
            Just n | n == 0    -> (Empty, c)
                   | otherwise -> let (a,b) = T.splitAt n t
                                  in (Chunk a Empty, Chunk b ts)
1250
-- | /O(n)/ 'span' @p@ @t@ splits @t@ into a pair: the longest prefix
-- (possibly empty) whose characters all satisfy @p@, and the rest of
-- the text.
span :: (Char -> Bool) -> Text -> (Text, Text)
-- Dual of 'break': simply negate the predicate.
span p = break (\c -> not (p c))
{-# INLINE span #-}
1258
-- | Partition a 'Text' into runs of equal characters, such that the
-- concatenation of the runs gives back the original text.  For
-- example,
--
-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
--
-- A special case of 'groupBy' with @('==')@ as the equality test.
group :: Text -> [Text]
group t = groupBy (==) t
{-# INLINE group #-}
1271
-- | The 'groupBy' function is the non-overloaded version of 'group'.
groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
groupBy _  Empty        = []
-- unsafeHead/unsafeTail are safe here: they rely on the invariant
-- that every 'Chunk' carries a non-empty strict 'T.Text'.
groupBy eq (Chunk t ts) = cons x ys : groupBy eq zs
                          where (ys,zs) = span (eq x) xs
                                x  = T.unsafeHead t
                                xs = chunk (T.unsafeTail t) ts
1279
-- | /O(n)/ Return all initial segments of the given 'Text',
-- shortest first.
inits :: Text -> [Text]
inits = (Empty :) . inits'
        -- Prefixes that end inside the head chunk come first
        -- (L.tail skips the empty prefix, emitted once above); the
        -- remaining prefixes all contain the whole head chunk.
  where inits' Empty        = []
        inits' (Chunk t ts) = L.map (\t' -> Chunk t' Empty) (L.tail (T.inits t))
                           ++ L.map (Chunk t) (inits' ts)
1287
-- | /O(n)/ Return all final segments of the given 'Text', longest
-- first.
tails :: Text -> [Text]
tails Empty = Empty : []
-- Peel one character at a time; when only one character remains in
-- the head chunk, advance to the tail chunks to keep the non-empty
-- chunk invariant.
tails ts@(Chunk t ts')
  | T.length t == 1 = ts : tails ts'
  | otherwise       = ts : tails (Chunk (T.unsafeTail t) ts')
1295
1296 -- $split
1297 --
1298 -- Splitting functions in this library do not perform character-wise
1299 -- copies to create substrings; they just construct new 'Text's that
1300 -- are slices of the original.
1301
-- | /O(m+n)/ Break a 'Text' into pieces separated by the first
-- 'Text' argument, consuming the delimiter. An empty delimiter is
-- invalid, and will cause an error to be raised.
--
-- Examples:
--
-- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
-- > splitOn "aaa"  "aaaXaaaXaaaXaaa"  == ["","X","X","X",""]
-- > splitOn "x"    "x"                == ["",""]
--
-- and
--
-- > intercalate s . splitOn s         == id
-- > splitOn (singleton c)             == split (==c)
--
-- This function is strict in its first argument, and lazy in its
-- second.
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
splitOn :: Text                 -- ^ Text to split on
        -> Text                 -- ^ Input text
        -> [Text]
splitOn pat src
    | null pat        = emptyError "splitOn"
    | isSingleton pat = split (== head pat) src
    | otherwise       = go 0 (indices pat src) src
  where
    -- i: Word16 offset already consumed.  Match offsets from
    -- 'indices' are absolute, hence the x-i adjustment; after each
    -- piece, l code units (the pattern's length) are dropped.
    go  _ []     cs = [cs]
    go !i (x:xs) cs = let h :*: t = splitAtWord (x-i) cs
                      in  h : go (x+l) xs (dropWords l t)
    -- Pattern length in Word16 code units, summed across its chunks.
    l = foldlChunks (\a (T.Text _ _ b) -> a + fromIntegral b) 0 pat
{-# INLINE [1] splitOn #-}

-- Rewrite splitting on a singleton to the cheaper predicate-based
-- 'split' (early phases only).
{-# RULES
"LAZY TEXT splitOn/singleton -> split/==" [~1] forall c t.
    splitOn (singleton c) t = split (==c) t
  #-}
1340
-- | /O(n)/ Splits a 'Text' into components delimited by separators,
-- where the predicate returns True for a separator element.  The
-- resulting components do not contain the separators.  Two adjacent
-- separators result in an empty component in the output.  eg.
--
-- > split (=='a') "aabbaca" == ["","","bb","c",""]
-- > split (=='a') []        == [""]
split :: (Char -> Bool) -> Text -> [Text]
split _ Empty = [Empty]
split p (Chunk t0 ts0) = comb [] (T.split p t0) ts0
        -- acc accumulates strict pieces (in reverse) of the component
        -- currently being built; the last piece of each chunk may
        -- continue into the next chunk, so it is carried over rather
        -- than emitted.  T.split never returns [], so the final
        -- equation is unreachable.
  where comb acc (s:[]) Empty        = revChunks (s:acc) : []
        comb acc (s:[]) (Chunk t ts) = comb (s:acc) (T.split p t) ts
        comb acc (s:ss) ts           = revChunks (s:acc) : comb [] ss ts
        comb _   []     _            = impossibleError "split"
{-# INLINE split #-}
1356
-- | /O(n)/ Splits a 'Text' into pieces of @k@ characters each; the
-- final piece may be shorter than the others, depending on the
-- length of the input.  Examples:
--
-- > chunksOf 3 "foobarbaz"   == ["foo","bar","baz"]
-- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
chunksOf :: Int64 -> Text -> [Text]
chunksOf k = go
  where
    -- An empty piece means the input is exhausted.
    go t | null piece = []
         | otherwise  = piece : go rest
      where (piece, rest) = splitAt k t
{-# INLINE chunksOf #-}
1370
-- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
-- newline 'Char's. The resulting strings do not contain newlines.
lines :: Text -> [Text]
lines Empty = []
-- Each line is produced before the rest of the input is examined, so
-- this streams over unbounded input.  'tail' skips the newline that
-- 'break' left at the front of t'.
lines t = let (l,t') = break ((==) '\n') t
          in l : if null t' then []
                 else lines (tail t')
1378
-- | /O(n)/ Breaks a 'Text' into a list of words, using white-space
-- characters as delimiters and discarding the empty fragments that
-- adjacent delimiters would otherwise produce.
words :: Text -> [Text]
words t = L.filter (\w -> not (null w)) (split isSpace t)
{-# INLINE words #-}
1384
-- | /O(n)/ Joins lines into a single 'Text', appending a terminating
-- newline to each line first.
unlines :: [Text] -> Text
unlines ls = concat (L.map (\l -> l `snoc` '\n') ls)
{-# INLINE unlines #-}
1390
-- | /O(n)/ Joins words into a single 'Text', separated by single
-- space characters.
unwords :: [Text] -> Text
unwords ws = intercalate (singleton ' ') ws
{-# INLINE unwords #-}
1395
-- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
-- 'True' iff the first is a prefix of the second. Subject to fusion.
isPrefixOf :: Text -> Text -> Bool
isPrefixOf Empty _ = True
isPrefixOf _ Empty = False
-- The two chunk lists need not align, so compare the shorter head
-- chunk against an equal-length slice of the longer one, pushing the
-- leftover slice back for the next round.
isPrefixOf (Chunk x xs) (Chunk y ys)
    | lx == ly  = x == y  && isPrefixOf xs ys
    | lx <  ly  = x == yh && isPrefixOf xs (Chunk yt ys)
    | otherwise = xh == y && isPrefixOf (Chunk xt xs) ys
  where (xh,xt) = T.splitAt ly x
        (yh,yt) = T.splitAt lx y
        lx = T.length x
        ly = T.length y
{-# INLINE [1] isPrefixOf #-}

-- Fuse in early phases ([~1]); undo in phase 1 if fusion failed.
{-# RULES
"LAZY TEXT isPrefixOf -> fused" [~1] forall s t.
    isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
"LAZY TEXT isPrefixOf -> unfused" [1] forall s t.
    S.isPrefixOf (stream s) (stream t) = isPrefixOf s t
  #-}
1417
-- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
-- 'True' iff the first is a suffix of the second.
isSuffixOf :: Text -> Text -> Bool
-- Reduce to a prefix check on the reversed strings.
isSuffixOf x y = isPrefixOf (reverse x) (reverse y)
{-# INLINE isSuffixOf #-}
-- TODO: a better implementation
1424
-- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
-- 'True' iff the first is contained, wholly and intact, anywhere
-- within the second.
--
-- This function is strict in its first argument, and lazy in its
-- second.
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
isInfixOf :: Text -> Text -> Bool
isInfixOf needle haystack
    | null needle        = True
    -- Single character: a fused membership test is cheaper than a
    -- full substring search.
    | isSingleton needle = S.elem (head needle) . S.stream $ haystack
    | otherwise          = not . L.null . indices needle $ haystack
{-# INLINE [1] isInfixOf #-}

-- Rewrite single-character containment to the fused membership test
-- (early phases only).
{-# RULES
"LAZY TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
    isInfixOf (singleton n) h = S.elem n (S.stream h)
  #-}
1445
1446 -------------------------------------------------------------------------------
1447 -- * View patterns
1448
-- | /O(n)/ Return the suffix of the second string if its prefix
-- matches the entire first string.
--
-- Examples:
--
-- > stripPrefix "foo" "foobar" == Just "bar"
-- > stripPrefix ""    "baz"    == Just "baz"
-- > stripPrefix "foo" "quux"   == Nothing
--
-- This is particularly useful with the @ViewPatterns@ extension to
-- GHC, as follows:
--
-- > {-# LANGUAGE ViewPatterns #-}
-- > import Data.Text.Lazy as T
-- >
-- > fnordLength :: Text -> Int
-- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
-- > fnordLength _                                 = -1
stripPrefix :: Text -> Text -> Maybe Text
stripPrefix p t
    | null p    = Just t
    -- The prefix matched completely iff 'commonPrefixes' consumed all
    -- of p, i.e. its leftover (c) is empty; r is then the suffix.
    | otherwise = case commonPrefixes p t of
                    Just (_,c,r) | null c -> Just r
                    _                     -> Nothing
1473
-- | /O(n)/ Find the longest non-empty common prefix of two strings
-- and return it, along with the suffixes of each string at which they
-- no longer match.
--
-- If the strings do not have a common prefix or either one is empty,
-- this function returns 'Nothing'.
--
-- Examples:
--
-- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
-- > commonPrefixes "veeble" "fetzer"  == Nothing
-- > commonPrefixes ""       "baz"     == Nothing
commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
commonPrefixes Empty _ = Nothing
commonPrefixes _ Empty = Nothing
commonPrefixes a0 b0   = Just (go a0 b0 [])
  where
    -- ps accumulates matched strict pieces in reverse.  When one
    -- chunk is wholly consumed (its leftover is null), push the other
    -- chunk's leftover back and keep matching across the boundary;
    -- when both sides have leftovers, the mismatch point is reached.
    go t0@(Chunk x xs) t1@(Chunk y ys) ps
        = case T.commonPrefixes x y of
            Just (p,a,b)
              | T.null a  -> go xs (chunk b ys) (p:ps)
              | T.null b  -> go (chunk a xs) ys (p:ps)
              | otherwise -> (fromChunks (L.reverse (p:ps)),chunk a xs, chunk b ys)
            Nothing       -> (fromChunks (L.reverse ps),t0,t1)
    go t0 t1 ps = (fromChunks (L.reverse ps),t0,t1)
1499
-- | /O(n)/ If the entire first string is a suffix of the second,
-- return the second string's remaining prefix.
--
-- Examples:
--
-- > stripSuffix "bar" "foobar" == Just "foo"
-- > stripSuffix ""    "baz"    == Just "baz"
-- > stripSuffix "foo" "quux"   == Nothing
--
-- Combined with the @ViewPatterns@ GHC extension, this makes a handy
-- pattern guard:
--
-- > {-# LANGUAGE ViewPatterns #-}
-- > import Data.Text.Lazy as T
-- >
-- > quuxLength :: Text -> Int
-- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
-- > quuxLength _                                = -1
stripSuffix :: Text -> Text -> Maybe Text
-- Reduce to 'stripPrefix' on the reversed strings, undoing the
-- reversal on any surviving prefix.
stripSuffix p t = fmap reverse (stripPrefix (reverse p) (reverse t))
1520
-- | /O(n)/ 'filter', applied to a predicate and a 'Text', returns a
-- 'Text' built from exactly those characters that satisfy the
-- predicate.
filter :: (Char -> Bool) -> Text -> Text
filter p = unstream . S.filter p . stream
{-# INLINE filter #-}
1527
-- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
-- returns 'Just' the first character satisfying the predicate, or
-- 'Nothing' if no character does.
find :: (Char -> Bool) -> Text -> Maybe Char
find p = S.findBy p . stream
{-# INLINE find #-}
1534
-- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
-- and returns the pair of 'Text's holding the characters that do and
-- do not satisfy the predicate, respectively; i.e.
--
-- > partition p t == (filter p t, filter (not . p) t)
partition :: (Char -> Bool) -> Text -> (Text, Text)
partition p t = (yes, no)
  where yes = filter p t
        no  = filter (\c -> not (p c)) t
{-# INLINE partition #-}
1543
-- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
index :: Text -> Int64 -> Char
index t = S.index (stream t)
{-# INLINE index #-}
1548
-- | /O(n+m)/ The 'count' function returns the number of times the
-- query string appears in the given 'Text'. An empty query string is
-- invalid, and will cause an error to be raised.
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
count :: Text -> Text -> Int64
count pat src
    | null pat  = emptyError "count"
    | otherwise = go 0 (indices pat src)
        -- Strict length count over the lazily-produced match list,
        -- so only one match position is alive at a time.
  where go !n []     = n
        go !n (_:xs) = go (n+1) xs
{-# INLINE [1] count #-}

-- Rewrite single-character counting to the fused 'countChar'
-- (early phases only).
{-# RULES
"LAZY TEXT count/singleton -> countChar" [~1] forall c t.
    count (singleton c) t = countChar c t
  #-}
1567
-- | /O(n)/ The 'countChar' function returns how many times the query
-- character occurs in the given 'Text'. Subject to fusion.
countChar :: Char -> Text -> Int64
countChar c = S.countChar c . stream
1572
-- | /O(n)/ 'zip' takes two 'Text's and returns a list of pairs of
-- corresponding characters.  If one input is shorter, the excess
-- characters of the longer one are discarded.  Equivalent to zipping
-- the results of two 'unpack' operations.
zip :: Text -> Text -> [(Char,Char)]
zip a b = S.unstreamList (S.zipWith (,) (stream a) (stream b))
{-# INLINE [0] zip #-}
1580
-- | /O(n)/ 'zipWith' generalises 'zip' by combining corresponding
-- characters with the supplied function instead of tupling them.
-- Performs replacement on invalid scalar values.
zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
-- Each combined character passes through 'safe' so the result never
-- contains an invalid scalar value.
zipWith f t1 t2 =
    unstream (S.zipWith (\a b -> safe (f a b)) (stream t1) (stream t2))
{-# INLINE [0] zipWith #-}
1588
-- | Assemble a lazy 'Text' from a list of strict chunks given in
-- reverse order, dropping any empty chunks via 'chunk'.
revChunks :: [T.Text] -> Text
revChunks = L.foldl' (\acc t -> chunk t acc) Empty
1591
-- | Raise an error reporting that the named function was given empty
-- input.
emptyError :: String -> a
emptyError fun = P.error msg
  where msg = "Data.Text.Lazy." ++ fun ++ ": empty input"
1594
-- | Raise an error for a case the named function should never reach.
impossibleError :: String -> a
impossibleError fun = P.error msg
  where msg = "Data.Text.Lazy." ++ fun ++ ": impossible case"