1 {-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
2 {-# OPTIONS_GHC -fno-warn-orphans #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text
12 -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
13 -- (c) 2009 Duncan Coutts,
14 -- (c) 2008, 2009 Tom Harper
15 --
16 -- License : BSD-style
17 -- Maintainer : bos@serpentine.com
18 -- Stability : experimental
19 -- Portability : GHC
20 --
21 -- A time and space-efficient implementation of Unicode text.
22 -- Suitable for performance critical use, both in terms of large data
23 -- quantities and high speed.
24 --
25 -- /Note/: Read below the synopsis for important notes on the use of
26 -- this module.
27 --
28 -- This module is intended to be imported @qualified@, to avoid name
29 -- clashes with "Prelude" functions, e.g.
30 --
31 -- > import qualified Data.Text as T
32 --
33 -- To use an extended and very rich family of functions for working
34 -- with Unicode text (including normalization, regular expressions,
35 -- non-standard encodings, text breaking, and locales), see
36 -- <http://hackage.haskell.org/package/text-icu the text-icu package >.
37
38 module Data.Text
39 (
40 -- * Strict vs lazy types
41 -- $strict
42
43 -- * Acceptable data
44 -- $replacement
45
46 -- * Fusion
47 -- $fusion
48
49 -- * Types
50 Text
51
52 -- * Creation and elimination
53 , pack
54 , unpack
55 , singleton
56 , empty
57
58 -- * Basic interface
59 , cons
60 , snoc
61 , append
62 , uncons
63 , head
64 , last
65 , tail
66 , init
67 , null
68 , length
69 , compareLength
70
71 -- * Transformations
72 , map
73 , intercalate
74 , intersperse
75 , transpose
76 , reverse
77 , replace
78
79 -- ** Case conversion
80 -- $case
81 , toCaseFold
82 , toLower
83 , toUpper
84 , toTitle
85
86 -- ** Justification
87 , justifyLeft
88 , justifyRight
89 , center
90
91 -- * Folds
92 , foldl
93 , foldl'
94 , foldl1
95 , foldl1'
96 , foldr
97 , foldr1
98
99 -- ** Special folds
100 , concat
101 , concatMap
102 , any
103 , all
104 , maximum
105 , minimum
106
107 -- * Construction
108
109 -- ** Scans
110 , scanl
111 , scanl1
112 , scanr
113 , scanr1
114
115 -- ** Accumulating maps
116 , mapAccumL
117 , mapAccumR
118
119 -- ** Generation and unfolding
120 , replicate
121 , unfoldr
122 , unfoldrN
123
124 -- * Substrings
125
126 -- ** Breaking strings
127 , take
128 , takeEnd
129 , drop
130 , dropEnd
131 , takeWhile
132 , dropWhile
133 , dropWhileEnd
134 , dropAround
135 , strip
136 , stripStart
137 , stripEnd
138 , splitAt
139 , breakOn
140 , breakOnEnd
141 , break
142 , span
143 , group
144 , groupBy
145 , inits
146 , tails
147
148 -- ** Breaking into many substrings
149 -- $split
150 , splitOn
151 , split
152 , chunksOf
153
154 -- ** Breaking into lines and words
155 , lines
156 --, lines'
157 , words
158 , unlines
159 , unwords
160
161 -- * Predicates
162 , isPrefixOf
163 , isSuffixOf
164 , isInfixOf
165
166 -- ** View patterns
167 , stripPrefix
168 , stripSuffix
169 , commonPrefixes
170
171 -- * Searching
172 , filter
173 , breakOnAll
174 , find
175 , partition
176
177 -- , findSubstring
178
179 -- * Indexing
180 -- $index
181 , index
182 , findIndex
183 , count
184
185 -- * Zipping
186 , zip
187 , zipWith
188
189 -- -* Ordered text
190 -- , sort
191
192 -- * Low level operations
193 , copy
194 ) where
195
196 import Prelude (Char, Bool(..), Int, Maybe(..), String,
197 Eq(..), Ord(..), Ordering(..), (++),
198 Read(..), Show(..),
199 (&&), (||), (+), (-), (.), ($), ($!), (>>),
200 not, return, otherwise, quot)
201 #if defined(HAVE_DEEPSEQ)
202 import Control.DeepSeq (NFData(rnf))
203 #endif
204 #if defined(ASSERTS)
205 import Control.Exception (assert)
206 #endif
207 import Data.Char (isSpace)
208 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
209 Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
210 import Control.Monad (foldM)
211 import Control.Monad.ST (ST)
212 import qualified Data.Text.Array as A
213 import qualified Data.List as L
214 import Data.Monoid (Monoid(..))
215 import Data.String (IsString(..))
216 import qualified Data.Text.Internal.Fusion as S
217 import qualified Data.Text.Internal.Fusion.Common as S
218 import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
219 import Data.Text.Internal.Private (span_)
220 import Data.Text.Internal (Text(..), empty, empty_, firstf, mul, safe, text)
221 import qualified Prelude as P
222 import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
223 reverseIter_, unsafeHead, unsafeTail)
224 import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeWrite)
225 import qualified Data.Text.Internal.Functions as F
226 import qualified Data.Text.Internal.Encoding.Utf16 as U16
227 import Data.Text.Internal.Search (indices)
228 #if defined(__HADDOCK__)
229 import Data.ByteString (ByteString)
230 import qualified Data.Text.Lazy as L
231 import Data.Int (Int64)
232 #endif
233 #if __GLASGOW_HASKELL__ >= 702
234 import qualified GHC.CString as GHC
235 #else
236 import qualified GHC.Base as GHC
237 #endif
238 #if __GLASGOW_HASKELL__ >= 708
239 import qualified GHC.Exts as Exts
240 #endif
241 import GHC.Prim (Addr#)
242
243 -- $strict
244 --
245 -- This package provides both strict and lazy 'Text' types. The
246 -- strict type is provided by the "Data.Text" module, while the lazy
247 -- type is provided by the "Data.Text.Lazy" module. Internally, the
248 -- lazy @Text@ type consists of a list of strict chunks.
249 --
250 -- The strict 'Text' type requires that an entire string fit into
251 -- memory at once. The lazy 'Data.Text.Lazy.Text' type is capable of
252 -- streaming strings that are larger than memory using a small memory
253 -- footprint. In many cases, the overhead of chunked streaming makes
254 -- the lazy 'Data.Text.Lazy.Text' type slower than its strict
255 -- counterpart, but this is not always the case. Sometimes, the time
256 -- complexity of a function in one module may be different from the
257 -- other, due to their differing internal structures.
258 --
259 -- Each module provides an almost identical API, with the main
260 -- difference being that the strict module uses 'Int' values for
261 -- lengths and counts, while the lazy module uses 'Data.Int.Int64'
262 -- lengths and counts.
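--
-- A minimal sketch of using both flavours side by side (the
-- 'Data.Text.Lazy.fromStrict' conversion assumed here is available in
-- recent versions of this package):
--
-- > import qualified Data.Text      as T
-- > import qualified Data.Text.Lazy as TL
-- >
-- > toLazy :: T.Text -> TL.Text
-- > toLazy = TL.fromStrict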
263
264 -- $replacement
265 --
266 -- A 'Text' value is a sequence of Unicode scalar values, as defined
267 -- in
268 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
269 -- As such, a 'Text' cannot contain values in the range U+D800 to
270 -- U+DFFF inclusive. Haskell implementations admit all Unicode code
271 -- points
272 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
273 -- as 'Char' values, including code points from this invalid range.
274 -- This means that there are some 'Char' values that are not valid
275 -- Unicode scalar values, and the functions in this module must handle
276 -- those cases.
277 --
278 -- Within this module, many functions construct a 'Text' from one or
279 -- more 'Char' values. Those functions will substitute 'Char' values
280 -- that are not valid Unicode scalar values with the replacement
281 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
282 -- inspection and replacement are documented with the phrase
283 -- \"Performs replacement on invalid scalar values\".
284 --
285 -- (One reason for this policy of replacement is that internally, a
286 -- 'Text' value is represented as packed UTF-16 data. Values in the
287 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
288 -- code points, and so cannot be represented. The functions replace
289 -- invalid scalar values, instead of dropping them, as a security
290 -- measure. For details, see
291 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
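--
-- As an illustration of this policy, a lone surrogate passed to
-- 'singleton' comes back as the replacement character:
--
-- > singleton '\xD800' == singleton '\xFFFD'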
292
293 -- $fusion
294 --
295 -- Most of the functions in this module are subject to /fusion/,
296 -- meaning that a pipeline of such functions will usually allocate at
297 -- most one 'Text' value.
298 --
299 -- As an example, consider the following pipeline:
300 --
301 -- > import Data.Text as T
302 -- > import Data.Text.Encoding as E
303 -- > import Data.ByteString (ByteString)
304 -- >
305 -- > countChars :: ByteString -> Int
306 -- > countChars = T.length . T.toUpper . E.decodeUtf8
307 --
308 -- From the type signatures involved, this looks like it should
309 -- allocate one 'Data.ByteString.ByteString' value, and two 'Text'
310 -- values. However, when a module is compiled with optimisation
311 -- enabled under GHC, the two intermediate 'Text' values will be
312 -- optimised away, and the function will be compiled down to a single
313 -- loop over the source 'Data.ByteString.ByteString'.
314 --
315 -- Functions that can be fused by the compiler are documented with the
316 -- phrase \"Subject to fusion\".
317
318 instance Eq Text where
319 Text arrA offA lenA == Text arrB offB lenB
320 | lenA == lenB = A.equal arrA offA arrB offB lenA
321 | otherwise = False
322 {-# INLINE (==) #-}
323
324 instance Ord Text where
325 compare = compareText
326
327 instance Show Text where
328 showsPrec p ps r = showsPrec p (unpack ps) r
329
330 instance Read Text where
331 readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
332
333 instance Monoid Text where
334 mempty = empty
335 mappend = append
336 mconcat = concat
337
338 instance IsString Text where
339 fromString = pack
340
341 #if __GLASGOW_HASKELL__ >= 708
342 instance Exts.IsList Text where
343 type Item Text = Char
344 fromList = pack
345 toList = unpack
346 #endif
347
348 #if defined(HAVE_DEEPSEQ)
349 instance NFData Text where rnf !_ = ()
350 #endif
351
352 -- | This instance preserves data abstraction at the cost of inefficiency.
353 -- We omit reflection services for the sake of data abstraction.
354 --
355 -- This instance was created by copying the updated behavior of
356 -- @"Data.Set".@'Data.Set.Set' and @"Data.Map".@'Data.Map.Map'. If you
357 -- feel a mistake has been made, please feel free to submit
358 -- improvements.
359 --
360 -- The original discussion is archived here:
361 -- <http://groups.google.com/group/haskell-cafe/browse_thread/thread/b5bbb1b28a7e525d/0639d46852575b93 could we get a Data instance for Data.Text.Text? >
362 --
363 -- The followup discussion that changed the behavior of 'Data.Set.Set'
364 -- and 'Data.Map.Map' is archived here:
365 -- <http://markmail.org/message/trovdc6zkphyi3cr#query:+page:1+mid:a46der3iacwjcf6n+state:results Proposal: Allow gunfold for Data.Map, ... >
366
367 instance Data Text where
368 gfoldl f z txt = z pack `f` (unpack txt)
369 toConstr _ = packConstr
370 gunfold k z c = case constrIndex c of
371 1 -> k (z pack)
372 _ -> P.error "gunfold"
373 dataTypeOf _ = textDataType
374
375 packConstr :: Constr
376 packConstr = mkConstr textDataType "pack" [] Prefix
377
378 textDataType :: DataType
379 textDataType = mkDataType "Data.Text.Text" [packConstr]
380
381 -- | /O(n)/ Compare two 'Text' values lexicographically.
382 compareText :: Text -> Text -> Ordering
383 compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB)
384 | lenA == 0 && lenB == 0 = EQ
385 | otherwise = go 0 0
386 where
387 go !i !j
388 | i >= lenA || j >= lenB = compare lenA lenB
389 | a < b = LT
390 | a > b = GT
391 | otherwise = go (i+di) (j+dj)
392 where Iter a di = iter ta i
393 Iter b dj = iter tb j
394
395 -- -----------------------------------------------------------------------------
396 -- * Conversion to/from 'Text'
397
398 -- | /O(n)/ Convert a 'String' into a 'Text'. Subject to
399 -- fusion. Performs replacement on invalid scalar values.
400 pack :: String -> Text
401 pack = unstream . S.map safe . S.streamList
402 {-# INLINE [1] pack #-}
403
404 -- | /O(n)/ Convert a 'Text' into a 'String'. Subject to fusion.
405 unpack :: Text -> String
406 unpack = S.unstreamList . stream
407 {-# INLINE [1] unpack #-}
408
409 -- | /O(n)/ Convert a literal string into a Text. Subject to fusion.
410 unpackCString# :: Addr# -> Text
411 unpackCString# addr# = unstream (S.streamCString# addr#)
412 {-# NOINLINE unpackCString# #-}
413
414 {-# RULES "TEXT literal" forall a.
415 unstream (S.map safe (S.streamList (GHC.unpackCString# a)))
416 = unpackCString# a #-}
417
418 {-# RULES "TEXT literal UTF8" forall a.
419 unstream (S.map safe (S.streamList (GHC.unpackCStringUtf8# a)))
420 = unpackCString# a #-}
421
422 {-# RULES "TEXT empty literal"
423 unstream (S.map safe (S.streamList []))
424 = empty_ #-}
425
426 {-# RULES "TEXT singleton literal" forall a.
427 unstream (S.map safe (S.streamList [a]))
428 = singleton_ a #-}
429
430 -- | /O(1)/ Convert a character into a Text. Subject to fusion.
431 -- Performs replacement on invalid scalar values.
432 singleton :: Char -> Text
433 singleton = unstream . S.singleton . safe
434 {-# INLINE [1] singleton #-}
435
436 {-# RULES "TEXT singleton" forall a.
437 unstream (S.singleton (safe a))
438 = singleton_ a #-}
439
440 -- This is intended to reduce inlining bloat.
441 singleton_ :: Char -> Text
442 singleton_ c = Text (A.run x) 0 len
443 where x :: ST s (A.MArray s)
444 x = do arr <- A.new len
445 _ <- unsafeWrite arr 0 d
446 return arr
447 len | d < '\x10000' = 1
448 | otherwise = 2
449 d = safe c
450 {-# NOINLINE singleton_ #-}
451
452 -- -----------------------------------------------------------------------------
453 -- * Basic functions
454
455 -- | /O(n)/ Adds a character to the front of a 'Text'. This function
456 -- is more costly than its 'List' counterpart because it requires
457 -- copying the entire array. Subject to fusion. Performs replacement on
458 -- invalid scalar values.
459 cons :: Char -> Text -> Text
460 cons c t = unstream (S.cons (safe c) (stream t))
461 {-# INLINE cons #-}
462
463 infixr 5 `cons`
464
465 -- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
466 -- entire array in the process, unless fused. Subject to fusion.
467 -- Performs replacement on invalid scalar values.
468 snoc :: Text -> Char -> Text
469 snoc t c = unstream (S.snoc (stream t) (safe c))
470 {-# INLINE snoc #-}
471
472 -- | /O(n)/ Appends one 'Text' to the other by copying both of them
473 -- into a new 'Text'. Subject to fusion.
474 append :: Text -> Text -> Text
475 append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
476 | len1 == 0 = b
477 | len2 == 0 = a
478 | len > 0 = Text (A.run x) 0 len
479 | otherwise = overflowError "append"
480 where
481 len = len1+len2
482 x :: ST s (A.MArray s)
483 x = do
484 arr <- A.new len
485 A.copyI arr 0 arr1 off1 len1
486 A.copyI arr len1 arr2 off2 len
487 return arr
488 {-# NOINLINE append #-}
489
490 {-# RULES
491 "TEXT append -> fused" [~1] forall t1 t2.
492 append t1 t2 = unstream (S.append (stream t1) (stream t2))
493 "TEXT append -> unfused" [1] forall t1 t2.
494 unstream (S.append (stream t1) (stream t2)) = append t1 t2
495 #-}
496
497 -- | /O(1)/ Returns the first character of a 'Text', which must be
498 -- non-empty. Subject to fusion.
499 head :: Text -> Char
500 head t = S.head (stream t)
501 {-# INLINE head #-}
502
503 -- | /O(1)/ Returns the first character and rest of a 'Text', or
504 -- 'Nothing' if empty. Subject to fusion.
505 uncons :: Text -> Maybe (Char, Text)
506 uncons t@(Text arr off len)
507 | len <= 0 = Nothing
508 | otherwise = Just (c, text arr (off+d) (len-d))
509 where Iter c d = iter t 0
510 {-# INLINE [1] uncons #-}
511
512 -- | Lifted from Control.Arrow and specialized.
513 second :: (b -> c) -> (a,b) -> (a,c)
514 second f (a, b) = (a, f b)
515
516 -- | /O(1)/ Returns the last character of a 'Text', which must be
517 -- non-empty. Subject to fusion.
518 last :: Text -> Char
519 last (Text arr off len)
520 | len <= 0 = emptyError "last"
521 | n < 0xDC00 || n > 0xDFFF = unsafeChr n
522 | otherwise = U16.chr2 n0 n
523 where n = A.unsafeIndex arr (off+len-1)
524 n0 = A.unsafeIndex arr (off+len-2)
525 {-# INLINE [1] last #-}
526
527 {-# RULES
528 "TEXT last -> fused" [~1] forall t.
529 last t = S.last (stream t)
530 "TEXT last -> unfused" [1] forall t.
531 S.last (stream t) = last t
532 #-}
533
534 -- | /O(1)/ Returns all characters after the head of a 'Text', which
535 -- must be non-empty. Subject to fusion.
536 tail :: Text -> Text
537 tail t@(Text arr off len)
538 | len <= 0 = emptyError "tail"
539 | otherwise = text arr (off+d) (len-d)
540 where d = iter_ t 0
541 {-# INLINE [1] tail #-}
542
543 {-# RULES
544 "TEXT tail -> fused" [~1] forall t.
545 tail t = unstream (S.tail (stream t))
546 "TEXT tail -> unfused" [1] forall t.
547 unstream (S.tail (stream t)) = tail t
548 #-}
549
550 -- | /O(1)/ Returns all but the last character of a 'Text', which must
551 -- be non-empty. Subject to fusion.
552 init :: Text -> Text
553 init (Text arr off len) | len <= 0 = emptyError "init"
554 | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
555 | otherwise = text arr off (len-1)
556 where
557 n = A.unsafeIndex arr (off+len-1)
558 {-# INLINE [1] init #-}
559
560 {-# RULES
561 "TEXT init -> fused" [~1] forall t.
562 init t = unstream (S.init (stream t))
563 "TEXT init -> unfused" [1] forall t.
564 unstream (S.init (stream t)) = init t
565 #-}
566
567 -- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
568 -- fusion.
569 null :: Text -> Bool
570 null (Text _arr _off len) =
571 #if defined(ASSERTS)
572 assert (len >= 0) $
573 #endif
574 len <= 0
575 {-# INLINE [1] null #-}
576
577 {-# RULES
578 "TEXT null -> fused" [~1] forall t.
579 null t = S.null (stream t)
580 "TEXT null -> unfused" [1] forall t.
581 S.null (stream t) = null t
582 #-}
583
584 -- | /O(1)/ Tests whether a 'Text' contains exactly one character.
585 -- Subject to fusion.
586 isSingleton :: Text -> Bool
587 isSingleton = S.isSingleton . stream
588 {-# INLINE isSingleton #-}
589
590 -- | /O(n)/ Returns the number of characters in a 'Text'.
591 -- Subject to fusion.
592 length :: Text -> Int
593 length t = S.length (stream t)
594 {-# INLINE length #-}
595
596 -- | /O(n)/ Compare the count of characters in a 'Text' to a number.
597 -- Subject to fusion.
598 --
599 -- This function gives the same answer as comparing against the result
600 -- of 'length', but can short circuit if the count of characters is
601 -- greater than the number, and hence be more efficient.
602 compareLength :: Text -> Int -> Ordering
603 compareLength t n = S.compareLengthI (stream t) n
604 {-# INLINE [1] compareLength #-}
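-- A usage sketch: test whether a string has at least three characters
-- without always walking its full length:
--
-- > compareLength (pack "abcdef") 3 == GT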
605
606 {-# RULES
607 "TEXT compareN/length -> compareLength" [~1] forall t n.
608 compare (length t) n = compareLength t n
609 #-}
610
611 {-# RULES
612 "TEXT ==N/length -> compareLength/==EQ" [~1] forall t n.
613 (==) (length t) n = compareLength t n == EQ
614 #-}
615
616 {-# RULES
617 "TEXT /=N/length -> compareLength//=EQ" [~1] forall t n.
618 (/=) (length t) n = compareLength t n /= EQ
619 #-}
620
621 {-# RULES
622 "TEXT <N/length -> compareLength/==LT" [~1] forall t n.
623 (<) (length t) n = compareLength t n == LT
624 #-}
625
626 {-# RULES
627 "TEXT <=N/length -> compareLength//=GT" [~1] forall t n.
628 (<=) (length t) n = compareLength t n /= GT
629 #-}
630
631 {-# RULES
632 "TEXT >N/length -> compareLength/==GT" [~1] forall t n.
633 (>) (length t) n = compareLength t n == GT
634 #-}
635
636 {-# RULES
637 "TEXT >=N/length -> compareLength//=LT" [~1] forall t n.
638 (>=) (length t) n = compareLength t n /= LT
639 #-}
640
641 -- -----------------------------------------------------------------------------
642 -- * Transformations
643 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
644 -- each element of @t@. Subject to fusion. Performs replacement on
645 -- invalid scalar values.
646 map :: (Char -> Char) -> Text -> Text
647 map f t = unstream (S.map (safe . f) (stream t))
648 {-# INLINE [1] map #-}
649
650 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
651 -- 'Text's and concatenates the list after interspersing the first
652 -- argument between each element of the list.
653 intercalate :: Text -> [Text] -> Text
654 intercalate t = concat . (F.intersperse t)
655 {-# INLINE intercalate #-}
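-- For example (string literals shown via 'pack'):
--
-- > intercalate (pack ", ") [pack "one", pack "two", pack "three"]
-- >     == pack "one, two, three"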
656
657 -- | /O(n)/ The 'intersperse' function takes a character and places it
658 -- between the characters of a 'Text'. Subject to fusion. Performs
659 -- replacement on invalid scalar values.
660 intersperse :: Char -> Text -> Text
661 intersperse c t = unstream (S.intersperse (safe c) (stream t))
662 {-# INLINE intersperse #-}
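-- For example:
--
-- > intersperse '.' (pack "USA") == pack "U.S.A"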
663
664 -- | /O(n)/ Reverse the characters of a string. Subject to fusion.
665 reverse :: Text -> Text
666 reverse t = S.reverse (stream t)
667 {-# INLINE reverse #-}
668
669 -- | /O(m+n)/ Replace every occurrence of @needle@ in @haystack@ with @repl@.
670 --
671 -- If @needle@ occurs in @repl@, it won't be replaced twice:
672 --
673 -- > replace "oo" "foo" "oo" == "foo"
674 --
675 -- If several instances of @needle@ overlap, only the first one will be
676 -- replaced:
677 --
678 -- > replace "ofo" "bar" "ofofo" == "barfo"
679 --
680 -- In (unlikely) bad cases, this function's time complexity degrades
681 -- towards /O(n*m)/.
682 replace :: Text -- ^ @needle@ to search for
683 -> Text -- ^ @repl@ to replace @needle@ with
684 -> Text -- ^ @haystack@ in which to search
685 -> Text
686 replace needle@(Text _ _ neeLen)
687 (Text repArr repOff repLen)
688 haystack@(Text hayArr hayOff hayLen)
689 | neeLen == 0 = emptyError "replace"
690 | L.null ixs = haystack
691 | len > 0 = Text (A.run x) 0 len
692 | otherwise = empty
693 where
694 ixs = indices needle haystack
695 len = hayLen - (neeLen - repLen) `mul` L.length ixs
696 x :: ST s (A.MArray s)
697 x = do
698 marr <- A.new len
699 let loop (i:is) o d = do
700 let d0 = d + i - o
701 d1 = d0 + repLen
702 A.copyI marr d hayArr (hayOff+o) d0
703 A.copyI marr d0 repArr repOff d1
704 loop is (i + neeLen) d1
705 loop [] o d = A.copyI marr d hayArr (hayOff+o) len
706 loop ixs 0 0
707 return marr
708
709 -- ----------------------------------------------------------------------------
710 -- ** Case conversions (folds)
711
712 -- $case
713 --
714 -- When case converting 'Text' values, do not use combinators like
715 -- @map toUpper@ to case convert each character of a string
716 -- individually, as this gives incorrect results according to the
717 -- rules of some writing systems. The whole-string case conversion
718 -- functions from this module, such as @toUpper@, obey the correct
719 -- case conversion rules. As a result, these functions may map one
720 -- input character to two or three output characters. For examples,
721 -- see the documentation of each function.
722 --
723 -- /Note/: In some languages, case conversion is a locale- and
724 -- context-dependent operation. The case conversion functions in this
725 -- module are /not/ locale sensitive. Programs that require locale
726 -- sensitivity should use appropriate versions of the
727 -- <http://hackage.haskell.org/package/text-icu-0.6.3.7/docs/Data-Text-ICU.html#g:4 case mapping functions from the text-icu package >.
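--
-- A small sketch of the difference, using the German eszett discussed
-- under 'toUpper' (the per-character 'Data.Char.toUpper' applies a
-- simple, single-character mapping and so leaves it unchanged):
--
-- > import qualified Data.Char as Char
-- >
-- > toUpper (singleton '\xdf')           == pack "SS"
-- > map Char.toUpper (singleton '\xdf')  == singleton '\xdf'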
728
729 -- | /O(n)/ Convert a string to folded case. Subject to fusion.
730 --
731 -- This function is mainly useful for performing caseless (also known
732 -- as case insensitive) string comparisons.
733 --
734 -- A string @x@ is a caseless match for a string @y@ if and only if:
735 --
736 -- @toCaseFold x == toCaseFold y@
737 --
738 -- The result string may be longer than the input string, and may
739 -- differ from applying 'toLower' to the input string. For instance,
740 -- the Armenian small ligature \"&#xfb13;\" (men now, U+FB13) is case
741 -- folded to the sequence \"&#x574;\" (men, U+0574) followed by
742 -- \"&#x576;\" (now, U+0576), while the Greek \"&#xb5;\" (micro sign,
743 -- U+00B5) is case folded to \"&#x3bc;\" (small letter mu, U+03BC)
744 -- instead of itself.
745 toCaseFold :: Text -> Text
746 toCaseFold t = unstream (S.toCaseFold (stream t))
747 {-# INLINE [0] toCaseFold #-}
748
749 -- | /O(n)/ Convert a string to lower case, using simple case
750 -- conversion. Subject to fusion.
751 --
752 -- The result string may be longer than the input string. For
753 -- instance, \"&#x130;\" (Latin capital letter I with dot above,
754 -- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069)
755 -- followed by \" &#x307;\" (combining dot above, U+0307).
756 toLower :: Text -> Text
757 toLower t = unstream (S.toLower (stream t))
758 {-# INLINE toLower #-}
759
760 -- | /O(n)/ Convert a string to upper case, using simple case
761 -- conversion. Subject to fusion.
762 --
763 -- The result string may be longer than the input string. For
764 -- instance, the German \"&#xdf;\" (eszett, U+00DF) maps to the
765 -- two-letter sequence \"SS\".
766 toUpper :: Text -> Text
767 toUpper t = unstream (S.toUpper (stream t))
768 {-# INLINE toUpper #-}
769
770 -- | /O(n)/ Convert a string to title case, using simple case
771 -- conversion. Subject to fusion.
772 --
773 -- The first letter of the input is converted to title case, as is
774 -- every subsequent letter that immediately follows a non-letter.
775 -- Every letter that immediately follows another letter is converted
776 -- to lower case.
777 --
778 -- The result string may be longer than the input string. For example,
779 -- the Latin small ligature &#xfb02; (U+FB02) is converted to the
780 -- sequence Latin capital letter F (U+0046) followed by Latin small
781 -- letter l (U+006C).
782 --
783 -- /Note/: this function does not take language or culture specific
784 -- rules into account. For instance, in English, different style
785 -- guides disagree on whether the book name \"The Hill of the Red
786 -- Fox\" is correctly title cased&#x2014;but this function will
787 -- capitalize /every/ word.
788 toTitle :: Text -> Text
789 toTitle t = unstream (S.toTitle (stream t))
790 {-# INLINE toTitle #-}
791
792 -- | /O(n)/ Left-justify a string to the given length, using the
793 -- specified fill character on the right. Subject to fusion.
794 -- Performs replacement on invalid scalar values.
795 --
796 -- Examples:
797 --
798 -- > justifyLeft 7 'x' "foo" == "fooxxxx"
799 -- > justifyLeft 3 'x' "foobar" == "foobar"
800 justifyLeft :: Int -> Char -> Text -> Text
801 justifyLeft k c t
802 | len >= k = t
803 | otherwise = t `append` replicateChar (k-len) c
804 where len = length t
805 {-# INLINE [1] justifyLeft #-}
806
807 {-# RULES
808 "TEXT justifyLeft -> fused" [~1] forall k c t.
809 justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
810 "TEXT justifyLeft -> unfused" [1] forall k c t.
811 unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
812 #-}
813
814 -- | /O(n)/ Right-justify a string to the given length, using the
815 -- specified fill character on the left. Performs replacement on
816 -- invalid scalar values.
817 --
818 -- Examples:
819 --
820 -- > justifyRight 7 'x' "bar" == "xxxxbar"
821 -- > justifyRight 3 'x' "foobar" == "foobar"
822 justifyRight :: Int -> Char -> Text -> Text
823 justifyRight k c t
824 | len >= k = t
825 | otherwise = replicateChar (k-len) c `append` t
826 where len = length t
827 {-# INLINE justifyRight #-}
828
829 -- | /O(n)/ Center a string to the given length, using the specified
830 -- fill character on either side. Performs replacement on invalid
831 -- scalar values.
832 --
833 -- Examples:
834 --
835 -- > center 8 'x' "HS" == "xxxHSxxx"
836 center :: Int -> Char -> Text -> Text
837 center k c t
838 | len >= k = t
839 | otherwise = replicateChar l c `append` t `append` replicateChar r c
840 where len = length t
841 d = k - len
842 r = d `quot` 2
843 l = d - r
844 {-# INLINE center #-}
845
846 -- | /O(n)/ The 'transpose' function transposes the rows and columns
847 -- of its 'Text' argument. Note that this function uses 'pack',
848 -- 'unpack', and the list version of transpose, and is thus not very
849 -- efficient.
850 transpose :: [Text] -> [Text]
851 transpose ts = P.map pack (L.transpose (P.map unpack ts))
852
853 -- -----------------------------------------------------------------------------
854 -- * Reducing 'Text's (folds)
855
856 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
857 -- (typically the left-identity of the operator), and a 'Text',
858 -- reduces the 'Text' using the binary operator, from left to right.
859 -- Subject to fusion.
860 foldl :: (a -> Char -> a) -> a -> Text -> a
861 foldl f z t = S.foldl f z (stream t)
862 {-# INLINE foldl #-}
863
864 -- | /O(n)/ A strict version of 'foldl'. Subject to fusion.
865 foldl' :: (a -> Char -> a) -> a -> Text -> a
866 foldl' f z t = S.foldl' f z (stream t)
867 {-# INLINE foldl' #-}
868
869 -- | /O(n)/ A variant of 'foldl' that has no starting value argument,
870 -- and thus must be applied to a non-empty 'Text'. Subject to fusion.
871 foldl1 :: (Char -> Char -> Char) -> Text -> Char
872 foldl1 f t = S.foldl1 f (stream t)
873 {-# INLINE foldl1 #-}
874
875 -- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
876 foldl1' :: (Char -> Char -> Char) -> Text -> Char
877 foldl1' f t = S.foldl1' f (stream t)
878 {-# INLINE foldl1' #-}
879
880 -- | /O(n)/ 'foldr', applied to a binary operator, a starting value
881 -- (typically the right-identity of the operator), and a 'Text',
882 -- reduces the 'Text' using the binary operator, from right to left.
883 -- Subject to fusion.
884 foldr :: (Char -> a -> a) -> a -> Text -> a
885 foldr f z t = S.foldr f z (stream t)
886 {-# INLINE foldr #-}
887
888 -- | /O(n)/ A variant of 'foldr' that has no starting value argument,
889 -- and thus must be applied to a non-empty 'Text'. Subject to
890 -- fusion.
891 foldr1 :: (Char -> Char -> Char) -> Text -> Char
892 foldr1 f t = S.foldr1 f (stream t)
893 {-# INLINE foldr1 #-}
894
895 -- -----------------------------------------------------------------------------
896 -- ** Special folds
897
898 -- | /O(n)/ Concatenate a list of 'Text's.
899 concat :: [Text] -> Text
900 concat ts = case ts' of
901 [] -> empty
902 [t] -> t
903 _ -> Text (A.run go) 0 len
904 where
905 ts' = L.filter (not . null) ts
906 len = sumP "concat" $ L.map lengthWord16 ts'
907 go :: ST s (A.MArray s)
908 go = do
909 arr <- A.new len
910 let step i (Text a o l) =
911 let !j = i + l in A.copyI arr i a o j >> return j
912 foldM step 0 ts' >> return arr
913
914 -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
915 -- concatenate the results.
916 concatMap :: (Char -> Text) -> Text -> Text
917 concatMap f = concat . foldr ((:) . f) []
918 {-# INLINE concatMap #-}
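-- For example, doubling every character:
--
-- > concatMap (\c -> pack [c, c]) (pack "abc") == pack "aabbcc"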
919
920 -- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
921 -- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
922 any :: (Char -> Bool) -> Text -> Bool
923 any p t = S.any p (stream t)
924 {-# INLINE any #-}
925
926 -- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
927 -- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
928 all :: (Char -> Bool) -> Text -> Bool
929 all p t = S.all p (stream t)
930 {-# INLINE all #-}
931
932 -- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
933 -- must be non-empty. Subject to fusion.
934 maximum :: Text -> Char
935 maximum t = S.maximum (stream t)
936 {-# INLINE maximum #-}
937
938 -- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
939 -- must be non-empty. Subject to fusion.
940 minimum :: Text -> Char
941 minimum t = S.minimum (stream t)
942 {-# INLINE minimum #-}
943
944 -- -----------------------------------------------------------------------------
945 -- * Building 'Text's
946
947 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
948 -- successive reduced values from the left. Subject to fusion.
949 -- Performs replacement on invalid scalar values.
950 --
951 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
952 --
953 -- Note that
954 --
955 -- > last (scanl f z xs) == foldl f z xs.
956 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
957 scanl f z t = unstream (S.scanl g z (stream t))
958 where g a b = safe (f a b)
959 {-# INLINE scanl #-}
960
961 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
962 -- value argument. Subject to fusion. Performs replacement on
963 -- invalid scalar values.
964 --
965 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
966 scanl1 :: (Char -> Char -> Char) -> Text -> Text
967 scanl1 f t | null t = empty
968 | otherwise = scanl f (unsafeHead t) (unsafeTail t)
969 {-# INLINE scanl1 #-}
970
971 -- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
972 -- replacement on invalid scalar values.
973 --
974 -- > scanr f v == reverse . scanl (flip f) v . reverse
975 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
976 scanr f z = S.reverse . S.reverseScanr g z . reverseStream
977 where g a b = safe (f a b)
978 {-# INLINE scanr #-}
979
980 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
981 -- value argument. Subject to fusion. Performs replacement on
982 -- invalid scalar values.
983 scanr1 :: (Char -> Char -> Char) -> Text -> Text
984 scanr1 f t | null t = empty
985 | otherwise = scanr f (last t) (init t)
986 {-# INLINE scanr1 #-}
987
988 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
989 -- function to each element of a 'Text', passing an accumulating
990 -- parameter from left to right, and returns a final 'Text'. Performs
991 -- replacement on invalid scalar values.
992 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
993 mapAccumL f z0 = S.mapAccumL g z0 . stream
994 where g a b = second safe (f a b)
995 {-# INLINE mapAccumL #-}
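-- An illustrative sketch: upper-case a string while counting its
-- characters in the accumulator:
--
-- > import qualified Data.Char as Char
-- >
-- > mapAccumL (\n c -> (n + 1, Char.toUpper c)) 0 (pack "abc")
-- >     == (3, pack "ABC")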
996
997 -- | The 'mapAccumR' function behaves like a combination of 'map' and
998 -- a strict 'foldr'; it applies a function to each element of a
999 -- 'Text', passing an accumulating parameter from right to left, and
1000 -- returning a final value of this accumulator together with the new
1001 -- 'Text'.
1002 -- Performs replacement on invalid scalar values.
1003 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
1004 mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
1005 where g a b = second safe (f a b)
1006 {-# INLINE mapAccumR #-}
1007
1008 -- -----------------------------------------------------------------------------
1009 -- ** Generating and unfolding 'Text's
1010
1011 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
1012 -- @t@ repeated @n@ times.
1013 replicate :: Int -> Text -> Text
1014 replicate n t@(Text a o l)
1015 | n <= 0 || l <= 0 = empty
1016 | n == 1 = t
1017 | isSingleton t = replicateChar n (unsafeHead t)
1018 | otherwise = Text (A.run x) 0 len
1019 where
1020 len = l `mul` n
1021 x :: ST s (A.MArray s)
1022 x = do
1023 arr <- A.new len
1024 let loop !d !i | i >= n = return arr
1025 | otherwise = let m = d + l
1026 in A.copyI arr d a o m >> loop m (i+1)
1027 loop 0 0
1028 {-# INLINE [1] replicate #-}
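-- For example:
--
-- > replicate 3 (pack "ab") == pack "ababab"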
1029
1030 {-# RULES
1031 "TEXT replicate/singleton -> replicateChar" [~1] forall n c.
1032 replicate n (singleton c) = replicateChar n c
1033 #-}
1034
1035 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
1036 -- value of every element. Subject to fusion.
1037 replicateChar :: Int -> Char -> Text
1038 replicateChar n c = unstream (S.replicateCharI n (safe c))
1039 {-# INLINE replicateChar #-}
1040
1041 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
1042 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
1043 -- 'Text' from a seed value. The function takes the element and
1044 -- returns 'Nothing' if it is done producing the 'Text', otherwise
1045 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
1046 -- string, and @b@ is the seed value for further production. Subject
1047 -- to fusion. Performs replacement on invalid scalar values.
1048 unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
1049 unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
1050 {-# INLINE unfoldr #-}
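-- For instance, a countdown string built from a numeric seed (a sketch
-- using 'Data.Char.intToDigit'):
--
-- > import Data.Char (intToDigit)
-- >
-- > unfoldr (\n -> if n == 0 then Nothing else Just (intToDigit n, n - 1)) 3
-- >     == pack "321"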
1051
1052 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
1053 -- value. However, the length of the result should be limited by the
1054 -- first argument to 'unfoldrN'. This function is more efficient than
1055 -- 'unfoldr' when the maximum length of the result is known and
1056 -- correct, otherwise its performance is similar to 'unfoldr'. Subject
1057 -- to fusion. Performs replacement on invalid scalar values.
1058 unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Text
1059 unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
1060 {-# INLINE unfoldrN #-}
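-- For example, the length bound stops an otherwise endless generator:
--
-- > unfoldrN 3 (\n -> Just ('x', n)) (0 :: Int) == pack "xxx"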
1061
1062 -- -----------------------------------------------------------------------------
1063 -- * Substrings
1064
1065 -- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
1066 -- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
1067 -- the length of the 'Text'. Subject to fusion.
1068 take :: Int -> Text -> Text
1069 take n t@(Text arr off len)
1070 | n <= 0 = empty
1071 | n >= len = t
1072 | otherwise = text arr off (iterN n t)
1073 {-# INLINE [1] take #-}
1074
1075 iterN :: Int -> Text -> Int
1076 iterN n t@(Text _arr _off len) = loop 0 0
1077 where loop !i !cnt
1078 | i >= len || cnt >= n = i
1079 | otherwise = loop (i+d) (cnt+1)
1080 where d = iter_ t i
1081
1082 {-# RULES
1083 "TEXT take -> fused" [~1] forall n t.
1084 take n t = unstream (S.take n (stream t))
1085 "TEXT take -> unfused" [1] forall n t.
1086 unstream (S.take n (stream t)) = take n t
1087 #-}
1088
1089 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix of @t@ consisting of
1090 -- its last @n@ characters.
1091 --
1092 -- Examples:
1093 --
1094 -- > takeEnd 3 "foobar" == "bar"
1095 takeEnd :: Int -> Text -> Text
1096 takeEnd n t@(Text arr off len)
1097 | n <= 0 = empty
1098 | n >= len = t
1099 | otherwise = text arr (off+i) (len-i)
1100 where i = iterNEnd n t
1101
1102 iterNEnd :: Int -> Text -> Int
1103 iterNEnd n t@(Text _arr _off len) = loop (len-1) n
1104 where loop i !m
1105 | i <= 0 = 0
1106 | m <= 1 = i
1107 | otherwise = loop (i+d) (m-1)
1108 where d = reverseIter_ t i
1109
1110 -- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
1111 -- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
1112 -- is greater than the length of the 'Text'. Subject to fusion.
1113 drop :: Int -> Text -> Text
1114 drop n t@(Text arr off len)
1115 | n <= 0 = t
1116 | n >= len = empty
1117 | otherwise = text arr (off+i) (len-i)
1118 where i = iterN n t
1119 {-# INLINE [1] drop #-}
1120
1121 {-# RULES
1122 "TEXT drop -> fused" [~1] forall n t.
1123 drop n t = unstream (S.drop n (stream t))
1124 "TEXT drop -> unfused" [1] forall n t.
1125 unstream (S.drop n (stream t)) = drop n t
1126 #-}
1127
1128 -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
1129 -- dropping @n@ characters from the end of @t@.
1130 --
1131 -- Examples:
1132 --
1133 -- > dropEnd 3 "foobar" == "foo"
1134 dropEnd :: Int -> Text -> Text
1135 dropEnd n t@(Text arr off len)
1136 | n <= 0 = t
1137 | n >= len = empty
1138 | otherwise = text arr off (iterNEnd n t)
1139
1140 -- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
1141 -- returns the longest prefix (possibly empty) of elements that
1142 -- satisfy @p@. Subject to fusion.
1143 takeWhile :: (Char -> Bool) -> Text -> Text
1144 takeWhile p t@(Text arr off len) = loop 0
1145 where loop !i | i >= len = t
1146 | p c = loop (i+d)
1147 | otherwise = text arr off i
1148 where Iter c d = iter t i
1149 {-# INLINE [1] takeWhile #-}
1150
1151 {-# RULES
1152 "TEXT takeWhile -> fused" [~1] forall p t.
1153 takeWhile p t = unstream (S.takeWhile p (stream t))
1154 "TEXT takeWhile -> unfused" [1] forall p t.
1155 unstream (S.takeWhile p (stream t)) = takeWhile p t
1156 #-}
1157
1158 -- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
1159 -- 'takeWhile' @p@ @t@. Subject to fusion.
1160 dropWhile :: (Char -> Bool) -> Text -> Text
1161 dropWhile p t@(Text arr off len) = loop 0 0
1162 where loop !i !l | l >= len = empty
1163 | p c = loop (i+d) (l+d)
1164 | otherwise = Text arr (off+i) (len-l)
1165 where Iter c d = iter t i
1166 {-# INLINE [1] dropWhile #-}
1167
1168 {-# RULES
1169 "TEXT dropWhile -> fused" [~1] forall p t.
1170 dropWhile p t = unstream (S.dropWhile p (stream t))
1171 "TEXT dropWhile -> unfused" [1] forall p t.
1172 unstream (S.dropWhile p (stream t)) = dropWhile p t
1173 #-}
1174
1175 -- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
1176 -- dropping characters that fail the predicate @p@ from the end of
1177 -- @t@. Subject to fusion.
1178 -- Examples:
1179 --
1180 -- > dropWhileEnd (=='.') "foo..." == "foo"
1181 dropWhileEnd :: (Char -> Bool) -> Text -> Text
1182 dropWhileEnd p t@(Text arr off len) = loop (len-1) len
1183 where loop !i !l | l <= 0 = empty
1184 | p c = loop (i+d) (l+d)
1185 | otherwise = Text arr off l
1186 where (c,d) = reverseIter t i
1187 {-# INLINE [1] dropWhileEnd #-}
1188
1189 {-# RULES
1190 "TEXT dropWhileEnd -> fused" [~1] forall p t.
1191 dropWhileEnd p t = S.reverse (S.dropWhile p (S.reverseStream t))
1192 "TEXT dropWhileEnd -> unfused" [1] forall p t.
1193 S.reverse (S.dropWhile p (S.reverseStream t)) = dropWhileEnd p t
1194 #-}
1195
1196 -- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
1197 -- dropping characters that fail the predicate @p@ from both the
1198 -- beginning and end of @t@. Subject to fusion.
1199 dropAround :: (Char -> Bool) -> Text -> Text
1200 dropAround p = dropWhile p . dropWhileEnd p
1201 {-# INLINE [1] dropAround #-}
1202
1203 -- | /O(n)/ Remove leading white space from a string. Equivalent to:
1204 --
1205 -- > dropWhile isSpace
1206 stripStart :: Text -> Text
1207 stripStart = dropWhile isSpace
1208 {-# INLINE [1] stripStart #-}
1209
1210 -- | /O(n)/ Remove trailing white space from a string. Equivalent to:
1211 --
1212 -- > dropWhileEnd isSpace
1213 stripEnd :: Text -> Text
1214 stripEnd = dropWhileEnd isSpace
1215 {-# INLINE [1] stripEnd #-}
1216
1217 -- | /O(n)/ Remove leading and trailing white space from a string.
1218 -- Equivalent to:
1219 --
1220 -- > dropAround isSpace
1221 strip :: Text -> Text
1222 strip = dropAround isSpace
1223 {-# INLINE [1] strip #-}
1224
1225 -- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
1226 -- prefix of @t@ of length @n@, and whose second is the remainder of
1227 -- the string. It is equivalent to @('take' n t, 'drop' n t)@.
1228 splitAt :: Int -> Text -> (Text, Text)
1229 splitAt n t@(Text arr off len)
1230 | n <= 0 = (empty, t)
1231 | n >= len = (t, empty)
1232 | otherwise = let k = iterN n t
1233 in (text arr off k, text arr (off+k) (len-k))
1234
1235 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
1236 -- a pair whose first element is the longest prefix (possibly empty)
1237 -- of @t@ of elements that satisfy @p@, and whose second is the
1238 -- remainder of the list.
1239 span :: (Char -> Bool) -> Text -> (Text, Text)
1240 span p t = case span_ p t of
1241 (# hd,tl #) -> (hd,tl)
1242 {-# INLINE span #-}
1243
1244 -- | /O(n)/ 'break' is like 'span', but the prefix returned is
1245 -- over elements that fail the predicate @p@.
1246 break :: (Char -> Bool) -> Text -> (Text, Text)
1247 break p = span (not . p)
1248 {-# INLINE break #-}
1249
1250 -- | /O(n)/ Group characters in a string according to a predicate.
1251 groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
1252 groupBy p = loop
1253 where
1254 loop t@(Text arr off len)
1255 | null t = []
1256 | otherwise = text arr off n : loop (text arr (off+n) (len-n))
1257 where Iter c d = iter t 0
1258 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
1259
1260 -- | Returns the /array/ index (in units of 'Word16') at which a
1261 -- character may be found. This is /not/ the same as the logical
1262 -- index returned by e.g. 'findIndex'.
1263 findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
1264 findAIndexOrEnd q t@(Text _arr _off len) = go 0
1265 where go !i | i >= len || q c = i
1266 | otherwise = go (i+d)
1267 where Iter c d = iter t i
1268
1269 -- | /O(n)/ Group characters in a string by equality.
1270 group :: Text -> [Text]
1271 group = groupBy (==)
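-- For example:
--
-- > group (pack "Mississippi")
-- >     == [pack "M", pack "i", pack "ss", pack "i", pack "ss", pack "i", pack "pp", pack "i"]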
1272
1273 -- | /O(n)/ Return all initial segments of the given 'Text', shortest
1274 -- first.
1275 inits :: Text -> [Text]
1276 inits t@(Text arr off len) = loop 0
1277 where loop i | i >= len = [t]
1278 | otherwise = Text arr off i : loop (i + iter_ t i)
1279
1280 -- | /O(n)/ Return all final segments of the given 'Text', longest
1281 -- first.
1282 tails :: Text -> [Text]
1283 tails t | null t = [empty]
1284 | otherwise = t : tails (unsafeTail t)
1285
1286 -- $split
1287 --
1288 -- Splitting functions in this library do not perform character-wise
1289 -- copies to create substrings; they just construct new 'Text's that
1290 -- are slices of the original.
1291
1292 -- | /O(m+n)/ Break a 'Text' into pieces separated by the first
1293 -- 'Text' argument, consuming the delimiter. An empty delimiter is
1294 -- invalid, and will cause an error to be raised.
1295 --
1296 -- Examples:
1297 --
1298 -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
1299 -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
1300 -- > splitOn "x" "x" == ["",""]
1301 --
1302 -- and
1303 --
1304 -- > intercalate s . splitOn s == id
1305 -- > splitOn (singleton c) == split (==c)
1306 --
1307 -- In (unlikely) bad cases, this function's time complexity degrades
1308 -- towards /O(n*m)/.
1309 splitOn :: Text -> Text -> [Text]
1310 splitOn pat@(Text _ _ l) src@(Text arr off len)
1311 | l <= 0 = emptyError "splitOn"
1312 | isSingleton pat = split (== unsafeHead pat) src
1313 | otherwise = go 0 (indices pat src)
1314 where
1315 go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs
1316 go s _ = [text arr (s+off) (len-s)]
1317 {-# INLINE [1] splitOn #-}
1318
1319 {-# RULES
1320 "TEXT splitOn/singleton -> split/==" [~1] forall c t.
1321 splitOn (singleton c) t = split (==c) t
1322 #-}
1323
1324 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
1325 -- where the predicate returns True for a separator element. The
1326 -- resulting components do not contain the separators. Two adjacent
1327 -- separators result in an empty component in the output. eg.
1328 --
1329 -- > split (=='a') "aabbaca" == ["","","bb","c",""]
1330 -- > split (=='a') "" == [""]
1331 split :: (Char -> Bool) -> Text -> [Text]
1332 split _ t@(Text _arr _off 0) = [t]
1333 split p t = loop t
1334 where loop s | null s' = [l]
1335 | otherwise = l : loop (unsafeTail s')
1336 where (# l, s' #) = span_ (not . p) s
1337 {-# INLINE split #-}
1338
1339 -- | /O(n)/ Splits a 'Text' into components of length @k@. The last
1340 -- element may be shorter than the other chunks, depending on the
1341 -- length of the input. Examples:
1342 --
1343 -- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
1344 -- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
1345 chunksOf :: Int -> Text -> [Text]
1346 chunksOf k = go
1347 where
1348 go t = case splitAt k t of
1349 (a,b) | null a -> []
1350 | otherwise -> a : go b
1351 {-# INLINE chunksOf #-}
1352
1353 -- ----------------------------------------------------------------------------
1354 -- * Searching
1355
1356 -------------------------------------------------------------------------------
1357 -- ** Searching with a predicate
1358
1359 -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
1360 -- returns the first element matching the predicate, or 'Nothing' if
1361 -- there is no such element.
1362 find :: (Char -> Bool) -> Text -> Maybe Char
1363 find p t = S.findBy p (stream t)
1364 {-# INLINE find #-}
1365
1366 -- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
1367 -- and returns the pair of 'Text's with elements which do and do not
1368 -- satisfy the predicate, respectively; i.e.
1369 --
1370 -- > partition p t == (filter p t, filter (not . p) t)
1371 partition :: (Char -> Bool) -> Text -> (Text, Text)
1372 partition p t = (filter p t, filter (not . p) t)
1373 {-# INLINE partition #-}
1374
1375 -- | /O(n)/ 'filter', applied to a predicate and a 'Text',
1376 -- returns a 'Text' containing those characters that satisfy the
1377 -- predicate.
1378 filter :: (Char -> Bool) -> Text -> Text
1379 filter p t = unstream (S.filter p (stream t))
1380 {-# INLINE filter #-}
1381
1382 -- | /O(n+m)/ Find the first instance of @needle@ (which must be
1383 -- non-'null') in @haystack@. The first element of the returned tuple
1384 -- is the prefix of @haystack@ before @needle@ is matched. The second
1385 -- is the remainder of @haystack@, starting with the match.
1386 --
1387 -- Examples:
1388 --
1389 -- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
1390 -- > breakOn "/" "foobar" ==> ("foobar", "")
1391 --
1392 -- Laws:
1393 --
1394 -- > append prefix match == haystack
1395 -- > where (prefix, match) = breakOn needle haystack
1396 --
1397 -- If you need to break a string by a substring repeatedly (e.g. you
1398 -- want to break on every instance of a substring), use 'breakOnAll'
1399 -- instead, as it has lower startup overhead.
1400 --
1401 -- In (unlikely) bad cases, this function's time complexity degrades
1402 -- towards /O(n*m)/.
1403 breakOn :: Text -> Text -> (Text, Text)
1404 breakOn pat src@(Text arr off len)
1405 | null pat = emptyError "breakOn"
1406 | otherwise = case indices pat src of
1407 [] -> (src, empty)
1408 (x:_) -> (text arr off x, text arr (off+x) (len-x))
1409 {-# INLINE breakOn #-}
1410
1411 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
1412 -- string.
1413 --
1414 -- The first element of the returned tuple is the prefix of @haystack@
1415 -- up to and including the last match of @needle@. The second is the
1416 -- remainder of @haystack@, following the match.
1417 --
1418 -- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
1419 breakOnEnd :: Text -> Text -> (Text, Text)
1420 breakOnEnd pat src = (reverse b, reverse a)
1421 where (a,b) = breakOn (reverse pat) (reverse src)
1422 {-# INLINE breakOnEnd #-}
1423
1424 -- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
1425 -- @haystack@. Each element of the returned list consists of a pair:
1426 --
1427 -- * The entire string prior to the /k/th match (i.e. the prefix)
1428 --
1429 -- * The /k/th match, followed by the remainder of the string
1430 --
1431 -- Examples:
1432 --
1433 -- > breakOnAll "::" ""
1434 -- > ==> []
1435 -- > breakOnAll "/" "a/b/c/"
1436 -- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
1437 --
1438 -- In (unlikely) bad cases, this function's time complexity degrades
1439 -- towards /O(n*m)/.
1440 --
1441 -- The @needle@ parameter may not be empty.
1442 breakOnAll :: Text -- ^ @needle@ to search for
1443 -> Text -- ^ @haystack@ in which to search
1444 -> [(Text, Text)]
1445 breakOnAll pat src@(Text arr off slen)
1446 | null pat = emptyError "breakOnAll"
1447 | otherwise = L.map step (indices pat src)
1448 where
1449 step x = (chunk 0 x, chunk x (slen-x))
1450 chunk !n !l = text arr (n+off) l
1451 {-# INLINE breakOnAll #-}
1452
1453 -------------------------------------------------------------------------------
1454 -- ** Indexing 'Text's
1455
1456 -- $index
1457 --
1458 -- If you think of a 'Text' value as an array of 'Char' values (which
1459 -- it is not), you run the risk of writing inefficient code.
1460 --
1461 -- An idiom that is common in some languages is to find the numeric
1462 -- offset of a character or substring, then use that number to split
1463 -- or trim the searched string. With a 'Text' value, this approach
1464 -- would require two /O(n)/ operations: one to perform the search, and
1465 -- one to operate from wherever the search ended.
1466 --
1467 -- For example, suppose you have a string that you want to split on
1468 -- the substring @\"::\"@, such as @\"foo::bar::quux\"@. Instead of
1469 -- searching for the index of @\"::\"@ and taking the substrings
1470 -- before and after that index, you would instead use @breakOnAll \"::\"@.
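--
-- A sketch of that idiom:
--
-- > breakOnAll (pack "::") (pack "foo::bar::quux")
-- >     == [(pack "foo", pack "::bar::quux"), (pack "foo::bar", pack "::quux")]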
1471
1472 -- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
1473 index :: Text -> Int -> Char
1474 index t n = S.index (stream t) n
1475 {-# INLINE index #-}
1476
1477 -- | /O(n)/ The 'findIndex' function takes a predicate and a 'Text'
1478 -- and returns the index of the first element in the 'Text' satisfying
1479 -- the predicate. Subject to fusion.
1480 findIndex :: (Char -> Bool) -> Text -> Maybe Int
1481 findIndex p t = S.findIndex p (stream t)
1482 {-# INLINE findIndex #-}
1483
1484 -- | /O(n+m)/ The 'count' function returns the number of times the
1485 -- query string appears in the given 'Text'. An empty query string is
1486 -- invalid, and will cause an error to be raised.
1487 --
1488 -- In (unlikely) bad cases, this function's time complexity degrades
1489 -- towards /O(n*m)/.
1490 count :: Text -> Text -> Int
1491 count pat src
1492 | null pat = emptyError "count"
1493 | isSingleton pat = countChar (unsafeHead pat) src
1494 | otherwise = L.length (indices pat src)
1495 {-# INLINE [1] count #-}
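-- For example:
--
-- > count (pack "ss") (pack "mississippi") == 2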
1496
1497 {-# RULES
1498 "TEXT count/singleton -> countChar" [~1] forall c t.
1499 count (singleton c) t = countChar c t
1500 #-}
1501
1502 -- | /O(n)/ The 'countChar' function returns the number of times the
1503 -- query element appears in the given 'Text'. Subject to fusion.
1504 countChar :: Char -> Text -> Int
1505 countChar c t = S.countChar c (stream t)
1506 {-# INLINE countChar #-}
1507
1508 -------------------------------------------------------------------------------
1509 -- * Zipping
1510
1511 -- | /O(n)/ 'zip' takes two 'Text's and returns a list of
1512 -- corresponding pairs of characters. If one input 'Text' is short,
1513 -- excess elements of the longer 'Text' are discarded. This is
1514 -- equivalent to a pair of 'unpack' operations.
1515 zip :: Text -> Text -> [(Char,Char)]
1516 zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
1517 {-# INLINE [0] zip #-}
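-- For example:
--
-- > zip (pack "abc") (pack "wxyz") == [('a','w'), ('b','x'), ('c','y')]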
1518
1519 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
1520 -- given as the first argument, instead of a tupling function.
1521 -- Performs replacement on invalid scalar values.
1522 zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
1523 zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
1524 where g a b = safe (f a b)
1525 {-# INLINE [0] zipWith #-}
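-- For example:
--
-- > zipWith max (pack "abc") (pack "bca") == pack "bcc"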
1526
1527 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
1528 -- representing white space.
1529 words :: Text -> [Text]
1530 words t@(Text arr off len) = loop 0 0
1531 where
1532 loop !start !n
1533 | n >= len = if start == n
1534 then []
1535 else [Text arr (start+off) (n-start)]
1536 | isSpace c =
1537 if start == n
1538 then loop (start+1) (start+1)
1539 else Text arr (start+off) (n-start) : loop (n+d) (n+d)
1540 | otherwise = loop start (n+d)
1541 where Iter c d = iter t n
1542 {-# INLINE words #-}
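-- For example:
--
-- > words (pack "  foo  bar\nbaz ") == [pack "foo", pack "bar", pack "baz"]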
1543
1544 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
1545 -- newline 'Char's. The resulting strings do not contain newlines.
1546 lines :: Text -> [Text]
1547 lines ps | null ps = []
1548 | otherwise = h : if null t
1549 then []
1550 else lines (unsafeTail t)
1551 where (# h,t #) = span_ (/= '\n') ps
1552 {-# INLINE lines #-}
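-- For example (a trailing newline does not produce an empty final line):
--
-- > lines (pack "a\nb\n") == [pack "a", pack "b"]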
1553
1554 {-
1555 -- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line
1556 -- boundaries.
1557 --
1558 -- A line boundary is considered to be either a line feed, a carriage
1559 -- return immediately followed by a line feed, or a carriage return.
1560 -- This accounts for both Unix and Windows line ending conventions,
1561 -- and for the old convention used on Mac OS 9 and earlier.
1562 lines' :: Text -> [Text]
1563 lines' ps | null ps = []
1564 | otherwise = h : case uncons t of
1565 Nothing -> []
1566 Just (c,t')
1567 | c == '\n' -> lines t'
1568 | c == '\r' -> case uncons t' of
1569 Just ('\n',t'') -> lines t''
1570 _ -> lines t'
1571 where (h,t) = span notEOL ps
1572 notEOL c = c /= '\n' && c /= '\r'
1573 {-# INLINE lines' #-}
1574 -}
1575
1576 -- | /O(n)/ Joins lines, after appending a terminating newline to
1577 -- each.
1578 unlines :: [Text] -> Text
1579 unlines = concat . L.map (`snoc` '\n')
1580 {-# INLINE unlines #-}
1581
1582 -- | /O(n)/ Joins words using single space characters.
1583 unwords :: [Text] -> Text
1584 unwords = intercalate (singleton ' ')
1585 {-# INLINE unwords #-}
1586
1587 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
1588 -- 'True' iff the first is a prefix of the second. Subject to fusion.
1589 isPrefixOf :: Text -> Text -> Bool
1590 isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
1591 alen <= blen && S.isPrefixOf (stream a) (stream b)
1592 {-# INLINE [1] isPrefixOf #-}
1593
1594 {-# RULES
1595 "TEXT isPrefixOf -> fused" [~1] forall s t.
1596 isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
1597 #-}
1598
1599 -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
1600 -- 'True' iff the first is a suffix of the second.
1601 isSuffixOf :: Text -> Text -> Bool
1602 isSuffixOf a@(Text _aarr _aoff alen) b@(Text barr boff blen) =
1603 d >= 0 && a == b'
1604 where d = blen - alen
1605 b' | d == 0 = b
1606 | otherwise = Text barr (boff+d) alen
1607 {-# INLINE isSuffixOf #-}
1608
1609 -- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
1610 -- 'True' iff the first is contained, wholly and intact, anywhere
1611 -- within the second.
1612 --
1613 -- In (unlikely) bad cases, this function's time complexity degrades
1614 -- towards /O(n*m)/.
1615 isInfixOf :: Text -> Text -> Bool
1616 isInfixOf needle haystack
1617 | null needle = True
1618 | isSingleton needle = S.elem (unsafeHead needle) . S.stream $ haystack
1619 | otherwise = not . L.null . indices needle $ haystack
1620 {-# INLINE [1] isInfixOf #-}
1621
1622 {-# RULES
1623 "TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
1624 isInfixOf (singleton n) h = S.elem n (S.stream h)
1625 #-}
1626
1627 -------------------------------------------------------------------------------
1628 -- * View patterns
1629
1630 -- | /O(n)/ Return the suffix of the second string if its prefix
1631 -- matches the entire first string.
1632 --
1633 -- Examples:
1634 --
1635 -- > stripPrefix "foo" "foobar" == Just "bar"
1636 -- > stripPrefix "" "baz" == Just "baz"
1637 -- > stripPrefix "foo" "quux" == Nothing
1638 --
1639 -- This is particularly useful with the @ViewPatterns@ extension to
1640 -- GHC, as follows:
1641 --
1642 -- > {-# LANGUAGE ViewPatterns #-}
1643 -- > import Data.Text as T
1644 -- >
1645 -- > fnordLength :: Text -> Int
1646 -- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
1647 -- > fnordLength _ = -1
1648 stripPrefix :: Text -> Text -> Maybe Text
1649 stripPrefix p@(Text _arr _off plen) t@(Text arr off len)
1650 | p `isPrefixOf` t = Just $! text arr (off+plen) (len-plen)
1651 | otherwise = Nothing
1652
1653 -- | /O(n)/ Find the longest non-empty common prefix of two strings
1654 -- and return it, along with the suffixes of each string at which they
1655 -- no longer match.
1656 --
1657 -- If the strings do not have a common prefix or either one is empty,
1658 -- this function returns 'Nothing'.
1659 --
1660 -- Examples:
1661 --
1662 -- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
1663 -- > commonPrefixes "veeble" "fetzer" == Nothing
1664 -- > commonPrefixes "" "baz" == Nothing
1665 commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
1666 commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0
1667 where
1668 go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1)
1669 | i > 0 = Just (Text arr0 off0 i,
1670 text arr0 (off0+i) (len0-i),
1671 text arr1 (off1+j) (len1-j))
1672 | otherwise = Nothing
1673 where Iter a d0 = iter t0 i
1674 Iter b d1 = iter t1 j
1675
1676 -- | /O(n)/ Return the prefix of the second string if its suffix
1677 -- matches the entire first string.
1678 --
1679 -- Examples:
1680 --
1681 -- > stripSuffix "bar" "foobar" == Just "foo"
1682 -- > stripSuffix "" "baz" == Just "baz"
1683 -- > stripSuffix "foo" "quux" == Nothing
1684 --
1685 -- This is particularly useful with the @ViewPatterns@ extension to
1686 -- GHC, as follows:
1687 --
1688 -- > {-# LANGUAGE ViewPatterns #-}
1689 -- > import Data.Text as T
1690 -- >
1691 -- > quuxLength :: Text -> Int
1692 -- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
1693 -- > quuxLength _ = -1
1694 stripSuffix :: Text -> Text -> Maybe Text
1695 stripSuffix p@(Text _arr _off plen) t@(Text arr off len)
1696 | p `isSuffixOf` t = Just $! text arr off (len-plen)
1697 | otherwise = Nothing
1698
1699 -- | Add a list of non-negative numbers. Errors out on overflow.
1700 sumP :: String -> [Int] -> Int
1701 sumP fun = go 0
1702 where go !a (x:xs)
1703 | ax >= 0 = go ax xs
1704 | otherwise = overflowError fun
1705 where ax = a + x
1706 go a _ = a
1707
1708 emptyError :: String -> a
1709 emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input"
1710
1711 overflowError :: String -> a
1712 overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow"
1713
1714 -- | /O(n)/ Make a distinct copy of the given string, sharing no
1715 -- storage with the original string.
1716 --
1717 -- As an example, suppose you read a large string, of which you need
1718 -- only a small portion. If you do not use 'copy', the entire original
1719 -- array will be kept alive in memory by the smaller string. Making a
1720 -- copy \"breaks the link\" to the original array, allowing it to be
1721 -- garbage collected if there are no other live references to it.
1722 copy :: Text -> Text
1723 copy (Text arr off len) = Text (A.run go) 0 len
1724 where
1725 go :: ST s (A.MArray s)
1726 go = do
1727 marr <- A.new len
1728 A.copyI marr 0 arr off len
1729 return marr
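
-- A usage sketch: keep only a small slice of a large input without
-- retaining the whole underlying array ('firstWord' is an illustrative
-- name, not part of this module):
--
-- > firstWord :: Text -> Text
-- > firstWord = copy . takeWhile (/= ' ')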