1 {-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
2 {-# OPTIONS_GHC -fno-warn-orphans #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text
12 -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
13 -- (c) 2009 Duncan Coutts,
14 -- (c) 2008, 2009 Tom Harper
15 --
16 -- License : BSD-style
17 -- Maintainer : bos@serpentine.com
18 -- Stability : experimental
19 -- Portability : GHC
20 --
21 -- A time and space-efficient implementation of Unicode text.
22 -- Suitable for performance critical use, both in terms of large data
23 -- quantities and high speed.
24 --
25 -- /Note/: Read below the synopsis for important notes on the use of
26 -- this module.
27 --
28 -- This module is intended to be imported @qualified@, to avoid name
29 -- clashes with "Prelude" functions, e.g.
30 --
31 -- > import qualified Data.Text as T
32 --
33 -- To use an extended and very rich family of functions for working
34 -- with Unicode text (including normalization, regular expressions,
35 -- non-standard encodings, text breaking, and locales), see
36 -- <http://hackage.haskell.org/package/text-icu the text-icu package >.
37
38 module Data.Text
39 (
40 -- * Strict vs lazy types
41 -- $strict
42
43 -- * Acceptable data
44 -- $replacement
45
46 -- * Fusion
47 -- $fusion
48
49 -- * Types
50 Text
51
52 -- * Creation and elimination
53 , pack
54 , unpack
55 , singleton
56 , empty
57
58 -- * Basic interface
59 , cons
60 , snoc
61 , append
62 , uncons
63 , head
64 , last
65 , tail
66 , init
67 , null
68 , length
69 , compareLength
70
71 -- * Transformations
72 , map
73 , intercalate
74 , intersperse
75 , transpose
76 , reverse
77 , replace
78
79 -- ** Case conversion
80 -- $case
81 , toCaseFold
82 , toLower
83 , toUpper
84 , toTitle
85
86 -- ** Justification
87 , justifyLeft
88 , justifyRight
89 , center
90
91 -- * Folds
92 , foldl
93 , foldl'
94 , foldl1
95 , foldl1'
96 , foldr
97 , foldr1
98
99 -- ** Special folds
100 , concat
101 , concatMap
102 , any
103 , all
104 , maximum
105 , minimum
106
107 -- * Construction
108
109 -- ** Scans
110 , scanl
111 , scanl1
112 , scanr
113 , scanr1
114
115 -- ** Accumulating maps
116 , mapAccumL
117 , mapAccumR
118
119 -- ** Generation and unfolding
120 , replicate
121 , unfoldr
122 , unfoldrN
123
124 -- * Substrings
125
126 -- ** Breaking strings
127 , take
128 , takeEnd
129 , drop
130 , dropEnd
131 , takeWhile
132 , takeWhileEnd
133 , dropWhile
134 , dropWhileEnd
135 , dropAround
136 , strip
137 , stripStart
138 , stripEnd
139 , splitAt
140 , breakOn
141 , breakOnEnd
142 , break
143 , span
144 , group
145 , groupBy
146 , inits
147 , tails
148
149 -- ** Breaking into many substrings
150 -- $split
151 , splitOn
152 , split
153 , chunksOf
154
155 -- ** Breaking into lines and words
156 , lines
157 --, lines'
158 , words
159 , unlines
160 , unwords
161
162 -- * Predicates
163 , isPrefixOf
164 , isSuffixOf
165 , isInfixOf
166
167 -- ** View patterns
168 , stripPrefix
169 , stripSuffix
170 , commonPrefixes
171
172 -- * Searching
173 , filter
174 , breakOnAll
175 , find
176 , partition
177
178 -- , findSubstring
179
180 -- * Indexing
181 -- $index
182 , index
183 , findIndex
184 , count
185
186 -- * Zipping
187 , zip
188 , zipWith
189
190 -- -* Ordered text
191 -- , sort
192
193 -- * Low level operations
194 , copy
195 , unpackCString#
196 ) where
197
198 import Prelude (Char, Bool(..), Int, Maybe(..), String,
199 Eq(..), Ord(..), Ordering(..), (++),
200 Read(..),
201 (&&), (||), (+), (-), (.), ($), ($!), (>>),
202 not, return, otherwise, quot)
203 #if defined(HAVE_DEEPSEQ)
204 import Control.DeepSeq (NFData(rnf))
205 #endif
206 #if defined(ASSERTS)
207 import Control.Exception (assert)
208 #endif
209 import Data.Char (isSpace)
210 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
211 Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
212 import Control.Monad (foldM)
213 import Control.Monad.ST (ST)
214 import qualified Data.Text.Array as A
215 import qualified Data.List as L
216 import Data.Binary (Binary(get, put))
217 import Data.Monoid (Monoid(..))
218 #if MIN_VERSION_base(4,9,0)
219 import Data.Semigroup (Semigroup(..))
220 #endif
221 import Data.String (IsString(..))
222 import qualified Data.Text.Internal.Fusion as S
223 import qualified Data.Text.Internal.Fusion.Common as S
224 import Data.Text.Encoding (decodeUtf8', encodeUtf8)
225 import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
226 import Data.Text.Internal.Private (span_)
227 import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
228 import Data.Text.Show (singleton, unpack, unpackCString#)
229 import qualified Prelude as P
230 import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
231 reverseIter_, unsafeHead, unsafeTail)
232 import Data.Text.Internal.Unsafe.Char (unsafeChr)
233 import qualified Data.Text.Internal.Functions as F
234 import qualified Data.Text.Internal.Encoding.Utf16 as U16
235 import Data.Text.Internal.Search (indices)
236 #if defined(__HADDOCK__)
237 import Data.ByteString (ByteString)
238 import qualified Data.Text.Lazy as L
239 import Data.Int (Int64)
240 #endif
241 #if __GLASGOW_HASKELL__ >= 708
242 import qualified GHC.Exts as Exts
243 #endif
244 #if MIN_VERSION_base(4,7,0)
245 import Text.Printf (PrintfArg, formatArg, formatString)
246 #endif
247
248 -- $strict
249 --
250 -- This package provides both strict and lazy 'Text' types. The
251 -- strict type is provided by the "Data.Text" module, while the lazy
252 -- type is provided by the "Data.Text.Lazy" module. Internally, the
253 -- lazy @Text@ type consists of a list of strict chunks.
254 --
255 -- The strict 'Text' type requires that an entire string fit into
256 -- memory at once. The lazy 'Data.Text.Lazy.Text' type is capable of
257 -- streaming strings that are larger than memory using a small memory
258 -- footprint. In many cases, the overhead of chunked streaming makes
259 -- the lazy 'Data.Text.Lazy.Text' type slower than its strict
260 -- counterpart, but this is not always the case. Sometimes, the time
261 -- complexity of a function in one module may differ from that of its
262 -- counterpart in the other module, due to their differing internal
263 -- structures.
263 --
264 -- Each module provides an almost identical API, with the main
265 -- difference being that the strict module uses 'Int' values for
266 -- lengths and counts, while the lazy module uses 'Data.Int.Int64'
267 -- lengths.
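--
-- Conversion between the two representations goes through
-- "Data.Text.Lazy". A minimal sketch, assuming that module is in scope
-- qualified as @TL@:
--
-- > import qualified Data.Text as T
-- > import qualified Data.Text.Lazy as TL
-- >
-- > lazy :: TL.Text
-- > lazy = TL.fromStrict (T.pack "hello")
-- >
-- > strict :: T.Text
-- > strict = TL.toStrict lazy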
268
269 -- $replacement
270 --
271 -- A 'Text' value is a sequence of Unicode scalar values, as defined
272 -- in
273 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
274 -- As such, a 'Text' cannot contain values in the range U+D800 to
275 -- U+DFFF inclusive. Haskell implementations admit all Unicode code
276 -- points
277 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
278 -- as 'Char' values, including code points from this invalid range.
279 -- This means that there are some 'Char' values that are not valid
280 -- Unicode scalar values, and the functions in this module must handle
281 -- those cases.
282 --
283 -- Within this module, many functions construct a 'Text' from one or
284 -- more 'Char' values. Those functions will substitute 'Char' values
285 -- that are not valid Unicode scalar values with the replacement
286 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
287 -- inspection and replacement are documented with the phrase
288 -- \"Performs replacement on invalid scalar values\".
289 --
290 -- (One reason for this policy of replacement is that internally, a
291 -- 'Text' value is represented as packed UTF-16 data. Values in the
292 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
293 -- code points, and so cannot be represented. The functions replace
294 -- invalid scalar values, instead of dropping them, as a security
295 -- measure. For details, see
296 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
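--
-- A small illustration of this policy ('\xD800' is a lone surrogate,
-- and hence not a valid scalar value):
--
-- > pack ['a', '\xD800', 'b'] == pack ['a', '\xFFFD', 'b']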
297
298 -- $fusion
299 --
300 -- Most of the functions in this module are subject to /fusion/,
301 -- meaning that a pipeline of such functions will usually allocate at
302 -- most one 'Text' value.
303 --
304 -- As an example, consider the following pipeline:
305 --
306 -- > import Data.Text as T
307 -- > import Data.Text.Encoding as E
308 -- > import Data.ByteString (ByteString)
309 -- >
310 -- > countChars :: ByteString -> Int
311 -- > countChars = T.length . T.toUpper . E.decodeUtf8
312 --
313 -- From the type signatures involved, this looks like it should
314 -- allocate one 'Data.ByteString.ByteString' value, and two 'Text'
315 -- values. However, when a module is compiled with optimisation
316 -- enabled under GHC, the two intermediate 'Text' values will be
317 -- optimised away, and the function will be compiled down to a single
318 -- loop over the source 'Data.ByteString.ByteString'.
319 --
320 -- Functions that can be fused by the compiler are documented with the
321 -- phrase \"Subject to fusion\".
322
323 instance Eq Text where
324 Text arrA offA lenA == Text arrB offB lenB
325 | lenA == lenB = A.equal arrA offA arrB offB lenA
326 | otherwise = False
327 {-# INLINE (==) #-}
328
329 instance Ord Text where
330 compare = compareText
331
332 instance Read Text where
333 readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
334
335 #if MIN_VERSION_base(4,9,0)
336 -- Semigroup orphan instances for older GHCs are provided by the
337 -- 'semigroups' package.
338
339 instance Semigroup Text where
340 (<>) = append
341 #endif
342
343 instance Monoid Text where
344 mempty = empty
345 #if MIN_VERSION_base(4,9,0)
346 mappend = (<>) -- future-proof definition
347 #else
348 mappend = append
349 #endif
350 mconcat = concat
351
352 instance IsString Text where
353 fromString = pack
354
355 #if __GLASGOW_HASKELL__ >= 708
356 instance Exts.IsList Text where
357 type Item Text = Char
358 fromList = pack
359 toList = unpack
360 #endif
361
362 #if defined(HAVE_DEEPSEQ)
363 instance NFData Text where rnf !_ = ()
364 #endif
365
366 instance Binary Text where
367 put t = put (encodeUtf8 t)
368 get = do
369 bs <- get
370 case decodeUtf8' bs of
371 P.Left exn -> P.fail (P.show exn)
372 P.Right a -> P.return a
373
374 -- | This instance preserves data abstraction at the cost of inefficiency.
375 -- We omit reflection services for the sake of data abstraction.
376 --
377 -- This instance was created by copying the updated behavior of
378 -- @"Data.Set".@'Data.Set.Set' and @"Data.Map".@'Data.Map.Map'. If you
379 -- feel a mistake has been made, please feel free to submit
380 -- improvements.
381 --
382 -- The original discussion is archived here:
383 -- <http://groups.google.com/group/haskell-cafe/browse_thread/thread/b5bbb1b28a7e525d/0639d46852575b93 could we get a Data instance for Data.Text.Text? >
384 --
385 -- The followup discussion that changed the behavior of 'Data.Set.Set'
386 -- and 'Data.Map.Map' is archived here:
387 -- <http://markmail.org/message/trovdc6zkphyi3cr#query:+page:1+mid:a46der3iacwjcf6n+state:results Proposal: Allow gunfold for Data.Map, ... >
388
389 instance Data Text where
390 gfoldl f z txt = z pack `f` (unpack txt)
391 toConstr _ = packConstr
392 gunfold k z c = case constrIndex c of
393 1 -> k (z pack)
394 _ -> P.error "gunfold"
395 dataTypeOf _ = textDataType
396
397 #if MIN_VERSION_base(4,7,0)
398 -- | Only defined for @base-4.7.0.0@ and later
399 instance PrintfArg Text where
400 formatArg txt = formatString $ unpack txt
401 #endif
402
403 packConstr :: Constr
404 packConstr = mkConstr textDataType "pack" [] Prefix
405
406 textDataType :: DataType
407 textDataType = mkDataType "Data.Text.Text" [packConstr]
408
409 -- | /O(n)/ Compare two 'Text' values lexicographically.
410 compareText :: Text -> Text -> Ordering
411 compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB)
412 | lenA == 0 && lenB == 0 = EQ
413 | otherwise = go 0 0
414 where
415 go !i !j
416 | i >= lenA || j >= lenB = compare lenA lenB
417 | a < b = LT
418 | a > b = GT
419 | otherwise = go (i+di) (j+dj)
420 where Iter a di = iter ta i
421 Iter b dj = iter tb j
422
423 -- -----------------------------------------------------------------------------
424 -- * Conversion to/from 'Text'
425
426 -- | /O(n)/ Convert a 'String' into a 'Text'. Subject to
427 -- fusion. Performs replacement on invalid scalar values.
428 pack :: String -> Text
429 pack = unstream . S.map safe . S.streamList
430 {-# INLINE [1] pack #-}
431
432 -- -----------------------------------------------------------------------------
433 -- * Basic functions
434
435 -- | /O(n)/ Adds a character to the front of a 'Text'. This function
436 -- is more costly than its 'List' counterpart because it requires
437 -- copying the entire array. Subject to fusion. Performs replacement on
438 -- invalid scalar values.
439 cons :: Char -> Text -> Text
440 cons c t = unstream (S.cons (safe c) (stream t))
441 {-# INLINE cons #-}
442
443 infixr 5 `cons`
444
445 -- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
446 -- entire array in the process, unless fused. Subject to fusion.
447 -- Performs replacement on invalid scalar values.
448 snoc :: Text -> Char -> Text
449 snoc t c = unstream (S.snoc (stream t) (safe c))
450 {-# INLINE snoc #-}
451
452 -- | /O(n)/ Appends one 'Text' to the other by copying both of them
453 -- into a new 'Text'. Subject to fusion.
454 append :: Text -> Text -> Text
455 append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
456 | len1 == 0 = b
457 | len2 == 0 = a
458 | len > 0 = Text (A.run x) 0 len
459 | otherwise = overflowError "append"
460 where
461 len = len1+len2
462 x :: ST s (A.MArray s)
463 x = do
464 arr <- A.new len
465 A.copyI arr 0 arr1 off1 len1
466 A.copyI arr len1 arr2 off2 len
467 return arr
468 {-# NOINLINE append #-}
469
470 {-# RULES
471 "TEXT append -> fused" [~1] forall t1 t2.
472 append t1 t2 = unstream (S.append (stream t1) (stream t2))
473 "TEXT append -> unfused" [1] forall t1 t2.
474 unstream (S.append (stream t1) (stream t2)) = append t1 t2
475 #-}
476
477 -- | /O(1)/ Returns the first character of a 'Text', which must be
478 -- non-empty. Subject to fusion.
479 head :: Text -> Char
480 head t = S.head (stream t)
481 {-# INLINE head #-}
482
483 -- | /O(1)/ Returns the first character and rest of a 'Text', or
484 -- 'Nothing' if empty. Subject to fusion.
485 uncons :: Text -> Maybe (Char, Text)
486 uncons t@(Text arr off len)
487 | len <= 0 = Nothing
488 | otherwise = Just (c, text arr (off+d) (len-d))
489 where Iter c d = iter t 0
490 {-# INLINE [1] uncons #-}
491
492 -- | Lifted from Control.Arrow and specialized.
493 second :: (b -> c) -> (a,b) -> (a,c)
494 second f (a, b) = (a, f b)
495
496 -- | /O(1)/ Returns the last character of a 'Text', which must be
497 -- non-empty. Subject to fusion.
498 last :: Text -> Char
499 last (Text arr off len)
500 | len <= 0 = emptyError "last"
501 | n < 0xDC00 || n > 0xDFFF = unsafeChr n
502 | otherwise = U16.chr2 n0 n
503 where n = A.unsafeIndex arr (off+len-1)
504 n0 = A.unsafeIndex arr (off+len-2)
505 {-# INLINE [1] last #-}
506
507 {-# RULES
508 "TEXT last -> fused" [~1] forall t.
509 last t = S.last (stream t)
510 "TEXT last -> unfused" [1] forall t.
511 S.last (stream t) = last t
512 #-}
513
514 -- | /O(1)/ Returns all characters after the head of a 'Text', which
515 -- must be non-empty. Subject to fusion.
516 tail :: Text -> Text
517 tail t@(Text arr off len)
518 | len <= 0 = emptyError "tail"
519 | otherwise = text arr (off+d) (len-d)
520 where d = iter_ t 0
521 {-# INLINE [1] tail #-}
522
523 {-# RULES
524 "TEXT tail -> fused" [~1] forall t.
525 tail t = unstream (S.tail (stream t))
526 "TEXT tail -> unfused" [1] forall t.
527 unstream (S.tail (stream t)) = tail t
528 #-}
529
530 -- | /O(1)/ Returns all but the last character of a 'Text', which must
531 -- be non-empty. Subject to fusion.
532 init :: Text -> Text
533 init (Text arr off len) | len <= 0 = emptyError "init"
534 | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
535 | otherwise = text arr off (len-1)
536 where
537 n = A.unsafeIndex arr (off+len-1)
538 {-# INLINE [1] init #-}
539
540 {-# RULES
541 "TEXT init -> fused" [~1] forall t.
542 init t = unstream (S.init (stream t))
543 "TEXT init -> unfused" [1] forall t.
544 unstream (S.init (stream t)) = init t
545 #-}
546
547 -- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
548 -- fusion.
549 null :: Text -> Bool
550 null (Text _arr _off len) =
551 #if defined(ASSERTS)
552 assert (len >= 0) $
553 #endif
554 len <= 0
555 {-# INLINE [1] null #-}
556
557 {-# RULES
558 "TEXT null -> fused" [~1] forall t.
559 null t = S.null (stream t)
560 "TEXT null -> unfused" [1] forall t.
561 S.null (stream t) = null t
562 #-}
563
564 -- | /O(1)/ Tests whether a 'Text' contains exactly one character.
565 -- Subject to fusion.
566 isSingleton :: Text -> Bool
567 isSingleton = S.isSingleton . stream
568 {-# INLINE isSingleton #-}
569
570 -- | /O(n)/ Returns the number of characters in a 'Text'.
571 -- Subject to fusion.
572 length :: Text -> Int
573 length t = S.length (stream t)
574 {-# INLINE length #-}
575
576 -- | /O(n)/ Compare the count of characters in a 'Text' to a number.
577 -- Subject to fusion.
578 --
579 -- This function gives the same answer as comparing against the result
580 -- of 'length', but can short circuit if the count of characters is
581 -- greater than the number, and hence be more efficient.
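--
-- For example:
--
-- > compareLength "abc" 2 == GT
-- > compareLength "ab"  3 == LT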
582 compareLength :: Text -> Int -> Ordering
583 compareLength t n = S.compareLengthI (stream t) n
584 {-# INLINE [1] compareLength #-}
585
586 {-# RULES
587 "TEXT compareN/length -> compareLength" [~1] forall t n.
588 compare (length t) n = compareLength t n
589 #-}
590
591 {-# RULES
592 "TEXT ==N/length -> compareLength/==EQ" [~1] forall t n.
593 (==) (length t) n = compareLength t n == EQ
594 #-}
595
596 {-# RULES
597 "TEXT /=N/length -> compareLength//=EQ" [~1] forall t n.
598 (/=) (length t) n = compareLength t n /= EQ
599 #-}
600
601 {-# RULES
602 "TEXT <N/length -> compareLength/==LT" [~1] forall t n.
603 (<) (length t) n = compareLength t n == LT
604 #-}
605
606 {-# RULES
607 "TEXT <=N/length -> compareLength//=GT" [~1] forall t n.
608 (<=) (length t) n = compareLength t n /= GT
609 #-}
610
611 {-# RULES
612 "TEXT >N/length -> compareLength/==GT" [~1] forall t n.
613 (>) (length t) n = compareLength t n == GT
614 #-}
615
616 {-# RULES
617 "TEXT >=N/length -> compareLength//=LT" [~1] forall t n.
618 (>=) (length t) n = compareLength t n /= LT
619 #-}
620
621 -- -----------------------------------------------------------------------------
622 -- * Transformations
623 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
624 -- each element of @t@. Subject to fusion. Performs replacement on
625 -- invalid scalar values.
626 map :: (Char -> Char) -> Text -> Text
627 map f t = unstream (S.map (safe . f) (stream t))
628 {-# INLINE [1] map #-}
629
630 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
631 -- 'Text's and concatenates the list after interspersing the first
632 -- argument between each element of the list.
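--
-- Example:
--
-- > intercalate ", " ["one","two","three"] == "one, two, three"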
633 intercalate :: Text -> [Text] -> Text
634 intercalate t = concat . (F.intersperse t)
635 {-# INLINE intercalate #-}
636
637 -- | /O(n)/ The 'intersperse' function takes a character and places it
638 -- between the characters of a 'Text'. Subject to fusion. Performs
639 -- replacement on invalid scalar values.
640 intersperse :: Char -> Text -> Text
641 intersperse c t = unstream (S.intersperse (safe c) (stream t))
642 {-# INLINE intersperse #-}
643
644 -- | /O(n)/ Reverse the characters of a string. Subject to fusion.
645 reverse :: Text -> Text
646 reverse t = S.reverse (stream t)
647 {-# INLINE reverse #-}
648
649 -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
650 -- @haystack@ with @replacement@.
651 --
652 -- This function behaves as though it was defined as follows:
653 --
654 -- @
655 -- replace needle replacement haystack =
656 -- 'intercalate' replacement ('splitOn' needle haystack)
657 -- @
658 --
659 -- As this suggests, each occurrence is replaced exactly once. So if
660 -- @needle@ occurs in @replacement@, that occurrence will /not/ itself
661 -- be replaced recursively:
662 --
663 -- > replace "oo" "foo" "oo" == "foo"
664 --
665 -- In cases where several instances of @needle@ overlap, only the
666 -- first one will be replaced:
667 --
668 -- > replace "ofo" "bar" "ofofo" == "barfo"
669 --
670 -- In (unlikely) bad cases, this function's time complexity degrades
671 -- towards /O(n*m)/.
672 replace :: Text
673 -- ^ @needle@ to search for. If this string is empty, an
674 -- error will occur.
675 -> Text
676 -- ^ @replacement@ to replace @needle@ with.
677 -> Text
678 -- ^ @haystack@ in which to search.
679 -> Text
680 replace needle@(Text _ _ neeLen)
681 (Text repArr repOff repLen)
682 haystack@(Text hayArr hayOff hayLen)
683 | neeLen == 0 = emptyError "replace"
684 | L.null ixs = haystack
685 | len > 0 = Text (A.run x) 0 len
686 | otherwise = empty
687 where
688 ixs = indices needle haystack
689 len = hayLen - (neeLen - repLen) `mul` L.length ixs
690 x :: ST s (A.MArray s)
691 x = do
692 marr <- A.new len
693 let loop (i:is) o d = do
694 let d0 = d + i - o
695 d1 = d0 + repLen
696 A.copyI marr d hayArr (hayOff+o) d0
697 A.copyI marr d0 repArr repOff d1
698 loop is (i + neeLen) d1
699 loop [] o d = A.copyI marr d hayArr (hayOff+o) len
700 loop ixs 0 0
701 return marr
702
703 -- ----------------------------------------------------------------------------
704 -- ** Case conversions (folds)
705
706 -- $case
707 --
708 -- When case converting 'Text' values, do not use combinators like
709 -- @map toUpper@ to case convert each character of a string
710 -- individually, as this gives incorrect results according to the
711 -- rules of some writing systems. The whole-string case conversion
712 -- functions from this module, such as @toUpper@, obey the correct
713 -- case conversion rules. As a result, these functions may map one
714 -- input character to two or three output characters. For examples,
715 -- see the documentation of each function.
716 --
717 -- /Note/: In some languages, case conversion is a locale- and
718 -- context-dependent operation. The case conversion functions in this
719 -- module are /not/ locale sensitive. Programs that require locale
720 -- sensitivity should use appropriate versions of the
721 -- <http://hackage.haskell.org/package/text-icu-0.6.3.7/docs/Data-Text-ICU.html#g:4 case mapping functions from the text-icu package >.
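--
-- As a sketch of the difference, a character-by-character 'map' with
-- @Data.Char.toUpper@ leaves the German eszett unchanged, while the
-- whole-string 'toUpper' expands it:
--
-- > map Data.Char.toUpper (pack "straße") == pack "STRAßE"
-- > toUpper (pack "straße")               == pack "STRASSE"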
722
723 -- | /O(n)/ Convert a string to folded case. Subject to fusion.
724 --
725 -- This function is mainly useful for performing caseless (also known
726 -- as case insensitive) string comparisons.
727 --
728 -- A string @x@ is a caseless match for a string @y@ if and only if:
729 --
730 -- @toCaseFold x == toCaseFold y@
731 --
732 -- The result string may be longer than the input string, and may
733 -- differ from applying 'toLower' to the input string. For instance,
734 -- the Armenian small ligature \"&#xfb13;\" (men now, U+FB13) is case
735 -- folded to the sequence \"&#x574;\" (men, U+0574) followed by
736 -- \"&#x576;\" (now, U+0576), while the Greek \"&#xb5;\" (micro sign,
737 -- U+00B5) is case folded to \"&#x3bc;\" (small letter mu, U+03BC)
738 -- instead of itself.
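--
-- A caseless-comparison helper is then a one-liner (a sketch; the name
-- @ciEqual@ is illustrative and not part of this module):
--
-- > ciEqual :: Text -> Text -> Bool
-- > ciEqual a b = toCaseFold a == toCaseFold b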
739 toCaseFold :: Text -> Text
740 toCaseFold t = unstream (S.toCaseFold (stream t))
741 {-# INLINE toCaseFold #-}
742
743 -- | /O(n)/ Convert a string to lower case, using simple case
744 -- conversion. Subject to fusion.
745 --
746 -- The result string may be longer than the input string. For
747 -- instance, \"&#x130;\" (Latin capital letter I with dot above,
748 -- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069)
749 -- followed by \" &#x307;\" (combining dot above, U+0307).
750 toLower :: Text -> Text
751 toLower t = unstream (S.toLower (stream t))
752 {-# INLINE toLower #-}
753
754 -- | /O(n)/ Convert a string to upper case, using simple case
755 -- conversion. Subject to fusion.
756 --
757 -- The result string may be longer than the input string. For
758 -- instance, the German \"&#xdf;\" (eszett, U+00DF) maps to the
759 -- two-letter sequence \"SS\".
760 toUpper :: Text -> Text
761 toUpper t = unstream (S.toUpper (stream t))
762 {-# INLINE toUpper #-}
763
764 -- | /O(n)/ Convert a string to title case, using simple case
765 -- conversion. Subject to fusion.
766 --
767 -- The first letter of the input is converted to title case, as is
768 -- every subsequent letter that immediately follows a non-letter.
769 -- Every letter that immediately follows another letter is converted
770 -- to lower case.
771 --
772 -- The result string may be longer than the input string. For example,
773 -- the Latin small ligature &#xfb02; (U+FB02) is converted to the
774 -- sequence Latin capital letter F (U+0046) followed by Latin small
775 -- letter l (U+006C).
776 --
777 -- /Note/: this function does not take language or culture specific
778 -- rules into account. For instance, in English, different style
779 -- guides disagree on whether the book name \"The Hill of the Red
780 -- Fox\" is correctly title cased&#x2014;but this function will
781 -- capitalize /every/ word.
782 toTitle :: Text -> Text
783 toTitle t = unstream (S.toTitle (stream t))
784 {-# INLINE toTitle #-}
785
786 -- | /O(n)/ Left-justify a string to the given length, using the
787 -- specified fill character on the right. Subject to fusion.
788 -- Performs replacement on invalid scalar values.
789 --
790 -- Examples:
791 --
792 -- > justifyLeft 7 'x' "foo" == "fooxxxx"
793 -- > justifyLeft 3 'x' "foobar" == "foobar"
794 justifyLeft :: Int -> Char -> Text -> Text
795 justifyLeft k c t
796 | len >= k = t
797 | otherwise = t `append` replicateChar (k-len) c
798 where len = length t
799 {-# INLINE [1] justifyLeft #-}
800
801 {-# RULES
802 "TEXT justifyLeft -> fused" [~1] forall k c t.
803 justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
804 "TEXT justifyLeft -> unfused" [1] forall k c t.
805 unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
806 #-}
807
808 -- | /O(n)/ Right-justify a string to the given length, using the
809 -- specified fill character on the left. Performs replacement on
810 -- invalid scalar values.
811 --
812 -- Examples:
813 --
814 -- > justifyRight 7 'x' "bar" == "xxxxbar"
815 -- > justifyRight 3 'x' "foobar" == "foobar"
816 justifyRight :: Int -> Char -> Text -> Text
817 justifyRight k c t
818 | len >= k = t
819 | otherwise = replicateChar (k-len) c `append` t
820 where len = length t
821 {-# INLINE justifyRight #-}
822
823 -- | /O(n)/ Center a string to the given length, using the specified
824 -- fill character on either side. Performs replacement on invalid
825 -- scalar values.
826 --
827 -- Examples:
828 --
829 -- > center 8 'x' "HS" == "xxxHSxxx"
830 center :: Int -> Char -> Text -> Text
831 center k c t
832 | len >= k = t
833 | otherwise = replicateChar l c `append` t `append` replicateChar r c
834 where len = length t
835 d = k - len
836 r = d `quot` 2
837 l = d - r
838 {-# INLINE center #-}
839
840 -- | /O(n)/ The 'transpose' function transposes the rows and columns
841 -- of its 'Text' argument. Note that this function uses 'pack',
842 -- 'unpack', and the list version of transpose, and is thus not very
843 -- efficient.
844 transpose :: [Text] -> [Text]
845 transpose ts = P.map pack (L.transpose (P.map unpack ts))
846
847 -- -----------------------------------------------------------------------------
848 -- * Reducing 'Text's (folds)
849
850 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
851 -- (typically the left-identity of the operator), and a 'Text',
852 -- reduces the 'Text' using the binary operator, from left to right.
853 -- Subject to fusion.
854 foldl :: (a -> Char -> a) -> a -> Text -> a
855 foldl f z t = S.foldl f z (stream t)
856 {-# INLINE foldl #-}
857
858 -- | /O(n)/ A strict version of 'foldl'. Subject to fusion.
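--
-- For example, counting characters with a strict accumulator:
--
-- > foldl' (\n _ -> n + 1) 0 "abc" == 3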
859 foldl' :: (a -> Char -> a) -> a -> Text -> a
860 foldl' f z t = S.foldl' f z (stream t)
861 {-# INLINE foldl' #-}
862
863 -- | /O(n)/ A variant of 'foldl' that has no starting value argument,
864 -- and thus must be applied to a non-empty 'Text'. Subject to fusion.
865 foldl1 :: (Char -> Char -> Char) -> Text -> Char
866 foldl1 f t = S.foldl1 f (stream t)
867 {-# INLINE foldl1 #-}
868
869 -- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
870 foldl1' :: (Char -> Char -> Char) -> Text -> Char
871 foldl1' f t = S.foldl1' f (stream t)
872 {-# INLINE foldl1' #-}
873
874 -- | /O(n)/ 'foldr', applied to a binary operator, a starting value
875 -- (typically the right-identity of the operator), and a 'Text',
876 -- reduces the 'Text' using the binary operator, from right to left.
877 -- Subject to fusion.
878 foldr :: (Char -> a -> a) -> a -> Text -> a
879 foldr f z t = S.foldr f z (stream t)
880 {-# INLINE foldr #-}
881
882 -- | /O(n)/ A variant of 'foldr' that has no starting value argument,
883 -- and thus must be applied to a non-empty 'Text'. Subject to
884 -- fusion.
885 foldr1 :: (Char -> Char -> Char) -> Text -> Char
886 foldr1 f t = S.foldr1 f (stream t)
887 {-# INLINE foldr1 #-}
888
889 -- -----------------------------------------------------------------------------
890 -- ** Special folds
891
892 -- | /O(n)/ Concatenate a list of 'Text's.
893 concat :: [Text] -> Text
894 concat ts = case ts' of
895 [] -> empty
896 [t] -> t
897 _ -> Text (A.run go) 0 len
898 where
899 ts' = L.filter (not . null) ts
900 len = sumP "concat" $ L.map lengthWord16 ts'
901 go :: ST s (A.MArray s)
902 go = do
903 arr <- A.new len
904 let step i (Text a o l) =
905 let !j = i + l in A.copyI arr i a o j >> return j
906 foldM step 0 ts' >> return arr
907
908 -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
909 -- concatenate the results.
910 concatMap :: (Char -> Text) -> Text -> Text
911 concatMap f = concat . foldr ((:) . f) []
912 {-# INLINE concatMap #-}
913
914 -- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
915 -- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
916 any :: (Char -> Bool) -> Text -> Bool
917 any p t = S.any p (stream t)
918 {-# INLINE any #-}
919
920 -- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
921 -- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
922 all :: (Char -> Bool) -> Text -> Bool
923 all p t = S.all p (stream t)
924 {-# INLINE all #-}
925
926 -- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
927 -- must be non-empty. Subject to fusion.
928 maximum :: Text -> Char
929 maximum t = S.maximum (stream t)
930 {-# INLINE maximum #-}
931
932 -- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
933 -- must be non-empty. Subject to fusion.
934 minimum :: Text -> Char
935 minimum t = S.minimum (stream t)
936 {-# INLINE minimum #-}
937
938 -- -----------------------------------------------------------------------------
939 -- * Building 'Text's
940
941 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
942 -- successive reduced values from the left. Subject to fusion.
943 -- Performs replacement on invalid scalar values.
944 --
945 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
946 --
947 -- Note that
948 --
949 -- > last (scanl f z xs) == foldl f z xs.
950 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
951 scanl f z t = unstream (S.scanl g z (stream t))
952 where g a b = safe (f a b)
953 {-# INLINE scanl #-}
954
955 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
956 -- value argument. Subject to fusion. Performs replacement on
957 -- invalid scalar values.
958 --
959 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
960 scanl1 :: (Char -> Char -> Char) -> Text -> Text
961 scanl1 f t | null t = empty
962 | otherwise = scanl f (unsafeHead t) (unsafeTail t)
963 {-# INLINE scanl1 #-}
964
965 -- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
966 -- replacement on invalid scalar values.
967 --
968 -- > scanr f v == reverse . scanl (flip f) v . reverse
969 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
970 scanr f z = S.reverse . S.reverseScanr g z . reverseStream
971 where g a b = safe (f a b)
972 {-# INLINE scanr #-}
973
974 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
975 -- value argument. Subject to fusion. Performs replacement on
976 -- invalid scalar values.
977 scanr1 :: (Char -> Char -> Char) -> Text -> Text
978 scanr1 f t | null t = empty
979 | otherwise = scanr f (last t) (init t)
980 {-# INLINE scanr1 #-}
981
982 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
983 -- function to each element of a 'Text', passing an accumulating
984 -- parameter from left to right, and returns a final 'Text'. Performs
985 -- replacement on invalid scalar values.
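--
-- For example, threading a running count while copying the characters
-- through unchanged:
--
-- > mapAccumL (\n c -> (n + 1, c)) 0 "abc" == (3, "abc")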
986 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
987 mapAccumL f z0 = S.mapAccumL g z0 . stream
988 where g a b = second safe (f a b)
989 {-# INLINE mapAccumL #-}
990
991 -- | The 'mapAccumR' function behaves like a combination of 'map' and
992 -- a strict 'foldr'; it applies a function to each element of a
993 -- 'Text', passing an accumulating parameter from right to left, and
994 -- returning a final value of this accumulator together with the new
995 -- 'Text'.
996 -- Performs replacement on invalid scalar values.
997 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
998 mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
999 where g a b = second safe (f a b)
1000 {-# INLINE mapAccumR #-}
1001
1002 -- -----------------------------------------------------------------------------
1003 -- ** Generating and unfolding 'Text's
1004
1005 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
1006 -- @t@ repeated @n@ times.
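--
-- For example:
--
-- > replicate 3 "ab" == "ababab"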
1007 replicate :: Int -> Text -> Text
1008 replicate n t@(Text a o l)
1009 | n <= 0 || l <= 0 = empty
1010 | n == 1 = t
1011 | isSingleton t = replicateChar n (unsafeHead t)
1012 | otherwise = Text (A.run x) 0 len
1013 where
1014 len = l `mul` n
1015 x :: ST s (A.MArray s)
1016 x = do
1017 arr <- A.new len
1018 let loop !d !i | i >= n = return arr
1019 | otherwise = let m = d + l
1020 in A.copyI arr d a o m >> loop m (i+1)
1021 loop 0 0
1022 {-# INLINE [1] replicate #-}
1023
1024 {-# RULES
1025 "TEXT replicate/singleton -> replicateChar" [~1] forall n c.
1026 replicate n (singleton c) = replicateChar n c
1027 #-}
1028
1029 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
1030 -- value of every element. Subject to fusion.
1031 replicateChar :: Int -> Char -> Text
1032 replicateChar n c = unstream (S.replicateCharI n (safe c))
1033 {-# INLINE replicateChar #-}
1034
1035 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
1036 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
1037 -- 'Text' from a seed value. The function takes the element and
1038 -- returns 'Nothing' if it is done producing the 'Text', otherwise
1039 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
1040 -- string, and @b@ is the seed value for further production. Subject
1041 -- to fusion. Performs replacement on invalid scalar values.
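--
-- For example (a sketch):
--
-- > unfoldr (\n -> if n == 0 then Nothing else Just ('x', n - 1)) 3 == "xxx"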
1042 unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
1043 unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
1044 {-# INLINE unfoldr #-}
1045
1046 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
1047 -- value. However, the length of the result should be limited by the
1048 -- first argument to 'unfoldrN'. This function is more efficient than
1049 -- 'unfoldr' when the maximum length of the result is known and
1050 -- correct, otherwise its performance is similar to 'unfoldr'. Subject
1051 -- to fusion. Performs replacement on invalid scalar values.
1052 unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Text
1053 unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
1054 {-# INLINE unfoldrN #-}
1055
1056 -- -----------------------------------------------------------------------------
1057 -- * Substrings
1058
1059 -- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
1060 -- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
1061 -- the length of the Text. Subject to fusion.
1062 take :: Int -> Text -> Text
1063 take n t@(Text arr off len)
1064 | n <= 0 = empty
1065 | n >= len = t
1066 | otherwise = text arr off (iterN n t)
1067 {-# INLINE [1] take #-}
1068
1069 iterN :: Int -> Text -> Int
1070 iterN n t@(Text _arr _off len) = loop 0 0
1071 where loop !i !cnt
1072 | i >= len || cnt >= n = i
1073 | otherwise = loop (i+d) (cnt+1)
1074 where d = iter_ t i
1075
1076 {-# RULES
1077 "TEXT take -> fused" [~1] forall n t.
1078 take n t = unstream (S.take n (stream t))
1079 "TEXT take -> unfused" [1] forall n t.
1080 unstream (S.take n (stream t)) = take n t
1081 #-}
1082
1083 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix of @t@ consisting of
1084 -- its last @n@ characters, or @t@ itself if it has fewer than @n@
1085 -- characters.
1085 --
1086 -- Examples:
1087 --
1088 -- > takeEnd 3 "foobar" == "bar"
1089 takeEnd :: Int -> Text -> Text
1090 takeEnd n t@(Text arr off len)
1091 | n <= 0 = empty
1092 | n >= len = t
1093 | otherwise = text arr (off+i) (len-i)
1094 where i = iterNEnd n t
1095
1096 iterNEnd :: Int -> Text -> Int
1097 iterNEnd n t@(Text _arr _off len) = loop (len-1) n
1098 where loop i !m
1099 | i <= 0 = 0
1100 | m <= 1 = i
1101 | otherwise = loop (i+d) (m-1)
1102 where d = reverseIter_ t i
1103
1104 -- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
1105 -- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
1106 -- is greater than the length of the 'Text'. Subject to fusion.
1107 drop :: Int -> Text -> Text
1108 drop n t@(Text arr off len)
1109 | n <= 0 = t
1110 | n >= len = empty
1111 | otherwise = text arr (off+i) (len-i)
1112 where i = iterN n t
1113 {-# INLINE [1] drop #-}
1114
1115 {-# RULES
1116 "TEXT drop -> fused" [~1] forall n t.
1117 drop n t = unstream (S.drop n (stream t))
1118 "TEXT drop -> unfused" [1] forall n t.
1119 unstream (S.drop n (stream t)) = drop n t
1120 #-}
1121
1122 -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
1123 -- dropping @n@ characters from the end of @t@.
1124 --
1125 -- Examples:
1126 --
1127 -- > dropEnd 3 "foobar" == "foo"
1128 dropEnd :: Int -> Text -> Text
1129 dropEnd n t@(Text arr off len)
1130 | n <= 0 = t
1131 | n >= len = empty
1132 | otherwise = text arr off (iterNEnd n t)
1133
1134 -- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
1135 -- returns the longest prefix (possibly empty) of elements that
1136 -- satisfy @p@. Subject to fusion.
1137 takeWhile :: (Char -> Bool) -> Text -> Text
1138 takeWhile p t@(Text arr off len) = loop 0
1139 where loop !i | i >= len = t
1140 | p c = loop (i+d)
1141 | otherwise = text arr off i
1142 where Iter c d = iter t i
1143 {-# INLINE [1] takeWhile #-}
1144
1145 {-# RULES
1146 "TEXT takeWhile -> fused" [~1] forall p t.
1147 takeWhile p t = unstream (S.takeWhile p (stream t))
1148 "TEXT takeWhile -> unfused" [1] forall p t.
1149 unstream (S.takeWhile p (stream t)) = takeWhile p t
1150 #-}
1151
1152 -- | /O(n)/ 'takeWhileEnd', applied to a predicate @p@ and a 'Text',
1153 -- returns the longest suffix (possibly empty) of elements that
1154 -- satisfy @p@. Subject to fusion.
1155 -- Examples:
1156 --
1157 -- > takeWhileEnd (=='o') "foo" == "oo"
1158 takeWhileEnd :: (Char -> Bool) -> Text -> Text
1159 takeWhileEnd p t@(Text arr off len) = loop (len-1) len
1160 where loop !i !l | l <= 0 = t
1161 | p c = loop (i+d) (l+d)
1162 | otherwise = text arr (off+l) (len-l)
1163 where (c,d) = reverseIter t i
1164 {-# INLINE [1] takeWhileEnd #-}
1165
1166 {-# RULES
1167 "TEXT takeWhileEnd -> fused" [~1] forall p t.
1168 takeWhileEnd p t = S.reverse (S.takeWhile p (S.reverseStream t))
1169 "TEXT takeWhileEnd -> unfused" [1] forall p t.
1170 S.reverse (S.takeWhile p (S.reverseStream t)) = takeWhileEnd p t
1171 #-}
1172
1173 -- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
1174 -- 'takeWhile' @p@ @t@. Subject to fusion.
1175 dropWhile :: (Char -> Bool) -> Text -> Text
1176 dropWhile p t@(Text arr off len) = loop 0 0
1177 where loop !i !l | l >= len = empty
1178 | p c = loop (i+d) (l+d)
1179 | otherwise = Text arr (off+i) (len-l)
1180 where Iter c d = iter t i
1181 {-# INLINE [1] dropWhile #-}
1182
1183 {-# RULES
1184 "TEXT dropWhile -> fused" [~1] forall p t.
1185 dropWhile p t = unstream (S.dropWhile p (stream t))
1186 "TEXT dropWhile -> unfused" [1] forall p t.
1187 unstream (S.dropWhile p (stream t)) = dropWhile p t
1188 #-}
1189
1190 -- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
1191 -- dropping characters that fail the predicate @p@ from the end of
1192 -- @t@. Subject to fusion.
1193 -- Examples:
1194 --
1195 -- > dropWhileEnd (=='.') "foo..." == "foo"
1196 dropWhileEnd :: (Char -> Bool) -> Text -> Text
1197 dropWhileEnd p t@(Text arr off len) = loop (len-1) len
1198 where loop !i !l | l <= 0 = empty
1199 | p c = loop (i+d) (l+d)
1200 | otherwise = Text arr off l
1201 where (c,d) = reverseIter t i
1202 {-# INLINE [1] dropWhileEnd #-}
1203
1204 {-# RULES
1205 "TEXT dropWhileEnd -> fused" [~1] forall p t.
1206 dropWhileEnd p t = S.reverse (S.dropWhile p (S.reverseStream t))
1207 "TEXT dropWhileEnd -> unfused" [1] forall p t.
1208 S.reverse (S.dropWhile p (S.reverseStream t)) = dropWhileEnd p t
1209 #-}
1210
1211 -- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
1212 -- dropping characters that fail the predicate @p@ from both the
1213 -- beginning and end of @t@. Subject to fusion.
1214 dropAround :: (Char -> Bool) -> Text -> Text
1215 dropAround p = dropWhile p . dropWhileEnd p
1216 {-# INLINE [1] dropAround #-}
1217
1218 -- | /O(n)/ Remove leading white space from a string. Equivalent to:
1219 --
1220 -- > dropWhile isSpace
1221 stripStart :: Text -> Text
1222 stripStart = dropWhile isSpace
1223 {-# INLINE [1] stripStart #-}
1224
1225 -- | /O(n)/ Remove trailing white space from a string. Equivalent to:
1226 --
1227 -- > dropWhileEnd isSpace
1228 stripEnd :: Text -> Text
1229 stripEnd = dropWhileEnd isSpace
1230 {-# INLINE [1] stripEnd #-}
1231
1232 -- | /O(n)/ Remove leading and trailing white space from a string.
1233 -- Equivalent to:
1234 --
1235 -- > dropAround isSpace
1236 strip :: Text -> Text
1237 strip = dropAround isSpace
1238 {-# INLINE [1] strip #-}
1239
1240 -- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
1241 -- prefix of @t@ of length @n@, and whose second is the remainder of
1242 -- the string. It is equivalent to @('take' n t, 'drop' n t)@.
1243 splitAt :: Int -> Text -> (Text, Text)
1244 splitAt n t@(Text arr off len)
1245 | n <= 0 = (empty, t)
1246 | n >= len = (t, empty)
1247 | otherwise = let k = iterN n t
1248 in (text arr off k, text arr (off+k) (len-k))
1249
1250 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
1251 -- a pair whose first element is the longest prefix (possibly empty)
1252 -- of @t@ of elements that satisfy @p@, and whose second is the
1253 -- remainder of the text.
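--
-- For example:
--
-- > span (== 'a') "aabbcc" == ("aa", "bbcc")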
1254 span :: (Char -> Bool) -> Text -> (Text, Text)
1255 span p t = case span_ p t of
1256 (# hd,tl #) -> (hd,tl)
1257 {-# INLINE span #-}
1258
1259 -- | /O(n)/ 'break' is like 'span', but the prefix returned is
1260 -- over elements that fail the predicate @p@.
1261 break :: (Char -> Bool) -> Text -> (Text, Text)
1262 break p = span (not . p)
1263 {-# INLINE break #-}
1264
1265 -- | /O(n)/ Group characters in a string according to a predicate.
1266 groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
1267 groupBy p = loop
1268 where
1269 loop t@(Text arr off len)
1270 | null t = []
1271 | otherwise = text arr off n : loop (text arr (off+n) (len-n))
1272 where Iter c d = iter t 0
1273 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
1274
1275 -- | Returns the /array/ index (in units of 'Word16') at which a
1276 -- character may be found. This is /not/ the same as the logical
1277 -- index returned by e.g. 'findIndex'.
1278 findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
1279 findAIndexOrEnd q t@(Text _arr _off len) = go 0
1280 where go !i | i >= len || q c = i
1281 | otherwise = go (i+d)
1282 where Iter c d = iter t i
1283
1284 -- | /O(n)/ Group characters in a string by equality.
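--
-- For example:
--
-- > group "aabbbc" == ["aa", "bbb", "c"]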
1285 group :: Text -> [Text]
1286 group = groupBy (==)
1287
1288 -- | /O(n)/ Return all initial segments of the given 'Text', shortest
1289 -- first.
1290 inits :: Text -> [Text]
1291 inits t@(Text arr off len) = loop 0
1292 where loop i | i >= len = [t]
1293 | otherwise = Text arr off i : loop (i + iter_ t i)
1294
1295 -- | /O(n)/ Return all final segments of the given 'Text', longest
1296 -- first.
1297 tails :: Text -> [Text]
1298 tails t | null t = [empty]
1299 | otherwise = t : tails (unsafeTail t)
1300
1301 -- $split
1302 --
1303 -- Splitting functions in this library do not perform character-wise
1304 -- copies to create substrings; they just construct new 'Text's that
1305 -- are slices of the original.
1306
1307 -- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
1308 -- argument (which cannot be empty), consuming the delimiter. An empty
1309 -- delimiter is invalid, and will cause an error to be raised.
1310 --
1311 -- Examples:
1312 --
1313 -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
1314 -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
1315 -- > splitOn "x" "x" == ["",""]
1316 --
1317 -- and
1318 --
1319 -- > intercalate s . splitOn s == id
1320 -- > splitOn (singleton c) == split (==c)
1321 --
1322 -- (Note: the string @s@ to split on above cannot be empty.)
1323 --
1324 -- In (unlikely) bad cases, this function's time complexity degrades
1325 -- towards /O(n*m)/.
1326 splitOn :: Text
1327 -- ^ String to split on. If this string is empty, an error
1328 -- will occur.
1329 -> Text
1330 -- ^ Input text.
1331 -> [Text]
1332 splitOn pat@(Text _ _ l) src@(Text arr off len)
1333 | l <= 0 = emptyError "splitOn"
1334 | isSingleton pat = split (== unsafeHead pat) src
1335 | otherwise = go 0 (indices pat src)
1336 where
1337 go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs
1338 go s _ = [text arr (s+off) (len-s)]
1339 {-# INLINE [1] splitOn #-}
1340
1341 {-# RULES
1342 "TEXT splitOn/singleton -> split/==" [~1] forall c t.
1343 splitOn (singleton c) t = split (==c) t
1344 #-}
1345
1346 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
1347 -- where the predicate returns True for a separator element. The
1348 -- resulting components do not contain the separators. Two adjacent
1349 -- separators result in an empty component in the output, e.g.
1350 --
1351 -- > split (=='a') "aabbaca" == ["","","bb","c",""]
1352 -- > split (=='a') "" == [""]
1353 split :: (Char -> Bool) -> Text -> [Text]
1354 split _ t@(Text _arr _off 0) = [t]
1355 split p t = loop t
1356 where loop s | null s' = [l]
1357 | otherwise = l : loop (unsafeTail s')
1358 where (# l, s' #) = span_ (not . p) s
1359 {-# INLINE split #-}
1360
1361 -- | /O(n)/ Splits a 'Text' into components of length @k@. The last
1362 -- element may be shorter than the other chunks, depending on the
1363 -- length of the input. Examples:
1364 --
1365 -- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
1366 -- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
1367 chunksOf :: Int -> Text -> [Text]
1368 chunksOf k = go
1369 where
1370 go t = case splitAt k t of
1371 (a,b) | null a -> []
1372 | otherwise -> a : go b
1373 {-# INLINE chunksOf #-}
1374
1375 -- ----------------------------------------------------------------------------
1376 -- * Searching
1377
1378 -------------------------------------------------------------------------------
1379 -- ** Searching with a predicate
1380
1381 -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
1382 -- returns the first element matching the predicate, or 'Nothing' if
1383 -- there is no such element.
1384 find :: (Char -> Bool) -> Text -> Maybe Char
1385 find p t = S.findBy p (stream t)
1386 {-# INLINE find #-}
1387
1388 -- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
1389 -- and returns the pair of 'Text's with elements which do and do not
1390 -- satisfy the predicate, respectively; i.e.
1391 --
1392 -- > partition p t == (filter p t, filter (not . p) t)
1393 partition :: (Char -> Bool) -> Text -> (Text, Text)
1394 partition p t = (filter p t, filter (not . p) t)
1395 {-# INLINE partition #-}
1396
1397 -- | /O(n)/ 'filter', applied to a predicate and a 'Text',
1398 -- returns a 'Text' containing those characters that satisfy the
1399 -- predicate.
1400 filter :: (Char -> Bool) -> Text -> Text
1401 filter p t = unstream (S.filter p (stream t))
1402 {-# INLINE filter #-}
1403
1404 -- | /O(n+m)/ Find the first instance of @needle@ (which must be
1405 -- non-'null') in @haystack@. The first element of the returned tuple
1406 -- is the prefix of @haystack@ before @needle@ is matched. The second
1407 -- is the remainder of @haystack@, starting with the match.
1408 --
1409 -- Examples:
1410 --
1411 -- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
1412 -- > breakOn "/" "foobar" ==> ("foobar", "")
1413 --
1414 -- Laws:
1415 --
1416 -- > append prefix match == haystack
1417 -- > where (prefix, match) = breakOn needle haystack
1418 --
1419 -- If you need to break a string by a substring repeatedly (e.g. you
1420 -- want to break on every instance of a substring), use 'breakOnAll'
1421 -- instead, as it has lower startup overhead.
1422 --
1423 -- In (unlikely) bad cases, this function's time complexity degrades
1424 -- towards /O(n*m)/.
1425 breakOn :: Text -> Text -> (Text, Text)
1426 breakOn pat src@(Text arr off len)
1427 | null pat = emptyError "breakOn"
1428 | otherwise = case indices pat src of
1429 [] -> (src, empty)
1430 (x:_) -> (text arr off x, text arr (off+x) (len-x))
1431 {-# INLINE breakOn #-}
1432
1433 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
1434 -- string.
1435 --
1436 -- The first element of the returned tuple is the prefix of @haystack@
1437 -- up to and including the last match of @needle@. The second is the
1438 -- remainder of @haystack@, following the match.
1439 --
1440 -- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
1441 breakOnEnd :: Text -> Text -> (Text, Text)
1442 breakOnEnd pat src = (reverse b, reverse a)
1443 where (a,b) = breakOn (reverse pat) (reverse src)
1444 {-# INLINE breakOnEnd #-}
1445
1446 -- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
1447 -- @haystack@. Each element of the returned list consists of a pair:
1448 --
1449 -- * The entire string prior to the /k/th match (i.e. the prefix)
1450 --
1451 -- * The /k/th match, followed by the remainder of the string
1452 --
1453 -- Examples:
1454 --
1455 -- > breakOnAll "::" ""
1456 -- > ==> []
1457 -- > breakOnAll "/" "a/b/c/"
1458 -- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
1459 --
1460 -- In (unlikely) bad cases, this function's time complexity degrades
1461 -- towards /O(n*m)/.
1462 --
1463 -- The @needle@ parameter may not be empty.
1464 breakOnAll :: Text -- ^ @needle@ to search for
1465 -> Text -- ^ @haystack@ in which to search
1466 -> [(Text, Text)]
1467 breakOnAll pat src@(Text arr off slen)
1468 | null pat = emptyError "breakOnAll"
1469 | otherwise = L.map step (indices pat src)
1470 where
1471 step x = (chunk 0 x, chunk x (slen-x))
1472 chunk !n !l = text arr (n+off) l
1473 {-# INLINE breakOnAll #-}
1474
1475 -------------------------------------------------------------------------------
1476 -- ** Indexing 'Text's
1477
1478 -- $index
1479 --
1480 -- If you think of a 'Text' value as an array of 'Char' values (which
1481 -- it is not), you run the risk of writing inefficient code.
1482 --
1483 -- An idiom that is common in some languages is to find the numeric
1484 -- offset of a character or substring, then use that number to split
1485 -- or trim the searched string. With a 'Text' value, this approach
1486 -- would require two /O(n)/ operations: one to perform the search, and
1487 -- one to operate from wherever the search ended.
1488 --
1489 -- For example, suppose you have a string that you want to split on
1490 -- the substring @\"::\"@, such as @\"foo::bar::quux\"@. Instead of
1491 -- searching for the index of @\"::\"@ and taking the substrings
1492 -- before and after that index, you would instead use @breakOnAll \"::\"@.
1493
1494 -- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
1495 index :: Text -> Int -> Char
1496 index t n = S.index (stream t) n
1497 {-# INLINE index #-}
1498
1499 -- | /O(n)/ The 'findIndex' function takes a predicate and a 'Text'
1500 -- and returns the index of the first element in the 'Text' satisfying
1501 -- the predicate. Subject to fusion.
1502 findIndex :: (Char -> Bool) -> Text -> Maybe Int
1503 findIndex p t = S.findIndex p (stream t)
1504 {-# INLINE findIndex #-}
1505
1506 -- | /O(n+m)/ The 'count' function returns the number of times the
1507 -- query string appears in the given 'Text'. An empty query string is
1508 -- invalid, and will cause an error to be raised.
1509 --
1510 -- In (unlikely) bad cases, this function's time complexity degrades
1511 -- towards /O(n*m)/.
1512 count :: Text -> Text -> Int
1513 count pat src
1514 | null pat = emptyError "count"
1515 | isSingleton pat = countChar (unsafeHead pat) src
1516 | otherwise = L.length (indices pat src)
1517 {-# INLINE [1] count #-}
1518
1519 {-# RULES
1520 "TEXT count/singleton -> countChar" [~1] forall c t.
1521 count (singleton c) t = countChar c t
1522 #-}
1523
1524 -- | /O(n)/ The 'countChar' function returns the number of times the
1525 -- query element appears in the given 'Text'. Subject to fusion.
1526 countChar :: Char -> Text -> Int
1527 countChar c t = S.countChar c (stream t)
1528 {-# INLINE countChar #-}
1529
1530 -------------------------------------------------------------------------------
1531 -- * Zipping
1532
1533 -- | /O(n)/ 'zip' takes two 'Text's and returns a list of
1534 -- corresponding pairs of characters. If one input 'Text' is shorter,
1535 -- excess elements of the longer 'Text' are discarded. This is
1536 -- equivalent to a pair of 'unpack' operations.
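--
-- For example:
--
-- > zip "ab" "cde" == [('a','c'), ('b','d')]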
1537 zip :: Text -> Text -> [(Char,Char)]
1538 zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
1539 {-# INLINE zip #-}
1540
1541 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
1542 -- given as the first argument, instead of a tupling function.
1543 -- Performs replacement on invalid scalar values.
1544 zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
1545 zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
1546 where g a b = safe (f a b)
1547 {-# INLINE zipWith #-}
1548
1549 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
1550 -- representing white space.
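--
-- For example:
--
-- > words "  foo  bar baz " == ["foo", "bar", "baz"]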
1551 words :: Text -> [Text]
1552 words t@(Text arr off len) = loop 0 0
1553 where
1554 loop !start !n
1555 | n >= len = if start == n
1556 then []
1557 else [Text arr (start+off) (n-start)]
1558 | isSpace c =
1559 if start == n
1560 then loop (start+1) (start+1)
1561 else Text arr (start+off) (n-start) : loop (n+d) (n+d)
1562 | otherwise = loop start (n+d)
1563 where Iter c d = iter t n
1564 {-# INLINE words #-}
1565
1566 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
1567 -- newline 'Char's. The resulting strings do not contain newlines.
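--
-- For example:
--
-- > lines "foo\nbar\n" == ["foo", "bar"]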
1568 lines :: Text -> [Text]
1569 lines ps | null ps = []
1570 | otherwise = h : if null t
1571 then []
1572 else lines (unsafeTail t)
1573 where (# h,t #) = span_ (/= '\n') ps
1574 {-# INLINE lines #-}
1575
1576 {-
1577 -- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line
1578 -- boundaries.
1579 --
1580 -- A line boundary is considered to be either a line feed, a carriage
1581 -- return immediately followed by a line feed, or a carriage return.
1582 -- This accounts for both Unix and Windows line ending conventions,
1583 -- and for the old convention used on Mac OS 9 and earlier.
1584 lines' :: Text -> [Text]
1585 lines' ps | null ps = []
1586 | otherwise = h : case uncons t of
1587 Nothing -> []
1588 Just (c,t')
1589 | c == '\n' -> lines t'
1590 | c == '\r' -> case uncons t' of
1591 Just ('\n',t'') -> lines t''
1592 _ -> lines t'
1593 where (h,t) = span notEOL ps
1594 notEOL c = c /= '\n' && c /= '\r'
1595 {-# INLINE lines' #-}
1596 -}
1597
1598 -- | /O(n)/ Joins lines, after appending a terminating newline to
1599 -- each.
1600 unlines :: [Text] -> Text
1601 unlines = concat . L.map (`snoc` '\n')
1602 {-# INLINE unlines #-}
1603
1604 -- | /O(n)/ Joins words using single space characters.
1605 unwords :: [Text] -> Text
1606 unwords = intercalate (singleton ' ')
1607 {-# INLINE unwords #-}
1608
1609 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
1610 -- 'True' iff the first is a prefix of the second. Subject to fusion.
1611 isPrefixOf :: Text -> Text -> Bool
1612 isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
1613 alen <= blen && S.isPrefixOf (stream a) (stream b)
1614 {-# INLINE [1] isPrefixOf #-}
1615
1616 {-# RULES
1617 "TEXT isPrefixOf -> fused" [~1] forall s t.
1618 isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
1619 #-}
1620
1621 -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
1622 -- 'True' iff the first is a suffix of the second.
1623 isSuffixOf :: Text -> Text -> Bool
1624 isSuffixOf a@(Text _aarr _aoff alen) b@(Text barr boff blen) =
1625 d >= 0 && a == b'
1626 where d = blen - alen
1627 b' | d == 0 = b
1628 | otherwise = Text barr (boff+d) alen
1629 {-# INLINE isSuffixOf #-}
1630
1631 -- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
1632 -- 'True' iff the first is contained, wholly and intact, anywhere
1633 -- within the second.
1634 --
1635 -- In (unlikely) bad cases, this function's time complexity degrades
1636 -- towards /O(n*m)/.
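--
-- For example:
--
-- > isInfixOf "ell" "hello" == True
-- > isInfixOf "Ell" "hello" == False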
1637 isInfixOf :: Text -> Text -> Bool
1638 isInfixOf needle haystack
1639 | null needle = True
1640 | isSingleton needle = S.elem (unsafeHead needle) . S.stream $ haystack
1641 | otherwise = not . L.null . indices needle $ haystack
1642 {-# INLINE [1] isInfixOf #-}
1643
1644 {-# RULES
1645 "TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
1646 isInfixOf (singleton n) h = S.elem n (S.stream h)
1647 #-}
1648
1649 -------------------------------------------------------------------------------
1650 -- * View patterns
1651
1652 -- | /O(n)/ Return the suffix of the second string if its prefix
1653 -- matches the entire first string.
1654 --
1655 -- Examples:
1656 --
1657 -- > stripPrefix "foo" "foobar" == Just "bar"
1658 -- > stripPrefix "" "baz" == Just "baz"
1659 -- > stripPrefix "foo" "quux" == Nothing
1660 --
1661 -- This is particularly useful with the @ViewPatterns@ extension to
1662 -- GHC, as follows:
1663 --
1664 -- > {-# LANGUAGE ViewPatterns #-}
1665 -- > import Data.Text as T
1666 -- >
1667 -- > fnordLength :: Text -> Int
1668 -- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
1669 -- > fnordLength _ = -1
1670 stripPrefix :: Text -> Text -> Maybe Text
1671 stripPrefix p@(Text _arr _off plen) t@(Text arr off len)
1672 | p `isPrefixOf` t = Just $! text arr (off+plen) (len-plen)
1673 | otherwise = Nothing
1674
1675 -- | /O(n)/ Find the longest non-empty common prefix of two strings
1676 -- and return it, along with the suffixes of each string at which they
1677 -- no longer match.
1678 --
1679 -- If the strings do not have a common prefix or either one is empty,
1680 -- this function returns 'Nothing'.
1681 --
1682 -- Examples:
1683 --
1684 -- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
1685 -- > commonPrefixes "veeble" "fetzer" == Nothing
1686 -- > commonPrefixes "" "baz" == Nothing
1687 commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
1688 commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0
1689 where
1690 go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1)
1691 | i > 0 = Just (Text arr0 off0 i,
1692 text arr0 (off0+i) (len0-i),
1693 text arr1 (off1+j) (len1-j))
1694 | otherwise = Nothing
1695 where Iter a d0 = iter t0 i
1696 Iter b d1 = iter t1 j
1697
1698 -- | /O(n)/ Return the prefix of the second string if its suffix
1699 -- matches the entire first string.
1700 --
1701 -- Examples:
1702 --
1703 -- > stripSuffix "bar" "foobar" == Just "foo"
1704 -- > stripSuffix "" "baz" == Just "baz"
1705 -- > stripSuffix "foo" "quux" == Nothing
1706 --
1707 -- This is particularly useful with the @ViewPatterns@ extension to
1708 -- GHC, as follows:
1709 --
1710 -- > {-# LANGUAGE ViewPatterns #-}
1711 -- > import Data.Text as T
1712 -- >
1713 -- > quuxLength :: Text -> Int
1714 -- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
1715 -- > quuxLength _ = -1
1716 stripSuffix :: Text -> Text -> Maybe Text
1717 stripSuffix p@(Text _arr _off plen) t@(Text arr off len)
1718 | p `isSuffixOf` t = Just $! text arr off (len-plen)
1719 | otherwise = Nothing
1720
1721 -- | Add a list of non-negative numbers. Errors out on overflow.
1722 sumP :: String -> [Int] -> Int
1723 sumP fun = go 0
1724 where go !a (x:xs)
1725 | ax >= 0 = go ax xs
1726 | otherwise = overflowError fun
1727 where ax = a + x
1728 go a _ = a
1729
1730 emptyError :: String -> a
1731 emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input"
1732
1733 overflowError :: String -> a
1734 overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow"
1735
1736 -- | /O(n)/ Make a distinct copy of the given string, sharing no
1737 -- storage with the original string.
1738 --
1739 -- As an example, suppose you read a large string, of which you need
1740 -- only a small portion. If you do not use 'copy', the entire original
1741 -- array will be kept alive in memory by the smaller string. Making a
1742 -- copy \"breaks the link\" to the original array, allowing it to be
1743 -- garbage collected if there are no other live references to it.
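--
-- A usage sketch (the helper name @keepPrefix@ is illustrative, not
-- part of this module):
--
-- > keepPrefix :: Text -> Text
-- > keepPrefix big = copy (take 16 big)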
1744 copy :: Text -> Text
1745 copy (Text arr off len) = Text (A.run go) 0 len
1746 where
1747 go :: ST s (A.MArray s)
1748 go = do
1749 marr <- A.new len
1750 A.copyI marr 0 arr off len
1751 return marr