Improve performance of strict uncons by ~5%
1 {-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
2 {-# OPTIONS_GHC -fno-warn-orphans #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text
12 -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
13 -- (c) 2009 Duncan Coutts,
14 -- (c) 2008, 2009 Tom Harper
15 --
16 -- License : BSD-style
17 -- Maintainer : bos@serpentine.com
18 -- Stability : experimental
19 -- Portability : GHC
20 --
21 -- A time and space-efficient implementation of Unicode text.
22 -- Suitable for performance critical use, both in terms of large data
23 -- quantities and high speed.
24 --
25 -- /Note/: Read below the synopsis for important notes on the use of
26 -- this module.
27 --
28 -- This module is intended to be imported @qualified@, to avoid name
29 -- clashes with "Prelude" functions, e.g.
30 --
31 -- > import qualified Data.Text as T
32 --
33 -- To use an extended and very rich family of functions for working
34 -- with Unicode text (including normalization, regular expressions,
35 -- non-standard encodings, text breaking, and locales), see
36 -- <http://hackage.haskell.org/package/text-icu the text-icu package >.
37
38 module Data.Text
39 (
40 -- * Strict vs lazy types
41 -- $strict
42
43 -- * Acceptable data
44 -- $replacement
45
46 -- * Fusion
47 -- $fusion
48
49 -- * Types
50 Text
51
52 -- * Creation and elimination
53 , pack
54 , unpack
55 , singleton
56 , empty
57
58 -- * Basic interface
59 , cons
60 , snoc
61 , append
62 , uncons
63 , head
64 , last
65 , tail
66 , init
67 , null
68 , length
69 , compareLength
70
71 -- * Transformations
72 , map
73 , intercalate
74 , intersperse
75 , transpose
76 , reverse
77 , replace
78
79 -- ** Case conversion
80 -- $case
81 , toCaseFold
82 , toLower
83 , toUpper
84 , toTitle
85
86 -- ** Justification
87 , justifyLeft
88 , justifyRight
89 , center
90
91 -- * Folds
92 , foldl
93 , foldl'
94 , foldl1
95 , foldl1'
96 , foldr
97 , foldr1
98
99 -- ** Special folds
100 , concat
101 , concatMap
102 , any
103 , all
104 , maximum
105 , minimum
106
107 -- * Construction
108
109 -- ** Scans
110 , scanl
111 , scanl1
112 , scanr
113 , scanr1
114
115 -- ** Accumulating maps
116 , mapAccumL
117 , mapAccumR
118
119 -- ** Generation and unfolding
120 , replicate
121 , unfoldr
122 , unfoldrN
123
124 -- * Substrings
125
126 -- ** Breaking strings
127 , take
128 , takeEnd
129 , drop
130 , dropEnd
131 , takeWhile
132 , takeWhileEnd
133 , dropWhile
134 , dropWhileEnd
135 , dropAround
136 , strip
137 , stripStart
138 , stripEnd
139 , splitAt
140 , breakOn
141 , breakOnEnd
142 , break
143 , span
144 , group
145 , groupBy
146 , inits
147 , tails
148
149 -- ** Breaking into many substrings
150 -- $split
151 , splitOn
152 , split
153 , chunksOf
154
155 -- ** Breaking into lines and words
156 , lines
157 --, lines'
158 , words
159 , unlines
160 , unwords
161
162 -- * Predicates
163 , isPrefixOf
164 , isSuffixOf
165 , isInfixOf
166
167 -- ** View patterns
168 , stripPrefix
169 , stripSuffix
170 , commonPrefixes
171
172 -- * Searching
173 , filter
174 , breakOnAll
175 , find
176 , partition
177
178 -- , findSubstring
179
180 -- * Indexing
181 -- $index
182 , index
183 , findIndex
184 , count
185
186 -- * Zipping
187 , zip
188 , zipWith
189
190 -- -* Ordered text
191 -- , sort
192
193 -- * Low level operations
194 , copy
195 , unpackCString#
196 ) where
197
198 import Prelude (Char, Bool(..), Int, Maybe(..), String,
199 Eq(..), Ord(..), Ordering(..), (++),
200 Read(..),
201 (&&), (||), (+), (-), (.), ($), ($!), (>>),
202 not, return, otherwise, quot)
203 #if defined(HAVE_DEEPSEQ)
204 import Control.DeepSeq (NFData(rnf))
205 #endif
206 #if defined(ASSERTS)
207 import Control.Exception (assert)
208 #endif
209 import Data.Char (isSpace)
210 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
211 Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
212 import Control.Monad (foldM)
213 import Control.Monad.ST (ST)
214 import qualified Data.Text.Array as A
215 import qualified Data.List as L
216 import Data.Binary (Binary(get, put))
217 import Data.Monoid (Monoid(..))
218 #if MIN_VERSION_base(4,9,0)
219 import Data.Semigroup (Semigroup(..))
220 #endif
221 import Data.String (IsString(..))
222 import qualified Data.Text.Internal.Fusion as S
223 import qualified Data.Text.Internal.Fusion.Common as S
224 import Data.Text.Encoding (decodeUtf8', encodeUtf8)
225 import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
226 import Data.Text.Internal.Private (span_)
227 import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
228 import Data.Text.Show (singleton, unpack, unpackCString#)
229 import qualified Prelude as P
230 import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
231 reverseIter_, unsafeHead, unsafeTail)
232 import Data.Text.Internal.Unsafe.Char (unsafeChr)
233 import qualified Data.Text.Internal.Functions as F
234 import qualified Data.Text.Internal.Encoding.Utf16 as U16
235 import Data.Text.Internal.Search (indices)
236 #if defined(__HADDOCK__)
237 import Data.ByteString (ByteString)
238 import qualified Data.Text.Lazy as L
239 import Data.Int (Int64)
240 #endif
241 #if __GLASGOW_HASKELL__ >= 708
242 import qualified GHC.Exts as Exts
243 #endif
244 #if MIN_VERSION_base(4,7,0)
245 import Text.Printf (PrintfArg, formatArg, formatString)
246 #endif
247
248 -- $strict
249 --
250 -- This package provides both strict and lazy 'Text' types. The
251 -- strict type is provided by the "Data.Text" module, while the lazy
252 -- type is provided by the "Data.Text.Lazy" module. Internally, the
253 -- lazy @Text@ type consists of a list of strict chunks.
254 --
255 -- The strict 'Text' type requires that an entire string fit into
256 -- memory at once. The lazy 'Data.Text.Lazy.Text' type is capable of
257 -- streaming strings that are larger than memory using a small memory
258 -- footprint. In many cases, the overhead of chunked streaming makes
259 -- the lazy 'Data.Text.Lazy.Text' type slower than its strict
260 -- counterpart, but this is not always the case. Sometimes, the time
261 -- complexity of a function in one module may be different from the
262 -- other, due to their differing internal structures.
263 --
264 -- Each module provides an almost identical API, with the main
265 -- difference being that the strict module uses 'Int' values for
266 -- lengths and counts, while the lazy module uses 'Data.Int.Int64'
267 -- lengths.
268
269 -- $replacement
270 --
271 -- A 'Text' value is a sequence of Unicode scalar values, as defined
272 -- in
273 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
274 -- As such, a 'Text' cannot contain values in the range U+D800 to
275 -- U+DFFF inclusive. Haskell implementations admit all Unicode code
276 -- points
277 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
278 -- as 'Char' values, including code points from this invalid range.
279 -- This means that there are some 'Char' values that are not valid
280 -- Unicode scalar values, and the functions in this module must handle
281 -- those cases.
282 --
283 -- Within this module, many functions construct a 'Text' from one or
284 -- more 'Char' values. Those functions will substitute 'Char' values
285 -- that are not valid Unicode scalar values with the replacement
286 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
287 -- inspection and replacement are documented with the phrase
288 -- \"Performs replacement on invalid scalar values\".
289 --
290 -- (One reason for this policy of replacement is that internally, a
291 -- 'Text' value is represented as packed UTF-16 data. Values in the
292 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
293 -- code points, and so cannot be represented. The functions replace
294 -- invalid scalar values, instead of dropping them, as a security
295 -- measure. For details, see
296 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
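--
-- As a short illustration of this behaviour (these equalities are
-- expectations written for documentation only):
--
-- > pack "a\xD800z"    == pack "a\xFFFDz"
-- > singleton '\xD800' == singleton '\xFFFD'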
297
298 -- $fusion
299 --
300 -- Most of the functions in this module are subject to /fusion/,
301 -- meaning that a pipeline of such functions will usually allocate at
302 -- most one 'Text' value.
303 --
304 -- As an example, consider the following pipeline:
305 --
306 -- > import Data.Text as T
307 -- > import Data.Text.Encoding as E
308 -- > import Data.ByteString (ByteString)
309 -- >
310 -- > countChars :: ByteString -> Int
311 -- > countChars = T.length . T.toUpper . E.decodeUtf8
312 --
313 -- From the type signatures involved, this looks like it should
314 -- allocate one 'Data.ByteString.ByteString' value, and two 'Text'
315 -- values. However, when a module is compiled with optimisation
316 -- enabled under GHC, the two intermediate 'Text' values will be
317 -- optimised away, and the function will be compiled down to a single
318 -- loop over the source 'Data.ByteString.ByteString'.
319 --
320 -- Functions that can be fused by the compiler are documented with the
321 -- phrase \"Subject to fusion\".
322
323 instance Eq Text where
324 Text arrA offA lenA == Text arrB offB lenB
325 | lenA == lenB = A.equal arrA offA arrB offB lenA
326 | otherwise = False
327 {-# INLINE (==) #-}
328
329 instance Ord Text where
330 compare = compareText
331
332 instance Read Text where
333 readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
334
335 #if MIN_VERSION_base(4,9,0)
336 -- Semigroup orphan instances for older GHCs are provided by
337 -- the 'semigroups' package
338
339 instance Semigroup Text where
340 (<>) = append
341 #endif
342
343 instance Monoid Text where
344 mempty = empty
345 #if MIN_VERSION_base(4,9,0)
346 mappend = (<>) -- future-proof definition
347 #else
348 mappend = append
349 #endif
350 mconcat = concat
351
352 instance IsString Text where
353 fromString = pack
354
355 #if __GLASGOW_HASKELL__ >= 708
356 instance Exts.IsList Text where
357 type Item Text = Char
358 fromList = pack
359 toList = unpack
360 #endif
361
362 #if defined(HAVE_DEEPSEQ)
363 instance NFData Text where rnf !_ = ()
364 #endif
365
366 instance Binary Text where
367 put t = put (encodeUtf8 t)
368 get = do
369 bs <- get
370 case decodeUtf8' bs of
371 P.Left exn -> P.fail (P.show exn)
372 P.Right a -> P.return a
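
-- The instance above is intended to round-trip: 'put' writes the UTF-8
-- encoding produced by 'encodeUtf8', and 'get' decodes it back with
-- 'decodeUtf8''. A minimal sketch of that property, using
-- 'Data.Binary.encode' and 'Data.Binary.decode' from the @binary@ package
-- (@roundTrip@ is an illustrative name, not part of this module):
--
-- > import qualified Data.Binary as Binary
-- >
-- > roundTrip :: Text -> Bool
-- > roundTrip t = Binary.decode (Binary.encode t) == t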
373
374 -- | This instance preserves data abstraction at the cost of inefficiency.
375 -- We omit reflection services for the sake of data abstraction.
376 --
377 -- This instance was created by copying the updated behavior of
378 -- @"Data.Set".@'Data.Set.Set' and @"Data.Map".@'Data.Map.Map'. If you
379 -- feel a mistake has been made, please feel free to submit
380 -- improvements.
381 --
382 -- The original discussion is archived here:
383 -- <http://groups.google.com/group/haskell-cafe/browse_thread/thread/b5bbb1b28a7e525d/0639d46852575b93 could we get a Data instance for Data.Text.Text? >
384 --
385 -- The followup discussion that changed the behavior of 'Data.Set.Set'
386 -- and 'Data.Map.Map' is archived here:
387 -- <http://markmail.org/message/trovdc6zkphyi3cr#query:+page:1+mid:a46der3iacwjcf6n+state:results Proposal: Allow gunfold for Data.Map, ... >
388
389 instance Data Text where
390 gfoldl f z txt = z pack `f` (unpack txt)
391 toConstr _ = packConstr
392 gunfold k z c = case constrIndex c of
393 1 -> k (z pack)
394 _ -> P.error "gunfold"
395 dataTypeOf _ = textDataType
396
397 #if MIN_VERSION_base(4,7,0)
398 -- | Only defined for @base-4.7.0.0@ and later
399 instance PrintfArg Text where
400 formatArg txt = formatString $ unpack txt
401 #endif
402
403 packConstr :: Constr
404 packConstr = mkConstr textDataType "pack" [] Prefix
405
406 textDataType :: DataType
407 textDataType = mkDataType "Data.Text.Text" [packConstr]
408
409 -- | /O(n)/ Compare two 'Text' values lexicographically.
410 compareText :: Text -> Text -> Ordering
411 compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB)
412 | lenA == 0 && lenB == 0 = EQ
413 | otherwise = go 0 0
414 where
415 go !i !j
416 | i >= lenA || j >= lenB = compare lenA lenB
417 | a < b = LT
418 | a > b = GT
419 | otherwise = go (i+di) (j+dj)
420 where Iter a di = iter ta i
421 Iter b dj = iter tb j
422
423 -- -----------------------------------------------------------------------------
424 -- * Conversion to/from 'Text'
425
426 -- | /O(n)/ Convert a 'String' into a 'Text'. Subject to
427 -- fusion. Performs replacement on invalid scalar values.
428 pack :: String -> Text
429 pack = unstream . S.map safe . S.streamList
430 {-# INLINE [1] pack #-}
431
432 -- -----------------------------------------------------------------------------
433 -- * Basic functions
434
435 -- | /O(n)/ Adds a character to the front of a 'Text'. This function
436 -- is more costly than its 'List' counterpart because it requires
437 -- copying a new array. Subject to fusion. Performs replacement on
438 -- invalid scalar values.
439 cons :: Char -> Text -> Text
440 cons c t = unstream (S.cons (safe c) (stream t))
441 {-# INLINE cons #-}
442
443 infixr 5 `cons`
444
445 -- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
446 -- entire array in the process, unless fused. Subject to fusion.
447 -- Performs replacement on invalid scalar values.
448 snoc :: Text -> Char -> Text
449 snoc t c = unstream (S.snoc (stream t) (safe c))
450 {-# INLINE snoc #-}
451
452 -- | /O(n)/ Appends one 'Text' to the other by copying both of them
453 -- into a new 'Text'. Subject to fusion.
454 append :: Text -> Text -> Text
455 append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
456 | len1 == 0 = b
457 | len2 == 0 = a
458 | len > 0 = Text (A.run x) 0 len
459 | otherwise = overflowError "append"
460 where
461 len = len1+len2
462 x :: ST s (A.MArray s)
463 x = do
464 arr <- A.new len
465 A.copyI arr 0 arr1 off1 len1
466 A.copyI arr len1 arr2 off2 len
467 return arr
468 {-# NOINLINE append #-}
469
470 {-# RULES
471 "TEXT append -> fused" [~1] forall t1 t2.
472 append t1 t2 = unstream (S.append (stream t1) (stream t2))
473 "TEXT append -> unfused" [1] forall t1 t2.
474 unstream (S.append (stream t1) (stream t2)) = append t1 t2
475 #-}
476
477 -- | /O(1)/ Returns the first character of a 'Text', which must be
478 -- non-empty. Subject to fusion.
479 head :: Text -> Char
480 head t = S.head (stream t)
481 {-# INLINE head #-}
482
483 -- | /O(1)/ Returns the first character and rest of a 'Text', or
484 -- 'Nothing' if empty. Subject to fusion.
485 uncons :: Text -> Maybe (Char, Text)
486 uncons t@(Text arr off len)
487 | len <= 0 = Nothing
488 | otherwise = Just $ let !(Iter c d) = iter t 0
489 in (c, text arr (off+d) (len-d))
490 {-# INLINE [1] uncons #-}
491
492 -- | Lifted from Control.Arrow and specialized.
493 second :: (b -> c) -> (a,b) -> (a,c)
494 second f (a, b) = (a, f b)
495
496 -- | /O(1)/ Returns the last character of a 'Text', which must be
497 -- non-empty. Subject to fusion.
498 last :: Text -> Char
499 last (Text arr off len)
500 | len <= 0 = emptyError "last"
501 | n < 0xDC00 || n > 0xDFFF = unsafeChr n
502 | otherwise = U16.chr2 n0 n
503 where n = A.unsafeIndex arr (off+len-1)
504 n0 = A.unsafeIndex arr (off+len-2)
505 {-# INLINE [1] last #-}
506
507 {-# RULES
508 "TEXT last -> fused" [~1] forall t.
509 last t = S.last (stream t)
510 "TEXT last -> unfused" [1] forall t.
511 S.last (stream t) = last t
512 #-}
513
514 -- | /O(1)/ Returns all characters after the head of a 'Text', which
515 -- must be non-empty. Subject to fusion.
516 tail :: Text -> Text
517 tail t@(Text arr off len)
518 | len <= 0 = emptyError "tail"
519 | otherwise = text arr (off+d) (len-d)
520 where d = iter_ t 0
521 {-# INLINE [1] tail #-}
522
523 {-# RULES
524 "TEXT tail -> fused" [~1] forall t.
525 tail t = unstream (S.tail (stream t))
526 "TEXT tail -> unfused" [1] forall t.
527 unstream (S.tail (stream t)) = tail t
528 #-}
529
530 -- | /O(1)/ Returns all but the last character of a 'Text', which must
531 -- be non-empty. Subject to fusion.
532 init :: Text -> Text
533 init (Text arr off len) | len <= 0 = emptyError "init"
534 | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
535 | otherwise = text arr off (len-1)
536 where
537 n = A.unsafeIndex arr (off+len-1)
538 {-# INLINE [1] init #-}
539
540 {-# RULES
541 "TEXT init -> fused" [~1] forall t.
542 init t = unstream (S.init (stream t))
543 "TEXT init -> unfused" [1] forall t.
544 unstream (S.init (stream t)) = init t
545 #-}
546
547 -- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
548 -- fusion.
549 null :: Text -> Bool
550 null (Text _arr _off len) =
551 #if defined(ASSERTS)
552 assert (len >= 0) $
553 #endif
554 len <= 0
555 {-# INLINE [1] null #-}
556
557 {-# RULES
558 "TEXT null -> fused" [~1] forall t.
559 null t = S.null (stream t)
560 "TEXT null -> unfused" [1] forall t.
561 S.null (stream t) = null t
562 #-}
563
564 -- | /O(1)/ Tests whether a 'Text' contains exactly one character.
565 -- Subject to fusion.
566 isSingleton :: Text -> Bool
567 isSingleton = S.isSingleton . stream
568 {-# INLINE isSingleton #-}
569
570 -- | /O(n)/ Returns the number of characters in a 'Text'.
571 -- Subject to fusion.
572 length :: Text -> Int
573 length t = S.length (stream t)
574 {-# INLINE length #-}
575
576 -- | /O(n)/ Compare the count of characters in a 'Text' to a number.
577 -- Subject to fusion.
578 --
579 -- This function gives the same answer as comparing against the result
580 -- of 'length', but can short circuit if the count of characters is
581 -- greater than the number, and hence be more efficient.
582 compareLength :: Text -> Int -> Ordering
583 compareLength t n = S.compareLengthI (stream t) n
584 {-# INLINE [1] compareLength #-}
585
586 {-# RULES
587 "TEXT compareN/length -> compareLength" [~1] forall t n.
588 compare (length t) n = compareLength t n
589 #-}
590
591 {-# RULES
592 "TEXT ==N/length -> compareLength/==EQ" [~1] forall t n.
593 (==) (length t) n = compareLength t n == EQ
594 #-}
595
596 {-# RULES
597 "TEXT /=N/length -> compareLength//=EQ" [~1] forall t n.
598 (/=) (length t) n = compareLength t n /= EQ
599 #-}
600
601 {-# RULES
602 "TEXT <N/length -> compareLength/==LT" [~1] forall t n.
603 (<) (length t) n = compareLength t n == LT
604 #-}
605
606 {-# RULES
607 "TEXT <=N/length -> compareLength//=GT" [~1] forall t n.
608 (<=) (length t) n = compareLength t n /= GT
609 #-}
610
611 {-# RULES
612 "TEXT >N/length -> compareLength/==GT" [~1] forall t n.
613 (>) (length t) n = compareLength t n == GT
614 #-}
615
616 {-# RULES
617 "TEXT >=N/length -> compareLength//=LT" [~1] forall t n.
618 (>=) (length t) n = compareLength t n /= LT
619 #-}
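
-- A few illustrative uses of 'compareLength' (expected results, shown
-- here only as documentation). With optimisation enabled, the rewrite
-- rules above are intended to turn comparisons such as @'length' t > 2@
-- into @'compareLength' t 2 == GT@ automatically.
--
-- > compareLength "abc" 5 == LT
-- > compareLength "abc" 3 == EQ
-- > -- Should inspect only the first few characters of the long input:
-- > compareLength (replicate 1000000 "x") 2 == GT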
620
621 -- -----------------------------------------------------------------------------
622 -- * Transformations
623 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
624 -- each element of @t@. Subject to fusion. Performs replacement on
625 -- invalid scalar values.
626 map :: (Char -> Char) -> Text -> Text
627 map f t = unstream (S.map (safe . f) (stream t))
628 {-# INLINE [1] map #-}
629
630 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
631 -- 'Text's and concatenates the list after interspersing the first
632 -- argument between each element of the list.
633 intercalate :: Text -> [Text] -> Text
634 intercalate t = concat . (F.intersperse t)
635 {-# INLINE intercalate #-}
636
637 -- | /O(n)/ The 'intersperse' function takes a character and places it
638 -- between the characters of a 'Text'. Subject to fusion. Performs
639 -- replacement on invalid scalar values.
640 intersperse :: Char -> Text -> Text
641 intersperse c t = unstream (S.intersperse (safe c) (stream t))
642 {-# INLINE intersperse #-}
643
644 -- | /O(n)/ Reverse the characters of a string. Subject to fusion.
645 reverse :: Text -> Text
646 reverse t = S.reverse (stream t)
647 {-# INLINE reverse #-}
648
649 -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
650 -- @haystack@ with @replacement@.
651 --
652 -- This function behaves as though it was defined as follows:
653 --
654 -- @
655 -- replace needle replacement haystack =
656 -- 'intercalate' replacement ('splitOn' needle haystack)
657 -- @
658 --
659 -- As this suggests, each occurrence is replaced exactly once. So if
660 -- @needle@ occurs in @replacement@, that occurrence will /not/ itself
661 -- be replaced recursively:
662 --
663 -- > replace "oo" "foo" "oo" == "foo"
664 --
665 -- In cases where several instances of @needle@ overlap, only the
666 -- first one will be replaced:
667 --
668 -- > replace "ofo" "bar" "ofofo" == "barfo"
669 --
670 -- In (unlikely) bad cases, this function's time complexity degrades
671 -- towards /O(n*m)/.
672 replace :: Text
673 -- ^ @needle@ to search for. If this string is empty, an
674 -- error will occur.
675 -> Text
676 -- ^ @replacement@ to replace @needle@ with.
677 -> Text
678 -- ^ @haystack@ in which to search.
679 -> Text
680 replace needle@(Text _ _ neeLen)
681 (Text repArr repOff repLen)
682 haystack@(Text hayArr hayOff hayLen)
683 | neeLen == 0 = emptyError "replace"
684 | L.null ixs = haystack
685 | len > 0 = Text (A.run x) 0 len
686 | otherwise = empty
687 where
688 ixs = indices needle haystack
689 len = hayLen - (neeLen - repLen) `mul` L.length ixs
690 x :: ST s (A.MArray s)
691 x = do
692 marr <- A.new len
693 let loop (i:is) o d = do
694 let d0 = d + i - o
695 d1 = d0 + repLen
696 A.copyI marr d hayArr (hayOff+o) d0
697 A.copyI marr d0 repArr repOff d1
698 loop is (i + neeLen) d1
699 loop [] o d = A.copyI marr d hayArr (hayOff+o) len
700 loop ixs 0 0
701 return marr
702
703 -- ----------------------------------------------------------------------------
704 -- ** Case conversions (folds)
705
706 -- $case
707 --
708 -- When case converting 'Text' values, do not use combinators like
709 -- @map toUpper@ to case convert each character of a string
710 -- individually, as this gives incorrect results according to the
711 -- rules of some writing systems. The whole-string case conversion
712 -- functions from this module, such as @toUpper@, obey the correct
713 -- case conversion rules. As a result, these functions may map one
714 -- input character to two or three output characters. For examples,
715 -- see the documentation of each function.
716 --
717 -- /Note/: In some languages, case conversion is a locale- and
718 -- context-dependent operation. The case conversion functions in this
719 -- module are /not/ locale sensitive. Programs that require locale
720 -- sensitivity should use appropriate versions of the
721 -- <http://hackage.haskell.org/package/text-icu-0.6.3.7/docs/Data-Text-ICU.html#g:4 case mapping functions from the text-icu package >.
722
723 -- | /O(n)/ Convert a string to folded case. Subject to fusion.
724 --
725 -- This function is mainly useful for performing caseless (also known
726 -- as case insensitive) string comparisons.
727 --
728 -- A string @x@ is a caseless match for a string @y@ if and only if:
729 --
730 -- @toCaseFold x == toCaseFold y@
731 --
732 -- The result string may be longer than the input string, and may
733 -- differ from applying 'toLower' to the input string. For instance,
734 -- the Armenian small ligature \"&#xfb13;\" (men now, U+FB13) is case
735 -- folded to the sequence \"&#x574;\" (men, U+0574) followed by
736 -- \"&#x576;\" (now, U+0576), while the Greek \"&#xb5;\" (micro sign,
737 -- U+00B5) is case folded to \"&#x3bc;\" (small letter mu, U+03BC)
738 -- instead of itself.
739 toCaseFold :: Text -> Text
740 toCaseFold t = unstream (S.toCaseFold (stream t))
741 {-# INLINE toCaseFold #-}
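
-- A minimal sketch of caseless comparison built on 'toCaseFold' (the
-- helper name @ciEq@ is illustrative only, not part of this module):
--
-- > ciEq :: Text -> Text -> Bool
-- > ciEq a b = toCaseFold a == toCaseFold b
-- >
-- > -- ciEq "Fu\xDF" "FUSS" == True, since '\xDF' case folds to "ss"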
742
743 -- | /O(n)/ Convert a string to lower case, using simple case
744 -- conversion. Subject to fusion.
745 --
746 -- The result string may be longer than the input string. For
747 -- instance, \"&#x130;\" (Latin capital letter I with dot above,
748 -- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069)
749 -- followed by \" &#x307;\" (combining dot above, U+0307).
750 toLower :: Text -> Text
751 toLower t = unstream (S.toLower (stream t))
752 {-# INLINE toLower #-}
753
754 -- | /O(n)/ Convert a string to upper case, using simple case
755 -- conversion. Subject to fusion.
756 --
757 -- The result string may be longer than the input string. For
758 -- instance, the German \"&#xdf;\" (eszett, U+00DF) maps to the
759 -- two-letter sequence \"SS\".
760 toUpper :: Text -> Text
761 toUpper t = unstream (S.toUpper (stream t))
762 {-# INLINE toUpper #-}
763
764 -- | /O(n)/ Convert a string to title case, using simple case
765 -- conversion. Subject to fusion.
766 --
767 -- The first letter of the input is converted to title case, as is
768 -- every subsequent letter that immediately follows a non-letter.
769 -- Every letter that immediately follows another letter is converted
770 -- to lower case.
771 --
772 -- The result string may be longer than the input string. For example,
773 -- the Latin small ligature &#xfb02; (U+FB02) is converted to the
774 -- sequence Latin capital letter F (U+0046) followed by Latin small
775 -- letter l (U+006C).
776 --
777 -- /Note/: this function does not take language or culture specific
778 -- rules into account. For instance, in English, different style
779 -- guides disagree on whether the book name \"The Hill of the Red
780 -- Fox\" is correctly title cased&#x2014;but this function will
781 -- capitalize /every/ word.
782 toTitle :: Text -> Text
783 toTitle t = unstream (S.toTitle (stream t))
784 {-# INLINE toTitle #-}
785
786 -- | /O(n)/ Left-justify a string to the given length, using the
787 -- specified fill character on the right. Subject to fusion.
788 -- Performs replacement on invalid scalar values.
789 --
790 -- Examples:
791 --
792 -- > justifyLeft 7 'x' "foo" == "fooxxxx"
793 -- > justifyLeft 3 'x' "foobar" == "foobar"
794 justifyLeft :: Int -> Char -> Text -> Text
795 justifyLeft k c t
796 | len >= k = t
797 | otherwise = t `append` replicateChar (k-len) c
798 where len = length t
799 {-# INLINE [1] justifyLeft #-}
800
801 {-# RULES
802 "TEXT justifyLeft -> fused" [~1] forall k c t.
803 justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
804 "TEXT justifyLeft -> unfused" [1] forall k c t.
805 unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
806 #-}
807
808 -- | /O(n)/ Right-justify a string to the given length, using the
809 -- specified fill character on the left. Performs replacement on
810 -- invalid scalar values.
811 --
812 -- Examples:
813 --
814 -- > justifyRight 7 'x' "bar" == "xxxxbar"
815 -- > justifyRight 3 'x' "foobar" == "foobar"
816 justifyRight :: Int -> Char -> Text -> Text
817 justifyRight k c t
818 | len >= k = t
819 | otherwise = replicateChar (k-len) c `append` t
820 where len = length t
821 {-# INLINE justifyRight #-}
822
823 -- | /O(n)/ Center a string to the given length, using the specified
824 -- fill character on either side. Performs replacement on invalid
825 -- scalar values.
826 --
827 -- Examples:
828 --
829 -- > center 8 'x' "HS" == "xxxHSxxx"
830 center :: Int -> Char -> Text -> Text
831 center k c t
832 | len >= k = t
833 | otherwise = replicateChar l c `append` t `append` replicateChar r c
834 where len = length t
835 d = k - len
836 r = d `quot` 2
837 l = d - r
838 {-# INLINE center #-}
839
840 -- | /O(n)/ The 'transpose' function transposes the rows and columns
841 -- of its 'Text' argument. Note that this function uses 'pack',
842 -- 'unpack', and the list version of transpose, and is thus not very
843 -- efficient.
844 transpose :: [Text] -> [Text]
845 transpose ts = P.map pack (L.transpose (P.map unpack ts))
846
847 -- -----------------------------------------------------------------------------
848 -- * Reducing 'Text's (folds)
849
850 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
851 -- (typically the left-identity of the operator), and a 'Text',
852 -- reduces the 'Text' using the binary operator, from left to right.
853 -- Subject to fusion.
854 foldl :: (a -> Char -> a) -> a -> Text -> a
855 foldl f z t = S.foldl f z (stream t)
856 {-# INLINE foldl #-}
857
858 -- | /O(n)/ A strict version of 'foldl'. Subject to fusion.
859 foldl' :: (a -> Char -> a) -> a -> Text -> a
860 foldl' f z t = S.foldl' f z (stream t)
861 {-# INLINE foldl' #-}
862
863 -- | /O(n)/ A variant of 'foldl' that has no starting value argument,
864 -- and thus must be applied to a non-empty 'Text'. Subject to fusion.
865 foldl1 :: (Char -> Char -> Char) -> Text -> Char
866 foldl1 f t = S.foldl1 f (stream t)
867 {-# INLINE foldl1 #-}
868
869 -- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
870 foldl1' :: (Char -> Char -> Char) -> Text -> Char
871 foldl1' f t = S.foldl1' f (stream t)
872 {-# INLINE foldl1' #-}
873
874 -- | /O(n)/ 'foldr', applied to a binary operator, a starting value
875 -- (typically the right-identity of the operator), and a 'Text',
876 -- reduces the 'Text' using the binary operator, from right to left.
877 -- Subject to fusion.
878 foldr :: (Char -> a -> a) -> a -> Text -> a
879 foldr f z t = S.foldr f z (stream t)
880 {-# INLINE foldr #-}
881
882 -- | /O(n)/ A variant of 'foldr' that has no starting value argument,
883 -- and thus must be applied to a non-empty 'Text'. Subject to
884 -- fusion.
885 foldr1 :: (Char -> Char -> Char) -> Text -> Char
886 foldr1 f t = S.foldr1 f (stream t)
887 {-# INLINE foldr1 #-}
888
889 -- -----------------------------------------------------------------------------
890 -- ** Special folds
891
892 -- | /O(n)/ Concatenate a list of 'Text's.
893 concat :: [Text] -> Text
894 concat ts = case ts' of
895 [] -> empty
896 [t] -> t
897 _ -> Text (A.run go) 0 len
898 where
899 ts' = L.filter (not . null) ts
900 len = sumP "concat" $ L.map lengthWord16 ts'
901 go :: ST s (A.MArray s)
902 go = do
903 arr <- A.new len
904 let step i (Text a o l) =
905 let !j = i + l in A.copyI arr i a o j >> return j
906 foldM step 0 ts' >> return arr
907
908 -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
909 -- concatenate the results.
910 concatMap :: (Char -> Text) -> Text -> Text
911 concatMap f = concat . foldr ((:) . f) []
912 {-# INLINE concatMap #-}
913
914 -- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
915 -- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
916 any :: (Char -> Bool) -> Text -> Bool
917 any p t = S.any p (stream t)
918 {-# INLINE any #-}
919
920 -- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
921 -- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
922 all :: (Char -> Bool) -> Text -> Bool
923 all p t = S.all p (stream t)
924 {-# INLINE all #-}
925
926 -- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
927 -- must be non-empty. Subject to fusion.
928 maximum :: Text -> Char
929 maximum t = S.maximum (stream t)
930 {-# INLINE maximum #-}
931
932 -- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
933 -- must be non-empty. Subject to fusion.
934 minimum :: Text -> Char
935 minimum t = S.minimum (stream t)
936 {-# INLINE minimum #-}
937
938 -- -----------------------------------------------------------------------------
939 -- * Building 'Text's
940
941 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
942 -- successive reduced values from the left. Subject to fusion.
943 -- Performs replacement on invalid scalar values.
944 --
945 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
946 --
947 -- Note that
948 --
949 -- > last (scanl f z xs) == foldl f z xs.
950 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
951 scanl f z t = unstream (S.scanl g z (stream t))
952 where g a b = safe (f a b)
953 {-# INLINE scanl #-}
954
955 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
956 -- value argument. Subject to fusion. Performs replacement on
957 -- invalid scalar values.
958 --
959 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
960 scanl1 :: (Char -> Char -> Char) -> Text -> Text
961 scanl1 f t | null t = empty
962 | otherwise = scanl f (unsafeHead t) (unsafeTail t)
963 {-# INLINE scanl1 #-}
964
965 -- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
966 -- replacement on invalid scalar values.
967 --
968 -- > scanr f v == reverse . scanl (flip f) v . reverse
969 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
970 scanr f z = S.reverse . S.reverseScanr g z . reverseStream
971 where g a b = safe (f a b)
972 {-# INLINE scanr #-}
973
974 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
975 -- value argument. Subject to fusion. Performs replacement on
976 -- invalid scalar values.
977 scanr1 :: (Char -> Char -> Char) -> Text -> Text
978 scanr1 f t | null t = empty
979 | otherwise = scanr f (last t) (init t)
980 {-# INLINE scanr1 #-}
981
982 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
983 -- function to each element of a 'Text', passing an accumulating
984 -- parameter from left to right, and returns a final 'Text'. Performs
985 -- replacement on invalid scalar values.
986 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
987 mapAccumL f z0 = S.mapAccumL g z0 . stream
988 where g a b = second safe (f a b)
989 {-# INLINE mapAccumL #-}
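
-- An illustrative use of 'mapAccumL': upper-case a string while counting
-- its characters (this expected result uses 'Data.Char.toUpper' on the
-- individual characters):
--
-- > mapAccumL (\n c -> (n + 1, Data.Char.toUpper c)) 0 "abc" == (3, "ABC")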
990
991 -- | The 'mapAccumR' function behaves like a combination of 'map' and
992 -- a strict 'foldr'; it applies a function to each element of a
993 -- 'Text', passing an accumulating parameter from right to left, and
994 -- returning a final value of this accumulator together with the new
995 -- 'Text'.
996 -- Performs replacement on invalid scalar values.
997 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
998 mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
999 where g a b = second safe (f a b)
1000 {-# INLINE mapAccumR #-}
1001
1002 -- -----------------------------------------------------------------------------
1003 -- ** Generating and unfolding 'Text's
1004
1005 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
1006 -- @t@ repeated @n@ times.
1007 replicate :: Int -> Text -> Text
1008 replicate n t@(Text a o l)
1009 | n <= 0 || l <= 0 = empty
1010 | n == 1 = t
1011 | isSingleton t = replicateChar n (unsafeHead t)
1012 | otherwise = Text (A.run x) 0 len
1013 where
1014 len = l `mul` n
1015 x :: ST s (A.MArray s)
1016 x = do
1017 arr <- A.new len
1018 let loop !d !i | i >= n = return arr
1019 | otherwise = let m = d + l
1020 in A.copyI arr d a o m >> loop m (i+1)
1021 loop 0 0
1022 {-# INLINE [1] replicate #-}
1023
1024 {-# RULES
1025 "TEXT replicate/singleton -> replicateChar" [~1] forall n c.
1026 replicate n (singleton c) = replicateChar n c
1027 #-}
1028
1029 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
1030 -- value of every element. Subject to fusion.
1031 replicateChar :: Int -> Char -> Text
1032 replicateChar n c = unstream (S.replicateCharI n (safe c))
1033 {-# INLINE replicateChar #-}
1034
1035 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
1036 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
1037 -- 'Text' from a seed value. The function takes the element and
1038 -- returns 'Nothing' if it is done producing the 'Text', otherwise
1039 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
1040 -- string, and @b@ is the seed value for further production. Subject
1041 -- to fusion. Performs replacement on invalid scalar values.
1042 unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
1043 unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
1044 {-# INLINE unfoldr #-}
1045
1046 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
1047 -- value. However, the length of the result should be limited by the
1048 -- first argument to 'unfoldrN'. This function is more efficient than
1049 -- 'unfoldr' when the maximum length of the result is known and
1050 -- correct, otherwise its performance is similar to 'unfoldr'. Subject
1051 -- to fusion. Performs replacement on invalid scalar values.
1052 unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Text
1053 unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
1054 {-# INLINE unfoldrN #-}
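
-- Illustrative uses of 'unfoldr' and 'unfoldrN', generating digits from a
-- seed with 'Data.Char.intToDigit' (expected results, shown only as
-- documentation):
--
-- > unfoldr (\n -> if n == 0 then Nothing else Just (Data.Char.intToDigit n, n - 1)) 3 == "321"
-- > unfoldrN 3 (\n -> Just (Data.Char.intToDigit n, n + 1)) 1 == "123"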
1055
1056 -- -----------------------------------------------------------------------------
1057 -- * Substrings
1058
1059 -- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
1060 -- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
1061 -- the length of the Text. Subject to fusion.
1062 take :: Int -> Text -> Text
1063 take n t@(Text arr off len)
1064 | n <= 0 = empty
1065 | n >= len = t
1066 | otherwise = text arr off (iterN n t)
1067 {-# INLINE [1] take #-}
1068
1069 iterN :: Int -> Text -> Int
1070 iterN n t@(Text _arr _off len) = loop 0 0
1071 where loop !i !cnt
1072 | i >= len || cnt >= n = i
1073 | otherwise = loop (i+d) (cnt+1)
1074 where d = iter_ t i
1075
1076 {-# RULES
1077 "TEXT take -> fused" [~1] forall n t.
1078 take n t = unstream (S.take n (stream t))
1079 "TEXT take -> unfused" [1] forall n t.
1080 unstream (S.take n (stream t)) = take n t
1081 #-}
1082
1083 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix of @t@ consisting of
1084 -- its last @n@ characters, or @t@ itself if @n@ exceeds its length.
1085 --
1086 -- Examples:
1087 --
1088 -- > takeEnd 3 "foobar" == "bar"
1089 takeEnd :: Int -> Text -> Text
1090 takeEnd n t@(Text arr off len)
1091 | n <= 0 = empty
1092 | n >= len = t
1093 | otherwise = text arr (off+i) (len-i)
1094 where i = iterNEnd n t
1095
1096 iterNEnd :: Int -> Text -> Int
1097 iterNEnd n t@(Text _arr _off len) = loop (len-1) n
1098 where loop i !m
1099 | m <= 0 = i+1
1100 | i <= 0 = 0
1101 | otherwise = loop (i+d) (m-1)
1102 where d = reverseIter_ t i
1103
1104 -- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
1105 -- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
1106 -- is greater than the length of the 'Text'. Subject to fusion.
1107 drop :: Int -> Text -> Text
1108 drop n t@(Text arr off len)
1109 | n <= 0 = t
1110 | n >= len = empty
1111 | otherwise = text arr (off+i) (len-i)
1112 where i = iterN n t
1113 {-# INLINE [1] drop #-}
1114
1115 {-# RULES
1116 "TEXT drop -> fused" [~1] forall n t.
1117 drop n t = unstream (S.drop n (stream t))
1118 "TEXT drop -> unfused" [1] forall n t.
1119 unstream (S.drop n (stream t)) = drop n t
1120 #-}
1121
1122 -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
1123 -- dropping @n@ characters from the end of @t@.
1124 --
1125 -- Examples:
1126 --
1127 -- > dropEnd 3 "foobar" == "foo"
1128 dropEnd :: Int -> Text -> Text
1129 dropEnd n t@(Text arr off len)
1130 | n <= 0 = t
1131 | n >= len = empty
1132 | otherwise = text arr off (iterNEnd n t)
1133
1134 -- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
1135 -- returns the longest prefix (possibly empty) of elements that
1136 -- satisfy @p@. Subject to fusion.
1137 takeWhile :: (Char -> Bool) -> Text -> Text
1138 takeWhile p t@(Text arr off len) = loop 0
1139 where loop !i | i >= len = t
1140 | p c = loop (i+d)
1141 | otherwise = text arr off i
1142 where Iter c d = iter t i
1143 {-# INLINE [1] takeWhile #-}
1144
1145 {-# RULES
1146 "TEXT takeWhile -> fused" [~1] forall p t.
1147 takeWhile p t = unstream (S.takeWhile p (stream t))
1148 "TEXT takeWhile -> unfused" [1] forall p t.
1149 unstream (S.takeWhile p (stream t)) = takeWhile p t
1150 #-}
1151
1152 -- | /O(n)/ 'takeWhileEnd', applied to a predicate @p@ and a 'Text',
1153 -- returns the longest suffix (possibly empty) of elements that
1154 -- satisfy @p@. Subject to fusion.
1155 -- Examples:
1156 --
1157 -- > takeWhileEnd (=='o') "foo" == "oo"
1158 takeWhileEnd :: (Char -> Bool) -> Text -> Text
1159 takeWhileEnd p t@(Text arr off len) = loop (len-1) len
1160 where loop !i !l | l <= 0 = t
1161 | p c = loop (i+d) (l+d)
1162 | otherwise = text arr (off+l) (len-l)
1163 where (c,d) = reverseIter t i
1164 {-# INLINE [1] takeWhileEnd #-}
1165
1166 {-# RULES
1167 "TEXT takeWhileEnd -> fused" [~1] forall p t.
1168 takeWhileEnd p t = S.reverse (S.takeWhile p (S.reverseStream t))
1169 "TEXT takeWhileEnd -> unfused" [1] forall p t.
1170 S.reverse (S.takeWhile p (S.reverseStream t)) = takeWhileEnd p t
1171 #-}
1172
1173 -- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
1174 -- 'takeWhile' @p@ @t@. Subject to fusion.
1175 dropWhile :: (Char -> Bool) -> Text -> Text
1176 dropWhile p t@(Text arr off len) = loop 0 0
1177 where loop !i !l | l >= len = empty
1178 | p c = loop (i+d) (l+d)
1179 | otherwise = Text arr (off+i) (len-l)
1180 where Iter c d = iter t i
1181 {-# INLINE [1] dropWhile #-}
1182
1183 {-# RULES
1184 "TEXT dropWhile -> fused" [~1] forall p t.
1185 dropWhile p t = unstream (S.dropWhile p (stream t))
1186 "TEXT dropWhile -> unfused" [1] forall p t.
1187 unstream (S.dropWhile p (stream t)) = dropWhile p t
1188 #-}
1189
1190 -- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
1191 -- dropping characters that satisfy the predicate @p@ from the end of
1192 -- @t@. Subject to fusion.
1193 --
1194 -- Examples:
1195 --
1196 -- > dropWhileEnd (=='.') "foo..." == "foo"
1197 dropWhileEnd :: (Char -> Bool) -> Text -> Text
1198 dropWhileEnd p t@(Text arr off len) = loop (len-1) len
1199 where loop !i !l | l <= 0 = empty
1200 | p c = loop (i+d) (l+d)
1201 | otherwise = Text arr off l
1202 where (c,d) = reverseIter t i
1203 {-# INLINE [1] dropWhileEnd #-}
1204
1205 {-# RULES
1206 "TEXT dropWhileEnd -> fused" [~1] forall p t.
1207 dropWhileEnd p t = S.reverse (S.dropWhile p (S.reverseStream t))
1208 "TEXT dropWhileEnd -> unfused" [1] forall p t.
1209 S.reverse (S.dropWhile p (S.reverseStream t)) = dropWhileEnd p t
1210 #-}
1211
1212 -- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
1213 -- dropping characters that satisfy the predicate @p@ from both the
1214 -- beginning and end of @t@. Subject to fusion.
1215 dropAround :: (Char -> Bool) -> Text -> Text
1216 dropAround p = dropWhile p . dropWhileEnd p
1217 {-# INLINE [1] dropAround #-}
1218
1219 -- | /O(n)/ Remove leading white space from a string. Equivalent to:
1220 --
1221 -- > dropWhile isSpace
1222 stripStart :: Text -> Text
1223 stripStart = dropWhile isSpace
1224 {-# INLINE [1] stripStart #-}
1225
1226 -- | /O(n)/ Remove trailing white space from a string. Equivalent to:
1227 --
1228 -- > dropWhileEnd isSpace
1229 stripEnd :: Text -> Text
1230 stripEnd = dropWhileEnd isSpace
1231 {-# INLINE [1] stripEnd #-}
1232
1233 -- | /O(n)/ Remove leading and trailing white space from a string.
1234 -- Equivalent to:
1235 --
1236 -- > dropAround isSpace
1237 strip :: Text -> Text
1238 strip = dropAround isSpace
1239 {-# INLINE [1] strip #-}
1240
1241 -- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
1242 -- prefix of @t@ of length @n@, and whose second is the remainder of
1243 -- the string. It is equivalent to @('take' n t, 'drop' n t)@.
1244 splitAt :: Int -> Text -> (Text, Text)
1245 splitAt n t@(Text arr off len)
1246 | n <= 0 = (empty, t)
1247 | n >= len = (t, empty)
1248 | otherwise = let k = iterN n t
1249 in (text arr off k, text arr (off+k) (len-k))
1250
1251 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
1252 -- a pair whose first element is the longest prefix (possibly empty)
1253 -- of @t@ of elements that satisfy @p@, and whose second is the
1254 -- remainder of the list.
1255 span :: (Char -> Bool) -> Text -> (Text, Text)
1256 span p t = case span_ p t of
1257 (# hd,tl #) -> (hd,tl)
1258 {-# INLINE span #-}
1259
1260 -- | /O(n)/ 'break' is like 'span', but the prefix returned is
1261 -- over elements that fail the predicate @p@.
1262 break :: (Char -> Bool) -> Text -> (Text, Text)
1263 break p = span (not . p)
1264 {-# INLINE break #-}
1265
1266 -- | /O(n)/ Group characters in a string according to a predicate.
1267 groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
1268 groupBy p = loop
1269 where
1270 loop t@(Text arr off len)
1271 | null t = []
1272 | otherwise = text arr off n : loop (text arr (off+n) (len-n))
1273 where Iter c d = iter t 0
1274 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
1275
1276 -- | Returns the /array/ index (in units of 'Word16') at which a
1277 -- character may be found. This is /not/ the same as the logical
1278 -- index returned by e.g. 'findIndex'.
1279 findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
1280 findAIndexOrEnd q t@(Text _arr _off len) = go 0
1281 where go !i | i >= len || q c = i
1282 | otherwise = go (i+d)
1283 where Iter c d = iter t i
1284
1285 -- | /O(n)/ Group characters in a string by equality.
1286 group :: Text -> [Text]
1287 group = groupBy (==)
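
-- For example (an expected result, shown for illustration):
--
-- > group "Mississippi" == ["M","i","ss","i","ss","i","pp","i"]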
1288
1289 -- | /O(n)/ Return all initial segments of the given 'Text', shortest
1290 -- first.
1291 inits :: Text -> [Text]
1292 inits t@(Text arr off len) = loop 0
1293 where loop i | i >= len = [t]
1294 | otherwise = Text arr off i : loop (i + iter_ t i)
1295
1296 -- | /O(n)/ Return all final segments of the given 'Text', longest
1297 -- first.
1298 tails :: Text -> [Text]
1299 tails t | null t = [empty]
1300 | otherwise = t : tails (unsafeTail t)
1301
1302 -- $split
1303 --
1304 -- Splitting functions in this library do not perform character-wise
1305 -- copies to create substrings; they just construct new 'Text's that
1306 -- are slices of the original.
1307
1308 -- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
1309 -- argument (which cannot be empty), consuming the delimiter. An empty
1310 -- delimiter is invalid, and will cause an error to be raised.
1311 --
1312 -- Examples:
1313 --
1314 -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
1315 -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
1316 -- > splitOn "x" "x" == ["",""]
1317 --
1318 -- and
1319 --
1320 -- > intercalate s . splitOn s == id
1321 -- > splitOn (singleton c) == split (==c)
1322 --
1323 -- (Note: the string @s@ to split on above cannot be empty.)
1324 --
1325 -- In (unlikely) bad cases, this function's time complexity degrades
1326 -- towards /O(n*m)/.
1327 splitOn :: Text
1328 -- ^ String to split on. If this string is empty, an error
1329 -- will occur.
1330 -> Text
1331 -- ^ Input text.
1332 -> [Text]
1333 splitOn pat@(Text _ _ l) src@(Text arr off len)
1334 | l <= 0 = emptyError "splitOn"
1335 | isSingleton pat = split (== unsafeHead pat) src
1336 | otherwise = go 0 (indices pat src)
1337 where
1338 go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs
1339 go s _ = [text arr (s+off) (len-s)]
1340 {-# INLINE [1] splitOn #-}
1341
1342 {-# RULES
1343 "TEXT splitOn/singleton -> split/==" [~1] forall c t.
1344 splitOn (singleton c) t = split (==c) t
1345 #-}
1346
1347 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
1348 -- where the predicate returns True for a separator element. The
1349 -- resulting components do not contain the separators. Two adjacent
1350 -- separators result in an empty component in the output, e.g.
1351 --
1352 -- > split (=='a') "aabbaca" == ["","","bb","c",""]
1353 -- > split (=='a') "" == [""]
1354 split :: (Char -> Bool) -> Text -> [Text]
1355 split _ t@(Text _arr _off 0) = [t]
1356 split p t = loop t
1357 where loop s | null s' = [l]
1358 | otherwise = l : loop (unsafeTail s')
1359 where (# l, s' #) = span_ (not . p) s
1360 {-# INLINE split #-}
1361
1362 -- | /O(n)/ Splits a 'Text' into components of length @k@. The last
1363 -- element may be shorter than the other chunks, depending on the
1364 -- length of the input. Examples:
1365 --
1366 -- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
1367 -- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
1368 chunksOf :: Int -> Text -> [Text]
1369 chunksOf k = go
1370 where
1371 go t = case splitAt k t of
1372 (a,b) | null a -> []
1373 | otherwise -> a : go b
1374 {-# INLINE chunksOf #-}
1375
1376 -- ----------------------------------------------------------------------------
1377 -- * Searching
1378
1379 -------------------------------------------------------------------------------
1380 -- ** Searching with a predicate
1381
1382 -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
1383 -- returns the first element matching the predicate, or 'Nothing' if
1384 -- there is no such element.
1385 find :: (Char -> Bool) -> Text -> Maybe Char
1386 find p t = S.findBy p (stream t)
1387 {-# INLINE find #-}
1388
1389 -- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
1390 -- and returns the pair of 'Text's with elements which do and do not
1391 -- satisfy the predicate, respectively; i.e.
1392 --
1393 -- > partition p t == (filter p t, filter (not . p) t)
1394 partition :: (Char -> Bool) -> Text -> (Text, Text)
1395 partition p t = (filter p t, filter (not . p) t)
1396 {-# INLINE partition #-}
1397
1398 -- | /O(n)/ 'filter', applied to a predicate and a 'Text',
1399 -- returns a 'Text' containing those characters that satisfy the
1400 -- predicate.
1401 filter :: (Char -> Bool) -> Text -> Text
1402 filter p t = unstream (S.filter p (stream t))
1403 {-# INLINE filter #-}
1404
1405 -- | /O(n+m)/ Find the first instance of @needle@ (which must be
1406 -- non-'null') in @haystack@. The first element of the returned tuple
1407 -- is the prefix of @haystack@ before @needle@ is matched. The second
1408 -- is the remainder of @haystack@, starting with the match.
1409 --
1410 -- Examples:
1411 --
1412 -- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
1413 -- > breakOn "/" "foobar" ==> ("foobar", "")
1414 --
1415 -- Laws:
1416 --
1417 -- > append prefix match == haystack
1418 -- > where (prefix, match) = breakOn needle haystack
1419 --
1420 -- If you need to break a string by a substring repeatedly (e.g. you
1421 -- want to break on every instance of a substring), use 'breakOnAll'
1422 -- instead, as it has lower startup overhead.
1423 --
1424 -- In (unlikely) bad cases, this function's time complexity degrades
1425 -- towards /O(n*m)/.
1426 breakOn :: Text -> Text -> (Text, Text)
1427 breakOn pat src@(Text arr off len)
1428 | null pat = emptyError "breakOn"
1429 | otherwise = case indices pat src of
1430 [] -> (src, empty)
1431 (x:_) -> (text arr off x, text arr (off+x) (len-x))
1432 {-# INLINE breakOn #-}
1433
1434 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
1435 -- string.
1436 --
1437 -- The first element of the returned tuple is the prefix of @haystack@
1438 -- up to and including the last match of @needle@. The second is the
1439 -- remainder of @haystack@, following the match.
1440 --
1441 -- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
1442 breakOnEnd :: Text -> Text -> (Text, Text)
1443 breakOnEnd pat src = (reverse b, reverse a)
1444 where (a,b) = breakOn (reverse pat) (reverse src)
1445 {-# INLINE breakOnEnd #-}
1446
1447 -- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
1448 -- @haystack@. Each element of the returned list consists of a pair:
1449 --
1450 -- * The entire string prior to the /k/th match (i.e. the prefix)
1451 --
1452 -- * The /k/th match, followed by the remainder of the string
1453 --
1454 -- Examples:
1455 --
1456 -- > breakOnAll "::" ""
1457 -- > ==> []
1458 -- > breakOnAll "/" "a/b/c/"
1459 -- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
1460 --
1461 -- In (unlikely) bad cases, this function's time complexity degrades
1462 -- towards /O(n*m)/.
1463 --
1464 -- The @needle@ parameter may not be empty.
1465 breakOnAll :: Text -- ^ @needle@ to search for
1466 -> Text -- ^ @haystack@ in which to search
1467 -> [(Text, Text)]
1468 breakOnAll pat src@(Text arr off slen)
1469 | null pat = emptyError "breakOnAll"
1470 | otherwise = L.map step (indices pat src)
1471 where
1472 step x = (chunk 0 x, chunk x (slen-x))
1473 chunk !n !l = text arr (n+off) l
1474 {-# INLINE breakOnAll #-}
1475
1476 -------------------------------------------------------------------------------
1477 -- ** Indexing 'Text's
1478
1479 -- $index
1480 --
1481 -- If you think of a 'Text' value as an array of 'Char' values (which
1482 -- it is not), you run the risk of writing inefficient code.
1483 --
1484 -- An idiom that is common in some languages is to find the numeric
1485 -- offset of a character or substring, then use that number to split
1486 -- or trim the searched string. With a 'Text' value, this approach
1487 -- would require two /O(n)/ operations: one to perform the search, and
1488 -- one to operate from wherever the search ended.
1489 --
1490 -- For example, suppose you have a string that you want to split on
1491 -- the substring @\"::\"@, such as @\"foo::bar::quux\"@. Instead of
1492 -- searching for the index of @\"::\"@ and taking the substrings
1493 -- before and after that index, you would instead use @breakOnAll \"::\"@.
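--
-- A brief sketch of that contrast for a single split, where @kv@ stands
-- for an assumed input such as @\"key=value\"@:
--
-- > -- Index-based: two passes, one to search and one to split.
-- > case findIndex (== '=') kv of
-- >   Just i  -> splitAt i kv
-- >   Nothing -> (kv, empty)
-- >
-- > -- Substring-based: a single pass using 'breakOn'.
-- > breakOn "=" kv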
1494
1495 -- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
1496 index :: Text -> Int -> Char
1497 index t n = S.index (stream t) n
1498 {-# INLINE index #-}
1499
1500 -- | /O(n)/ The 'findIndex' function takes a predicate and a 'Text'
1501 -- and returns the index of the first element in the 'Text' satisfying
1502 -- the predicate. Subject to fusion.
1503 findIndex :: (Char -> Bool) -> Text -> Maybe Int
1504 findIndex p t = S.findIndex p (stream t)
1505 {-# INLINE findIndex #-}
1506
1507 -- | /O(n+m)/ The 'count' function returns the number of times the
1508 -- query string appears in the given 'Text'. An empty query string is
1509 -- invalid, and will cause an error to be raised.
1510 --
1511 -- In (unlikely) bad cases, this function's time complexity degrades
1512 -- towards /O(n*m)/.
1513 count :: Text -> Text -> Int
1514 count pat src
1515 | null pat = emptyError "count"
1516 | isSingleton pat = countChar (unsafeHead pat) src
1517 | otherwise = L.length (indices pat src)
1518 {-# INLINE [1] count #-}
1519
1520 {-# RULES
1521 "TEXT count/singleton -> countChar" [~1] forall c t.
1522 count (singleton c) t = countChar c t
1523 #-}
1524
1525 -- | /O(n)/ The 'countChar' function returns the number of times the
1526 -- query element appears in the given 'Text'. Subject to fusion.
1527 countChar :: Char -> Text -> Int
1528 countChar c t = S.countChar c (stream t)
1529 {-# INLINE countChar #-}
1530
1531 -------------------------------------------------------------------------------
1532 -- * Zipping
1533
1534 -- | /O(n)/ 'zip' takes two 'Text's and returns a list of
1535 -- corresponding pairs of characters. If one input 'Text' is short,
1536 -- excess elements of the longer 'Text' are discarded. This is
1537 -- equivalent to a pair of 'unpack' operations.
1538 zip :: Text -> Text -> [(Char,Char)]
1539 zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
1540 {-# INLINE zip #-}
1541
1542 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
1543 -- given as the first argument, instead of a tupling function.
1544 -- Performs replacement on invalid scalar values.
1545 zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
1546 zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
1547 where g a b = safe (f a b)
1548 {-# INLINE zipWith #-}
1549
1550 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
1551 -- representing white space.
1552 words :: Text -> [Text]
1553 words t@(Text arr off len) = loop 0 0
1554 where
1555 loop !start !n
1556 | n >= len = if start == n
1557 then []
1558 else [Text arr (start+off) (n-start)]
1559 | isSpace c =
1560 if start == n
1561 then loop (start+1) (start+1)
1562 else Text arr (start+off) (n-start) : loop (n+d) (n+d)
1563 | otherwise = loop start (n+d)
1564 where Iter c d = iter t n
1565 {-# INLINE words #-}
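
-- For example (an expected result; runs of white space never produce
-- empty components):
--
-- > words "  north  by  northwest " == ["north","by","northwest"]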
1566
1567 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
1568 -- newline 'Char's. The resulting strings do not contain newlines.
1569 lines :: Text -> [Text]
1570 lines ps | null ps = []
1571 | otherwise = h : if null t
1572 then []
1573 else lines (unsafeTail t)
1574 where (# h,t #) = span_ (/= '\n') ps
1575 {-# INLINE lines #-}
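
-- For example (expected results; a trailing newline does not produce a
-- trailing empty component):
--
-- > lines "one\ntwo\n" == ["one","two"]
-- > lines "one\ntwo"   == ["one","two"]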
1576
1577 {-
1578 -- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line
1579 -- boundaries.
1580 --
1581 -- A line boundary is considered to be either a line feed, a carriage
1582 -- return immediately followed by a line feed, or a carriage return.
1583 -- This accounts for both Unix and Windows line ending conventions,
1584 -- and for the old convention used on Mac OS 9 and earlier.
1585 lines' :: Text -> [Text]
1586 lines' ps | null ps = []
1587 | otherwise = h : case uncons t of
1588 Nothing -> []
1589 Just (c,t')
1590 | c == '\n' -> lines t'
1591 | c == '\r' -> case uncons t' of
1592 Just ('\n',t'') -> lines t''
1593 _ -> lines t'
1594 where (h,t) = span notEOL ps
1595 notEOL c = c /= '\n' && c /= '\r'
1596 {-# INLINE lines' #-}
1597 -}
1598
1599 -- | /O(n)/ Joins lines, after appending a terminating newline to
1600 -- each.
1601 unlines :: [Text] -> Text
1602 unlines = concat . L.map (`snoc` '\n')
1603 {-# INLINE unlines #-}
1604
1605 -- | /O(n)/ Joins words using single space characters.
1606 unwords :: [Text] -> Text
1607 unwords = intercalate (singleton ' ')
1608 {-# INLINE unwords #-}
1609
1610 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
1611 -- 'True' iff the first is a prefix of the second. Subject to fusion.
1612 isPrefixOf :: Text -> Text -> Bool
1613 isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
1614 alen <= blen && S.isPrefixOf (stream a) (stream b)
1615 {-# INLINE [1] isPrefixOf #-}
1616
1617 {-# RULES
1618 "TEXT isPrefixOf -> fused" [~1] forall s t.
1619 isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
1620 #-}
1621
1622 -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
1623 -- 'True' iff the first is a suffix of the second.
1624 isSuffixOf :: Text -> Text -> Bool
1625 isSuffixOf a@(Text _aarr _aoff alen) b@(Text barr boff blen) =
1626 d >= 0 && a == b'
1627 where d = blen - alen
1628 b' | d == 0 = b
1629 | otherwise = Text barr (boff+d) alen
1630 {-# INLINE isSuffixOf #-}
1631
1632 -- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
1633 -- 'True' iff the first is contained, wholly and intact, anywhere
1634 -- within the second.
1635 --
1636 -- In (unlikely) bad cases, this function's time complexity degrades
1637 -- towards /O(n*m)/.
1638 isInfixOf :: Text -> Text -> Bool
1639 isInfixOf needle haystack
1640 | null needle = True
1641 | isSingleton needle = S.elem (unsafeHead needle) . S.stream $ haystack
1642 | otherwise = not . L.null . indices needle $ haystack
1643 {-# INLINE [1] isInfixOf #-}
1644
1645 {-# RULES
1646 "TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
1647 isInfixOf (singleton n) h = S.elem n (S.stream h)
1648 #-}
1649
1650 -------------------------------------------------------------------------------
1651 -- * View patterns
1652
1653 -- | /O(n)/ Return the suffix of the second string if its prefix
1654 -- matches the entire first string.
1655 --
1656 -- Examples:
1657 --
1658 -- > stripPrefix "foo" "foobar" == Just "bar"
1659 -- > stripPrefix "" "baz" == Just "baz"
1660 -- > stripPrefix "foo" "quux" == Nothing
1661 --
1662 -- This is particularly useful with the @ViewPatterns@ extension to
1663 -- GHC, as follows:
1664 --
1665 -- > {-# LANGUAGE ViewPatterns #-}
1666 -- > import Data.Text as T
1667 -- >
1668 -- > fnordLength :: Text -> Int
1669 -- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
1670 -- > fnordLength _ = -1
1671 stripPrefix :: Text -> Text -> Maybe Text
1672 stripPrefix p@(Text _arr _off plen) t@(Text arr off len)
1673 | p `isPrefixOf` t = Just $! text arr (off+plen) (len-plen)
1674 | otherwise = Nothing
1675
1676 -- | /O(n)/ Find the longest non-empty common prefix of two strings
1677 -- and return it, along with the suffixes of each string at which they
1678 -- no longer match.
1679 --
1680 -- If the strings do not have a common prefix or either one is empty,
1681 -- this function returns 'Nothing'.
1682 --
1683 -- Examples:
1684 --
1685 -- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
1686 -- > commonPrefixes "veeble" "fetzer" == Nothing
1687 -- > commonPrefixes "" "baz" == Nothing
1688 commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
1689 commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0
1690 where
1691 go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1)
1692 | i > 0 = Just (Text arr0 off0 i,
1693 text arr0 (off0+i) (len0-i),
1694 text arr1 (off1+j) (len1-j))
1695 | otherwise = Nothing
1696 where Iter a d0 = iter t0 i
1697 Iter b d1 = iter t1 j
1698
1699 -- | /O(n)/ Return the prefix of the second string if its suffix
1700 -- matches the entire first string.
1701 --
1702 -- Examples:
1703 --
1704 -- > stripSuffix "bar" "foobar" == Just "foo"
1705 -- > stripSuffix "" "baz" == Just "baz"
1706 -- > stripSuffix "foo" "quux" == Nothing
1707 --
1708 -- This is particularly useful with the @ViewPatterns@ extension to
1709 -- GHC, as follows:
1710 --
1711 -- > {-# LANGUAGE ViewPatterns #-}
1712 -- > import Data.Text as T
1713 -- >
1714 -- > quuxLength :: Text -> Int
1715 -- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
1716 -- > quuxLength _ = -1
1717 stripSuffix :: Text -> Text -> Maybe Text
1718 stripSuffix p@(Text _arr _off plen) t@(Text arr off len)
1719 | p `isSuffixOf` t = Just $! text arr off (len-plen)
1720 | otherwise = Nothing
1721
1722 -- | Add a list of non-negative numbers. Errors out on overflow.
1723 sumP :: String -> [Int] -> Int
1724 sumP fun = go 0
1725 where go !a (x:xs)
1726 | ax >= 0 = go ax xs
1727 | otherwise = overflowError fun
1728 where ax = a + x
1729 go a _ = a
1730
1731 emptyError :: String -> a
1732 emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input"
1733
1734 overflowError :: String -> a
1735 overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow"
1736
1737 -- | /O(n)/ Make a distinct copy of the given string, sharing no
1738 -- storage with the original string.
1739 --
1740 -- As an example, suppose you read a large string, of which you need
1741 -- only a small portion. If you do not use 'copy', the entire original
1742 -- array will be kept alive in memory by the smaller string. Making a
1743 -- copy \"breaks the link\" to the original array, allowing it to be
1744 -- garbage collected if there are no other live references to it.
1745 copy :: Text -> Text
1746 copy (Text arr off len) = Text (A.run go) 0 len
1747 where
1748 go :: ST s (A.MArray s)
1749 go = do
1750 marr <- A.new len
1751 A.copyI marr 0 arr off len
1752 return marr
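
-- A hedged usage sketch of 'copy' (@keepFour@ and @veryLargeText@ are
-- illustrative names, not part of this module):
--
-- > -- Without 'copy', the four-character slice would keep the whole
-- > -- array behind 'veryLargeText' alive.
-- > keepFour :: Text -> Text
-- > keepFour veryLargeText = copy (take 4 (drop 100000 veryLargeText))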