1 {-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
2 {-# OPTIONS_GHC -fno-warn-orphans #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 #if __GLASGOW_HASKELL__ >= 708
7 {-# LANGUAGE TypeFamilies #-}
8 #endif
9
10 -- |
11 -- Module : Data.Text
12 -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
13 -- (c) 2009 Duncan Coutts,
14 -- (c) 2008, 2009 Tom Harper
15 --
16 -- License : BSD-style
17 -- Maintainer : bos@serpentine.com
18 -- Stability : experimental
19 -- Portability : GHC
20 --
21 -- A time and space-efficient implementation of Unicode text.
22 -- Suitable for performance-critical use, both in terms of large data
23 -- quantities and high speed.
24 --
25 -- /Note/: Read below the synopsis for important notes on the use of
26 -- this module.
27 --
28 -- This module is intended to be imported @qualified@, to avoid name
29 -- clashes with "Prelude" functions, e.g.
30 --
31 -- > import qualified Data.Text as T
32 --
33 -- To use an extended and very rich family of functions for working
34 -- with Unicode text (including normalization, regular expressions,
35 -- non-standard encodings, text breaking, and locales), see
36 -- <http://hackage.haskell.org/package/text-icu the text-icu package >.
37
38 module Data.Text
39 (
40 -- * Strict vs lazy types
41 -- $strict
42
43 -- * Acceptable data
44 -- $replacement
45
46 -- * Fusion
47 -- $fusion
48
49 -- * Types
50 Text
51
52 -- * Creation and elimination
53 , pack
54 , unpack
55 , singleton
56 , empty
57
58 -- * Basic interface
59 , cons
60 , snoc
61 , append
62 , uncons
63 , head
64 , last
65 , tail
66 , init
67 , null
68 , length
69 , compareLength
70
71 -- * Transformations
72 , map
73 , intercalate
74 , intersperse
75 , transpose
76 , reverse
77 , replace
78
79 -- ** Case conversion
80 -- $case
81 , toCaseFold
82 , toLower
83 , toUpper
84 , toTitle
85
86 -- ** Justification
87 , justifyLeft
88 , justifyRight
89 , center
90
91 -- * Folds
92 , foldl
93 , foldl'
94 , foldl1
95 , foldl1'
96 , foldr
97 , foldr1
98
99 -- ** Special folds
100 , concat
101 , concatMap
102 , any
103 , all
104 , maximum
105 , minimum
106
107 -- * Construction
108
109 -- ** Scans
110 , scanl
111 , scanl1
112 , scanr
113 , scanr1
114
115 -- ** Accumulating maps
116 , mapAccumL
117 , mapAccumR
118
119 -- ** Generation and unfolding
120 , replicate
121 , unfoldr
122 , unfoldrN
123
124 -- * Substrings
125
126 -- ** Breaking strings
127 , take
128 , takeEnd
129 , drop
130 , dropEnd
131 , takeWhile
132 , dropWhile
133 , dropWhileEnd
134 , dropAround
135 , strip
136 , stripStart
137 , stripEnd
138 , splitAt
139 , breakOn
140 , breakOnEnd
141 , break
142 , span
143 , group
144 , groupBy
145 , inits
146 , tails
147
148 -- ** Breaking into many substrings
149 -- $split
150 , splitOn
151 , split
152 , chunksOf
153
154 -- ** Breaking into lines and words
155 , lines
156 --, lines'
157 , words
158 , unlines
159 , unwords
160
161 -- * Predicates
162 , isPrefixOf
163 , isSuffixOf
164 , isInfixOf
165
166 -- ** View patterns
167 , stripPrefix
168 , stripSuffix
169 , commonPrefixes
170
171 -- * Searching
172 , filter
173 , breakOnAll
174 , find
175 , partition
176
177 -- , findSubstring
178
179 -- * Indexing
180 -- $index
181 , index
182 , findIndex
183 , count
184
185 -- * Zipping
186 , zip
187 , zipWith
188
189 -- -* Ordered text
190 -- , sort
191
192 -- * Low level operations
193 , copy
194 ) where
195
196 import Prelude (Char, Bool(..), Int, Maybe(..), String,
197 Eq(..), Ord(..), Ordering(..), (++),
198 Read(..),
199 (&&), (||), (+), (-), (.), ($), ($!), (>>),
200 not, return, otherwise, quot)
201 #if defined(HAVE_DEEPSEQ)
202 import Control.DeepSeq (NFData(rnf))
203 #endif
204 #if defined(ASSERTS)
205 import Control.Exception (assert)
206 #endif
207 import Data.Char (isSpace)
208 import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
209 Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
210 import Control.Monad (foldM)
211 import Control.Monad.ST (ST)
212 import qualified Data.Text.Array as A
213 import qualified Data.List as L
214 import Data.Binary (Binary(get, put))
215 import Data.Monoid (Monoid(..))
216 import Data.String (IsString(..))
217 import qualified Data.Text.Internal.Fusion as S
218 import qualified Data.Text.Internal.Fusion.Common as S
219 import Data.Text.Encoding (decodeUtf8, encodeUtf8)
220 import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
221 import Data.Text.Internal.Private (span_)
222 import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
223 import Data.Text.Show (singleton, unpack)
224 import qualified Prelude as P
225 import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
226 reverseIter_, unsafeHead, unsafeTail)
227 import Data.Text.Internal.Unsafe.Char (unsafeChr)
228 import qualified Data.Text.Internal.Functions as F
229 import qualified Data.Text.Internal.Encoding.Utf16 as U16
230 import Data.Text.Internal.Search (indices)
231 #if defined(__HADDOCK__)
232 import Data.ByteString (ByteString)
233 import qualified Data.Text.Lazy as L
234 import Data.Int (Int64)
235 #endif
236 #if __GLASGOW_HASKELL__ >= 708
237 import qualified GHC.Exts as Exts
238 #endif
239
240 -- $strict
241 --
242 -- This package provides both strict and lazy 'Text' types. The
243 -- strict type is provided by the "Data.Text" module, while the lazy
244 -- type is provided by the "Data.Text.Lazy" module. Internally, the
245 -- lazy @Text@ type consists of a list of strict chunks.
246 --
247 -- The strict 'Text' type requires that an entire string fit into
248 -- memory at once. The lazy 'Data.Text.Lazy.Text' type is capable of
249 -- streaming strings that are larger than memory using a small memory
250 -- footprint. In many cases, the overhead of chunked streaming makes
251 -- the lazy 'Data.Text.Lazy.Text' type slower than its strict
252 -- counterpart, but this is not always the case. Sometimes, the time
253 -- complexity of a function in one module may be different from the
254 -- other, due to their differing internal structures.
255 --
256 -- Each module provides an almost identical API, with the main
257 -- difference being that the strict module uses 'Int' values for
258 -- lengths and counts, while the lazy module uses 'Data.Int.Int64'
259 -- lengths.
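--
-- A minimal sketch of converting between the two representations
-- ('Data.Text.Lazy.fromStrict' and 'Data.Text.Lazy.toStrict' are
-- provided by the lazy module):
--
-- > import qualified Data.Text as T
-- > import qualified Data.Text.Lazy as TL
-- >
-- > toLazy :: T.Text -> TL.Text
-- > toLazy = TL.fromStrict
-- >
-- > toStrict :: TL.Text -> T.Text
-- > toStrict = TL.toStrict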
260
261 -- $replacement
262 --
263 -- A 'Text' value is a sequence of Unicode scalar values, as defined
264 -- in
265 -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
266 -- As such, a 'Text' cannot contain values in the range U+D800 to
267 -- U+DFFF inclusive. Haskell implementations admit all Unicode code
268 -- points
269 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
270 -- as 'Char' values, including code points from this invalid range.
271 -- This means that there are some 'Char' values that are not valid
272 -- Unicode scalar values, and the functions in this module must handle
273 -- those cases.
274 --
275 -- Within this module, many functions construct a 'Text' from one or
276 -- more 'Char' values. Those functions will substitute 'Char' values
277 -- that are not valid Unicode scalar values with the replacement
278 -- character \"&#xfffd;\" (U+FFFD). Functions that perform this
279 -- inspection and replacement are documented with the phrase
280 -- \"Performs replacement on invalid scalar values\".
281 --
282 -- (One reason for this policy of replacement is that internally, a
283 -- 'Text' value is represented as packed UTF-16 data. Values in the
284 -- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
285 -- code points, and so cannot be represented. The functions replace
286 -- invalid scalar values, instead of dropping them, as a security
287 -- measure. For details, see
288 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
289
290 -- $fusion
291 --
292 -- Most of the functions in this module are subject to /fusion/,
293 -- meaning that a pipeline of such functions will usually allocate at
294 -- most one 'Text' value.
295 --
296 -- As an example, consider the following pipeline:
297 --
298 -- > import Data.Text as T
299 -- > import Data.Text.Encoding as E
300 -- > import Data.ByteString (ByteString)
301 -- >
302 -- > countChars :: ByteString -> Int
303 -- > countChars = T.length . T.toUpper . E.decodeUtf8
304 --
305 -- From the type signatures involved, this looks like it should
306 -- allocate one 'Data.ByteString.ByteString' value, and two 'Text'
307 -- values. However, when a module is compiled with optimisation
308 -- enabled under GHC, the two intermediate 'Text' values will be
309 -- optimised away, and the function will be compiled down to a single
310 -- loop over the source 'Data.ByteString.ByteString'.
311 --
312 -- Functions that can be fused by the compiler are documented with the
313 -- phrase \"Subject to fusion\".
314
315 instance Eq Text where
316 Text arrA offA lenA == Text arrB offB lenB
317 | lenA == lenB = A.equal arrA offA arrB offB lenA
318 | otherwise = False
319 {-# INLINE (==) #-}
320
321 instance Ord Text where
322 compare = compareText
323
324 instance Read Text where
325 readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
326
327 instance Monoid Text where
328 mempty = empty
329 mappend = append
330 mconcat = concat
331
332 instance IsString Text where
333 fromString = pack
334
335 #if __GLASGOW_HASKELL__ >= 708
336 instance Exts.IsList Text where
337 type Item Text = Char
338 fromList = pack
339 toList = unpack
340 #endif
341
342 #if defined(HAVE_DEEPSEQ)
343 instance NFData Text where rnf !_ = ()
344 #endif
345
346 instance Binary Text where
347 put t = put (encodeUtf8 t)
348 get = P.fmap decodeUtf8 get
349
350 -- | This instance preserves data abstraction at the cost of inefficiency.
351 -- We omit reflection services for the sake of data abstraction.
352 --
353 -- This instance was created by copying the updated behavior of
354 -- @"Data.Set".@'Data.Set.Set' and @"Data.Map".@'Data.Map.Map'. If you
355 -- feel a mistake has been made, please feel free to submit
356 -- improvements.
357 --
358 -- The original discussion is archived here:
359 -- <http://groups.google.com/group/haskell-cafe/browse_thread/thread/b5bbb1b28a7e525d/0639d46852575b93 could we get a Data instance for Data.Text.Text? >
360 --
361 -- The followup discussion that changed the behavior of 'Data.Set.Set'
362 -- and 'Data.Map.Map' is archived here:
363 -- <http://markmail.org/message/trovdc6zkphyi3cr#query:+page:1+mid:a46der3iacwjcf6n+state:results Proposal: Allow gunfold for Data.Map, ... >
364
365 instance Data Text where
366 gfoldl f z txt = z pack `f` (unpack txt)
367 toConstr _ = packConstr
368 gunfold k z c = case constrIndex c of
369 1 -> k (z pack)
370 _ -> P.error "gunfold"
371 dataTypeOf _ = textDataType
372
373 packConstr :: Constr
374 packConstr = mkConstr textDataType "pack" [] Prefix
375
376 textDataType :: DataType
377 textDataType = mkDataType "Data.Text.Text" [packConstr]
378
379 -- | /O(n)/ Compare two 'Text' values lexicographically.
380 compareText :: Text -> Text -> Ordering
381 compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB)
382 | lenA == 0 && lenB == 0 = EQ
383 | otherwise = go 0 0
384 where
385 go !i !j
386 | i >= lenA || j >= lenB = compare lenA lenB
387 | a < b = LT
388 | a > b = GT
389 | otherwise = go (i+di) (j+dj)
390 where Iter a di = iter ta i
391 Iter b dj = iter tb j
392
393 -- -----------------------------------------------------------------------------
394 -- * Conversion to/from 'Text'
395
396 -- | /O(n)/ Convert a 'String' into a 'Text'. Subject to
397 -- fusion. Performs replacement on invalid scalar values.
398 pack :: String -> Text
399 pack = unstream . S.map safe . S.streamList
400 {-# INLINE [1] pack #-}
401
402 -- -----------------------------------------------------------------------------
403 -- * Basic functions
404
405 -- | /O(n)/ Adds a character to the front of a 'Text'. This function
406 -- is more costly than its 'List' counterpart because it requires
407 -- copying a new array. Subject to fusion. Performs replacement on
408 -- invalid scalar values.
409 cons :: Char -> Text -> Text
410 cons c t = unstream (S.cons (safe c) (stream t))
411 {-# INLINE cons #-}
412
413 infixr 5 `cons`
414
415 -- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
416 -- entire array in the process, unless fused. Subject to fusion.
417 -- Performs replacement on invalid scalar values.
418 snoc :: Text -> Char -> Text
419 snoc t c = unstream (S.snoc (stream t) (safe c))
420 {-# INLINE snoc #-}
421
422 -- | /O(n)/ Appends one 'Text' to the other by copying both of them
423 -- into a new 'Text'. Subject to fusion.
424 append :: Text -> Text -> Text
425 append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
426 | len1 == 0 = b
427 | len2 == 0 = a
428 | len > 0 = Text (A.run x) 0 len
429 | otherwise = overflowError "append"
430 where
431 len = len1+len2
432 x :: ST s (A.MArray s)
433 x = do
434 arr <- A.new len
435 A.copyI arr 0 arr1 off1 len1
436 A.copyI arr len1 arr2 off2 len
437 return arr
438 {-# NOINLINE append #-}
439
440 {-# RULES
441 "TEXT append -> fused" [~1] forall t1 t2.
442 append t1 t2 = unstream (S.append (stream t1) (stream t2))
443 "TEXT append -> unfused" [1] forall t1 t2.
444 unstream (S.append (stream t1) (stream t2)) = append t1 t2
445 #-}
446
447 -- | /O(1)/ Returns the first character of a 'Text', which must be
448 -- non-empty. Subject to fusion.
449 head :: Text -> Char
450 head t = S.head (stream t)
451 {-# INLINE head #-}
452
453 -- | /O(1)/ Returns the first character and rest of a 'Text', or
454 -- 'Nothing' if empty. Subject to fusion.
455 uncons :: Text -> Maybe (Char, Text)
456 uncons t@(Text arr off len)
457 | len <= 0 = Nothing
458 | otherwise = Just (c, text arr (off+d) (len-d))
459 where Iter c d = iter t 0
460 {-# INLINE [1] uncons #-}
461
462 -- | Lifted from Control.Arrow and specialized.
463 second :: (b -> c) -> (a,b) -> (a,c)
464 second f (a, b) = (a, f b)
465
466 -- | /O(1)/ Returns the last character of a 'Text', which must be
467 -- non-empty. Subject to fusion.
468 last :: Text -> Char
469 last (Text arr off len)
470 | len <= 0 = emptyError "last"
471 | n < 0xDC00 || n > 0xDFFF = unsafeChr n
472 | otherwise = U16.chr2 n0 n
473 where n = A.unsafeIndex arr (off+len-1)
474 n0 = A.unsafeIndex arr (off+len-2)
475 {-# INLINE [1] last #-}
476
477 {-# RULES
478 "TEXT last -> fused" [~1] forall t.
479 last t = S.last (stream t)
480 "TEXT last -> unfused" [1] forall t.
481 S.last (stream t) = last t
482 #-}
483
484 -- | /O(1)/ Returns all characters after the head of a 'Text', which
485 -- must be non-empty. Subject to fusion.
486 tail :: Text -> Text
487 tail t@(Text arr off len)
488 | len <= 0 = emptyError "tail"
489 | otherwise = text arr (off+d) (len-d)
490 where d = iter_ t 0
491 {-# INLINE [1] tail #-}
492
493 {-# RULES
494 "TEXT tail -> fused" [~1] forall t.
495 tail t = unstream (S.tail (stream t))
496 "TEXT tail -> unfused" [1] forall t.
497 unstream (S.tail (stream t)) = tail t
498 #-}
499
500 -- | /O(1)/ Returns all but the last character of a 'Text', which must
501 -- be non-empty. Subject to fusion.
502 init :: Text -> Text
503 init (Text arr off len) | len <= 0 = emptyError "init"
504 | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
505 | otherwise = text arr off (len-1)
506 where
507 n = A.unsafeIndex arr (off+len-1)
508 {-# INLINE [1] init #-}
509
510 {-# RULES
511 "TEXT init -> fused" [~1] forall t.
512 init t = unstream (S.init (stream t))
513 "TEXT init -> unfused" [1] forall t.
514 unstream (S.init (stream t)) = init t
515 #-}
516
517 -- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
518 -- fusion.
519 null :: Text -> Bool
520 null (Text _arr _off len) =
521 #if defined(ASSERTS)
522 assert (len >= 0) $
523 #endif
524 len <= 0
525 {-# INLINE [1] null #-}
526
527 {-# RULES
528 "TEXT null -> fused" [~1] forall t.
529 null t = S.null (stream t)
530 "TEXT null -> unfused" [1] forall t.
531 S.null (stream t) = null t
532 #-}
533
534 -- | /O(1)/ Tests whether a 'Text' contains exactly one character.
535 -- Subject to fusion.
536 isSingleton :: Text -> Bool
537 isSingleton = S.isSingleton . stream
538 {-# INLINE isSingleton #-}
539
540 -- | /O(n)/ Returns the number of characters in a 'Text'.
541 -- Subject to fusion.
542 length :: Text -> Int
543 length t = S.length (stream t)
544 {-# INLINE length #-}
545
546 -- | /O(n)/ Compare the count of characters in a 'Text' to a number.
547 -- Subject to fusion.
548 --
549 -- This function gives the same answer as comparing against the result
550 -- of 'length', but can short circuit if the count of characters is
551 -- greater than the number, and hence be more efficient.
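--
-- Examples:
--
-- > compareLength "abc" 2 == GT
-- > compareLength "abc" 3 == EQ
-- > compareLength "abc" 4 == LT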
552 compareLength :: Text -> Int -> Ordering
553 compareLength t n = S.compareLengthI (stream t) n
554 {-# INLINE [1] compareLength #-}
555
556 {-# RULES
557 "TEXT compareN/length -> compareLength" [~1] forall t n.
558 compare (length t) n = compareLength t n
559 #-}
560
561 {-# RULES
562 "TEXT ==N/length -> compareLength/==EQ" [~1] forall t n.
563 (==) (length t) n = compareLength t n == EQ
564 #-}
565
566 {-# RULES
567 "TEXT /=N/length -> compareLength//=EQ" [~1] forall t n.
568 (/=) (length t) n = compareLength t n /= EQ
569 #-}
570
571 {-# RULES
572 "TEXT <N/length -> compareLength/==LT" [~1] forall t n.
573 (<) (length t) n = compareLength t n == LT
574 #-}
575
576 {-# RULES
577 "TEXT <=N/length -> compareLength//=GT" [~1] forall t n.
578 (<=) (length t) n = compareLength t n /= GT
579 #-}
580
581 {-# RULES
582 "TEXT >N/length -> compareLength/==GT" [~1] forall t n.
583 (>) (length t) n = compareLength t n == GT
584 #-}
585
586 {-# RULES
587 "TEXT >=N/length -> compareLength//=LT" [~1] forall t n.
588 (>=) (length t) n = compareLength t n /= LT
589 #-}
590
591 -- -----------------------------------------------------------------------------
592 -- * Transformations
593 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
594 -- each element of @t@. Subject to fusion. Performs replacement on
595 -- invalid scalar values.
596 map :: (Char -> Char) -> Text -> Text
597 map f t = unstream (S.map (safe . f) (stream t))
598 {-# INLINE [1] map #-}
599
600 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
601 -- 'Text's and concatenates the list after interspersing the first
602 -- argument between each element of the list.
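--
-- Example:
--
-- > intercalate ", " ["one","two","three"] == "one, two, three"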
603 intercalate :: Text -> [Text] -> Text
604 intercalate t = concat . (F.intersperse t)
605 {-# INLINE intercalate #-}
606
607 -- | /O(n)/ The 'intersperse' function takes a character and places it
608 -- between the characters of a 'Text'. Subject to fusion. Performs
609 -- replacement on invalid scalar values.
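--
-- Example:
--
-- > intersperse '.' "abc" == "a.b.c"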
610 intersperse :: Char -> Text -> Text
611 intersperse c t = unstream (S.intersperse (safe c) (stream t))
612 {-# INLINE intersperse #-}
613
614 -- | /O(n)/ Reverse the characters of a string. Subject to fusion.
615 reverse :: Text -> Text
616 reverse t = S.reverse (stream t)
617 {-# INLINE reverse #-}
618
619 -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
620 -- @haystack@ with @replacement@.
621 --
622 -- This function behaves as though it was defined as follows:
623 --
624 -- @
625 -- replace needle replacement haystack =
626 -- 'intercalate' replacement ('splitOn' needle haystack)
627 -- @
628 --
629 -- As this suggests, each occurrence is replaced exactly once. So if
630 -- @needle@ occurs in @replacement@, that occurrence will /not/ itself
631 -- be replaced recursively:
632 --
633 -- > replace "oo" "foo" "oo" == "foo"
634 --
635 -- In cases where several instances of @needle@ overlap, only the
636 -- first one will be replaced:
637 --
638 -- > replace "ofo" "bar" "ofofo" == "barfo"
639 --
640 -- In (unlikely) bad cases, this function's time complexity degrades
641 -- towards /O(n*m)/.
642 replace :: Text
643 -- ^ @needle@ to search for. If this string is empty, an
644 -- error will occur.
645 -> Text
646 -- ^ @replacement@ to replace @needle@ with.
647 -> Text
648 -- ^ @haystack@ in which to search.
649 -> Text
650 replace needle@(Text _ _ neeLen)
651 (Text repArr repOff repLen)
652 haystack@(Text hayArr hayOff hayLen)
653 | neeLen == 0 = emptyError "replace"
654 | L.null ixs = haystack
655 | len > 0 = Text (A.run x) 0 len
656 | otherwise = empty
657 where
658 ixs = indices needle haystack
659 len = hayLen - (neeLen - repLen) `mul` L.length ixs
660 x :: ST s (A.MArray s)
661 x = do
662 marr <- A.new len
663 let loop (i:is) o d = do
664 let d0 = d + i - o
665 d1 = d0 + repLen
666 A.copyI marr d hayArr (hayOff+o) d0
667 A.copyI marr d0 repArr repOff d1
668 loop is (i + neeLen) d1
669 loop [] o d = A.copyI marr d hayArr (hayOff+o) len
670 loop ixs 0 0
671 return marr
672
673 -- ----------------------------------------------------------------------------
674 -- ** Case conversions (folds)
675
676 -- $case
677 --
678 -- When case converting 'Text' values, do not use combinators like
679 -- @map toUpper@ to case convert each character of a string
680 -- individually, as this gives incorrect results according to the
681 -- rules of some writing systems. The whole-string case conversion
682 -- functions from this module, such as @toUpper@, obey the correct
683 -- case conversion rules. As a result, these functions may map one
684 -- input character to two or three output characters. For examples,
685 -- see the documentation of each function.
686 --
687 -- /Note/: In some languages, case conversion is a locale- and
688 -- context-dependent operation. The case conversion functions in this
689 -- module are /not/ locale sensitive. Programs that require locale
690 -- sensitivity should use appropriate versions of the
691 -- <http://hackage.haskell.org/package/text-icu-0.6.3.7/docs/Data-Text-ICU.html#g:4 case mapping functions from the text-icu package >.
692
693 -- | /O(n)/ Convert a string to folded case. Subject to fusion.
694 --
695 -- This function is mainly useful for performing caseless (also known
696 -- as case insensitive) string comparisons.
697 --
698 -- A string @x@ is a caseless match for a string @y@ if and only if:
699 --
700 -- @toCaseFold x == toCaseFold y@
701 --
702 -- The result string may be longer than the input string, and may
703 -- differ from applying 'toLower' to the input string. For instance,
704 -- the Armenian small ligature \"&#xfb13;\" (men now, U+FB13) is case
705 -- folded to the sequence \"&#x574;\" (men, U+0574) followed by
706 -- \"&#x576;\" (now, U+0576), while the Greek \"&#xb5;\" (micro sign,
707 -- U+00B5) is case folded to \"&#x3bc;\" (small letter mu, U+03BC)
708 -- instead of itself.
709 toCaseFold :: Text -> Text
710 toCaseFold t = unstream (S.toCaseFold (stream t))
711 {-# INLINE toCaseFold #-}
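
-- A typical use of 'toCaseFold' is caseless comparison; a minimal sketch
-- (the helper @ciEqual@ is illustrative and not part of this module):
--
-- > import qualified Data.Text as T
-- >
-- > ciEqual :: T.Text -> T.Text -> Bool
-- > ciEqual a b = T.toCaseFold a == T.toCaseFold b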
712
713 -- | /O(n)/ Convert a string to lower case, using simple case
714 -- conversion. Subject to fusion.
715 --
716 -- The result string may be longer than the input string. For
717 -- instance, \"&#x130;\" (Latin capital letter I with dot above,
718 -- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069)
719 -- followed by \" &#x307;\" (combining dot above, U+0307).
720 toLower :: Text -> Text
721 toLower t = unstream (S.toLower (stream t))
722 {-# INLINE toLower #-}
723
724 -- | /O(n)/ Convert a string to upper case, using simple case
725 -- conversion. Subject to fusion.
726 --
727 -- The result string may be longer than the input string. For
728 -- instance, the German \"&#xdf;\" (eszett, U+00DF) maps to the
729 -- two-letter sequence \"SS\".
730 toUpper :: Text -> Text
731 toUpper t = unstream (S.toUpper (stream t))
732 {-# INLINE toUpper #-}
733
734 -- | /O(n)/ Convert a string to title case, using simple case
735 -- conversion. Subject to fusion.
736 --
737 -- The first letter of the input is converted to title case, as is
738 -- every subsequent letter that immediately follows a non-letter.
739 -- Every letter that immediately follows another letter is converted
740 -- to lower case.
741 --
742 -- The result string may be longer than the input string. For example,
743 -- the Latin small ligature &#xfb02; (U+FB02) is converted to the
744 -- sequence Latin capital letter F (U+0046) followed by Latin small
745 -- letter l (U+006C).
746 --
747 -- /Note/: this function does not take language or culture specific
748 -- rules into account. For instance, in English, different style
749 -- guides disagree on whether the book name \"The Hill of the Red
750 -- Fox\" is correctly title cased&#x2014;but this function will
751 -- capitalize /every/ word.
752 toTitle :: Text -> Text
753 toTitle t = unstream (S.toTitle (stream t))
754 {-# INLINE toTitle #-}
755
756 -- | /O(n)/ Left-justify a string to the given length, using the
757 -- specified fill character on the right. Subject to fusion.
758 -- Performs replacement on invalid scalar values.
759 --
760 -- Examples:
761 --
762 -- > justifyLeft 7 'x' "foo" == "fooxxxx"
763 -- > justifyLeft 3 'x' "foobar" == "foobar"
764 justifyLeft :: Int -> Char -> Text -> Text
765 justifyLeft k c t
766 | len >= k = t
767 | otherwise = t `append` replicateChar (k-len) c
768 where len = length t
769 {-# INLINE [1] justifyLeft #-}
770
771 {-# RULES
772 "TEXT justifyLeft -> fused" [~1] forall k c t.
773 justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
774 "TEXT justifyLeft -> unfused" [1] forall k c t.
775 unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
776 #-}
777
778 -- | /O(n)/ Right-justify a string to the given length, using the
779 -- specified fill character on the left. Performs replacement on
780 -- invalid scalar values.
781 --
782 -- Examples:
783 --
784 -- > justifyRight 7 'x' "bar" == "xxxxbar"
785 -- > justifyRight 3 'x' "foobar" == "foobar"
786 justifyRight :: Int -> Char -> Text -> Text
787 justifyRight k c t
788 | len >= k = t
789 | otherwise = replicateChar (k-len) c `append` t
790 where len = length t
791 {-# INLINE justifyRight #-}
792
793 -- | /O(n)/ Center a string to the given length, using the specified
794 -- fill character on either side. Performs replacement on invalid
795 -- scalar values.
796 --
797 -- Examples:
798 --
799 -- > center 8 'x' "HS" == "xxxHSxxx"
800 center :: Int -> Char -> Text -> Text
801 center k c t
802 | len >= k = t
803 | otherwise = replicateChar l c `append` t `append` replicateChar r c
804 where len = length t
805 d = k - len
806 r = d `quot` 2
807 l = d - r
808 {-# INLINE center #-}
809
810 -- | /O(n)/ The 'transpose' function transposes the rows and columns
811 -- of its 'Text' argument. Note that this function uses 'pack',
812 -- 'unpack', and the list version of transpose, and is thus not very
813 -- efficient.
814 transpose :: [Text] -> [Text]
815 transpose ts = P.map pack (L.transpose (P.map unpack ts))
816
817 -- -----------------------------------------------------------------------------
818 -- * Reducing 'Text's (folds)
819
820 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
821 -- (typically the left-identity of the operator), and a 'Text',
822 -- reduces the 'Text' using the binary operator, from left to right.
823 -- Subject to fusion.
824 foldl :: (a -> Char -> a) -> a -> Text -> a
825 foldl f z t = S.foldl f z (stream t)
826 {-# INLINE foldl #-}
827
828 -- | /O(n)/ A strict version of 'foldl'. Subject to fusion.
829 foldl' :: (a -> Char -> a) -> a -> Text -> a
830 foldl' f z t = S.foldl' f z (stream t)
831 {-# INLINE foldl' #-}
832
833 -- | /O(n)/ A variant of 'foldl' that has no starting value argument,
834 -- and thus must be applied to a non-empty 'Text'. Subject to fusion.
835 foldl1 :: (Char -> Char -> Char) -> Text -> Char
836 foldl1 f t = S.foldl1 f (stream t)
837 {-# INLINE foldl1 #-}
838
839 -- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
840 foldl1' :: (Char -> Char -> Char) -> Text -> Char
841 foldl1' f t = S.foldl1' f (stream t)
842 {-# INLINE foldl1' #-}
843
844 -- | /O(n)/ 'foldr', applied to a binary operator, a starting value
845 -- (typically the right-identity of the operator), and a 'Text',
846 -- reduces the 'Text' using the binary operator, from right to left.
847 -- Subject to fusion.
848 foldr :: (Char -> a -> a) -> a -> Text -> a
849 foldr f z t = S.foldr f z (stream t)
850 {-# INLINE foldr #-}
851
852 -- | /O(n)/ A variant of 'foldr' that has no starting value argument,
853 -- and thus must be applied to a non-empty 'Text'. Subject to
854 -- fusion.
855 foldr1 :: (Char -> Char -> Char) -> Text -> Char
856 foldr1 f t = S.foldr1 f (stream t)
857 {-# INLINE foldr1 #-}
858
859 -- -----------------------------------------------------------------------------
860 -- ** Special folds
861
862 -- | /O(n)/ Concatenate a list of 'Text's.
863 concat :: [Text] -> Text
864 concat ts = case ts' of
865 [] -> empty
866 [t] -> t
867 _ -> Text (A.run go) 0 len
868 where
869 ts' = L.filter (not . null) ts
870 len = sumP "concat" $ L.map lengthWord16 ts'
871 go :: ST s (A.MArray s)
872 go = do
873 arr <- A.new len
874 let step i (Text a o l) =
875 let !j = i + l in A.copyI arr i a o j >> return j
876 foldM step 0 ts' >> return arr
877
878 -- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
879 -- concatenate the results.
880 concatMap :: (Char -> Text) -> Text -> Text
881 concatMap f = concat . foldr ((:) . f) []
882 {-# INLINE concatMap #-}
883
884 -- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
885 -- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
886 any :: (Char -> Bool) -> Text -> Bool
887 any p t = S.any p (stream t)
888 {-# INLINE any #-}
889
890 -- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
891 -- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
892 all :: (Char -> Bool) -> Text -> Bool
893 all p t = S.all p (stream t)
894 {-# INLINE all #-}
895
896 -- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
897 -- must be non-empty. Subject to fusion.
898 maximum :: Text -> Char
899 maximum t = S.maximum (stream t)
900 {-# INLINE maximum #-}
901
902 -- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
903 -- must be non-empty. Subject to fusion.
904 minimum :: Text -> Char
905 minimum t = S.minimum (stream t)
906 {-# INLINE minimum #-}
907
908 -- -----------------------------------------------------------------------------
909 -- * Building 'Text's
910
911 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
912 -- successive reduced values from the left. Subject to fusion.
913 -- Performs replacement on invalid scalar values.
914 --
915 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
916 --
917 -- Note that
918 --
919 -- > last (scanl f z xs) == foldl f z xs.
920 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
921 scanl f z t = unstream (S.scanl g z (stream t))
922 where g a b = safe (f a b)
923 {-# INLINE scanl #-}
924
925 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
926 -- value argument. Subject to fusion. Performs replacement on
927 -- invalid scalar values.
928 --
929 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
930 scanl1 :: (Char -> Char -> Char) -> Text -> Text
931 scanl1 f t | null t = empty
932 | otherwise = scanl f (unsafeHead t) (unsafeTail t)
933 {-# INLINE scanl1 #-}
934
935 -- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
936 -- replacement on invalid scalar values.
937 --
938 -- > scanr f v == reverse . scanl (flip f) v . reverse
939 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
940 scanr f z = S.reverse . S.reverseScanr g z . reverseStream
941 where g a b = safe (f a b)
942 {-# INLINE scanr #-}
943
944 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
945 -- value argument. Subject to fusion. Performs replacement on
946 -- invalid scalar values.
947 scanr1 :: (Char -> Char -> Char) -> Text -> Text
948 scanr1 f t | null t = empty
949 | otherwise = scanr f (last t) (init t)
950 {-# INLINE scanr1 #-}
951
952 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
953 -- function to each element of a 'Text', passing an accumulating
954 -- parameter from left to right, and returns a final 'Text'. Performs
955 -- replacement on invalid scalar values.
956 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
957 mapAccumL f z0 = S.mapAccumL g z0 . stream
958 where g a b = second safe (f a b)
959 {-# INLINE mapAccumL #-}
960
961 -- | The 'mapAccumR' function behaves like a combination of 'map' and
962 -- a strict 'foldr'; it applies a function to each element of a
963 -- 'Text', passing an accumulating parameter from right to left, and
964 -- returning a final value of this accumulator together with the new
965 -- 'Text'.
966 -- Performs replacement on invalid scalar values.
967 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
968 mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
969 where g a b = second safe (f a b)
970 {-# INLINE mapAccumR #-}
971
972 -- -----------------------------------------------------------------------------
973 -- ** Generating and unfolding 'Text's
974
975 -- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
976 -- @t@ repeated @n@ times.
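--
-- Example:
--
-- > replicate 3 "ab" == "ababab"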
977 replicate :: Int -> Text -> Text
978 replicate n t@(Text a o l)
979 | n <= 0 || l <= 0 = empty
980 | n == 1 = t
981 | isSingleton t = replicateChar n (unsafeHead t)
982 | otherwise = Text (A.run x) 0 len
983 where
984 len = l `mul` n
985 x :: ST s (A.MArray s)
986 x = do
987 arr <- A.new len
988 let loop !d !i | i >= n = return arr
989 | otherwise = let m = d + l
990 in A.copyI arr d a o m >> loop m (i+1)
991 loop 0 0
992 {-# INLINE [1] replicate #-}
993
994 {-# RULES
995 "TEXT replicate/singleton -> replicateChar" [~1] forall n c.
996 replicate n (singleton c) = replicateChar n c
997 #-}
998
999 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
1000 -- value of every element. Subject to fusion.
1001 replicateChar :: Int -> Char -> Text
1002 replicateChar n c = unstream (S.replicateCharI n (safe c))
1003 {-# INLINE replicateChar #-}
1004
1005 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
1006 -- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
1007 -- 'Text' from a seed value. The function takes the element and
1008 -- returns 'Nothing' if it is done producing the 'Text', otherwise
1009 -- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
1010 -- string, and @b@ is the seed value for further production. Subject
1011 -- to fusion. Performs replacement on invalid scalar values.
1012 unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
1013 unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
1014 {-# INLINE unfoldr #-}
1015
1016 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
1017 -- value. However, the length of the result should be limited by the
1018 -- first argument to 'unfoldrN'. This function is more efficient than
1019 -- 'unfoldr' when the maximum length of the result is known and
1020 -- correct; otherwise its performance is similar to 'unfoldr'. Subject
1021 -- to fusion. Performs replacement on invalid scalar values.
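--
-- Example (illustrative):
--
-- > unfoldrN 3 (\c -> Just (c, succ c)) 'a' == "abc"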
1022 unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Text
1023 unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
1024 {-# INLINE unfoldrN #-}
1025
1026 -- -----------------------------------------------------------------------------
1027 -- * Substrings
1028
1029 -- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
1030 -- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
1031 -- the length of the Text. Subject to fusion.
1032 take :: Int -> Text -> Text
1033 take n t@(Text arr off len)
1034 | n <= 0 = empty
1035 | n >= len = t
1036 | otherwise = text arr off (iterN n t)
1037 {-# INLINE [1] take #-}
1038
1039 iterN :: Int -> Text -> Int
1040 iterN n t@(Text _arr _off len) = loop 0 0
1041 where loop !i !cnt
1042 | i >= len || cnt >= n = i
1043 | otherwise = loop (i+d) (cnt+1)
1044 where d = iter_ t i
1045
1046 {-# RULES
1047 "TEXT take -> fused" [~1] forall n t.
1048 take n t = unstream (S.take n (stream t))
1049 "TEXT take -> unfused" [1] forall n t.
1050 unstream (S.take n (stream t)) = take n t
1051 #-}
1052
1053 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix of @t@ consisting of
1054 -- its last @n@ characters (or all of @t@, if it has fewer than @n@ characters).
1055 --
1056 -- Examples:
1057 --
1058 -- > takeEnd 3 "foobar" == "bar"
1059 takeEnd :: Int -> Text -> Text
1060 takeEnd n t@(Text arr off len)
1061 | n <= 0 = empty
1062 | n >= len = t
1063 | otherwise = text arr (off+i) (len-i)
1064 where i = iterNEnd n t
1065
1066 iterNEnd :: Int -> Text -> Int
1067 iterNEnd n t@(Text _arr _off len) = loop (len-1) n
1068 where loop i !m
1069 | i <= 0 = 0
1070 | m <= 1 = i
1071 | otherwise = loop (i+d) (m-1)
1072 where d = reverseIter_ t i
1073
1074 -- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
1075 -- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
1076 -- is greater than the length of the 'Text'. Subject to fusion.
1077 drop :: Int -> Text -> Text
1078 drop n t@(Text arr off len)
1079 | n <= 0 = t
1080 | n >= len = empty
1081 | otherwise = text arr (off+i) (len-i)
1082 where i = iterN n t
1083 {-# INLINE [1] drop #-}
1084
1085 {-# RULES
1086 "TEXT drop -> fused" [~1] forall n t.
1087 drop n t = unstream (S.drop n (stream t))
1088 "TEXT drop -> unfused" [1] forall n t.
1089 unstream (S.drop n (stream t)) = drop n t
1090 #-}
1091
1092 -- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
1093 -- dropping @n@ characters from the end of @t@.
1094 --
1095 -- Examples:
1096 --
1097 -- > dropEnd 3 "foobar" == "foo"
1098 dropEnd :: Int -> Text -> Text
1099 dropEnd n t@(Text arr off len)
1100 | n <= 0 = t
1101 | n >= len = empty
1102 | otherwise = text arr off (iterNEnd n t)
1103
1104 -- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
1105 -- returns the longest prefix (possibly empty) of elements that
1106 -- satisfy @p@. Subject to fusion.
1107 takeWhile :: (Char -> Bool) -> Text -> Text
1108 takeWhile p t@(Text arr off len) = loop 0
1109 where loop !i | i >= len = t
1110 | p c = loop (i+d)
1111 | otherwise = text arr off i
1112 where Iter c d = iter t i
1113 {-# INLINE [1] takeWhile #-}
1114
1115 {-# RULES
1116 "TEXT takeWhile -> fused" [~1] forall p t.
1117 takeWhile p t = unstream (S.takeWhile p (stream t))
1118 "TEXT takeWhile -> unfused" [1] forall p t.
1119 unstream (S.takeWhile p (stream t)) = takeWhile p t
1120 #-}
1121
1122 -- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
1123 -- 'takeWhile' @p@ @t@. Subject to fusion.
1124 dropWhile :: (Char -> Bool) -> Text -> Text
1125 dropWhile p t@(Text arr off len) = loop 0 0
1126 where loop !i !l | l >= len = empty
1127 | p c = loop (i+d) (l+d)
1128 | otherwise = Text arr (off+i) (len-l)
1129 where Iter c d = iter t i
1130 {-# INLINE [1] dropWhile #-}
1131
1132 {-# RULES
1133 "TEXT dropWhile -> fused" [~1] forall p t.
1134 dropWhile p t = unstream (S.dropWhile p (stream t))
1135 "TEXT dropWhile -> unfused" [1] forall p t.
1136 unstream (S.dropWhile p (stream t)) = dropWhile p t
1137 #-}
1138
1139 -- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
1140 -- dropping characters that satisfy the predicate @p@ from the end of
1141 -- @t@. Subject to fusion.
1142 -- Examples:
1143 --
1144 -- > dropWhileEnd (=='.') "foo..." == "foo"
1145 dropWhileEnd :: (Char -> Bool) -> Text -> Text
1146 dropWhileEnd p t@(Text arr off len) = loop (len-1) len
1147 where loop !i !l | l <= 0 = empty
1148 | p c = loop (i+d) (l+d)
1149 | otherwise = Text arr off l
1150 where (c,d) = reverseIter t i
1151 {-# INLINE [1] dropWhileEnd #-}
1152
1153 {-# RULES
1154 "TEXT dropWhileEnd -> fused" [~1] forall p t.
1155 dropWhileEnd p t = S.reverse (S.dropWhile p (S.reverseStream t))
1156 "TEXT dropWhileEnd -> unfused" [1] forall p t.
1157 S.reverse (S.dropWhile p (S.reverseStream t)) = dropWhileEnd p t
1158 #-}
1159
1160 -- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
1161 -- dropping characters that satisfy the predicate @p@ from both the
1162 -- beginning and end of @t@. Subject to fusion.
1163 dropAround :: (Char -> Bool) -> Text -> Text
1164 dropAround p = dropWhile p . dropWhileEnd p
1165 {-# INLINE [1] dropAround #-}
1166
1167 -- | /O(n)/ Remove leading white space from a string. Equivalent to:
1168 --
1169 -- > dropWhile isSpace
1170 stripStart :: Text -> Text
1171 stripStart = dropWhile isSpace
1172 {-# INLINE [1] stripStart #-}
1173
1174 -- | /O(n)/ Remove trailing white space from a string. Equivalent to:
1175 --
1176 -- > dropWhileEnd isSpace
1177 stripEnd :: Text -> Text
1178 stripEnd = dropWhileEnd isSpace
1179 {-# INLINE [1] stripEnd #-}
1180
1181 -- | /O(n)/ Remove leading and trailing white space from a string.
1182 -- Equivalent to:
1183 --
1184 -- > dropAround isSpace
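--
-- Example:
--
-- > strip "  bar  " == "bar"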
1185 strip :: Text -> Text
1186 strip = dropAround isSpace
1187 {-# INLINE [1] strip #-}
1188
1189 -- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
1190 -- prefix of @t@ of length @n@, and whose second is the remainder of
1191 -- the string. It is equivalent to @('take' n t, 'drop' n t)@.
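--
-- Examples:
--
-- > splitAt 3 "foobar" == ("foo","bar")
-- > splitAt 8 "foobar" == ("foobar","")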
1192 splitAt :: Int -> Text -> (Text, Text)
1193 splitAt n t@(Text arr off len)
1194 | n <= 0 = (empty, t)
1195 | n >= len = (t, empty)
1196 | otherwise = let k = iterN n t
1197 in (text arr off k, text arr (off+k) (len-k))
1198
1199 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
1200 -- a pair whose first element is the longest prefix (possibly empty)
1201 -- of @t@ of elements that satisfy @p@, and whose second is the
1202 -- remainder of @t@.
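--
-- Example:
--
-- > span (=='0') "000abc" == ("000","abc")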
1203 span :: (Char -> Bool) -> Text -> (Text, Text)
1204 span p t = case span_ p t of
1205 (# hd,tl #) -> (hd,tl)
1206 {-# INLINE span #-}
1207
1208 -- | /O(n)/ 'break' is like 'span', but the prefix returned is
1209 -- over elements that fail the predicate @p@.
1210 break :: (Char -> Bool) -> Text -> (Text, Text)
1211 break p = span (not . p)
1212 {-# INLINE break #-}
1213
1214 -- | /O(n)/ Group characters in a string according to a predicate.
1215 groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
1216 groupBy p = loop
1217 where
1218 loop t@(Text arr off len)
1219 | null t = []
1220 | otherwise = text arr off n : loop (text arr (off+n) (len-n))
1221 where Iter c d = iter t 0
1222 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
1223
1224 -- | Returns the /array/ index (in units of 'Word16') at which a
1225 -- character may be found. This is /not/ the same as the logical
1226 -- index returned by e.g. 'findIndex'.
1227 findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
1228 findAIndexOrEnd q t@(Text _arr _off len) = go 0
1229 where go !i | i >= len || q c = i
1230 | otherwise = go (i+d)
1231 where Iter c d = iter t i
1232
1233 -- | /O(n)/ Group characters in a string by equality.
1234 group :: Text -> [Text]
1235 group = groupBy (==)
1236
1237 -- | /O(n)/ Return all initial segments of the given 'Text', shortest
1238 -- first.
1239 inits :: Text -> [Text]
1240 inits t@(Text arr off len) = loop 0
1241 where loop i | i >= len = [t]
1242 | otherwise = Text arr off i : loop (i + iter_ t i)
1243
1244 -- | /O(n)/ Return all final segments of the given 'Text', longest
1245 -- first.
1246 tails :: Text -> [Text]
1247 tails t | null t = [empty]
1248 | otherwise = t : tails (unsafeTail t)
1249
1250 -- $split
1251 --
1252 -- Splitting functions in this library do not perform character-wise
1253 -- copies to create substrings; they just construct new 'Text's that
1254 -- are slices of the original.
1255
1256 -- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
1257 -- argument (which cannot be empty), consuming the delimiter. An empty
1258 -- delimiter is invalid, and will cause an error to be raised.
1259 --
1260 -- Examples:
1261 --
1262 -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
1263 -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
1264 -- > splitOn "x" "x" == ["",""]
1265 --
1266 -- and
1267 --
1268 -- > intercalate s . splitOn s == id
1269 -- > splitOn (singleton c) == split (==c)
1270 --
1271 -- (Note: the string @s@ to split on above cannot be empty.)
1272 --
1273 -- In (unlikely) bad cases, this function's time complexity degrades
1274 -- towards /O(n*m)/.
1275 splitOn :: Text
1276 -- ^ String to split on. If this string is empty, an error
1277 -- will occur.
1278 -> Text
1279 -- ^ Input text.
1280 -> [Text]
1281 splitOn pat@(Text _ _ l) src@(Text arr off len)
1282 | l <= 0 = emptyError "splitOn"
1283 | isSingleton pat = split (== unsafeHead pat) src
1284 | otherwise = go 0 (indices pat src)
1285 where
1286 go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs
1287 go s _ = [text arr (s+off) (len-s)]
1288 {-# INLINE [1] splitOn #-}
1289
1290 {-# RULES
1291 "TEXT splitOn/singleton -> split/==" [~1] forall c t.
1292 splitOn (singleton c) t = split (==c) t
1293 #-}
1294
1295 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
1296 -- where the predicate returns True for a separator element. The
1297 -- resulting components do not contain the separators. Two adjacent
1298 -- separators result in an empty component in the output. eg.
1299 --
1300 -- > split (=='a') "aabbaca" == ["","","bb","c",""]
1301 -- > split (=='a') "" == [""]
1302 split :: (Char -> Bool) -> Text -> [Text]
1303 split _ t@(Text _arr _off 0) = [t]
1304 split p t = loop t
1305 where loop s | null s' = [l]
1306 | otherwise = l : loop (unsafeTail s')
1307 where (# l, s' #) = span_ (not . p) s
1308 {-# INLINE split #-}
1309
1310 -- | /O(n)/ Splits a 'Text' into components of length @k@. The last
1311 -- element may be shorter than the other chunks, depending on the
1312 -- length of the input. Examples:
1313 --
1314 -- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
1315 -- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
1316 chunksOf :: Int -> Text -> [Text]
1317 chunksOf k = go
1318 where
1319 go t = case splitAt k t of
1320 (a,b) | null a -> []
1321 | otherwise -> a : go b
1322 {-# INLINE chunksOf #-}
1323
1324 -- ----------------------------------------------------------------------------
1325 -- * Searching
1326
1327 -------------------------------------------------------------------------------
1328 -- ** Searching with a predicate
1329
1330 -- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
1331 -- returns the first element matching the predicate, or 'Nothing' if
1332 -- there is no such element.
1333 find :: (Char -> Bool) -> Text -> Maybe Char
1334 find p t = S.findBy p (stream t)
1335 {-# INLINE find #-}
1336
1337 -- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
1338 -- and returns the pair of 'Text's with elements which do and do not
1339 -- satisfy the predicate, respectively; i.e.
1340 --
1341 -- > partition p t == (filter p t, filter (not . p) t)
1342 partition :: (Char -> Bool) -> Text -> (Text, Text)
1343 partition p t = (filter p t, filter (not . p) t)
1344 {-# INLINE partition #-}
1345
1346 -- | /O(n)/ 'filter', applied to a predicate and a 'Text',
1347 -- returns a 'Text' containing those characters that satisfy the
1348 -- predicate.
1349 filter :: (Char -> Bool) -> Text -> Text
1350 filter p t = unstream (S.filter p (stream t))
1351 {-# INLINE filter #-}
1352
1353 -- | /O(n+m)/ Find the first instance of @needle@ (which must be
1354 -- non-'null') in @haystack@. The first element of the returned tuple
1355 -- is the prefix of @haystack@ before @needle@ is matched. The second
1356 -- is the remainder of @haystack@, starting with the match.
1357 --
1358 -- Examples:
1359 --
1360 -- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
1361 -- > breakOn "/" "foobar" ==> ("foobar", "")
1362 --
1363 -- Laws:
1364 --
1365 -- > append prefix match == haystack
1366 -- > where (prefix, match) = breakOn needle haystack
1367 --
1368 -- If you need to break a string by a substring repeatedly (e.g. you
1369 -- want to break on every instance of a substring), use 'breakOnAll'
1370 -- instead, as it has lower startup overhead.
1371 --
1372 -- In (unlikely) bad cases, this function's time complexity degrades
1373 -- towards /O(n*m)/.
1374 breakOn :: Text -> Text -> (Text, Text)
1375 breakOn pat src@(Text arr off len)
1376 | null pat = emptyError "breakOn"
1377 | otherwise = case indices pat src of
1378 [] -> (src, empty)
1379 (x:_) -> (text arr off x, text arr (off+x) (len-x))
1380 {-# INLINE breakOn #-}
1381
1382 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
1383 -- string.
1384 --
1385 -- The first element of the returned tuple is the prefix of @haystack@
1386 -- up to and including the last match of @needle@. The second is the
1387 -- remainder of @haystack@, following the match.
1388 --
1389 -- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
1390 breakOnEnd :: Text -> Text -> (Text, Text)
1391 breakOnEnd pat src = (reverse b, reverse a)
1392 where (a,b) = breakOn (reverse pat) (reverse src)
1393 {-# INLINE breakOnEnd #-}
1394
1395 -- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
1396 -- @haystack@. Each element of the returned list consists of a pair:
1397 --
1398 -- * The entire string prior to the /k/th match (i.e. the prefix)
1399 --
1400 -- * The /k/th match, followed by the remainder of the string
1401 --
1402 -- Examples:
1403 --
1404 -- > breakOnAll "::" ""
1405 -- > ==> []
1406 -- > breakOnAll "/" "a/b/c/"
1407 -- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
1408 --
1409 -- In (unlikely) bad cases, this function's time complexity degrades
1410 -- towards /O(n*m)/.
1411 --
1412 -- The @needle@ parameter may not be empty.
1413 breakOnAll :: Text -- ^ @needle@ to search for
1414 -> Text -- ^ @haystack@ in which to search
1415 -> [(Text, Text)]
1416 breakOnAll pat src@(Text arr off slen)
1417 | null pat = emptyError "breakOnAll"
1418 | otherwise = L.map step (indices pat src)
1419 where
1420 step x = (chunk 0 x, chunk x (slen-x))
1421 chunk !n !l = text arr (n+off) l
1422 {-# INLINE breakOnAll #-}
1423
1424 -------------------------------------------------------------------------------
1425 -- ** Indexing 'Text's
1426
1427 -- $index
1428 --
1429 -- If you think of a 'Text' value as an array of 'Char' values (which
1430 -- it is not), you run the risk of writing inefficient code.
1431 --
1432 -- An idiom that is common in some languages is to find the numeric
1433 -- offset of a character or substring, then use that number to split
1434 -- or trim the searched string. With a 'Text' value, this approach
1435 -- would require two /O(n)/ operations: one to perform the search, and
1436 -- one to operate from wherever the search ended.
1437 --
1438 -- For example, suppose you have a string that you want to split on
1439 -- the substring @\"::\"@, such as @\"foo::bar::quux\"@. Instead of
1440 -- searching for the index of @\"::\"@ and taking the substrings
1441 -- before and after that index, you would instead use @breakOnAll \"::\"@.
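--
-- For instance, 'breakOn' performs the search and the split in a single
-- pass:
--
-- > breakOn "::" "foo::bar::quux" == ("foo", "::bar::quux")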
1442
1443 -- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
1444 index :: Text -> Int -> Char
1445 index t n = S.index (stream t) n
1446 {-# INLINE index #-}
1447
1448 -- | /O(n)/ The 'findIndex' function takes a predicate and a 'Text'
1449 -- and returns the index of the first element in the 'Text' satisfying
1450 -- the predicate. Subject to fusion.
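--
-- Example:
--
-- > findIndex (=='b') "abc" == Just 1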
1451 findIndex :: (Char -> Bool) -> Text -> Maybe Int
1452 findIndex p t = S.findIndex p (stream t)
1453 {-# INLINE findIndex #-}
1454
1455 -- | /O(n+m)/ The 'count' function returns the number of times the
1456 -- query string appears in the given 'Text'. An empty query string is
1457 -- invalid, and will cause an error to be raised.
1458 --
1459 -- In (unlikely) bad cases, this function's time complexity degrades
1460 -- towards /O(n*m)/.
1461 count :: Text -> Text -> Int
1462 count pat src
1463 | null pat = emptyError "count"
1464 | isSingleton pat = countChar (unsafeHead pat) src
1465 | otherwise = L.length (indices pat src)
1466 {-# INLINE [1] count #-}
1467
1468 {-# RULES
1469 "TEXT count/singleton -> countChar" [~1] forall c t.
1470 count (singleton c) t = countChar c t
1471 #-}
1472
1473 -- | /O(n)/ The 'countChar' function returns the number of times the
1474 -- query element appears in the given 'Text'. Subject to fusion.
1475 countChar :: Char -> Text -> Int
1476 countChar c t = S.countChar c (stream t)
1477 {-# INLINE countChar #-}
1478
1479 -------------------------------------------------------------------------------
1480 -- * Zipping
1481
1482 -- | /O(n)/ 'zip' takes two 'Text's and returns a list of
1483 -- corresponding pairs of characters. If one input 'Text' is short,
1484 -- excess elements of the longer 'Text' are discarded. This is
1485 -- equivalent to a pair of 'unpack' operations.
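--
-- Example:
--
-- > zip "abc" "xy" == [('a','x'),('b','y')]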
1486 zip :: Text -> Text -> [(Char,Char)]
1487 zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
1488 {-# INLINE zip #-}
1489
1490 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
1491 -- given as the first argument, instead of a tupling function.
1492 -- Performs replacement on invalid scalar values.
1493 zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
1494 zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
1495 where g a b = safe (f a b)
1496 {-# INLINE zipWith #-}
1497
1498 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
1499 -- representing white space.
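--
-- Example:
--
-- > words "  foo bar  baz " == ["foo","bar","baz"]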
1500 words :: Text -> [Text]
1501 words t@(Text arr off len) = loop 0 0
1502 where
1503 loop !start !n
1504 | n >= len = if start == n
1505 then []
1506 else [Text arr (start+off) (n-start)]
1507 | isSpace c =
1508 if start == n
1509 then loop (start+1) (start+1)
1510 else Text arr (start+off) (n-start) : loop (n+d) (n+d)
1511 | otherwise = loop start (n+d)
1512 where Iter c d = iter t n
1513 {-# INLINE words #-}
1514
1515 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
1516 -- newline 'Char's. The resulting strings do not contain newlines.
1517 lines :: Text -> [Text]
1518 lines ps | null ps = []
1519 | otherwise = h : if null t
1520 then []
1521 else lines (unsafeTail t)
1522 where (# h,t #) = span_ (/= '\n') ps
1523 {-# INLINE lines #-}
1524
1525 {-
1526 -- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line
1527 -- boundaries.
1528 --
1529 -- A line boundary is considered to be either a line feed, a carriage
1530 -- return immediately followed by a line feed, or a carriage return.
1531 -- This accounts for both Unix and Windows line ending conventions,
1532 -- and for the old convention used on Mac OS 9 and earlier.
1533 lines' :: Text -> [Text]
1534 lines' ps | null ps = []
1535 | otherwise = h : case uncons t of
1536 Nothing -> []
1537 Just (c,t')
1538 | c == '\n' -> lines t'
1539 | c == '\r' -> case uncons t' of
1540 Just ('\n',t'') -> lines t''
1541 _ -> lines t'
1542 where (h,t) = span notEOL ps
1543 notEOL c = c /= '\n' && c /= '\r'
1544 {-# INLINE lines' #-}
1545 -}
1546
1547 -- | /O(n)/ Joins lines, after appending a terminating newline to
1548 -- each.
1549 unlines :: [Text] -> Text
1550 unlines = concat . L.map (`snoc` '\n')
1551 {-# INLINE unlines #-}
1552
1553 -- | /O(n)/ Joins words using single space characters.
1554 unwords :: [Text] -> Text
1555 unwords = intercalate (singleton ' ')
1556 {-# INLINE unwords #-}
1557
1558 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
1559 -- 'True' iff the first is a prefix of the second. Subject to fusion.
1560 isPrefixOf :: Text -> Text -> Bool
1561 isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
1562 alen <= blen && S.isPrefixOf (stream a) (stream b)
1563 {-# INLINE [1] isPrefixOf #-}
1564
1565 {-# RULES
1566 "TEXT isPrefixOf -> fused" [~1] forall s t.
1567 isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
1568 #-}
1569
1570 -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
1571 -- 'True' iff the first is a suffix of the second.
1572 isSuffixOf :: Text -> Text -> Bool
1573 isSuffixOf a@(Text _aarr _aoff alen) b@(Text barr boff blen) =
1574 d >= 0 && a == b'
1575 where d = blen - alen
1576 b' | d == 0 = b
1577 | otherwise = Text barr (boff+d) alen
1578 {-# INLINE isSuffixOf #-}
1579
1580 -- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
1581 -- 'True' iff the first is contained, wholly and intact, anywhere
1582 -- within the second.
1583 --
1584 -- In (unlikely) bad cases, this function's time complexity degrades
1585 -- towards /O(n*m)/.
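--
-- Examples:
--
-- > isInfixOf "bar" "foobarbaz" == True
-- > isInfixOf "quux" "foobarbaz" == False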
1586 isInfixOf :: Text -> Text -> Bool
1587 isInfixOf needle haystack
1588 | null needle = True
1589 | isSingleton needle = S.elem (unsafeHead needle) . S.stream $ haystack
1590 | otherwise = not . L.null . indices needle $ haystack
1591 {-# INLINE [1] isInfixOf #-}
1592
1593 {-# RULES
1594 "TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
1595 isInfixOf (singleton n) h = S.elem n (S.stream h)
1596 #-}
1597
1598 -------------------------------------------------------------------------------
1599 -- * View patterns
1600
1601 -- | /O(n)/ Return the suffix of the second string if its prefix
1602 -- matches the entire first string.
1603 --
1604 -- Examples:
1605 --
1606 -- > stripPrefix "foo" "foobar" == Just "bar"
1607 -- > stripPrefix "" "baz" == Just "baz"
1608 -- > stripPrefix "foo" "quux" == Nothing
1609 --
1610 -- This is particularly useful with the @ViewPatterns@ extension to
1611 -- GHC, as follows:
1612 --
1613 -- > {-# LANGUAGE ViewPatterns #-}
1614 -- > import Data.Text as T
1615 -- >
1616 -- > fnordLength :: Text -> Int
1617 -- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
1618 -- > fnordLength _ = -1
1619 stripPrefix :: Text -> Text -> Maybe Text
1620 stripPrefix p@(Text _arr _off plen) t@(Text arr off len)
1621 | p `isPrefixOf` t = Just $! text arr (off+plen) (len-plen)
1622 | otherwise = Nothing
1623
1624 -- | /O(n)/ Find the longest non-empty common prefix of two strings
1625 -- and return it, along with the suffixes of each string at which they
1626 -- no longer match.
1627 --
1628 -- If the strings do not have a common prefix or either one is empty,
1629 -- this function returns 'Nothing'.
1630 --
1631 -- Examples:
1632 --
1633 -- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
1634 -- > commonPrefixes "veeble" "fetzer" == Nothing
1635 -- > commonPrefixes "" "baz" == Nothing
1636 commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
1637 commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0
1638 where
1639 go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1)
1640 | i > 0 = Just (Text arr0 off0 i,
1641 text arr0 (off0+i) (len0-i),
1642 text arr1 (off1+j) (len1-j))
1643 | otherwise = Nothing
1644 where Iter a d0 = iter t0 i
1645 Iter b d1 = iter t1 j
1646
1647 -- | /O(n)/ Return the prefix of the second string if its suffix
1648 -- matches the entire first string.
1649 --
1650 -- Examples:
1651 --
1652 -- > stripSuffix "bar" "foobar" == Just "foo"
1653 -- > stripSuffix "" "baz" == Just "baz"
1654 -- > stripSuffix "foo" "quux" == Nothing
1655 --
1656 -- This is particularly useful with the @ViewPatterns@ extension to
1657 -- GHC, as follows:
1658 --
1659 -- > {-# LANGUAGE ViewPatterns #-}
1660 -- > import Data.Text as T
1661 -- >
1662 -- > quuxLength :: Text -> Int
1663 -- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
1664 -- > quuxLength _ = -1
1665 stripSuffix :: Text -> Text -> Maybe Text
1666 stripSuffix p@(Text _arr _off plen) t@(Text arr off len)
1667 | p `isSuffixOf` t = Just $! text arr off (len-plen)
1668 | otherwise = Nothing
1669
1670 -- | Add a list of non-negative numbers. Errors out on overflow.
1671 sumP :: String -> [Int] -> Int
1672 sumP fun = go 0
1673 where go !a (x:xs)
1674 | ax >= 0 = go ax xs
1675 | otherwise = overflowError fun
1676 where ax = a + x
1677 go a _ = a
1678
1679 emptyError :: String -> a
1680 emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input"
1681
1682 overflowError :: String -> a
1683 overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow"
1684
1685 -- | /O(n)/ Make a distinct copy of the given string, sharing no
1686 -- storage with the original string.
1687 --
1688 -- As an example, suppose you read a large string, of which you need
1689 -- only a small portion. If you do not use 'copy', the entire original
1690 -- array will be kept alive in memory by the smaller string. Making a
1691 -- copy \"breaks the link\" to the original array, allowing it to be
1692 -- garbage collected if there are no other live references to it.
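--
-- A minimal sketch (the binding @bigText@ is illustrative):
--
-- > small :: Text
-- > small = copy (take 16 bigText)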
1693 copy :: Text -> Text
1694 copy (Text arr off len) = Text (A.run go) 0 len
1695 where
1696 go :: ST s (A.MArray s)
1697 go = do
1698 marr <- A.new len
1699 A.copyI marr 0 arr off len
1700 return marr