Merge branch 'master' into feature-new-bytestring-builder
[packages/text.git] / Data / Text / Lazy / Encoding.hs
1 {-# LANGUAGE BangPatterns,CPP #-}
2 #if __GLASGOW_HASKELL__ >= 702
3 {-# LANGUAGE Trustworthy #-}
4 #endif
5 -- |
6 -- Module : Data.Text.Lazy.Encoding
7 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan
8 --
9 -- License : BSD-style
10 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
11 -- duncan@haskell.org
12 -- Stability : experimental
13 -- Portability : portable
14 --
15 -- Functions for converting lazy 'Text' values to and from lazy
16 -- 'ByteString', using several standard encodings.
17 --
18 -- To gain access to a much larger variety of encodings, use the
19 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
20
21 module Data.Text.Lazy.Encoding
22 (
23 -- * Decoding ByteStrings to Text
24 -- $strict
25 decodeASCII
26 , decodeLatin1
27 , decodeUtf8
28 , decodeUtf16LE
29 , decodeUtf16BE
30 , decodeUtf32LE
31 , decodeUtf32BE
32
33 -- ** Catchable failure
34 , decodeUtf8'
35
36 -- ** Controllable error handling
37 , decodeUtf8With
38 , decodeUtf16LEWith
39 , decodeUtf16BEWith
40 , decodeUtf32LEWith
41 , decodeUtf32BEWith
42
43 -- * Encoding Text to ByteStrings
44 , encodeUtf8
45 , encodeUtf16LE
46 , encodeUtf16BE
47 , encodeUtf32LE
48 , encodeUtf32BE
49 ) where
50
51 import Control.Exception (evaluate, try)
52 import Data.Bits ((.&.))
53 import Data.Monoid (mempty, (<>))
54 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
55 import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
56 import qualified Data.ByteString as S
57 import qualified Data.ByteString.Lazy as B
58 import qualified Data.ByteString.Lazy.Internal as B
59 import qualified Data.ByteString.Unsafe as S
60 import qualified Data.ByteString.Builder as B
61 import qualified Data.ByteString.Builder.Prim as BP
62 import qualified Data.Text as T
63 import qualified Data.Text.Encoding as TE
64 import qualified Data.Text.Lazy.Encoding.Fusion as E
65 import qualified Data.Text.Lazy.Fusion as F
66 import Data.Text.Unsafe (unsafeDupablePerformIO)
67
68 -- $strict
69 --
70 -- All of the single-parameter functions for decoding bytestrings
71 -- encoded in one of the Unicode Transformation Formats (UTF) operate
72 -- in a /strict/ mode: each will throw an exception if given invalid
73 -- input.
74 --
75 -- Each function has a variant, whose name is suffixed with -'With',
76 -- that gives greater control over the handling of decoding errors.
77 -- For instance, 'decodeUtf8' will throw an exception, but
78 -- 'decodeUtf8With' allows the programmer to determine what to do on a
79 -- decoding error.
80
-- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- This function is deprecated.  Use 'decodeUtf8' instead: this
-- function is merely another name for it, and 7-bit ASCII is a subset
-- of UTF-8.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
88
-- | Decode a lazy 'ByteString' holding Latin-1 (also known as
-- ISO-8859-1) encoded text.
--
-- Each strict chunk of the input is decoded on its own with the strict
-- 'TE.decodeLatin1', and the decoded chunks are joined into a lazy
-- 'Text'.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 lbs = foldr step empty (B.toChunks lbs)
  where
    -- Decode one strict chunk and put it in front of the remaining text.
    step strictBytes rest = chunk (TE.decodeLatin1 strictBytes) rest
92
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- The input is decoded one strict chunk at a time with the strict
-- 'TE.decodeUtf8With', so the result is produced lazily.  A multi-byte
-- sequence that straddles a chunk boundary is reassembled before it is
-- handed to the strict decoder.
--
-- The first argument is the handler that decides what to do with
-- invalid input.  If the input ends in the middle of a sequence, the
-- leftover bytes are reported to the handler one at a time, each as
-- @Just byte@.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    decode = TE.decodeUtf8With onErr

    -- Fast path: we are at a chunk boundary with no bytes pending.
    -- If the chunk ends on a character boundary it is decoded whole;
    -- otherwise the part before the split point is decoded and the
    -- tail is carried over to the slow path.
    fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
                        | otherwise    = chunk (decode h) (slow t ps)
      where (h,t) = S.splitAt pivot p
            -- Split just before a lead byte found in one of the last
            -- two positions, else three bytes from the end.  For very
            -- short chunks 'pivot' can be negative; 'S.splitAt' then
            -- yields an empty prefix and the whole chunk is carried.
            pivot | at 1      = len-1
                  | at 2      = len-2
                  | otherwise = len-3
            len  = S.length p
            -- Does the n-th byte from the end have both top bits set,
            -- i.e. is it the lead byte of a multi-byte sequence?
            at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
    fast B.Empty = empty

    -- Slow path: 'i' holds bytes carried over from earlier input.
    -- Bytes are appended one at a time until the buffer ends on a
    -- character boundary, at which point it is decoded and the fast
    -- path resumes.
    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
                case B.uncons bs of
                  Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
                               | otherwise     -> slow i' bs'
                    where i' = S.snoc i w
                  -- Input exhausted with bytes still pending: report
                  -- each of them to the handler in turn.
                  Nothing -> case S.uncons i of
                               Just (j,i') ->
                                 case onErr desc (Just j) of
                                   Nothing -> slow i' bs
                                   Just c  -> Chunk (T.singleton c) (slow i' bs)
                               -- Every pending byte has already been
                               -- reported individually, so there is
                               -- nothing further to signal.
                               Nothing -> empty

    -- A buffer counts as ending on a character boundary when its last
    -- byte is ASCII, or when the lead byte of a 2-, 3- or 4-byte
    -- sequence sits exactly 2, 3 or 4 bytes from the end.  The last
    -- byte is read without a bounds check, so the buffer must be
    -- non-empty; both call sites above rely on that.
    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
                    ix 1 .&. 0x80 == 0 ||
                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
      where len = S.length bs
            ix n = S.unsafeIndex bs (len-n)

    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
{-# INLINE[0] decodeUtf8With #-}
130
-- | Decode a 'ByteString' of UTF-8 encoded text that is known to be
-- valid.
--
-- Should the input contain invalid UTF-8 data after all, an exception
-- is thrown, and that exception cannot be caught in pure code.  When
-- invalid data must be dealt with, use 'decodeUtf8'' or
-- 'decodeUtf8With' instead.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}
141
142 -- This rule seems to cause performance loss.
143 {- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
144 forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
145
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- On success the decoded text is returned; if the input contains any
-- invalid UTF-8 data, the relevant exception is returned instead.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs =
    unsafeDupablePerformIO (try (evaluate (forceSpine decoded `seq` decoded)))
  where
    decoded = decodeUtf8 bs
    -- Walk every chunk of the result so that a decoding exception is
    -- raised while still inside 'try', not later by the consumer.
    forceSpine Empty          = ()
    forceSpine (Chunk _ rest) = forceSpine rest
{-# INLINE decodeUtf8' #-}
162
-- | Encode text using UTF-8 encoding.
--
-- Each strict chunk of the input is converted to a builder with
-- 'TE.encodeUtf8Escaped'; the builders are appended in order and run
-- with 'B.toLazyByteString' to produce the lazy 'ByteString'.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 = B.toLazyByteString . foldrChunks step mempty
  where
    -- NOTE(review): the primitive handed to 'TE.encodeUtf8Escaped' is
    -- presumably the one it uses for bytes it may escape; 'BP.word8'
    -- writes a byte unchanged, so no escaping is expected here.
    -- Confirm against 'TE.encodeUtf8Escaped'.
    step c rest =
      TE.encodeUtf8Escaped (BP.liftFixedToBounded BP.word8) c <> rest
170
171
-- | Decode text from little endian UTF-16 encoding.  The supplied
-- handler is passed on to the underlying stream decoder.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr = F.unstream . E.streamUtf16LE onErr
{-# INLINE decodeUtf16LEWith #-}
176
-- | Decode text from little endian UTF-16 encoding.
--
-- Invalid little endian UTF-16 data in the input causes an exception
-- to be thrown.  To control how invalid data is handled, use
-- 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
185
-- | Decode text from big endian UTF-16 encoding.  The supplied handler
-- is passed on to the underlying stream decoder.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr = F.unstream . E.streamUtf16BE onErr
{-# INLINE decodeUtf16BEWith #-}
190
-- | Decode text from big endian UTF-16 encoding.
--
-- Invalid big endian UTF-16 data in the input causes an exception to
-- be thrown.  To control how invalid data is handled, use
-- 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
199
-- | Encode text using little endian UTF-16 encoding.
--
-- Every strict chunk of the input is encoded separately and becomes
-- one chunk of the resulting lazy 'ByteString'.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE =
    B.fromChunks . foldrChunks (\t rest -> TE.encodeUtf16LE t : rest) []
{-# INLINE encodeUtf16LE #-}
204
-- | Encode text using big endian UTF-16 encoding.
--
-- Every strict chunk of the input is encoded separately and becomes
-- one chunk of the resulting lazy 'ByteString'.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE =
    B.fromChunks . foldrChunks (\t rest -> TE.encodeUtf16BE t : rest) []
{-# INLINE encodeUtf16BE #-}
209
-- | Decode text from little endian UTF-32 encoding.  The supplied
-- handler is passed on to the underlying stream decoder.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr = F.unstream . E.streamUtf32LE onErr
{-# INLINE decodeUtf32LEWith #-}
214
-- | Decode text from little endian UTF-32 encoding.
--
-- Invalid little endian UTF-32 data in the input causes an exception
-- to be thrown.  To control how invalid data is handled, use
-- 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
223
-- | Decode text from big endian UTF-32 encoding.  The supplied handler
-- is passed on to the underlying stream decoder.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr = F.unstream . E.streamUtf32BE onErr
{-# INLINE decodeUtf32BEWith #-}
228
-- | Decode text from big endian UTF-32 encoding.
--
-- Invalid big endian UTF-32 data in the input causes an exception to
-- be thrown.  To control how invalid data is handled, use
-- 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
237
-- | Encode text using little endian UTF-32 encoding.
--
-- Every strict chunk of the input is encoded separately and becomes
-- one chunk of the resulting lazy 'ByteString'.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE =
    B.fromChunks . foldrChunks (\t rest -> TE.encodeUtf32LE t : rest) []
{-# INLINE encodeUtf32LE #-}
242
-- | Encode text using big endian UTF-32 encoding.
--
-- Every strict chunk of the input is encoded separately and becomes
-- one chunk of the resulting lazy 'ByteString'.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE =
    B.fromChunks . foldrChunks (\t rest -> TE.encodeUtf32BE t : rest) []
{-# INLINE encodeUtf32BE #-}