Top-level interfaces are safe, marked trustworthy
[packages/text.git] / Data / Text / Lazy / Encoding.hs
1 {-# LANGUAGE BangPatterns,CPP #-}
2 #if __GLASGOW_HASKELL__ >= 702
3 {-# LANGUAGE Trustworthy #-}
4 #endif
5 -- |
6 -- Module : Data.Text.Lazy.Encoding
7 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan
8 --
9 -- License : BSD-style
10 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
11 -- duncan@haskell.org
12 -- Stability : experimental
13 -- Portability : portable
14 --
15 -- Functions for converting lazy 'Text' values to and from lazy
16 -- 'ByteString', using several standard encodings.
17 --
18 -- To gain access to a much larger variety of encodings, use the
19 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
20
21 module Data.Text.Lazy.Encoding
22 (
23 -- * Decoding ByteStrings to Text
24 -- $strict
25 decodeASCII
26 , decodeUtf8
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
31
32 -- ** Catchable failure
33 , decodeUtf8'
34
35 -- ** Controllable error handling
36 , decodeUtf8With
37 , decodeUtf16LEWith
38 , decodeUtf16BEWith
39 , decodeUtf32LEWith
40 , decodeUtf32BEWith
41
42 -- * Encoding Text to ByteStrings
43 , encodeUtf8
44 , encodeUtf16LE
45 , encodeUtf16BE
46 , encodeUtf32LE
47 , encodeUtf32BE
48 ) where
49
50 import Control.Exception (evaluate, try)
51 import Data.Bits ((.&.))
52 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
53 import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
54 import System.IO.Unsafe (unsafePerformIO)
55 import qualified Data.ByteString as S
56 import qualified Data.ByteString.Lazy as B
57 import qualified Data.ByteString.Lazy.Internal as B
58 import qualified Data.ByteString.Unsafe as S
59 import qualified Data.Text as T
60 import qualified Data.Text.Encoding as TE
61 import qualified Data.Text.Lazy.Encoding.Fusion as E
62 import qualified Data.Text.Lazy.Fusion as F
63
64 -- $strict
65 --
66 -- All of the single-parameter functions for decoding bytestrings
67 -- encoded in one of the Unicode Transformation Formats (UTF) operate
68 -- in a /strict/ mode: each will throw an exception if given invalid
69 -- input.
70 --
71 -- Each function has a variant, whose name is suffixed with -'With',
72 -- that gives greater control over the handling of decoding errors.
73 -- For instance, 'decodeUtf8' will throw an exception, but
74 -- 'decodeUtf8With' allows the programmer to determine what to do on a
75 -- decoding error.
76
-- | /Deprecated/. Historical entry point for decoding a 'ByteString'
-- containing 7-bit ASCII encoded text.
--
-- It is defined as a plain alias for 'decodeUtf8', so the input is
-- decoded as UTF-8.  New code should call 'decodeUtf8' directly.
decodeASCII :: B.ByteString -> Text
decodeASCII bytes = decodeUtf8 bytes
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
84
-- | Decode a 'ByteString' containing UTF-8 encoded text, handing every
-- decoding error to the supplied 'OnDecodeError' handler.
--
-- The input is processed one chunk at a time.  A chunk that ends on a
-- character boundary is passed whole to the strict decoder.  When a
-- chunk ends part-way through a multi-byte sequence, the bytes before
-- that sequence are decoded and the incomplete tail is carried over,
-- growing by one byte at a time from the following chunks until it
-- ends on a character boundary.
--
-- If the input ends while the carried-over tail is still incomplete,
-- the handler is invoked exactly once per leftover byte (with that
-- byte as its argument); any replacement characters it returns are
-- appended to the output.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    decode = TE.decodeUtf8With onErr

    -- Fast path: whole-chunk decoding.
    fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
                        | otherwise    = chunk (decode h) (slow t ps)
      where (h,t) = S.splitAt pivot p
            -- Split just before the nearest byte of the form 11xxxxxx
            -- (a multi-byte lead) among the last two bytes, falling
            -- back to three bytes from the end.  For chunks shorter
            -- than that the pivot is negative, and 'S.splitAt' then
            -- yields an empty prefix with the whole chunk as the tail.
            pivot | at 1      = len-1
                  | at 2      = len-2
                  | otherwise = len-3
            len  = S.length p
            at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
    fast B.Empty = empty

    -- Slow path: @i@ holds the bytes of a sequence that straddles a
    -- chunk boundary.  It is extended byte by byte until it ends on a
    -- character boundary, at which point the fast path resumes.
    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
                case B.uncons bs of
                  Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
                               | otherwise     -> slow i' bs'
                    where i' = S.snoc i w
                  -- End of input with an unfinished sequence: report
                  -- each leftover byte to the handler in order.
                  Nothing -> case S.uncons i of
                               Just (j,i') ->
                                 case onErr desc (Just j) of
                                   Nothing -> slow i' bs
                                   Just c  -> Chunk (T.singleton c) (slow i' bs)
                               -- Every leftover byte has already been
                               -- reported above, so there is nothing
                               -- further to signal.  Calling the
                               -- handler again here would emit one
                               -- spurious extra replacement character
                               -- for truncated input.
                               Nothing -> empty

    -- Does the byte string end on a character boundary?  True when the
    -- last byte is ASCII, or when a lead byte announcing an n-byte
    -- sequence sits exactly n bytes from the end (n = 2, 3, 4).
    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
                    -- An empty chunk is trivially complete.  Checking
                    -- the length first keeps 'S.unsafeIndex' from
                    -- reading before the start of the buffer if a raw
                    -- 'B.Chunk' was built around an empty ByteString.
                    len == 0 ||
                    ix 1 .&. 0x80 == 0 ||
                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
      where len = S.length bs
            -- @ix n@ is the n-th byte counting back from the end.
            ix n = S.unsafeIndex bs (len-n)
    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
{-# INLINE[0] decodeUtf8With #-}
122
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- This is 'decodeUtf8With' fixed to the 'strictDecode' handler.  If
-- the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
-- Written as a lambda so the binding keeps zero left-hand-side
-- arguments and remains inlinable even where it is not applied.
decodeUtf8 = \bytes -> decodeUtf8With strictDecode bytes
{-# INLINE[0] decodeUtf8 #-}
133
134 -- This rule seems to cause performance loss.
135 {- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
136 forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
137
-- | Decode a 'ByteString' containing UTF-8 encoded text, returning
-- failure as a value rather than throwing.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception is returned in 'Left'; otherwise the decoded text is
-- returned in 'Right'.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs = unsafePerformIO (try (evaluate checked))
  where
    decoded = decodeUtf8 bs
    -- Walk every chunk constructor before handing the text back, so
    -- that a decoding exception is raised while 'try' is still active
    -- instead of escaping later from a lazily evaluated tail.
    checked = walk decoded `seq` decoded
    walk Empty          = ()
    walk (Chunk _ rest) = walk rest
{-# INLINE decodeUtf8' #-}
154
-- | Encode text using UTF-8 encoding.
--
-- Every chunk of the lazy 'Text' is encoded on its own with the strict
-- encoder, producing one 'ByteString' chunk per input chunk.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 txt = foldrChunks step B.Empty txt
  where
    step piece rest = B.Chunk (TE.encodeUtf8 piece) rest
158
-- | Decode text from little endian UTF-16 encoding, with decoding
-- errors handled by the supplied 'OnDecodeError' handler.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf16LE onErr bs
{-# INLINE decodeUtf16LEWith #-}
163
-- | Decode text from little endian UTF-16 encoding.
--
-- This is 'decodeUtf16LEWith' fixed to the 'strictDecode' handler: if
-- the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
-- Lambda form keeps zero left-hand-side arguments for the INLINE pragma.
decodeUtf16LE = \bytes -> decodeUtf16LEWith strictDecode bytes
{-# INLINE decodeUtf16LE #-}
172
-- | Decode text from big endian UTF-16 encoding, with decoding errors
-- handled by the supplied 'OnDecodeError' handler.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf16BE onErr bs
{-# INLINE decodeUtf16BEWith #-}
177
-- | Decode text from big endian UTF-16 encoding.
--
-- This is 'decodeUtf16BEWith' fixed to the 'strictDecode' handler: if
-- the input contains any invalid big endian UTF-16 data, an exception
-- will be thrown.  For more control over the handling of invalid
-- data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
-- Lambda form keeps zero left-hand-side arguments for the INLINE pragma.
decodeUtf16BE = \bytes -> decodeUtf16BEWith strictDecode bytes
{-# INLINE decodeUtf16BE #-}
186
-- | Encode text using little endian UTF-16 encoding.
--
-- Each chunk of the lazy 'Text' is encoded separately with the strict
-- encoder and the results are assembled with 'B.fromChunks'.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt =
    B.fromChunks (foldrChunks (\piece rest -> TE.encodeUtf16LE piece : rest) [] txt)
{-# INLINE encodeUtf16LE #-}
191
-- | Encode text using big endian UTF-16 encoding.
--
-- Each chunk of the lazy 'Text' is encoded separately with the strict
-- encoder and the results are assembled with 'B.fromChunks'.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt =
    B.fromChunks (foldrChunks (\piece rest -> TE.encodeUtf16BE piece : rest) [] txt)
{-# INLINE encodeUtf16BE #-}
196
-- | Decode text from little endian UTF-32 encoding, with decoding
-- errors handled by the supplied 'OnDecodeError' handler.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf32LE onErr bs
{-# INLINE decodeUtf32LEWith #-}
201
-- | Decode text from little endian UTF-32 encoding.
--
-- This is 'decodeUtf32LEWith' fixed to the 'strictDecode' handler: if
-- the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
-- Lambda form keeps zero left-hand-side arguments for the INLINE pragma.
decodeUtf32LE = \bytes -> decodeUtf32LEWith strictDecode bytes
{-# INLINE decodeUtf32LE #-}
210
-- | Decode text from big endian UTF-32 encoding, with decoding errors
-- handled by the supplied 'OnDecodeError' handler.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf32BE onErr bs
{-# INLINE decodeUtf32BEWith #-}
215
-- | Decode text from big endian UTF-32 encoding.
--
-- This is 'decodeUtf32BEWith' fixed to the 'strictDecode' handler: if
-- the input contains any invalid big endian UTF-32 data, an exception
-- will be thrown.  For more control over the handling of invalid
-- data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
-- Lambda form keeps zero left-hand-side arguments for the INLINE pragma.
decodeUtf32BE = \bytes -> decodeUtf32BEWith strictDecode bytes
{-# INLINE decodeUtf32BE #-}
224
-- | Encode text using little endian UTF-32 encoding.
--
-- Each chunk of the lazy 'Text' is encoded separately with the strict
-- encoder and the results are assembled with 'B.fromChunks'.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt =
    B.fromChunks (foldrChunks (\piece rest -> TE.encodeUtf32LE piece : rest) [] txt)
{-# INLINE encodeUtf32LE #-}
229
-- | Encode text using big endian UTF-32 encoding.
--
-- Each chunk of the lazy 'Text' is encoded separately with the strict
-- encoder and the results are assembled with 'B.fromChunks'.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt =
    B.fromChunks (foldrChunks (\piece rest -> TE.encodeUtf32BE piece : rest) [] txt)
{-# INLINE encodeUtf32BE #-}