Use unsafeDupablePerformIO where possible
[packages/text.git] / Data / Text / Encoding.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, MagicHash,
2 UnliftedFFITypes #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 -- |
7 -- Module : Data.Text.Encoding
8 -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
9 -- (c) 2009 Duncan Coutts,
10 -- (c) 2008, 2009 Tom Harper
11 --
12 -- License : BSD-style
13 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
14 -- duncan@haskell.org
15 -- Stability : experimental
16 -- Portability : portable
17 --
18 -- Functions for converting 'Text' values to and from 'ByteString',
19 -- using several standard encodings.
20 --
21 -- To gain access to a much larger family of encodings, use the
22 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
23
24 module Data.Text.Encoding
25 (
26 -- * Decoding ByteStrings to Text
27 -- $strict
28 decodeASCII
29 , decodeLatin1
30 , decodeUtf8
31 , decodeUtf16LE
32 , decodeUtf16BE
33 , decodeUtf32LE
34 , decodeUtf32BE
35
36 -- ** Catchable failure
37 , decodeUtf8'
38
39 -- ** Controllable error handling
40 , decodeUtf8With
41 , decodeUtf16LEWith
42 , decodeUtf16BEWith
43 , decodeUtf32LEWith
44 , decodeUtf32BEWith
45
46 -- * Encoding Text to ByteStrings
47 , encodeUtf8
48 , encodeUtf16LE
49 , encodeUtf16BE
50 , encodeUtf32LE
51 , encodeUtf32BE
52 ) where
53
54 import Control.Exception (evaluate, try)
55 #if __GLASGOW_HASKELL__ >= 702
56 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
57 #else
58 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
59 #endif
60 import Data.Bits ((.&.))
61 import Data.ByteString as B
62 import Data.ByteString.Internal as B
63 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
64 import Data.Text.Internal (Text(..), safe, textP)
65 import Data.Text.Private (runText)
66 import Data.Text.UnsafeChar (ord, unsafeWrite)
67 import Data.Text.UnsafeShift (shiftL, shiftR)
68 import Data.Word (Word8)
69 import Foreign.C.Types (CSize)
70 import Foreign.ForeignPtr (withForeignPtr)
71 import Foreign.Marshal.Utils (with)
72 import Foreign.Ptr (Ptr, minusPtr, plusPtr)
73 import Foreign.Storable (peek, poke)
74 import GHC.Base (MutableByteArray#)
75 import qualified Data.Text.Array as A
76 import qualified Data.Text.Encoding.Fusion as E
77 import qualified Data.Text.Encoding.Utf16 as U16
78 import qualified Data.Text.Fusion as F
79 import Data.Text.Unsafe (unsafeDupablePerformIO)
80
81 -- $strict
82 --
83 -- All of the single-parameter functions for decoding bytestrings
84 -- encoded in one of the Unicode Transformation Formats (UTF) operate
85 -- in a /strict/ mode: each will throw an exception if given invalid
86 -- input.
87 --
88 -- Each function has a variant, whose name is suffixed with -'With',
89 -- that gives greater control over the handling of decoding errors.
90 -- For instance, 'decodeUtf8' will throw an exception, but
91 -- 'decodeUtf8With' allows the programmer to determine what to do on a
92 -- decoding error.
93
94 -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
95 -- encoded text.
96 --
97 -- This function is deprecated. Use 'decodeLatin1' instead.
98 decodeASCII :: ByteString -> Text
99 decodeASCII = decodeUtf8
100 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
101
102 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
103 --
104 -- 'decodeLatin1' is semantically equivalent to
105 -- @Data.Text.pack . Data.ByteString.Char8.unpack@
106 decodeLatin1 :: ByteString -> Text
107 decodeLatin1 (PS fp off len) = textP a 0 len
108 where
109 a = A.run (A.new len >>= unsafeIOToST . go)
110 go dest = withForeignPtr fp $ \ptr -> do
111 c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
112 return dest
113
114 -- | Decode a 'ByteString' containing UTF-8 encoded text.
115 decodeUtf8With :: OnDecodeError -> ByteString -> Text
116 decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
117 let go dest = withForeignPtr fp $ \ptr ->
118 with (0::CSize) $ \destOffPtr -> do
119 let end = ptr `plusPtr` (off + len)
120 loop curPtr = do
121 curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
122 if curPtr' == end
123 then do
124 n <- peek destOffPtr
125 unsafeSTToIO (done dest (fromIntegral n))
126 else do
127 x <- peek curPtr'
128 case onErr desc (Just x) of
129 Nothing -> loop $ curPtr' `plusPtr` 1
130 Just c -> do
131 destOff <- peek destOffPtr
132 w <- unsafeSTToIO $
133 unsafeWrite dest (fromIntegral destOff) (safe c)
134 poke destOffPtr (destOff + fromIntegral w)
135 loop $ curPtr' `plusPtr` 1
136 loop (ptr `plusPtr` off)
137 (unsafeIOToST . go) =<< A.new len
138 where
139 desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
140 {- INLINE[0] decodeUtf8With #-}
141
142 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
143 -- to be valid.
144 --
145 -- If the input contains any invalid UTF-8 data, an exception will be
146 -- thrown that cannot be caught in pure code. For more control over
147 -- the handling of invalid data, use 'decodeUtf8'' or
148 -- 'decodeUtf8With'.
149 decodeUtf8 :: ByteString -> Text
150 decodeUtf8 = decodeUtf8With strictDecode
151 {-# INLINE[0] decodeUtf8 #-}
152 {-# RULES "STREAM stream/decodeUtf8 fusion" [1]
153 forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
154
155 -- | Decode a 'ByteString' containing UTF-8 encoded text..
156 --
157 -- If the input contains any invalid UTF-8 data, the relevant
158 -- exception will be returned, otherwise the decoded text.
159 decodeUtf8' :: ByteString -> Either UnicodeException Text
160 decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode
161 {-# INLINE decodeUtf8' #-}
162
163 -- | Encode text using UTF-8 encoding.
164 encodeUtf8 :: Text -> ByteString
165 encodeUtf8 (Text arr off len) = unsafeDupablePerformIO $ do
166 let size0 = max len 4
167 mallocByteString size0 >>= start size0 off 0
168 where
169 start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
170 where
171 loop n1 m1 ptr = go n1 m1
172 where
173 offLen = off + len
174 go !n !m
175 | n == offLen = return (PS fp 0 m)
176 | otherwise = do
177 let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
178 ensure k act
179 | size-m >= k = act
180 | otherwise = {-# SCC "resizeUtf8/ensure" #-} do
181 let newSize = size `shiftL` 1
182 fp' <- mallocByteString newSize
183 withForeignPtr fp' $ \ptr' ->
184 memcpy ptr' ptr (fromIntegral m)
185 start newSize n m fp'
186 {-# INLINE ensure #-}
187 case A.unsafeIndex arr n of
188 w| w <= 0x7F -> ensure 1 $ do
189 poke (ptr `plusPtr` m) (fromIntegral w :: Word8)
190 -- A single ASCII octet is likely to start a run of
191 -- them. We see better performance when we
192 -- special-case this assumption.
193 let end = ptr `plusPtr` size
194 ascii !t !u
195 | t == offLen || u == end || v >= 0x80 =
196 go t (u `minusPtr` ptr)
197 | otherwise = do
198 poke u (fromIntegral v :: Word8)
199 ascii (t+1) (u `plusPtr` 1)
200 where v = A.unsafeIndex arr t
201 ascii (n+1) (ptr `plusPtr` (m+1))
202 | w <= 0x7FF -> ensure 2 $ do
203 poke8 m $ (w `shiftR` 6) + 0xC0
204 poke8 (m+1) $ (w .&. 0x3f) + 0x80
205 go (n+1) (m+2)
206 | 0xD800 <= w && w <= 0xDBFF -> ensure 4 $ do
207 let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
208 poke8 m $ (c `shiftR` 18) + 0xF0
209 poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
210 poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
211 poke8 (m+3) $ (c .&. 0x3F) + 0x80
212 go (n+2) (m+4)
213 | otherwise -> ensure 3 $ do
214 poke8 m $ (w `shiftR` 12) + 0xE0
215 poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
216 poke8 (m+2) $ (w .&. 0x3F) + 0x80
217 go (n+1) (m+3)
218
219 -- | Decode text from little endian UTF-16 encoding.
220 decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
221 decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
222 {-# INLINE decodeUtf16LEWith #-}
223
224 -- | Decode text from little endian UTF-16 encoding.
225 --
226 -- If the input contains any invalid little endian UTF-16 data, an
227 -- exception will be thrown. For more control over the handling of
228 -- invalid data, use 'decodeUtf16LEWith'.
229 decodeUtf16LE :: ByteString -> Text
230 decodeUtf16LE = decodeUtf16LEWith strictDecode
231 {-# INLINE decodeUtf16LE #-}
232
233 -- | Decode text from big endian UTF-16 encoding.
234 decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
235 decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
236 {-# INLINE decodeUtf16BEWith #-}
237
238 -- | Decode text from big endian UTF-16 encoding.
239 --
240 -- If the input contains any invalid big endian UTF-16 data, an
241 -- exception will be thrown. For more control over the handling of
242 -- invalid data, use 'decodeUtf16BEWith'.
243 decodeUtf16BE :: ByteString -> Text
244 decodeUtf16BE = decodeUtf16BEWith strictDecode
245 {-# INLINE decodeUtf16BE #-}
246
247 -- | Encode text using little endian UTF-16 encoding.
248 encodeUtf16LE :: Text -> ByteString
249 encodeUtf16LE txt = E.unstream (E.restreamUtf16LE (F.stream txt))
250 {-# INLINE encodeUtf16LE #-}
251
252 -- | Encode text using big endian UTF-16 encoding.
253 encodeUtf16BE :: Text -> ByteString
254 encodeUtf16BE txt = E.unstream (E.restreamUtf16BE (F.stream txt))
255 {-# INLINE encodeUtf16BE #-}
256
257 -- | Decode text from little endian UTF-32 encoding.
258 decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
259 decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
260 {-# INLINE decodeUtf32LEWith #-}
261
262 -- | Decode text from little endian UTF-32 encoding.
263 --
264 -- If the input contains any invalid little endian UTF-32 data, an
265 -- exception will be thrown. For more control over the handling of
266 -- invalid data, use 'decodeUtf32LEWith'.
267 decodeUtf32LE :: ByteString -> Text
268 decodeUtf32LE = decodeUtf32LEWith strictDecode
269 {-# INLINE decodeUtf32LE #-}
270
271 -- | Decode text from big endian UTF-32 encoding.
272 decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
273 decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
274 {-# INLINE decodeUtf32BEWith #-}
275
276 -- | Decode text from big endian UTF-32 encoding.
277 --
278 -- If the input contains any invalid big endian UTF-32 data, an
279 -- exception will be thrown. For more control over the handling of
280 -- invalid data, use 'decodeUtf32BEWith'.
281 decodeUtf32BE :: ByteString -> Text
282 decodeUtf32BE = decodeUtf32BEWith strictDecode
283 {-# INLINE decodeUtf32BE #-}
284
285 -- | Encode text using little endian UTF-32 encoding.
286 encodeUtf32LE :: Text -> ByteString
287 encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
288 {-# INLINE encodeUtf32LE #-}
289
290 -- | Encode text using big endian UTF-32 encoding.
291 encodeUtf32BE :: Text -> ByteString
292 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
293 {-# INLINE encodeUtf32BE #-}
294
295 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
296 :: MutableByteArray# s -> Ptr CSize
297 -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
298
299 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
300 :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()