Merge branch 'master' of https://github.com/bos/text
[packages/text.git] / Data / Text / Encoding.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3 #if __GLASGOW_HASKELL__ >= 702
4 {-# LANGUAGE Trustworthy #-}
5 #endif
6 -- |
7 -- Module : Data.Text.Encoding
8 -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
9 -- (c) 2009 Duncan Coutts,
10 -- (c) 2008, 2009 Tom Harper
11 --
12 -- License : BSD-style
13 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
14 -- duncan@haskell.org
15 -- Stability : experimental
16 -- Portability : portable
17 --
18 -- Functions for converting 'Text' values to and from 'ByteString',
19 -- using several standard encodings.
20 --
21 -- To gain access to a much larger family of encodings, use the
22 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
23
24 module Data.Text.Encoding
25 (
26 -- * Decoding ByteStrings to Text
27 -- $strict
28 decodeASCII
29 , decodeLatin1
30 , decodeUtf8
31 , decodeUtf16LE
32 , decodeUtf16BE
33 , decodeUtf32LE
34 , decodeUtf32BE
35
36 -- ** Catchable failure
37 , decodeUtf8'
38
39 -- ** Controllable error handling
40 , decodeUtf8With
41 , decodeUtf16LEWith
42 , decodeUtf16BEWith
43 , decodeUtf32LEWith
44 , decodeUtf32BEWith
45
46 -- ** Stream oriented decoding
47 -- $stream
48 , streamDecodeUtf8
49 , streamDecodeUtf8With
50 , Decoding(..)
51
52 -- * Encoding Text to ByteStrings
53 , encodeUtf8
54 , encodeUtf16LE
55 , encodeUtf16BE
56 , encodeUtf32LE
57 , encodeUtf32BE
58 ) where
59
60 import Control.Exception (evaluate, try)
61 #if __GLASGOW_HASKELL__ >= 702
62 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
63 #else
64 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
65 #endif
66 import Control.Monad.ST (runST)
67 import Data.Bits ((.&.))
68 import Data.ByteString as B
69 import Data.ByteString.Internal as B
70 import Data.Text ()
71 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
72 import Data.Text.Internal (Text(..), safe, textP)
73 import Data.Text.Private (runText)
74 import Data.Text.UnsafeChar (ord, unsafeWrite)
75 import Data.Text.UnsafeShift (shiftL, shiftR)
76 import Data.Word (Word8, Word32)
77 import Foreign.C.Types (CSize)
78 import Foreign.ForeignPtr (withForeignPtr)
79 import Foreign.Marshal.Utils (with)
80 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
81 import Foreign.Storable (Storable, peek, poke)
82 import GHC.Base (MutableByteArray#)
83 import qualified Data.Text.Array as A
84 import qualified Data.Text.Encoding.Fusion as E
85 import qualified Data.Text.Encoding.Utf16 as U16
86 import qualified Data.Text.Fusion as F
87 import Data.Text.Unsafe (unsafeDupablePerformIO)
88
89 #include "text_cbits.h"
90
91 -- $strict
92 --
93 -- All of the single-parameter functions for decoding bytestrings
94 -- encoded in one of the Unicode Transformation Formats (UTF) operate
95 -- in a /strict/ mode: each will throw an exception if given invalid
96 -- input.
97 --
98 -- Each function has a variant, whose name is suffixed with -'With',
99 -- that gives greater control over the handling of decoding errors.
100 -- For instance, 'decodeUtf8' will throw an exception, but
101 -- 'decodeUtf8With' allows the programmer to determine what to do on a
102 -- decoding error.
103
104 -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
105 -- encoded text.
106 --
107 -- This function is deprecated. Use 'decodeLatin1' instead.
108 decodeASCII :: ByteString -> Text
109 decodeASCII = decodeUtf8
110 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
111
112 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
113 --
114 -- 'decodeLatin1' is semantically equivalent to
115 -- @Data.Text.pack . Data.ByteString.Char8.unpack@
116 decodeLatin1 :: ByteString -> Text
117 decodeLatin1 (PS fp off len) = textP a 0 len
118 where
119 a = A.run (A.new len >>= unsafeIOToST . go)
120 go dest = withForeignPtr fp $ \ptr -> do
121 c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
122 return dest
123
124 -- | Decode a 'ByteString' containing UTF-8 encoded text.
125 decodeUtf8With :: OnDecodeError -> ByteString -> Text
126 decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
127 let go dest = withForeignPtr fp $ \ptr ->
128 with (0::CSize) $ \destOffPtr -> do
129 let end = ptr `plusPtr` (off + len)
130 loop curPtr = do
131 curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
132 if curPtr' == end
133 then do
134 n <- peek destOffPtr
135 unsafeSTToIO (done dest (fromIntegral n))
136 else do
137 x <- peek curPtr'
138 case onErr desc (Just x) of
139 Nothing -> loop $ curPtr' `plusPtr` 1
140 Just c -> do
141 destOff <- peek destOffPtr
142 w <- unsafeSTToIO $
143 unsafeWrite dest (fromIntegral destOff) (safe c)
144 poke destOffPtr (destOff + fromIntegral w)
145 loop $ curPtr' `plusPtr` 1
146 loop (ptr `plusPtr` off)
147 (unsafeIOToST . go) =<< A.new len
148 where
149 desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
150 {- INLINE[0] decodeUtf8With #-}
151
152 -- $stream
153 --
154 -- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
155 -- a 'ByteString' that represents a possibly incomplete input (e.g. a
156 -- packet from a network stream) that may not end on a UTF-8 boundary.
157 --
158 -- The first element of the result is the maximal chunk of 'Text' that
159 -- can be decoded from the given input. The second is a function which
160 -- accepts another 'ByteString'. That string will be assumed to
161 -- directly follow the string that was passed as input to the original
162 -- function, and it will in turn be decoded.
163 --
164 -- To help understand the use of these functions, consider the Unicode
165 -- string @\"hi &#9731;\"@. If encoded as UTF-8, this becomes @\"hi
166 -- \\xe2\\x98\\x83\"@; the final @\'&#9731;\'@ is encoded as 3 bytes.
167 --
168 -- Now suppose that we receive this encoded string as 3 packets that
169 -- are split up on untidy boundaries: @[\"hi \\xe2\", \"\\x98\",
170 -- \"\\x83\"]@. We cannot decode the entire Unicode string until we
171 -- have received all three packets, but we would like to make progress
172 -- as we receive each one.
173 --
174 -- @
175 -- let 'Some' t0 f0 = 'streamDecodeUtf8' \"hi \\xe2\"
176 -- t0 == \"hi \" :: 'Text'
177 -- @
178 --
179 -- We use the continuation @f0@ to decode our second packet.
180 --
181 -- @
182 -- let 'Some' t1 f1 = f0 \"\\x98\"
183 -- t1 == \"\"
184 -- @
185 --
186 -- We could not give @f0@ enough input to decode anything, so it
187 -- returned an empty string. Once we feed our second continuation @f1@
188 -- the last byte of input, it will make progress.
189 --
190 -- @
191 -- let 'Some' t2 f2 = f1 \"\\x83\"
192 -- t2 == \"&#9731;\"
193 -- @
194 --
195 -- If given invalid input, an exception will be thrown by the function
196 -- or continuation where it is encountered.
197
198 -- | A stream oriented decoding result.
199 data Decoding = Some Text ByteString (ByteString -> Decoding)
200
201 instance Show Decoding where
202 showsPrec d (Some t bs _) = showParen (d > prec) $
203 showString "Some " . showsPrec prec' t .
204 showChar ' ' . showsPrec prec' bs .
205 showString " _"
206 where prec = 10; prec' = prec + 1
207
208 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
209 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
210
211 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
212 -- encoded text that is known to be valid.
213 --
214 -- If the input contains any invalid UTF-8 data, an exception will be
215 -- thrown (either by this function or a continuation) that cannot be
216 -- caught in pure code. For more control over the handling of invalid
217 -- data, use 'streamDecodeUtf8With'.
218 streamDecodeUtf8 :: ByteString -> Decoding
219 streamDecodeUtf8 = streamDecodeUtf8With strictDecode
220
221 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
222 -- encoded text.
223 streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
224 streamDecodeUtf8With onErr = decodeChunk 0 0
225 where
226 -- We create a slightly larger than necessary buffer to accommodate a
227 -- potential surrogate pair started in the last buffer
228 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
229 decodeChunk codepoint0 state0 bs@(PS fp off len) =
230 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
231 where
232 decodeChunkToBuffer :: A.MArray s -> IO Decoding
233 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
234 with (0::CSize) $ \destOffPtr ->
235 with codepoint0 $ \codepointPtr ->
236 with state0 $ \statePtr ->
237 with nullPtr $ \curPtrPtr ->
238 let end = ptr `plusPtr` (off + len)
239 loop curPtr = do
240 poke curPtrPtr curPtr
241 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
242 curPtrPtr end codepointPtr statePtr
243 state <- peek statePtr
244 case state of
245 UTF8_REJECT -> do
246 -- We encountered an encoding error
247 x <- peek curPtr'
248 case onErr desc (Just x) of
249 Nothing -> loop $ curPtr' `plusPtr` 1
250 Just c -> do
251 destOff <- peek destOffPtr
252 w <- unsafeSTToIO $
253 unsafeWrite dest (fromIntegral destOff) (safe c)
254 poke destOffPtr (destOff + fromIntegral w)
255 poke statePtr 0
256 loop $ curPtr' `plusPtr` 1
257
258 _ -> do
259 -- We encountered the end of the buffer while decoding
260 n <- peek destOffPtr
261 codepoint <- peek codepointPtr
262 chunkText <- unsafeSTToIO $ do
263 arr <- A.unsafeFreeze dest
264 return $! textP arr 0 (fromIntegral n)
265 lastPtr <- peek curPtrPtr
266 let left = lastPtr `minusPtr` curPtr
267 return $ Some chunkText (B.drop left bs)
268 (decodeChunk codepoint state)
269 in loop (ptr `plusPtr` off)
270 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
271
272 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
273 -- to be valid.
274 --
275 -- If the input contains any invalid UTF-8 data, an exception will be
276 -- thrown that cannot be caught in pure code. For more control over
277 -- the handling of invalid data, use 'decodeUtf8'' or
278 -- 'decodeUtf8With'.
279 decodeUtf8 :: ByteString -> Text
280 decodeUtf8 = decodeUtf8With strictDecode
281 {-# INLINE[0] decodeUtf8 #-}
282 {-# RULES "STREAM stream/decodeUtf8 fusion" [1]
283 forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
284
285 -- | Decode a 'ByteString' containing UTF-8 encoded text.
286 --
287 -- If the input contains any invalid UTF-8 data, the relevant
288 -- exception will be returned, otherwise the decoded text.
289 decodeUtf8' :: ByteString -> Either UnicodeException Text
290 decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode
291 {-# INLINE decodeUtf8' #-}
292
293 -- | Encode text using UTF-8 encoding.
294 encodeUtf8 :: Text -> ByteString
295 encodeUtf8 (Text arr off len) = unsafeDupablePerformIO $ do
296 let size0 = max len 4
297 mallocByteString size0 >>= start size0 off 0
298 where
299 start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
300 where
301 loop n1 m1 ptr = go n1 m1
302 where
303 offLen = off + len
304 go !n !m
305 | n == offLen = return (PS fp 0 m)
306 | otherwise = do
307 let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
308 ensure k act
309 | size-m >= k = act
310 | otherwise = {-# SCC "resizeUtf8/ensure" #-} do
311 let newSize = size `shiftL` 1
312 fp' <- mallocByteString newSize
313 withForeignPtr fp' $ \ptr' ->
314 memcpy ptr' ptr (fromIntegral m)
315 start newSize n m fp'
316 {-# INLINE ensure #-}
317 case A.unsafeIndex arr n of
318 w| w <= 0x7F -> ensure 1 $ do
319 poke (ptr `plusPtr` m) (fromIntegral w :: Word8)
320 -- A single ASCII octet is likely to start a run of
321 -- them. We see better performance when we
322 -- special-case this assumption.
323 let end = ptr `plusPtr` size
324 ascii !t !u
325 | t == offLen || u == end || v >= 0x80 =
326 go t (u `minusPtr` ptr)
327 | otherwise = do
328 poke u (fromIntegral v :: Word8)
329 ascii (t+1) (u `plusPtr` 1)
330 where v = A.unsafeIndex arr t
331 ascii (n+1) (ptr `plusPtr` (m+1))
332 | w <= 0x7FF -> ensure 2 $ do
333 poke8 m $ (w `shiftR` 6) + 0xC0
334 poke8 (m+1) $ (w .&. 0x3f) + 0x80
335 go (n+1) (m+2)
336 | 0xD800 <= w && w <= 0xDBFF -> ensure 4 $ do
337 let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
338 poke8 m $ (c `shiftR` 18) + 0xF0
339 poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
340 poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
341 poke8 (m+3) $ (c .&. 0x3F) + 0x80
342 go (n+2) (m+4)
343 | otherwise -> ensure 3 $ do
344 poke8 m $ (w `shiftR` 12) + 0xE0
345 poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
346 poke8 (m+2) $ (w .&. 0x3F) + 0x80
347 go (n+1) (m+3)
348
349 -- | Decode text from little endian UTF-16 encoding.
350 decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
351 decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
352 {-# INLINE decodeUtf16LEWith #-}
353
354 -- | Decode text from little endian UTF-16 encoding.
355 --
356 -- If the input contains any invalid little endian UTF-16 data, an
357 -- exception will be thrown. For more control over the handling of
358 -- invalid data, use 'decodeUtf16LEWith'.
359 decodeUtf16LE :: ByteString -> Text
360 decodeUtf16LE = decodeUtf16LEWith strictDecode
361 {-# INLINE decodeUtf16LE #-}
362
363 -- | Decode text from big endian UTF-16 encoding.
364 decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
365 decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
366 {-# INLINE decodeUtf16BEWith #-}
367
368 -- | Decode text from big endian UTF-16 encoding.
369 --
370 -- If the input contains any invalid big endian UTF-16 data, an
371 -- exception will be thrown. For more control over the handling of
372 -- invalid data, use 'decodeUtf16BEWith'.
373 decodeUtf16BE :: ByteString -> Text
374 decodeUtf16BE = decodeUtf16BEWith strictDecode
375 {-# INLINE decodeUtf16BE #-}
376
377 -- | Encode text using little endian UTF-16 encoding.
378 encodeUtf16LE :: Text -> ByteString
379 encodeUtf16LE txt = E.unstream (E.restreamUtf16LE (F.stream txt))
380 {-# INLINE encodeUtf16LE #-}
381
382 -- | Encode text using big endian UTF-16 encoding.
383 encodeUtf16BE :: Text -> ByteString
384 encodeUtf16BE txt = E.unstream (E.restreamUtf16BE (F.stream txt))
385 {-# INLINE encodeUtf16BE #-}
386
387 -- | Decode text from little endian UTF-32 encoding.
388 decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
389 decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
390 {-# INLINE decodeUtf32LEWith #-}
391
392 -- | Decode text from little endian UTF-32 encoding.
393 --
394 -- If the input contains any invalid little endian UTF-32 data, an
395 -- exception will be thrown. For more control over the handling of
396 -- invalid data, use 'decodeUtf32LEWith'.
397 decodeUtf32LE :: ByteString -> Text
398 decodeUtf32LE = decodeUtf32LEWith strictDecode
399 {-# INLINE decodeUtf32LE #-}
400
401 -- | Decode text from big endian UTF-32 encoding.
402 decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
403 decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
404 {-# INLINE decodeUtf32BEWith #-}
405
406 -- | Decode text from big endian UTF-32 encoding.
407 --
408 -- If the input contains any invalid big endian UTF-32 data, an
409 -- exception will be thrown. For more control over the handling of
410 -- invalid data, use 'decodeUtf32BEWith'.
411 decodeUtf32BE :: ByteString -> Text
412 decodeUtf32BE = decodeUtf32BEWith strictDecode
413 {-# INLINE decodeUtf32BE #-}
414
415 -- | Encode text using little endian UTF-32 encoding.
416 encodeUtf32LE :: Text -> ByteString
417 encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
418 {-# INLINE encodeUtf32LE #-}
419
420 -- | Encode text using big endian UTF-32 encoding.
421 encodeUtf32BE :: Text -> ByteString
422 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
423 {-# INLINE encodeUtf32BE #-}
424
425 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
426 :: MutableByteArray# s -> Ptr CSize
427 -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
428
429 foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
430 :: MutableByteArray# s -> Ptr CSize
431 -> Ptr (Ptr Word8) -> Ptr Word8
432 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
433
434 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
435 :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()