Use unsafeDupablePerformIO where possible
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 , streamUtf16LE
22 , streamUtf16BE
23 , streamUtf32LE
24 , streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Encoding.Error
37 import Data.Text.Fusion (Step(..), Stream(..))
38 import Data.Text.Fusion.Size
39 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
40 import Data.Text.UnsafeShift (shiftL)
41 import Data.Word (Word8, Word16, Word32)
42 import qualified Data.Text.Encoding.Utf8 as U8
43 import qualified Data.Text.Encoding.Utf16 as U16
44 import qualified Data.Text.Encoding.Utf32 as U32
45 import Data.Text.Unsafe (unsafeDupablePerformIO)
46 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
47 import Foreign.Storable (pokeByteOff)
48 import Data.ByteString.Internal (mallocByteString, memcpy)
49 #if defined(ASSERTS)
50 import Control.Exception (assert)
51 #endif
52 import qualified Data.ByteString.Internal as B
53
54 data S = S0
55 | S1 {-# UNPACK #-} !Word8
56 | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
57 | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
58 | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
59
60 data T = T !ByteString !S {-# UNPACK #-} !Int
61
62 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
63 -- UTF-8 encoding.
64 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
65 streamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
66 where
67 next (T bs@(Chunk ps _) S0 i)
68 | i < len && U8.validate1 a =
69 Yield (unsafeChr8 a) (T bs S0 (i+1))
70 | i + 1 < len && U8.validate2 a b =
71 Yield (U8.chr2 a b) (T bs S0 (i+2))
72 | i + 2 < len && U8.validate3 a b c =
73 Yield (U8.chr3 a b c) (T bs S0 (i+3))
74 | i + 3 < len && U8.validate4 a b c d =
75 Yield (U8.chr4 a b c d) (T bs S0 (i+4))
76 where len = B.length ps
77 a = B.unsafeIndex ps i
78 b = B.unsafeIndex ps (i+1)
79 c = B.unsafeIndex ps (i+2)
80 d = B.unsafeIndex ps (i+3)
81 next st@(T bs s i) =
82 case s of
83 S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es
84 S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es
85 S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es
86 S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
87 _ -> consume st
88 where es = T bs S0 i
89 consume (T bs@(Chunk ps rest) s i)
90 | i >= B.length ps = consume (T rest s 0)
91 | otherwise =
92 case s of
93 S0 -> next (T bs (S1 x) (i+1))
94 S1 a -> next (T bs (S2 a x) (i+1))
95 S2 a b -> next (T bs (S3 a b x) (i+1))
96 S3 a b c -> next (T bs (S4 a b c x) (i+1))
97 S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
98 (T bs (S3 b c d) (i+1))
99 where x = B.unsafeIndex ps i
100 consume (T Empty S0 _) = Done
101 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
102 {-# INLINE [0] streamUtf8 #-}
103
104 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
105 -- endian UTF-16 encoding.
106 streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
107 streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
108 where
109 next (T bs@(Chunk ps _) S0 i)
110 | i + 1 < len && U16.validate1 x1 =
111 Yield (unsafeChr x1) (T bs S0 (i+2))
112 | i + 3 < len && U16.validate2 x1 x2 =
113 Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
114 where len = B.length ps
115 x1 = c (idx i) (idx (i + 1))
116 x2 = c (idx (i + 2)) (idx (i + 3))
117 c w1 w2 = w1 + (w2 `shiftL` 8)
118 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
119 next st@(T bs s i) =
120 case s of
121 S2 w1 w2 | U16.validate1 (c w1 w2) ->
122 Yield (unsafeChr (c w1 w2)) es
123 S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
124 Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
125 _ -> consume st
126 where es = T bs S0 i
127 c :: Word8 -> Word8 -> Word16
128 c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8)
129 consume (T bs@(Chunk ps rest) s i)
130 | i >= B.length ps = consume (T rest s 0)
131 | otherwise =
132 case s of
133 S0 -> next (T bs (S1 x) (i+1))
134 S1 w1 -> next (T bs (S2 w1 x) (i+1))
135 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
136 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
137 S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1)
138 (T bs (S3 w2 w3 w4) (i+1))
139 where x = B.unsafeIndex ps i
140 consume (T Empty S0 _) = Done
141 consume st = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st
142 {-# INLINE [0] streamUtf16LE #-}
143
144 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
145 -- endian UTF-16 encoding.
146 streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
147 streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
148 where
149 next (T bs@(Chunk ps _) S0 i)
150 | i + 1 < len && U16.validate1 x1 =
151 Yield (unsafeChr x1) (T bs S0 (i+2))
152 | i + 3 < len && U16.validate2 x1 x2 =
153 Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
154 where len = B.length ps
155 x1 = c (idx i) (idx (i + 1))
156 x2 = c (idx (i + 2)) (idx (i + 3))
157 c w1 w2 = (w1 `shiftL` 8) + w2
158 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
159 next st@(T bs s i) =
160 case s of
161 S2 w1 w2 | U16.validate1 (c w1 w2) ->
162 Yield (unsafeChr (c w1 w2)) es
163 S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
164 Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
165 _ -> consume st
166 where es = T bs S0 i
167 c :: Word8 -> Word8 -> Word16
168 c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2
169 consume (T bs@(Chunk ps rest) s i)
170 | i >= B.length ps = consume (T rest s 0)
171 | otherwise =
172 case s of
173 S0 -> next (T bs (S1 x) (i+1))
174 S1 w1 -> next (T bs (S2 w1 x) (i+1))
175 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
176 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
177 S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1)
178 (T bs (S3 w2 w3 w4) (i+1))
179 where x = B.unsafeIndex ps i
180 consume (T Empty S0 _) = Done
181 consume st = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st
182 {-# INLINE [0] streamUtf16BE #-}
183
184 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
185 -- endian UTF-32 encoding.
186 streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
187 streamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
188 where
189 next (T bs@(Chunk ps _) S0 i)
190 | i + 3 < len && U32.validate x =
191 Yield (unsafeChr32 x) (T bs S0 (i+4))
192 where len = B.length ps
193 x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
194 x1 = idx i
195 x2 = idx (i+1)
196 x3 = idx (i+2)
197 x4 = idx (i+3)
198 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
199 next st@(T bs s i) =
200 case s of
201 S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
202 Yield (unsafeChr32 (c w1 w2 w3 w4)) es
203 _ -> consume st
204 where es = T bs S0 i
205 c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
206 c w1 w2 w3 w4 = shifted
207 where
208 shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
209 x1 = fromIntegral w1
210 x2 = fromIntegral w2
211 x3 = fromIntegral w3
212 x4 = fromIntegral w4
213 consume (T bs@(Chunk ps rest) s i)
214 | i >= B.length ps = consume (T rest s 0)
215 | otherwise =
216 case s of
217 S0 -> next (T bs (S1 x) (i+1))
218 S1 w1 -> next (T bs (S2 w1 x) (i+1))
219 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
220 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
221 S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1)
222 (T bs (S3 w2 w3 w4) (i+1))
223 where x = B.unsafeIndex ps i
224 consume (T Empty S0 _) = Done
225 consume st = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st
226 {-# INLINE [0] streamUtf32BE #-}
227
228 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
229 -- endian UTF-32 encoding.
230 streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
231 streamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
232 where
233 next (T bs@(Chunk ps _) S0 i)
234 | i + 3 < len && U32.validate x =
235 Yield (unsafeChr32 x) (T bs S0 (i+4))
236 where len = B.length ps
237 x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
238 x1 = idx i
239 x2 = idx (i+1)
240 x3 = idx (i+2)
241 x4 = idx (i+3)
242 idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
243 next st@(T bs s i) =
244 case s of
245 S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
246 Yield (unsafeChr32 (c w1 w2 w3 w4)) es
247 _ -> consume st
248 where es = T bs S0 i
249 c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
250 c w1 w2 w3 w4 = shifted
251 where
252 shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
253 x1 = fromIntegral w1
254 x2 = fromIntegral w2
255 x3 = fromIntegral w3
256 x4 = fromIntegral w4
257 consume (T bs@(Chunk ps rest) s i)
258 | i >= B.length ps = consume (T rest s 0)
259 | otherwise =
260 case s of
261 S0 -> next (T bs (S1 x) (i+1))
262 S1 w1 -> next (T bs (S2 w1 x) (i+1))
263 S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
264 S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
265 S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1)
266 (T bs (S3 w2 w3 w4) (i+1))
267 where x = B.unsafeIndex ps i
268 consume (T Empty S0 _) = Done
269 consume st = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st
270 {-# INLINE [0] streamUtf32LE #-}
271
272 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
273 unstreamChunks :: Int -> Stream Word8 -> ByteString
274 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
275 where chunk s1 len1 = unsafeDupablePerformIO $ do
276 let len = max 4 (min len1 chunkSize)
277 mallocByteString len >>= loop len 0 s1
278 where
279 loop !n !off !s fp = case next s of
280 Done | off == 0 -> return Empty
281 | otherwise -> return $! Chunk (trimUp fp off) Empty
282 Skip s' -> loop n off s' fp
283 Yield x s'
284 | off == chunkSize -> do
285 let !newLen = n - off
286 return $! Chunk (trimUp fp off) (chunk s newLen)
287 | off == n -> realloc fp n off s' x
288 | otherwise -> do
289 withForeignPtr fp $ \p -> pokeByteOff p off x
290 loop n (off+1) s' fp
291 {-# NOINLINE realloc #-}
292 realloc fp n off s x = do
293 let n' = min (n+n) chunkSize
294 fp' <- copy0 fp n n'
295 withForeignPtr fp' $ \p -> pokeByteOff p off x
296 loop n' (off+1) s fp'
297 trimUp fp off = B.PS fp 0 off
298 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
299 copy0 !src !srcLen !destLen =
300 #if defined(ASSERTS)
301 assert (srcLen <= destLen) $
302 #endif
303 do
304 dest <- mallocByteString destLen
305 withForeignPtr src $ \src' ->
306 withForeignPtr dest $ \dest' ->
307 memcpy dest' src' (fromIntegral srcLen)
308 return dest
309
310 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
311 unstream :: Stream Word8 -> ByteString
312 unstream = unstreamChunks defaultChunkSize
313
314 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
315 -> s -> Step s Char
316 decodeError func kind onErr mb i =
317 case onErr desc mb of
318 Nothing -> Skip i
319 Just c -> Yield c i
320 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
321 kind ++ " stream"