Fix a crash reported by Michael Snoyman.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns, Rank2Types #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Encoding.Error
37 import Data.Text.Fusion (Step(..), Stream(..))
38 import Data.Text.Fusion.Size
39 import Data.Text.UnsafeChar (unsafeChr8)
40 import Data.Word (Word8)
41 import qualified Data.Text.Encoding.Utf8 as U8
42 import System.IO.Unsafe (unsafePerformIO)
43 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
44 import Foreign.Storable (pokeByteOff)
45 import Data.ByteString.Internal (mallocByteString, memcpy)
46 import Control.Exception (assert)
47 import qualified Data.ByteString.Internal as B
48
49 data S = S0
50 | S1 {-# UNPACK #-} !Word8
51 | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
52 | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
53 | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
54
55 data T = T {-# UNPACK #-} !ByteString {-# UNPACK #-} !S {-# UNPACK #-} !Int
56
57 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
58 -- UTF-8 encoding.
59 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
60 streamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
61 where
62 next (T bs@(Chunk ps _) S0 i)
63 | i < len && U8.validate1 a =
64 Yield (unsafeChr8 a) (T bs S0 (i+1))
65 | i + 1 < len && U8.validate2 a b =
66 Yield (U8.chr2 a b) (T bs S0 (i+2))
67 | i + 2 < len && U8.validate3 a b c =
68 Yield (U8.chr3 a b c) (T bs S0 (i+3))
69 | i + 4 < len && U8.validate4 a b c d =
70 Yield (U8.chr4 a b c d) (T bs S0 (i+4))
71 where len = B.length ps
72 a = B.unsafeIndex ps i
73 b = B.unsafeIndex ps (i+1)
74 c = B.unsafeIndex ps (i+2)
75 d = B.unsafeIndex ps (i+3)
76 next st@(T bs s i) =
77 case s of
78 S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es
79 S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es
80 S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es
81 S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
82 _ -> consume st
83 where es = T bs S0 i
84 consume (T bs@(Chunk ps rest) s i)
85 | i >= B.length ps = consume (T rest s 0)
86 | otherwise =
87 case s of
88 S0 -> next (T bs (S1 x) (i+1))
89 S1 a -> next (T bs (S2 a x) (i+1))
90 S2 a b -> next (T bs (S3 a b x) (i+1))
91 S3 a b c -> next (T bs (S4 a b c x) (i+1))
92 S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
93 (T bs (S3 b c d) (i+1))
94 where x = B.unsafeIndex ps i
95 consume (T Empty S0 _) = Done
96 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
97 {-# INLINE [0] streamUtf8 #-}
98
99 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
100 unstreamChunks :: Int -> Stream Word8 -> ByteString
101 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
102 where chunk s1 len1 = unsafePerformIO $ do
103 let len = max 4 (min len1 chunkSize)
104 mallocByteString len >>= loop len 0 s1
105 where
106 loop !n !off !s fp = case next s of
107 Done | off == 0 -> return Empty
108 | otherwise -> return $! Chunk (trimUp fp off) Empty
109 Skip s' -> loop n off s' fp
110 Yield x s'
111 | off == chunkSize -> do
112 let !newLen = n - off
113 return $! Chunk (trimUp fp off) (chunk s newLen)
114 | off == n -> realloc fp n off s' x
115 | otherwise -> do
116 withForeignPtr fp $ \p -> pokeByteOff p off x
117 loop n (off+1) s' fp
118 {-# NOINLINE realloc #-}
119 realloc fp n off s x = do
120 let n' = min (n+n) chunkSize
121 fp' <- copy0 fp n n'
122 withForeignPtr fp' $ \p -> pokeByteOff p off x
123 loop n' (off+1) s fp'
124 trimUp fp off = B.PS fp 0 off
125 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
126 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
127 dest <- mallocByteString destLen
128 withForeignPtr src $ \src' ->
129 withForeignPtr dest $ \dest' ->
130 memcpy dest' src' (fromIntegral srcLen)
131 return dest
132
133 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
134 unstream :: Stream Word8 -> ByteString
135 unstream = unstreamChunks defaultChunkSize
136
137 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
138 -> s -> Step s Char
139 decodeError func kind onErr mb i =
140 case onErr desc mb of
141 Nothing -> Skip i
142 Just c -> Yield c i
143 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
144 kind ++ " stream"