Add controllable error handling and recovery code.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns, Rank2Types #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Encoding.Error
37 import Data.Text.Fusion (Step(..), Stream(..))
38 import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
39 import Data.Text.UnsafeChar (unsafeChr8)
40 import Data.Word (Word8)
41 import qualified Data.Text.Encoding.Utf8 as U8
42 import System.IO.Unsafe (unsafePerformIO)
43 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
44 import Foreign.Storable (pokeByteOff)
45 import Data.ByteString.Internal (mallocByteString, memcpy)
46 import Control.Exception (assert)
47 import qualified Data.ByteString.Internal as B
48
49 unknownLength :: Int
50 unknownLength = 4
51
52 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
53 -- UTF-8 encoding.
54 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
55 streamUtf8 onErr bs0 = Stream next (bs0 :!: empty :!: 0) unknownLength
56 where
57 empty = S N N N N
58 {-# INLINE next #-}
59 next (bs@(Chunk ps _) :!: S N _ _ _ :!: i)
60 | i < len && U8.validate1 a =
61 Yield (unsafeChr8 a) (bs :!: empty :!: i+1)
62 | i + 1 < len && U8.validate2 a b =
63 Yield (U8.chr2 a b) (bs :!: empty :!: i+2)
64 | i + 2 < len && U8.validate3 a b c =
65 Yield (U8.chr3 a b c) (bs :!: empty :!: i+3)
66 | i + 4 < len && U8.validate4 a b c d =
67 Yield (U8.chr4 a b c d) (bs :!: empty :!: i+4)
68 where len = B.length ps
69 a = B.unsafeIndex ps i
70 b = B.unsafeIndex ps (i+1)
71 c = B.unsafeIndex ps (i+2)
72 d = B.unsafeIndex ps (i+3)
73 next st@(bs :!: s :!: i) =
74 case s of
75 S (J a) N _ _ | U8.validate1 a ->
76 Yield (unsafeChr8 a) es
77 S (J a) (J b) N _ | U8.validate2 a b ->
78 Yield (U8.chr2 a b) es
79 S (J a) (J b) (J c) N | U8.validate3 a b c ->
80 Yield (U8.chr3 a b c) es
81 S (J a) (J b) (J c) (J d) | U8.validate4 a b c d ->
82 Yield (U8.chr4 a b c d) es
83 _ -> consume st
84 where es = bs :!: empty :!: i
85 {-# INLINE consume #-}
86 consume (bs@(Chunk ps rest) :!: s :!: i)
87 | i >= B.length ps = consume (rest :!: s :!: 0)
88 | otherwise =
89 case s of
90 S N _ _ _ -> next (bs :!: S x N N N :!: i+1)
91 S a N _ _ -> next (bs :!: S a x N N :!: i+1)
92 S a b N _ -> next (bs :!: S a b x N :!: i+1)
93 S a b c N -> next (bs :!: S a b c x :!: i+1)
94 S (J a) b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
95 (bs :!: S b c d N :!: i+1)
96 where x = J (B.unsafeIndex ps i)
97 consume (Empty :!: S N _ _ _ :!: _) = Done
98 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
99 {-# INLINE [0] streamUtf8 #-}
100
101 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
102 unstreamChunks :: Int -> Stream Word8 -> ByteString
103 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 len0
104 where chunk s1 len1 = unsafePerformIO $ do
105 let len = min (max len1 unknownLength) chunkSize
106 mallocByteString len >>= loop len 0 s1
107 where
108 loop !n !off !s fp = case next s of
109 Done | off == 0 -> return Empty
110 | otherwise -> do
111 bs <- trimUp fp off
112 return $! Chunk bs Empty
113 Skip s' -> loop n off s' fp
114 Yield x s'
115 | off == chunkSize -> do
116 bs <- trimUp fp off
117 return (Chunk bs (chunk s (n - B.length bs)))
118 | off == n -> realloc fp n off s' x
119 | otherwise -> do
120 withForeignPtr fp $ \p -> pokeByteOff p off x
121 loop n (off+1) s' fp
122 {-# NOINLINE realloc #-}
123 realloc fp n off s x = do
124 let n' = min (n+n) chunkSize
125 fp' <- copy0 fp n n'
126 withForeignPtr fp' $ \p -> pokeByteOff p off x
127 loop n' (off+1) s fp'
128 {-# NOINLINE trimUp #-}
129 trimUp fp off = return $! B.PS fp 0 off
130 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
131 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
132 dest <- mallocByteString destLen
133 withForeignPtr src $ \src' ->
134 withForeignPtr dest $ \dest' ->
135 memcpy dest' src' (fromIntegral srcLen)
136 return dest
137
138 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
139 unstream :: Stream Word8 -> ByteString
140 unstream = unstreamChunks defaultChunkSize
141
142 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
143 -> s -> Step s Char
144 decodeError func kind onErr mb i =
145 case onErr desc mb of
146 Nothing -> Skip i
147 Just c -> Yield c i
148 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
149 kind ++ " stream"