ZOMG size isn't everything.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns, Rank2Types #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Encoding.Error
37 import Data.Text.Fusion (Step(..), Stream(..))
38 import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
39 import Data.Text.Fusion.Size
40 import Data.Text.UnsafeChar (unsafeChr8)
41 import Data.Word (Word8)
42 import qualified Data.Text.Encoding.Utf8 as U8
43 import System.IO.Unsafe (unsafePerformIO)
44 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
45 import Foreign.Storable (pokeByteOff)
46 import Data.ByteString.Internal (mallocByteString, memcpy)
47 import Control.Exception (assert)
48 import qualified Data.ByteString.Internal as B
49
50 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
51 -- UTF-8 encoding.
52 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
53 streamUtf8 onErr bs0 = Stream next (bs0 :!: empty :!: 0) unknownSize
54 where
55 empty = S N N N N
56 {-# INLINE next #-}
57 next (bs@(Chunk ps _) :!: S N _ _ _ :!: i)
58 | i < len && U8.validate1 a =
59 Yield (unsafeChr8 a) (bs :!: empty :!: i+1)
60 | i + 1 < len && U8.validate2 a b =
61 Yield (U8.chr2 a b) (bs :!: empty :!: i+2)
62 | i + 2 < len && U8.validate3 a b c =
63 Yield (U8.chr3 a b c) (bs :!: empty :!: i+3)
64 | i + 4 < len && U8.validate4 a b c d =
65 Yield (U8.chr4 a b c d) (bs :!: empty :!: i+4)
66 where len = B.length ps
67 a = B.unsafeIndex ps i
68 b = B.unsafeIndex ps (i+1)
69 c = B.unsafeIndex ps (i+2)
70 d = B.unsafeIndex ps (i+3)
71 next st@(bs :!: s :!: i) =
72 case s of
73 S (J a) N _ _ | U8.validate1 a ->
74 Yield (unsafeChr8 a) es
75 S (J a) (J b) N _ | U8.validate2 a b ->
76 Yield (U8.chr2 a b) es
77 S (J a) (J b) (J c) N | U8.validate3 a b c ->
78 Yield (U8.chr3 a b c) es
79 S (J a) (J b) (J c) (J d) | U8.validate4 a b c d ->
80 Yield (U8.chr4 a b c d) es
81 _ -> consume st
82 where es = bs :!: empty :!: i
83 {-# INLINE consume #-}
84 consume (bs@(Chunk ps rest) :!: s :!: i)
85 | i >= B.length ps = consume (rest :!: s :!: 0)
86 | otherwise =
87 case s of
88 S N _ _ _ -> next (bs :!: S x N N N :!: i+1)
89 S a N _ _ -> next (bs :!: S a x N N :!: i+1)
90 S a b N _ -> next (bs :!: S a b x N :!: i+1)
91 S a b c N -> next (bs :!: S a b c x :!: i+1)
92 S (J a) b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
93 (bs :!: S b c d N :!: i+1)
94 where x = J (B.unsafeIndex ps i)
95 consume (Empty :!: S N _ _ _ :!: _) = Done
96 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
97 {-# INLINE [0] streamUtf8 #-}
98
99 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
100 unstreamChunks :: Int -> Stream Word8 -> ByteString
101 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
102 where chunk s1 len1 = unsafePerformIO $ do
103 let len = min len1 chunkSize
104 mallocByteString len >>= loop len 0 s1
105 where
106 loop !n !off !s fp = case next s of
107 Done | off == 0 -> return Empty
108 | otherwise -> do
109 bs <- trimUp fp off
110 return $! Chunk bs Empty
111 Skip s' -> loop n off s' fp
112 Yield x s'
113 | off == chunkSize -> do
114 bs <- trimUp fp off
115 return (Chunk bs (chunk s (n - B.length bs)))
116 | off == n -> realloc fp n off s' x
117 | otherwise -> do
118 withForeignPtr fp $ \p -> pokeByteOff p off x
119 loop n (off+1) s' fp
120 {-# NOINLINE realloc #-}
121 realloc fp n off s x = do
122 let n' = min (n+n) chunkSize
123 fp' <- copy0 fp n n'
124 withForeignPtr fp' $ \p -> pokeByteOff p off x
125 loop n' (off+1) s fp'
126 {-# NOINLINE trimUp #-}
127 trimUp fp off = return $! B.PS fp 0 off
128 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
129 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
130 dest <- mallocByteString destLen
131 withForeignPtr src $ \src' ->
132 withForeignPtr dest $ \dest' ->
133 memcpy dest' src' (fromIntegral srcLen)
134 return dest
135
136 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
137 unstream :: Stream Word8 -> ByteString
138 unstream = unstreamChunks defaultChunkSize
139
140 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
141 -> s -> Step s Char
142 decodeError func kind onErr mb i =
143 case onErr desc mb of
144 Nothing -> Skip i
145 Just c -> Yield c i
146 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
147 kind ++ " stream"