Replace crufty unboxed tuple with specialised one.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns, Rank2Types #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Encoding.Error
37 import Data.Text.Fusion (Step(..), Stream(..))
38 import Data.Text.Fusion.Size
39 import Data.Text.UnsafeChar (unsafeChr8)
40 import Data.Word (Word8)
41 import qualified Data.Text.Encoding.Utf8 as U8
42 import System.IO.Unsafe (unsafePerformIO)
43 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
44 import Foreign.Storable (pokeByteOff)
45 import Data.ByteString.Internal (mallocByteString, memcpy)
46 import Control.Exception (assert)
47 import qualified Data.ByteString.Internal as B
48
49 data S = S0
50 | S1 {-# UNPACK #-} !Word8
51 | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
52 | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
53 | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
54
55 data T = T {-# UNPACK #-} !ByteString {-# UNPACK #-} !S {-# UNPACK #-} !Int
56
57 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
58 -- UTF-8 encoding.
59 streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
60 streamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
61 where
62 next (T bs@(Chunk ps _) S0 i)
63 | i < len && U8.validate1 a =
64 Yield (unsafeChr8 a) (T bs S0 (i+1))
65 | i + 1 < len && U8.validate2 a b =
66 Yield (U8.chr2 a b) (T bs S0 (i+2))
67 | i + 2 < len && U8.validate3 a b c =
68 Yield (U8.chr3 a b c) (T bs S0 (i+3))
69 | i + 4 < len && U8.validate4 a b c d =
70 Yield (U8.chr4 a b c d) (T bs S0 (i+4))
71 where len = B.length ps
72 a = B.unsafeIndex ps i
73 b = B.unsafeIndex ps (i+1)
74 c = B.unsafeIndex ps (i+2)
75 d = B.unsafeIndex ps (i+3)
76 next st@(T bs s i) =
77 case s of
78 S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es
79 S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es
80 S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es
81 S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
82 _ -> consume st
83 where es = T bs S0 i
84 consume (T bs@(Chunk ps rest) s i)
85 | i >= B.length ps = consume (T rest s 0)
86 | otherwise =
87 case s of
88 S0 -> next (T bs (S1 x) (i+1))
89 S1 a -> next (T bs (S2 a x) (i+1))
90 S2 a b -> next (T bs (S3 a b x) (i+1))
91 S3 a b c -> next (T bs (S4 a b c x) (i+1))
92 S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
93 (T bs (S3 b c d) (i+1))
94 where x = B.unsafeIndex ps i
95 consume (T Empty S0 _) = Done
96 consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
97 {-# INLINE [0] streamUtf8 #-}
98
99 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
100 unstreamChunks :: Int -> Stream Word8 -> ByteString
101 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
102 where chunk s1 len1 = unsafePerformIO $ do
103 let len = min len1 chunkSize
104 mallocByteString len >>= loop len 0 s1
105 where
106 loop !n !off !s fp = case next s of
107 Done | off == 0 -> return Empty
108 | otherwise -> do
109 bs <- trimUp fp off
110 return $! Chunk bs Empty
111 Skip s' -> loop n off s' fp
112 Yield x s'
113 | off == chunkSize -> do
114 bs <- trimUp fp off
115 return (Chunk bs (chunk s (n - B.length bs)))
116 | off == n -> realloc fp n off s' x
117 | otherwise -> do
118 withForeignPtr fp $ \p -> pokeByteOff p off x
119 loop n (off+1) s' fp
120 {-# NOINLINE realloc #-}
121 realloc fp n off s x = do
122 let n' = min (n+n) chunkSize
123 fp' <- copy0 fp n n'
124 withForeignPtr fp' $ \p -> pokeByteOff p off x
125 loop n' (off+1) s fp'
126 {-# NOINLINE trimUp #-}
127 trimUp fp off = return $! B.PS fp 0 off
128 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
129 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
130 dest <- mallocByteString destLen
131 withForeignPtr src $ \src' ->
132 withForeignPtr dest $ \dest' ->
133 memcpy dest' src' (fromIntegral srcLen)
134 return dest
135
136 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
137 unstream :: Stream Word8 -> ByteString
138 unstream = unstreamChunks defaultChunkSize
139
140 decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
141 -> s -> Step s Char
142 decodeError func kind onErr mb i =
143 case onErr desc mb of
144 Nothing -> Skip i
145 Just c -> Yield c i
146 where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
147 kind ++ " stream"