Speed up the hopefully common case of a big enough chunk.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Fusion (Step(..), Stream(..))
37 import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
38 import Data.Text.UnsafeChar (unsafeChr8)
39 import Data.Word (Word8)
40 import qualified Data.Text.Encoding.Utf8 as U8
41 import System.IO.Unsafe (unsafePerformIO)
42 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
43 import Foreign.Storable (pokeByteOff)
44 import Data.ByteString.Internal (mallocByteString, memcpy)
45 import Control.Exception (assert)
46 import qualified Data.ByteString.Internal as B
47
48 unknownLength :: Int
49 unknownLength = 4
50
51 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
52 -- UTF-8 encoding.
53 streamUtf8 :: ByteString -> Stream Char
54 streamUtf8 bs0 = Stream next (bs0 :!: empty :!: 0) unknownLength
55 where
56 empty = S N N N N
57 {-# INLINE next #-}
58 next (bs@(Chunk ps _) :!: S N _ _ _ :!: i)
59 | i < len && U8.validate1 a =
60 Yield (unsafeChr8 a) (bs :!: empty :!: i+1)
61 | i + 1 < len && U8.validate2 a b =
62 Yield (U8.chr2 a b) (bs :!: empty :!: i+2)
63 | i + 2 < len && U8.validate3 a b c =
64 Yield (U8.chr3 a b c) (bs :!: empty :!: i+3)
65 | i + 4 < len && U8.validate4 a b c d =
66 Yield (U8.chr4 a b c d) (bs :!: empty :!: i+4)
67 where len = B.length ps
68 a = B.unsafeIndex ps i
69 b = B.unsafeIndex ps (i+1)
70 c = B.unsafeIndex ps (i+2)
71 d = B.unsafeIndex ps (i+3)
72 next st@(bs :!: s :!: i) =
73 case s of
74 S (J a) N _ _ | U8.validate1 a ->
75 Yield (unsafeChr8 a) es
76 S (J a) (J b) N _ | U8.validate2 a b ->
77 Yield (U8.chr2 a b) es
78 S (J a) (J b) (J c) N | U8.validate3 a b c ->
79 Yield (U8.chr3 a b c) es
80 S (J a) (J b) (J c) (J d) | U8.validate4 a b c d ->
81 Yield (U8.chr4 a b c d) es
82 _ -> consume st
83 where es = bs :!: empty :!: i
84 {-# INLINE consume #-}
85 consume (bs@(Chunk ps rest) :!: s :!: i)
86 | i >= len = consume (rest :!: s :!: 0)
87 | otherwise = next (bs :!: s' :!: i+1)
88 where s' = case s of
89 S N _ _ _ -> S x N N N
90 S a N _ _ -> S a x N N
91 S a b N _ -> S a b x N
92 S a b c N -> S a b c x
93 _ -> encodingError "streamUtf8" "UTF-8"
94 x = J (B.unsafeIndex ps i)
95 len = B.length ps
96 consume (Empty :!: S N _ _ _ :!: _) = Done
97 consume _ = encodingError "streamUtf8" "UTF-8"
98 {-# INLINE [0] streamUtf8 #-}
99
100 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
101 unstreamChunks :: Int -> Stream Word8 -> ByteString
102 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 len0
103 where chunk s1 len1 = unsafePerformIO $ do
104 let len = min (max len1 unknownLength) chunkSize
105 mallocByteString len >>= loop len 0 s1
106 where
107 loop !n !off !s fp = case next s of
108 Done | off == 0 -> return Empty
109 | otherwise -> do
110 bs <- trimUp fp off
111 return $! Chunk bs Empty
112 Skip s' -> loop n off s' fp
113 Yield x s'
114 | off == chunkSize -> do
115 bs <- trimUp fp off
116 return (Chunk bs (chunk s (n - B.length bs)))
117 | off == n -> realloc fp n off s' x
118 | otherwise -> do
119 withForeignPtr fp $ \p -> pokeByteOff p off x
120 loop n (off+1) s' fp
121 {-# NOINLINE realloc #-}
122 realloc fp n off s x = do
123 let n' = min (n+n) chunkSize
124 fp' <- copy0 fp n n'
125 withForeignPtr fp' $ \p -> pokeByteOff p off x
126 loop n' (off+1) s fp'
127 {-# NOINLINE trimUp #-}
128 trimUp fp off = return $! B.PS fp 0 off
129 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
130 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
131 dest <- mallocByteString destLen
132 withForeignPtr src $ \src' ->
133 withForeignPtr dest $ \dest' ->
134 memcpy dest' src' (fromIntegral srcLen)
135 return dest
136
137 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
138 unstream :: Stream Word8 -> ByteString
139 unstream = unstreamChunks defaultChunkSize
140
141 encodingError :: String -> String -> a
142 encodingError func encoding =
143 error $ "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Bad " ++
144 encoding ++ " stream"