Lazy UTF-8 encoding.
[packages/text.git] / Data / Text / Lazy / Encoding / Fusion.hs
1 {-# LANGUAGE BangPatterns #-}
2
3 -- |
4 -- Module : Data.Text.Lazy.Encoding.Fusion
5 -- Copyright : (c) Bryan O'Sullivan 2009
6 --
7 -- License : BSD-style
8 -- Maintainer : rtharper@aftereternity.co.uk, bos@serpentine.com,
9 -- duncan@haskell.org
10 -- Stability : experimental
11 -- Portability : portable
12 --
13 -- Fusible 'Stream'-oriented functions for converting between lazy
14 -- 'Text' and several common encodings.
15
16 module Data.Text.Lazy.Encoding.Fusion
17 (
18 -- * Streaming
19 -- streamASCII
20 streamUtf8
21 --, streamUtf16LE
22 --, streamUtf16BE
23 --, streamUtf32LE
24 --, streamUtf32BE
25
26 -- * Unstreaming
27 , unstream
28
29 , module Data.Text.Encoding.Fusion.Common
30 ) where
31
32 import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
33 import qualified Data.ByteString as B
34 import qualified Data.ByteString.Unsafe as B
35 import Data.Text.Encoding.Fusion.Common
36 import Data.Text.Fusion (Step(..), Stream(..))
37 import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
38 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
39 import Data.Word (Word8)
40 import qualified Data.Text.Encoding.Utf8 as U8
41 import System.IO.Unsafe (unsafePerformIO)
42 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
43 import Foreign.Storable (pokeByteOff)
44 import Data.ByteString.Internal (mallocByteString, memcpy)
45 import Control.Exception (assert)
46 import qualified Data.ByteString.Internal as B
47
48 unknownLength :: Int
49 unknownLength = 4
50
51 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
52 -- UTF-8 encoding.
53 streamUtf8 :: ByteString -> Stream Char
54 streamUtf8 bs0 = Stream next (bs0 :!: S N N N N :!: 0) unknownLength
55 where
56 {-# INLINE next #-}
57 next st@(bs :!: s :!: i) =
58 case s of
59 S (J a) N N N | U8.validate1 a ->
60 Yield (unsafeChr8 a) es
61 S (J a) (J b) N N | U8.validate2 a b ->
62 Yield (U8.chr2 a b) es
63 S (J a) (J b) (J c) N | U8.validate3 a b c ->
64 Yield (U8.chr3 a b c) es
65 S (J a) (J b) (J c) (J d) | U8.validate4 a b c d ->
66 Yield (U8.chr4 a b c d) es
67 _ -> consume st
68 where es = bs :!: S N N N N :!: i
69 {-# INLINE consume #-}
70 consume (c@(Chunk bs rest) :!: s :!: i)
71 | i >= len = consume (rest :!: s :!: 0)
72 | otherwise = next (c :!: s' :!: i+1)
73 where s' = case s of
74 S N _ _ _ -> S x N N N
75 S a N _ _ -> S a x N N
76 S a b N _ -> S a b x N
77 S a b c N -> S a b c x
78 _ -> encodingError "streamUtf8" "UTF-8"
79 x = J (B.unsafeIndex bs i)
80 len = B.length bs
81 consume (Empty :!: S N _ _ _ :!: _) = Done
82 consume _ = encodingError "streamUtf8" "UTF-8"
83 {-# INLINE [0] streamUtf8 #-}
84
85 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
86 unstreamChunks :: Int -> Stream Word8 -> ByteString
87 unstreamChunks chunkSize (Stream next s0 len) = chunk s0 len
88 where chunk s0 len = unsafePerformIO $ do
89 let safeLen = min (max len unknownLength) chunkSize
90 fp0 <- mallocByteString safeLen
91 loop fp0 safeLen 0 s0
92 where
93 loop !fp !n !off !s = case next s of
94 Done | off == 0 -> return Empty
95 | otherwise -> do
96 bs <- trimUp fp off
97 return $! Chunk bs Empty
98 Skip s' -> loop fp n off s'
99 Yield x s'
100 | off == chunkSize -> do
101 bs <- trimUp fp off
102 return (Chunk bs (chunk s (len - B.length bs)))
103 | off == n -> realloc fp n off s' x
104 | otherwise -> do
105 withForeignPtr fp $ \p -> pokeByteOff p off x
106 loop fp n (off+1) s'
107 {-# NOINLINE realloc #-}
108 realloc fp n off s x = do
109 let n' = min (n+n) chunkSize
110 fp' <- copy0 fp n n'
111 withForeignPtr fp' $ \p -> pokeByteOff p off x
112 loop fp' n' (off+1) s
113 {-# NOINLINE trimUp #-}
114 trimUp fp off = return $! B.PS fp 0 off
115 copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
116 copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
117 dest <- mallocByteString destLen
118 withForeignPtr src $ \src' ->
119 withForeignPtr dest $ \dest' ->
120 memcpy dest' src' (fromIntegral srcLen)
121 return dest
122
123 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
124 unstream :: Stream Word8 -> ByteString
125 unstream = unstreamChunks 64
126
127 encodingError :: String -> String -> a
128 encodingError func encoding =
129 error $ "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Bad " ++
130 encoding ++ " stream"