Rewrite of the IO library, including Unicode support
[packages/base.git] / GHC / IO / Encoding / UTF32.hs
1 {-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : GHC.IO.Encoding.UTF32
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
12 --
13 -- UTF-32 Codecs for the IO library
14 --
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
18 --
19 -----------------------------------------------------------------------------
20
21 module GHC.IO.Encoding.UTF32 (
22 utf32,
23 utf32_decode,
24 utf32_encode,
25
26 utf32be,
27 utf32be_decode,
28 utf32be_encode,
29
30 utf32le,
31 utf32le_decode,
32 utf32le_encode,
33 ) where
34
35 import GHC.Base
36 import GHC.Real
37 import GHC.Num
38 import GHC.IO
39 import GHC.IO.Exception
40 import GHC.IO.Buffer
41 import GHC.IO.Encoding.Types
42 import GHC.Word
43 import Data.Bits
44 import Data.Maybe
45 import GHC.IORef
46
47 -- -----------------------------------------------------------------------------
48 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
49
50 utf32 :: TextEncoding
51 utf32 = TextEncoding { mkTextDecoder = utf32_DF,
52 mkTextEncoder = utf32_EF }
53
54 utf32_DF :: IO TextDecoder
55 utf32_DF = do
56 seen_bom <- newIORef Nothing
57 return (BufferCodec (utf32_decode seen_bom) (return ()))
58
59 utf32_EF :: IO TextEncoder
60 utf32_EF = do
61 done_bom <- newIORef False
62 return (BufferCodec (utf32_encode done_bom) (return ()))
63
64 utf32_encode :: IORef Bool -> EncodeBuffer
65 utf32_encode done_bom input
66 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
67 = do
68 b <- readIORef done_bom
69 if b then utf32_native_encode input output
70 else if os - ow < 4
71 then return (input,output)
72 else do
73 writeIORef done_bom True
74 writeWord8Buf oraw ow bom0
75 writeWord8Buf oraw (ow+1) bom1
76 writeWord8Buf oraw (ow+2) bom2
77 writeWord8Buf oraw (ow+3) bom3
78 utf32_native_encode input output{ bufR = ow+4 }
79
80 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
81 utf32_decode seen_bom
82 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
83 output
84 = do
85 mb <- readIORef seen_bom
86 case mb of
87 Just decode -> decode input output
88 Nothing ->
89 if iw - ir < 4 then return (input,output) else do
90 c0 <- readWord8Buf iraw ir
91 c1 <- readWord8Buf iraw (ir+1)
92 c2 <- readWord8Buf iraw (ir+2)
93 c3 <- readWord8Buf iraw (ir+3)
94 case () of
95 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
96 writeIORef seen_bom (Just utf32be_decode)
97 utf32be_decode input{ bufL= ir+4 } output
98 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
99 writeIORef seen_bom (Just utf32le_decode)
100 utf32le_decode input{ bufL= ir+4 } output
101 | otherwise -> do
102 writeIORef seen_bom (Just utf32_native_decode)
103 utf32_native_decode input output
104
105
106 bom0, bom1, bom2, bom3 :: Word8
107 bom0 = 0
108 bom1 = 0
109 bom2 = 0xfe
110 bom3 = 0xff
111
112 -- choose UTF-32BE by default for UTF-32 output
113 utf32_native_decode :: DecodeBuffer
114 utf32_native_decode = utf32be_decode
115
116 utf32_native_encode :: EncodeBuffer
117 utf32_native_encode = utf32be_encode
118
119 -- -----------------------------------------------------------------------------
120 -- UTF32LE and UTF32BE
121
122 utf32be :: TextEncoding
123 utf32be = TextEncoding { mkTextDecoder = utf32be_DF,
124 mkTextEncoder = utf32be_EF }
125
126 utf32be_DF :: IO TextDecoder
127 utf32be_DF = return (BufferCodec utf32be_decode (return ()))
128
129 utf32be_EF :: IO TextEncoder
130 utf32be_EF = return (BufferCodec utf32be_encode (return ()))
131
132
133 utf32le :: TextEncoding
134 utf32le = TextEncoding { mkTextDecoder = utf32le_DF,
135 mkTextEncoder = utf32le_EF }
136
137 utf32le_DF :: IO TextDecoder
138 utf32le_DF = return (BufferCodec utf32le_decode (return ()))
139
140 utf32le_EF :: IO TextEncoder
141 utf32le_EF = return (BufferCodec utf32le_encode (return ()))
142
143
144
145 utf32be_decode :: DecodeBuffer
146 utf32be_decode
147 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
148 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
149 = let
150 loop !ir !ow
151 | ow >= os || iw - ir < 4 = done ir ow
152 | otherwise = do
153 c0 <- readWord8Buf iraw ir
154 c1 <- readWord8Buf iraw (ir+1)
155 c2 <- readWord8Buf iraw (ir+2)
156 c3 <- readWord8Buf iraw (ir+3)
157 let x1 = chr4 c0 c1 c2 c3
158 if not (validate x1) then invalid else do
159 writeCharBuf oraw ow x1
160 loop (ir+4) (ow+1)
161 where
162 invalid = if ir > ir0 then done ir ow else ioe_decodingError
163
164 -- lambda-lifted, to avoid thunks being built in the inner-loop:
165 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
166 else input{ bufL=ir },
167 output{ bufR=ow })
168 in
169 loop ir0 ow0
170
171 utf32le_decode :: DecodeBuffer
172 utf32le_decode
173 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
174 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
175 = let
176 loop !ir !ow
177 | ow >= os || iw - ir < 4 = done ir ow
178 | otherwise = do
179 c0 <- readWord8Buf iraw ir
180 c1 <- readWord8Buf iraw (ir+1)
181 c2 <- readWord8Buf iraw (ir+2)
182 c3 <- readWord8Buf iraw (ir+3)
183 let x1 = chr4 c3 c2 c1 c0
184 if not (validate x1) then invalid else do
185 writeCharBuf oraw ow x1
186 loop (ir+4) (ow+1)
187 where
188 invalid = if ir > ir0 then done ir ow else ioe_decodingError
189
190 -- lambda-lifted, to avoid thunks being built in the inner-loop:
191 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
192 else input{ bufL=ir },
193 output{ bufR=ow })
194 in
195 loop ir0 ow0
196
197 ioe_decodingError :: IO a
198 ioe_decodingError = ioException
199 (IOError Nothing InvalidArgument "utf32_decode"
200 "invalid UTF-32 byte sequence" Nothing Nothing)
201
202 utf32be_encode :: EncodeBuffer
203 utf32be_encode
204 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
205 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
206 = let
207 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
208 else input{ bufL=ir },
209 output{ bufR=ow })
210 loop !ir !ow
211 | ir >= iw = done ir ow
212 | os - ow < 4 = done ir ow
213 | otherwise = do
214 (c,ir') <- readCharBuf iraw ir
215 let (c0,c1,c2,c3) = ord4 c
216 writeWord8Buf oraw ow c0
217 writeWord8Buf oraw (ow+1) c1
218 writeWord8Buf oraw (ow+2) c2
219 writeWord8Buf oraw (ow+3) c3
220 loop ir' (ow+4)
221 in
222 loop ir0 ow0
223
224 utf32le_encode :: EncodeBuffer
225 utf32le_encode
226 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
227 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
228 = let
229 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
230 else input{ bufL=ir },
231 output{ bufR=ow })
232 loop !ir !ow
233 | ir >= iw = done ir ow
234 | os - ow < 4 = done ir ow
235 | otherwise = do
236 (c,ir') <- readCharBuf iraw ir
237 let (c0,c1,c2,c3) = ord4 c
238 writeWord8Buf oraw ow c3
239 writeWord8Buf oraw (ow+1) c2
240 writeWord8Buf oraw (ow+2) c1
241 writeWord8Buf oraw (ow+3) c0
242 loop ir' (ow+4)
243 in
244 loop ir0 ow0
245
246 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
247 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
248 C# (chr# (z1# +# z2# +# z3# +# z4#))
249 where
250 !y1# = word2Int# x1#
251 !y2# = word2Int# x2#
252 !y3# = word2Int# x3#
253 !y4# = word2Int# x4#
254 !z1# = uncheckedIShiftL# y1# 24#
255 !z2# = uncheckedIShiftL# y2# 16#
256 !z3# = uncheckedIShiftL# y3# 8#
257 !z4# = y4#
258 {-# INLINE chr4 #-}
259
260 ord4 :: Char -> (Word8,Word8,Word8,Word8)
261 ord4 c = (fromIntegral (x `shiftR` 24),
262 fromIntegral (x `shiftR` 16),
263 fromIntegral (x `shiftR` 8),
264 fromIntegral x)
265 where
266 x = ord c
267 {-# INLINE ord4 #-}
268
269
270 validate :: Char -> Bool
271 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
272 where x1 = ord c
273 {-# INLINE validate #-}