724b8ae408eb8194ec8c5ad73bf5eb8a041e15e3
[packages/base.git] / GHC / IO / Encoding / CodePage.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, BangPatterns, ForeignFunctionInterface, NoImplicitPrelude,
3 NondecreasingIndentation, MagicHash #-}
4 module GHC.IO.Encoding.CodePage(
5 #if !defined(mingw32_HOST_OS)
6 ) where
7 #else
8 codePageEncoding, mkCodePageEncoding,
9 localeEncoding, mkLocaleEncoding
10 ) where
11
12 import GHC.Base
13 import GHC.Show
14 import GHC.Num
15 import GHC.Enum
16 import GHC.Word
17 import GHC.IO (unsafePerformIO)
18 import GHC.IO.Encoding.Failure
19 import GHC.IO.Encoding.Types
20 import GHC.IO.Buffer
21 import Data.Bits
22 import Data.Maybe
23 import Data.List (lookup)
24
25 import GHC.IO.Encoding.CodePage.Table
26
27 import GHC.IO.Encoding.Latin1 (mkLatin1)
28 import GHC.IO.Encoding.UTF8 (mkUTF8)
29 import GHC.IO.Encoding.UTF16 (mkUTF16le, mkUTF16be)
30 import GHC.IO.Encoding.UTF32 (mkUTF32le, mkUTF32be)
31
32 -- note CodePage = UInt which might not work on Win64. But the Win32 package
33 -- also has this issue.
34 getCurrentCodePage :: IO Word32
35 getCurrentCodePage = do
36 conCP <- getConsoleCP
37 if conCP > 0
38 then return conCP
39 else getACP
40
41 -- Since the Win32 package depends on base, we have to import these ourselves:
42 foreign import stdcall unsafe "windows.h GetConsoleCP"
43 getConsoleCP :: IO Word32
44
45 foreign import stdcall unsafe "windows.h GetACP"
46 getACP :: IO Word32
47
48 {-# NOINLINE currentCodePage #-}
49 currentCodePage :: Word32
50 currentCodePage = unsafePerformIO getCurrentCodePage
51
52 localeEncoding :: TextEncoding
53 localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
54
55 mkLocaleEncoding :: CodingFailureMode -> TextEncoding
56 mkLocaleEncoding cfm = mkCodePageEncoding cfm currentCodePage
57
58
59 codePageEncoding :: Word32 -> TextEncoding
60 codePageEncoding = mkCodePageEncoding ErrorOnCodingFailure
61
62 mkCodePageEncoding :: CodingFailureMode -> Word32 -> TextEncoding
63 mkCodePageEncoding cfm 65001 = mkUTF8 cfm
64 mkCodePageEncoding cfm 1200 = mkUTF16le cfm
65 mkCodePageEncoding cfm 1201 = mkUTF16be cfm
66 mkCodePageEncoding cfm 12000 = mkUTF32le cfm
67 mkCodePageEncoding cfm 12001 = mkUTF32be cfm
68 mkCodePageEncoding cfm cp = maybe (mkLatin1 cfm) (buildEncoding cfm cp) (lookup cp codePageMap)
69
70 buildEncoding :: CodingFailureMode -> Word32 -> CodePageArrays -> TextEncoding
71 buildEncoding cfm cp SingleByteCP {decoderArray = dec, encoderArray = enc}
72 = TextEncoding {
73 textEncodingName = "CP" ++ show cp
74 , mkTextDecoder = return $ simpleCodec (recoverDecode cfm) $ decodeFromSingleByte dec
75 , mkTextEncoder = return $ simpleCodec (recoverEncode cfm) $ encodeToSingleByte enc
76 }
77
78 simpleCodec :: (Buffer from -> Buffer to -> IO (Buffer from, Buffer to))
79 -> (Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to))
80 -> BufferCodec from to ()
81 simpleCodec r f = BufferCodec {
82 encode = f,
83 recover = r,
84 close = return (),
85 getState = return (),
86 setState = return
87 }
88
89 decodeFromSingleByte :: ConvArray Char -> DecodeBuffer
90 decodeFromSingleByte convArr
91 input@Buffer { bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
92 output@Buffer { bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
93 = let
94 done why !ir !ow = return (why,
95 if ir==iw then input{ bufL=0, bufR=0}
96 else input{ bufL=ir},
97 output {bufR=ow})
98 loop !ir !ow
99 | ow >= os = done OutputUnderflow ir ow
100 | ir >= iw = done InputUnderflow ir ow
101 | otherwise = do
102 b <- readWord8Buf iraw ir
103 let c = lookupConv convArr b
104 if c=='\0' && b /= 0 then invalid else do
105 ow' <- writeCharBuf oraw ow c
106 loop (ir+1) ow'
107 where
108 invalid = done InvalidSequence ir ow
109 in loop ir0 ow0
110
111 encodeToSingleByte :: CompactArray Char Word8 -> EncodeBuffer
112 encodeToSingleByte CompactArray { encoderMax = maxChar,
113 encoderIndices = indices,
114 encoderValues = values }
115 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
116 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
117 = let
118 done why !ir !ow = return (why,
119 if ir==iw then input { bufL=0, bufR=0 }
120 else input { bufL=ir },
121 output {bufR=ow})
122 loop !ir !ow
123 | ow >= os = done OutputUnderflow ir ow
124 | ir >= iw = done InputUnderflow ir ow
125 | otherwise = do
126 (c,ir') <- readCharBuf iraw ir
127 case lookupCompact maxChar indices values c of
128 Nothing -> invalid
129 Just 0 | c /= '\0' -> invalid
130 Just b -> do
131 writeWord8Buf oraw ow b
132 loop ir' (ow+1)
133 where
134 invalid = done InvalidSequence ir ow
135 in
136 loop ir0 ow0
137
138
139 --------------------------------------------
140 -- Array access functions
141
142 -- {-# INLINE lookupConv #-}
143 lookupConv :: ConvArray Char -> Word8 -> Char
144 lookupConv a = indexChar a . fromEnum
145
146 {-# INLINE lookupCompact #-}
147 lookupCompact :: Char -> ConvArray Int -> ConvArray Word8 -> Char -> Maybe Word8
148 lookupCompact maxVal indexes values x
149 | x > maxVal = Nothing
150 | otherwise = Just $ indexWord8 values $ j + (i .&. mask)
151 where
152 i = fromEnum x
153 mask = (1 `shiftL` n) - 1
154 k = i `shiftR` n
155 j = indexInt indexes k
156 n = blockBitSize
157
158 {-# INLINE indexInt #-}
159 indexInt :: ConvArray Int -> Int -> Int
160 indexInt (ConvArray p) (I# i) = I# (indexInt16OffAddr# p i)
161
162 {-# INLINE indexWord8 #-}
163 indexWord8 :: ConvArray Word8 -> Int -> Word8
164 indexWord8 (ConvArray p) (I# i) = W8# (indexWord8OffAddr# p i)
165
166 {-# INLINE indexChar #-}
167 indexChar :: ConvArray Char -> Int -> Char
168 indexChar (ConvArray p) (I# i) = C# (chr# (indexInt16OffAddr# p i))
169
170 #endif