Initialize hs_init with UTF8 encoded arguments on Windows.
[ghc.git] / libraries / base / GHC / IO / Encoding.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3 {-# OPTIONS_GHC -funbox-strict-fields #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : GHC.IO.Encoding
8 -- Copyright : (c) The University of Glasgow, 2008-2009
9 -- License : see libraries/base/LICENSE
10 --
11 -- Maintainer : libraries@haskell.org
12 -- Stability : internal
13 -- Portability : non-portable
14 --
15 -- Text codecs for I/O
16 --
17 -----------------------------------------------------------------------------
18
19 module GHC.IO.Encoding (
20 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
21 latin1, latin1_encode, latin1_decode,
22 utf8, utf8_bom,
23 utf16, utf16le, utf16be,
24 utf32, utf32le, utf32be,
25 initLocaleEncoding,
26 getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
27 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
28 char8,
29 mkTextEncoding,
30 argvEncoding
31 ) where
32
33 import GHC.Base
34 import GHC.IO.Exception
35 import GHC.IO.Buffer
36 import GHC.IO.Encoding.Failure
37 import GHC.IO.Encoding.Types
38 #if !defined(mingw32_HOST_OS)
39 import qualified GHC.IO.Encoding.Iconv as Iconv
40 #else
41 import qualified GHC.IO.Encoding.CodePage as CodePage
42 import Text.Read (reads)
43 #endif
44 import qualified GHC.IO.Encoding.Latin1 as Latin1
45 import qualified GHC.IO.Encoding.UTF8 as UTF8
46 import qualified GHC.IO.Encoding.UTF16 as UTF16
47 import qualified GHC.IO.Encoding.UTF32 as UTF32
48 import GHC.List
49 import GHC.Word
50
51 import Data.IORef
52 import Data.Char (toUpper)
53 import System.IO.Unsafe (unsafePerformIO)
54
55 -- -----------------------------------------------------------------------------
56
57 -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
58 -- directly to the first 256 Unicode code points, and is thus not a
59 -- complete Unicode encoding. An attempt to write a character greater than
60 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
61 latin1 :: TextEncoding
62 latin1 = Latin1.latin1_checked
63
64 -- | The UTF-8 Unicode encoding
65 utf8 :: TextEncoding
66 utf8 = UTF8.utf8
67
68 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
69 -- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
70 -- except that on input, the BOM sequence is ignored at the beginning
71 -- of the stream, and on output, the BOM sequence is prepended.
72 --
73 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
74 -- sometimes used to identify the encoding of a file.
75 --
76 utf8_bom :: TextEncoding
77 utf8_bom = UTF8.utf8_bom
78
79 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
80 -- indicate endianness).
81 utf16 :: TextEncoding
82 utf16 = UTF16.utf16
83
84 -- | The UTF-16 Unicode encoding (litte-endian)
85 utf16le :: TextEncoding
86 utf16le = UTF16.utf16le
87
88 -- | The UTF-16 Unicode encoding (big-endian)
89 utf16be :: TextEncoding
90 utf16be = UTF16.utf16be
91
92 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
93 -- indicate endianness).
94 utf32 :: TextEncoding
95 utf32 = UTF32.utf32
96
97 -- | The UTF-32 Unicode encoding (litte-endian)
98 utf32le :: TextEncoding
99 utf32le = UTF32.utf32le
100
101 -- | The UTF-32 Unicode encoding (big-endian)
102 utf32be :: TextEncoding
103 utf32be = UTF32.utf32be
104
105 -- | The Unicode encoding of the current locale
106 --
107 -- @since 4.5.0.0
108 getLocaleEncoding :: IO TextEncoding
109
110 -- | The Unicode encoding of the current locale, but allowing arbitrary
111 -- undecodable bytes to be round-tripped through it.
112 --
113 -- This 'TextEncoding' is used to decode and encode command line arguments
114 -- and environment variables on non-Windows platforms.
115 --
116 -- On Windows, this encoding *should not* be used if possible because
117 -- the use of code pages is deprecated: Strings should be retrieved
118 -- via the "wide" W-family of UTF-16 APIs instead
119 --
120 -- @since 4.5.0.0
121 getFileSystemEncoding :: IO TextEncoding
122
123 -- | The Unicode encoding of the current locale, but where undecodable
124 -- bytes are replaced with their closest visual match. Used for
125 -- the 'CString' marshalling functions in "Foreign.C.String"
126 --
127 -- @since 4.5.0.0
128 getForeignEncoding :: IO TextEncoding
129
130 -- | @since 4.5.0.0
131 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding :: TextEncoding -> IO ()
132
133 (getLocaleEncoding, setLocaleEncoding) = mkGlobal initLocaleEncoding
134 (getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
135 (getForeignEncoding, setForeignEncoding) = mkGlobal initForeignEncoding
136
137 mkGlobal :: a -> (IO a, a -> IO ())
138 mkGlobal x = unsafePerformIO $ do
139 x_ref <- newIORef x
140 return (readIORef x_ref, writeIORef x_ref)
141
142 -- | @since 4.5.0.0
143 initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
144
145 #if !defined(mingw32_HOST_OS)
146 -- It is rather important that we don't just call Iconv.mkIconvEncoding here
147 -- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
148 -- lone surrogates without complaint.
149 --
150 -- By going through our Haskell implementations of those encodings, we are
151 -- guaranteed to catch such errors.
152 --
153 -- FIXME: this is not a complete solution because if the locale encoding is one
154 -- which we don't have a Haskell-side decoder for, iconv might still ignore the
155 -- lone surrogate in the input.
156 initLocaleEncoding = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
157 initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure Iconv.localeEncodingName
158 initForeignEncoding = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure Iconv.localeEncodingName
159 #else
160 initLocaleEncoding = CodePage.localeEncoding
161 initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
162 initForeignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
163 #endif
164
165 -- See Note [Windows Unicode Arguments] in rts/RtsFlags.c
166 -- On Windows we assume hs_init argv is in utf8 encoding.
167
168 -- | Internal encoding of argv
169 argvEncoding :: IO TextEncoding
170 #if defined(mingw32_HOST_OS)
171 argvEncoding = return utf8
172 #else
173 argvEncoding = getFileSystemEncoding
174 #endif
175
176 -- | An encoding in which Unicode code points are translated to bytes
177 -- by taking the code point modulo 256. When decoding, bytes are
178 -- translated directly into the equivalent code point.
179 --
180 -- This encoding never fails in either direction. However, encoding
181 -- discards information, so encode followed by decode is not the
182 -- identity.
183 --
184 -- @since 4.4.0.0
185 char8 :: TextEncoding
186 char8 = Latin1.latin1
187
188 -- | Look up the named Unicode encoding. May fail with
189 --
190 -- * 'isDoesNotExistError' if the encoding is unknown
191 --
192 -- The set of known encodings is system-dependent, but includes at least:
193 --
194 -- * @UTF-8@
195 --
196 -- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
197 --
198 -- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
199 --
200 -- There is additional notation (borrowed from GNU iconv) for specifying
201 -- how illegal characters are handled:
202 --
203 -- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
204 -- all illegal sequences on input to be ignored, and on output
205 -- will drop all code points that have no representation in the
206 -- target encoding.
207 --
208 -- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
209 -- for illegal sequences or code points.
210 --
211 -- * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
212 -- to represent any invalid bytes in the input as Unicode codepoints (specifically,
213 -- as lone surrogates, which are normally invalid in UTF-32).
214 -- Upon output, these special codepoints are detected and turned back into the
215 -- corresponding original byte.
216 --
217 -- In theory, this mechanism allows arbitrary data to be roundtripped via
218 -- a 'String' with no loss of data. In practice, there are two limitations
219 -- to be aware of:
220 --
221 -- 1. This only stands a chance of working for an encoding which is an ASCII
222 -- superset, as for security reasons we refuse to escape any bytes smaller
223 -- than 128. Many encodings of interest are ASCII supersets (in particular,
224 -- you can assume that the locale encoding is an ASCII superset) but many
225 -- (such as UTF-16) are not.
226 --
227 -- 2. If the underlying encoding is not itself roundtrippable, this mechanism
228 -- can fail. Roundtrippable encodings are those which have an injective mapping
229 -- into Unicode. Almost all encodings meet this criteria, but some do not. Notably,
230 -- Shift-JIS (CP932) and Big5 contain several different encodings of the same
231 -- Unicode codepoint.
232 --
233 -- On Windows, you can access supported code pages with the prefix
234 -- @CP@; for example, @\"CP1250\"@.
235 --
236 mkTextEncoding :: String -> IO TextEncoding
237 mkTextEncoding e = case mb_coding_failure_mode of
238 Nothing -> unknownEncodingErr e
239 Just cfm -> mkTextEncoding' cfm enc
240 where
241 (enc, suffix) = span (/= '/') e
242 mb_coding_failure_mode = case suffix of
243 "" -> Just ErrorOnCodingFailure
244 "//IGNORE" -> Just IgnoreCodingFailure
245 "//TRANSLIT" -> Just TransliterateCodingFailure
246 "//ROUNDTRIP" -> Just RoundtripFailure
247 _ -> Nothing
248
249 mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
250 mkTextEncoding' cfm enc =
251 case [toUpper c | c <- enc, c /= '-'] of
252 -- UTF-8 and friends we can handle ourselves
253 "UTF8" -> return $ UTF8.mkUTF8 cfm
254 "UTF16" -> return $ UTF16.mkUTF16 cfm
255 "UTF16LE" -> return $ UTF16.mkUTF16le cfm
256 "UTF16BE" -> return $ UTF16.mkUTF16be cfm
257 "UTF32" -> return $ UTF32.mkUTF32 cfm
258 "UTF32LE" -> return $ UTF32.mkUTF32le cfm
259 "UTF32BE" -> return $ UTF32.mkUTF32be cfm
260 -- On AIX, we want to avoid iconv, because it is either
261 -- a) totally broken, or b) non-reentrant, or c) actually works.
262 -- Detecting b) is difficult as you'd have to trigger the reentrancy
263 -- corruption.
264 -- Therefore, on AIX, we handle the popular ASCII and latin1 encodings
265 -- ourselves. For consistency, we do the same on other platforms.
266 -- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter
267 -- completely ignores the CodingFailureMode (TEST=encoding005).
268 _ | isAscii -> return (Latin1.mkAscii cfm)
269 _ | isLatin1 -> return (Latin1.mkLatin1_checked cfm)
270 #if defined(mingw32_HOST_OS)
271 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
272 _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
273 #else
274 -- Otherwise, handle other encoding needs via iconv.
275
276 -- Unfortunately there is no good way to determine whether iconv is actually
277 -- functional without telling it to do something.
278 _ -> do res <- Iconv.mkIconvEncoding cfm enc
279 case res of
280 Just e -> return e
281 Nothing -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
282 #endif
283 where
284 isAscii = enc `elem` asciiEncNames
285 isLatin1 = enc `elem` latin1EncNames
286 asciiEncNames = -- ASCII aliases specified by RFC 1345 and RFC 3808.
287 [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
288 , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US"
289 ]
290 latin1EncNames = -- latin1 aliases specified by RFC 1345 and RFC 3808.
291 [ "ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1",
292 "l1", "IBM819", "CP819", "csISOLatin1"
293 ]
294
295
296 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
297 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
298 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
299
300 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
301 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
302 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
303
304 unknownEncodingErr :: String -> IO a
305 unknownEncodingErr e = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
306 ("unknown encoding:" ++ e) Nothing Nothing)