578a420faf2f6937b315c146b223bfdc42430b04
[ghc.git] / libraries / base / GHC / IO / Encoding.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3 {-# OPTIONS_GHC -funbox-strict-fields #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : GHC.IO.Encoding
8 -- Copyright : (c) The University of Glasgow, 2008-2009
9 -- License : see libraries/base/LICENSE
10 --
11 -- Maintainer : libraries@haskell.org
12 -- Stability : internal
13 -- Portability : non-portable
14 --
15 -- Text codecs for I/O
16 --
17 -----------------------------------------------------------------------------
18
19 module GHC.IO.Encoding (
20 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
21 latin1, latin1_encode, latin1_decode,
22 utf8, utf8_bom,
23 utf16, utf16le, utf16be,
24 utf32, utf32le, utf32be,
25 initLocaleEncoding,
26 getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
27 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
28 char8,
29 mkTextEncoding,
30 ) where
31
32 import GHC.Base
33 import GHC.IO.Exception
34 import GHC.IO.Buffer
35 import GHC.IO.Encoding.Failure
36 import GHC.IO.Encoding.Types
37 #if !defined(mingw32_HOST_OS)
38 import qualified GHC.IO.Encoding.Iconv as Iconv
39 #else
40 import qualified GHC.IO.Encoding.CodePage as CodePage
41 import Text.Read (reads)
42 #endif
43 import qualified GHC.IO.Encoding.Latin1 as Latin1
44 import qualified GHC.IO.Encoding.UTF8 as UTF8
45 import qualified GHC.IO.Encoding.UTF16 as UTF16
46 import qualified GHC.IO.Encoding.UTF32 as UTF32
47 import GHC.List
48 import GHC.Word
49
50 import Data.IORef
51 import Data.Char (toUpper)
52 import System.IO.Unsafe (unsafePerformIO)
53
54 -- -----------------------------------------------------------------------------
55
56 -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
57 -- directly to the first 256 Unicode code points, and is thus not a
58 -- complete Unicode encoding. An attempt to write a character greater than
59 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
60 latin1 :: TextEncoding
61 latin1 = Latin1.latin1_checked
62
63 -- | The UTF-8 Unicode encoding
64 utf8 :: TextEncoding
65 utf8 = UTF8.utf8
66
67 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
68 -- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
69 -- except that on input, the BOM sequence is ignored at the beginning
70 -- of the stream, and on output, the BOM sequence is prepended.
71 --
72 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
73 -- sometimes used to identify the encoding of a file.
74 --
75 utf8_bom :: TextEncoding
76 utf8_bom = UTF8.utf8_bom
77
78 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
79 -- indicate endianness).
80 utf16 :: TextEncoding
81 utf16 = UTF16.utf16
82
83 -- | The UTF-16 Unicode encoding (litte-endian)
84 utf16le :: TextEncoding
85 utf16le = UTF16.utf16le
86
87 -- | The UTF-16 Unicode encoding (big-endian)
88 utf16be :: TextEncoding
89 utf16be = UTF16.utf16be
90
91 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
92 -- indicate endianness).
93 utf32 :: TextEncoding
94 utf32 = UTF32.utf32
95
96 -- | The UTF-32 Unicode encoding (litte-endian)
97 utf32le :: TextEncoding
98 utf32le = UTF32.utf32le
99
100 -- | The UTF-32 Unicode encoding (big-endian)
101 utf32be :: TextEncoding
102 utf32be = UTF32.utf32be
103
104 -- | The Unicode encoding of the current locale
105 --
106 -- @since 4.5.0.0
107 getLocaleEncoding :: IO TextEncoding
108
109 -- | The Unicode encoding of the current locale, but allowing arbitrary
110 -- undecodable bytes to be round-tripped through it.
111 --
112 -- This 'TextEncoding' is used to decode and encode command line arguments
113 -- and environment variables on non-Windows platforms.
114 --
115 -- On Windows, this encoding *should not* be used if possible because
116 -- the use of code pages is deprecated: Strings should be retrieved
117 -- via the "wide" W-family of UTF-16 APIs instead
118 --
119 -- @since 4.5.0.0
120 getFileSystemEncoding :: IO TextEncoding
121
122 -- | The Unicode encoding of the current locale, but where undecodable
123 -- bytes are replaced with their closest visual match. Used for
124 -- the 'CString' marshalling functions in "Foreign.C.String"
125 --
126 -- @since 4.5.0.0
127 getForeignEncoding :: IO TextEncoding
128
129 -- | @since 4.5.0.0
130 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding :: TextEncoding -> IO ()
131
132 (getLocaleEncoding, setLocaleEncoding) = mkGlobal initLocaleEncoding
133 (getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
134 (getForeignEncoding, setForeignEncoding) = mkGlobal initForeignEncoding
135
136 mkGlobal :: a -> (IO a, a -> IO ())
137 mkGlobal x = unsafePerformIO $ do
138 x_ref <- newIORef x
139 return (readIORef x_ref, writeIORef x_ref)
140
141 -- | @since 4.5.0.0
142 initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
143
144 #if !defined(mingw32_HOST_OS)
145 -- It is rather important that we don't just call Iconv.mkIconvEncoding here
146 -- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
147 -- lone surrogates without complaint.
148 --
149 -- By going through our Haskell implementations of those encodings, we are
150 -- guaranteed to catch such errors.
151 --
152 -- FIXME: this is not a complete solution because if the locale encoding is one
153 -- which we don't have a Haskell-side decoder for, iconv might still ignore the
154 -- lone surrogate in the input.
155 initLocaleEncoding = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
156 initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure Iconv.localeEncodingName
157 initForeignEncoding = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure Iconv.localeEncodingName
158 #else
159 initLocaleEncoding = CodePage.localeEncoding
160 initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
161 initForeignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
162 #endif
163
164 -- | An encoding in which Unicode code points are translated to bytes
165 -- by taking the code point modulo 256. When decoding, bytes are
166 -- translated directly into the equivalent code point.
167 --
168 -- This encoding never fails in either direction. However, encoding
169 -- discards information, so encode followed by decode is not the
170 -- identity.
171 --
172 -- @since 4.4.0.0
173 char8 :: TextEncoding
174 char8 = Latin1.latin1
175
176 -- | Look up the named Unicode encoding. May fail with
177 --
178 -- * 'isDoesNotExistError' if the encoding is unknown
179 --
180 -- The set of known encodings is system-dependent, but includes at least:
181 --
182 -- * @UTF-8@
183 --
184 -- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
185 --
186 -- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
187 --
188 -- There is additional notation (borrowed from GNU iconv) for specifying
189 -- how illegal characters are handled:
190 --
191 -- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
192 -- all illegal sequences on input to be ignored, and on output
193 -- will drop all code points that have no representation in the
194 -- target encoding.
195 --
196 -- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
197 -- for illegal sequences or code points.
198 --
199 -- * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
200 -- to represent any invalid bytes in the input as Unicode codepoints (specifically,
201 -- as lone surrogates, which are normally invalid in UTF-32).
202 -- Upon output, these special codepoints are detected and turned back into the
203 -- corresponding original byte.
204 --
205 -- In theory, this mechanism allows arbitrary data to be roundtripped via
206 -- a 'String' with no loss of data. In practice, there are two limitations
207 -- to be aware of:
208 --
209 -- 1. This only stands a chance of working for an encoding which is an ASCII
210 -- superset, as for security reasons we refuse to escape any bytes smaller
211 -- than 128. Many encodings of interest are ASCII supersets (in particular,
212 -- you can assume that the locale encoding is an ASCII superset) but many
213 -- (such as UTF-16) are not.
214 --
215 -- 2. If the underlying encoding is not itself roundtrippable, this mechanism
216 -- can fail. Roundtrippable encodings are those which have an injective mapping
217 -- into Unicode. Almost all encodings meet this criteria, but some do not. Notably,
218 -- Shift-JIS (CP932) and Big5 contain several different encodings of the same
219 -- Unicode codepoint.
220 --
221 -- On Windows, you can access supported code pages with the prefix
222 -- @CP@; for example, @\"CP1250\"@.
223 --
224 mkTextEncoding :: String -> IO TextEncoding
225 mkTextEncoding e = case mb_coding_failure_mode of
226 Nothing -> unknownEncodingErr e
227 Just cfm -> mkTextEncoding' cfm enc
228 where
229 (enc, suffix) = span (/= '/') e
230 mb_coding_failure_mode = case suffix of
231 "" -> Just ErrorOnCodingFailure
232 "//IGNORE" -> Just IgnoreCodingFailure
233 "//TRANSLIT" -> Just TransliterateCodingFailure
234 "//ROUNDTRIP" -> Just RoundtripFailure
235 _ -> Nothing
236
237 mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
238 mkTextEncoding' cfm enc =
239 case [toUpper c | c <- enc, c /= '-'] of
240 -- UTF-8 and friends we can handle ourselves
241 "UTF8" -> return $ UTF8.mkUTF8 cfm
242 "UTF16" -> return $ UTF16.mkUTF16 cfm
243 "UTF16LE" -> return $ UTF16.mkUTF16le cfm
244 "UTF16BE" -> return $ UTF16.mkUTF16be cfm
245 "UTF32" -> return $ UTF32.mkUTF32 cfm
246 "UTF32LE" -> return $ UTF32.mkUTF32le cfm
247 "UTF32BE" -> return $ UTF32.mkUTF32be cfm
248 -- On AIX, we want to avoid iconv, because it is either
249 -- a) totally broken, or b) non-reentrant, or c) actually works.
250 -- Detecting b) is difficult as you'd have to trigger the reentrancy
251 -- corruption.
252 -- Therefore, on AIX, we handle the popular ASCII and latin1 encodings
253 -- ourselves. For consistency, we do the same on other platforms.
254 -- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter
255 -- completely ignores the CodingFailureMode (TEST=encoding005).
256 _ | isAscii -> return (Latin1.mkAscii cfm)
257 _ | isLatin1 -> return (Latin1.mkLatin1_checked cfm)
258 #if defined(mingw32_HOST_OS)
259 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
260 _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
261 #else
262 -- Otherwise, handle other encoding needs via iconv.
263
264 -- Unfortunately there is no good way to determine whether iconv is actually
265 -- functional without telling it to do something.
266 _ -> do res <- Iconv.mkIconvEncoding cfm enc
267 case res of
268 Just e -> return e
269 Nothing -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
270 #endif
271 where
272 isAscii = enc `elem` asciiEncNames
273 isLatin1 = enc `elem` latin1EncNames
274 asciiEncNames = -- ASCII aliases specified by RFC 1345 and RFC 3808.
275 [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
276 , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US"
277 ]
278 latin1EncNames = -- latin1 aliases specified by RFC 1345 and RFC 3808.
279 [ "ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1",
280 "l1", "IBM819", "CP819", "csISOLatin1"
281 ]
282
283
284 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
285 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
286 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
287
288 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
289 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
290 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
291
292 unknownEncodingErr :: String -> IO a
293 unknownEncodingErr e = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
294 ("unknown encoding:" ++ e) Nothing Nothing)