052955c2ea85a1ae595fca49b29ec39fe282b4e6
[packages/base.git] / GHC / IO / Encoding.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
3 {-# OPTIONS_GHC -funbox-strict-fields #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : GHC.IO.Encoding
8 -- Copyright : (c) The University of Glasgow, 2008-2009
9 -- License : see libraries/base/LICENSE
10 --
11 -- Maintainer : libraries@haskell.org
12 -- Stability : internal
13 -- Portability : non-portable
14 --
15 -- Text codecs for I/O
16 --
17 -----------------------------------------------------------------------------
18
19 module GHC.IO.Encoding (
20 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
21 latin1, latin1_encode, latin1_decode,
22 utf8, utf8_bom,
23 utf16, utf16le, utf16be,
24 utf32, utf32le, utf32be,
25 initLocaleEncoding,
26 getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
27 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
28 char8,
29 mkTextEncoding,
30 ) where
31
32 import GHC.Base
33 import GHC.IO.Exception
34 import GHC.IO.Buffer
35 import GHC.IO.Encoding.Failure
36 import GHC.IO.Encoding.Types
37 #if !defined(mingw32_HOST_OS)
38 import qualified GHC.IO.Encoding.Iconv as Iconv
39 #else
40 import qualified GHC.IO.Encoding.CodePage as CodePage
41 import Text.Read (reads)
42 #endif
43 import qualified GHC.IO.Encoding.Latin1 as Latin1
44 import qualified GHC.IO.Encoding.UTF8 as UTF8
45 import qualified GHC.IO.Encoding.UTF16 as UTF16
46 import qualified GHC.IO.Encoding.UTF32 as UTF32
47 import GHC.Word
48
49 import Data.IORef
50 import Data.Char (toUpper)
51 import Data.List
52 import Data.Maybe
53 import System.IO.Unsafe (unsafePerformIO)
54
55 -- -----------------------------------------------------------------------------
56
57 -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
58 -- directly to the first 256 Unicode code points, and is thus not a
59 -- complete Unicode encoding. An attempt to write a character greater than
60 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
61 latin1 :: TextEncoding
62 latin1 = Latin1.latin1_checked
63
64 -- | The UTF-8 Unicode encoding
65 utf8 :: TextEncoding
66 utf8 = UTF8.utf8
67
68 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
69 -- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
70 -- except that on input, the BOM sequence is ignored at the beginning
71 -- of the stream, and on output, the BOM sequence is prepended.
72 --
73 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
74 -- sometimes used to identify the encoding of a file.
75 --
76 utf8_bom :: TextEncoding
77 utf8_bom = UTF8.utf8_bom
78
79 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
80 -- indicate endianness).
81 utf16 :: TextEncoding
82 utf16 = UTF16.utf16
83
84 -- | The UTF-16 Unicode encoding (litte-endian)
85 utf16le :: TextEncoding
86 utf16le = UTF16.utf16le
87
88 -- | The UTF-16 Unicode encoding (big-endian)
89 utf16be :: TextEncoding
90 utf16be = UTF16.utf16be
91
92 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
93 -- indicate endianness).
94 utf32 :: TextEncoding
95 utf32 = UTF32.utf32
96
97 -- | The UTF-32 Unicode encoding (litte-endian)
98 utf32le :: TextEncoding
99 utf32le = UTF32.utf32le
100
101 -- | The UTF-32 Unicode encoding (big-endian)
102 utf32be :: TextEncoding
103 utf32be = UTF32.utf32be
104
105 -- | The Unicode encoding of the current locale
106 getLocaleEncoding :: IO TextEncoding
107
108 -- | The Unicode encoding of the current locale, but allowing arbitrary
109 -- undecodable bytes to be round-tripped through it.
110 --
111 -- This 'TextEncoding' is used to decode and encode command line arguments
112 -- and environment variables on non-Windows platforms.
113 --
114 -- On Windows, this encoding *should not* be used if possible because
115 -- the use of code pages is deprecated: Strings should be retrieved
116 -- via the "wide" W-family of UTF-16 APIs instead
117 getFileSystemEncoding :: IO TextEncoding
118
119 -- | The Unicode encoding of the current locale, but where undecodable
120 -- bytes are replaced with their closest visual match. Used for
121 -- the 'CString' marshalling functions in "Foreign.C.String"
122 getForeignEncoding :: IO TextEncoding
123
124 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding :: TextEncoding -> IO ()
125 (getLocaleEncoding, setLocaleEncoding) = mkGlobal initLocaleEncoding
126 (getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
127 (getForeignEncoding, setForeignEncoding) = mkGlobal initForeignEncoding
128
129 mkGlobal :: a -> (IO a, a -> IO ())
130 mkGlobal x = unsafePerformIO $ do
131 x_ref <- newIORef x
132 return (readIORef x_ref, writeIORef x_ref)
133
134 initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
135
136 #if !defined(mingw32_HOST_OS)
137 -- It is rather important that we don't just call Iconv.mkIconvEncoding here
138 -- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
139 -- lone surrogates without complaint.
140 --
141 -- By going through our Haskell implementations of those encodings, we are
142 -- guaranteed to catch such errors.
143 --
144 -- FIXME: this is not a complete solution because if the locale encoding is one
145 -- which we don't have a Haskell-side decoder for, iconv might still ignore the
146 -- lone surrogate in the input.
147 initLocaleEncoding = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
148 initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure Iconv.localeEncodingName
149 initForeignEncoding = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure Iconv.localeEncodingName
150 #else
151 initLocaleEncoding = CodePage.localeEncoding
152 initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
153 initForeignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
154 #endif
155
156 -- | An encoding in which Unicode code points are translated to bytes
157 -- by taking the code point modulo 256. When decoding, bytes are
158 -- translated directly into the equivalent code point.
159 --
160 -- This encoding never fails in either direction. However, encoding
161 -- discards information, so encode followed by decode is not the
162 -- identity.
163 char8 :: TextEncoding
164 char8 = Latin1.latin1
165
166 -- | Look up the named Unicode encoding. May fail with
167 --
168 -- * 'isDoesNotExistError' if the encoding is unknown
169 --
170 -- The set of known encodings is system-dependent, but includes at least:
171 --
172 -- * @UTF-8@
173 --
174 -- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
175 --
176 -- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
177 --
178 -- There is additional notation (borrowed from GNU iconv) for specifying
179 -- how illegal characters are handled:
180 --
181 -- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
182 -- all illegal sequences on input to be ignored, and on output
183 -- will drop all code points that have no representation in the
184 -- target encoding.
185 --
186 -- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
187 -- for illegal sequences or code points.
188 --
189 -- * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
190 -- to represent any invalid bytes in the input as Unicode codepoints (specifically,
191 -- as lone surrogates, which are normally invalid in UTF-32).
192 -- Upon output, these special codepoints are detected and turned back into the
193 -- corresponding original byte.
194 --
195 -- In theory, this mechanism allows arbitrary data to be roundtripped via
196 -- a 'String' with no loss of data. In practice, there are two limitations
197 -- to be aware of:
198 --
199 -- 1. This only stands a chance of working for an encoding which is an ASCII
200 -- superset, as for security reasons we refuse to escape any bytes smaller
201 -- than 128. Many encodings of interest are ASCII supersets (in particular,
202 -- you can assume that the locale encoding is an ASCII superset) but many
203 -- (such as UTF-16) are not.
204 --
205 -- 2. If the underlying encoding is not itself roundtrippable, this mechanism
206 -- can fail. Roundtrippable encodings are those which have an injective mapping
207 -- into Unicode. Almost all encodings meet this criteria, but some do not. Notably,
208 -- Shift-JIS (CP932) and Big5 contain several different encodings of the same
209 -- Unicode codepoint.
210 --
211 -- On Windows, you can access supported code pages with the prefix
212 -- @CP@; for example, @\"CP1250\"@.
213 --
214 mkTextEncoding :: String -> IO TextEncoding
215 mkTextEncoding e = case mb_coding_failure_mode of
216 Nothing -> unknownEncodingErr e
217 Just cfm -> mkTextEncoding' cfm enc
218 where
219 (enc, suffix) = span (/= '/') e
220 mb_coding_failure_mode = case suffix of
221 "" -> Just ErrorOnCodingFailure
222 "//IGNORE" -> Just IgnoreCodingFailure
223 "//TRANSLIT" -> Just TransliterateCodingFailure
224 "//ROUNDTRIP" -> Just RoundtripFailure
225 _ -> Nothing
226
227 mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
228 mkTextEncoding' cfm enc = case [toUpper c | c <- enc, c /= '-'] of
229 "UTF8" -> return $ UTF8.mkUTF8 cfm
230 "UTF16" -> return $ UTF16.mkUTF16 cfm
231 "UTF16LE" -> return $ UTF16.mkUTF16le cfm
232 "UTF16BE" -> return $ UTF16.mkUTF16be cfm
233 "UTF32" -> return $ UTF32.mkUTF32 cfm
234 "UTF32LE" -> return $ UTF32.mkUTF32le cfm
235 "UTF32BE" -> return $ UTF32.mkUTF32be cfm
236 #if defined(mingw32_HOST_OS)
237 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
238 _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
239 #else
240 _ -> Iconv.mkIconvEncoding cfm enc
241 #endif
242
243 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
244 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
245 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
246
247 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
248 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
249 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
250
251 unknownEncodingErr :: String -> IO a
252 unknownEncodingErr e = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
253 ("unknown encoding:" ++ e) Nothing Nothing)