Add Haddock `/Since: 4.5.[01].0/` comments to symbols
[packages/base.git] / GHC / IO / Encoding.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
3 {-# OPTIONS_GHC -funbox-strict-fields #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : GHC.IO.Encoding
8 -- Copyright : (c) The University of Glasgow, 2008-2009
9 -- License : see libraries/base/LICENSE
10 --
11 -- Maintainer : libraries@haskell.org
12 -- Stability : internal
13 -- Portability : non-portable
14 --
15 -- Text codecs for I/O
16 --
17 -----------------------------------------------------------------------------
18
19 module GHC.IO.Encoding (
20 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
21 latin1, latin1_encode, latin1_decode,
22 utf8, utf8_bom,
23 utf16, utf16le, utf16be,
24 utf32, utf32le, utf32be,
25 initLocaleEncoding,
26 getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
27 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
28 char8,
29 mkTextEncoding,
30 ) where
31
32 import GHC.Base
33 import GHC.IO.Exception
34 import GHC.IO.Buffer
35 import GHC.IO.Encoding.Failure
36 import GHC.IO.Encoding.Types
37 #if !defined(mingw32_HOST_OS)
38 import qualified GHC.IO.Encoding.Iconv as Iconv
39 #else
40 import qualified GHC.IO.Encoding.CodePage as CodePage
41 import Text.Read (reads)
42 #endif
43 import qualified GHC.IO.Encoding.Latin1 as Latin1
44 import qualified GHC.IO.Encoding.UTF8 as UTF8
45 import qualified GHC.IO.Encoding.UTF16 as UTF16
46 import qualified GHC.IO.Encoding.UTF32 as UTF32
47 import GHC.Word
48
49 import Data.IORef
50 import Data.Char (toUpper)
51 import Data.List
52 import Data.Maybe
53 import System.IO.Unsafe (unsafePerformIO)
54
55 -- -----------------------------------------------------------------------------
56
57 -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
58 -- directly to the first 256 Unicode code points, and is thus not a
59 -- complete Unicode encoding. An attempt to write a character greater than
60 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
61 latin1 :: TextEncoding
62 latin1 = Latin1.latin1_checked
63
64 -- | The UTF-8 Unicode encoding
65 utf8 :: TextEncoding
66 utf8 = UTF8.utf8
67
68 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
69 -- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
70 -- except that on input, the BOM sequence is ignored at the beginning
71 -- of the stream, and on output, the BOM sequence is prepended.
72 --
73 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
74 -- sometimes used to identify the encoding of a file.
75 --
76 utf8_bom :: TextEncoding
77 utf8_bom = UTF8.utf8_bom
78
79 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
80 -- indicate endianness).
81 utf16 :: TextEncoding
82 utf16 = UTF16.utf16
83
84 -- | The UTF-16 Unicode encoding (litte-endian)
85 utf16le :: TextEncoding
86 utf16le = UTF16.utf16le
87
88 -- | The UTF-16 Unicode encoding (big-endian)
89 utf16be :: TextEncoding
90 utf16be = UTF16.utf16be
91
92 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
93 -- indicate endianness).
94 utf32 :: TextEncoding
95 utf32 = UTF32.utf32
96
97 -- | The UTF-32 Unicode encoding (litte-endian)
98 utf32le :: TextEncoding
99 utf32le = UTF32.utf32le
100
101 -- | The UTF-32 Unicode encoding (big-endian)
102 utf32be :: TextEncoding
103 utf32be = UTF32.utf32be
104
105 -- | The Unicode encoding of the current locale
106 --
107 -- /Since: 4.5.0.0/
108 getLocaleEncoding :: IO TextEncoding
109
110 -- | The Unicode encoding of the current locale, but allowing arbitrary
111 -- undecodable bytes to be round-tripped through it.
112 --
113 -- This 'TextEncoding' is used to decode and encode command line arguments
114 -- and environment variables on non-Windows platforms.
115 --
116 -- On Windows, this encoding *should not* be used if possible because
117 -- the use of code pages is deprecated: Strings should be retrieved
118 -- via the "wide" W-family of UTF-16 APIs instead
119 --
120 -- /Since: 4.5.0.0/
121 getFileSystemEncoding :: IO TextEncoding
122
123 -- | The Unicode encoding of the current locale, but where undecodable
124 -- bytes are replaced with their closest visual match. Used for
125 -- the 'CString' marshalling functions in "Foreign.C.String"
126 --
127 -- /Since: 4.5.0.0/
128 getForeignEncoding :: IO TextEncoding
129
130 -- | /Since: 4.5.0.0/
131 setLocaleEncoding, setFileSystemEncoding, setForeignEncoding :: TextEncoding -> IO ()
132
133 (getLocaleEncoding, setLocaleEncoding) = mkGlobal initLocaleEncoding
134 (getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
135 (getForeignEncoding, setForeignEncoding) = mkGlobal initForeignEncoding
136
137 mkGlobal :: a -> (IO a, a -> IO ())
138 mkGlobal x = unsafePerformIO $ do
139 x_ref <- newIORef x
140 return (readIORef x_ref, writeIORef x_ref)
141
142 -- | /Since: 4.5.0.0/
143 initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
144
145 #if !defined(mingw32_HOST_OS)
146 -- It is rather important that we don't just call Iconv.mkIconvEncoding here
147 -- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
148 -- lone surrogates without complaint.
149 --
150 -- By going through our Haskell implementations of those encodings, we are
151 -- guaranteed to catch such errors.
152 --
153 -- FIXME: this is not a complete solution because if the locale encoding is one
154 -- which we don't have a Haskell-side decoder for, iconv might still ignore the
155 -- lone surrogate in the input.
156 initLocaleEncoding = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
157 initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure Iconv.localeEncodingName
158 initForeignEncoding = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure Iconv.localeEncodingName
159 #else
160 initLocaleEncoding = CodePage.localeEncoding
161 initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
162 initForeignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
163 #endif
164
165 -- | An encoding in which Unicode code points are translated to bytes
166 -- by taking the code point modulo 256. When decoding, bytes are
167 -- translated directly into the equivalent code point.
168 --
169 -- This encoding never fails in either direction. However, encoding
170 -- discards information, so encode followed by decode is not the
171 -- identity.
172 char8 :: TextEncoding
173 char8 = Latin1.latin1
174
175 -- | Look up the named Unicode encoding. May fail with
176 --
177 -- * 'isDoesNotExistError' if the encoding is unknown
178 --
179 -- The set of known encodings is system-dependent, but includes at least:
180 --
181 -- * @UTF-8@
182 --
183 -- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
184 --
185 -- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
186 --
187 -- There is additional notation (borrowed from GNU iconv) for specifying
188 -- how illegal characters are handled:
189 --
190 -- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
191 -- all illegal sequences on input to be ignored, and on output
192 -- will drop all code points that have no representation in the
193 -- target encoding.
194 --
195 -- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
196 -- for illegal sequences or code points.
197 --
198 -- * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
199 -- to represent any invalid bytes in the input as Unicode codepoints (specifically,
200 -- as lone surrogates, which are normally invalid in UTF-32).
201 -- Upon output, these special codepoints are detected and turned back into the
202 -- corresponding original byte.
203 --
204 -- In theory, this mechanism allows arbitrary data to be roundtripped via
205 -- a 'String' with no loss of data. In practice, there are two limitations
206 -- to be aware of:
207 --
208 -- 1. This only stands a chance of working for an encoding which is an ASCII
209 -- superset, as for security reasons we refuse to escape any bytes smaller
210 -- than 128. Many encodings of interest are ASCII supersets (in particular,
211 -- you can assume that the locale encoding is an ASCII superset) but many
212 -- (such as UTF-16) are not.
213 --
214 -- 2. If the underlying encoding is not itself roundtrippable, this mechanism
215 -- can fail. Roundtrippable encodings are those which have an injective mapping
216 -- into Unicode. Almost all encodings meet this criteria, but some do not. Notably,
217 -- Shift-JIS (CP932) and Big5 contain several different encodings of the same
218 -- Unicode codepoint.
219 --
220 -- On Windows, you can access supported code pages with the prefix
221 -- @CP@; for example, @\"CP1250\"@.
222 --
223 mkTextEncoding :: String -> IO TextEncoding
224 mkTextEncoding e = case mb_coding_failure_mode of
225 Nothing -> unknownEncodingErr e
226 Just cfm -> mkTextEncoding' cfm enc
227 where
228 (enc, suffix) = span (/= '/') e
229 mb_coding_failure_mode = case suffix of
230 "" -> Just ErrorOnCodingFailure
231 "//IGNORE" -> Just IgnoreCodingFailure
232 "//TRANSLIT" -> Just TransliterateCodingFailure
233 "//ROUNDTRIP" -> Just RoundtripFailure
234 _ -> Nothing
235
236 mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
237 mkTextEncoding' cfm enc = case [toUpper c | c <- enc, c /= '-'] of
238 "UTF8" -> return $ UTF8.mkUTF8 cfm
239 "UTF16" -> return $ UTF16.mkUTF16 cfm
240 "UTF16LE" -> return $ UTF16.mkUTF16le cfm
241 "UTF16BE" -> return $ UTF16.mkUTF16be cfm
242 "UTF32" -> return $ UTF32.mkUTF32 cfm
243 "UTF32LE" -> return $ UTF32.mkUTF32le cfm
244 "UTF32BE" -> return $ UTF32.mkUTF32be cfm
245 #if defined(mingw32_HOST_OS)
246 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
247 _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
248 #else
249 _ -> Iconv.mkIconvEncoding cfm enc
250 #endif
251
252 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
253 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
254 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
255
256 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
257 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
258 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
259
260 unknownEncodingErr :: String -> IO a
261 unknownEncodingErr e = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
262 ("unknown encoding:" ++ e) Nothing Nothing)