Update base for latest Safe Haskell.
[packages/base.git] / GHC / IO / Encoding / Failure.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : GHC.IO.Encoding.Failure
7 -- Copyright : (c) The University of Glasgow, 2008-2011
8 -- License : see libraries/base/LICENSE
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : internal
12 -- Portability : non-portable
13 --
14 -- Types for specifying how text encoding/decoding fails
15 --
16 -----------------------------------------------------------------------------
17
18 module GHC.IO.Encoding.Failure (
19 CodingFailureMode(..), codingFailureModeSuffix,
20 isSurrogate,
21 surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
22 recoverDecode, recoverEncode
23 ) where
24
25 import GHC.IO
26 import GHC.IO.Buffer
27 import GHC.IO.Exception
28
29 import GHC.Base
30 import GHC.Word
31 import GHC.Show
32 import GHC.Num
33 import GHC.Real ( fromIntegral )
34
35 --import System.Posix.Internals
36
37 import Data.Maybe
38
39
40 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and
41 -- specifies how they handle illegal sequences.
42 data CodingFailureMode
43 = ErrorOnCodingFailure
44 -- ^ Throw an error when an illegal sequence is encountered
45 | IgnoreCodingFailure
46 -- ^ Attempt to ignore and recover if an illegal sequence is
47 -- encountered
48 | TransliterateCodingFailure
49 -- ^ Replace with the closest visual match upon an illegal
50 -- sequence
51 | RoundtripFailure
52 -- ^ Use the private-use escape mechanism to attempt to allow
53 -- illegal sequences to be roundtripped.
54 deriving (Show)
55 -- This will only work properly for those encodings which are
56 -- strict supersets of ASCII in the sense that valid ASCII data
57 -- is also valid in that encoding. This is not true for
58 -- e.g. UTF-16, because ASCII characters must be padded to two
59 -- bytes to retain their meaning.
60
61 -- Note [Roundtripping]
62 -- ~~~~~~~~~~~~~~~~~~~~
63 --
64 -- Roundtripping is based on the ideas of PEP383. However, unlike
65 -- PEP383 we do not wish to use lone surrogate codepoints to escape
66 -- undecodable bytes, because that may confuse Unicode processing
67 -- software written in Haskell. Instead, we use the range of
68 -- private-use characters from 0xEF80 to 0xEFFF designated for
69 -- "encoding hacks" by the ConScript Unicode Registery.
70 --
71 -- This introduces a technical problem when it comes to encoding back
72 -- to bytes using iconv. The iconv code will not fail when it tries to
73 -- encode a private-use character (as it would if trying to encode a
74 -- surrogate), which means that we won't get a chance to replace it
75 -- with the byte we originally escaped.
76 --
77 -- To work around this, when filling the buffer to be encoded (in
78 -- writeBlocks/withEncodedCString/newEncodedCString), we replace the
79 -- private-use characters with lone surrogates again! Likewise, when
80 -- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have
81 -- to do the inverse process.
82 --
83 -- The user of String should never see these lone surrogates, but it
84 -- ensures that iconv will throw an error when encountering them. We
85 -- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
86
87 codingFailureModeSuffix :: CodingFailureMode -> String
88 codingFailureModeSuffix ErrorOnCodingFailure = ""
89 codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
90 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
91 codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
92
93 -- | In transliterate mode, we use this character when decoding
94 -- unknown bytes.
95 --
96 -- This is the defined Unicode replacement character:
97 -- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
98 unrepresentableChar :: Char
99 unrepresentableChar = '\xFFFD'
100
101 -- It is extraordinarily important that this series of
102 -- predicates/transformers gets inlined, because they tend to be used
103 -- in inner loops related to text encoding. In particular,
104 -- surrogatifyRoundtripCharacter must be inlined (see #5536)
105
106 -- | Some characters are actually "surrogate" codepoints defined for
107 -- use in UTF-16. We need to signal an invalid character if we detect
108 -- them when encoding a sequence of 'Char's into 'Word8's because they
109 -- won't give valid Unicode.
110 --
111 -- We may also need to signal an invalid character if we detect them
112 -- when encoding a sequence of 'Char's into 'Word8's because the
113 -- 'RoundtripFailure' mode creates these to round-trip bytes through
114 -- our internal UTF-16 encoding.
115 {-# INLINE isSurrogate #-}
116 isSurrogate :: Char -> Bool
117 isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
118 || (0xDC00 <= x && x <= 0xDFFF)
119 where x = ord c
120
121 -- | Private use characters (in Strings) --> lone surrogates (in
122 -- Buffer CharBufElem) (We use some private-use characters for
123 -- roundtripping unknown bytes through a String)
124 {-# INLINE surrogatifyRoundtripCharacter #-}
125 surrogatifyRoundtripCharacter :: Char -> Char
126 surrogatifyRoundtripCharacter c
127 | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00))
128 | otherwise = c
129 where x = ord c
130
131 -- | Lone surrogates (in Buffer CharBufElem) --> private use
132 -- characters (in Strings) (We use some surrogate characters for
133 -- roundtripping unknown bytes through a String)
134 {-# INLINE desurrogatifyRoundtripCharacter #-}
135 desurrogatifyRoundtripCharacter :: Char -> Char
136 desurrogatifyRoundtripCharacter c
137 | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00))
138 | otherwise = c
139 where x = ord c
140
141 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
142 {-# INLINE escapeToRoundtripCharacterSurrogate #-}
143 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
144 escapeToRoundtripCharacterSurrogate b
145 | b < 128 = chr (fromIntegral b)
146 -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
147 -- work, this assumes encoding is ASCII-superset.
148 | otherwise = chr (0xDC00 + fromIntegral b)
149
150 -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
151 {-# INLINE unescapeRoundtripCharacterSurrogate #-}
152 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
153 unescapeRoundtripCharacterSurrogate c
154 | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
155 | otherwise = Nothing
156 where x = ord c
157
158 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
159 -> IO (Buffer Word8, Buffer Char)
160 recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
161 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
162 --puts $ "recoverDecode " ++ show ir
163 case cfm of
164 ErrorOnCodingFailure -> ioe_decodingError
165 IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
166 TransliterateCodingFailure -> do
167 ow' <- writeCharBuf oraw ow unrepresentableChar
168 return (input { bufL=ir+1 }, output { bufR=ow' })
169 RoundtripFailure -> do
170 b <- readWord8Buf iraw ir
171 ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
172 return (input { bufL=ir+1 }, output { bufR=ow' })
173
174 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
175 -> IO (Buffer Char, Buffer Word8)
176 recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
177 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
178 (c,ir') <- readCharBuf iraw ir
179 --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
180 case cfm of
181 IgnoreCodingFailure -> return (input { bufL=ir' }, output)
182 TransliterateCodingFailure -> do
183 if c == '?'
184 then return (input { bufL=ir' }, output)
185 else do
186 -- XXX: evil hack! To implement transliteration, we just
187 -- poke an ASCII ? into the input buffer and tell the caller
188 -- to try and decode again. This is *probably* safe given
189 -- current uses of TextEncoding.
190 --
191 -- The "if" test above ensures we skip if the encoding fails
192 -- to deal with the ?, though this should never happen in
193 -- practice as all encodings are in fact capable of
194 -- reperesenting all ASCII characters.
195 _ir' <- writeCharBuf iraw ir '?'
196 return (input, output)
197
198 -- This implementation does not work because e.g. UTF-16
199 -- requires 2 bytes to encode a simple ASCII value
200 --writeWord8Buf oraw ow unrepresentableByte
201 --return (input { bufL=ir' }, output { bufR=ow+1 })
202 RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
203 writeWord8Buf oraw ow x
204 return (input { bufL=ir' }, output { bufR=ow+1 })
205 _ -> ioe_encodingError
206
207 ioe_decodingError :: IO a
208 ioe_decodingError = ioException
209 (IOError Nothing InvalidArgument "recoverDecode"
210 "invalid byte sequence" Nothing Nothing)
211
212 ioe_encodingError :: IO a
213 ioe_encodingError = ioException
214 (IOError Nothing InvalidArgument "recoverEncode"
215 "invalid character" Nothing Nothing)
216