8cee4b3ff79fb7374e0828acd71e11141145ac6c
[packages/base.git] / GHC / IO / Encoding / Failure.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : GHC.IO.Encoding.Failure
6 -- Copyright : (c) The University of Glasgow, 2008-2011
7 -- License : see libraries/base/LICENSE
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
12 --
13 -- Types for specifying how text encoding/decoding fails
14 --
15 -----------------------------------------------------------------------------
16
17 module GHC.IO.Encoding.Failure (
18 CodingFailureMode(..), codingFailureModeSuffix,
19 isSurrogate,
20 surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
21 recoverDecode, recoverEncode
22 ) where
23
24 import GHC.IO
25 import GHC.IO.Buffer
26 import GHC.IO.Exception
27
28 import GHC.Base
29 import GHC.Word
30 import GHC.Show
31 import GHC.Num
32 import GHC.Real ( fromIntegral )
33
34 --import System.Posix.Internals
35
36 import Data.Maybe
37
38
39 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and
40 -- specifies how they handle illegal sequences.
41 data CodingFailureMode
42 = ErrorOnCodingFailure
43 -- ^ Throw an error when an illegal sequence is encountered
44 | IgnoreCodingFailure
45 -- ^ Attempt to ignore and recover if an illegal sequence is
46 -- encountered
47 | TransliterateCodingFailure
48 -- ^ Replace with the closest visual match upon an illegal
49 -- sequence
50 | RoundtripFailure
51 -- ^ Use the private-use escape mechanism to attempt to allow
52 -- illegal sequences to be roundtripped.
53 deriving (Show)
54 -- This will only work properly for those encodings which are
55 -- strict supersets of ASCII in the sense that valid ASCII data
56 -- is also valid in that encoding. This is not true for
57 -- e.g. UTF-16, because ASCII characters must be padded to two
58 -- bytes to retain their meaning.
59
60 -- Note [Roundtripping]
61 -- ~~~~~~~~~~~~~~~~~~~~
62 --
63 -- Roundtripping is based on the ideas of PEP383. However, unlike
64 -- PEP383 we do not wish to use lone surrogate codepoints to escape
65 -- undecodable bytes, because that may confuse Unicode processing
66 -- software written in Haskell. Instead, we use the range of
67 -- private-use characters from 0xEF80 to 0xEFFF designated for
68 -- "encoding hacks" by the ConScript Unicode Registery.
69 --
70 -- This introduces a technical problem when it comes to encoding back
71 -- to bytes using iconv. The iconv code will not fail when it tries to
72 -- encode a private-use character (as it would if trying to encode a
73 -- surrogate), which means that we won't get a chance to replace it
74 -- with the byte we originally escaped.
75 --
76 -- To work around this, when filling the buffer to be encoded (in
77 -- writeBlocks/withEncodedCString/newEncodedCString), we replace the
78 -- private-use characters with lone surrogates again! Likewise, when
79 -- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have
80 -- to do the inverse process.
81 --
82 -- The user of String should never see these lone surrogates, but it
83 -- ensures that iconv will throw an error when encountering them. We
84 -- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
85
86 codingFailureModeSuffix :: CodingFailureMode -> String
87 codingFailureModeSuffix ErrorOnCodingFailure = ""
88 codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
89 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
90 codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
91
92 -- | In transliterate mode, we use this character when decoding
93 -- unknown bytes.
94 --
95 -- This is the defined Unicode replacement character:
96 -- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
97 unrepresentableChar :: Char
98 unrepresentableChar = '\xFFFD'
99
100 -- It is extraordinarily important that this series of
101 -- predicates/transformers gets inlined, because they tend to be used
102 -- in inner loops related to text encoding. In particular,
103 -- surrogatifyRoundtripCharacter must be inlined (see #5536)
104
105 -- | Some characters are actually "surrogate" codepoints defined for
106 -- use in UTF-16. We need to signal an invalid character if we detect
107 -- them when encoding a sequence of 'Char's into 'Word8's because they
108 -- won't give valid Unicode.
109 --
110 -- We may also need to signal an invalid character if we detect them
111 -- when encoding a sequence of 'Char's into 'Word8's because the
112 -- 'RoundtripFailure' mode creates these to round-trip bytes through
113 -- our internal UTF-16 encoding.
114 {-# INLINE isSurrogate #-}
115 isSurrogate :: Char -> Bool
116 isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
117 || (0xDC00 <= x && x <= 0xDFFF)
118 where x = ord c
119
120 -- | Private use characters (in Strings) --> lone surrogates (in
121 -- Buffer CharBufElem) (We use some private-use characters for
122 -- roundtripping unknown bytes through a String)
123 {-# INLINE surrogatifyRoundtripCharacter #-}
124 surrogatifyRoundtripCharacter :: Char -> Char
125 surrogatifyRoundtripCharacter c
126 | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00))
127 | otherwise = c
128 where x = ord c
129
130 -- | Lone surrogates (in Buffer CharBufElem) --> private use
131 -- characters (in Strings) (We use some surrogate characters for
132 -- roundtripping unknown bytes through a String)
133 {-# INLINE desurrogatifyRoundtripCharacter #-}
134 desurrogatifyRoundtripCharacter :: Char -> Char
135 desurrogatifyRoundtripCharacter c
136 | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00))
137 | otherwise = c
138 where x = ord c
139
140 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
141 {-# INLINE escapeToRoundtripCharacterSurrogate #-}
142 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
143 escapeToRoundtripCharacterSurrogate b
144 | b < 128 = chr (fromIntegral b)
145 -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
146 -- work, this assumes encoding is ASCII-superset.
147 | otherwise = chr (0xDC00 + fromIntegral b)
148
149 -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
150 {-# INLINE unescapeRoundtripCharacterSurrogate #-}
151 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
152 unescapeRoundtripCharacterSurrogate c
153 | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
154 | otherwise = Nothing
155 where x = ord c
156
157 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
158 -> IO (Buffer Word8, Buffer Char)
159 recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
160 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
161 --puts $ "recoverDecode " ++ show ir
162 case cfm of
163 ErrorOnCodingFailure -> ioe_decodingError
164 IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
165 TransliterateCodingFailure -> do
166 ow' <- writeCharBuf oraw ow unrepresentableChar
167 return (input { bufL=ir+1 }, output { bufR=ow' })
168 RoundtripFailure -> do
169 b <- readWord8Buf iraw ir
170 ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
171 return (input { bufL=ir+1 }, output { bufR=ow' })
172
173 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
174 -> IO (Buffer Char, Buffer Word8)
175 recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
176 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
177 (c,ir') <- readCharBuf iraw ir
178 --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
179 case cfm of
180 IgnoreCodingFailure -> return (input { bufL=ir' }, output)
181 TransliterateCodingFailure -> do
182 if c == '?'
183 then return (input { bufL=ir' }, output)
184 else do
185 -- XXX: evil hack! To implement transliteration, we just
186 -- poke an ASCII ? into the input buffer and tell the caller
187 -- to try and decode again. This is *probably* safe given
188 -- current uses of TextEncoding.
189 --
190 -- The "if" test above ensures we skip if the encoding fails
191 -- to deal with the ?, though this should never happen in
192 -- practice as all encodings are in fact capable of
193 -- reperesenting all ASCII characters.
194 _ir' <- writeCharBuf iraw ir '?'
195 return (input, output)
196
197 -- This implementation does not work because e.g. UTF-16
198 -- requires 2 bytes to encode a simple ASCII value
199 --writeWord8Buf oraw ow unrepresentableByte
200 --return (input { bufL=ir' }, output { bufR=ow+1 })
201 RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
202 writeWord8Buf oraw ow x
203 return (input { bufL=ir' }, output { bufR=ow+1 })
204 _ -> ioe_encodingError
205
206 ioe_decodingError :: IO a
207 ioe_decodingError = ioException
208 (IOError Nothing InvalidArgument "recoverDecode"
209 "invalid byte sequence" Nothing Nothing)
210
211 ioe_encodingError :: IO a
212 ioe_encodingError = ioException
213 (IOError Nothing InvalidArgument "recoverEncode"
214 "invalid character" Nothing Nothing)