SafeHaskell: Added SafeHaskell to base
[ghc.git] / libraries / base / GHC / IO / Encoding / Failure.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : GHC.IO.Encoding.Failure
6 -- Copyright : (c) The University of Glasgow, 2008-2011
7 -- License : see libraries/base/LICENSE
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
12 --
13 -- Types for specifying how text encoding/decoding fails
14 --
15 -----------------------------------------------------------------------------
16
17 module GHC.IO.Encoding.Failure (
18 CodingFailureMode(..), codingFailureModeSuffix,
19 isSurrogate,
20 surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
21 recoverDecode, recoverEncode
22 ) where
23
24 import GHC.IO
25 import GHC.IO.Buffer
26 import GHC.IO.Exception
27
28 import GHC.Base
29 import GHC.Word
30 import GHC.Show
31 import GHC.Num
32 import GHC.Real ( fromIntegral )
33
34 --import System.Posix.Internals
35
36 import Data.Maybe
37
38 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
39 -- how they handle illegal sequences.
40 data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered
41 | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered
42 | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence
43 | RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
44 deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
45 -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
46 -- ASCII characters must be padded to two bytes to retain their meaning.
47
48 -- Note [Roundtripping]
49 -- ~~~~~~~~~~~~~~~~~~~~
50 --
51 -- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
52 -- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
53 -- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery.
54 --
55 -- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
56 -- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
57 -- chance to replace it with the byte we originally escaped.
58 --
59 -- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
60 -- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
61 -- we have to do the inverse process.
62 --
63 -- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
64 -- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
65
66 codingFailureModeSuffix :: CodingFailureMode -> String
67 codingFailureModeSuffix ErrorOnCodingFailure = ""
68 codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
69 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
70 codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
71
72 -- | In transliterate mode, we use this character when decoding unknown bytes.
73 --
74 -- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
75 unrepresentableChar :: Char
76 unrepresentableChar = '\xFFFD'
77
78 -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
79 -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
80 -- give valid Unicode.
81 --
82 -- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
83 -- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
84 isSurrogate :: Char -> Bool
85 isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
86 where x = ord c
87
88 -- | We use some private-use characters for roundtripping unknown bytes through a String
89 isRoundtripEscapeChar :: Char -> Bool
90 isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000
91 where x = ord c
92
93 -- | We use some surrogate characters for roundtripping unknown bytes through a String
94 isRoundtripEscapeSurrogateChar :: Char -> Bool
95 isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
96 where x = ord c
97
98 -- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
99 surrogatifyRoundtripCharacter :: Char -> Char
100 surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00)
101 | otherwise = c
102
103 -- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
104 desurrogatifyRoundtripCharacter :: Char -> Char
105 desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00)
106 | otherwise = c
107
108 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
109 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
110 escapeToRoundtripCharacterSurrogate b
111 | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
112 | otherwise = chr (0xDC00 + fromIntegral b)
113
114 -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
115 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
116 unescapeRoundtripCharacterSurrogate c
117 | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
118 | otherwise = Nothing
119 where x = ord c
120
121 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
122 recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
123 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
124 --puts $ "recoverDecode " ++ show ir
125 case cfm of
126 ErrorOnCodingFailure -> ioe_decodingError
127 IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
128 TransliterateCodingFailure -> do
129 ow' <- writeCharBuf oraw ow unrepresentableChar
130 return (input { bufL=ir+1 }, output { bufR=ow' })
131 RoundtripFailure -> do
132 b <- readWord8Buf iraw ir
133 ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
134 return (input { bufL=ir+1 }, output { bufR=ow' })
135
136 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
137 recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
138 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
139 (c,ir') <- readCharBuf iraw ir
140 --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
141 case cfm of
142 IgnoreCodingFailure -> return (input { bufL=ir' }, output)
143 TransliterateCodingFailure -> do
144 if c == '?'
145 then return (input { bufL=ir' }, output)
146 else do
147 -- XXX: evil hack! To implement transliteration, we just poke an
148 -- ASCII ? into the input buffer and tell the caller to try and decode
149 -- again. This is *probably* safe given current uses of TextEncoding.
150 --
151 -- The "if" test above ensures we skip if the encoding fails to deal with
152 -- the ?, though this should never happen in practice as all encodings are
153 -- in fact capable of reperesenting all ASCII characters.
154 _ir' <- writeCharBuf iraw ir '?'
155 return (input, output)
156
157 -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
158 -- encode a simple ASCII value
159 --writeWord8Buf oraw ow unrepresentableByte
160 --return (input { bufL=ir' }, output { bufR=ow+1 })
161 RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
162 writeWord8Buf oraw ow x
163 return (input { bufL=ir' }, output { bufR=ow+1 })
164 _ -> ioe_encodingError
165
166 ioe_decodingError :: IO a
167 ioe_decodingError = ioException
168 (IOError Nothing InvalidArgument "recoverDecode"
169 "invalid byte sequence" Nothing Nothing)
170
171 ioe_encodingError :: IO a
172 ioe_encodingError = ioException
173 (IOError Nothing InvalidArgument "recoverEncode"
174 "invalid character" Nothing Nothing)