061bd60c209a9868a846f0497160c243e7ee12dc
[ghc.git] / libraries / base / GHC / IO / Encoding / Iconv.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP
3 , NoImplicitPrelude
4 , NondecreasingIndentation
5 #-}
6 {-# OPTIONS_HADDOCK hide #-}
7
8 -----------------------------------------------------------------------------
9 -- |
10 -- Module : GHC.IO.Encoding.Iconv
11 -- Copyright : (c) The University of Glasgow, 2008-2009
12 -- License : see libraries/base/LICENSE
13 --
14 -- Maintainer : libraries@haskell.org
15 -- Stability : internal
16 -- Portability : non-portable
17 --
18 -- This module provides text encoding/decoding using iconv
19 --
20 -----------------------------------------------------------------------------
21
22 module GHC.IO.Encoding.Iconv (
23 #if !defined(mingw32_HOST_OS)
24 iconvEncoding, mkIconvEncoding,
25 localeEncodingName
26 #endif
27 ) where
28
29 #include "MachDeps.h"
30 #include "HsBaseConfig.h"
31
32 #if defined(mingw32_HOST_OS)
33 import GHC.Base () -- For build ordering
34 #else
35
36 import Foreign
37 import Foreign.C hiding (charIsRepresentable)
38 import Data.Maybe
39 import GHC.Base
40 import GHC.Foreign (charIsRepresentable)
41 import GHC.IO.Buffer
42 import GHC.IO.Encoding.Failure
43 import GHC.IO.Encoding.Types
44 import GHC.List (span)
45 import GHC.Num
46 import GHC.Show
47 import GHC.Real
48 import System.IO.Unsafe (unsafePerformIO)
49 import System.Posix.Internals
50
51 c_DEBUG_DUMP :: Bool
52 c_DEBUG_DUMP = False
53
54 iconv_trace :: String -> IO ()
55 iconv_trace s
56 | c_DEBUG_DUMP = puts s
57 | otherwise = return ()
58
59 -- -----------------------------------------------------------------------------
60 -- iconv encoders/decoders
61
62 {-# NOINLINE localeEncodingName #-}
63 localeEncodingName :: String
64 localeEncodingName = unsafePerformIO $ do
65 -- Use locale_charset() or nl_langinfo(CODESET) to get the encoding
66 -- if we have either of them.
67 cstr <- c_localeEncoding
68 peekCAString cstr -- Assume charset names are ASCII
69
70 -- We hope iconv_t is a storable type. It should be, since it has at least the
71 -- value -1, which is a possible return value from iconv_open.
72 type IConv = CLong -- ToDo: (#type iconv_t)
73
74 foreign import ccall unsafe "hs_iconv_open"
75 hs_iconv_open :: CString -> CString -> IO IConv
76
77 foreign import ccall unsafe "hs_iconv_close"
78 hs_iconv_close :: IConv -> IO CInt
79
80 foreign import ccall unsafe "hs_iconv"
81 hs_iconv :: IConv -> Ptr CString -> Ptr CSize -> Ptr CString -> Ptr CSize
82 -> IO CSize
83
84 foreign import ccall unsafe "localeEncoding"
85 c_localeEncoding :: IO CString
86
87 haskellChar :: String
88 #ifdef WORDS_BIGENDIAN
89 haskellChar | charSize == 2 = "UTF-16BE"
90 | otherwise = "UTF-32BE"
91 #else
92 haskellChar | charSize == 2 = "UTF-16LE"
93 | otherwise = "UTF-32LE"
94 #endif
95
96 char_shift :: Int
97 char_shift | charSize == 2 = 1
98 | otherwise = 2
99
100 iconvEncoding :: String -> IO (Maybe TextEncoding)
101 iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
102
103 -- | Construct an iconv-based 'TextEncoding' for the given character set and
104 -- 'CodingFailureMode'.
105 --
106 -- As iconv is missing in some minimal environments (e.g. #10298), this
107 -- checks to ensure that iconv is working properly before returning the
108 -- encoding, returning 'Nothing' if not.
109 mkIconvEncoding :: CodingFailureMode -> String -> IO (Maybe TextEncoding)
110 mkIconvEncoding cfm charset = do
111 let enc = TextEncoding {
112 textEncodingName = charset,
113 mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix)
114 (recoverDecode cfm) iconvDecode,
115 mkTextEncoder = newIConv haskellChar charset
116 (recoverEncode cfm) iconvEncode}
117 good <- charIsRepresentable enc 'a'
118 return $ if good
119 then Just enc
120 else Nothing
121 where
122 -- An annoying feature of GNU iconv is that the //PREFIXES only take
123 -- effect when they appear on the tocode parameter to iconv_open:
124 (raw_charset, suffix) = span (/= '/') charset
125
126 newIConv :: String -> String
127 -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
128 -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
129 -> IO (BufferCodec a b ())
130 newIConv from to rec fn =
131 -- Assume charset names are ASCII
132 withCAString from $ \ from_str ->
133 withCAString to $ \ to_str -> do
134 iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str
135 let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
136 return BufferCodec{
137 encode = fn iconvt,
138 recover = rec,
139 close = iclose,
140 -- iconv doesn't supply a way to save/restore the state
141 getState = return (),
142 setState = const $ return ()
143 }
144
145 iconvDecode :: IConv -> DecodeBuffer
146 iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
147
148 iconvEncode :: IConv -> EncodeBuffer
149 iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
150
151 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
152 -> IO (CodingProgress, Buffer a, Buffer b)
153 iconvRecode iconv_t
154 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
155 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
156 = do
157 iconv_trace ("haskellChar=" ++ show haskellChar)
158 iconv_trace ("iconvRecode before, input=" ++ show (summaryBuffer input))
159 iconv_trace ("iconvRecode before, output=" ++ show (summaryBuffer output))
160 withRawBuffer iraw $ \ piraw -> do
161 withRawBuffer oraw $ \ poraw -> do
162 with (piraw `plusPtr` (ir `shiftL` iscale)) $ \ p_inbuf -> do
163 with (poraw `plusPtr` (ow `shiftL` oscale)) $ \ p_outbuf -> do
164 with (fromIntegral ((iw-ir) `shiftL` iscale)) $ \ p_inleft -> do
165 with (fromIntegral ((os-ow) `shiftL` oscale)) $ \ p_outleft -> do
166 res <- hs_iconv iconv_t p_inbuf p_inleft p_outbuf p_outleft
167 new_inleft <- peek p_inleft
168 new_outleft <- peek p_outleft
169 let
170 new_inleft' = fromIntegral new_inleft `shiftR` iscale
171 new_outleft' = fromIntegral new_outleft `shiftR` oscale
172 new_input
173 | new_inleft == 0 = input { bufL = 0, bufR = 0 }
174 | otherwise = input { bufL = iw - new_inleft' }
175 new_output = output{ bufR = os - new_outleft' }
176 iconv_trace ("iconv res=" ++ show res)
177 iconv_trace ("iconvRecode after, input=" ++ show (summaryBuffer new_input))
178 iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
179 if (res /= -1)
180 then do -- all input translated
181 return (InputUnderflow, new_input, new_output)
182 else do
183 errno <- getErrno
184 case errno of
185 e | e == e2BIG -> return (OutputUnderflow, new_input, new_output)
186 | e == eINVAL -> return (InputUnderflow, new_input, new_output)
187 -- Sometimes iconv reports EILSEQ for a
188 -- character in the input even when there is no room
189 -- in the output; in this case we might be about to
190 -- change the encoding anyway, so the following bytes
191 -- could very well be in a different encoding.
192 --
193 -- Because we can only say InvalidSequence if there is at least
194 -- one element left in the output, we have to special case this.
195 | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
196 | otherwise -> do
197 iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
198 throwErrno "iconvRecoder"
199
200 #endif /* !mingw32_HOST_OS */
201