d91907132c34d17decc8d920de0613c1cf8d836b
[packages/base.git] / GHC / IO / Encoding / Iconv.hs
1 {-# LANGUAGE CPP
2 , NoImplicitPrelude
3 , ForeignFunctionInterface
4 , NondecreasingIndentation
5 #-}
6
7 -----------------------------------------------------------------------------
8 -- |
9 -- Module : GHC.IO.Encoding.Iconv
10 -- Copyright : (c) The University of Glasgow, 2008-2009
11 -- License : see libraries/base/LICENSE
12 --
13 -- Maintainer : libraries@haskell.org
14 -- Stability : internal
15 -- Portability : non-portable
16 --
17 -- This module provides text encoding/decoding using iconv
18 --
19 -----------------------------------------------------------------------------
20
21 -- #hide
22 module GHC.IO.Encoding.Iconv (
23 #if !defined(mingw32_HOST_OS)
24 iconvEncoding, mkIconvEncoding,
25 localeEncoding, mkLocaleEncoding
26 #endif
27 ) where
28
29 #include "MachDeps.h"
30 #include "HsBaseConfig.h"
31
32 #if !defined(mingw32_HOST_OS)
33
34 import Foreign hiding (unsafePerformIO)
35 import Foreign.C
36 import Data.Maybe
37 import GHC.Base
38 import GHC.IO.Buffer
39 import GHC.IO.Encoding.Failure
40 import GHC.IO.Encoding.Types
41 import GHC.List (span)
42 import GHC.Num
43 import GHC.Show
44 import GHC.Real
45 import System.IO.Unsafe (unsafePerformIO)
46 import System.Posix.Internals
47
48 c_DEBUG_DUMP :: Bool
49 c_DEBUG_DUMP = False
50
51 iconv_trace :: String -> IO ()
52 iconv_trace s
53 | c_DEBUG_DUMP = puts s
54 | otherwise = return ()
55
56 -- -----------------------------------------------------------------------------
57 -- iconv encoders/decoders
58
59 {-# NOINLINE localeEncodingName #-}
60 localeEncodingName :: String
61 localeEncodingName = unsafePerformIO $ do
62 -- Use locale_charset() or nl_langinfo(CODESET) to get the encoding
63 -- if we have either of them.
64 cstr <- c_localeEncoding
65 peekCAString cstr -- Assume charset names are ASCII
66
67 localeEncoding :: TextEncoding
68 localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
69
70 mkLocaleEncoding :: CodingFailureMode -> TextEncoding
71 mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
72
73 -- We hope iconv_t is a storable type. It should be, since it has at least the
74 -- value -1, which is a possible return value from iconv_open.
75 type IConv = CLong -- ToDo: (#type iconv_t)
76
77 foreign import ccall unsafe "hs_iconv_open"
78 hs_iconv_open :: CString -> CString -> IO IConv
79
80 foreign import ccall unsafe "hs_iconv_close"
81 hs_iconv_close :: IConv -> IO CInt
82
83 foreign import ccall unsafe "hs_iconv"
84 hs_iconv :: IConv -> Ptr CString -> Ptr CSize -> Ptr CString -> Ptr CSize
85 -> IO CSize
86
87 foreign import ccall unsafe "localeEncoding"
88 c_localeEncoding :: IO CString
89
90 haskellChar :: String
91 #ifdef WORDS_BIGENDIAN
92 haskellChar | charSize == 2 = "UTF-16BE"
93 | otherwise = "UTF-32BE"
94 #else
95 haskellChar | charSize == 2 = "UTF-16LE"
96 | otherwise = "UTF-32LE"
97 #endif
98
99 char_shift :: Int
100 char_shift | charSize == 2 = 1
101 | otherwise = 2
102
103 iconvEncoding :: String -> IO TextEncoding
104 iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
105
106 mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
107 mkIconvEncoding cfm charset = do
108 return (TextEncoding {
109 textEncodingName = charset,
110 mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
111 mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode})
112 where
113 -- An annoying feature of GNU iconv is that the //PREFIXES only take
114 -- effect when they appear on the tocode parameter to iconv_open:
115 (raw_charset, suffix) = span (/= '/') charset
116
117 newIConv :: String -> String
118 -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
119 -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
120 -> IO (BufferCodec a b ())
121 newIConv from to rec fn =
122 -- Assume charset names are ASCII
123 withCAString from $ \ from_str ->
124 withCAString to $ \ to_str -> do
125 iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str
126 let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
127 return BufferCodec{
128 encode = fn iconvt,
129 recover = rec,
130 close = iclose,
131 -- iconv doesn't supply a way to save/restore the state
132 getState = return (),
133 setState = const $ return ()
134 }
135
136 iconvDecode :: IConv -> DecodeBuffer
137 iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
138
139 iconvEncode :: IConv -> EncodeBuffer
140 iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
141
142 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
143 -> IO (CodingProgress, Buffer a, Buffer b)
144 iconvRecode iconv_t
145 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
146 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
147 = do
148 iconv_trace ("haskelChar=" ++ show haskellChar)
149 iconv_trace ("iconvRecode before, input=" ++ show (summaryBuffer input))
150 iconv_trace ("iconvRecode before, output=" ++ show (summaryBuffer output))
151 withRawBuffer iraw $ \ piraw -> do
152 withRawBuffer oraw $ \ poraw -> do
153 with (piraw `plusPtr` (ir `shiftL` iscale)) $ \ p_inbuf -> do
154 with (poraw `plusPtr` (ow `shiftL` oscale)) $ \ p_outbuf -> do
155 with (fromIntegral ((iw-ir) `shiftL` iscale)) $ \ p_inleft -> do
156 with (fromIntegral ((os-ow) `shiftL` oscale)) $ \ p_outleft -> do
157 res <- hs_iconv iconv_t p_inbuf p_inleft p_outbuf p_outleft
158 new_inleft <- peek p_inleft
159 new_outleft <- peek p_outleft
160 let
161 new_inleft' = fromIntegral new_inleft `shiftR` iscale
162 new_outleft' = fromIntegral new_outleft `shiftR` oscale
163 new_input
164 | new_inleft == 0 = input { bufL = 0, bufR = 0 }
165 | otherwise = input { bufL = iw - new_inleft' }
166 new_output = output{ bufR = os - new_outleft' }
167 iconv_trace ("iconv res=" ++ show res)
168 iconv_trace ("iconvRecode after, input=" ++ show (summaryBuffer new_input))
169 iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
170 if (res /= -1)
171 then do -- all input translated
172 return (InputUnderflow, new_input, new_output)
173 else do
174 errno <- getErrno
175 case errno of
176 e | e == e2BIG -> return (OutputUnderflow, new_input, new_output)
177 | e == eINVAL -> return (InputUnderflow, new_input, new_output)
178 -- Sometimes iconv reports EILSEQ for a
179 -- character in the input even when there is no room
180 -- in the output; in this case we might be about to
181 -- change the encoding anyway, so the following bytes
182 -- could very well be in a different encoding.
183 --
184 -- Because we can only say InvalidSequence if there is at least
185 -- one element left in the output, we have to special case this.
186 | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
187 | otherwise -> do
188 iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
189 throwErrno "iconvRecoder"
190
191 #endif /* !mingw32_HOST_OS */