Fix parsing of RealFloat with huge exponents (#7034).
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..) -- :: *; Show, Eq
21
22 , numberToInteger, numberToRational, numberToRangedRational
23
24 -- lexer
25 , lex -- :: ReadP Lexeme Skips leading spaces
26 , hsLex -- :: ReadP String
27 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
28
29 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
30 , readOctP -- :: Num a => ReadP a
31 , readDecP -- :: Num a => ReadP a
32 , readHexP -- :: Num a => ReadP a
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 #ifdef __GLASGOW_HASKELL__
39 import GHC.Base
40 import GHC.Char
41 import GHC.Num( Num(..), Integer )
42 import GHC.Show( Show(..) )
43 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
44 import GHC.Real( Integral, Rational, (%), fromIntegral,
45 toInteger, (^) )
46 import GHC.List
47 import GHC.Enum( minBound, maxBound )
48 #else
49 import Prelude hiding ( lex )
50 import Data.Char( chr, ord, isSpace, isAlpha, isAlphaNum )
51 import Data.Ratio( Ratio, (%) )
52 #endif
53 #ifdef __HUGS__
54 import Hugs.Prelude( Ratio(..) )
55 #endif
56 import Data.Maybe
57 import Control.Monad
58
59 -- -----------------------------------------------------------------------------
60 -- Lexing types
61
62 -- ^ Haskell lexemes.
63 data Lexeme
64 = Char Char -- ^ Character literal
65 | String String -- ^ String literal, with escapes interpreted
66 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
67 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
68 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
69 | Number Number
70 | EOF
71 deriving (Eq, Show)
72
73 data Number = MkNumber Int -- Base
74 Digits -- Integral part
75 | MkDecimal Digits -- Integral part
76 (Maybe Digits) -- Fractional part
77 (Maybe Integer) -- Exponent
78 deriving (Eq, Show)
79
80 numberToInteger :: Number -> Maybe Integer
81 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
82 numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart)
83 numberToInteger _ = Nothing
84
85 -- This takes a floatRange, and if the Rational would be outside of
86 -- the floatRange then it may return Nothing. Not that it will not
87 -- /necessarily/ return Nothing, but it is good enough to fix the
88 -- space problems in #5688
89 -- Ways this is conservative:
90 -- * the floatRange is in base 2, but we pretend it is in base 10
91 -- * we pad the floateRange a bit, just in case it is very small
92 -- and we would otherwise hit an edge case
93 -- * We only worry about numbers that have an exponent. If they don't
94 -- have an exponent then the Rational won't be much larger than the
95 -- Number, so there is no problem
96 numberToRangedRational :: (Int, Int) -> Number
97 -> Maybe Rational -- Nothing = Inf
98 numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
99 -- if exp is out of integer bounds,
100 -- then the number is definitely out of range
101 | exp > fromIntegral (maxBound :: Int) ||
102 exp < fromIntegral (minBound :: Int)
103 = Nothing
104 | otherwise
105 = let mFirstDigit = case dropWhile (0 ==) iPart of
106 iPart'@(_ : _) -> Just (length iPart')
107 [] -> case mFPart of
108 Nothing -> Nothing
109 Just fPart ->
110 case span (0 ==) fPart of
111 (_, []) -> Nothing
112 (zeroes, _) ->
113 Just (negate (length zeroes))
114 in case mFirstDigit of
115 Nothing -> Just 0
116 Just firstDigit ->
117 let firstDigit' = firstDigit + fromInteger exp
118 in if firstDigit' > (pos + 3)
119 then Nothing
120 else if firstDigit' < (neg - 3)
121 then Just 0
122 else Just (numberToRational n)
123 numberToRangedRational _ n = Just (numberToRational n)
124
125 numberToRational :: Number -> Rational
126 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
127 numberToRational (MkDecimal iPart mFPart mExp)
128 = let i = val 10 0 iPart
129 in case (mFPart, mExp) of
130 (Nothing, Nothing) -> i % 1
131 (Nothing, Just exp)
132 | exp >= 0 -> (i * (10 ^ exp)) % 1
133 | otherwise -> i % (10 ^ (- exp))
134 (Just fPart, Nothing) -> fracExp 0 i fPart
135 (Just fPart, Just exp) -> fracExp exp i fPart
136 -- fracExp is a bit more efficient in calculating the Rational.
137 -- Instead of calculating the fractional part alone, then
138 -- adding the integral part and finally multiplying with
139 -- 10 ^ exp if an exponent was given, do it all at once.
140
141 -- -----------------------------------------------------------------------------
142 -- Lexing
143
144 lex :: ReadP Lexeme
145 lex = skipSpaces >> lexToken
146
147 hsLex :: ReadP String
148 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
149 hsLex = do skipSpaces
150 (s,_) <- gather lexToken
151 return s
152
153 lexToken :: ReadP Lexeme
154 lexToken = lexEOF +++
155 lexLitChar +++
156 lexString +++
157 lexPunc +++
158 lexSymbol +++
159 lexId +++
160 lexNumber
161
162
163 -- ----------------------------------------------------------------------
164 -- End of file
165 lexEOF :: ReadP Lexeme
166 lexEOF = do s <- look
167 guard (null s)
168 return EOF
169
170 -- ---------------------------------------------------------------------------
171 -- Single character lexemes
172
173 lexPunc :: ReadP Lexeme
174 lexPunc =
175 do c <- satisfy isPuncChar
176 return (Punc [c])
177 where
178 isPuncChar c = c `elem` ",;()[]{}`"
179
180 -- ----------------------------------------------------------------------
181 -- Symbols
182
183 lexSymbol :: ReadP Lexeme
184 lexSymbol =
185 do s <- munch1 isSymbolChar
186 if s `elem` reserved_ops then
187 return (Punc s) -- Reserved-ops count as punctuation
188 else
189 return (Symbol s)
190 where
191 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
192 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
193
194 -- ----------------------------------------------------------------------
195 -- identifiers
196
197 lexId :: ReadP Lexeme
198 lexId = do c <- satisfy isIdsChar
199 s <- munch isIdfChar
200 return (Ident (c:s))
201 where
202 -- Identifiers can start with a '_'
203 isIdsChar c = isAlpha c || c == '_'
204 isIdfChar c = isAlphaNum c || c `elem` "_'"
205
206 -- ---------------------------------------------------------------------------
207 -- Lexing character literals
208
209 lexLitChar :: ReadP Lexeme
210 lexLitChar =
211 do _ <- char '\''
212 (c,esc) <- lexCharE
213 guard (esc || c /= '\'') -- Eliminate '' possibility
214 _ <- char '\''
215 return (Char c)
216
217 lexChar :: ReadP Char
218 lexChar = do { (c,_) <- lexCharE; return c }
219
220 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
221 lexCharE =
222 do c1 <- get
223 if c1 == '\\'
224 then do c2 <- lexEsc; return (c2, True)
225 else do return (c1, False)
226 where
227 lexEsc =
228 lexEscChar
229 +++ lexNumeric
230 +++ lexCntrlChar
231 +++ lexAscii
232
233 lexEscChar =
234 do c <- get
235 case c of
236 'a' -> return '\a'
237 'b' -> return '\b'
238 'f' -> return '\f'
239 'n' -> return '\n'
240 'r' -> return '\r'
241 't' -> return '\t'
242 'v' -> return '\v'
243 '\\' -> return '\\'
244 '\"' -> return '\"'
245 '\'' -> return '\''
246 _ -> pfail
247
248 lexNumeric =
249 do base <- lexBaseChar <++ return 10
250 n <- lexInteger base
251 guard (n <= toInteger (ord maxBound))
252 return (chr (fromInteger n))
253
254 lexCntrlChar =
255 do _ <- char '^'
256 c <- get
257 case c of
258 '@' -> return '\^@'
259 'A' -> return '\^A'
260 'B' -> return '\^B'
261 'C' -> return '\^C'
262 'D' -> return '\^D'
263 'E' -> return '\^E'
264 'F' -> return '\^F'
265 'G' -> return '\^G'
266 'H' -> return '\^H'
267 'I' -> return '\^I'
268 'J' -> return '\^J'
269 'K' -> return '\^K'
270 'L' -> return '\^L'
271 'M' -> return '\^M'
272 'N' -> return '\^N'
273 'O' -> return '\^O'
274 'P' -> return '\^P'
275 'Q' -> return '\^Q'
276 'R' -> return '\^R'
277 'S' -> return '\^S'
278 'T' -> return '\^T'
279 'U' -> return '\^U'
280 'V' -> return '\^V'
281 'W' -> return '\^W'
282 'X' -> return '\^X'
283 'Y' -> return '\^Y'
284 'Z' -> return '\^Z'
285 '[' -> return '\^['
286 '\\' -> return '\^\'
287 ']' -> return '\^]'
288 '^' -> return '\^^'
289 '_' -> return '\^_'
290 _ -> pfail
291
292 lexAscii =
293 do choice
294 [ (string "SOH" >> return '\SOH') <++
295 (string "SO" >> return '\SO')
296 -- \SO and \SOH need maximal-munch treatment
297 -- See the Haskell report Sect 2.6
298
299 , string "NUL" >> return '\NUL'
300 , string "STX" >> return '\STX'
301 , string "ETX" >> return '\ETX'
302 , string "EOT" >> return '\EOT'
303 , string "ENQ" >> return '\ENQ'
304 , string "ACK" >> return '\ACK'
305 , string "BEL" >> return '\BEL'
306 , string "BS" >> return '\BS'
307 , string "HT" >> return '\HT'
308 , string "LF" >> return '\LF'
309 , string "VT" >> return '\VT'
310 , string "FF" >> return '\FF'
311 , string "CR" >> return '\CR'
312 , string "SI" >> return '\SI'
313 , string "DLE" >> return '\DLE'
314 , string "DC1" >> return '\DC1'
315 , string "DC2" >> return '\DC2'
316 , string "DC3" >> return '\DC3'
317 , string "DC4" >> return '\DC4'
318 , string "NAK" >> return '\NAK'
319 , string "SYN" >> return '\SYN'
320 , string "ETB" >> return '\ETB'
321 , string "CAN" >> return '\CAN'
322 , string "EM" >> return '\EM'
323 , string "SUB" >> return '\SUB'
324 , string "ESC" >> return '\ESC'
325 , string "FS" >> return '\FS'
326 , string "GS" >> return '\GS'
327 , string "RS" >> return '\RS'
328 , string "US" >> return '\US'
329 , string "SP" >> return '\SP'
330 , string "DEL" >> return '\DEL'
331 ]
332
333
334 -- ---------------------------------------------------------------------------
335 -- string literal
336
337 lexString :: ReadP Lexeme
338 lexString =
339 do _ <- char '"'
340 body id
341 where
342 body f =
343 do (c,esc) <- lexStrItem
344 if c /= '"' || esc
345 then body (f.(c:))
346 else let s = f "" in
347 return (String s)
348
349 lexStrItem = (lexEmpty >> lexStrItem)
350 +++ lexCharE
351
352 lexEmpty =
353 do _ <- char '\\'
354 c <- get
355 case c of
356 '&' -> do return ()
357 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
358 _ -> do pfail
359
360 -- ---------------------------------------------------------------------------
361 -- Lexing numbers
362
363 type Base = Int
364 type Digits = [Int]
365
366 lexNumber :: ReadP Lexeme
367 lexNumber
368 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
369 -- If that fails, try for a decimal number
370 lexDecNumber -- Start with ordinary digits
371
372 lexHexOct :: ReadP Lexeme
373 lexHexOct
374 = do _ <- char '0'
375 base <- lexBaseChar
376 digits <- lexDigits base
377 return (Number (MkNumber base digits))
378
379 lexBaseChar :: ReadP Int
380 -- Lex a single character indicating the base; fail if not there
381 lexBaseChar = do { c <- get;
382 case c of
383 'o' -> return 8
384 'O' -> return 8
385 'x' -> return 16
386 'X' -> return 16
387 _ -> pfail }
388
389 lexDecNumber :: ReadP Lexeme
390 lexDecNumber =
391 do xs <- lexDigits 10
392 mFrac <- lexFrac <++ return Nothing
393 mExp <- lexExp <++ return Nothing
394 return (Number (MkDecimal xs mFrac mExp))
395
396 lexFrac :: ReadP (Maybe Digits)
397 -- Read the fractional part; fail if it doesn't
398 -- start ".d" where d is a digit
399 lexFrac = do _ <- char '.'
400 fraction <- lexDigits 10
401 return (Just fraction)
402
403 lexExp :: ReadP (Maybe Integer)
404 lexExp = do _ <- char 'e' +++ char 'E'
405 exp <- signedExp +++ lexInteger 10
406 return (Just exp)
407 where
408 signedExp
409 = do c <- char '-' +++ char '+'
410 n <- lexInteger 10
411 return (if c == '-' then -n else n)
412
413 lexDigits :: Int -> ReadP Digits
414 -- Lex a non-empty sequence of digits in specified base
415 lexDigits base =
416 do s <- look
417 xs <- scan s id
418 guard (not (null xs))
419 return xs
420 where
421 scan (c:cs) f = case valDig base c of
422 Just n -> do _ <- get; scan cs (f.(n:))
423 Nothing -> do return (f [])
424 scan [] f = do return (f [])
425
426 lexInteger :: Base -> ReadP Integer
427 lexInteger base =
428 do xs <- lexDigits base
429 return (val (fromIntegral base) 0 xs)
430
431 val :: Num a => a -> a -> Digits -> a
432 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
433 val _ y [] = y
434 val base y (x:xs) = y' `seq` val base y' xs
435 where
436 y' = y * base + fromIntegral x
437
438 -- Calculate a Rational from the exponent [of 10 to multiply with],
439 -- the integral part of the mantissa and the digits of the fractional
440 -- part. Leaving the calculation of the power of 10 until the end,
441 -- when we know the effective exponent, saves multiplications.
442 -- More importantly, this way we need at most one gcd instead of three.
443 --
444 -- frac was never used with anything but Integer and base 10, so
445 -- those are hardcoded now (trivial to change if necessary).
446 fracExp :: Integer -> Integer -> Digits -> Rational
447 fracExp exp mant []
448 | exp < 0 = mant % (10 ^ (-exp))
449 | otherwise = fromInteger (mant * 10 ^ exp)
450 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
451 where
452 exp' = exp - 1
453 mant' = mant * 10 + fromIntegral d
454
455 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
456 valDig 8 c
457 | '0' <= c && c <= '7' = Just (ord c - ord '0')
458 | otherwise = Nothing
459
460 valDig 10 c = valDecDig c
461
462 valDig 16 c
463 | '0' <= c && c <= '9' = Just (ord c - ord '0')
464 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
465 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
466 | otherwise = Nothing
467
468 valDig _ _ = error "valDig: Bad base"
469
470 valDecDig :: Char -> Maybe Int
471 valDecDig c
472 | '0' <= c && c <= '9' = Just (ord c - ord '0')
473 | otherwise = Nothing
474
475 -- ----------------------------------------------------------------------
476 -- other numeric lexing functions
477
478 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
479 readIntP base isDigit valDigit =
480 do s <- munch1 isDigit
481 return (val base 0 (map valDigit s))
482
483 readIntP' :: (Eq a, Num a) => a -> ReadP a
484 readIntP' base = readIntP base isDigit valDigit
485 where
486 isDigit c = maybe False (const True) (valDig base c)
487 valDigit c = maybe 0 id (valDig base c)
488
489 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
490 readOctP = readIntP' 8
491 readDecP = readIntP' 10
492 readHexP = readIntP' 16
493