Refactor number lexing; part of #5688
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..) -- :: *; Show, Eq
21
22 , numberToInteger, numberToRational
23
24 -- lexer
25 , lex -- :: ReadP Lexeme Skips leading spaces
26 , hsLex -- :: ReadP String
27 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
28
29 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
30 , readOctP -- :: Num a => ReadP a
31 , readDecP -- :: Num a => ReadP a
32 , readHexP -- :: Num a => ReadP a
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 #ifdef __GLASGOW_HASKELL__
39 import GHC.Base
40 import GHC.Char
41 import GHC.Num( Num(..), Integer )
42 import GHC.Show( Show(..) )
43 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
44 import GHC.Real( Integral, Rational, (%), fromIntegral,
45 toInteger, (^), infinity, notANumber )
46 import GHC.List
47 import GHC.Enum( maxBound )
48 #else
49 import Prelude hiding ( lex )
50 import Data.Char( chr, ord, isSpace, isAlpha, isAlphaNum )
51 import Data.Ratio( Ratio, (%) )
52 #endif
53 #ifdef __HUGS__
54 import Hugs.Prelude( Ratio(..) )
55 #endif
56 import Data.Maybe
57 import Control.Monad
58
59 -- -----------------------------------------------------------------------------
60 -- Lexing types
61
62 -- ^ Haskell lexemes.
63 data Lexeme
64 = Char Char -- ^ Character literal
65 | String String -- ^ String literal, with escapes interpreted
66 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
67 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
68 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
69 | Number Number
70 | EOF
71 deriving (Eq, Show)
72
73 data Number = MkNumber Int -- Base
74 Digits -- Integral part
75 | MkDecimal Digits -- Integral part
76 (Maybe Digits) -- Fractional part
77 (Maybe Integer) -- Exponent
78 | NotANumber
79 | Infinity
80 deriving (Eq, Show)
81
82 numberToInteger :: Number -> Maybe Integer
83 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
84 numberToInteger (MkDecimal iPart Nothing mExp)
85 = let i = val 10 0 iPart
86 in case mExp of
87 Nothing -> Just i
88 Just exp | exp >= 0 -> Just (i * (10 ^ exp))
89 _ -> Nothing
90 numberToInteger _ = Nothing
91
92 numberToRational :: Number -> Rational
93 numberToRational NotANumber = notANumber
94 numberToRational Infinity = infinity
95 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
96 numberToRational (MkDecimal iPart mFPart mExp)
97 = let i = val 10 0 iPart
98 in case (mFPart, mExp) of
99 (Nothing, Nothing) -> i % 1
100 (Nothing, Just exp)
101 | exp >= 0 -> (i * (10 ^ exp)) % 1
102 | otherwise -> i % (10 ^ (- exp))
103 (Just fPart, Nothing) -> fracExp 0 i fPart
104 (Just fPart, Just exp) -> fracExp exp i fPart
105 -- fracExp is a bit more efficient in calculating the Rational.
106 -- Instead of calculating the fractional part alone, then
107 -- adding the integral part and finally multiplying with
108 -- 10 ^ exp if an exponent was given, do it all at once.
109
110 -- -----------------------------------------------------------------------------
111 -- Lexing
112
113 lex :: ReadP Lexeme
114 lex = skipSpaces >> lexToken
115
116 hsLex :: ReadP String
117 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
118 hsLex = do skipSpaces
119 (s,_) <- gather lexToken
120 return s
121
122 lexToken :: ReadP Lexeme
123 lexToken = lexEOF +++
124 lexLitChar +++
125 lexString +++
126 lexPunc +++
127 lexSymbol +++
128 lexId +++
129 lexNumber
130
131
132 -- ----------------------------------------------------------------------
133 -- End of file
134 lexEOF :: ReadP Lexeme
135 lexEOF = do s <- look
136 guard (null s)
137 return EOF
138
139 -- ---------------------------------------------------------------------------
140 -- Single character lexemes
141
142 lexPunc :: ReadP Lexeme
143 lexPunc =
144 do c <- satisfy isPuncChar
145 return (Punc [c])
146 where
147 isPuncChar c = c `elem` ",;()[]{}`"
148
149 -- ----------------------------------------------------------------------
150 -- Symbols
151
152 lexSymbol :: ReadP Lexeme
153 lexSymbol =
154 do s <- munch1 isSymbolChar
155 if s `elem` reserved_ops then
156 return (Punc s) -- Reserved-ops count as punctuation
157 else
158 return (Symbol s)
159 where
160 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
161 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
162
163 -- ----------------------------------------------------------------------
164 -- identifiers
165
166 lexId :: ReadP Lexeme
167 lexId = lex_nan <++ lex_id
168 where
169 -- NaN and Infinity look like identifiers, so
170 -- we parse them first.
171 lex_nan = (string "NaN" >> return (Number NotANumber)) +++
172 (string "Infinity" >> return (Number Infinity))
173
174 lex_id = do c <- satisfy isIdsChar
175 s <- munch isIdfChar
176 return (Ident (c:s))
177
178 -- Identifiers can start with a '_'
179 isIdsChar c = isAlpha c || c == '_'
180 isIdfChar c = isAlphaNum c || c `elem` "_'"
181
182 #ifndef __GLASGOW_HASKELL__
183 infinity, notANumber :: Rational
184 infinity = 1 :% 0
185 notANumber = 0 :% 0
186 #endif
187
188 -- ---------------------------------------------------------------------------
189 -- Lexing character literals
190
191 lexLitChar :: ReadP Lexeme
192 lexLitChar =
193 do _ <- char '\''
194 (c,esc) <- lexCharE
195 guard (esc || c /= '\'') -- Eliminate '' possibility
196 _ <- char '\''
197 return (Char c)
198
199 lexChar :: ReadP Char
200 lexChar = do { (c,_) <- lexCharE; return c }
201
202 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
203 lexCharE =
204 do c1 <- get
205 if c1 == '\\'
206 then do c2 <- lexEsc; return (c2, True)
207 else do return (c1, False)
208 where
209 lexEsc =
210 lexEscChar
211 +++ lexNumeric
212 +++ lexCntrlChar
213 +++ lexAscii
214
215 lexEscChar =
216 do c <- get
217 case c of
218 'a' -> return '\a'
219 'b' -> return '\b'
220 'f' -> return '\f'
221 'n' -> return '\n'
222 'r' -> return '\r'
223 't' -> return '\t'
224 'v' -> return '\v'
225 '\\' -> return '\\'
226 '\"' -> return '\"'
227 '\'' -> return '\''
228 _ -> pfail
229
230 lexNumeric =
231 do base <- lexBaseChar <++ return 10
232 n <- lexInteger base
233 guard (n <= toInteger (ord maxBound))
234 return (chr (fromInteger n))
235
236 lexCntrlChar =
237 do _ <- char '^'
238 c <- get
239 case c of
240 '@' -> return '\^@'
241 'A' -> return '\^A'
242 'B' -> return '\^B'
243 'C' -> return '\^C'
244 'D' -> return '\^D'
245 'E' -> return '\^E'
246 'F' -> return '\^F'
247 'G' -> return '\^G'
248 'H' -> return '\^H'
249 'I' -> return '\^I'
250 'J' -> return '\^J'
251 'K' -> return '\^K'
252 'L' -> return '\^L'
253 'M' -> return '\^M'
254 'N' -> return '\^N'
255 'O' -> return '\^O'
256 'P' -> return '\^P'
257 'Q' -> return '\^Q'
258 'R' -> return '\^R'
259 'S' -> return '\^S'
260 'T' -> return '\^T'
261 'U' -> return '\^U'
262 'V' -> return '\^V'
263 'W' -> return '\^W'
264 'X' -> return '\^X'
265 'Y' -> return '\^Y'
266 'Z' -> return '\^Z'
267 '[' -> return '\^['
268 '\\' -> return '\^\'
269 ']' -> return '\^]'
270 '^' -> return '\^^'
271 '_' -> return '\^_'
272 _ -> pfail
273
274 lexAscii =
275 do choice
276 [ (string "SOH" >> return '\SOH') <++
277 (string "SO" >> return '\SO')
278 -- \SO and \SOH need maximal-munch treatment
279 -- See the Haskell report Sect 2.6
280
281 , string "NUL" >> return '\NUL'
282 , string "STX" >> return '\STX'
283 , string "ETX" >> return '\ETX'
284 , string "EOT" >> return '\EOT'
285 , string "ENQ" >> return '\ENQ'
286 , string "ACK" >> return '\ACK'
287 , string "BEL" >> return '\BEL'
288 , string "BS" >> return '\BS'
289 , string "HT" >> return '\HT'
290 , string "LF" >> return '\LF'
291 , string "VT" >> return '\VT'
292 , string "FF" >> return '\FF'
293 , string "CR" >> return '\CR'
294 , string "SI" >> return '\SI'
295 , string "DLE" >> return '\DLE'
296 , string "DC1" >> return '\DC1'
297 , string "DC2" >> return '\DC2'
298 , string "DC3" >> return '\DC3'
299 , string "DC4" >> return '\DC4'
300 , string "NAK" >> return '\NAK'
301 , string "SYN" >> return '\SYN'
302 , string "ETB" >> return '\ETB'
303 , string "CAN" >> return '\CAN'
304 , string "EM" >> return '\EM'
305 , string "SUB" >> return '\SUB'
306 , string "ESC" >> return '\ESC'
307 , string "FS" >> return '\FS'
308 , string "GS" >> return '\GS'
309 , string "RS" >> return '\RS'
310 , string "US" >> return '\US'
311 , string "SP" >> return '\SP'
312 , string "DEL" >> return '\DEL'
313 ]
314
315
316 -- ---------------------------------------------------------------------------
317 -- string literal
318
319 lexString :: ReadP Lexeme
320 lexString =
321 do _ <- char '"'
322 body id
323 where
324 body f =
325 do (c,esc) <- lexStrItem
326 if c /= '"' || esc
327 then body (f.(c:))
328 else let s = f "" in
329 return (String s)
330
331 lexStrItem = (lexEmpty >> lexStrItem)
332 +++ lexCharE
333
334 lexEmpty =
335 do _ <- char '\\'
336 c <- get
337 case c of
338 '&' -> do return ()
339 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
340 _ -> do pfail
341
342 -- ---------------------------------------------------------------------------
343 -- Lexing numbers
344
345 type Base = Int
346 type Digits = [Int]
347
348 lexNumber :: ReadP Lexeme
349 lexNumber
350 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
351 -- If that fails, try for a decimal number
352 lexDecNumber -- Start with ordinary digits
353
354 lexHexOct :: ReadP Lexeme
355 lexHexOct
356 = do _ <- char '0'
357 base <- lexBaseChar
358 digits <- lexDigits base
359 return (Number (MkNumber base digits))
360
361 lexBaseChar :: ReadP Int
362 -- Lex a single character indicating the base; fail if not there
363 lexBaseChar = do { c <- get;
364 case c of
365 'o' -> return 8
366 'O' -> return 8
367 'x' -> return 16
368 'X' -> return 16
369 _ -> pfail }
370
371 lexDecNumber :: ReadP Lexeme
372 lexDecNumber =
373 do xs <- lexDigits 10
374 mFrac <- lexFrac <++ return Nothing
375 mExp <- lexExp <++ return Nothing
376 return (Number (MkDecimal xs mFrac mExp))
377
378 lexFrac :: ReadP (Maybe Digits)
379 -- Read the fractional part; fail if it doesn't
380 -- start ".d" where d is a digit
381 lexFrac = do _ <- char '.'
382 fraction <- lexDigits 10
383 return (Just fraction)
384
385 lexExp :: ReadP (Maybe Integer)
386 lexExp = do _ <- char 'e' +++ char 'E'
387 exp <- signedExp +++ lexInteger 10
388 return (Just exp)
389 where
390 signedExp
391 = do c <- char '-' +++ char '+'
392 n <- lexInteger 10
393 return (if c == '-' then -n else n)
394
395 lexDigits :: Int -> ReadP Digits
396 -- Lex a non-empty sequence of digits in specified base
397 lexDigits base =
398 do s <- look
399 xs <- scan s id
400 guard (not (null xs))
401 return xs
402 where
403 scan (c:cs) f = case valDig base c of
404 Just n -> do _ <- get; scan cs (f.(n:))
405 Nothing -> do return (f [])
406 scan [] f = do return (f [])
407
408 lexInteger :: Base -> ReadP Integer
409 lexInteger base =
410 do xs <- lexDigits base
411 return (val (fromIntegral base) 0 xs)
412
413 val :: Num a => a -> a -> Digits -> a
414 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
415 val _ y [] = y
416 val base y (x:xs) = y' `seq` val base y' xs
417 where
418 y' = y * base + fromIntegral x
419
420 -- Calculate a Rational from the exponent [of 10 to multiply with],
421 -- the integral part of the mantissa and the digits of the fractional
422 -- part. Leaving the calculation of the power of 10 until the end,
423 -- when we know the effective exponent, saves multiplications.
424 -- More importantly, this way we need at most one gcd instead of three.
425 --
426 -- frac was never used with anything but Integer and base 10, so
427 -- those are hardcoded now (trivial to change if necessary).
428 fracExp :: Integer -> Integer -> Digits -> Rational
429 fracExp exp mant []
430 | exp < 0 = mant % (10 ^ (-exp))
431 | otherwise = fromInteger (mant * 10 ^ exp)
432 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
433 where
434 exp' = exp - 1
435 mant' = mant * 10 + fromIntegral d
436
437 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
438 valDig 8 c
439 | '0' <= c && c <= '7' = Just (ord c - ord '0')
440 | otherwise = Nothing
441
442 valDig 10 c = valDecDig c
443
444 valDig 16 c
445 | '0' <= c && c <= '9' = Just (ord c - ord '0')
446 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
447 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
448 | otherwise = Nothing
449
450 valDig _ _ = error "valDig: Bad base"
451
452 valDecDig :: Char -> Maybe Int
453 valDecDig c
454 | '0' <= c && c <= '9' = Just (ord c - ord '0')
455 | otherwise = Nothing
456
457 -- ----------------------------------------------------------------------
458 -- other numeric lexing functions
459
460 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
461 readIntP base isDigit valDigit =
462 do s <- munch1 isDigit
463 return (val base 0 (map valDigit s))
464
465 readIntP' :: (Eq a, Num a) => a -> ReadP a
466 readIntP' base = readIntP base isDigit valDigit
467 where
468 isDigit c = maybe False (const True) (valDig base c)
469 valDigit c = maybe 0 id (valDig base c)
470
471 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
472 readOctP = readIntP' 8
473 readDecP = readIntP' 10
474 readHexP = readIntP' 16
475