637299ac9827d276d983a09240803d3756ee4f89
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..)
21
22 , numberToInteger, numberToFixed, numberToRational, numberToRangedRational
23
24 -- lexer
25 , lex, expect
26 , hsLex
27 , lexChar
28
29 , readIntP
30 , readOctP
31 , readDecP
32 , readHexP
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 import GHC.Base
39 import GHC.Char
40 import GHC.Num( Num(..), Integer )
41 import GHC.Show( Show(..) )
42 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
43 import GHC.Real( Rational, (%), fromIntegral,
44 toInteger, (^) )
45 import GHC.List
46 import GHC.Enum( minBound, maxBound )
47 import Data.Maybe
48 import Control.Monad
49
50 -- -----------------------------------------------------------------------------
51 -- Lexing types
52
53 -- ^ Haskell lexemes.
54 data Lexeme
55 = Char Char -- ^ Character literal
56 | String String -- ^ String literal, with escapes interpreted
57 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
58 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
59 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
60 | Number Number
61 | EOF
62 deriving (Eq, Show)
63
64 data Number = MkNumber Int -- Base
65 Digits -- Integral part
66 | MkDecimal Digits -- Integral part
67 (Maybe Digits) -- Fractional part
68 (Maybe Integer) -- Exponent
69 deriving (Eq, Show)
70
71 numberToInteger :: Number -> Maybe Integer
72 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
73 numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart)
74 numberToInteger _ = Nothing
75
76 numberToFixed :: Integer -> Number -> Maybe (Integer, Integer)
77 numberToFixed _ (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart, 0)
78 numberToFixed _ (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart, 0)
79 numberToFixed p (MkDecimal iPart (Just fPart) Nothing)
80 = let i = val 10 0 iPart
81 f = val 10 0 (integerTake p (fPart ++ repeat 0))
82 -- Sigh, we really want genericTake, but that's above us in
83 -- the hierarchy, so we define our own version here (actually
84 -- specialised to Integer)
85 integerTake :: Integer -> [a] -> [a]
86 integerTake n _ | n <= 0 = []
87 integerTake _ [] = []
88 integerTake n (x:xs) = x : integerTake (n-1) xs
89 in Just (i, f)
90 numberToFixed _ _ = Nothing
91
92 -- This takes a floatRange, and if the Rational would be outside of
93 -- the floatRange then it may return Nothing. Not that it will not
94 -- /necessarily/ return Nothing, but it is good enough to fix the
95 -- space problems in #5688
96 -- Ways this is conservative:
97 -- * the floatRange is in base 2, but we pretend it is in base 10
98 -- * we pad the floateRange a bit, just in case it is very small
99 -- and we would otherwise hit an edge case
100 -- * We only worry about numbers that have an exponent. If they don't
101 -- have an exponent then the Rational won't be much larger than the
102 -- Number, so there is no problem
103 numberToRangedRational :: (Int, Int) -> Number
104 -> Maybe Rational -- Nothing = Inf
105 numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
106 -- if exp is out of integer bounds,
107 -- then the number is definitely out of range
108 | exp > fromIntegral (maxBound :: Int) ||
109 exp < fromIntegral (minBound :: Int)
110 = Nothing
111 | otherwise
112 = let mFirstDigit = case dropWhile (0 ==) iPart of
113 iPart'@(_ : _) -> Just (length iPart')
114 [] -> case mFPart of
115 Nothing -> Nothing
116 Just fPart ->
117 case span (0 ==) fPart of
118 (_, []) -> Nothing
119 (zeroes, _) ->
120 Just (negate (length zeroes))
121 in case mFirstDigit of
122 Nothing -> Just 0
123 Just firstDigit ->
124 let firstDigit' = firstDigit + fromInteger exp
125 in if firstDigit' > (pos + 3)
126 then Nothing
127 else if firstDigit' < (neg - 3)
128 then Just 0
129 else Just (numberToRational n)
130 numberToRangedRational _ n = Just (numberToRational n)
131
132 numberToRational :: Number -> Rational
133 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
134 numberToRational (MkDecimal iPart mFPart mExp)
135 = let i = val 10 0 iPart
136 in case (mFPart, mExp) of
137 (Nothing, Nothing) -> i % 1
138 (Nothing, Just exp)
139 | exp >= 0 -> (i * (10 ^ exp)) % 1
140 | otherwise -> i % (10 ^ (- exp))
141 (Just fPart, Nothing) -> fracExp 0 i fPart
142 (Just fPart, Just exp) -> fracExp exp i fPart
143 -- fracExp is a bit more efficient in calculating the Rational.
144 -- Instead of calculating the fractional part alone, then
145 -- adding the integral part and finally multiplying with
146 -- 10 ^ exp if an exponent was given, do it all at once.
147
148 -- -----------------------------------------------------------------------------
149 -- Lexing
150
151 lex :: ReadP Lexeme
152 lex = skipSpaces >> lexToken
153
154 expect :: Lexeme -> ReadP ()
155 expect lexeme = do { skipSpaces
156 ; thing <- lexToken
157 ; if thing == lexeme then return () else pfail }
158
159 hsLex :: ReadP String
160 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
161 hsLex = do skipSpaces
162 (s,_) <- gather lexToken
163 return s
164
165 lexToken :: ReadP Lexeme
166 lexToken = lexEOF +++
167 lexLitChar +++
168 lexString +++
169 lexPunc +++
170 lexSymbol +++
171 lexId +++
172 lexNumber
173
174
175 -- ----------------------------------------------------------------------
176 -- End of file
177 lexEOF :: ReadP Lexeme
178 lexEOF = do s <- look
179 guard (null s)
180 return EOF
181
182 -- ---------------------------------------------------------------------------
183 -- Single character lexemes
184
185 lexPunc :: ReadP Lexeme
186 lexPunc =
187 do c <- satisfy isPuncChar
188 return (Punc [c])
189 where
190 isPuncChar c = c `elem` ",;()[]{}`"
191
192 -- ----------------------------------------------------------------------
193 -- Symbols
194
195 lexSymbol :: ReadP Lexeme
196 lexSymbol =
197 do s <- munch1 isSymbolChar
198 if s `elem` reserved_ops then
199 return (Punc s) -- Reserved-ops count as punctuation
200 else
201 return (Symbol s)
202 where
203 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
204 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
205
206 -- ----------------------------------------------------------------------
207 -- identifiers
208
209 lexId :: ReadP Lexeme
210 lexId = do c <- satisfy isIdsChar
211 s <- munch isIdfChar
212 return (Ident (c:s))
213 where
214 -- Identifiers can start with a '_'
215 isIdsChar c = isAlpha c || c == '_'
216 isIdfChar c = isAlphaNum c || c `elem` "_'"
217
218 -- ---------------------------------------------------------------------------
219 -- Lexing character literals
220
221 lexLitChar :: ReadP Lexeme
222 lexLitChar =
223 do _ <- char '\''
224 (c,esc) <- lexCharE
225 guard (esc || c /= '\'') -- Eliminate '' possibility
226 _ <- char '\''
227 return (Char c)
228
229 lexChar :: ReadP Char
230 lexChar = do { (c,_) <- lexCharE; return c }
231
232 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
233 lexCharE =
234 do c1 <- get
235 if c1 == '\\'
236 then do c2 <- lexEsc; return (c2, True)
237 else do return (c1, False)
238 where
239 lexEsc =
240 lexEscChar
241 +++ lexNumeric
242 +++ lexCntrlChar
243 +++ lexAscii
244
245 lexEscChar =
246 do c <- get
247 case c of
248 'a' -> return '\a'
249 'b' -> return '\b'
250 'f' -> return '\f'
251 'n' -> return '\n'
252 'r' -> return '\r'
253 't' -> return '\t'
254 'v' -> return '\v'
255 '\\' -> return '\\'
256 '\"' -> return '\"'
257 '\'' -> return '\''
258 _ -> pfail
259
260 lexNumeric =
261 do base <- lexBaseChar <++ return 10
262 n <- lexInteger base
263 guard (n <= toInteger (ord maxBound))
264 return (chr (fromInteger n))
265
266 lexCntrlChar =
267 do _ <- char '^'
268 c <- get
269 case c of
270 '@' -> return '\^@'
271 'A' -> return '\^A'
272 'B' -> return '\^B'
273 'C' -> return '\^C'
274 'D' -> return '\^D'
275 'E' -> return '\^E'
276 'F' -> return '\^F'
277 'G' -> return '\^G'
278 'H' -> return '\^H'
279 'I' -> return '\^I'
280 'J' -> return '\^J'
281 'K' -> return '\^K'
282 'L' -> return '\^L'
283 'M' -> return '\^M'
284 'N' -> return '\^N'
285 'O' -> return '\^O'
286 'P' -> return '\^P'
287 'Q' -> return '\^Q'
288 'R' -> return '\^R'
289 'S' -> return '\^S'
290 'T' -> return '\^T'
291 'U' -> return '\^U'
292 'V' -> return '\^V'
293 'W' -> return '\^W'
294 'X' -> return '\^X'
295 'Y' -> return '\^Y'
296 'Z' -> return '\^Z'
297 '[' -> return '\^['
298 '\\' -> return '\^\'
299 ']' -> return '\^]'
300 '^' -> return '\^^'
301 '_' -> return '\^_'
302 _ -> pfail
303
304 lexAscii =
305 do choice
306 [ (string "SOH" >> return '\SOH') <++
307 (string "SO" >> return '\SO')
308 -- \SO and \SOH need maximal-munch treatment
309 -- See the Haskell report Sect 2.6
310
311 , string "NUL" >> return '\NUL'
312 , string "STX" >> return '\STX'
313 , string "ETX" >> return '\ETX'
314 , string "EOT" >> return '\EOT'
315 , string "ENQ" >> return '\ENQ'
316 , string "ACK" >> return '\ACK'
317 , string "BEL" >> return '\BEL'
318 , string "BS" >> return '\BS'
319 , string "HT" >> return '\HT'
320 , string "LF" >> return '\LF'
321 , string "VT" >> return '\VT'
322 , string "FF" >> return '\FF'
323 , string "CR" >> return '\CR'
324 , string "SI" >> return '\SI'
325 , string "DLE" >> return '\DLE'
326 , string "DC1" >> return '\DC1'
327 , string "DC2" >> return '\DC2'
328 , string "DC3" >> return '\DC3'
329 , string "DC4" >> return '\DC4'
330 , string "NAK" >> return '\NAK'
331 , string "SYN" >> return '\SYN'
332 , string "ETB" >> return '\ETB'
333 , string "CAN" >> return '\CAN'
334 , string "EM" >> return '\EM'
335 , string "SUB" >> return '\SUB'
336 , string "ESC" >> return '\ESC'
337 , string "FS" >> return '\FS'
338 , string "GS" >> return '\GS'
339 , string "RS" >> return '\RS'
340 , string "US" >> return '\US'
341 , string "SP" >> return '\SP'
342 , string "DEL" >> return '\DEL'
343 ]
344
345
346 -- ---------------------------------------------------------------------------
347 -- string literal
348
349 lexString :: ReadP Lexeme
350 lexString =
351 do _ <- char '"'
352 body id
353 where
354 body f =
355 do (c,esc) <- lexStrItem
356 if c /= '"' || esc
357 then body (f.(c:))
358 else let s = f "" in
359 return (String s)
360
361 lexStrItem = (lexEmpty >> lexStrItem)
362 +++ lexCharE
363
364 lexEmpty =
365 do _ <- char '\\'
366 c <- get
367 case c of
368 '&' -> do return ()
369 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
370 _ -> do pfail
371
372 -- ---------------------------------------------------------------------------
373 -- Lexing numbers
374
375 type Base = Int
376 type Digits = [Int]
377
378 lexNumber :: ReadP Lexeme
379 lexNumber
380 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
381 -- If that fails, try for a decimal number
382 lexDecNumber -- Start with ordinary digits
383
384 lexHexOct :: ReadP Lexeme
385 lexHexOct
386 = do _ <- char '0'
387 base <- lexBaseChar
388 digits <- lexDigits base
389 return (Number (MkNumber base digits))
390
391 lexBaseChar :: ReadP Int
392 -- Lex a single character indicating the base; fail if not there
393 lexBaseChar = do { c <- get;
394 case c of
395 'o' -> return 8
396 'O' -> return 8
397 'x' -> return 16
398 'X' -> return 16
399 _ -> pfail }
400
401 lexDecNumber :: ReadP Lexeme
402 lexDecNumber =
403 do xs <- lexDigits 10
404 mFrac <- lexFrac <++ return Nothing
405 mExp <- lexExp <++ return Nothing
406 return (Number (MkDecimal xs mFrac mExp))
407
408 lexFrac :: ReadP (Maybe Digits)
409 -- Read the fractional part; fail if it doesn't
410 -- start ".d" where d is a digit
411 lexFrac = do _ <- char '.'
412 fraction <- lexDigits 10
413 return (Just fraction)
414
415 lexExp :: ReadP (Maybe Integer)
416 lexExp = do _ <- char 'e' +++ char 'E'
417 exp <- signedExp +++ lexInteger 10
418 return (Just exp)
419 where
420 signedExp
421 = do c <- char '-' +++ char '+'
422 n <- lexInteger 10
423 return (if c == '-' then -n else n)
424
425 lexDigits :: Int -> ReadP Digits
426 -- Lex a non-empty sequence of digits in specified base
427 lexDigits base =
428 do s <- look
429 xs <- scan s id
430 guard (not (null xs))
431 return xs
432 where
433 scan (c:cs) f = case valDig base c of
434 Just n -> do _ <- get; scan cs (f.(n:))
435 Nothing -> do return (f [])
436 scan [] f = do return (f [])
437
438 lexInteger :: Base -> ReadP Integer
439 lexInteger base =
440 do xs <- lexDigits base
441 return (val (fromIntegral base) 0 xs)
442
443 val :: Num a => a -> a -> Digits -> a
444 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
445 val _ y [] = y
446 val base y (x:xs) = y' `seq` val base y' xs
447 where
448 y' = y * base + fromIntegral x
449
450 -- Calculate a Rational from the exponent [of 10 to multiply with],
451 -- the integral part of the mantissa and the digits of the fractional
452 -- part. Leaving the calculation of the power of 10 until the end,
453 -- when we know the effective exponent, saves multiplications.
454 -- More importantly, this way we need at most one gcd instead of three.
455 --
456 -- frac was never used with anything but Integer and base 10, so
457 -- those are hardcoded now (trivial to change if necessary).
458 fracExp :: Integer -> Integer -> Digits -> Rational
459 fracExp exp mant []
460 | exp < 0 = mant % (10 ^ (-exp))
461 | otherwise = fromInteger (mant * 10 ^ exp)
462 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
463 where
464 exp' = exp - 1
465 mant' = mant * 10 + fromIntegral d
466
467 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
468 valDig 8 c
469 | '0' <= c && c <= '7' = Just (ord c - ord '0')
470 | otherwise = Nothing
471
472 valDig 10 c = valDecDig c
473
474 valDig 16 c
475 | '0' <= c && c <= '9' = Just (ord c - ord '0')
476 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
477 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
478 | otherwise = Nothing
479
480 valDig _ _ = error "valDig: Bad base"
481
482 valDecDig :: Char -> Maybe Int
483 valDecDig c
484 | '0' <= c && c <= '9' = Just (ord c - ord '0')
485 | otherwise = Nothing
486
487 -- ----------------------------------------------------------------------
488 -- other numeric lexing functions
489
490 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
491 readIntP base isDigit valDigit =
492 do s <- munch1 isDigit
493 return (val base 0 (map valDigit s))
494
495 readIntP' :: (Eq a, Num a) => a -> ReadP a
496 readIntP' base = readIntP base isDigit valDigit
497 where
498 isDigit c = maybe False (const True) (valDig base c)
499 valDigit c = maybe 0 id (valDig base c)
500
501 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
502 readOctP = readIntP' 8
503 readDecP = readIntP' 10
504 readHexP = readIntP' 16
505