f2ef4cc92225bf7bce190c350b9c09f4dbd1c461
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..)
21
22 , numberToInteger, numberToFixed, numberToRational, numberToRangedRational
23
24 -- lexer
25 , lex, expect
26 , hsLex
27 , lexChar
28
29 , readIntP
30 , readOctP
31 , readDecP
32 , readHexP
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 import GHC.Base
39 import GHC.Char
40 import GHC.Num( Num(..), Integer )
41 import GHC.Show( Show(..) )
42 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
43 import GHC.Real( Rational, (%), fromIntegral,
44 toInteger, (^) )
45 import GHC.List
46 import GHC.Enum( minBound, maxBound )
47 import Data.Maybe
48 import Control.Monad
49
50 -- -----------------------------------------------------------------------------
51 -- Lexing types
52
53 -- ^ Haskell lexemes.
54 data Lexeme
55 = Char Char -- ^ Character literal
56 | String String -- ^ String literal, with escapes interpreted
57 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
58 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
59 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
60 | Number Number
61 | EOF
62 deriving (Eq, Show)
63
64 -- | /Since: 4.6.0.0/
65 data Number = MkNumber Int -- Base
66 Digits -- Integral part
67 | MkDecimal Digits -- Integral part
68 (Maybe Digits) -- Fractional part
69 (Maybe Integer) -- Exponent
70 deriving (Eq, Show)
71
72 numberToInteger :: Number -> Maybe Integer
73 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
74 numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart)
75 numberToInteger _ = Nothing
76
77 -- | /Since: 4.7.0.0/
78 numberToFixed :: Integer -> Number -> Maybe (Integer, Integer)
79 numberToFixed _ (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart, 0)
80 numberToFixed _ (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart, 0)
81 numberToFixed p (MkDecimal iPart (Just fPart) Nothing)
82 = let i = val 10 0 iPart
83 f = val 10 0 (integerTake p (fPart ++ repeat 0))
84 -- Sigh, we really want genericTake, but that's above us in
85 -- the hierarchy, so we define our own version here (actually
86 -- specialised to Integer)
87 integerTake :: Integer -> [a] -> [a]
88 integerTake n _ | n <= 0 = []
89 integerTake _ [] = []
90 integerTake n (x:xs) = x : integerTake (n-1) xs
91 in Just (i, f)
92 numberToFixed _ _ = Nothing
93
94 -- This takes a floatRange, and if the Rational would be outside of
95 -- the floatRange then it may return Nothing. Not that it will not
96 -- /necessarily/ return Nothing, but it is good enough to fix the
97 -- space problems in #5688
98 -- Ways this is conservative:
99 -- * the floatRange is in base 2, but we pretend it is in base 10
100 -- * we pad the floateRange a bit, just in case it is very small
101 -- and we would otherwise hit an edge case
102 -- * We only worry about numbers that have an exponent. If they don't
103 -- have an exponent then the Rational won't be much larger than the
104 -- Number, so there is no problem
105 numberToRangedRational :: (Int, Int) -> Number
106 -> Maybe Rational -- Nothing = Inf
107 numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
108 -- if exp is out of integer bounds,
109 -- then the number is definitely out of range
110 | exp > fromIntegral (maxBound :: Int) ||
111 exp < fromIntegral (minBound :: Int)
112 = Nothing
113 | otherwise
114 = let mFirstDigit = case dropWhile (0 ==) iPart of
115 iPart'@(_ : _) -> Just (length iPart')
116 [] -> case mFPart of
117 Nothing -> Nothing
118 Just fPart ->
119 case span (0 ==) fPart of
120 (_, []) -> Nothing
121 (zeroes, _) ->
122 Just (negate (length zeroes))
123 in case mFirstDigit of
124 Nothing -> Just 0
125 Just firstDigit ->
126 let firstDigit' = firstDigit + fromInteger exp
127 in if firstDigit' > (pos + 3)
128 then Nothing
129 else if firstDigit' < (neg - 3)
130 then Just 0
131 else Just (numberToRational n)
132 numberToRangedRational _ n = Just (numberToRational n)
133
134 -- | /Since: 4.6.0.0/
135 numberToRational :: Number -> Rational
136 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
137 numberToRational (MkDecimal iPart mFPart mExp)
138 = let i = val 10 0 iPart
139 in case (mFPart, mExp) of
140 (Nothing, Nothing) -> i % 1
141 (Nothing, Just exp)
142 | exp >= 0 -> (i * (10 ^ exp)) % 1
143 | otherwise -> i % (10 ^ (- exp))
144 (Just fPart, Nothing) -> fracExp 0 i fPart
145 (Just fPart, Just exp) -> fracExp exp i fPart
146 -- fracExp is a bit more efficient in calculating the Rational.
147 -- Instead of calculating the fractional part alone, then
148 -- adding the integral part and finally multiplying with
149 -- 10 ^ exp if an exponent was given, do it all at once.
150
151 -- -----------------------------------------------------------------------------
152 -- Lexing
153
154 lex :: ReadP Lexeme
155 lex = skipSpaces >> lexToken
156
157 -- | /Since: 4.7.0.0/
158 expect :: Lexeme -> ReadP ()
159 expect lexeme = do { skipSpaces
160 ; thing <- lexToken
161 ; if thing == lexeme then return () else pfail }
162
163 hsLex :: ReadP String
164 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
165 hsLex = do skipSpaces
166 (s,_) <- gather lexToken
167 return s
168
169 lexToken :: ReadP Lexeme
170 lexToken = lexEOF +++
171 lexLitChar +++
172 lexString +++
173 lexPunc +++
174 lexSymbol +++
175 lexId +++
176 lexNumber
177
178
179 -- ----------------------------------------------------------------------
180 -- End of file
181 lexEOF :: ReadP Lexeme
182 lexEOF = do s <- look
183 guard (null s)
184 return EOF
185
186 -- ---------------------------------------------------------------------------
187 -- Single character lexemes
188
189 lexPunc :: ReadP Lexeme
190 lexPunc =
191 do c <- satisfy isPuncChar
192 return (Punc [c])
193 where
194 isPuncChar c = c `elem` ",;()[]{}`"
195
196 -- ----------------------------------------------------------------------
197 -- Symbols
198
199 lexSymbol :: ReadP Lexeme
200 lexSymbol =
201 do s <- munch1 isSymbolChar
202 if s `elem` reserved_ops then
203 return (Punc s) -- Reserved-ops count as punctuation
204 else
205 return (Symbol s)
206 where
207 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
208 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
209
210 -- ----------------------------------------------------------------------
211 -- identifiers
212
213 lexId :: ReadP Lexeme
214 lexId = do c <- satisfy isIdsChar
215 s <- munch isIdfChar
216 return (Ident (c:s))
217 where
218 -- Identifiers can start with a '_'
219 isIdsChar c = isAlpha c || c == '_'
220 isIdfChar c = isAlphaNum c || c `elem` "_'"
221
222 -- ---------------------------------------------------------------------------
223 -- Lexing character literals
224
225 lexLitChar :: ReadP Lexeme
226 lexLitChar =
227 do _ <- char '\''
228 (c,esc) <- lexCharE
229 guard (esc || c /= '\'') -- Eliminate '' possibility
230 _ <- char '\''
231 return (Char c)
232
233 lexChar :: ReadP Char
234 lexChar = do { (c,_) <- lexCharE; return c }
235
236 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
237 lexCharE =
238 do c1 <- get
239 if c1 == '\\'
240 then do c2 <- lexEsc; return (c2, True)
241 else do return (c1, False)
242 where
243 lexEsc =
244 lexEscChar
245 +++ lexNumeric
246 +++ lexCntrlChar
247 +++ lexAscii
248
249 lexEscChar =
250 do c <- get
251 case c of
252 'a' -> return '\a'
253 'b' -> return '\b'
254 'f' -> return '\f'
255 'n' -> return '\n'
256 'r' -> return '\r'
257 't' -> return '\t'
258 'v' -> return '\v'
259 '\\' -> return '\\'
260 '\"' -> return '\"'
261 '\'' -> return '\''
262 _ -> pfail
263
264 lexNumeric =
265 do base <- lexBaseChar <++ return 10
266 n <- lexInteger base
267 guard (n <= toInteger (ord maxBound))
268 return (chr (fromInteger n))
269
270 lexCntrlChar =
271 do _ <- char '^'
272 c <- get
273 case c of
274 '@' -> return '\^@'
275 'A' -> return '\^A'
276 'B' -> return '\^B'
277 'C' -> return '\^C'
278 'D' -> return '\^D'
279 'E' -> return '\^E'
280 'F' -> return '\^F'
281 'G' -> return '\^G'
282 'H' -> return '\^H'
283 'I' -> return '\^I'
284 'J' -> return '\^J'
285 'K' -> return '\^K'
286 'L' -> return '\^L'
287 'M' -> return '\^M'
288 'N' -> return '\^N'
289 'O' -> return '\^O'
290 'P' -> return '\^P'
291 'Q' -> return '\^Q'
292 'R' -> return '\^R'
293 'S' -> return '\^S'
294 'T' -> return '\^T'
295 'U' -> return '\^U'
296 'V' -> return '\^V'
297 'W' -> return '\^W'
298 'X' -> return '\^X'
299 'Y' -> return '\^Y'
300 'Z' -> return '\^Z'
301 '[' -> return '\^['
302 '\\' -> return '\^\'
303 ']' -> return '\^]'
304 '^' -> return '\^^'
305 '_' -> return '\^_'
306 _ -> pfail
307
308 lexAscii =
309 do choice
310 [ (string "SOH" >> return '\SOH') <++
311 (string "SO" >> return '\SO')
312 -- \SO and \SOH need maximal-munch treatment
313 -- See the Haskell report Sect 2.6
314
315 , string "NUL" >> return '\NUL'
316 , string "STX" >> return '\STX'
317 , string "ETX" >> return '\ETX'
318 , string "EOT" >> return '\EOT'
319 , string "ENQ" >> return '\ENQ'
320 , string "ACK" >> return '\ACK'
321 , string "BEL" >> return '\BEL'
322 , string "BS" >> return '\BS'
323 , string "HT" >> return '\HT'
324 , string "LF" >> return '\LF'
325 , string "VT" >> return '\VT'
326 , string "FF" >> return '\FF'
327 , string "CR" >> return '\CR'
328 , string "SI" >> return '\SI'
329 , string "DLE" >> return '\DLE'
330 , string "DC1" >> return '\DC1'
331 , string "DC2" >> return '\DC2'
332 , string "DC3" >> return '\DC3'
333 , string "DC4" >> return '\DC4'
334 , string "NAK" >> return '\NAK'
335 , string "SYN" >> return '\SYN'
336 , string "ETB" >> return '\ETB'
337 , string "CAN" >> return '\CAN'
338 , string "EM" >> return '\EM'
339 , string "SUB" >> return '\SUB'
340 , string "ESC" >> return '\ESC'
341 , string "FS" >> return '\FS'
342 , string "GS" >> return '\GS'
343 , string "RS" >> return '\RS'
344 , string "US" >> return '\US'
345 , string "SP" >> return '\SP'
346 , string "DEL" >> return '\DEL'
347 ]
348
349
350 -- ---------------------------------------------------------------------------
351 -- string literal
352
353 lexString :: ReadP Lexeme
354 lexString =
355 do _ <- char '"'
356 body id
357 where
358 body f =
359 do (c,esc) <- lexStrItem
360 if c /= '"' || esc
361 then body (f.(c:))
362 else let s = f "" in
363 return (String s)
364
365 lexStrItem = (lexEmpty >> lexStrItem)
366 +++ lexCharE
367
368 lexEmpty =
369 do _ <- char '\\'
370 c <- get
371 case c of
372 '&' -> do return ()
373 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
374 _ -> do pfail
375
376 -- ---------------------------------------------------------------------------
377 -- Lexing numbers
378
379 type Base = Int
380 type Digits = [Int]
381
382 lexNumber :: ReadP Lexeme
383 lexNumber
384 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
385 -- If that fails, try for a decimal number
386 lexDecNumber -- Start with ordinary digits
387
388 lexHexOct :: ReadP Lexeme
389 lexHexOct
390 = do _ <- char '0'
391 base <- lexBaseChar
392 digits <- lexDigits base
393 return (Number (MkNumber base digits))
394
395 lexBaseChar :: ReadP Int
396 -- Lex a single character indicating the base; fail if not there
397 lexBaseChar = do { c <- get;
398 case c of
399 'o' -> return 8
400 'O' -> return 8
401 'x' -> return 16
402 'X' -> return 16
403 _ -> pfail }
404
405 lexDecNumber :: ReadP Lexeme
406 lexDecNumber =
407 do xs <- lexDigits 10
408 mFrac <- lexFrac <++ return Nothing
409 mExp <- lexExp <++ return Nothing
410 return (Number (MkDecimal xs mFrac mExp))
411
412 lexFrac :: ReadP (Maybe Digits)
413 -- Read the fractional part; fail if it doesn't
414 -- start ".d" where d is a digit
415 lexFrac = do _ <- char '.'
416 fraction <- lexDigits 10
417 return (Just fraction)
418
419 lexExp :: ReadP (Maybe Integer)
420 lexExp = do _ <- char 'e' +++ char 'E'
421 exp <- signedExp +++ lexInteger 10
422 return (Just exp)
423 where
424 signedExp
425 = do c <- char '-' +++ char '+'
426 n <- lexInteger 10
427 return (if c == '-' then -n else n)
428
429 lexDigits :: Int -> ReadP Digits
430 -- Lex a non-empty sequence of digits in specified base
431 lexDigits base =
432 do s <- look
433 xs <- scan s id
434 guard (not (null xs))
435 return xs
436 where
437 scan (c:cs) f = case valDig base c of
438 Just n -> do _ <- get; scan cs (f.(n:))
439 Nothing -> do return (f [])
440 scan [] f = do return (f [])
441
442 lexInteger :: Base -> ReadP Integer
443 lexInteger base =
444 do xs <- lexDigits base
445 return (val (fromIntegral base) 0 xs)
446
447 val :: Num a => a -> a -> Digits -> a
448 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
449 val _ y [] = y
450 val base y (x:xs) = y' `seq` val base y' xs
451 where
452 y' = y * base + fromIntegral x
453
454 -- Calculate a Rational from the exponent [of 10 to multiply with],
455 -- the integral part of the mantissa and the digits of the fractional
456 -- part. Leaving the calculation of the power of 10 until the end,
457 -- when we know the effective exponent, saves multiplications.
458 -- More importantly, this way we need at most one gcd instead of three.
459 --
460 -- frac was never used with anything but Integer and base 10, so
461 -- those are hardcoded now (trivial to change if necessary).
462 fracExp :: Integer -> Integer -> Digits -> Rational
463 fracExp exp mant []
464 | exp < 0 = mant % (10 ^ (-exp))
465 | otherwise = fromInteger (mant * 10 ^ exp)
466 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
467 where
468 exp' = exp - 1
469 mant' = mant * 10 + fromIntegral d
470
471 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
472 valDig 8 c
473 | '0' <= c && c <= '7' = Just (ord c - ord '0')
474 | otherwise = Nothing
475
476 valDig 10 c = valDecDig c
477
478 valDig 16 c
479 | '0' <= c && c <= '9' = Just (ord c - ord '0')
480 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
481 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
482 | otherwise = Nothing
483
484 valDig _ _ = error "valDig: Bad base"
485
486 valDecDig :: Char -> Maybe Int
487 valDecDig c
488 | '0' <= c && c <= '9' = Just (ord c - ord '0')
489 | otherwise = Nothing
490
491 -- ----------------------------------------------------------------------
492 -- other numeric lexing functions
493
494 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
495 readIntP base isDigit valDigit =
496 do s <- munch1 isDigit
497 return (val base 0 (map valDigit s))
498
499 readIntP' :: (Eq a, Num a) => a -> ReadP a
500 readIntP' base = readIntP base isDigit valDigit
501 where
502 isDigit c = maybe False (const True) (valDig base c)
503 valDigit c = maybe 0 id (valDig base c)
504
505 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
506 readOctP = readIntP' 8
507 readDecP = readIntP' 10
508 readHexP = readIntP' 16
509