Add Haddock `/Since: 4.7.0.0/` comments to new symbols
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..)
21
22 , numberToInteger, numberToFixed, numberToRational, numberToRangedRational
23
24 -- lexer
25 , lex, expect
26 , hsLex
27 , lexChar
28
29 , readIntP
30 , readOctP
31 , readDecP
32 , readHexP
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 import GHC.Base
39 import GHC.Char
40 import GHC.Num( Num(..), Integer )
41 import GHC.Show( Show(..) )
42 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
43 import GHC.Real( Rational, (%), fromIntegral,
44 toInteger, (^) )
45 import GHC.List
46 import GHC.Enum( minBound, maxBound )
47 import Data.Maybe
48 import Control.Monad
49
50 -- -----------------------------------------------------------------------------
51 -- Lexing types
52
53 -- ^ Haskell lexemes.
54 data Lexeme
55 = Char Char -- ^ Character literal
56 | String String -- ^ String literal, with escapes interpreted
57 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
58 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
59 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
60 | Number Number
61 | EOF
62 deriving (Eq, Show)
63
64 data Number = MkNumber Int -- Base
65 Digits -- Integral part
66 | MkDecimal Digits -- Integral part
67 (Maybe Digits) -- Fractional part
68 (Maybe Integer) -- Exponent
69 deriving (Eq, Show)
70
71 numberToInteger :: Number -> Maybe Integer
72 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
73 numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart)
74 numberToInteger _ = Nothing
75
76 -- | /Since: 4.7.0.0/
77 numberToFixed :: Integer -> Number -> Maybe (Integer, Integer)
78 numberToFixed _ (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart, 0)
79 numberToFixed _ (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart, 0)
80 numberToFixed p (MkDecimal iPart (Just fPart) Nothing)
81 = let i = val 10 0 iPart
82 f = val 10 0 (integerTake p (fPart ++ repeat 0))
83 -- Sigh, we really want genericTake, but that's above us in
84 -- the hierarchy, so we define our own version here (actually
85 -- specialised to Integer)
86 integerTake :: Integer -> [a] -> [a]
87 integerTake n _ | n <= 0 = []
88 integerTake _ [] = []
89 integerTake n (x:xs) = x : integerTake (n-1) xs
90 in Just (i, f)
91 numberToFixed _ _ = Nothing
92
93 -- This takes a floatRange, and if the Rational would be outside of
94 -- the floatRange then it may return Nothing. Not that it will not
95 -- /necessarily/ return Nothing, but it is good enough to fix the
96 -- space problems in #5688
97 -- Ways this is conservative:
98 -- * the floatRange is in base 2, but we pretend it is in base 10
99 -- * we pad the floateRange a bit, just in case it is very small
100 -- and we would otherwise hit an edge case
101 -- * We only worry about numbers that have an exponent. If they don't
102 -- have an exponent then the Rational won't be much larger than the
103 -- Number, so there is no problem
104 numberToRangedRational :: (Int, Int) -> Number
105 -> Maybe Rational -- Nothing = Inf
106 numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
107 -- if exp is out of integer bounds,
108 -- then the number is definitely out of range
109 | exp > fromIntegral (maxBound :: Int) ||
110 exp < fromIntegral (minBound :: Int)
111 = Nothing
112 | otherwise
113 = let mFirstDigit = case dropWhile (0 ==) iPart of
114 iPart'@(_ : _) -> Just (length iPart')
115 [] -> case mFPart of
116 Nothing -> Nothing
117 Just fPart ->
118 case span (0 ==) fPart of
119 (_, []) -> Nothing
120 (zeroes, _) ->
121 Just (negate (length zeroes))
122 in case mFirstDigit of
123 Nothing -> Just 0
124 Just firstDigit ->
125 let firstDigit' = firstDigit + fromInteger exp
126 in if firstDigit' > (pos + 3)
127 then Nothing
128 else if firstDigit' < (neg - 3)
129 then Just 0
130 else Just (numberToRational n)
131 numberToRangedRational _ n = Just (numberToRational n)
132
133 numberToRational :: Number -> Rational
134 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
135 numberToRational (MkDecimal iPart mFPart mExp)
136 = let i = val 10 0 iPart
137 in case (mFPart, mExp) of
138 (Nothing, Nothing) -> i % 1
139 (Nothing, Just exp)
140 | exp >= 0 -> (i * (10 ^ exp)) % 1
141 | otherwise -> i % (10 ^ (- exp))
142 (Just fPart, Nothing) -> fracExp 0 i fPart
143 (Just fPart, Just exp) -> fracExp exp i fPart
144 -- fracExp is a bit more efficient in calculating the Rational.
145 -- Instead of calculating the fractional part alone, then
146 -- adding the integral part and finally multiplying with
147 -- 10 ^ exp if an exponent was given, do it all at once.
148
149 -- -----------------------------------------------------------------------------
150 -- Lexing
151
152 lex :: ReadP Lexeme
153 lex = skipSpaces >> lexToken
154
155 -- | /Since: 4.7.0.0/
156 expect :: Lexeme -> ReadP ()
157 expect lexeme = do { skipSpaces
158 ; thing <- lexToken
159 ; if thing == lexeme then return () else pfail }
160
161 hsLex :: ReadP String
162 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
163 hsLex = do skipSpaces
164 (s,_) <- gather lexToken
165 return s
166
167 lexToken :: ReadP Lexeme
168 lexToken = lexEOF +++
169 lexLitChar +++
170 lexString +++
171 lexPunc +++
172 lexSymbol +++
173 lexId +++
174 lexNumber
175
176
177 -- ----------------------------------------------------------------------
178 -- End of file
179 lexEOF :: ReadP Lexeme
180 lexEOF = do s <- look
181 guard (null s)
182 return EOF
183
184 -- ---------------------------------------------------------------------------
185 -- Single character lexemes
186
187 lexPunc :: ReadP Lexeme
188 lexPunc =
189 do c <- satisfy isPuncChar
190 return (Punc [c])
191 where
192 isPuncChar c = c `elem` ",;()[]{}`"
193
194 -- ----------------------------------------------------------------------
195 -- Symbols
196
197 lexSymbol :: ReadP Lexeme
198 lexSymbol =
199 do s <- munch1 isSymbolChar
200 if s `elem` reserved_ops then
201 return (Punc s) -- Reserved-ops count as punctuation
202 else
203 return (Symbol s)
204 where
205 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
206 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
207
208 -- ----------------------------------------------------------------------
209 -- identifiers
210
211 lexId :: ReadP Lexeme
212 lexId = do c <- satisfy isIdsChar
213 s <- munch isIdfChar
214 return (Ident (c:s))
215 where
216 -- Identifiers can start with a '_'
217 isIdsChar c = isAlpha c || c == '_'
218 isIdfChar c = isAlphaNum c || c `elem` "_'"
219
220 -- ---------------------------------------------------------------------------
221 -- Lexing character literals
222
223 lexLitChar :: ReadP Lexeme
224 lexLitChar =
225 do _ <- char '\''
226 (c,esc) <- lexCharE
227 guard (esc || c /= '\'') -- Eliminate '' possibility
228 _ <- char '\''
229 return (Char c)
230
231 lexChar :: ReadP Char
232 lexChar = do { (c,_) <- lexCharE; return c }
233
234 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
235 lexCharE =
236 do c1 <- get
237 if c1 == '\\'
238 then do c2 <- lexEsc; return (c2, True)
239 else do return (c1, False)
240 where
241 lexEsc =
242 lexEscChar
243 +++ lexNumeric
244 +++ lexCntrlChar
245 +++ lexAscii
246
247 lexEscChar =
248 do c <- get
249 case c of
250 'a' -> return '\a'
251 'b' -> return '\b'
252 'f' -> return '\f'
253 'n' -> return '\n'
254 'r' -> return '\r'
255 't' -> return '\t'
256 'v' -> return '\v'
257 '\\' -> return '\\'
258 '\"' -> return '\"'
259 '\'' -> return '\''
260 _ -> pfail
261
262 lexNumeric =
263 do base <- lexBaseChar <++ return 10
264 n <- lexInteger base
265 guard (n <= toInteger (ord maxBound))
266 return (chr (fromInteger n))
267
268 lexCntrlChar =
269 do _ <- char '^'
270 c <- get
271 case c of
272 '@' -> return '\^@'
273 'A' -> return '\^A'
274 'B' -> return '\^B'
275 'C' -> return '\^C'
276 'D' -> return '\^D'
277 'E' -> return '\^E'
278 'F' -> return '\^F'
279 'G' -> return '\^G'
280 'H' -> return '\^H'
281 'I' -> return '\^I'
282 'J' -> return '\^J'
283 'K' -> return '\^K'
284 'L' -> return '\^L'
285 'M' -> return '\^M'
286 'N' -> return '\^N'
287 'O' -> return '\^O'
288 'P' -> return '\^P'
289 'Q' -> return '\^Q'
290 'R' -> return '\^R'
291 'S' -> return '\^S'
292 'T' -> return '\^T'
293 'U' -> return '\^U'
294 'V' -> return '\^V'
295 'W' -> return '\^W'
296 'X' -> return '\^X'
297 'Y' -> return '\^Y'
298 'Z' -> return '\^Z'
299 '[' -> return '\^['
300 '\\' -> return '\^\'
301 ']' -> return '\^]'
302 '^' -> return '\^^'
303 '_' -> return '\^_'
304 _ -> pfail
305
306 lexAscii =
307 do choice
308 [ (string "SOH" >> return '\SOH') <++
309 (string "SO" >> return '\SO')
310 -- \SO and \SOH need maximal-munch treatment
311 -- See the Haskell report Sect 2.6
312
313 , string "NUL" >> return '\NUL'
314 , string "STX" >> return '\STX'
315 , string "ETX" >> return '\ETX'
316 , string "EOT" >> return '\EOT'
317 , string "ENQ" >> return '\ENQ'
318 , string "ACK" >> return '\ACK'
319 , string "BEL" >> return '\BEL'
320 , string "BS" >> return '\BS'
321 , string "HT" >> return '\HT'
322 , string "LF" >> return '\LF'
323 , string "VT" >> return '\VT'
324 , string "FF" >> return '\FF'
325 , string "CR" >> return '\CR'
326 , string "SI" >> return '\SI'
327 , string "DLE" >> return '\DLE'
328 , string "DC1" >> return '\DC1'
329 , string "DC2" >> return '\DC2'
330 , string "DC3" >> return '\DC3'
331 , string "DC4" >> return '\DC4'
332 , string "NAK" >> return '\NAK'
333 , string "SYN" >> return '\SYN'
334 , string "ETB" >> return '\ETB'
335 , string "CAN" >> return '\CAN'
336 , string "EM" >> return '\EM'
337 , string "SUB" >> return '\SUB'
338 , string "ESC" >> return '\ESC'
339 , string "FS" >> return '\FS'
340 , string "GS" >> return '\GS'
341 , string "RS" >> return '\RS'
342 , string "US" >> return '\US'
343 , string "SP" >> return '\SP'
344 , string "DEL" >> return '\DEL'
345 ]
346
347
348 -- ---------------------------------------------------------------------------
349 -- string literal
350
351 lexString :: ReadP Lexeme
352 lexString =
353 do _ <- char '"'
354 body id
355 where
356 body f =
357 do (c,esc) <- lexStrItem
358 if c /= '"' || esc
359 then body (f.(c:))
360 else let s = f "" in
361 return (String s)
362
363 lexStrItem = (lexEmpty >> lexStrItem)
364 +++ lexCharE
365
366 lexEmpty =
367 do _ <- char '\\'
368 c <- get
369 case c of
370 '&' -> do return ()
371 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
372 _ -> do pfail
373
374 -- ---------------------------------------------------------------------------
375 -- Lexing numbers
376
377 type Base = Int
378 type Digits = [Int]
379
380 lexNumber :: ReadP Lexeme
381 lexNumber
382 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
383 -- If that fails, try for a decimal number
384 lexDecNumber -- Start with ordinary digits
385
386 lexHexOct :: ReadP Lexeme
387 lexHexOct
388 = do _ <- char '0'
389 base <- lexBaseChar
390 digits <- lexDigits base
391 return (Number (MkNumber base digits))
392
393 lexBaseChar :: ReadP Int
394 -- Lex a single character indicating the base; fail if not there
395 lexBaseChar = do { c <- get;
396 case c of
397 'o' -> return 8
398 'O' -> return 8
399 'x' -> return 16
400 'X' -> return 16
401 _ -> pfail }
402
403 lexDecNumber :: ReadP Lexeme
404 lexDecNumber =
405 do xs <- lexDigits 10
406 mFrac <- lexFrac <++ return Nothing
407 mExp <- lexExp <++ return Nothing
408 return (Number (MkDecimal xs mFrac mExp))
409
410 lexFrac :: ReadP (Maybe Digits)
411 -- Read the fractional part; fail if it doesn't
412 -- start ".d" where d is a digit
413 lexFrac = do _ <- char '.'
414 fraction <- lexDigits 10
415 return (Just fraction)
416
417 lexExp :: ReadP (Maybe Integer)
418 lexExp = do _ <- char 'e' +++ char 'E'
419 exp <- signedExp +++ lexInteger 10
420 return (Just exp)
421 where
422 signedExp
423 = do c <- char '-' +++ char '+'
424 n <- lexInteger 10
425 return (if c == '-' then -n else n)
426
427 lexDigits :: Int -> ReadP Digits
428 -- Lex a non-empty sequence of digits in specified base
429 lexDigits base =
430 do s <- look
431 xs <- scan s id
432 guard (not (null xs))
433 return xs
434 where
435 scan (c:cs) f = case valDig base c of
436 Just n -> do _ <- get; scan cs (f.(n:))
437 Nothing -> do return (f [])
438 scan [] f = do return (f [])
439
440 lexInteger :: Base -> ReadP Integer
441 lexInteger base =
442 do xs <- lexDigits base
443 return (val (fromIntegral base) 0 xs)
444
445 val :: Num a => a -> a -> Digits -> a
446 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
447 val _ y [] = y
448 val base y (x:xs) = y' `seq` val base y' xs
449 where
450 y' = y * base + fromIntegral x
451
452 -- Calculate a Rational from the exponent [of 10 to multiply with],
453 -- the integral part of the mantissa and the digits of the fractional
454 -- part. Leaving the calculation of the power of 10 until the end,
455 -- when we know the effective exponent, saves multiplications.
456 -- More importantly, this way we need at most one gcd instead of three.
457 --
458 -- frac was never used with anything but Integer and base 10, so
459 -- those are hardcoded now (trivial to change if necessary).
460 fracExp :: Integer -> Integer -> Digits -> Rational
461 fracExp exp mant []
462 | exp < 0 = mant % (10 ^ (-exp))
463 | otherwise = fromInteger (mant * 10 ^ exp)
464 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
465 where
466 exp' = exp - 1
467 mant' = mant * 10 + fromIntegral d
468
469 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
470 valDig 8 c
471 | '0' <= c && c <= '7' = Just (ord c - ord '0')
472 | otherwise = Nothing
473
474 valDig 10 c = valDecDig c
475
476 valDig 16 c
477 | '0' <= c && c <= '9' = Just (ord c - ord '0')
478 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
479 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
480 | otherwise = Nothing
481
482 valDig _ _ = error "valDig: Bad base"
483
484 valDecDig :: Char -> Maybe Int
485 valDecDig c
486 | '0' <= c && c <= '9' = Just (ord c - ord '0')
487 | otherwise = Nothing
488
489 -- ----------------------------------------------------------------------
490 -- other numeric lexing functions
491
492 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
493 readIntP base isDigit valDigit =
494 do s <- munch1 isDigit
495 return (val base 0 (map valDigit s))
496
497 readIntP' :: (Eq a, Num a) => a -> ReadP a
498 readIntP' base = readIntP base isDigit valDigit
499 where
500 isDigit c = maybe False (const True) (valDig base c)
501 valDigit c = maybe 0 id (valDig base c)
502
503 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
504 readOctP = readIntP' 8
505 readDecP = readIntP' 10
506 readHexP = readIntP' 16
507