1a163eec6c9039bd2e31ef5f2d7305174cf88261
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..) -- :: *; Show, Eq
21
22 , numberToInteger, numberToRational, numberToRangedRational
23
24 -- lexer
25 , lex -- :: ReadP Lexeme Skips leading spaces
26 , hsLex -- :: ReadP String
27 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
28
29 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
30 , readOctP -- :: Num a => ReadP a
31 , readDecP -- :: Num a => ReadP a
32 , readHexP -- :: Num a => ReadP a
33 )
34 where
35
36 import Text.ParserCombinators.ReadP
37
38 #ifdef __GLASGOW_HASKELL__
39 import GHC.Base
40 import GHC.Char
41 import GHC.Num( Num(..), Integer )
42 import GHC.Show( Show(..) )
43 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
44 import GHC.Real( Integral, Rational, (%), fromIntegral,
45 toInteger, (^) )
46 import GHC.List
47 import GHC.Enum( maxBound )
48 #else
49 import Prelude hiding ( lex )
50 import Data.Char( chr, ord, isSpace, isAlpha, isAlphaNum )
51 import Data.Ratio( Ratio, (%) )
52 #endif
53 #ifdef __HUGS__
54 import Hugs.Prelude( Ratio(..) )
55 #endif
56 import Data.Maybe
57 import Control.Monad
58
59 -- -----------------------------------------------------------------------------
60 -- Lexing types
61
62 -- ^ Haskell lexemes.
63 data Lexeme
64 = Char Char -- ^ Character literal
65 | String String -- ^ String literal, with escapes interpreted
66 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
67 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
68 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
69 | Number Number
70 | EOF
71 deriving (Eq, Show)
72
73 data Number = MkNumber Int -- Base
74 Digits -- Integral part
75 | MkDecimal Digits -- Integral part
76 (Maybe Digits) -- Fractional part
77 (Maybe Integer) -- Exponent
78 deriving (Eq, Show)
79
80 numberToInteger :: Number -> Maybe Integer
81 numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) 0 iPart)
82 numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 0 iPart)
83 numberToInteger _ = Nothing
84
85 -- This takes a floatRange, and if the Rational would be outside of
86 -- the floatRange then it may return Nothing. Not that it will not
87 -- /necessarily/ return Nothing, but it is good enough to fix the
88 -- space problems in #5688
89 -- Ways this is conservative:
90 -- * the floatRange is in base 2, but we pretend it is in base 10
91 -- * we pad the floateRange a bit, just in case it is very small
92 -- and we would otherwise hit an edge case
93 -- * We only worry about numbers that have an exponent. If they don't
94 -- have an exponent then the Rational won't be much larger than the
95 -- Number, so there is no problem
96 numberToRangedRational :: (Int, Int) -> Number
97 -> Maybe Rational -- Nothing = Inf
98 numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
99 = let mFirstDigit = case dropWhile (0 ==) iPart of
100 iPart'@(_ : _) -> Just (length iPart')
101 [] -> case mFPart of
102 Nothing -> Nothing
103 Just fPart ->
104 case span (0 ==) fPart of
105 (_, []) -> Nothing
106 (zeroes, _) ->
107 Just (negate (length zeroes))
108 in case mFirstDigit of
109 Nothing -> Just 0
110 Just firstDigit ->
111 let firstDigit' = firstDigit + fromInteger exp
112 in if firstDigit' > (pos + 3)
113 then Nothing
114 else if firstDigit' < (neg - 3)
115 then Just 0
116 else Just (numberToRational n)
117 numberToRangedRational _ n = Just (numberToRational n)
118
119 numberToRational :: Number -> Rational
120 numberToRational (MkNumber base iPart) = val (fromIntegral base) 0 iPart % 1
121 numberToRational (MkDecimal iPart mFPart mExp)
122 = let i = val 10 0 iPart
123 in case (mFPart, mExp) of
124 (Nothing, Nothing) -> i % 1
125 (Nothing, Just exp)
126 | exp >= 0 -> (i * (10 ^ exp)) % 1
127 | otherwise -> i % (10 ^ (- exp))
128 (Just fPart, Nothing) -> fracExp 0 i fPart
129 (Just fPart, Just exp) -> fracExp exp i fPart
130 -- fracExp is a bit more efficient in calculating the Rational.
131 -- Instead of calculating the fractional part alone, then
132 -- adding the integral part and finally multiplying with
133 -- 10 ^ exp if an exponent was given, do it all at once.
134
135 -- -----------------------------------------------------------------------------
136 -- Lexing
137
138 lex :: ReadP Lexeme
139 lex = skipSpaces >> lexToken
140
141 hsLex :: ReadP String
142 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
143 hsLex = do skipSpaces
144 (s,_) <- gather lexToken
145 return s
146
147 lexToken :: ReadP Lexeme
148 lexToken = lexEOF +++
149 lexLitChar +++
150 lexString +++
151 lexPunc +++
152 lexSymbol +++
153 lexId +++
154 lexNumber
155
156
157 -- ----------------------------------------------------------------------
158 -- End of file
159 lexEOF :: ReadP Lexeme
160 lexEOF = do s <- look
161 guard (null s)
162 return EOF
163
164 -- ---------------------------------------------------------------------------
165 -- Single character lexemes
166
167 lexPunc :: ReadP Lexeme
168 lexPunc =
169 do c <- satisfy isPuncChar
170 return (Punc [c])
171 where
172 isPuncChar c = c `elem` ",;()[]{}`"
173
174 -- ----------------------------------------------------------------------
175 -- Symbols
176
177 lexSymbol :: ReadP Lexeme
178 lexSymbol =
179 do s <- munch1 isSymbolChar
180 if s `elem` reserved_ops then
181 return (Punc s) -- Reserved-ops count as punctuation
182 else
183 return (Symbol s)
184 where
185 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
186 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
187
188 -- ----------------------------------------------------------------------
189 -- identifiers
190
191 lexId :: ReadP Lexeme
192 lexId = do c <- satisfy isIdsChar
193 s <- munch isIdfChar
194 return (Ident (c:s))
195 where
196 -- Identifiers can start with a '_'
197 isIdsChar c = isAlpha c || c == '_'
198 isIdfChar c = isAlphaNum c || c `elem` "_'"
199
200 -- ---------------------------------------------------------------------------
201 -- Lexing character literals
202
203 lexLitChar :: ReadP Lexeme
204 lexLitChar =
205 do _ <- char '\''
206 (c,esc) <- lexCharE
207 guard (esc || c /= '\'') -- Eliminate '' possibility
208 _ <- char '\''
209 return (Char c)
210
211 lexChar :: ReadP Char
212 lexChar = do { (c,_) <- lexCharE; return c }
213
214 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
215 lexCharE =
216 do c1 <- get
217 if c1 == '\\'
218 then do c2 <- lexEsc; return (c2, True)
219 else do return (c1, False)
220 where
221 lexEsc =
222 lexEscChar
223 +++ lexNumeric
224 +++ lexCntrlChar
225 +++ lexAscii
226
227 lexEscChar =
228 do c <- get
229 case c of
230 'a' -> return '\a'
231 'b' -> return '\b'
232 'f' -> return '\f'
233 'n' -> return '\n'
234 'r' -> return '\r'
235 't' -> return '\t'
236 'v' -> return '\v'
237 '\\' -> return '\\'
238 '\"' -> return '\"'
239 '\'' -> return '\''
240 _ -> pfail
241
242 lexNumeric =
243 do base <- lexBaseChar <++ return 10
244 n <- lexInteger base
245 guard (n <= toInteger (ord maxBound))
246 return (chr (fromInteger n))
247
248 lexCntrlChar =
249 do _ <- char '^'
250 c <- get
251 case c of
252 '@' -> return '\^@'
253 'A' -> return '\^A'
254 'B' -> return '\^B'
255 'C' -> return '\^C'
256 'D' -> return '\^D'
257 'E' -> return '\^E'
258 'F' -> return '\^F'
259 'G' -> return '\^G'
260 'H' -> return '\^H'
261 'I' -> return '\^I'
262 'J' -> return '\^J'
263 'K' -> return '\^K'
264 'L' -> return '\^L'
265 'M' -> return '\^M'
266 'N' -> return '\^N'
267 'O' -> return '\^O'
268 'P' -> return '\^P'
269 'Q' -> return '\^Q'
270 'R' -> return '\^R'
271 'S' -> return '\^S'
272 'T' -> return '\^T'
273 'U' -> return '\^U'
274 'V' -> return '\^V'
275 'W' -> return '\^W'
276 'X' -> return '\^X'
277 'Y' -> return '\^Y'
278 'Z' -> return '\^Z'
279 '[' -> return '\^['
280 '\\' -> return '\^\'
281 ']' -> return '\^]'
282 '^' -> return '\^^'
283 '_' -> return '\^_'
284 _ -> pfail
285
286 lexAscii =
287 do choice
288 [ (string "SOH" >> return '\SOH') <++
289 (string "SO" >> return '\SO')
290 -- \SO and \SOH need maximal-munch treatment
291 -- See the Haskell report Sect 2.6
292
293 , string "NUL" >> return '\NUL'
294 , string "STX" >> return '\STX'
295 , string "ETX" >> return '\ETX'
296 , string "EOT" >> return '\EOT'
297 , string "ENQ" >> return '\ENQ'
298 , string "ACK" >> return '\ACK'
299 , string "BEL" >> return '\BEL'
300 , string "BS" >> return '\BS'
301 , string "HT" >> return '\HT'
302 , string "LF" >> return '\LF'
303 , string "VT" >> return '\VT'
304 , string "FF" >> return '\FF'
305 , string "CR" >> return '\CR'
306 , string "SI" >> return '\SI'
307 , string "DLE" >> return '\DLE'
308 , string "DC1" >> return '\DC1'
309 , string "DC2" >> return '\DC2'
310 , string "DC3" >> return '\DC3'
311 , string "DC4" >> return '\DC4'
312 , string "NAK" >> return '\NAK'
313 , string "SYN" >> return '\SYN'
314 , string "ETB" >> return '\ETB'
315 , string "CAN" >> return '\CAN'
316 , string "EM" >> return '\EM'
317 , string "SUB" >> return '\SUB'
318 , string "ESC" >> return '\ESC'
319 , string "FS" >> return '\FS'
320 , string "GS" >> return '\GS'
321 , string "RS" >> return '\RS'
322 , string "US" >> return '\US'
323 , string "SP" >> return '\SP'
324 , string "DEL" >> return '\DEL'
325 ]
326
327
328 -- ---------------------------------------------------------------------------
329 -- string literal
330
331 lexString :: ReadP Lexeme
332 lexString =
333 do _ <- char '"'
334 body id
335 where
336 body f =
337 do (c,esc) <- lexStrItem
338 if c /= '"' || esc
339 then body (f.(c:))
340 else let s = f "" in
341 return (String s)
342
343 lexStrItem = (lexEmpty >> lexStrItem)
344 +++ lexCharE
345
346 lexEmpty =
347 do _ <- char '\\'
348 c <- get
349 case c of
350 '&' -> do return ()
351 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
352 _ -> do pfail
353
354 -- ---------------------------------------------------------------------------
355 -- Lexing numbers
356
357 type Base = Int
358 type Digits = [Int]
359
360 lexNumber :: ReadP Lexeme
361 lexNumber
362 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
363 -- If that fails, try for a decimal number
364 lexDecNumber -- Start with ordinary digits
365
366 lexHexOct :: ReadP Lexeme
367 lexHexOct
368 = do _ <- char '0'
369 base <- lexBaseChar
370 digits <- lexDigits base
371 return (Number (MkNumber base digits))
372
373 lexBaseChar :: ReadP Int
374 -- Lex a single character indicating the base; fail if not there
375 lexBaseChar = do { c <- get;
376 case c of
377 'o' -> return 8
378 'O' -> return 8
379 'x' -> return 16
380 'X' -> return 16
381 _ -> pfail }
382
383 lexDecNumber :: ReadP Lexeme
384 lexDecNumber =
385 do xs <- lexDigits 10
386 mFrac <- lexFrac <++ return Nothing
387 mExp <- lexExp <++ return Nothing
388 return (Number (MkDecimal xs mFrac mExp))
389
390 lexFrac :: ReadP (Maybe Digits)
391 -- Read the fractional part; fail if it doesn't
392 -- start ".d" where d is a digit
393 lexFrac = do _ <- char '.'
394 fraction <- lexDigits 10
395 return (Just fraction)
396
397 lexExp :: ReadP (Maybe Integer)
398 lexExp = do _ <- char 'e' +++ char 'E'
399 exp <- signedExp +++ lexInteger 10
400 return (Just exp)
401 where
402 signedExp
403 = do c <- char '-' +++ char '+'
404 n <- lexInteger 10
405 return (if c == '-' then -n else n)
406
407 lexDigits :: Int -> ReadP Digits
408 -- Lex a non-empty sequence of digits in specified base
409 lexDigits base =
410 do s <- look
411 xs <- scan s id
412 guard (not (null xs))
413 return xs
414 where
415 scan (c:cs) f = case valDig base c of
416 Just n -> do _ <- get; scan cs (f.(n:))
417 Nothing -> do return (f [])
418 scan [] f = do return (f [])
419
420 lexInteger :: Base -> ReadP Integer
421 lexInteger base =
422 do xs <- lexDigits base
423 return (val (fromIntegral base) 0 xs)
424
425 val :: Num a => a -> a -> Digits -> a
426 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
427 val _ y [] = y
428 val base y (x:xs) = y' `seq` val base y' xs
429 where
430 y' = y * base + fromIntegral x
431
432 -- Calculate a Rational from the exponent [of 10 to multiply with],
433 -- the integral part of the mantissa and the digits of the fractional
434 -- part. Leaving the calculation of the power of 10 until the end,
435 -- when we know the effective exponent, saves multiplications.
436 -- More importantly, this way we need at most one gcd instead of three.
437 --
438 -- frac was never used with anything but Integer and base 10, so
439 -- those are hardcoded now (trivial to change if necessary).
440 fracExp :: Integer -> Integer -> Digits -> Rational
441 fracExp exp mant []
442 | exp < 0 = mant % (10 ^ (-exp))
443 | otherwise = fromInteger (mant * 10 ^ exp)
444 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
445 where
446 exp' = exp - 1
447 mant' = mant * 10 + fromIntegral d
448
449 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
450 valDig 8 c
451 | '0' <= c && c <= '7' = Just (ord c - ord '0')
452 | otherwise = Nothing
453
454 valDig 10 c = valDecDig c
455
456 valDig 16 c
457 | '0' <= c && c <= '9' = Just (ord c - ord '0')
458 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
459 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
460 | otherwise = Nothing
461
462 valDig _ _ = error "valDig: Bad base"
463
464 valDecDig :: Char -> Maybe Int
465 valDecDig c
466 | '0' <= c && c <= '9' = Just (ord c - ord '0')
467 | otherwise = Nothing
468
469 -- ----------------------------------------------------------------------
470 -- other numeric lexing functions
471
472 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
473 readIntP base isDigit valDigit =
474 do s <- munch1 isDigit
475 return (val base 0 (map valDigit s))
476
477 readIntP' :: (Eq a, Num a) => a -> ReadP a
478 readIntP' base = readIntP base isDigit valDigit
479 where
480 isDigit c = maybe False (const True) (valDig base c)
481 valDigit c = maybe 0 id (valDig base c)
482
483 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
484 readOctP = readIntP' 8
485 readDecP = readIntP' 10
486 readHexP = readIntP' 16
487