Merge branch 'master' of mac:ghc/git/val32/libraries/base
[packages/base.git] / Text / Read / Lex.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Text.Read.Lex
7 -- Copyright : (c) The University of Glasgow 2002
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : provisional
12 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
13 --
14 -- The cut-down Haskell lexer, used by Text.Read
15 --
16 -----------------------------------------------------------------------------
17
18 module Text.Read.Lex
19 -- lexing types
20 ( Lexeme(..) -- :: *; Show, Eq
21
22 -- lexer
23 , lex -- :: ReadP Lexeme Skips leading spaces
24 , hsLex -- :: ReadP String
25 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
26
27 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
28 , readOctP -- :: Num a => ReadP a
29 , readDecP -- :: Num a => ReadP a
30 , readHexP -- :: Num a => ReadP a
31 )
32 where
33
34 import Text.ParserCombinators.ReadP
35
36 #ifdef __GLASGOW_HASKELL__
37 import GHC.Base
38 import GHC.Num( Num(..), Integer )
39 import GHC.Show( Show(..) )
40 #ifndef __HADDOCK__
41 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
42 #endif
43 import GHC.Real( Integral, Rational, (%), fromIntegral,
44 toInteger, (^), infinity, notANumber )
45 import GHC.List
46 import GHC.Enum( maxBound )
47 #else
48 import Prelude hiding ( lex )
49 import Data.Char( chr, ord, isSpace, isAlpha, isAlphaNum )
50 import Data.Ratio( Ratio, (%) )
51 #endif
52 #ifdef __HUGS__
53 import Hugs.Prelude( Ratio(..) )
54 #endif
55 import Data.Maybe
56 import Control.Monad
57
58 -- -----------------------------------------------------------------------------
59 -- Lexing types
60
61 -- ^ Haskell lexemes.
62 data Lexeme
63 = Char Char -- ^ Character literal
64 | String String -- ^ String literal, with escapes interpreted
65 | Punc String -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
66 | Ident String -- ^ Haskell identifier, e.g. @foo@, @Baz@
67 | Symbol String -- ^ Haskell symbol, e.g. @>>@, @:%@
68 | Int Integer -- ^ Integer literal
69 | Rat Rational -- ^ Floating point literal
70 | EOF
71 deriving (Eq, Show)
72
73 -- -----------------------------------------------------------------------------
74 -- Lexing
75
76 lex :: ReadP Lexeme
77 lex = skipSpaces >> lexToken
78
79 hsLex :: ReadP String
80 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
81 hsLex = do skipSpaces
82 (s,_) <- gather lexToken
83 return s
84
85 lexToken :: ReadP Lexeme
86 lexToken = lexEOF +++
87 lexLitChar +++
88 lexString +++
89 lexPunc +++
90 lexSymbol +++
91 lexId +++
92 lexNumber
93
94
95 -- ----------------------------------------------------------------------
96 -- End of file
97 lexEOF :: ReadP Lexeme
98 lexEOF = do s <- look
99 guard (null s)
100 return EOF
101
102 -- ---------------------------------------------------------------------------
103 -- Single character lexemes
104
105 lexPunc :: ReadP Lexeme
106 lexPunc =
107 do c <- satisfy isPuncChar
108 return (Punc [c])
109 where
110 isPuncChar c = c `elem` ",;()[]{}`"
111
112 -- ----------------------------------------------------------------------
113 -- Symbols
114
115 lexSymbol :: ReadP Lexeme
116 lexSymbol =
117 do s <- munch1 isSymbolChar
118 if s `elem` reserved_ops then
119 return (Punc s) -- Reserved-ops count as punctuation
120 else
121 return (Symbol s)
122 where
123 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
124 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
125
126 -- ----------------------------------------------------------------------
127 -- identifiers
128
129 lexId :: ReadP Lexeme
130 lexId = lex_nan <++ lex_id
131 where
132 -- NaN and Infinity look like identifiers, so
133 -- we parse them first.
134 lex_nan = (string "NaN" >> return (Rat notANumber)) +++
135 (string "Infinity" >> return (Rat infinity))
136
137 lex_id = do c <- satisfy isIdsChar
138 s <- munch isIdfChar
139 return (Ident (c:s))
140
141 -- Identifiers can start with a '_'
142 isIdsChar c = isAlpha c || c == '_'
143 isIdfChar c = isAlphaNum c || c `elem` "_'"
144
145 #ifndef __GLASGOW_HASKELL__
146 infinity, notANumber :: Rational
147 infinity = 1 :% 0
148 notANumber = 0 :% 0
149 #endif
150
151 -- ---------------------------------------------------------------------------
152 -- Lexing character literals
153
154 lexLitChar :: ReadP Lexeme
155 lexLitChar =
156 do _ <- char '\''
157 (c,esc) <- lexCharE
158 guard (esc || c /= '\'') -- Eliminate '' possibility
159 _ <- char '\''
160 return (Char c)
161
162 lexChar :: ReadP Char
163 lexChar = do { (c,_) <- lexCharE; return c }
164
165 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
166 lexCharE =
167 do c1 <- get
168 if c1 == '\\'
169 then do c2 <- lexEsc; return (c2, True)
170 else do return (c1, False)
171 where
172 lexEsc =
173 lexEscChar
174 +++ lexNumeric
175 +++ lexCntrlChar
176 +++ lexAscii
177
178 lexEscChar =
179 do c <- get
180 case c of
181 'a' -> return '\a'
182 'b' -> return '\b'
183 'f' -> return '\f'
184 'n' -> return '\n'
185 'r' -> return '\r'
186 't' -> return '\t'
187 'v' -> return '\v'
188 '\\' -> return '\\'
189 '\"' -> return '\"'
190 '\'' -> return '\''
191 _ -> pfail
192
193 lexNumeric =
194 do base <- lexBaseChar <++ return 10
195 n <- lexInteger base
196 guard (n <= toInteger (ord maxBound))
197 return (chr (fromInteger n))
198
199 lexCntrlChar =
200 do _ <- char '^'
201 c <- get
202 case c of
203 '@' -> return '\^@'
204 'A' -> return '\^A'
205 'B' -> return '\^B'
206 'C' -> return '\^C'
207 'D' -> return '\^D'
208 'E' -> return '\^E'
209 'F' -> return '\^F'
210 'G' -> return '\^G'
211 'H' -> return '\^H'
212 'I' -> return '\^I'
213 'J' -> return '\^J'
214 'K' -> return '\^K'
215 'L' -> return '\^L'
216 'M' -> return '\^M'
217 'N' -> return '\^N'
218 'O' -> return '\^O'
219 'P' -> return '\^P'
220 'Q' -> return '\^Q'
221 'R' -> return '\^R'
222 'S' -> return '\^S'
223 'T' -> return '\^T'
224 'U' -> return '\^U'
225 'V' -> return '\^V'
226 'W' -> return '\^W'
227 'X' -> return '\^X'
228 'Y' -> return '\^Y'
229 'Z' -> return '\^Z'
230 '[' -> return '\^['
231 '\\' -> return '\^\'
232 ']' -> return '\^]'
233 '^' -> return '\^^'
234 '_' -> return '\^_'
235 _ -> pfail
236
237 lexAscii =
238 do choice
239 [ (string "SOH" >> return '\SOH') <++
240 (string "SO" >> return '\SO')
241 -- \SO and \SOH need maximal-munch treatment
242 -- See the Haskell report Sect 2.6
243
244 , string "NUL" >> return '\NUL'
245 , string "STX" >> return '\STX'
246 , string "ETX" >> return '\ETX'
247 , string "EOT" >> return '\EOT'
248 , string "ENQ" >> return '\ENQ'
249 , string "ACK" >> return '\ACK'
250 , string "BEL" >> return '\BEL'
251 , string "BS" >> return '\BS'
252 , string "HT" >> return '\HT'
253 , string "LF" >> return '\LF'
254 , string "VT" >> return '\VT'
255 , string "FF" >> return '\FF'
256 , string "CR" >> return '\CR'
257 , string "SI" >> return '\SI'
258 , string "DLE" >> return '\DLE'
259 , string "DC1" >> return '\DC1'
260 , string "DC2" >> return '\DC2'
261 , string "DC3" >> return '\DC3'
262 , string "DC4" >> return '\DC4'
263 , string "NAK" >> return '\NAK'
264 , string "SYN" >> return '\SYN'
265 , string "ETB" >> return '\ETB'
266 , string "CAN" >> return '\CAN'
267 , string "EM" >> return '\EM'
268 , string "SUB" >> return '\SUB'
269 , string "ESC" >> return '\ESC'
270 , string "FS" >> return '\FS'
271 , string "GS" >> return '\GS'
272 , string "RS" >> return '\RS'
273 , string "US" >> return '\US'
274 , string "SP" >> return '\SP'
275 , string "DEL" >> return '\DEL'
276 ]
277
278
279 -- ---------------------------------------------------------------------------
280 -- string literal
281
282 lexString :: ReadP Lexeme
283 lexString =
284 do _ <- char '"'
285 body id
286 where
287 body f =
288 do (c,esc) <- lexStrItem
289 if c /= '"' || esc
290 then body (f.(c:))
291 else let s = f "" in
292 return (String s)
293
294 lexStrItem = (lexEmpty >> lexStrItem)
295 +++ lexCharE
296
297 lexEmpty =
298 do _ <- char '\\'
299 c <- get
300 case c of
301 '&' -> do return ()
302 _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
303 _ -> do pfail
304
305 -- ---------------------------------------------------------------------------
306 -- Lexing numbers
307
308 type Base = Int
309 type Digits = [Int]
310
311 lexNumber :: ReadP Lexeme
312 lexNumber
313 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
314 -- If that fails, try for a decimal number
315 lexDecNumber -- Start with ordinary digits
316
317 lexHexOct :: ReadP Lexeme
318 lexHexOct
319 = do _ <- char '0'
320 base <- lexBaseChar
321 digits <- lexDigits base
322 return (Int (val (fromIntegral base) 0 digits))
323
324 lexBaseChar :: ReadP Int
325 -- Lex a single character indicating the base; fail if not there
326 lexBaseChar = do { c <- get;
327 case c of
328 'o' -> return 8
329 'O' -> return 8
330 'x' -> return 16
331 'X' -> return 16
332 _ -> pfail }
333
334 lexDecNumber :: ReadP Lexeme
335 lexDecNumber =
336 do xs <- lexDigits 10
337 mFrac <- lexFrac <++ return Nothing
338 mExp <- lexExp <++ return Nothing
339 return (value xs mFrac mExp)
340 where
341 value xs mFrac mExp = valueFracExp (val 10 0 xs) mFrac mExp
342
343 valueFracExp :: Integer -> Maybe Digits -> Maybe Integer
344 -> Lexeme
345 valueFracExp a Nothing Nothing
346 = Int a -- 43
347 valueFracExp a Nothing (Just exp)
348 | exp >= 0 = Int (a * (10 ^ exp)) -- 43e7
349 | otherwise = Rat (a % (10 ^ (-exp))) -- 43e-7
350 valueFracExp a (Just fs) mExp -- 4.3[e2]
351 = Rat (fracExp (fromMaybe 0 mExp) a fs)
352 -- Be a bit more efficient in calculating the Rational.
353 -- Instead of calculating the fractional part alone, then
354 -- adding the integral part and finally multiplying with
355 -- 10 ^ exp if an exponent was given, do it all at once.
356
357 lexFrac :: ReadP (Maybe Digits)
358 -- Read the fractional part; fail if it doesn't
359 -- start ".d" where d is a digit
360 lexFrac = do _ <- char '.'
361 fraction <- lexDigits 10
362 return (Just fraction)
363
364 lexExp :: ReadP (Maybe Integer)
365 lexExp = do _ <- char 'e' +++ char 'E'
366 exp <- signedExp +++ lexInteger 10
367 return (Just exp)
368 where
369 signedExp
370 = do c <- char '-' +++ char '+'
371 n <- lexInteger 10
372 return (if c == '-' then -n else n)
373
374 lexDigits :: Int -> ReadP Digits
375 -- Lex a non-empty sequence of digits in specified base
376 lexDigits base =
377 do s <- look
378 xs <- scan s id
379 guard (not (null xs))
380 return xs
381 where
382 scan (c:cs) f = case valDig base c of
383 Just n -> do _ <- get; scan cs (f.(n:))
384 Nothing -> do return (f [])
385 scan [] f = do return (f [])
386
387 lexInteger :: Base -> ReadP Integer
388 lexInteger base =
389 do xs <- lexDigits base
390 return (val (fromIntegral base) 0 xs)
391
392 val :: Num a => a -> a -> Digits -> a
393 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
394 val _ y [] = y
395 val base y (x:xs) = y' `seq` val base y' xs
396 where
397 y' = y * base + fromIntegral x
398
399 -- Calculate a Rational from the exponent [of 10 to multiply with],
400 -- the integral part of the mantissa and the digits of the fractional
401 -- part. Leaving the calculation of the power of 10 until the end,
402 -- when we know the effective exponent, saves multiplications.
403 -- More importantly, this way we need at most one gcd instead of three.
404 --
405 -- frac was never used with anything but Integer and base 10, so
406 -- those are hardcoded now (trivial to change if necessary).
407 fracExp :: Integer -> Integer -> Digits -> Rational
408 fracExp exp mant []
409 | exp < 0 = mant % (10 ^ (-exp))
410 | otherwise = fromInteger (mant * 10 ^ exp)
411 fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
412 where
413 exp' = exp - 1
414 mant' = mant * 10 + fromIntegral d
415
416 valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
417 valDig 8 c
418 | '0' <= c && c <= '7' = Just (ord c - ord '0')
419 | otherwise = Nothing
420
421 valDig 10 c = valDecDig c
422
423 valDig 16 c
424 | '0' <= c && c <= '9' = Just (ord c - ord '0')
425 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
426 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
427 | otherwise = Nothing
428
429 valDig _ _ = error "valDig: Bad base"
430
431 valDecDig :: Char -> Maybe Int
432 valDecDig c
433 | '0' <= c && c <= '9' = Just (ord c - ord '0')
434 | otherwise = Nothing
435
436 -- ----------------------------------------------------------------------
437 -- other numeric lexing functions
438
439 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
440 readIntP base isDigit valDigit =
441 do s <- munch1 isDigit
442 return (val base 0 (map valDigit s))
443
444 readIntP' :: (Eq a, Num a) => a -> ReadP a
445 readIntP' base = readIntP base isDigit valDigit
446 where
447 isDigit c = maybe False (const True) (valDig base c)
448 valDigit c = maybe 0 id (valDig base c)
449
450 readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
451 readOctP = readIntP' 8
452 readDecP = readIntP' 10
453 readHexP = readIntP' 16