[project @ 2003-04-04 14:36:31 by simonpj]
[packages/base.git] / Text / Read / Lex.hs
1 {-# OPTIONS -fno-implicit-prelude #-}
2 -----------------------------------------------------------------------------
3 -- |
4 -- Module : Text.Read.Lex
5 -- Copyright : (c) The University of Glasgow 2002
6 -- License : BSD-style (see the file libraries/base/LICENSE)
7 --
8 -- Maintainer : libraries@haskell.org
9 -- Stability : provisional
10 -- Portability : portable
11 --
12 -- The cut-down Haskell lexer, used by Text.Read
13 --
14 -----------------------------------------------------------------------------
15
16 module Text.Read.Lex
17 -- lexing types
18 ( Lexeme(..) -- :: *; Show, Eq
19
20 -- lexer
21 , lex -- :: ReadP Lexeme Skips leading spaces
22 , hsLex -- :: ReadP String
23 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
24
25 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
26 , readOctP -- :: Num a => ReadP a
27 , readDecP -- :: Num a => ReadP a
28 , readHexP -- :: Num a => ReadP a
29 )
30 where
31
32 import Text.ParserCombinators.ReadP
33
34 import GHC.Base
35 import GHC.Num( Num(..), Integer )
36 import GHC.Show( Show(.. ), showChar, showString,
37 isSpace, isAlpha, isAlphaNum,
38 isOctDigit, isHexDigit, toUpper )
39 import GHC.Real( Ratio(..), Integral, Rational, (%), fromIntegral, fromRational,
40 toInteger, (^), (^^), infinity, notANumber )
41 import GHC.Float( Float, Double )
42 import GHC.List
43 import GHC.Show( ShowS, shows )
44 import GHC.Enum( minBound, maxBound )
45 import Data.Maybe
46 import Data.Either
47 import Control.Monad
48
49 -- -----------------------------------------------------------------------------
50 -- Lexing types
51
52 type LexP = ReadP Lexeme
53
54 data Lexeme
55 = Char Char -- Quotes removed,
56 | String String -- escapes interpreted
57 | Punc String -- Punctuation, eg "(", "::"
58 | Ident String -- Haskell identifiers, e.g. foo, baz
59 | Symbol String -- Haskell symbols, e.g. >>, %
60 | Int Integer
61 | Rat Rational
62 | EOF
63 deriving (Eq, Show)
64
65 -- -----------------------------------------------------------------------------
66 -- Lexing
67
68 lex :: ReadP Lexeme
69 lex = skipSpaces >> lexToken
70
71 hsLex :: ReadP String
72 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
73 hsLex = do skipSpaces
74 (s,_) <- gather lexToken
75 return s
76
77 lexToken :: ReadP Lexeme
78 lexToken = lexEOF +++
79 lexLitChar +++
80 lexString +++
81 lexPunc +++
82 lexSymbol +++
83 lexId +++
84 lexNumber
85
86
87 -- ----------------------------------------------------------------------
88 -- End of file
89 lexEOF :: ReadP Lexeme
90 lexEOF = do s <- look
91 guard (null s)
92 return EOF
93
94 -- ---------------------------------------------------------------------------
95 -- Single character lexemes
96
97 lexPunc :: ReadP Lexeme
98 lexPunc =
99 do c <- satisfy isPuncChar
100 return (Punc [c])
101 where
102 isPuncChar c = c `elem` ",;()[]{}`"
103
104 -- ----------------------------------------------------------------------
105 -- Symbols
106
107 lexSymbol :: ReadP Lexeme
108 lexSymbol =
109 do s <- munch1 isSymbolChar
110 if s `elem` reserved_ops then
111 return (Punc s) -- Reserved-ops count as punctuation
112 else
113 return (Symbol s)
114 where
115 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
116 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
117
118 -- ----------------------------------------------------------------------
119 -- identifiers
120
121 lexId :: ReadP Lexeme
122 lexId =
123 do c <- satisfy isIdsChar
124 s <- munch isIdfChar
125 return (Ident (c:s))
126 where
127 -- Identifiers can start with a '_'
128 isIdsChar c = isAlpha c || c == '_'
129 isIdfChar c = isAlphaNum c || c `elem` "_'"
130
131 -- ---------------------------------------------------------------------------
132 -- Lexing character literals
133
134 lexLitChar :: ReadP Lexeme
135 lexLitChar =
136 do char '\''
137 (c,esc) <- lexCharE
138 guard (esc || c /= '\'') -- Eliminate '' possibility
139 char '\''
140 return (Char c)
141
142 lexChar :: ReadP Char
143 lexChar = do { (c,_) <- lexCharE; return c }
144
145 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
146 lexCharE =
147 do c <- get
148 if c == '\\'
149 then do c <- lexEsc; return (c, True)
150 else do return (c, False)
151 where
152 lexEsc =
153 lexEscChar
154 +++ lexNumeric
155 +++ lexCntrlChar
156 +++ lexAscii
157
158 lexEscChar =
159 do c <- get
160 case c of
161 'a' -> return '\a'
162 'b' -> return '\b'
163 'f' -> return '\f'
164 'n' -> return '\n'
165 'r' -> return '\r'
166 't' -> return '\t'
167 'v' -> return '\v'
168 '\\' -> return '\\'
169 '\"' -> return '\"'
170 '\'' -> return '\''
171 _ -> pfail
172
173 lexNumeric =
174 do base <- lexBase
175 n <- lexInteger base
176 guard (n <= toInteger (ord maxBound))
177 return (chr (fromInteger n))
178 where
179 lexBase =
180 do s <- look
181 case s of
182 'o':_ -> do get; return 8
183 'x':_ -> do get; return 16
184 _ -> do return 10
185
186 lexCntrlChar =
187 do char '^'
188 c <- get
189 case c of
190 '@' -> return '\^@'
191 'A' -> return '\^A'
192 'B' -> return '\^B'
193 'C' -> return '\^C'
194 'D' -> return '\^D'
195 'E' -> return '\^E'
196 'F' -> return '\^F'
197 'G' -> return '\^G'
198 'H' -> return '\^H'
199 'I' -> return '\^I'
200 'J' -> return '\^J'
201 'K' -> return '\^K'
202 'L' -> return '\^L'
203 'M' -> return '\^M'
204 'N' -> return '\^N'
205 'O' -> return '\^O'
206 'P' -> return '\^P'
207 'Q' -> return '\^Q'
208 'R' -> return '\^R'
209 'S' -> return '\^S'
210 'T' -> return '\^T'
211 'U' -> return '\^U'
212 'V' -> return '\^V'
213 'W' -> return '\^W'
214 'X' -> return '\^X'
215 'Y' -> return '\^Y'
216 'Z' -> return '\^Z'
217 '[' -> return '\^['
218 '\\' -> return '\^\'
219 ']' -> return '\^]'
220 '^' -> return '\^^'
221 '_' -> return '\^_'
222 _ -> pfail
223
224 lexAscii =
225 do choice
226 [ do { string "SO" ; s <- look;
227 case s of
228 'H' : _ -> do { get ; return '\SOH' }
229 other -> return '\SO'
230 }
231 -- \SO and \SOH need maximal-munch treatment
232 -- See the Haskell report Sect 2.6
233 , string "NUL" >> return '\NUL'
234 , string "STX" >> return '\STX'
235 , string "ETX" >> return '\ETX'
236 , string "EOT" >> return '\EOT'
237 , string "ENQ" >> return '\ENQ'
238 , string "ACK" >> return '\ACK'
239 , string "BEL" >> return '\BEL'
240 , string "BS" >> return '\BS'
241 , string "HT" >> return '\HT'
242 , string "LF" >> return '\LF'
243 , string "VT" >> return '\VT'
244 , string "FF" >> return '\FF'
245 , string "CR" >> return '\CR'
246 , string "SI" >> return '\SI'
247 , string "DLE" >> return '\DLE'
248 , string "DC1" >> return '\DC1'
249 , string "DC2" >> return '\DC2'
250 , string "DC3" >> return '\DC3'
251 , string "DC4" >> return '\DC4'
252 , string "NAK" >> return '\NAK'
253 , string "SYN" >> return '\SYN'
254 , string "ETB" >> return '\ETB'
255 , string "CAN" >> return '\CAN'
256 , string "EM" >> return '\EM'
257 , string "SUB" >> return '\SUB'
258 , string "ESC" >> return '\ESC'
259 , string "FS" >> return '\FS'
260 , string "GS" >> return '\GS'
261 , string "RS" >> return '\RS'
262 , string "US" >> return '\US'
263 , string "SP" >> return '\SP'
264 , string "DEL" >> return '\DEL'
265 ]
266
267
268 -- ---------------------------------------------------------------------------
269 -- string literal
270
271 lexString :: ReadP Lexeme
272 lexString =
273 do char '"'
274 body id
275 where
276 body f =
277 do (c,esc) <- lexStrItem
278 if c /= '"' || esc
279 then body (f.(c:))
280 else let s = f "" in
281 return (String s)
282
283 lexStrItem = (lexEmpty >> lexStrItem)
284 +++ lexCharE
285
286 lexEmpty =
287 do char '\\'
288 c <- get
289 case c of
290 '&' -> do return ()
291 _ | isSpace c -> do skipSpaces; char '\\'; return ()
292 _ -> do pfail
293
294 -- ---------------------------------------------------------------------------
295 -- Lexing numbers
296
297 type Base = Int
298 type Digits = [Int]
299
300 showDigit :: Int -> ShowS
301 showDigit n | n <= 9 = shows n
302 | otherwise = showChar (chr (n + ord 'A' - 10))
303
304 lexNumber :: ReadP Lexeme
305 lexNumber = do { string "NaN"; return (Rat notANumber) } +++
306 do { string "Infinity"; return (Rat infinity) } +++
307 do { base <- lexBase ; lexNumberBase base }
308 where
309 lexBase =
310 do s <- look
311 case s of
312 '0':'o':_ -> do get; get; return 8
313 '0':'O':_ -> do get; get; return 8
314 '0':'x':_ -> do get; get; return 16
315 '0':'X':_ -> do get; get; return 16
316 _ -> do return 10
317
318 lexNumberBase :: Base -> ReadP Lexeme
319 lexNumberBase base =
320 do xs <- lexDigits base
321 mFrac <- lexFrac base
322 mExp <- lexExp base
323 return (value xs mFrac mExp)
324 where
325 baseInteger :: Integer
326 baseInteger = fromIntegral base
327
328 value xs mFrac mExp = valueFracExp (val baseInteger 0 xs) mFrac mExp
329
330 valueFracExp :: Integer -> Maybe Digits -> Maybe Integer
331 -> Lexeme
332 valueFracExp a Nothing Nothing
333 = Int a -- 43
334 valueFracExp a Nothing (Just exp)
335 | exp >= 0 = Int (a * (baseInteger ^ exp)) -- 43e7
336 | otherwise = Rat (valExp (fromInteger a) exp) -- 43e-7
337 valueFracExp a (Just fs) mExp
338 = case mExp of
339 Nothing -> Rat rat -- 4.3
340 Just exp -> Rat (valExp rat exp) -- 4.3e-4
341 where
342 rat :: Rational
343 rat = fromInteger a + frac (fromIntegral base) 0 1 fs
344
345 valExp :: Rational -> Integer -> Rational
346 valExp rat exp = rat * (fromIntegral base ^^ exp)
347
348 lexFrac :: Base -> ReadP (Maybe Digits)
349 lexFrac base =
350 do s <- look
351 case s of
352 '.' : d : _ | isJust (valDig base d) ->
353 -- The lookahead checks for point and at least one
354 -- valid following digit. For example 1..n must
355 -- lex the "1" off rather than failing.
356 do get
357 frac <- lexDigits base
358 return (Just frac)
359
360 _ ->
361 do return Nothing
362
363 lexExp :: Base -> ReadP (Maybe Integer)
364 lexExp base =
365 do s <- look
366 case s of
367 e : _ | e `elem` "eE" && base == 10 ->
368 do get
369 (signedExp +++ exp)
370 where
371 signedExp =
372 do c <- char '-' +++ char '+'
373 n <- lexInteger 10
374 return (Just (if c == '-' then -n else n))
375
376 exp =
377 do n <- lexInteger 10
378 return (Just n)
379
380 _ ->
381 do return Nothing
382
383 lexDigits :: Int -> ReadP Digits
384 -- Lex a non-empty sequence of digits in specified base
385 lexDigits base =
386 do s <- look
387 xs <- scan s id
388 guard (not (null xs))
389 return xs
390 where
391 scan (c:cs) f = case valDig base c of
392 Just n -> do get; scan cs (f.(n:))
393 Nothing -> do return (f [])
394 scan [] f = do return (f [])
395
396 lexInteger :: Base -> ReadP Integer
397 lexInteger base =
398 do xs <- lexDigits base
399 return (val (fromIntegral base) 0 xs)
400
401 val :: Num a => a -> a -> Digits -> a
402 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
403 val base y [] = y
404 val base y (x:xs) = y' `seq` val base y' xs
405 where
406 y' = y * base + fromIntegral x
407
408 frac :: Integral a => a -> a -> a -> Digits -> Ratio a
409 frac base a b [] = a % b
410 frac base a b (x:xs) = a' `seq` b' `seq` frac base a' b' xs
411 where
412 a' = a * base + fromIntegral x
413 b' = b * base
414
415 valDig :: Num a => a -> Char -> Maybe Int
416 valDig 8 c
417 | '0' <= c && c <= '7' = Just (ord c - ord '0')
418 | otherwise = Nothing
419
420 valDig 10 c
421 | '0' <= c && c <= '9' = Just (ord c - ord '0')
422 | otherwise = Nothing
423
424 valDig 16 c
425 | '0' <= c && c <= '9' = Just (ord c - ord '0')
426 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
427 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
428 | otherwise = Nothing
429
430 -- ----------------------------------------------------------------------
431 -- other numeric lexing functions
432
433 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
434 readIntP base isDigit valDigit =
435 do s <- munch1 isDigit
436 return (val base 0 (map valDigit s))
437
438 readIntP' :: Num a => a -> ReadP a
439 readIntP' base = readIntP base isDigit valDigit
440 where
441 isDigit c = maybe False (const True) (valDig base c)
442 valDigit c = maybe 0 id (valDig base c)
443
444 readOctP, readDecP, readHexP :: Num a => ReadP a
445 readOctP = readIntP' 8
446 readDecP = readIntP' 10
447 readHexP = readIntP' 16