[project @ 2005-01-11 16:04:08 by simonmar]
[packages/old-time.git] / Text / Read / Lex.hs
1 {-# OPTIONS_GHC -fno-implicit-prelude #-}
2 -----------------------------------------------------------------------------
3 -- |
4 -- Module : Text.Read.Lex
5 -- Copyright : (c) The University of Glasgow 2002
6 -- License : BSD-style (see the file libraries/base/LICENSE)
7 --
8 -- Maintainer : libraries@haskell.org
9 -- Stability : provisional
10 -- Portability : non-portable (uses Text.ParserCombinators.ReadP)
11 --
12 -- The cut-down Haskell lexer, used by Text.Read
13 --
14 -----------------------------------------------------------------------------
15
16 module Text.Read.Lex
17 -- lexing types
18 ( Lexeme(..) -- :: *; Show, Eq
19
20 -- lexer
21 , lex -- :: ReadP Lexeme Skips leading spaces
22 , hsLex -- :: ReadP String
23 , lexChar -- :: ReadP Char Reads just one char, with H98 escapes
24
25 , readIntP -- :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
26 , readOctP -- :: Num a => ReadP a
27 , readDecP -- :: Num a => ReadP a
28 , readHexP -- :: Num a => ReadP a
29 )
30 where
31
32 import Text.ParserCombinators.ReadP
33
34 #ifdef __GLASGOW_HASKELL__
35 import GHC.Base
36 import GHC.Num( Num(..), Integer )
37 import GHC.Show( Show(..) )
38 #ifndef __HADDOCK__
39 import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum )
40 #endif
41 import GHC.Real( Ratio(..), Integral, Rational, (%), fromIntegral,
42 toInteger, (^), (^^), infinity, notANumber )
43 import GHC.List
44 import GHC.Enum( maxBound )
45 #else
46 import Prelude hiding ( lex )
47 import Data.Char( chr, ord, isSpace, isAlpha, isAlphaNum )
48 import Data.Ratio( Ratio, (%) )
49 #endif
50 #ifdef __HUGS__
51 import Hugs.Prelude( Ratio(..) )
52 #endif
53 import Data.Maybe
54 import Control.Monad
55
56 -- -----------------------------------------------------------------------------
57 -- Lexing types
58
59 data Lexeme
60 = Char Char -- Quotes removed,
61 | String String -- escapes interpreted
62 | Punc String -- Punctuation, eg "(", "::"
63 | Ident String -- Haskell identifiers, e.g. foo, baz
64 | Symbol String -- Haskell symbols, e.g. >>, %
65 | Int Integer
66 | Rat Rational
67 | EOF
68 deriving (Eq, Show)
69
70 -- -----------------------------------------------------------------------------
71 -- Lexing
72
73 lex :: ReadP Lexeme
74 lex = skipSpaces >> lexToken
75
76 hsLex :: ReadP String
77 -- ^ Haskell lexer: returns the lexed string, rather than the lexeme
78 hsLex = do skipSpaces
79 (s,_) <- gather lexToken
80 return s
81
82 lexToken :: ReadP Lexeme
83 lexToken = lexEOF +++
84 lexLitChar +++
85 lexString +++
86 lexPunc +++
87 lexSymbol +++
88 lexId +++
89 lexNumber
90
91
92 -- ----------------------------------------------------------------------
93 -- End of file
94 lexEOF :: ReadP Lexeme
95 lexEOF = do s <- look
96 guard (null s)
97 return EOF
98
99 -- ---------------------------------------------------------------------------
100 -- Single character lexemes
101
102 lexPunc :: ReadP Lexeme
103 lexPunc =
104 do c <- satisfy isPuncChar
105 return (Punc [c])
106 where
107 isPuncChar c = c `elem` ",;()[]{}`"
108
109 -- ----------------------------------------------------------------------
110 -- Symbols
111
112 lexSymbol :: ReadP Lexeme
113 lexSymbol =
114 do s <- munch1 isSymbolChar
115 if s `elem` reserved_ops then
116 return (Punc s) -- Reserved-ops count as punctuation
117 else
118 return (Symbol s)
119 where
120 isSymbolChar c = c `elem` "!@#$%&*+./<=>?\\^|:-~"
121 reserved_ops = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]
122
123 -- ----------------------------------------------------------------------
124 -- identifiers
125
126 lexId :: ReadP Lexeme
127 lexId = lex_nan <++ lex_id
128 where
129 -- NaN and Infinity look like identifiers, so
130 -- we parse them first.
131 lex_nan = (string "NaN" >> return (Rat notANumber)) +++
132 (string "Infinity" >> return (Rat infinity))
133
134 lex_id = do c <- satisfy isIdsChar
135 s <- munch isIdfChar
136 return (Ident (c:s))
137
138 -- Identifiers can start with a '_'
139 isIdsChar c = isAlpha c || c == '_'
140 isIdfChar c = isAlphaNum c || c `elem` "_'"
141
142 #ifndef __GLASGOW_HASKELL__
143 infinity, notANumber :: Rational
144 infinity = 1 :% 0
145 notANumber = 0 :% 0
146 #endif
147
148 -- ---------------------------------------------------------------------------
149 -- Lexing character literals
150
151 lexLitChar :: ReadP Lexeme
152 lexLitChar =
153 do char '\''
154 (c,esc) <- lexCharE
155 guard (esc || c /= '\'') -- Eliminate '' possibility
156 char '\''
157 return (Char c)
158
159 lexChar :: ReadP Char
160 lexChar = do { (c,_) <- lexCharE; return c }
161
162 lexCharE :: ReadP (Char, Bool) -- "escaped or not"?
163 lexCharE =
164 do c <- get
165 if c == '\\'
166 then do c <- lexEsc; return (c, True)
167 else do return (c, False)
168 where
169 lexEsc =
170 lexEscChar
171 +++ lexNumeric
172 +++ lexCntrlChar
173 +++ lexAscii
174
175 lexEscChar =
176 do c <- get
177 case c of
178 'a' -> return '\a'
179 'b' -> return '\b'
180 'f' -> return '\f'
181 'n' -> return '\n'
182 'r' -> return '\r'
183 't' -> return '\t'
184 'v' -> return '\v'
185 '\\' -> return '\\'
186 '\"' -> return '\"'
187 '\'' -> return '\''
188 _ -> pfail
189
190 lexNumeric =
191 do base <- lexBaseChar <++ return 10
192 n <- lexInteger base
193 guard (n <= toInteger (ord maxBound))
194 return (chr (fromInteger n))
195
196 lexCntrlChar =
197 do char '^'
198 c <- get
199 case c of
200 '@' -> return '\^@'
201 'A' -> return '\^A'
202 'B' -> return '\^B'
203 'C' -> return '\^C'
204 'D' -> return '\^D'
205 'E' -> return '\^E'
206 'F' -> return '\^F'
207 'G' -> return '\^G'
208 'H' -> return '\^H'
209 'I' -> return '\^I'
210 'J' -> return '\^J'
211 'K' -> return '\^K'
212 'L' -> return '\^L'
213 'M' -> return '\^M'
214 'N' -> return '\^N'
215 'O' -> return '\^O'
216 'P' -> return '\^P'
217 'Q' -> return '\^Q'
218 'R' -> return '\^R'
219 'S' -> return '\^S'
220 'T' -> return '\^T'
221 'U' -> return '\^U'
222 'V' -> return '\^V'
223 'W' -> return '\^W'
224 'X' -> return '\^X'
225 'Y' -> return '\^Y'
226 'Z' -> return '\^Z'
227 '[' -> return '\^['
228 '\\' -> return '\^\'
229 ']' -> return '\^]'
230 '^' -> return '\^^'
231 '_' -> return '\^_'
232 _ -> pfail
233
234 lexAscii =
235 do choice
236 [ (string "SOH" >> return '\SOH') <++
237 (string "SO" >> return '\SO')
238 -- \SO and \SOH need maximal-munch treatment
239 -- See the Haskell report Sect 2.6
240
241 , string "NUL" >> return '\NUL'
242 , string "STX" >> return '\STX'
243 , string "ETX" >> return '\ETX'
244 , string "EOT" >> return '\EOT'
245 , string "ENQ" >> return '\ENQ'
246 , string "ACK" >> return '\ACK'
247 , string "BEL" >> return '\BEL'
248 , string "BS" >> return '\BS'
249 , string "HT" >> return '\HT'
250 , string "LF" >> return '\LF'
251 , string "VT" >> return '\VT'
252 , string "FF" >> return '\FF'
253 , string "CR" >> return '\CR'
254 , string "SI" >> return '\SI'
255 , string "DLE" >> return '\DLE'
256 , string "DC1" >> return '\DC1'
257 , string "DC2" >> return '\DC2'
258 , string "DC3" >> return '\DC3'
259 , string "DC4" >> return '\DC4'
260 , string "NAK" >> return '\NAK'
261 , string "SYN" >> return '\SYN'
262 , string "ETB" >> return '\ETB'
263 , string "CAN" >> return '\CAN'
264 , string "EM" >> return '\EM'
265 , string "SUB" >> return '\SUB'
266 , string "ESC" >> return '\ESC'
267 , string "FS" >> return '\FS'
268 , string "GS" >> return '\GS'
269 , string "RS" >> return '\RS'
270 , string "US" >> return '\US'
271 , string "SP" >> return '\SP'
272 , string "DEL" >> return '\DEL'
273 ]
274
275
276 -- ---------------------------------------------------------------------------
277 -- string literal
278
279 lexString :: ReadP Lexeme
280 lexString =
281 do char '"'
282 body id
283 where
284 body f =
285 do (c,esc) <- lexStrItem
286 if c /= '"' || esc
287 then body (f.(c:))
288 else let s = f "" in
289 return (String s)
290
291 lexStrItem = (lexEmpty >> lexStrItem)
292 +++ lexCharE
293
294 lexEmpty =
295 do char '\\'
296 c <- get
297 case c of
298 '&' -> do return ()
299 _ | isSpace c -> do skipSpaces; char '\\'; return ()
300 _ -> do pfail
301
302 -- ---------------------------------------------------------------------------
303 -- Lexing numbers
304
305 type Base = Int
306 type Digits = [Int]
307
308 lexNumber :: ReadP Lexeme
309 lexNumber
310 = lexHexOct <++ -- First try for hex or octal 0x, 0o etc
311 -- If that fails, try for a decimal number
312 lexDecNumber -- Start with ordinary digits
313
314 lexHexOct :: ReadP Lexeme
315 lexHexOct
316 = do char '0'
317 base <- lexBaseChar
318 digits <- lexDigits base
319 return (Int (val (fromIntegral base) 0 digits))
320
321 lexBaseChar :: ReadP Int
322 -- Lex a single character indicating the base; fail if not there
323 lexBaseChar = do { c <- get;
324 case c of
325 'o' -> return 8
326 'O' -> return 8
327 'x' -> return 16
328 'X' -> return 16
329 _ -> pfail }
330
331 lexDecNumber :: ReadP Lexeme
332 lexDecNumber =
333 do xs <- lexDigits 10
334 mFrac <- lexFrac <++ return Nothing
335 mExp <- lexExp <++ return Nothing
336 return (value xs mFrac mExp)
337 where
338 value xs mFrac mExp = valueFracExp (val 10 0 xs) mFrac mExp
339
340 valueFracExp :: Integer -> Maybe Digits -> Maybe Integer
341 -> Lexeme
342 valueFracExp a Nothing Nothing
343 = Int a -- 43
344 valueFracExp a Nothing (Just exp)
345 | exp >= 0 = Int (a * (10 ^ exp)) -- 43e7
346 | otherwise = Rat (valExp (fromInteger a) exp) -- 43e-7
347 valueFracExp a (Just fs) mExp
348 = case mExp of
349 Nothing -> Rat rat -- 4.3
350 Just exp -> Rat (valExp rat exp) -- 4.3e-4
351 where
352 rat :: Rational
353 rat = fromInteger a + frac 10 0 1 fs
354
355 valExp :: Rational -> Integer -> Rational
356 valExp rat exp = rat * (10 ^^ exp)
357
358 lexFrac :: ReadP (Maybe Digits)
359 -- Read the fractional part; fail if it doesn't
360 -- start ".d" where d is a digit
361 lexFrac = do char '.'
362 frac <- lexDigits 10
363 return (Just frac)
364
365 lexExp :: ReadP (Maybe Integer)
366 lexExp = do char 'e' +++ char 'E'
367 exp <- signedExp +++ lexInteger 10
368 return (Just exp)
369 where
370 signedExp
371 = do c <- char '-' +++ char '+'
372 n <- lexInteger 10
373 return (if c == '-' then -n else n)
374
375 lexDigits :: Int -> ReadP Digits
376 -- Lex a non-empty sequence of digits in specified base
377 lexDigits base =
378 do s <- look
379 xs <- scan s id
380 guard (not (null xs))
381 return xs
382 where
383 scan (c:cs) f = case valDig base c of
384 Just n -> do get; scan cs (f.(n:))
385 Nothing -> do return (f [])
386 scan [] f = do return (f [])
387
388 lexInteger :: Base -> ReadP Integer
389 lexInteger base =
390 do xs <- lexDigits base
391 return (val (fromIntegral base) 0 xs)
392
393 val :: Num a => a -> a -> Digits -> a
394 -- val base y [d1,..,dn] = y ++ [d1,..,dn], as it were
395 val base y [] = y
396 val base y (x:xs) = y' `seq` val base y' xs
397 where
398 y' = y * base + fromIntegral x
399
400 frac :: Integral a => a -> a -> a -> Digits -> Ratio a
401 frac base a b [] = a % b
402 frac base a b (x:xs) = a' `seq` b' `seq` frac base a' b' xs
403 where
404 a' = a * base + fromIntegral x
405 b' = b * base
406
407 valDig :: Num a => a -> Char -> Maybe Int
408 valDig 8 c
409 | '0' <= c && c <= '7' = Just (ord c - ord '0')
410 | otherwise = Nothing
411
412 valDig 10 c = valDecDig c
413
414 valDig 16 c
415 | '0' <= c && c <= '9' = Just (ord c - ord '0')
416 | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
417 | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
418 | otherwise = Nothing
419
420 valDecDig c
421 | '0' <= c && c <= '9' = Just (ord c - ord '0')
422 | otherwise = Nothing
423
424 -- ----------------------------------------------------------------------
425 -- other numeric lexing functions
426
427 readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
428 readIntP base isDigit valDigit =
429 do s <- munch1 isDigit
430 return (val base 0 (map valDigit s))
431
432 readIntP' :: Num a => a -> ReadP a
433 readIntP' base = readIntP base isDigit valDigit
434 where
435 isDigit c = maybe False (const True) (valDig base c)
436 valDigit c = maybe 0 id (valDig base c)
437
438 readOctP, readDecP, readHexP :: Num a => ReadP a
439 readOctP = readIntP' 8
440 readDecP = readIntP' 10
441 readHexP = readIntP' 16