-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Parsec.Token
-- Copyright   :  (c) Daan Leijen 1999-2001, (c) Paolo Martini 2007
-- License     :  BSD-style (see the LICENSE file)
--
-- Maintainer  :  derek.a.elkins@gmail.com
-- Stability   :  provisional
-- Portability :  non-portable (uses local universal quantification: PolymorphicComponents)
--
-- A helper module to parse lexical elements (tokens). See 'makeTokenParser'
-- for a description of how to use it.
--
-----------------------------------------------------------------------------

{-# LANGUAGE PolymorphicComponents #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}

module Text.Parsec.Token
    ( LanguageDef
    , GenLanguageDef (..)
    , TokenParser
    , GenTokenParser (..)
    , makeTokenParser
    ) where

import Data.Char ( isAlpha, toLower, toUpper, isSpace, digitToInt )
import Data.List ( nub, sort )
import Control.Monad.Identity
import Text.Parsec.Prim
import Text.Parsec.Char
import Text.Parsec.Combinator

-----------------------------------------------------------
-- Language Definition
-----------------------------------------------------------

type LanguageDef st = GenLanguageDef String st Identity

-- | The @GenLanguageDef@ type is a record that contains all parameterizable
-- features of the "Text.Parsec.Token" module. The module "Text.Parsec.Language"
-- contains some default definitions.

data GenLanguageDef s u m
    = LanguageDef {

    -- | Describes the start of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"\/*\".

    commentStart   :: String,

    -- | Describes the end of a block comment. Use the empty string if the
    -- language doesn't support block comments. For example \"*\/\".

    commentEnd     :: String,

    -- | Describes the start of a line comment. Use the empty string if the
    -- language doesn't support line comments. For example \"\/\/\".

    commentLine    :: String,

    -- | Set to 'True' if the language supports nested block comments.

    nestedComments :: Bool,

    -- | This parser should accept any start characters of identifiers. For
    -- example @letter \<|> char \'_\'@.

    identStart     :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of identifiers.
    -- For example @alphaNum \<|> char \'_\'@.

    identLetter    :: ParsecT s u m Char,

    -- | This parser should accept any start characters of operators. For
    -- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@

    opStart        :: ParsecT s u m Char,

    -- | This parser should accept any legal tail characters of operators.
    -- Note that this parser should be defined even if the language doesn't
    -- support user-defined operators; otherwise the 'reservedOp'
    -- parser won't work correctly.

    opLetter       :: ParsecT s u m Char,

    -- | The list of reserved identifiers.

    reservedNames  :: [String],

    -- | The list of reserved operators.

    reservedOpNames:: [String],

    -- | Set to 'True' if the language is case sensitive.

    caseSensitive  :: Bool

    }

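-- A minimal sketch of a language definition, shown here with assumed example
-- values (it is not part of this module); the ready-made definitions in
-- "Text.Parsec.Language" are usually a better starting point:
--
-- > simpleDef :: LanguageDef st
-- > simpleDef = LanguageDef
-- >             { commentStart    = "{-"
-- >             , commentEnd      = "-}"
-- >             , commentLine     = "--"
-- >             , nestedComments  = True
-- >             , identStart      = letter <|> char '_'
-- >             , identLetter     = alphaNum <|> char '_'
-- >             , opStart         = opLetter simpleDef
-- >             , opLetter        = oneOf ":!#$%&*+./<=>?@\\^|-~"
-- >             , reservedNames   = []
-- >             , reservedOpNames = []
-- >             , caseSensitive   = True
-- >             }
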
-----------------------------------------------------------
-- A first class module: TokenParser
-----------------------------------------------------------

type TokenParser st = GenTokenParser String st Identity

-- | The type of the record that holds lexical parsers that work on
-- @s@ streams with state @u@ over a monad @m@.

data GenTokenParser s u m
    = TokenParser {

    -- | This lexeme parser parses a legal identifier. Returns the identifier
    -- string. This parser will fail on identifiers that are reserved
    -- words. Legal identifier (start) characters and reserved words are
    -- defined in the 'LanguageDef' that is passed to
    -- 'makeTokenParser'. An @identifier@ is treated as
    -- a single token using 'try'.

    identifier      :: ParsecT s u m String,

    -- | The lexeme parser @reserved name@ parses @symbol
    -- name@, but it also checks that the @name@ is not a prefix of a
    -- valid identifier. A @reserved@ word is treated as a single token
    -- using 'try'.

    reserved        :: String -> ParsecT s u m (),

    -- | This lexeme parser parses a legal operator. Returns the name of the
    -- operator. This parser will fail on any operators that are reserved
    -- operators. Legal operator (start) characters and reserved operators
    -- are defined in the 'LanguageDef' that is passed to
    -- 'makeTokenParser'. An @operator@ is treated as a
    -- single token using 'try'.

    operator        :: ParsecT s u m String,

    -- | The lexeme parser @reservedOp name@ parses @symbol
    -- name@, but it also checks that the @name@ is not a prefix of a
    -- valid operator. A @reservedOp@ is treated as a single token using
    -- 'try'.

    reservedOp      :: String -> ParsecT s u m (),


    -- | This lexeme parser parses a single literal character. Returns the
    -- literal character value. This parser deals correctly with escape
    -- sequences. The literal character is parsed according to the grammar
    -- rules defined in the Haskell report (which matches most programming
    -- languages quite closely).

    charLiteral     :: ParsecT s u m Char,

    -- | This lexeme parser parses a literal string. Returns the literal
    -- string value. This parser deals correctly with escape sequences and
    -- gaps. The literal string is parsed according to the grammar rules
    -- defined in the Haskell report (which matches most programming
    -- languages quite closely).

    stringLiteral   :: ParsecT s u m String,

    -- | This lexeme parser parses a natural number (a positive whole
    -- number). Returns the value of the number. The number can be
    -- specified in 'decimal', 'hexadecimal' or
    -- 'octal'. The number is parsed according to the grammar
    -- rules in the Haskell report.

    natural         :: ParsecT s u m Integer,

    -- | This lexeme parser parses an integer (a whole number). This parser
    -- is like 'natural' except that it can be prefixed with a
    -- sign (i.e. \'-\' or \'+\'). Returns the value of the number. The
    -- number can be specified in 'decimal', 'hexadecimal'
    -- or 'octal'. The number is parsed according
    -- to the grammar rules in the Haskell report.

    integer         :: ParsecT s u m Integer,

    -- | This lexeme parser parses a floating point value. Returns the value
    -- of the number. The number is parsed according to the grammar rules
    -- defined in the Haskell report.

    float           :: ParsecT s u m Double,

    -- | This lexeme parser parses either 'natural' or a 'float'.
    -- Returns the value of the number. This parser deals with
    -- any overlap in the grammar rules for naturals and floats. The number
    -- is parsed according to the grammar rules defined in the Haskell report.

    naturalOrFloat  :: ParsecT s u m (Either Integer Double),

    -- | Parses a positive whole number in the decimal system. Returns the
    -- value of the number.

    decimal         :: ParsecT s u m Integer,

    -- | Parses a positive whole number in the hexadecimal system. The number
    -- should be prefixed with \"0x\" or \"0X\". Returns the value of the
    -- number.

    hexadecimal     :: ParsecT s u m Integer,

    -- | Parses a positive whole number in the octal system. The number
    -- should be prefixed with \"0o\" or \"0O\". Returns the value of the
    -- number.

    octal           :: ParsecT s u m Integer,

    -- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
    -- trailing white space.

    symbol          :: String -> ParsecT s u m String,

    -- | @lexeme p@ first applies parser @p@ and then the 'whiteSpace'
    -- parser, returning the value of @p@. Every lexical
    -- token (lexeme) is defined using @lexeme@; this way every parse
    -- starts at a point without white space. Parsers that use @lexeme@ are
    -- called /lexeme/ parsers in this document.
    --
    -- The only point where the 'whiteSpace' parser should be
    -- called explicitly is the start of the main parser in order to skip
    -- any leading white space.
    --
    -- > mainParser  = do{ whiteSpace
    -- >                  ; ds <- many (lexeme digit)
    -- >                  ; eof
    -- >                  ; return (sum (map digitToInt ds))
    -- >                  }

    lexeme          :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | Parses any white space. White space consists of /zero/ or more
    -- occurrences of a 'space', a line comment or a block (multi
    -- line) comment. Block comments may be nested. How comments are
    -- started and ended is defined in the 'LanguageDef'
    -- that is passed to 'makeTokenParser'.

    whiteSpace      :: ParsecT s u m (),

    -- | Lexeme parser @parens p@ parses @p@ enclosed in parentheses,
    -- returning the value of @p@.

    parens          :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | Lexeme parser @braces p@ parses @p@ enclosed in braces (\'{\' and
    -- \'}\'), returning the value of @p@.

    braces          :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (\'\<\'
    -- and \'>\'), returning the value of @p@.

    angles          :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (\'[\'
    -- and \']\'), returning the value of @p@.

    brackets        :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | DEPRECATED: Use 'brackets'.

    squares         :: forall a. ParsecT s u m a -> ParsecT s u m a,

    -- | Lexeme parser @semi@ parses the character \';\' and skips any
    -- trailing white space. Returns the string \";\".

    semi            :: ParsecT s u m String,

    -- | Lexeme parser @comma@ parses the character \',\' and skips any
    -- trailing white space. Returns the string \",\".

    comma           :: ParsecT s u m String,

    -- | Lexeme parser @colon@ parses the character \':\' and skips any
    -- trailing white space. Returns the string \":\".

    colon           :: ParsecT s u m String,

    -- | Lexeme parser @dot@ parses the character \'.\' and skips any
    -- trailing white space. Returns the string \".\".

    dot             :: ParsecT s u m String,

    -- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
    -- separated by 'semi'. Returns a list of values returned by
    -- @p@.

    semiSep         :: forall a . ParsecT s u m a -> ParsecT s u m [a],

    -- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
    -- separated by 'semi'. Returns a list of values returned by @p@.

    semiSep1        :: forall a . ParsecT s u m a -> ParsecT s u m [a],

    -- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
    -- @p@ separated by 'comma'. Returns a list of values returned
    -- by @p@.

    commaSep        :: forall a . ParsecT s u m a -> ParsecT s u m [a],

    -- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
    -- @p@ separated by 'comma'. Returns a list of values returned
    -- by @p@.

    commaSep1       :: forall a . ParsecT s u m a -> ParsecT s u m [a]
    }

-----------------------------------------------------------
-- Given a LanguageDef, create a token parser.
-----------------------------------------------------------

-- | The expression @makeTokenParser language@ creates a 'GenTokenParser'
-- record that contains lexical parsers that are
-- defined using the definitions in the @language@ record.
--
-- The use of this function is quite stylized: one imports the
-- appropriate language definition and selects the lexical parsers that
-- are needed from the resulting 'GenTokenParser'.
--
-- > module Main where
-- >
-- > import Text.Parsec
-- > import qualified Text.Parsec.Token as P
-- > import Text.Parsec.Language (haskellDef)
-- >
-- > -- The parser
-- > ...
-- >
-- > expr  =   parens expr
-- >       <|> identifier
-- >       <|> ...
-- >
-- >
-- > -- The lexer
-- > lexer       = P.makeTokenParser haskellDef
-- >
-- > parens      = P.parens lexer
-- > braces      = P.braces lexer
-- > identifier  = P.identifier lexer
-- > reserved    = P.reserved lexer
-- > ...
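--
-- As an illustrative sketch (the @sumNats@ parser below is a made-up
-- example, not part of this module), the selected parsers combine like any
-- other Parsec parsers:
--
-- > natural     = P.natural lexer
-- >
-- > sumNats = do{ P.whiteSpace lexer
-- >             ; ns <- many natural
-- >             ; eof
-- >             ; return (sum ns)
-- >             }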

makeTokenParser :: (Stream s m Char)
                => GenLanguageDef s u m -> GenTokenParser s u m
makeTokenParser languageDef
    = TokenParser{ identifier = identifier
                 , reserved = reserved
                 , operator = operator
                 , reservedOp = reservedOp

                 , charLiteral = charLiteral
                 , stringLiteral = stringLiteral
                 , natural = natural
                 , integer = integer
                 , float = float
                 , naturalOrFloat = naturalOrFloat
                 , decimal = decimal
                 , hexadecimal = hexadecimal
                 , octal = octal

                 , symbol = symbol
                 , lexeme = lexeme
                 , whiteSpace = whiteSpace

                 , parens = parens
                 , braces = braces
                 , angles = angles
                 , brackets = brackets
                 , squares = brackets
                 , semi = semi
                 , comma = comma
                 , colon = colon
                 , dot = dot
                 , semiSep = semiSep
                 , semiSep1 = semiSep1
                 , commaSep = commaSep
                 , commaSep1 = commaSep1
                 }
    where

    -----------------------------------------------------------
    -- Bracketing
    -----------------------------------------------------------
    parens p        = between (symbol "(") (symbol ")") p
    braces p        = between (symbol "{") (symbol "}") p
    angles p        = between (symbol "<") (symbol ">") p
    brackets p      = between (symbol "[") (symbol "]") p

    semi            = symbol ";"
    comma           = symbol ","
    dot             = symbol "."
    colon           = symbol ":"

    commaSep p      = sepBy p comma
    semiSep p       = sepBy p semi

    commaSep1 p     = sepBy1 p comma
    semiSep1 p      = sepBy1 p semi


    -----------------------------------------------------------
    -- Chars & Strings
    -----------------------------------------------------------
    charLiteral     = lexeme (between (char '\'')
                                      (char '\'' <?> "end of character")
                                      characterChar )
                    <?> "character"

    characterChar   = charLetter <|> charEscape
                    <?> "literal character"

    charEscape      = do{ char '\\'; escapeCode }
    charLetter      = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))



    stringLiteral   = lexeme (
                      do{ str <- between (char '"')
                                         (char '"' <?> "end of string")
                                         (many stringChar)
                        ; return (foldr (maybe id (:)) "" str)
                        }
                      <?> "literal string")

    stringChar      =   do{ c <- stringLetter; return (Just c) }
                    <|> stringEscape
                    <?> "string character"

    stringLetter    = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))

    stringEscape    = do{ char '\\'
                        ;     do{ escapeGap  ; return Nothing }
                          <|> do{ escapeEmpty; return Nothing }
                          <|> do{ esc <- escapeCode; return (Just esc) }
                        }

    escapeEmpty     = char '&'
    escapeGap       = do{ many1 space
                        ; char '\\' <?> "end of string gap"
                        }

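    -- Note on gaps and the empty escape (Haskell report): a backslash
    -- followed by white space and a closing backslash is a gap and yields no
    -- characters, so "foo\   \bar" denotes "foobar"; the empty escape "\&"
    -- also yields nothing and is used to terminate an escape, e.g. "\SO\&H"
    -- is ['\SO','H'] rather than "\SOH".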


    -- escape codes
    escapeCode      = charEsc <|> charNum <|> charAscii <|> charControl
                    <?> "escape code"

    -- Per the Haskell report, \^A denotes control-A, i.e. chr 1; hence the
    -- "+ 1" when mapping the upper-case letter to its control character.
    charControl     = do{ char '^'
                        ; code <- upper
                        ; return (toEnum (fromEnum code - fromEnum 'A' + 1))
                        }

    charNum         = do{ code <- decimal
                                  <|> do{ char 'o'; number 8 octDigit }
                                  <|> do{ char 'x'; number 16 hexDigit }
                        ; return (toEnum (fromInteger code))
                        }

    charEsc         = choice (map parseEsc escMap)
                    where
                      parseEsc (c,code)     = do{ char c; return code }

    charAscii       = choice (map parseAscii asciiMap)
                    where
                      parseAscii (asc,code) = try (do{ string asc; return code })


    -- escape code tables
    escMap          = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
    asciiMap        = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)

    ascii2codes     = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
                       "FS","GS","RS","US","SP"]
    ascii3codes     = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
                       "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
                       "CAN","SUB","ESC","DEL"]

    ascii2          = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
                       '\EM','\FS','\GS','\RS','\US','\SP']
    ascii3          = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
                       '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
                       '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']


    -----------------------------------------------------------
    -- Numbers
    -----------------------------------------------------------
    naturalOrFloat  = lexeme (natFloat) <?> "number"

    float           = lexeme floating   <?> "float"
    integer         = lexeme int        <?> "integer"
    natural         = lexeme nat        <?> "natural"


    -- floats
    floating        = do{ n <- decimal
                        ; fractExponent n
                        }


    natFloat        = do{ char '0'
                        ; zeroNumFloat
                        }
                      <|> decimalFloat

    zeroNumFloat    =   do{ n <- hexadecimal <|> octal
                          ; return (Left n)
                          }
                    <|> decimalFloat
                    <|> fractFloat 0
                    <|> return (Left 0)

    decimalFloat    = do{ n <- decimal
                        ; option (Left n)
                                 (fractFloat n)
                        }

    fractFloat n    = do{ f <- fractExponent n
                        ; return (Right f)
                        }

    fractExponent n = do{ fract <- fraction
                        ; expo  <- option 1.0 exponent'
                        ; return ((fromInteger n + fract)*expo)
                        }
                    <|>
                      do{ expo <- exponent'
                        ; return ((fromInteger n)*expo)
                        }

    fraction        = do{ char '.'
                        ; digits <- many1 digit <?> "fraction"
                        ; return (foldr op 0.0 digits)
                        }
                      <?> "fraction"
                    where
                      op d f    = (f + fromIntegral (digitToInt d))/10.0

    exponent'       = do{ oneOf "eE"
                        ; f <- sign
                        ; e <- decimal <?> "exponent"
                        ; return (power (f e))
                        }
                      <?> "exponent"
                    where
                      power e  | e < 0      = 1.0/power(-e)
                               | otherwise  = fromInteger (10^e)
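
    -- For example, "2.5e-1" is parsed as (2 + 0.5) * 10^(-1) = 0.25: the
    -- integral part comes from decimal, the fractional digits are folded by
    -- fraction, and exponent' supplies the power of ten.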


    -- integers and naturals
    int             = do{ f <- lexeme sign
                        ; n <- nat
                        ; return (f n)
                        }

    sign            =   (char '-' >> return negate)
                    <|> (char '+' >> return id)
                    <|> return id

    nat             = zeroNumber <|> decimal

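    -- The empty (<?> "") label below suppresses zeroNumber's own expectation
    -- message, so error output mentions the hexadecimal, octal and decimal
    -- alternatives rather than a bare leading zero.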
    zeroNumber      = do{ char '0'
                        ; hexadecimal <|> octal <|> decimal <|> return 0
                        }
                      <?> ""

    decimal         = number 10 digit
    hexadecimal     = do{ oneOf "xX"; number 16 hexDigit }
    octal           = do{ oneOf "oO"; number 8 octDigit  }

    number base baseDigit
        = do{ digits <- many1 baseDigit
            ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
            ; seq n (return n)
            }

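    -- For example, number 16 hexDigit applied to "ff" folds left-to-right as
    -- (0*16 + 15)*16 + 15 = 255; the seq forces the accumulated value before
    -- it is returned.
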
    -----------------------------------------------------------
    -- Operators & reserved ops
    -----------------------------------------------------------
    reservedOp name =
        lexeme $ try $
        do{ string name
          ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
          }

    operator =
        lexeme $ try $
        do{ name <- oper
          ; if (isReservedOp name)
             then unexpected ("reserved operator " ++ show name)
             else return name
          }

    oper =
        do{ c <- (opStart languageDef)
          ; cs <- many (opLetter languageDef)
          ; return (c:cs)
          }
        <?> "operator"

    isReservedOp name =
        isReserved (sort (reservedOpNames languageDef)) name


    -----------------------------------------------------------
    -- Identifiers & Reserved words
    -----------------------------------------------------------
    reserved name =
        lexeme $ try $
        do{ caseString name
          ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
          }

    caseString name
        | caseSensitive languageDef  = string name
        | otherwise                  = do{ walk name; return name }
        where
          walk []     = return ()
          walk (c:cs) = do{ caseChar c <?> msg; walk cs }

          caseChar c  | isAlpha c  = char (toLower c) <|> char (toUpper c)
                      | otherwise  = char c

          msg         = show name

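    -- For example, with a case-insensitive language, caseString "div"
    -- accepts "div", "DIV", "Div", ... but always returns "div".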

    identifier =
        lexeme $ try $
        do{ name <- ident
          ; if (isReservedName name)
             then unexpected ("reserved word " ++ show name)
             else return name
          }


    ident
        = do{ c <- identStart languageDef
            ; cs <- many (identLetter languageDef)
            ; return (c:cs)
            }
        <?> "identifier"

    isReservedName name
        = isReserved theReservedNames caseName
        where
          caseName  | caseSensitive languageDef  = name
                    | otherwise                  = map toLower name


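    -- Linear scan over a sorted list of names: because the list is ordered,
    -- the scan can stop as soon as it passes the place where the name would
    -- occur.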
    isReserved names name
        = scan names
        where
          scan []     = False
          scan (r:rs) = case (compare r name) of
                          LT  -> scan rs
                          EQ  -> True
                          GT  -> False

    theReservedNames
        | caseSensitive languageDef  = sort reserved
        | otherwise                  = sort . map (map toLower) $ reserved
        where
          reserved = reservedNames languageDef



    -----------------------------------------------------------
    -- White space & symbols
    -----------------------------------------------------------
    symbol name
        = lexeme (string name)

    lexeme p
        = do{ x <- p; whiteSpace; return x }


    --whiteSpace
    whiteSpace
        | noLine && noMulti  = skipMany (simpleSpace <?> "")
        | noLine             = skipMany (simpleSpace <|> multiLineComment <?> "")
        | noMulti            = skipMany (simpleSpace <|> oneLineComment <?> "")
        | otherwise          = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
        where
          noLine  = null (commentLine languageDef)
          noMulti = null (commentStart languageDef)


    simpleSpace =
        skipMany1 (satisfy isSpace)

    oneLineComment =
        do{ try (string (commentLine languageDef))
          ; skipMany (satisfy (/= '\n'))
          ; return ()
          }

    multiLineComment =
        do { try (string (commentStart languageDef))
           ; inComment
           }

    inComment
        | nestedComments languageDef  = inCommentMulti
        | otherwise                   = inCommentSingle

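    -- Consume the body of a (possibly nested) block comment.  Runs of
    -- characters that cannot start a comment delimiter (see startEnd) are
    -- skipped in one go; a delimiter character that does not complete a full
    -- delimiter is consumed on its own.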
    inCommentMulti
        =   do{ try (string (commentEnd languageDef)) ; return () }
        <|> do{ multiLineComment             ; inCommentMulti }
        <|> do{ skipMany1 (noneOf startEnd)  ; inCommentMulti }
        <|> do{ oneOf startEnd               ; inCommentMulti }
        <?> "end of comment"
        where
          startEnd = nub (commentEnd languageDef ++ commentStart languageDef)

    inCommentSingle
        =   do{ try (string (commentEnd languageDef)); return () }
        <|> do{ skipMany1 (noneOf startEnd)  ; inCommentSingle }
        <|> do{ oneOf startEnd               ; inCommentSingle }
        <?> "end of comment"
        where
          startEnd = nub (commentEnd languageDef ++ commentStart languageDef)