parser: Allow Lm (MODIFIER LETTER) category in identifiers
[ghc.git] / compiler / parser / Ctype.hs
1 -- Character classification
2 {-# LANGUAGE CPP #-}
3 module Ctype
4 ( is_ident -- Char# -> Bool
5 , is_symbol -- Char# -> Bool
6 , is_any -- Char# -> Bool
7 , is_space -- Char# -> Bool
8 , is_lower -- Char# -> Bool
9 , is_upper -- Char# -> Bool
10 , is_digit -- Char# -> Bool
11 , is_alphanum -- Char# -> Bool
12
13 , is_decdigit, is_hexdigit, is_octdigit, is_bindigit
14 , hexDigit, octDecDigit
15 ) where
16
17 #include "HsVersions.h"
18
19 import Data.Int ( Int32 )
20 import Data.Bits ( Bits((.&.)) )
21 import Data.Char ( ord, chr )
22 import Panic
23
24 -- Bit masks
25
26 cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int
27 cIdent = 1
28 cSymbol = 2
29 cAny = 4
30 cSpace = 8
31 cLower = 16
32 cUpper = 32
33 cDigit = 64
34
35 -- | The predicates below look costly, but aren't, GHC+GCC do a great job
36 -- at the big case below.
37
38 {-# INLINE is_ctype #-}
39 is_ctype :: Int -> Char -> Bool
40 is_ctype mask c = (fromIntegral (charType c) .&. fromIntegral mask) /= (0::Int32)
41
42 is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
43 is_alphanum :: Char -> Bool
44 is_ident = is_ctype cIdent
45 is_symbol = is_ctype cSymbol
46 is_any = is_ctype cAny
47 is_space = is_ctype cSpace
48 is_lower = is_ctype cLower
49 is_upper = is_ctype cUpper
50 is_digit = is_ctype cDigit
51 is_alphanum = is_ctype (cLower+cUpper+cDigit)
52
53 -- Utils
54
55 hexDigit :: Char -> Int
56 hexDigit c | is_decdigit c = ord c - ord '0'
57 | otherwise = ord (to_lower c) - ord 'a' + 10
58
59 octDecDigit :: Char -> Int
60 octDecDigit c = ord c - ord '0'
61
62 is_decdigit :: Char -> Bool
63 is_decdigit c
64 = c >= '0' && c <= '9'
65
66 is_hexdigit :: Char -> Bool
67 is_hexdigit c
68 = is_decdigit c
69 || (c >= 'a' && c <= 'f')
70 || (c >= 'A' && c <= 'F')
71
72 is_octdigit :: Char -> Bool
73 is_octdigit c = c >= '0' && c <= '7'
74
75 is_bindigit :: Char -> Bool
76 is_bindigit c = c == '0' || c == '1'
77
78 to_lower :: Char -> Char
79 to_lower c
80 | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
81 | otherwise = c
82
83 -- | We really mean .|. instead of + below, but GHC currently doesn't do
84 -- any constant folding with bitops. *sigh*
85
86 charType :: Char -> Int
87 charType c = case c of
88 '\0' -> 0 -- \000
89 '\1' -> 0 -- \001
90 '\2' -> 0 -- \002
91 '\3' -> 0 -- \003
92 '\4' -> 0 -- \004
93 '\5' -> 0 -- \005
94 '\6' -> 0 -- \006
95 '\7' -> 0 -- \007
96 '\8' -> 0 -- \010
97 '\9' -> cSpace -- \t (not allowed in strings, so !cAny)
98 '\10' -> cSpace -- \n (ditto)
99 '\11' -> cSpace -- \v (ditto)
100 '\12' -> cSpace -- \f (ditto)
101 '\13' -> cSpace -- ^M (ditto)
102 '\14' -> 0 -- \016
103 '\15' -> 0 -- \017
104 '\16' -> 0 -- \020
105 '\17' -> 0 -- \021
106 '\18' -> 0 -- \022
107 '\19' -> 0 -- \023
108 '\20' -> 0 -- \024
109 '\21' -> 0 -- \025
110 '\22' -> 0 -- \026
111 '\23' -> 0 -- \027
112 '\24' -> 0 -- \030
113 '\25' -> 0 -- \031
114 '\26' -> 0 -- \032
115 '\27' -> 0 -- \033
116 '\28' -> 0 -- \034
117 '\29' -> 0 -- \035
118 '\30' -> 0 -- \036
119 '\31' -> 0 -- \037
120 '\32' -> cAny + cSpace --
121 '\33' -> cAny + cSymbol -- !
122 '\34' -> cAny -- "
123 '\35' -> cAny + cSymbol -- #
124 '\36' -> cAny + cSymbol -- $
125 '\37' -> cAny + cSymbol -- %
126 '\38' -> cAny + cSymbol -- &
127 '\39' -> cAny + cIdent -- '
128 '\40' -> cAny -- (
129 '\41' -> cAny -- )
130 '\42' -> cAny + cSymbol -- *
131 '\43' -> cAny + cSymbol -- +
132 '\44' -> cAny -- ,
133 '\45' -> cAny + cSymbol -- -
134 '\46' -> cAny + cSymbol -- .
135 '\47' -> cAny + cSymbol -- /
136 '\48' -> cAny + cIdent + cDigit -- 0
137 '\49' -> cAny + cIdent + cDigit -- 1
138 '\50' -> cAny + cIdent + cDigit -- 2
139 '\51' -> cAny + cIdent + cDigit -- 3
140 '\52' -> cAny + cIdent + cDigit -- 4
141 '\53' -> cAny + cIdent + cDigit -- 5
142 '\54' -> cAny + cIdent + cDigit -- 6
143 '\55' -> cAny + cIdent + cDigit -- 7
144 '\56' -> cAny + cIdent + cDigit -- 8
145 '\57' -> cAny + cIdent + cDigit -- 9
146 '\58' -> cAny + cSymbol -- :
147 '\59' -> cAny -- ;
148 '\60' -> cAny + cSymbol -- <
149 '\61' -> cAny + cSymbol -- =
150 '\62' -> cAny + cSymbol -- >
151 '\63' -> cAny + cSymbol -- ?
152 '\64' -> cAny + cSymbol -- @
153 '\65' -> cAny + cIdent + cUpper -- A
154 '\66' -> cAny + cIdent + cUpper -- B
155 '\67' -> cAny + cIdent + cUpper -- C
156 '\68' -> cAny + cIdent + cUpper -- D
157 '\69' -> cAny + cIdent + cUpper -- E
158 '\70' -> cAny + cIdent + cUpper -- F
159 '\71' -> cAny + cIdent + cUpper -- G
160 '\72' -> cAny + cIdent + cUpper -- H
161 '\73' -> cAny + cIdent + cUpper -- I
162 '\74' -> cAny + cIdent + cUpper -- J
163 '\75' -> cAny + cIdent + cUpper -- K
164 '\76' -> cAny + cIdent + cUpper -- L
165 '\77' -> cAny + cIdent + cUpper -- M
166 '\78' -> cAny + cIdent + cUpper -- N
167 '\79' -> cAny + cIdent + cUpper -- O
168 '\80' -> cAny + cIdent + cUpper -- P
169 '\81' -> cAny + cIdent + cUpper -- Q
170 '\82' -> cAny + cIdent + cUpper -- R
171 '\83' -> cAny + cIdent + cUpper -- S
172 '\84' -> cAny + cIdent + cUpper -- T
173 '\85' -> cAny + cIdent + cUpper -- U
174 '\86' -> cAny + cIdent + cUpper -- V
175 '\87' -> cAny + cIdent + cUpper -- W
176 '\88' -> cAny + cIdent + cUpper -- X
177 '\89' -> cAny + cIdent + cUpper -- Y
178 '\90' -> cAny + cIdent + cUpper -- Z
179 '\91' -> cAny -- [
180 '\92' -> cAny + cSymbol -- backslash
181 '\93' -> cAny -- ]
182 '\94' -> cAny + cSymbol -- ^
183 '\95' -> cAny + cIdent + cLower -- _
184 '\96' -> cAny -- `
185 '\97' -> cAny + cIdent + cLower -- a
186 '\98' -> cAny + cIdent + cLower -- b
187 '\99' -> cAny + cIdent + cLower -- c
188 '\100' -> cAny + cIdent + cLower -- d
189 '\101' -> cAny + cIdent + cLower -- e
190 '\102' -> cAny + cIdent + cLower -- f
191 '\103' -> cAny + cIdent + cLower -- g
192 '\104' -> cAny + cIdent + cLower -- h
193 '\105' -> cAny + cIdent + cLower -- i
194 '\106' -> cAny + cIdent + cLower -- j
195 '\107' -> cAny + cIdent + cLower -- k
196 '\108' -> cAny + cIdent + cLower -- l
197 '\109' -> cAny + cIdent + cLower -- m
198 '\110' -> cAny + cIdent + cLower -- n
199 '\111' -> cAny + cIdent + cLower -- o
200 '\112' -> cAny + cIdent + cLower -- p
201 '\113' -> cAny + cIdent + cLower -- q
202 '\114' -> cAny + cIdent + cLower -- r
203 '\115' -> cAny + cIdent + cLower -- s
204 '\116' -> cAny + cIdent + cLower -- t
205 '\117' -> cAny + cIdent + cLower -- u
206 '\118' -> cAny + cIdent + cLower -- v
207 '\119' -> cAny + cIdent + cLower -- w
208 '\120' -> cAny + cIdent + cLower -- x
209 '\121' -> cAny + cIdent + cLower -- y
210 '\122' -> cAny + cIdent + cLower -- z
211 '\123' -> cAny -- {
212 '\124' -> cAny + cSymbol -- |
213 '\125' -> cAny -- }
214 '\126' -> cAny + cSymbol -- ~
215 '\127' -> 0 -- \177
216 _ -> panic ("charType: " ++ show c)