Move GeneralCategory et al to GHC.Unicode
[ghc.git] / libraries / base / Data / Char.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Data.Char
7 -- Copyright : (c) The University of Glasgow 2001
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : stable
12 -- Portability : portable
13 --
14 -- The Char type and associated operations.
15 --
16 -----------------------------------------------------------------------------
17
18 module Data.Char
19 (
20 Char
21
22 -- * Character classification
23 -- | Unicode characters are divided into letters, numbers, marks,
24 -- punctuation, symbols, separators (including spaces) and others
25 -- (including control characters).
26 , isControl, isSpace
27 , isLower, isUpper, isAlpha, isAlphaNum, isPrint
28 , isDigit, isOctDigit, isHexDigit
29 , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
30
31 -- ** Subranges
32 , isAscii, isLatin1
33 , isAsciiUpper, isAsciiLower
34
35 -- ** Unicode general categories
36 , GeneralCategory(..), generalCategory
37
38 -- * Case conversion
39 , toUpper, toLower, toTitle
40
41 -- * Single digit characters
42 , digitToInt
43 , intToDigit
44
45 -- * Numeric representations
46 , ord
47 , chr
48
49 -- * String representations
50 , showLitChar
51 , lexLitChar
52 , readLitChar
53 ) where
54
55 import GHC.Base
56 import GHC.Char
57 import GHC.Real (fromIntegral)
58 import GHC.Show
59 import GHC.Read (readLitChar, lexLitChar)
60 import GHC.Unicode
61 import GHC.Num
62
63 -- $setup
64 -- Allow the use of Prelude in doctests.
65 -- >>> import Prelude
66
-- | Turn one hexadecimal digit 'Char' into the 'Int' it denotes.
--
-- The accepted characters are exactly @\'0\'@..@\'9\'@,
-- @\'a\'@..@\'f\'@ and @\'A\'@..@\'F\'@ (upper- and lower-case
-- hexadecimal letters are treated alike); in other words, the
-- argument must satisfy 'isHexDigit'. Any other character makes the
-- function call 'error'.
--
-- ==== __Examples__
--
-- The decimal digits @\'0\'@ through @\'9\'@ map to @0..9@:
--
-- >>> map digitToInt ['0'..'9']
-- [0,1,2,3,4,5,6,7,8,9]
--
-- The letters @\'A\'@ through @\'F\'@, in either case, map to
-- @10..15@:
--
-- >>> map digitToInt ['a'..'f']
-- [10,11,12,13,14,15]
-- >>> map digitToInt ['A'..'F']
-- [10,11,12,13,14,15]
--
-- Every other character results in an exception:
--
-- >>> digitToInt 'G'
-- *** Exception: Char.digitToInt: not a digit 'G'
-- >>> digitToInt '♥'
-- *** Exception: Char.digitToInt: not a digit '\9829'
--
digitToInt :: Char -> Int
digitToInt c
  | decOff `fitsIn` 9 = decOff
  | lowOff `fitsIn` 5 = lowOff + 10
  | uppOff `fitsIn` 5 = uppOff + 10
  | otherwise         = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
  where
    code   = ord c
    decOff = code - ord '0'   -- distance from '0'
    lowOff = code - ord 'a'   -- distance from 'a'
    uppOff = code - ord 'A'   -- distance from 'A'

    -- The offset is compared as a 'Word' so that a single unsigned
    -- comparison rejects negative offsets as well as offsets above
    -- the limit.
    fitsIn :: Int -> Word -> Bool
    off `fitsIn` limit = fromIntegral off <= limit
105
106 -- derived character classifiers
107
-- | Selects alphabetic Unicode characters: lower-case, upper-case and
-- title-case letters, together with letters of caseless scripts and
-- modifier letters. This function is equivalent to
-- 'Data.Char.isAlpha'.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the
-- argument is one of the following, and 'False' otherwise:
--
-- * 'UppercaseLetter'
-- * 'LowercaseLetter'
-- * 'TitlecaseLetter'
-- * 'ModifierLetter'
-- * 'OtherLetter'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Letter\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isLetter 'a'
-- True
-- >>> isLetter 'A'
-- True
-- >>> isLetter '0'
-- False
-- >>> isLetter '%'
-- False
-- >>> isLetter '♥'
-- False
-- >>> isLetter '\31'
-- False
--
-- Ensure that 'isLetter' and 'isAlpha' are equivalent.
--
-- >>> let chars = [(chr 0)..]
-- >>> let letters = map isLetter chars
-- >>> let alphas = map isAlpha chars
-- >>> letters == alphas
-- True
--
isLetter :: Char -> Bool
isLetter c = letterCategory (generalCategory c)
  where
    -- One equation per general category that counts as a letter.
    letterCategory UppercaseLetter = True
    letterCategory LowercaseLetter = True
    letterCategory TitlecaseLetter = True
    letterCategory ModifierLetter  = True
    letterCategory OtherLetter     = True
    letterCategory _               = False
160
-- | Selects Unicode mark characters, such as accents and similar
-- signs that combine with the character preceding them.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the
-- argument is one of the following, and 'False' otherwise:
--
-- * 'NonSpacingMark'
-- * 'SpacingCombiningMark'
-- * 'EnclosingMark'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Mark\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isMark 'a'
-- False
-- >>> isMark '0'
-- False
--
-- Combining marks such as accent characters usually need to follow
-- another character before they become printable:
--
-- >>> map isMark "ò"
-- [False,True]
--
-- Puns are not necessarily supported:
--
-- >>> isMark '✓'
-- False
--
isMark :: Char -> Bool
isMark c = markCategory (generalCategory c)
  where
    -- One equation per general category that counts as a mark.
    markCategory NonSpacingMark       = True
    markCategory SpacingCombiningMark = True
    markCategory EnclosingMark        = True
    markCategory _                    = False
202
-- | Selects Unicode numeric characters. Besides the ASCII digits this
-- covers digits from various scripts, Roman numerals, et cetera.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the
-- argument is one of the following, and 'False' otherwise:
--
-- * 'DecimalNumber'
-- * 'LetterNumber'
-- * 'OtherNumber'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Number\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isNumber 'a'
-- False
-- >>> isNumber '%'
-- False
-- >>> isNumber '3'
-- True
--
-- ASCII @\'0\'@ through @\'9\'@ are all numbers:
--
-- >>> and $ map isNumber ['0'..'9']
-- True
--
-- Unicode Roman numerals are \"numbers\" as well:
--
-- >>> isNumber 'Ⅸ'
-- True
--
isNumber :: Char -> Bool
isNumber c = numberCategory (generalCategory c)
  where
    -- One equation per general category that counts as a number.
    numberCategory DecimalNumber = True
    numberCategory LetterNumber  = True
    numberCategory OtherNumber   = True
    numberCategory _             = False
245
-- | Selects Unicode space and separator characters.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the
-- argument is one of the following, and 'False' otherwise:
--
-- * 'Space'
-- * 'LineSeparator'
-- * 'ParagraphSeparator'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Separator\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSeparator 'a'
-- False
-- >>> isSeparator '6'
-- False
-- >>> isSeparator ' '
-- True
--
-- Warning: newlines and tab characters are not considered
-- separators.
--
-- >>> isSeparator '\n'
-- False
-- >>> isSeparator '\t'
-- False
--
-- But some more exotic characters are (like HTML's @&nbsp;@):
--
-- >>> isSeparator '\160'
-- True
--
isSeparator :: Char -> Bool
isSeparator c = separatorCategory (generalCategory c)
  where
    -- One equation per general category that counts as a separator.
    separatorCategory Space              = True
    separatorCategory LineSeparator      = True
    separatorCategory ParagraphSeparator = True
    separatorCategory _                  = False
290