e4e7fbfcb8fc1e3a61f3ec2a5ea05fe3286e9179
[ghc.git] / libraries / base / Data / Char.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Data.Char
7 -- Copyright : (c) The University of Glasgow 2001
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : stable
12 -- Portability : portable
13 --
14 -- The Char type and associated operations.
15 --
16 -----------------------------------------------------------------------------
17
18 module Data.Char
19 (
20 Char
21
22 -- * Character classification
23 -- | Unicode characters are divided into letters, numbers, marks,
24 -- punctuation, symbols, separators (including spaces) and others
25 -- (including control characters).
26 , isControl, isSpace
27 , isLower, isUpper, isAlpha, isAlphaNum, isPrint
28 , isDigit, isOctDigit, isHexDigit
29 , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
30
31 -- ** Subranges
32 , isAscii, isLatin1
33 , isAsciiUpper, isAsciiLower
34
35 -- ** Unicode general categories
36 , GeneralCategory(..), generalCategory
37
38 -- * Case conversion
39 , toUpper, toLower, toTitle
40
41 -- * Single digit characters
42 , digitToInt
43 , intToDigit
44
45 -- * Numeric representations
46 , ord
47 , chr
48
49 -- * String representations
50 , showLitChar
51 , lexLitChar
52 , readLitChar
53 ) where
54
55 import GHC.Base
56 import GHC.Arr (Ix)
57 import GHC.Char
58 import GHC.Real (fromIntegral)
59 import GHC.Show
60 import GHC.Read (Read, readLitChar, lexLitChar)
61 import GHC.Unicode
62 import GHC.Num
63 import GHC.Enum
64
65 -- $setup
66 -- Allow the use of Prelude in doctests.
67 -- >>> import Prelude
68
-- | Map a single hexadecimal digit 'Char' to its numeric value as an
-- 'Int'. Decimal digits @\'0\'@..@\'9\'@ yield @0..9@; the letters
-- @\'a\'@..@\'f\'@ and @\'A\'@..@\'F\'@ yield @10..15@ regardless of
-- case. Any other character (that is, one that does not satisfy
-- 'isHexDigit') makes the function fail with a call to 'error'.
--
-- ==== __Examples__
--
-- Decimal digits map to @0..9@:
--
-- >>> map digitToInt ['0'..'9']
-- [0,1,2,3,4,5,6,7,8,9]
--
-- Hexadecimal letters map to @10..15@ in either case:
--
-- >>> map digitToInt ['a'..'f']
-- [10,11,12,13,14,15]
-- >>> map digitToInt ['A'..'F']
-- [10,11,12,13,14,15]
--
-- Every other character raises an exception:
--
-- >>> digitToInt 'G'
-- *** Exception: Char.digitToInt: not a digit 'G'
-- >>> digitToInt '♥'
-- *** Exception: Char.digitToInt: not a digit '\9829'
--
digitToInt :: Char -> Int
digitToInt c
  | fromDigit `below` 10 = fromDigit
  | fromLower `below` 6  = fromLower + 10
  | fromUpper `below` 6  = fromUpper + 10
  | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
  where
    code      = ord c
    -- Offsets of the character from the start of each accepted range.
    fromDigit = code - ord '0'
    fromLower = code - ord 'a'
    fromUpper = code - ord 'A'

    -- Range test done with a single unsigned comparison: a negative
    -- offset wraps around to a very large 'Word', so it fails the
    -- upper-bound check as well.
    below :: Int -> Word -> Bool
    offset `below` limit = (fromIntegral offset :: Word) < limit
107
-- | The Unicode General Categories (column 2 of the UnicodeData
-- table). The constructors appear in the same order as the categories
-- are listed in the Unicode standard (specifically, the Unicode
-- Character Database); the derived 'Ord', 'Enum', 'Bounded' and 'Ix'
-- instances follow that declaration order.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
  -- | Lu: Letter, Uppercase
  = UppercaseLetter
  -- | Ll: Letter, Lowercase
  | LowercaseLetter
  -- | Lt: Letter, Titlecase
  | TitlecaseLetter
  -- | Lm: Letter, Modifier
  | ModifierLetter
  -- | Lo: Letter, Other
  | OtherLetter
  -- | Mn: Mark, Non-Spacing
  | NonSpacingMark
  -- | Mc: Mark, Spacing Combining
  | SpacingCombiningMark
  -- | Me: Mark, Enclosing
  | EnclosingMark
  -- | Nd: Number, Decimal
  | DecimalNumber
  -- | Nl: Number, Letter
  | LetterNumber
  -- | No: Number, Other
  | OtherNumber
  -- | Pc: Punctuation, Connector
  | ConnectorPunctuation
  -- | Pd: Punctuation, Dash
  | DashPunctuation
  -- | Ps: Punctuation, Open
  | OpenPunctuation
  -- | Pe: Punctuation, Close
  | ClosePunctuation
  -- | Pi: Punctuation, Initial quote
  | InitialQuote
  -- | Pf: Punctuation, Final quote
  | FinalQuote
  -- | Po: Punctuation, Other
  | OtherPunctuation
  -- | Sm: Symbol, Math
  | MathSymbol
  -- | Sc: Symbol, Currency
  | CurrencySymbol
  -- | Sk: Symbol, Modifier
  | ModifierSymbol
  -- | So: Symbol, Other
  | OtherSymbol
  -- | Zs: Separator, Space
  | Space
  -- | Zl: Separator, Line
  | LineSeparator
  -- | Zp: Separator, Paragraph
  | ParagraphSeparator
  -- | Cc: Other, Control
  | Control
  -- | Cf: Other, Format
  | Format
  -- | Cs: Other, Surrogate
  | Surrogate
  -- | Co: Other, Private Use
  | PrivateUse
  -- | Cn: Other, Not Assigned
  | NotAssigned
  deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix)
195
-- | Look up the Unicode general category of a character.
--
-- The character's code point is handed to @wgencat@ and the number
-- that comes back is turned into a constructor with 'toEnum'. The
-- result is therefore only right as long as the 'Enum' instance of
-- 'GeneralCategory' keeps its constructors in the order in which the
-- categories are presented in the Unicode standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
generalCategory :: Char -> GeneralCategory
generalCategory c = toEnum categoryIndex
  where
    -- Code point of the character, in whatever integral type
    -- 'wgencat' takes as its argument.
    codePoint     = fromIntegral (ord c)
    -- Numeric category, converted to the 'Int' that 'toEnum' expects.
    categoryIndex = fromIntegral (wgencat codePoint)
222
223 -- derived character classifiers
224
-- | Selects alphabetic Unicode characters: lower-case, upper-case and
-- title-case letters, as well as letters of caseless scripts and
-- modifier letters. This function is equivalent to
-- 'Data.Char.isAlpha'.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'UppercaseLetter'
-- * 'LowercaseLetter'
-- * 'TitlecaseLetter'
-- * 'ModifierLetter'
-- * 'OtherLetter'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Letter\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isLetter 'a'
-- True
-- >>> isLetter 'A'
-- True
-- >>> isLetter '0'
-- False
-- >>> isLetter '%'
-- False
-- >>> isLetter '♥'
-- False
-- >>> isLetter '\31'
-- False
--
-- Ensure that 'isLetter' and 'isAlpha' are equivalent.
--
-- >>> let chars = [(chr 0)..]
-- >>> let letters = map isLetter chars
-- >>> let alphas = map isAlpha chars
-- >>> letters == alphas
-- True
--
isLetter :: Char -> Bool
isLetter c = letterCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    letterCategory :: GeneralCategory -> Bool
    letterCategory UppercaseLetter = True
    letterCategory LowercaseLetter = True
    letterCategory TitlecaseLetter = True
    letterCategory ModifierLetter  = True
    letterCategory OtherLetter     = True
    letterCategory _               = False
277
-- | Selects Unicode mark characters, such as accents and similar
-- signs that combine with the character preceding them.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'NonSpacingMark'
-- * 'SpacingCombiningMark'
-- * 'EnclosingMark'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Mark\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isMark 'a'
-- False
-- >>> isMark '0'
-- False
--
-- Combining marks such as accent characters usually need to follow
-- another character before they become printable. Here the string is
-- the letter @o@ followed by the combining grave accent (code point
-- 768), written with an escape so that the two characters cannot be
-- confused with a single precomposed one:
--
-- >>> map isMark "o\768"
-- [False,True]
--
-- Puns are not necessarily supported:
--
-- >>> isMark '✓'
-- False
--
isMark :: Char -> Bool
isMark c = markCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    markCategory :: GeneralCategory -> Bool
    markCategory NonSpacingMark       = True
    markCategory SpacingCombiningMark = True
    markCategory EnclosingMark        = True
    markCategory _                    = False
319
-- | Selects Unicode numeric characters, which covers digits from
-- various scripts, Roman numerals, et cetera.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'DecimalNumber'
-- * 'LetterNumber'
-- * 'OtherNumber'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Number\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isNumber 'a'
-- False
-- >>> isNumber '%'
-- False
-- >>> isNumber '3'
-- True
--
-- ASCII @\'0\'@ through @\'9\'@ are all numbers:
--
-- >>> and $ map isNumber ['0'..'9']
-- True
--
-- Unicode Roman numerals are \"numbers\" as well:
--
-- >>> isNumber 'Ⅸ'
-- True
--
isNumber :: Char -> Bool
isNumber c = numberCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    numberCategory :: GeneralCategory -> Bool
    numberCategory DecimalNumber = True
    numberCategory LetterNumber  = True
    numberCategory OtherNumber   = True
    numberCategory _             = False
362
-- | Selects Unicode punctuation characters, which covers the various
-- kinds of connectors, brackets and quotes.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = punctuationCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    punctuationCategory :: GeneralCategory -> Bool
    punctuationCategory ConnectorPunctuation = True
    punctuationCategory DashPunctuation      = True
    punctuationCategory OpenPunctuation      = True
    punctuationCategory ClosePunctuation     = True
    punctuationCategory InitialQuote         = True
    punctuationCategory FinalQuote           = True
    punctuationCategory OtherPunctuation     = True
    punctuationCategory _                    = False
409
-- | Selects Unicode symbol characters, which covers mathematical and
-- currency symbols among others.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = symbolCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    symbolCategory :: GeneralCategory -> Bool
    symbolCategory MathSymbol     = True
    symbolCategory CurrencySymbol = True
    symbolCategory ModifierSymbol = True
    symbolCategory OtherSymbol    = True
    symbolCategory _              = False
452
-- | Selects Unicode space and separator characters.
--
-- The result is 'True' exactly when the argument's 'GeneralCategory'
-- is one of:
--
-- * 'Space'
-- * 'LineSeparator'
-- * 'ParagraphSeparator'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Separator\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSeparator 'a'
-- False
-- >>> isSeparator '6'
-- False
-- >>> isSeparator ' '
-- True
--
-- Warning: newlines and tab characters are not considered
-- separators.
--
-- >>> isSeparator '\n'
-- False
-- >>> isSeparator '\t'
-- False
--
-- But some more exotic characters are (like HTML's @&nbsp;@):
--
-- >>> isSeparator '\160'
-- True
--
isSeparator :: Char -> Bool
isSeparator c = separatorCategory (generalCategory c)
  where
    -- One equation per accepted category; everything else is rejected.
    separatorCategory :: GeneralCategory -> Bool
    separatorCategory Space              = True
    separatorCategory LineSeparator      = True
    separatorCategory ParagraphSeparator = True
    separatorCategory _                  = False
497