abf4064100e3d1215207c902b9badaa07dfc2f06
[packages/base.git] / Data / Char.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3
4 -----------------------------------------------------------------------------
5 -- |
6 -- Module : Data.Char
7 -- Copyright : (c) The University of Glasgow 2001
8 -- License : BSD-style (see the file libraries/base/LICENSE)
9 --
10 -- Maintainer : libraries@haskell.org
11 -- Stability : stable
12 -- Portability : portable
13 --
14 -- The Char type and associated operations.
15 --
16 -----------------------------------------------------------------------------
17
18 module Data.Char
19 (
20 Char
21
22 -- * Character classification
23 -- | Unicode characters are divided into letters, numbers, marks,
24 -- punctuation, symbols, separators (including spaces) and others
25 -- (including control characters).
26 , isControl, isSpace
27 , isLower, isUpper, isAlpha, isAlphaNum, isPrint
28 , isDigit, isOctDigit, isHexDigit
29 , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
30
31 -- ** Subranges
32 , isAscii, isLatin1
33 , isAsciiUpper, isAsciiLower
34
35 -- ** Unicode general categories
36 , GeneralCategory(..), generalCategory
37
38 -- * Case conversion
39 , toUpper, toLower, toTitle
40
41 -- * Single digit characters
42 , digitToInt
43 , intToDigit
44
45 -- * Numeric representations
46 , ord
47 , chr
48
49 -- * String representations
50 , showLitChar
51 , lexLitChar
52 , readLitChar
53 ) where
54
55 #ifdef __GLASGOW_HASKELL__
56 import GHC.Base
57 import GHC.Arr (Ix)
58 import GHC.Char
59 import GHC.Real (fromIntegral)
60 import GHC.Show
61 import GHC.Read (Read, readLitChar, lexLitChar)
62 import GHC.Unicode
63 import GHC.Num
64 import GHC.Enum
65 #endif
66
67 #ifdef __HUGS__
68 import Hugs.Prelude (Ix)
69 import Hugs.Char
70 #endif
71
-- | Convert a single hexadecimal digit 'Char' to the 'Int' it denotes.
-- Characters accepted by 'isDigit' yield their offset from @\'0\'@, while
-- the letters @\'a\'@..@\'f\'@ and @\'A\'@..@\'F\'@ (either case is
-- recognised) yield the values 10 to 15.
-- Every other argument makes the function fail by calling 'error'.
digitToInt :: Char -> Int
digitToInt c
  | isDigit c            = code - ord '0'
  | 'a' <= c && c <= 'f' = code - ord 'a' + 10
  | 'A' <= c && c <= 'F' = code - ord 'A' + 10
  | otherwise            = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
  where
    -- Code point of the argument, shared by all three successful branches.
    code = ord c
82
83 #ifndef __GLASGOW_HASKELL__
-- | 'isAsciiUpper' selects the characters @\'A\'@..@\'Z\'@ and
-- 'isAsciiLower' selects the characters @\'a\'@..@\'z\'@.
-- These fallback definitions are only compiled when
-- @__GLASGOW_HASKELL__@ is not defined.
isAsciiUpper, isAsciiLower :: Char -> Bool
isAsciiUpper ch = 'A' <= ch && ch <= 'Z'
isAsciiLower ch = 'a' <= ch && ch <= 'z'
87 #endif
88
-- | Unicode General Categories (column 2 of the UnicodeData table),
-- declared in the order in which the Unicode standard lists them.
--
-- The derived 'Enum', 'Ord' and 'Bounded' instances follow this
-- declaration order, and 'generalCategory' builds its result with
-- 'toEnum', so the constructors must not be reordered.
data GeneralCategory
  = UppercaseLetter       -- ^ Lu: Letter, Uppercase
  | LowercaseLetter       -- ^ Ll: Letter, Lowercase
  | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
  | ModifierLetter        -- ^ Lm: Letter, Modifier
  | OtherLetter           -- ^ Lo: Letter, Other
  | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
  | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
  | EnclosingMark         -- ^ Me: Mark, Enclosing
  | DecimalNumber         -- ^ Nd: Number, Decimal
  | LetterNumber          -- ^ Nl: Number, Letter
  | OtherNumber           -- ^ No: Number, Other
  | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
  | DashPunctuation       -- ^ Pd: Punctuation, Dash
  | OpenPunctuation       -- ^ Ps: Punctuation, Open
  | ClosePunctuation      -- ^ Pe: Punctuation, Close
  | InitialQuote          -- ^ Pi: Punctuation, Initial quote
  | FinalQuote            -- ^ Pf: Punctuation, Final quote
  | OtherPunctuation      -- ^ Po: Punctuation, Other
  | MathSymbol            -- ^ Sm: Symbol, Math
  | CurrencySymbol        -- ^ Sc: Symbol, Currency
  | ModifierSymbol        -- ^ Sk: Symbol, Modifier
  | OtherSymbol           -- ^ So: Symbol, Other
  | Space                 -- ^ Zs: Separator, Space
  | LineSeparator         -- ^ Zl: Separator, Line
  | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
  | Control               -- ^ Cc: Other, Control
  | Format                -- ^ Cf: Other, Format
  | Surrogate             -- ^ Cs: Other, Surrogate
  | PrivateUse            -- ^ Co: Other, Private Use
  | NotAssigned           -- ^ Cn: Other, Not Assigned
  deriving (Show, Read, Eq, Ord, Enum, Bounded, Ix)
124
-- | The Unicode general category of the character.
--
-- The implementation-specific primitive supplies a numeric category code,
-- which 'toEnum' maps onto the matching 'GeneralCategory' constructor.
generalCategory :: Char -> GeneralCategory
#ifdef __GLASGOW_HASKELL__
generalCategory = toEnum . fromIntegral . wgencat . fromIntegral . ord
#endif
#ifdef __HUGS__
generalCategory c = toEnum (primUniGenCat c)
#endif
133
134 -- derived character classifiers
135
-- | Selects alphabetic Unicode characters (lower-case, upper-case and
-- title-case letters, plus letters of caseless scripts and modifier
-- letters), i.e. those whose 'generalCategory' is one of the five
-- letter categories.
-- This function is equivalent to 'Data.Char.isAlpha'.
isLetter :: Char -> Bool
isLetter = letterCategory . generalCategory
  where
    letterCategory UppercaseLetter = True
    letterCategory LowercaseLetter = True
    letterCategory TitlecaseLetter = True
    letterCategory ModifierLetter  = True
    letterCategory OtherLetter     = True
    letterCategory _               = False
147
-- | Selects Unicode mark characters, e.g. accents and the like, which
-- combine with preceding letters: those whose 'generalCategory' is
-- 'NonSpacingMark', 'SpacingCombiningMark' or 'EnclosingMark'.
isMark :: Char -> Bool
isMark = markCategory . generalCategory
  where
    markCategory NonSpacingMark       = True
    markCategory SpacingCombiningMark = True
    markCategory EnclosingMark        = True
    markCategory _                    = False
156
-- | Selects Unicode numeric characters, including digits from various
-- scripts, Roman numerals, etc.: those whose 'generalCategory' is
-- 'DecimalNumber', 'LetterNumber' or 'OtherNumber'.
isNumber :: Char -> Bool
isNumber = numberCategory . generalCategory
  where
    numberCategory DecimalNumber = True
    numberCategory LetterNumber  = True
    numberCategory OtherNumber   = True
    numberCategory _             = False
165
-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes: those whose 'generalCategory'
-- is one of the seven punctuation categories.
isPunctuation :: Char -> Bool
isPunctuation = punctuationCategory . generalCategory
  where
    punctuationCategory ConnectorPunctuation = True
    punctuationCategory DashPunctuation      = True
    punctuationCategory OpenPunctuation      = True
    punctuationCategory ClosePunctuation     = True
    punctuationCategory InitialQuote         = True
    punctuationCategory FinalQuote           = True
    punctuationCategory OtherPunctuation     = True
    punctuationCategory _                    = False
178
-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols: those whose 'generalCategory' is 'MathSymbol',
-- 'CurrencySymbol', 'ModifierSymbol' or 'OtherSymbol'.
isSymbol :: Char -> Bool
isSymbol = symbolCategory . generalCategory
  where
    symbolCategory MathSymbol     = True
    symbolCategory CurrencySymbol = True
    symbolCategory ModifierSymbol = True
    symbolCategory OtherSymbol    = True
    symbolCategory _              = False
188
-- | Selects Unicode space and separator characters: those whose
-- 'generalCategory' is 'Space', 'LineSeparator' or 'ParagraphSeparator'.
isSeparator :: Char -> Bool
isSeparator = separatorCategory . generalCategory
  where
    separatorCategory Space              = True
    separatorCategory LineSeparator      = True
    separatorCategory ParagraphSeparator = True
    separatorCategory _                  = False
196