40052a760761b9e263efe5e030e900d251c2bb59
[packages/base.git] / Data / Char.hs
1 {-# LANGUAGE CPP, NoImplicitPrelude #-}
2
3 -----------------------------------------------------------------------------
4 -- |
5 -- Module : Data.Char
6 -- Copyright : (c) The University of Glasgow 2001
7 -- License : BSD-style (see the file libraries/base/LICENSE)
8 --
9 -- Maintainer : libraries@haskell.org
10 -- Stability : stable
11 -- Portability : portable
12 --
13 -- The Char type and associated operations.
14 --
15 -----------------------------------------------------------------------------
16
17 module Data.Char
18 (
19 Char
20
21 -- * Character classification
22 -- | Unicode characters are divided into letters, numbers, marks,
23 -- punctuation, symbols, separators (including spaces) and others
24 -- (including control characters).
25 , isControl, isSpace
26 , isLower, isUpper, isAlpha, isAlphaNum, isPrint
27 , isDigit, isOctDigit, isHexDigit
28 , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
29
30 -- ** Subranges
31 , isAscii, isLatin1
32 , isAsciiUpper, isAsciiLower
33
34 -- ** Unicode general categories
35 , GeneralCategory(..), generalCategory
36
37 -- * Case conversion
38 , toUpper, toLower, toTitle -- :: Char -> Char
39
40 -- * Single digit characters
41 , digitToInt -- :: Char -> Int
42 , intToDigit -- :: Int -> Char
43
44 -- * Numeric representations
45 , ord -- :: Char -> Int
46 , chr -- :: Int -> Char
47
48 -- * String representations
49 , showLitChar -- :: Char -> ShowS
50 , lexLitChar -- :: ReadS String
51 , readLitChar -- :: ReadS Char
52
53 -- Implementation checked wrt. Haskell 98 lib report, 1/99.
54 ) where
55
56 #ifdef __GLASGOW_HASKELL__
57 import GHC.Base
58 import GHC.Arr (Ix)
59 import GHC.Real (fromIntegral)
60 import GHC.Show
61 import GHC.Read (Read, readLitChar, lexLitChar)
62 import GHC.Unicode
63 import GHC.Num
64 import GHC.Enum
65 #endif
66
67 #ifdef __HUGS__
68 import Hugs.Prelude (Ix)
69 import Hugs.Char
70 #endif
71
72 #ifdef __NHC__
73 import Prelude
74 import Prelude(Char,String)
75 import Char
76 import Ix
77 import NHC.FFI (CInt)
78 foreign import ccall unsafe "WCsubst.h u_gencat" wgencat :: CInt -> CInt
79 #endif
80
81 -- | Convert a single digit 'Char' to the corresponding 'Int'.
82 -- This function fails unless its argument satisfies 'isHexDigit',
83 -- but recognises both upper and lower-case hexadecimal digits
84 -- (i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@).
85 digitToInt :: Char -> Int
86 digitToInt c
87 | isDigit c = ord c - ord '0'
88 | c >= 'a' && c <= 'f' = ord c - ord 'a' + 10
89 | c >= 'A' && c <= 'F' = ord c - ord 'A' + 10
90 | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
91
92 #ifndef __GLASGOW_HASKELL__
93 isAsciiUpper, isAsciiLower :: Char -> Bool
94 isAsciiLower c = c >= 'a' && c <= 'z'
95 isAsciiUpper c = c >= 'A' && c <= 'Z'
96 #endif
97
98 -- | Unicode General Categories (column 2 of the UnicodeData table)
99 -- in the order they are listed in the Unicode standard.
100
101 data GeneralCategory
102 = UppercaseLetter -- ^ Lu: Letter, Uppercase
103 | LowercaseLetter -- ^ Ll: Letter, Lowercase
104 | TitlecaseLetter -- ^ Lt: Letter, Titlecase
105 | ModifierLetter -- ^ Lm: Letter, Modifier
106 | OtherLetter -- ^ Lo: Letter, Other
107 | NonSpacingMark -- ^ Mn: Mark, Non-Spacing
108 | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
109 | EnclosingMark -- ^ Me: Mark, Enclosing
110 | DecimalNumber -- ^ Nd: Number, Decimal
111 | LetterNumber -- ^ Nl: Number, Letter
112 | OtherNumber -- ^ No: Number, Other
113 | ConnectorPunctuation -- ^ Pc: Punctuation, Connector
114 | DashPunctuation -- ^ Pd: Punctuation, Dash
115 | OpenPunctuation -- ^ Ps: Punctuation, Open
116 | ClosePunctuation -- ^ Pe: Punctuation, Close
117 | InitialQuote -- ^ Pi: Punctuation, Initial quote
118 | FinalQuote -- ^ Pf: Punctuation, Final quote
119 | OtherPunctuation -- ^ Po: Punctuation, Other
120 | MathSymbol -- ^ Sm: Symbol, Math
121 | CurrencySymbol -- ^ Sc: Symbol, Currency
122 | ModifierSymbol -- ^ Sk: Symbol, Modifier
123 | OtherSymbol -- ^ So: Symbol, Other
124 | Space -- ^ Zs: Separator, Space
125 | LineSeparator -- ^ Zl: Separator, Line
126 | ParagraphSeparator -- ^ Zp: Separator, Paragraph
127 | Control -- ^ Cc: Other, Control
128 | Format -- ^ Cf: Other, Format
129 | Surrogate -- ^ Cs: Other, Surrogate
130 | PrivateUse -- ^ Co: Other, Private Use
131 | NotAssigned -- ^ Cn: Other, Not Assigned
132 deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix)
133
134 -- | The Unicode general category of the character.
135 generalCategory :: Char -> GeneralCategory
136 #if defined(__GLASGOW_HASKELL__) || defined(__NHC__)
137 generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c
138 #endif
139 #ifdef __HUGS__
140 generalCategory c = toEnum (primUniGenCat c)
141 #endif
142
143 -- derived character classifiers
144
145 -- | Selects alphabetic Unicode characters (lower-case, upper-case and
146 -- title-case letters, plus letters of caseless scripts and modifiers letters).
147 -- This function is equivalent to 'Data.Char.isAlpha'.
148 isLetter :: Char -> Bool
149 isLetter c = case generalCategory c of
150 UppercaseLetter -> True
151 LowercaseLetter -> True
152 TitlecaseLetter -> True
153 ModifierLetter -> True
154 OtherLetter -> True
155 _ -> False
156
157 -- | Selects Unicode mark characters, e.g. accents and the like, which
158 -- combine with preceding letters.
159 isMark :: Char -> Bool
160 isMark c = case generalCategory c of
161 NonSpacingMark -> True
162 SpacingCombiningMark -> True
163 EnclosingMark -> True
164 _ -> False
165
166 -- | Selects Unicode numeric characters, including digits from various
167 -- scripts, Roman numerals, etc.
168 isNumber :: Char -> Bool
169 isNumber c = case generalCategory c of
170 DecimalNumber -> True
171 LetterNumber -> True
172 OtherNumber -> True
173 _ -> False
174
175 -- | Selects Unicode punctuation characters, including various kinds
176 -- of connectors, brackets and quotes.
177 isPunctuation :: Char -> Bool
178 isPunctuation c = case generalCategory c of
179 ConnectorPunctuation -> True
180 DashPunctuation -> True
181 OpenPunctuation -> True
182 ClosePunctuation -> True
183 InitialQuote -> True
184 FinalQuote -> True
185 OtherPunctuation -> True
186 _ -> False
187
188 -- | Selects Unicode symbol characters, including mathematical and
189 -- currency symbols.
190 isSymbol :: Char -> Bool
191 isSymbol c = case generalCategory c of
192 MathSymbol -> True
193 CurrencySymbol -> True
194 ModifierSymbol -> True
195 OtherSymbol -> True
196 _ -> False
197
198 -- | Selects Unicode space and separator characters.
199 isSeparator :: Char -> Bool
200 isSeparator c = case generalCategory c of
201 Space -> True
202 LineSeparator -> True
203 ParagraphSeparator -> True
204 _ -> False
205
206 #ifdef __NHC__
207 -- dummy implementation
208 toTitle :: Char -> Char
209 toTitle = toUpper
210 #endif