Move GeneralCategory et al to GHC.Unicode
[ghc.git] / libraries / base / GHC / Unicode.hs
1 {-# LANGUAGE Trustworthy #-}
2 {-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-}
3 {-# OPTIONS_HADDOCK hide #-}
4
5 -----------------------------------------------------------------------------
6 -- |
7 -- Module : GHC.Unicode
8 -- Copyright : (c) The University of Glasgow, 2003
9 -- License : see libraries/base/LICENSE
10 --
11 -- Maintainer : cvs-ghc@haskell.org
12 -- Stability : internal
13 -- Portability : non-portable (GHC extensions)
14 --
-- Implementations for the character predicates (isLower, isUpper, etc.)
-- and the conversions (toUpper, toLower). On every platform the
-- implementation uses the auto-generated Unicode character properties
-- table (the functions contained in WCsubst.c).
18 --
19 -----------------------------------------------------------------------------
20
21 module GHC.Unicode (
22 GeneralCategory (..), generalCategory,
23 isAscii, isLatin1, isControl,
24 isAsciiUpper, isAsciiLower,
25 isPrint, isSpace, isUpper,
26 isLower, isAlpha, isDigit,
27 isOctDigit, isHexDigit, isAlphaNum,
28 isPunctuation, isSymbol,
29 toUpper, toLower, toTitle,
30 wgencat
31 ) where
32
33 import GHC.Base
34 import GHC.Char (chr)
35 import GHC.Real
36 import GHC.Enum ( Enum (..), Bounded (..) )
37 import GHC.Arr ( Ix (..) )
38 import GHC.Num
39
40 -- Data.Char.chr already imports this and we need to define a Show instance
41 -- for GeneralCategory
42 import GHC.Show ( Show )
43
44 #include "HsBaseConfig.h"
45
-- NOTE(review): the examples below exercise a 'Read' instance, but 'Read'
-- is not in this declaration's deriving clause; the instance is presumably
-- supplied by another module -- confirm.

-- | The Unicode general category of a character (column 2 of the
-- UnicodeData table).
--
-- The constructors are declared in the order in which the categories are
-- listed in the Unicode Character Database. 'generalCategory' depends on
-- that order through the derived 'Enum' instance, so the constructors
-- must not be rearranged.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
  -- Letters
  = UppercaseLetter       -- ^ @Lu@ - Letter, Uppercase
  | LowercaseLetter       -- ^ @Ll@ - Letter, Lowercase
  | TitlecaseLetter       -- ^ @Lt@ - Letter, Titlecase
  | ModifierLetter        -- ^ @Lm@ - Letter, Modifier
  | OtherLetter           -- ^ @Lo@ - Letter, Other
  -- Marks
  | NonSpacingMark        -- ^ @Mn@ - Mark, Non-Spacing
  | SpacingCombiningMark  -- ^ @Mc@ - Mark, Spacing Combining
  | EnclosingMark         -- ^ @Me@ - Mark, Enclosing
  -- Numbers
  | DecimalNumber         -- ^ @Nd@ - Number, Decimal
  | LetterNumber          -- ^ @Nl@ - Number, Letter
  | OtherNumber           -- ^ @No@ - Number, Other
  -- Punctuation
  | ConnectorPunctuation  -- ^ @Pc@ - Punctuation, Connector
  | DashPunctuation       -- ^ @Pd@ - Punctuation, Dash
  | OpenPunctuation       -- ^ @Ps@ - Punctuation, Open
  | ClosePunctuation      -- ^ @Pe@ - Punctuation, Close
  | InitialQuote          -- ^ @Pi@ - Punctuation, Initial quote
  | FinalQuote            -- ^ @Pf@ - Punctuation, Final quote
  | OtherPunctuation      -- ^ @Po@ - Punctuation, Other
  -- Symbols
  | MathSymbol            -- ^ @Sm@ - Symbol, Math
  | CurrencySymbol        -- ^ @Sc@ - Symbol, Currency
  | ModifierSymbol        -- ^ @Sk@ - Symbol, Modifier
  | OtherSymbol           -- ^ @So@ - Symbol, Other
  -- Separators
  | Space                 -- ^ @Zs@ - Separator, Space
  | LineSeparator         -- ^ @Zl@ - Separator, Line
  | ParagraphSeparator    -- ^ @Zp@ - Separator, Paragraph
  -- Other
  | Control               -- ^ @Cc@ - Other, Control
  | Format                -- ^ @Cf@ - Other, Format
  | Surrogate             -- ^ @Cs@ - Other, Surrogate
  | PrivateUse            -- ^ @Co@ - Other, Private Use
  | NotAssigned           -- ^ @Cn@ - Other, Not Assigned
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
133
-- | Look up the Unicode general category of a character.
--
-- The character's code point is handed to 'wgencat', and the number that
-- comes back is turned into a constructor with 'toEnum'. The mapping is
-- therefore only correct while the constructors of 'GeneralCategory' stay
-- in the order used by the Unicode standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
generalCategory :: Char -> GeneralCategory
-- 'ord', 'wgencat' and 'toEnum' all work on 'Int', so no numeric
-- conversion is needed between the steps.
generalCategory c = toEnum (wgencat (ord c))
160
-- | Selects the ASCII characters, i.e. the first 128 characters of the
-- Unicode character set (code points @0x00@ through @0x7f@).
isAscii :: Char -> Bool
isAscii c = c <= '\x7f'
165
-- | Selects the ISO 8859-1 (Latin-1) characters, i.e. the first 256
-- characters of the Unicode character set (code points @0x00@ through
-- @0xff@).
isLatin1 :: Char -> Bool
isLatin1 c = c < '\x100'
170
-- | Selects the ASCII lower-case letters @\'a\'@..@\'z\'@,
-- i.e. characters satisfying both 'isAscii' and 'isLower'.
isAsciiLower :: Char -> Bool
isAsciiLower c = 'a' <= c && c <= 'z'
175
-- | Selects the ASCII upper-case letters @\'A\'@..@\'Z\'@,
-- i.e. characters satisfying both 'isAscii' and 'isUpper'.
isAsciiUpper :: Char -> Bool
isAsciiUpper c = 'A' <= c && c <= 'Z'
180
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswcntrl routine on @ord c@ is compared against zero.

-- | Selects control characters, which are the non-printing characters of
-- the Latin-1 subset of Unicode.
isControl :: Char -> Bool
184
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswprint routine on @ord c@ is compared against zero.

-- | Selects printable Unicode characters
-- (letters, numbers, marks, punctuation, symbols and spaces).
isPrint :: Char -> Bool
188
-- | Returns 'True' for any Unicode space character, and the control
-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.
isSpace :: Char -> Bool
-- The non-breaking space (0xa0) is treated as a space.
--
-- Code points up to 0x377 are decided inline; everything above is looked
-- up through iswspace. The cut-off is not special in itself: as of 2014
-- every code point at or below 0x377 had been assigned, so no new space
-- character is expected to appear in that range.
--
-- A branchless '||' would probably be preferable for the inline test, but
-- the eqLit transformation currently undoes it, so the plain form is kept
-- until there is a way around that.
isSpace c
  | cp > 0x377 = iswspace (ord c) /= 0
  | otherwise  = cp == 0x20 || cp - 0x9 <= 4 || cp == 0xa0
  where
    -- Unsigned on purpose: for cp < 0x9 the subtraction wraps around to a
    -- huge value, so the single comparison accepts exactly 0x9..0xd.
    cp = fromIntegral (ord c) :: Word
203
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswupper routine on @ord c@ is compared against zero.

-- | Selects upper-case or title-case alphabetic Unicode characters (letters).
-- Title case is used by a small number of letter ligatures like the
-- single-character form of /Lj/.
isUpper :: Char -> Bool
208
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswlower routine on @ord c@ is compared against zero.

-- | Selects lower-case alphabetic Unicode characters (letters).
isLower :: Char -> Bool
211
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswalpha routine on @ord c@ is compared against zero.

-- | Selects alphabetic Unicode characters (lower-case, upper-case and
-- title-case letters, plus letters of caseless scripts and modifier letters).
-- This function is equivalent to 'Data.Char.isLetter'.
isAlpha :: Char -> Bool
216
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where the result of the
-- imported iswalnum routine on @ord c@ is compared against zero.

-- | Selects alphabetic or numeric digit Unicode characters.
--
-- Note that numeric digits outside the ASCII range are selected by this
-- function but not by 'isDigit'. Such digits may be part of identifiers
-- but are not used by the printer and reader to represent numbers.
isAlphaNum :: Char -> Bool
223
-- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@.
isDigit :: Char -> Bool
isDigit c = offset <= 9
  where
    -- Distance from '0', reinterpreted as unsigned: characters below '0'
    -- wrap around to a very large value and fail the comparison.
    offset = fromIntegral (ord c - ord '0') :: Word

-- The range tests here and in the neighbouring digit predicates use one
-- subtraction plus an unsigned comparison rather than two signed
-- comparisons. The stated reasons are that this is usually faster, puts
-- less strain on branch prediction, and likely enables some CSE when a
-- caller follows the test with the actual digit conversion.
232
-- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@.
isOctDigit :: Char -> Bool
isOctDigit c = offset <= 7
  where
    -- Unsigned distance from '0'; anything below '0' wraps around and is
    -- rejected by the same comparison.
    offset = fromIntegral (ord c - ord '0') :: Word
236
-- | Selects ASCII hexadecimal digits,
-- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@.
isHexDigit :: Char -> Bool
isHexDigit c = isDigit c || withinSixOf 'A' || withinSixOf 'a'
  where
    -- True when c is one of the six letters starting at the given one.
    -- The distance is taken as unsigned so that characters below the
    -- starting letter wrap around and fail the comparison.
    withinSixOf start = (fromIntegral (ord c - ord start) :: Word) <= 5
243
-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- The result is 'True' exactly when 'generalCategory' classifies the
-- argument as one of:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = punctuationCategory (generalCategory c)
  where
    -- One equation per punctuation category; every other category falls
    -- through to the final catch-all.
    punctuationCategory ConnectorPunctuation = True
    punctuationCategory DashPunctuation      = True
    punctuationCategory OpenPunctuation      = True
    punctuationCategory ClosePunctuation     = True
    punctuationCategory InitialQuote         = True
    punctuationCategory FinalQuote           = True
    punctuationCategory OtherPunctuation     = True
    punctuationCategory _                    = False
290
-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- The result is 'True' exactly when 'generalCategory' classifies the
-- argument as one of:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = symbolCategory (generalCategory c)
  where
    -- One equation per symbol category; every other category falls
    -- through to the final catch-all.
    symbolCategory MathSymbol     = True
    symbolCategory CurrencySymbol = True
    symbolCategory ModifierSymbol = True
    symbolCategory OtherSymbol    = True
    symbolCategory _              = False
333
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where @ord c@ is mapped
-- through the imported towupper routine and converted back with 'chr'.

-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
toUpper :: Char -> Char
337
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where @ord c@ is mapped
-- through the imported towlower routine and converted back with 'chr'.

-- | Convert a letter to the corresponding lower-case letter, if any.
-- Any other character is returned unchanged.
toLower :: Char -> Char
341
-- Signature only: the defining equation sits in the table-backed
-- implementation section later in this module, where @ord c@ is mapped
-- through the imported towtitle routine and converted back with 'chr'.

-- | Convert a letter to the corresponding title-case or upper-case
-- letter, if any. (Title case differs from upper case only for a small
-- number of ligature letters.)
-- Any other character is returned unchanged.
toTitle :: Char -> Char
347
-- -----------------------------------------------------------------------------
-- Implementation on top of the supplied auto-generated Unicode character
-- properties table
--
-- The functions contained in WCsubst.c are used on every platform,
-- whatever the operating system or C library.
--
-- Each predicate passes the character's code point to the matching
-- imported routine and answers 'True' for any non-zero result.

isAlpha c = case iswalpha (ord c) of
  0 -> False
  _ -> True

isAlphaNum c = case iswalnum (ord c) of
  0 -> False
  _ -> True

isControl c = case iswcntrl (ord c) of
  0 -> False
  _ -> True

isPrint c = case iswprint (ord c) of
  0 -> False
  _ -> True

isUpper c = case iswupper (ord c) of
  0 -> False
  _ -> True

isLower c = case iswlower (ord c) of
  0 -> False
  _ -> True
360
-- Case conversions: take the character's code point, map it through the
-- matching imported routine, and turn the resulting code point back into
-- a 'Char' with 'chr'.

toLower c = chr mapped
  where
    mapped = towlower (ord c)

toUpper c = chr mapped
  where
    mapped = towupper (ord c)

toTitle c = chr mapped
  where
    mapped = towtitle (ord c)
364
-- Foreign bindings used by the definitions above. Every routine is
-- imported as a pure @Int -> Int@ function with the @unsafe@ calling
-- annotation, and every caller in this module passes @ord c@ as the
-- argument.

-- Class tests: the callers treat any non-zero result as 'True'.

-- Backs 'isAlpha'.
foreign import ccall unsafe "u_iswalpha"
  iswalpha :: Int -> Int

-- Backs 'isAlphaNum'.
foreign import ccall unsafe "u_iswalnum"
  iswalnum :: Int -> Int

-- Backs 'isControl'.
foreign import ccall unsafe "u_iswcntrl"
  iswcntrl :: Int -> Int

-- Backs 'isSpace', which only consults it for code points above 0x377.
foreign import ccall unsafe "u_iswspace"
  iswspace :: Int -> Int

-- Backs 'isPrint'.
foreign import ccall unsafe "u_iswprint"
  iswprint :: Int -> Int

-- Backs 'isLower'.
foreign import ccall unsafe "u_iswlower"
  iswlower :: Int -> Int

-- Backs 'isUpper'.
foreign import ccall unsafe "u_iswupper"
  iswupper :: Int -> Int

-- Case mappings: the callers feed the result straight to 'chr'.

-- Backs 'toLower'.
foreign import ccall unsafe "u_towlower"
  towlower :: Int -> Int

-- Backs 'toUpper'.
foreign import ccall unsafe "u_towupper"
  towupper :: Int -> Int

-- Backs 'toTitle'.
foreign import ccall unsafe "u_towtitle"
  towtitle :: Int -> Int

-- Backs 'generalCategory', which converts the result to a
-- 'GeneralCategory' with 'toEnum'. This binding is also listed in the
-- module's export list.
foreign import ccall unsafe "u_gencat"
  wgencat :: Int -> Int
397