char-code-definitions.js
6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
var EOF = 0;
// https://drafts.csswg.org/css-syntax-3/
// § 4.2. Definitions
// digit
// A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9).
function isDigit(code) {
return code >= 0x0030 && code <= 0x0039;
}
// hex digit
// A digit, or a code point between U+0041 LATIN CAPITAL LETTER A (A) and U+0046 LATIN CAPITAL LETTER F (F),
// or a code point between U+0061 LATIN SMALL LETTER A (a) and U+0066 LATIN SMALL LETTER F (f).
function isHexDigit(code) {
return (
isDigit(code) || // 0 .. 9
(code >= 0x0041 && code <= 0x0046) || // A .. F
(code >= 0x0061 && code <= 0x0066) // a .. f
);
}
// uppercase letter
// A code point between U+0041 LATIN CAPITAL LETTER A (A) and U+005A LATIN CAPITAL LETTER Z (Z).
function isUppercaseLetter(code) {
return code >= 0x0041 && code <= 0x005A;
}
// lowercase letter
// A code point between U+0061 LATIN SMALL LETTER A (a) and U+007A LATIN SMALL LETTER Z (z).
function isLowercaseLetter(code) {
return code >= 0x0061 && code <= 0x007A;
}
// letter
// An uppercase letter or a lowercase letter.
function isLetter(code) {
return isUppercaseLetter(code) || isLowercaseLetter(code);
}
// non-ASCII code point
// A code point with a value equal to or greater than U+0080 <control>.
function isNonAscii(code) {
return code >= 0x0080;
}
// name-start code point
// A letter, a non-ASCII code point, or U+005F LOW LINE (_).
function isNameStart(code) {
return isLetter(code) || isNonAscii(code) || code === 0x005F;
}
// name code point
// A name-start code point, a digit, or U+002D HYPHEN-MINUS (-).
function isName(code) {
return isNameStart(code) || isDigit(code) || code === 0x002D;
}
// non-printable code point
// A code point between U+0000 NULL and U+0008 BACKSPACE, or U+000B LINE TABULATION,
// or a code point between U+000E SHIFT OUT and U+001F INFORMATION SEPARATOR ONE, or U+007F DELETE.
function isNonPrintable(code) {
return (
(code >= 0x0000 && code <= 0x0008) ||
(code === 0x000B) ||
(code >= 0x000E && code <= 0x001F) ||
(code === 0x007F)
);
}
// newline
// U+000A LINE FEED. Note that U+000D CARRIAGE RETURN and U+000C FORM FEED are not included in this definition,
// as they are converted to U+000A LINE FEED during preprocessing.
// TODO: we doesn't do a preprocessing, so check a code point for U+000D CARRIAGE RETURN and U+000C FORM FEED
function isNewline(code) {
return code === 0x000A || code === 0x000D || code === 0x000C;
}
// whitespace
// A newline, U+0009 CHARACTER TABULATION, or U+0020 SPACE.
function isWhiteSpace(code) {
return isNewline(code) || code === 0x0020 || code === 0x0009;
}
// § 4.3.8. Check if two code points are a valid escape
function isValidEscape(first, second) {
// If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
if (first !== 0x005C) {
return false;
}
// Otherwise, if the second code point is a newline or EOF, return false.
if (isNewline(second) || second === EOF) {
return false;
}
// Otherwise, return true.
return true;
}
// § 4.3.9. Check if three code points would start an identifier
function isIdentifierStart(first, second, third) {
// Look at the first code point:
// U+002D HYPHEN-MINUS
if (first === 0x002D) {
// If the second code point is a name-start code point or a U+002D HYPHEN-MINUS,
// or the second and third code points are a valid escape, return true. Otherwise, return false.
return (
isNameStart(second) ||
second === 0x002D ||
isValidEscape(second, third)
);
}
// name-start code point
if (isNameStart(first)) {
// Return true.
return true;
}
// U+005C REVERSE SOLIDUS (\)
if (first === 0x005C) {
// If the first and second code points are a valid escape, return true. Otherwise, return false.
return isValidEscape(first, second);
}
// anything else
// Return false.
return false;
}
// § 4.3.10. Check if three code points would start a number
function isNumberStart(first, second, third) {
// Look at the first code point:
// U+002B PLUS SIGN (+)
// U+002D HYPHEN-MINUS (-)
if (first === 0x002B || first === 0x002D) {
// If the second code point is a digit, return true.
if (isDigit(second)) {
return 2;
}
// Otherwise, if the second code point is a U+002E FULL STOP (.)
// and the third code point is a digit, return true.
// Otherwise, return false.
return second === 0x002E && isDigit(third) ? 3 : 0;
}
// U+002E FULL STOP (.)
if (first === 0x002E) {
// If the second code point is a digit, return true. Otherwise, return false.
return isDigit(second) ? 2 : 0;
}
// digit
if (isDigit(first)) {
// Return true.
return 1;
}
// anything else
// Return false.
return 0;
}
//
// Misc
//
// detect BOM (https://en.wikipedia.org/wiki/Byte_order_mark)
function isBOM(code) {
// UTF-16BE
if (code === 0xFEFF) {
return 1;
}
// UTF-16LE
if (code === 0xFFFE) {
return 1;
}
return 0;
}
// Fast code category
//
// https://drafts.csswg.org/css-syntax/#tokenizer-definitions
// > non-ASCII code point
// > A code point with a value equal to or greater than U+0080 <control>
// > name-start code point
// > A letter, a non-ASCII code point, or U+005F LOW LINE (_).
// > name code point
// > A name-start code point, a digit, or U+002D HYPHEN-MINUS (-)
// That means only ASCII code points has a special meaning and we define a maps for 0..127 codes only
var CATEGORY = new Array(0x80);
charCodeCategory.Eof = 0x80;
charCodeCategory.WhiteSpace = 0x82;
charCodeCategory.Digit = 0x83;
charCodeCategory.NameStart = 0x84;
charCodeCategory.NonPrintable = 0x85;
for (var i = 0; i < CATEGORY.length; i++) {
switch (true) {
case isWhiteSpace(i):
CATEGORY[i] = charCodeCategory.WhiteSpace;
break;
case isDigit(i):
CATEGORY[i] = charCodeCategory.Digit;
break;
case isNameStart(i):
CATEGORY[i] = charCodeCategory.NameStart;
break;
case isNonPrintable(i):
CATEGORY[i] = charCodeCategory.NonPrintable;
break;
default:
CATEGORY[i] = i || charCodeCategory.Eof;
}
}
function charCodeCategory(code) {
return code < 0x80 ? CATEGORY[code] : charCodeCategory.NameStart;
};
module.exports = {
isDigit: isDigit,
isHexDigit: isHexDigit,
isUppercaseLetter: isUppercaseLetter,
isLowercaseLetter: isLowercaseLetter,
isLetter: isLetter,
isNonAscii: isNonAscii,
isNameStart: isNameStart,
isName: isName,
isNonPrintable: isNonPrintable,
isNewline: isNewline,
isWhiteSpace: isWhiteSpace,
isValidEscape: isValidEscape,
isIdentifierStart: isIdentifierStart,
isNumberStart: isNumberStart,
isBOM: isBOM,
charCodeCategory: charCodeCategory
};