unicode-case-fold.py
4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
"""
Unicode case folding database conversion utility
Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:
<code>; <status>; <mapping>; # <name>
<status> can be one of four characters:
C - Common mappings
S - mappings for Simple case folding
F - mappings for Full case folding
T - special case for Turkish I characters
Right now this generates a function which implements simple case folding (C+S
entries).
"""
from __future__ import print_function
import sys
import re
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
# This variable will body of the mappings function
body = ""
# Reads file line-by-line, extracts Common and Simple case fold mappings and
# returns a (from_char, to_char, from_name) tuple.
def mappings(f):
previous_from = -1
expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
for line in f:
m = expr.match(line)
if not m: continue
from_char = int(m.group(1), 16)
to_char = int(m.group(2), 16)
from_name = m.group(3)
if from_char <= previous_from:
raise Exception("Duplicate or unsorted characters in input")
yield from_char, to_char, from_name
previous_from = from_char
# Computes the shift (to_char - from_char) in a mapping.
def shift(mapping):
return mapping[1] - mapping[0]
# Computes the stride (from_char2 - from_char1) of two mappings.
def stride2(mapping1, mapping2):
return mapping2[0] - mapping1[0]
# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
def stride(block):
return stride2(block[0], block[1])
# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjecant mappings (if any) is constant.
def dump_block(b):
global body
if len(b) == 1:
# Special case for handling blocks of length 1. We don't even need to
# emit the "if (C < X) return C" check below as all characters in this
# range will be caught by the "C < X" check emitted by the first
# non-trivial block.
body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
return
first = b[0][0]
last = first + stride(b) * (len(b)-1)
modulo = first % stride(b)
# All characters before this block map to themselves.
body += " if (C < {0:#06x})\n return C;\n".format(first)
body += " // {0} characters\n".format(len(b))
# Generic pattern: check upper bound (lower bound is checked by the "if"
# above) and modulo of C, return C+shift.
pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"
if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
# Special case:
# We can elide the modulo-check because the expression "C|1" will map
# the intervening characters to themselves.
pattern = " if (C <= {0:#06x})\n return C | 1;\n"
elif stride(b) == 1:
# Another special case: X % 1 is always zero, so don't emit the
# modulo-check.
pattern = " if (C <= {0:#06x})\n return C + {3};\n"
body += pattern.format(last, stride(b), modulo, shift(b[0]))
current_block = []
f = urlopen(sys.argv[1])
for m in mappings(f):
if len(current_block) == 0:
current_block.append(m)
continue
if shift(current_block[0]) != shift(m):
# Incompatible shift, start a new block.
dump_block(current_block)
current_block = [m]
continue
if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
current_block.append(m)
continue
# Incompatible stride, start a new block.
dump_block(current_block)
current_block = [m]
f.close()
dump_block(current_block)
print('//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//')
print('//')
print('// This file was generated by utils/unicode-case-fold.py from the Unicode')
print('// case folding database at')
print('// ', sys.argv[1])
print('//')
print('// To regenerate this file, run:')
print('// utils/unicode-case-fold.py \\')
print('// "{}" \\'.format(sys.argv[1]))
print('// > lib/Support/UnicodeCaseFold.cpp')
print('//')
print('//===----------------------------------------------------------------------===//')
print('')
print('#include "llvm/Support/Unicode.h"')
print('')
print("int llvm::sys::unicode::foldCharSimple(int C) {")
print(body)
print(" return C;")
print("}")