comparison runtime/tools/unicode.vim @ 2041:d5867fd6b2b7 v7.2.330

updated for version 7.2.330 Problem: Tables for Unicode case operators are outdated. Solution: Add a Vim script for generating the tables. Include tables for Unicode 5.2.
author Bram Moolenaar <bram@zimbu.org>
date Tue, 12 Jan 2010 19:52:03 +0100
parents
children 1378bc45ebe5
comparison
equal deleted inserted replaced
2040:70c67b1bb1f1 2041:d5867fd6b2b7
1 " Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2 " The format of the UnicodeData.txt file is explained here:
3 " http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4 " For the other files see the header.
5 "
6 " Usage: Vim -S <this-file>
7 "
8 " Author: Bram Moolenaar
9 " Last Update: 2010 Jan 12
10
11 " Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Split every line of the current buffer on ';' and store the resulting
  " field lists, in buffer order, in s:dataprops.
  " Gives an error message and stops at the first line that does not have
  " exactly 15 fields; lines collected before that stay in s:dataprops.
  let s:dataprops = []
  for lnum in range(1, line('$'))
    " The third argument keeps empty fields, so field positions stay fixed.
    let fields = split(getline(lnum), '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields != 15
      echoerr 'Found ' . nfields . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
25
26 " Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Split every data line of the current buffer on ';' and store the
  " resulting field lists, in buffer order, in s:foldprops.
  " Lines starting with '#' and lines holding only white space are skipped.
  " Gives an error message and stops at the first data line that does not
  " have exactly 4 fields.
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line: nothing to collect.
      continue
    endif
    " The third argument keeps empty fields, so field positions stay fixed.
    let fields = split(text, '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields != 4
      echoerr 'Found ' . nfields . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
43
44 " Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Split every data line of the current buffer on ';' and store the
  " resulting field lists, in buffer order, in s:widthprops.
  " Lines starting with '#' and lines holding only white space are skipped.
  " Gives an error message and stops at the first data line that does not
  " have exactly 2 fields.
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line: nothing to collect.
      continue
    endif
    " The third argument keeps empty fields, so field positions stay fixed.
    let fields = split(text, '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields != 2
      echoerr 'Found ' . nfields . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
61
62 " Build the toLower or toUpper table in a new buffer.
63 " Uses s:dataprops.
func! BuildCaseTable(name, index)
  " Build the table "to{name}" in a new buffer from s:dataprops.
  "   a:name   suffix of the table and buffer name, e.g. "Lower"
  "   a:index  index of the field in each s:dataprops entry that holds the
  "            mapped character (hex); entries where it is empty are skipped
  " Consecutive entries are merged into one line as long as the distance to
  " the mapped character and the distance between entries stay the same.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for entry in s:dataprops
    if entry[a:index] == ''
      continue
    endif
    let code = ('0x' . entry[0]) + 0
    let offset = ('0x' . entry[a:index]) + 0 - code
    " A stride of zero means the current run has only one entry so far, so
    " any distance is still acceptable.
    let extends = first >= 0 && delta == offset
	  \ && (stride == 0 || code - last == stride)
    if extends
      let stride = code - last
    else
      if first >= 0
	" The run that just ended is complete; emit it before starting anew.
	call Range(rows, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = offset
    endif
    let last = code
  endfor
  if first >= 0
    " Emit the run that was still open when the data ended.
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  exe "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"])
  call append(2, rows)
  " remove last comma
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
104
105 " Build the foldCase table in a new buffer.
106 " Uses s:foldprops.
func! BuildFoldTable()
  " Build the table "foldCase" in a new buffer from s:foldprops.
  " Only entries whose second field is 'C' or 'S' are used.  Consecutive
  " entries are merged into one line as long as the distance to the folded
  " character and the distance between entries stay the same.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for entry in s:foldprops
    if entry[1] != 'C' && entry[1] != 'S'
      continue
    endif
    let code = ('0x' . entry[0]) + 0
    let offset = ('0x' . entry[2]) + 0 - code
    " A stride of zero means the current run has only one entry so far, so
    " any distance is still acceptable.
    let extends = first >= 0 && delta == offset
	  \ && (stride == 0 || code - last == stride)
    if extends
      let stride = code - last
    else
      if first >= 0
	" The run that just ended is complete; emit it before starting anew.
	call Range(rows, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = offset
    endif
    let last = code
  endfor
  if first >= 0
    " Emit the run that was still open when the data ended.
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"])
  call append(2, rows)
  " remove last comma
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
147
func! Range(ranges, start, end, step, add)
  " Append one table line "{start,end,step,add}," to the list a:ranges.
  " a:start and a:end are written in hex, a:step and a:add in decimal.
  " A step of zero is written as -1.
  if a:step == 0
    let stepval = -1
  else
    let stepval = a:step
  endif
  let row = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add)
  call add(a:ranges, row)
endfunc
152
153 " Build the combining table.
154 " Uses s:dataprops.
func! BuildCombiningTable()
  " Build the table "combining" in a new buffer from s:dataprops.
  " Uses the entries whose third field is 'Mn', 'Mc' or 'Me' and merges
  " directly adjacent code points into one "{first, last}" line.
  let first = -1
  let last = -1
  let rows = []
  for entry in s:dataprops
    let cat = entry[2]
    if cat != 'Mn' && cat != 'Mc' && cat != 'Me'
      continue
    endif
    let code = ('0x' . entry[0]) + 0
    if first < 0 || last + 1 != code
      " Not adjacent to the current run: emit that run (if there is one) and
      " start a new one here.
      if first >= 0
	call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = code
    endif
    let last = code
  endfor
  if first >= 0
    " Emit the run that was still open when the data ended.
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file combining
  call setline(1, [" static struct interval combining[] =", " {"])
  call append(2, rows)
  " remove last comma
  call setline('$', getline('$')[:-2])
  call append('$', " };")
  wincmd p
endfunc
189
190 " Build the ambiguous table in a new buffer.
191 " Uses s:widthprops and s:dataprops.
func! BuildAmbiguousTable()
  " Build the table "ambiguous" in a new buffer.
  " Uses s:widthprops for the entries to include (width class starting with
  " 'A') and s:dataprops to leave out composing characters (third field 'Mn',
  " 'Mc' or 'Me').  Both lists must be sorted by code point; s:dataprops is
  " walked only once, in step with s:widthprops.
  " An entry in s:widthprops is either a single code point "XXXX" or a range
  " "XXXX..YYYY".  For a range only its first code point is looked up in
  " s:dataprops.  Directly adjacent entries are merged into one
  " "{first, last}" line.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] == 'A'
      " The conversion stops at the first non-hex character, thus for a
      " range this gives its first code point.
      let n = ('0x' . p[0]) + 0
      if p[0] =~ '\.\.'
	let n_last = ('0x' . substitute(p[0], '.*\.\.', '', '')) + 0
      else
	let n_last = n
      endif

      " Find this char in the data table.  Stop at the end of the table
      " instead of indexing past it.
      let dn = -1
      while dataidx < datalen
	let dn = ('0x' . s:dataprops[dataidx][0]) + 0
	if dn >= n
	  break
	endif
	let dataidx += 1
      endwhile
      if dn != n
	echoerr "Cannot find character " . n . " in data table"
      endif

      " Only use the char when it's not a composing char.  When the char was
      " not found its category is unknown; do not use the category of the
      " unrelated character that s:dataprops[dataidx] refers to then.
      let composing = 0
      if dn == n
	let cat = s:dataprops[dataidx][2]
	let composing = (cat == 'Mn' || cat == 'Mc' || cat == 'Me')
      endif
      if !composing
	if start < 0 || end + 1 != n
	  if start >= 0
	    " produce previous range
	    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
	  endif
	  let start = n
	endif
	" Also when continuing the same range the end must move to the last
	" code point of this entry, otherwise the tail of "XXXX..YYYY" is
	" lost.
	let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file ambiguous
  call setline(1, " static struct interval ambiguous[] =")
  call setline(2, " {")
  call append('$', ranges)
  if !empty(ranges)
    " remove last comma; without ranges the last line is the " {" line,
    " which must stay as it is.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
246
247
248
249 " Edit the Unicode text file. Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
" This fills s:dataprops, which is used by the case tables, the combining
" table and, further down, the ambiguous table.
call ParseDataToProps()

" Build the toLower table.
" NOTE: 13 is presumably the simple lowercase mapping field of
" UnicodeData.txt and 12 the simple uppercase mapping field; verify against
" the file format description when the format changes.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the ambiguous table.  This must come after ParseDataToProps(), it
" uses s:dataprops to leave out composing characters.
call BuildAmbiguousTable()