Mercurial > vim
comparison src/mbyte.c @ 21973:85add08e6a2d v8.2.1536
patch 8.2.1536: cannot get the class of a character; emoji widths are wrong
Commit: https://github.com/vim/vim/commit/4e4473c927167fd24e5c8df90e0e8035080cf2da
Author: Bram Moolenaar <Bram@vim.org>
Date: Fri Aug 28 22:24:57 2020 +0200
patch 8.2.1536: cannot get the class of a character; emoji widths are wrong
Problem: Cannot get the class of a character; emoji widths are wrong in
some environments.
Solution: Add charclass(). Update some emoji widths. Add script to check
emoji widths.
author | Bram Moolenaar <Bram@vim.org> |
---|---|
date | Fri, 28 Aug 2020 22:30:04 +0200 |
parents | 0bc43a704f56 |
children | 2030f8267db9 |
Comparison legend: equal, deleted, inserted, replaced
21972:10438c4900d1 | 21973:85add08e6a2d |
---|---|
130 static int utf_ptr2cells_len(char_u *p, int size); | 130 static int utf_ptr2cells_len(char_u *p, int size); |
131 static int dbcs_char2cells(int c); | 131 static int dbcs_char2cells(int c); |
132 static int dbcs_ptr2cells_len(char_u *p, int size); | 132 static int dbcs_ptr2cells_len(char_u *p, int size); |
133 static int dbcs_ptr2char(char_u *p); | 133 static int dbcs_ptr2char(char_u *p); |
134 static int dbcs_head_off(char_u *base, char_u *p); | 134 static int dbcs_head_off(char_u *base, char_u *p); |
135 #ifdef FEAT_EVAL | |
135 static int cw_value(int c); | 136 static int cw_value(int c); |
137 #endif | |
136 | 138 |
137 /* | 139 /* |
138 * Lookup table to quickly get the length in bytes of a UTF-8 character from | 140 * Lookup table to quickly get the length in bytes of a UTF-8 character from |
139 * the first byte of a UTF-8 string. | 141 * the first byte of a UTF-8 string. |
140 * Bytes which are illegal when used as the first byte have a 1. | 142 * Bytes which are illegal when used as the first byte have a 1. |
1386 {0x26bd, 0x26be}, | 1388 {0x26bd, 0x26be}, |
1387 {0x26c4, 0x26c5}, | 1389 {0x26c4, 0x26c5}, |
1388 {0x26ce, 0x26ce}, | 1390 {0x26ce, 0x26ce}, |
1389 {0x26d4, 0x26d4}, | 1391 {0x26d4, 0x26d4}, |
1390 {0x26ea, 0x26ea}, | 1392 {0x26ea, 0x26ea}, |
1391 {0x26f2, 0x26f3}, | 1393 {0x26f2, 0x26f5}, |
1392 {0x26f5, 0x26f5}, | |
1393 {0x26fa, 0x26fa}, | 1394 {0x26fa, 0x26fa}, |
1394 {0x26fd, 0x26fd}, | 1395 {0x26fd, 0x26fd}, |
1395 {0x2705, 0x2705}, | 1396 {0x2705, 0x2705}, |
1396 {0x270a, 0x270b}, | 1397 {0x270a, 0x270b}, |
1397 {0x2728, 0x2728}, | 1398 {0x2728, 0x2728}, |
1488 // Sorted list of non-overlapping intervals of Emoji characters that don't | 1489 // Sorted list of non-overlapping intervals of Emoji characters that don't |
1489 // have ambiguous or double width, | 1490 // have ambiguous or double width, |
1490 // based on http://unicode.org/emoji/charts/emoji-list.html | 1491 // based on http://unicode.org/emoji/charts/emoji-list.html |
1491 static struct interval emoji_wide[] = | 1492 static struct interval emoji_wide[] = |
1492 { | 1493 { |
1494 {0x23ed, 0x23ef}, | |
1495 {0x23f1, 0x23f2}, | |
1496 {0x23f8, 0x23fa}, | |
1497 {0x24c2, 0x24c2}, | |
1498 {0x261d, 0x261d}, | |
1499 {0x26c8, 0x26c8}, | |
1500 {0x26cf, 0x26cf}, | |
1501 {0x26d1, 0x26d1}, | |
1502 {0x26d3, 0x26d3}, | |
1503 {0x26e9, 0x26e9}, | |
1504 {0x26f0, 0x26f1}, | |
1505 {0x26f7, 0x26f9}, | |
1506 {0x270c, 0x270d}, | |
1507 {0x2934, 0x2935}, | |
1508 {0x1f170, 0x1f189}, | |
1493 {0x1f1e6, 0x1f1ff}, | 1509 {0x1f1e6, 0x1f1ff}, |
1494 {0x1f321, 0x1f321}, | 1510 {0x1f321, 0x1f321}, |
1495 {0x1f324, 0x1f32c}, | 1511 {0x1f324, 0x1f32c}, |
1496 {0x1f336, 0x1f336}, | 1512 {0x1f336, 0x1f336}, |
1497 {0x1f37d, 0x1f37d}, | 1513 {0x1f37d, 0x1f37d}, |
1531 {0x1f6f3, 0x1f6f3} | 1547 {0x1f6f3, 0x1f6f3} |
1532 }; | 1548 }; |
1533 | 1549 |
1534 if (c >= 0x100) | 1550 if (c >= 0x100) |
1535 { | 1551 { |
1552 #if defined(FEAT_EVAL) || defined(USE_WCHAR_FUNCTIONS) | |
1536 int n; | 1553 int n; |
1537 | 1554 #endif |
1555 | |
1556 #ifdef FEAT_EVAL | |
1538 n = cw_value(c); | 1557 n = cw_value(c); |
1539 if (n != 0) | 1558 if (n != 0) |
1540 return n; | 1559 return n; |
1560 #endif | |
1541 | 1561 |
1542 #ifdef USE_WCHAR_FUNCTIONS | 1562 #ifdef USE_WCHAR_FUNCTIONS |
1543 /* | 1563 /* |
1544 * Assume the library function wcwidth() works better than our own | 1564 * Assume the library function wcwidth() works better than our own |
1545 * stuff. It should return 1 for ambiguous width chars! | 1565 * stuff. It should return 1 for ambiguous width chars! |
2665 {0x303d, 0x303d}, | 2685 {0x303d, 0x303d}, |
2666 {0x3297, 0x3297}, | 2686 {0x3297, 0x3297}, |
2667 {0x3299, 0x3299}, | 2687 {0x3299, 0x3299}, |
2668 {0x1f004, 0x1f004}, | 2688 {0x1f004, 0x1f004}, |
2669 {0x1f0cf, 0x1f0cf}, | 2689 {0x1f0cf, 0x1f0cf}, |
2670 {0x1f170, 0x1f171}, | 2690 {0x1f170, 0x1f189}, |
2671 {0x1f17e, 0x1f17f}, | |
2672 {0x1f18e, 0x1f18e}, | 2691 {0x1f18e, 0x1f18e}, |
2673 {0x1f191, 0x1f19a}, | 2692 {0x1f191, 0x1f19a}, |
2674 {0x1f1e6, 0x1f1ff}, | 2693 {0x1f1e6, 0x1f1ff}, |
2675 {0x1f201, 0x1f202}, | 2694 {0x1f201, 0x1f202}, |
2676 {0x1f21a, 0x1f21a}, | 2695 {0x1f21a, 0x1f21a}, |
2833 if (vim_iswordc_buf(c, buf)) | 2852 if (vim_iswordc_buf(c, buf)) |
2834 return 2; // word character | 2853 return 2; // word character |
2835 return 1; // punctuation | 2854 return 1; // punctuation |
2836 } | 2855 } |
2837 | 2856 |
2857 // emoji | |
2858 if (intable(emoji_all, sizeof(emoji_all), c)) | |
2859 return 3; | |
2860 | |
2838 // binary search in table | 2861 // binary search in table |
2839 while (top >= bot) | 2862 while (top >= bot) |
2840 { | 2863 { |
2841 mid = (bot + top) / 2; | 2864 mid = (bot + top) / 2; |
2842 if (classes[mid].last < (unsigned int)c) | 2865 if (classes[mid].last < (unsigned int)c) |
2843 bot = mid + 1; | 2866 bot = mid + 1; |
2844 else if (classes[mid].first > (unsigned int)c) | 2867 else if (classes[mid].first > (unsigned int)c) |
2845 top = mid - 1; | 2868 top = mid - 1; |
2846 else | 2869 else |
2847 return (int)classes[mid].class; | 2870 return (int)classes[mid].class; |
2848 } | 2871 } |
2849 | |
2850 // emoji | |
2851 if (intable(emoji_all, sizeof(emoji_all), c)) | |
2852 return 3; | |
2853 | 2872 |
2854 // most other characters are "word" characters | 2873 // most other characters are "word" characters |
2855 return 2; | 2874 return 2; |
2856 } | 2875 } |
2857 | 2876 |
5350 } | 5369 } |
5351 | 5370 |
5352 return retval; | 5371 return retval; |
5353 } | 5372 } |
5354 | 5373 |
5374 #if defined(FEAT_EVAL) || defined(PROTO) | |
5375 | |
5355 /* | 5376 /* |
5356 * Table set by setcellwidths(). | 5377 * Table set by setcellwidths(). |
5357 */ | 5378 */ |
5358 typedef struct | 5379 typedef struct |
5359 { | 5380 { |
5523 vim_free(ptrs); | 5544 vim_free(ptrs); |
5524 vim_free(cw_table); | 5545 vim_free(cw_table); |
5525 cw_table = table; | 5546 cw_table = table; |
5526 cw_table_size = l->lv_len; | 5547 cw_table_size = l->lv_len; |
5527 } | 5548 } |
5549 | |
5550 void | |
5551 f_charclass(typval_T *argvars, typval_T *rettv UNUSED) | |
5552 { | |
5553 if (argvars[0].v_type != VAR_STRING | |
5554 || argvars[0].vval.v_string == NULL | |
5555 || *argvars[0].vval.v_string == NUL) | |
5556 { | |
5557 emsg(_(e_stringreq)); | |
5558 return; | |
5559 } | |
5560 rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string); | |
5561 } | |
5562 #endif |