comparison src/mbyte.c @ 21973:85add08e6a2d v8.2.1536

patch 8.2.1536: cannot get the class of a character; emoji widths are wrong

Commit: https://github.com/vim/vim/commit/4e4473c927167fd24e5c8df90e0e8035080cf2da
Author: Bram Moolenaar <Bram@vim.org>
Date: Fri Aug 28 22:24:57 2020 +0200

patch 8.2.1536: cannot get the class of a character; emoji widths are wrong
Problem: Cannot get the class of a character; emoji widths are wrong in some environments.
Solution: Add charclass(). Update some emoji widths. Add script to check emoji widths.
author Bram Moolenaar <Bram@vim.org>
date Fri, 28 Aug 2020 22:30:04 +0200
parents 0bc43a704f56
children 2030f8267db9
comparison
equal deleted inserted replaced
21972:10438c4900d1 21973:85add08e6a2d
130 static int utf_ptr2cells_len(char_u *p, int size); 130 static int utf_ptr2cells_len(char_u *p, int size);
131 static int dbcs_char2cells(int c); 131 static int dbcs_char2cells(int c);
132 static int dbcs_ptr2cells_len(char_u *p, int size); 132 static int dbcs_ptr2cells_len(char_u *p, int size);
133 static int dbcs_ptr2char(char_u *p); 133 static int dbcs_ptr2char(char_u *p);
134 static int dbcs_head_off(char_u *base, char_u *p); 134 static int dbcs_head_off(char_u *base, char_u *p);
135 #ifdef FEAT_EVAL
135 static int cw_value(int c); 136 static int cw_value(int c);
137 #endif
136 138
137 /* 139 /*
138 * Lookup table to quickly get the length in bytes of a UTF-8 character from 140 * Lookup table to quickly get the length in bytes of a UTF-8 character from
139 * the first byte of a UTF-8 string. 141 * the first byte of a UTF-8 string.
140 * Bytes which are illegal when used as the first byte have a 1. 142 * Bytes which are illegal when used as the first byte have a 1.
1386 {0x26bd, 0x26be}, 1388 {0x26bd, 0x26be},
1387 {0x26c4, 0x26c5}, 1389 {0x26c4, 0x26c5},
1388 {0x26ce, 0x26ce}, 1390 {0x26ce, 0x26ce},
1389 {0x26d4, 0x26d4}, 1391 {0x26d4, 0x26d4},
1390 {0x26ea, 0x26ea}, 1392 {0x26ea, 0x26ea},
1391 {0x26f2, 0x26f3}, 1393 {0x26f2, 0x26f5},
1392 {0x26f5, 0x26f5},
1393 {0x26fa, 0x26fa}, 1394 {0x26fa, 0x26fa},
1394 {0x26fd, 0x26fd}, 1395 {0x26fd, 0x26fd},
1395 {0x2705, 0x2705}, 1396 {0x2705, 0x2705},
1396 {0x270a, 0x270b}, 1397 {0x270a, 0x270b},
1397 {0x2728, 0x2728}, 1398 {0x2728, 0x2728},
1488 // Sorted list of non-overlapping intervals of Emoji characters that don't 1489 // Sorted list of non-overlapping intervals of Emoji characters that don't
1489 // have ambiguous or double width, 1490 // have ambiguous or double width,
1490 // based on http://unicode.org/emoji/charts/emoji-list.html 1491 // based on http://unicode.org/emoji/charts/emoji-list.html
1491 static struct interval emoji_wide[] = 1492 static struct interval emoji_wide[] =
1492 { 1493 {
1494 {0x23ed, 0x23ef},
1495 {0x23f1, 0x23f2},
1496 {0x23f8, 0x23fa},
1497 {0x24c2, 0x24c2},
1498 {0x261d, 0x261d},
1499 {0x26c8, 0x26c8},
1500 {0x26cf, 0x26cf},
1501 {0x26d1, 0x26d1},
1502 {0x26d3, 0x26d3},
1503 {0x26e9, 0x26e9},
1504 {0x26f0, 0x26f1},
1505 {0x26f7, 0x26f9},
1506 {0x270c, 0x270d},
1507 {0x2934, 0x2935},
1508 {0x1f170, 0x1f189},
1493 {0x1f1e6, 0x1f1ff}, 1509 {0x1f1e6, 0x1f1ff},
1494 {0x1f321, 0x1f321}, 1510 {0x1f321, 0x1f321},
1495 {0x1f324, 0x1f32c}, 1511 {0x1f324, 0x1f32c},
1496 {0x1f336, 0x1f336}, 1512 {0x1f336, 0x1f336},
1497 {0x1f37d, 0x1f37d}, 1513 {0x1f37d, 0x1f37d},
1531 {0x1f6f3, 0x1f6f3} 1547 {0x1f6f3, 0x1f6f3}
1532 }; 1548 };
1533 1549
1534 if (c >= 0x100) 1550 if (c >= 0x100)
1535 { 1551 {
1552 #if defined(FEAT_EVAL) || defined(USE_WCHAR_FUNCTIONS)
1536 int n; 1553 int n;
1537 1554 #endif
1555
1556 #ifdef FEAT_EVAL
1538 n = cw_value(c); 1557 n = cw_value(c);
1539 if (n != 0) 1558 if (n != 0)
1540 return n; 1559 return n;
1560 #endif
1541 1561
1542 #ifdef USE_WCHAR_FUNCTIONS 1562 #ifdef USE_WCHAR_FUNCTIONS
1543 /* 1563 /*
1544 * Assume the library function wcwidth() works better than our own 1564 * Assume the library function wcwidth() works better than our own
1545 * stuff. It should return 1 for ambiguous width chars! 1565 * stuff. It should return 1 for ambiguous width chars!
2665 {0x303d, 0x303d}, 2685 {0x303d, 0x303d},
2666 {0x3297, 0x3297}, 2686 {0x3297, 0x3297},
2667 {0x3299, 0x3299}, 2687 {0x3299, 0x3299},
2668 {0x1f004, 0x1f004}, 2688 {0x1f004, 0x1f004},
2669 {0x1f0cf, 0x1f0cf}, 2689 {0x1f0cf, 0x1f0cf},
2670 {0x1f170, 0x1f171}, 2690 {0x1f170, 0x1f189},
2671 {0x1f17e, 0x1f17f},
2672 {0x1f18e, 0x1f18e}, 2691 {0x1f18e, 0x1f18e},
2673 {0x1f191, 0x1f19a}, 2692 {0x1f191, 0x1f19a},
2674 {0x1f1e6, 0x1f1ff}, 2693 {0x1f1e6, 0x1f1ff},
2675 {0x1f201, 0x1f202}, 2694 {0x1f201, 0x1f202},
2676 {0x1f21a, 0x1f21a}, 2695 {0x1f21a, 0x1f21a},
2833 if (vim_iswordc_buf(c, buf)) 2852 if (vim_iswordc_buf(c, buf))
2834 return 2; // word character 2853 return 2; // word character
2835 return 1; // punctuation 2854 return 1; // punctuation
2836 } 2855 }
2837 2856
2857 // emoji
2858 if (intable(emoji_all, sizeof(emoji_all), c))
2859 return 3;
2860
2838 // binary search in table 2861 // binary search in table
2839 while (top >= bot) 2862 while (top >= bot)
2840 { 2863 {
2841 mid = (bot + top) / 2; 2864 mid = (bot + top) / 2;
2842 if (classes[mid].last < (unsigned int)c) 2865 if (classes[mid].last < (unsigned int)c)
2844 else if (classes[mid].first > (unsigned int)c) 2867 else if (classes[mid].first > (unsigned int)c)
2845 top = mid - 1; 2868 top = mid - 1;
2846 else 2869 else
2847 return (int)classes[mid].class; 2870 return (int)classes[mid].class;
2848 } 2871 }
2849
2850 // emoji
2851 if (intable(emoji_all, sizeof(emoji_all), c))
2852 return 3;
2853 2872
2854 // most other characters are "word" characters 2873 // most other characters are "word" characters
2855 return 2; 2874 return 2;
2856 } 2875 }
2857 2876
5350 } 5369 }
5351 5370
5352 return retval; 5371 return retval;
5353 } 5372 }
5354 5373
5374 #if defined(FEAT_EVAL) || defined(PROTO)
5375
5355 /* 5376 /*
5356 * Table set by setcellwidths(). 5377 * Table set by setcellwidths().
5357 */ 5378 */
5358 typedef struct 5379 typedef struct
5359 { 5380 {
5523 vim_free(ptrs); 5544 vim_free(ptrs);
5524 vim_free(cw_table); 5545 vim_free(cw_table);
5525 cw_table = table; 5546 cw_table = table;
5526 cw_table_size = l->lv_len; 5547 cw_table_size = l->lv_len;
5527 } 5548 }
5549
/*
 * f_charclass(): store the character class of a string argument in "rettv".
 *
 * argvars[0] must be a string that is neither NULL nor empty; otherwise the
 * e_stringreq error is reported with emsg() and "rettv" is not written here.
 * On success the value returned by mb_get_class() for the string is stored
 * in rettv->vval.v_number.
 *
 * NOTE(review): presumably this is the implementation of the charclass()
 * function named in the patch description, and presumably mb_get_class()
 * classifies the character at the start of the string - neither is visible
 * here, confirm against the function table and mb_get_class().
 * NOTE(review): "rettv" is tagged UNUSED although it is written on the
 * success path.
 */
5550 void
5551 f_charclass(typval_T *argvars, typval_T *rettv UNUSED)
5552 {
5553 if (argvars[0].v_type != VAR_STRING	// must be a string ...
5554 || argvars[0].vval.v_string == NULL	// ... with a non-NULL pointer ...
5555 || *argvars[0].vval.v_string == NUL)	// ... and at least one byte
5556 {
5557 emsg(_(e_stringreq));
5558 return;	// "rettv" is left as the caller passed it
5559 }
5560 rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string);
5561 }
5562 #endif