comparison src/mbyte.c @ 20695:cea8ae407452 v8.2.0901

patch 8.2.0901: formatting CJK text isn't optimal Commit: https://github.com/vim/vim/commit/e52702f00322c8a8861efd0bd6a3775e685e5685 Author: Bram Moolenaar <Bram@vim.org> Date: Thu Jun 4 18:22:13 2020 +0200 patch 8.2.0901: formatting CJK text isn't optimal Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes https://github.com/vim/vim/issues/3875)
author Bram Moolenaar <Bram@vim.org>
date Thu, 04 Jun 2020 18:30:04 +0200
parents 6c5b11458f31
children 0bc43a704f56
comparison
equal deleted inserted replaced
20694:3a049f4bdaa2 20695:cea8ae407452
3841 3841
3842 return (int)(p - q); 3842 return (int)(p - q);
3843 } 3843 }
3844 3844
/*
 * Whether space is NOT allowed before/after "cc".
 * Returns 1 when "cc" lies in one of the punctuation ranges listed below,
 * 0 otherwise.
 */
    int
utf_eat_space(int cc)
{
    // Inclusive code point ranges.
    static const struct
    {
        int first;
        int last;
    } punct_ranges[] =
    {
        {0x2000, 0x206f},   // General punctuations
        {0x2e00, 0x2e7f},   // Supplemental punctuations
        {0x3000, 0x303f},   // CJK symbols and punctuations
        {0xff01, 0xff0f},   // Full width ASCII punctuations
        {0xff1a, 0xff20},   // ..
        {0xff3b, 0xff40},   // ..
        {0xff5b, 0xff65},   // ..
    };
    int count = (int)(sizeof(punct_ranges) / sizeof(punct_ranges[0]));
    int i;

    // Stop at the first range that contains "cc"; "i" reaches "count" only
    // when no range matched.
    for (i = 0; i < count; ++i)
        if (cc >= punct_ranges[i].first && cc <= punct_ranges[i].last)
            break;

    return i < count;
}
3859
/*
 * Whether line break is allowed before "cc".
 * Returns 0 when "cc" is listed in the table of characters that may not
 * start a line, 1 otherwise.
 */
    int
utf_allow_break_before(int cc)
{
    // Characters that may not appear at the start of a line.
    // Keep this sorted in ascending order: it is binary-searched below.
    static const int no_line_start[] =
    {
        '!', '%', ')', ',', ':', ';', '>', '?', ']', '}',
        0x2019, // ’ right single quotation mark
        0x201d, // ” right double quotation mark
        0x2020, // † dagger
        0x2021, // ‡ double dagger
        0x2026, // … horizontal ellipsis
        0x2030, // ‰ per mille sign
        0x2031, // ‱ per ten thousand sign
        0x203c, // ‼ double exclamation mark
        0x2047, // ⁇ double question mark
        0x2048, // ⁈ question exclamation mark
        0x2049, // ⁉ exclamation question mark
        0x2103, // ℃ degree celsius
        0x2109, // ℉ degree fahrenheit
        0x3001, // 、 ideographic comma
        0x3002, // 。 ideographic full stop
        0x3009, // 〉 right angle bracket
        0x300b, // 》 right double angle bracket
        0x300d, // 」 right corner bracket
        0x300f, // 』 right white corner bracket
        0x3011, // 】 right black lenticular bracket
        0x3015, // 〕 right tortoise shell bracket
        0x3017, // 〗 right white lenticular bracket
        0x3019, // 〙 right white tortoise shell bracket
        0x301b, // 〛 right white square bracket
        0xff01, // fullwidth exclamation mark
        0xff09, // fullwidth right parenthesis
        0xff0c, // fullwidth comma
        0xff0e, // fullwidth full stop
        0xff1a, // fullwidth colon
        0xff1b, // fullwidth semicolon
        0xff1f, // fullwidth question mark
        0xff3d, // fullwidth right square bracket
        0xff5d, // fullwidth right curly bracket
    };
    int lo = 0;
    int hi = (int)(sizeof(no_line_start) / sizeof(no_line_start[0])) - 1;

    while (lo < hi)
    {
        int probe = (lo + hi) / 2;
        int entry = no_line_start[probe];

        if (entry < cc)
            lo = probe + 1;     // probe < hi here, so "lo" stays in range
        else if (entry > cc)
            hi = probe - 1;
        else
        {
            // Exact hit: let the final comparison report it.
            lo = probe;
            break;
        }
    }

    // "lo" is either the index of a match or the single candidate that the
    // loop did not compare yet.
    return cc != no_line_start[lo];
}
3931
/*
 * Whether line break is allowed after "cc".
 * Returns 0 when "cc" is listed in the table of characters that may not
 * end a line, 1 otherwise.
 */
    static int
utf_allow_break_after(int cc)
{
    // Characters that may not appear at the end of a line.
    // Keep this sorted in ascending order: it is binary-searched below.
    // Currently left out: 0x2014 (em dash) and 0x2053 (swung dash).
    static const int no_line_end[] =
    {
        '(', '<', '[', '`', '{',
        0x2018, // ‘ left single quotation mark
        0x201c, // “ left double quotation mark
        0x3008, // 〈 left angle bracket
        0x300a, // 《 left double angle bracket
        0x300c, // 「 left corner bracket
        0x300e, // 『 left white corner bracket
        0x3010, // 【 left black lenticular bracket
        0x3014, // 〔 left tortoise shell bracket
        0x3016, // 〖 left white lenticular bracket
        0x3018, // 〘 left white tortoise shell bracket
        0x301a, // 〚 left white square bracket
        0xff08, // fullwidth left parenthesis
        0xff3b, // fullwidth left square bracket
        0xff5b, // fullwidth left curly bracket
    };
    int lo = 0;
    int hi = (int)(sizeof(no_line_end) / sizeof(no_line_end[0])) - 1;

    while (lo < hi)
    {
        int probe = (lo + hi) / 2;
        int entry = no_line_end[probe];

        if (entry < cc)
            lo = probe + 1;     // probe < hi here, so "lo" stays in range
        else if (entry > cc)
            hi = probe - 1;
        else
        {
            // Exact hit: let the final comparison report it.
            lo = probe;
            break;
        }
    }

    // "lo" is either the index of a match or the single candidate that the
    // loop did not compare yet.
    return cc != no_line_end[lo];
}
3981
/*
 * Whether line break is allowed between "cc" and "ncc".
 * "cc" is the character before the candidate break position and "ncc" the
 * character after it.
 */
    int
utf_allow_break(int cc, int ncc)
{
    // Two-letter punctuation: the same character twice, either an em dash
    // or a horizontal ellipsis.  Never break in the middle of such a pair.
    int is_doubled_punct = (cc == ncc)
                                && (cc == 0x2014        // em dash
                                    || cc == 0x2026);   // horizontal ellipsis

    return !is_doubled_punct
        && utf_allow_break_after(cc)
        && utf_allow_break_before(ncc);
}
3996
3997 /*
3846 * Copy a character from "*fp" to "*tp" and advance the pointers. 3998 * Copy a character from "*fp" to "*tp" and advance the pointers.
3847 */ 3999 */
3848 void 4000 void
3849 mb_copy_char(char_u **fp, char_u **tp) 4001 mb_copy_char(char_u **fp, char_u **tp)
3850 { 4002 {