diff src/mbyte.c @ 20695:cea8ae407452 v8.2.0901

patch 8.2.0901: formatting CJK text isn't optimal Commit: https://github.com/vim/vim/commit/e52702f00322c8a8861efd0bd6a3775e685e5685 Author: Bram Moolenaar <Bram@vim.org> Date: Thu Jun 4 18:22:13 2020 +0200 patch 8.2.0901: formatting CJK text isn't optimal Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes https://github.com/vim/vim/issues/3875)
author Bram Moolenaar <Bram@vim.org>
date Thu, 04 Jun 2020 18:30:04 +0200
parents 6c5b11458f31
children 0bc43a704f56
line wrap: on
line diff
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3843,6 +3843,158 @@ utf_head_off(char_u *base, char_u *p)
 }
 
 /*
+ * Whether space is NOT allowed before/after "cc".
+ * Returns non-zero when "cc" lies in one of the punctuation ranges tested
+ * below (general, supplemental, CJK and fullwidth punctuation), zero
+ * otherwise.  All range bounds are inclusive.
+ */
+    int
+utf_eat_space(int cc)
+{
+    return ((cc >= 0x2000 && cc <= 0x206F)	// General punctuation
+	 || (cc >= 0x2e00 && cc <= 0x2e7f)	// Supplemental punctuation
+	 || (cc >= 0x3000 && cc <= 0x303f)	// CJK symbols and punctuation
+	 || (cc >= 0xff01 && cc <= 0xff0f)	// Fullwidth ASCII punctuation, 1st group
+	 || (cc >= 0xff1a && cc <= 0xff20)	// idem, 2nd group
+	 || (cc >= 0xff3b && cc <= 0xff40)	// idem, 3rd group
+	 || (cc >= 0xff5b && cc <= 0xff65));	// idem, 4th group (range runs up to 0xff65)
+}
+
+/*
+ * Whether line break is allowed before "cc".
+ * Returns FALSE when "cc" is one of the characters in BOL_prohibition_punct[]
+ * (closing brackets and quotes, separating and sentence-ending punctuation,
+ * unit signs), which must not be placed at the start of a line.  Returns
+ * non-zero for every other character.
+ * The table is searched with a binary search: entries MUST be kept sorted in
+ * ascending order of character value.
+ */
+    int
+utf_allow_break_before(int cc)
+{
+    static const int BOL_prohibition_punct[] =
+    {
+	'!',
+	'%',
+	')',
+	',',
+	':',
+	';',
+	'>',
+	'?',
+	']',
+	'}',
+	0x2019, // ’ right single quotation mark
+	0x201d, // ” right double quotation mark
+	0x2020, // † dagger
+	0x2021, // ‡ double dagger
+	0x2026, // … horizontal ellipsis
+	0x2030, // ‰ per mille sign
+	0x2031, // ‱ per ten thousand sign
+	0x203c, // ‼ double exclamation mark
+	0x2047, // ⁇ double question mark
+	0x2048, // ⁈ question exclamation mark
+	0x2049, // ⁉ exclamation question mark
+	0x2103, // ℃ degree celsius
+	0x2109, // ℉ degree fahrenheit
+	0x3001, // 、 ideographic comma
+	0x3002, // 。 ideographic full stop
+	0x3009, // 〉 right angle bracket
+	0x300b, // 》 right double angle bracket
+	0x300d, // 」 right corner bracket
+	0x300f, // 』 right white corner bracket
+	0x3011, // 】 right black lenticular bracket
+	0x3015, // 〕 right tortoise shell bracket
+	0x3017, // 〗 right white lenticular bracket
+	0x3019, // 〙 right white tortoise shell bracket
+	0x301b, // 〛 right white square bracket
+	0xff01, // ！ fullwidth exclamation mark
+	0xff09, // ） fullwidth right parenthesis
+	0xff0c, // ， fullwidth comma
+	0xff0e, // ． fullwidth full stop
+	0xff1a, // ： fullwidth colon
+	0xff1b, // ； fullwidth semicolon
+	0xff1f, // ？ fullwidth question mark
+	0xff3d, // ］ fullwidth right square bracket
+	0xff5d, // ｝ fullwidth right curly bracket
+    };
+
+    int first = 0;
+    int last  = sizeof(BOL_prohibition_punct)/sizeof(int) - 1;
+    int mid   = 0;
+
+    while (first < last)	// stops with at most one untested entry, at "first"
+    {
+	mid = (first + last)/2;
+
+	if (cc == BOL_prohibition_punct[mid])
+	    return FALSE;
+	else if (cc > BOL_prohibition_punct[mid])
+	    first = mid + 1;
+	else
+	    last = mid - 1;
+    }
+
+    return cc != BOL_prohibition_punct[first];	// "first" is still a valid index here
+}
+
+/*
+ * Whether line break is allowed after "cc".
+ * Returns FALSE when "cc" is one of the characters in EOL_prohibition_punct[]
+ * (opening brackets and quotes), which must not be placed at the end of a
+ * line.  Returns non-zero for every other character.
+ * The table is searched with a binary search: entries MUST be kept sorted in
+ * ascending order of character value.
+ */
+    static int
+utf_allow_break_after(int cc)
+{
+    static const int EOL_prohibition_punct[] =
+    {
+	'(',
+	'<',
+	'[',
+	'`',
+	'{',
+	//0x2014, // — em dash
+	0x2018, // ‘ left single quotation mark
+	0x201c, // “ left double quotation mark
+	//0x2053, // ⁓ swung dash
+	0x3008, // 〈 left angle bracket
+	0x300a, // 《 left double angle bracket
+	0x300c, // 「 left corner bracket
+	0x300e, // 『 left white corner bracket
+	0x3010, // 【 left black lenticular bracket
+	0x3014, // 〔 left tortoise shell bracket
+	0x3016, // 〖 left white lenticular bracket
+	0x3018, // 〘 left white tortoise shell bracket
+	0x301a, // 〚 left white square bracket
+	0xff08, // （ fullwidth left parenthesis
+	0xff3b, // ［ fullwidth left square bracket
+	0xff5b, // ｛ fullwidth left curly bracket
+    };
+
+    int first = 0;
+    int last  = sizeof(EOL_prohibition_punct)/sizeof(int) - 1;
+    int mid   = 0;
+
+    while (first < last)	// stops with at most one untested entry, at "first"
+    {
+	mid = (first + last)/2;
+
+	if (cc == EOL_prohibition_punct[mid])
+	    return FALSE;
+	else if (cc > EOL_prohibition_punct[mid])
+	    first = mid + 1;
+	else
+	    last = mid - 1;
+    }
+
+    return cc != EOL_prohibition_punct[first];	// "first" is still a valid index here
+}
+
+/*
+ * Whether line break is allowed between "cc" and "ncc".
+ * "cc" is the character that would end up before the break and "ncc" the
+ * character that would end up after it.
+ * Returns FALSE when both are the same em dash or horizontal ellipsis.
+ * Otherwise the break is allowed only when it is allowed after "cc" and also
+ * allowed before "ncc".
+ */
+    int
+utf_allow_break(int cc, int ncc)
+{
+    // don't break between two-letter punctuations
+    if (cc == ncc
+	    && (cc == 0x2014 // em dash
+		|| cc == 0x2026)) // horizontal ellipsis
+	return FALSE;
+
+    return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
+}
+
+/*
  * Copy a character from "*fp" to "*tp" and advance the pointers.
  */
     void