changeset 32307:8d6f53a07ffd v9.0.1485

patch 9.0.1485: no functions for converting from/to UTF-16 index Commit: https://github.com/vim/vim/commit/67672ef097dd708244ff042a8364994da2b91e75 Author: Christian Brabandt <cb@256bit.org> Date: Mon Apr 24 21:09:54 2023 +0100 patch 9.0.1485: no functions for converting from/to UTF-16 index Problem: no functions for converting from/to UTF-16 index. Solution: Add UTF-16 flag to existing functions and add strutf16len() and utf16idx(). (Yegappan Lakshmanan, closes #12216)
author Bram Moolenaar <Bram@vim.org>
date Mon, 24 Apr 2023 22:15:05 +0200
parents 6d5e523b5b6a
children 78e122c7239f
files runtime/doc/builtin.txt runtime/doc/eval.txt runtime/doc/usr_41.txt src/evalfunc.c src/proto/strings.pro src/strings.c src/testdir/test_functions.vim src/version.c
diffstat 8 files changed, 677 insertions(+), 56 deletions(-) [+]
line wrap: on
line diff
--- a/runtime/doc/builtin.txt
+++ b/runtime/doc/builtin.txt
@@ -81,8 +81,10 @@ bufnr([{buf} [, {create}]])	Number	Numbe
 bufwinid({buf})			Number	window ID of buffer {buf}
 bufwinnr({buf})			Number	window number of buffer {buf}
 byte2line({byte})		Number	line number at byte count {byte}
-byteidx({expr}, {nr})		Number	byte index of {nr}'th char in {expr}
-byteidxcomp({expr}, {nr})	Number	byte index of {nr}'th char in {expr}
+byteidx({expr}, {nr} [, {utf16}])
+				Number	byte index of {nr}'th char in {expr}
+byteidxcomp({expr}, {nr} [, {utf16}])
+				Number	byte index of {nr}'th char in {expr}
 call({func}, {arglist} [, {dict}])
 				any	call {func} with arguments {arglist}
 ceil({expr})			Float	round {expr} up
@@ -117,7 +119,7 @@ changenr()			Number	current change numbe
 char2nr({expr} [, {utf8}])	Number	ASCII/UTF-8 value of first char in {expr}
 charclass({string})		Number	character class of {string}
 charcol({expr} [, {winid}])	Number	column number of cursor or mark
-charidx({string}, {idx} [, {countcc}])
+charidx({string}, {idx} [, {countcc} [, {utf16}]])
 				Number	char index of byte {idx} in {string}
 chdir({dir})			String	change current working directory
 cindent({lnum})			Number	C indent for line {lnum}
@@ -604,6 +606,8 @@ strptime({format}, {timestring})
 strridx({haystack}, {needle} [, {start}])
 				Number	last index of {needle} in {haystack}
 strtrans({expr})		String	translate string to make it printable
+strutf16len({string} [, {countcc}])
+				Number	number of UTF-16 code units in {string}
 strwidth({expr})		Number	display cell length of the String {expr}
 submatch({nr} [, {list}])	String or List
 					specific match in ":s" or substitute()
@@ -704,6 +708,8 @@ undofile({name})		String	undo file name 
 undotree()			List	undo file tree
 uniq({list} [, {func} [, {dict}]])
 				List	remove adjacent duplicates from a list
+utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
+				Number	UTF-16 index of byte {idx} in {string}
 values({dict})			List	values in {dict}
 virtcol({expr} [, {list}])	Number or List
 					screen column of cursor or mark
@@ -1363,7 +1369,7 @@ byte2line({byte})					*byte2line()*
 <		{not available when compiled without the |+byte_offset|
 		feature}
 
-byteidx({expr}, {nr})					*byteidx()*
+byteidx({expr}, {nr} [, {utf16}])			*byteidx()*
 		Return byte index of the {nr}'th character in the String
 		{expr}.  Use zero for the first character, it then returns
 		zero.
@@ -1373,6 +1379,13 @@ byteidx({expr}, {nr})					*byteidx()*
 		length is added to the preceding base character.  See
 		|byteidxcomp()| below for counting composing characters
 		separately.
+		When {utf16} is present and TRUE, {nr} is used as the UTF-16
+		index in the String {expr} instead of as the character index.
+		The UTF-16 index is the index in the string when it is encoded
+		with 16-bit words.  If the specified UTF-16 index is in the
+		middle of a character (e.g. in a 4-byte character), then the
+		byte index of the first byte in the character is returned.
+		Refer to |string-offset-encoding| for more information.
 		Example : >
 			echo matchstr(str, ".", byteidx(str, 3))
 <		will display the fourth character.  Another way to do the
@@ -1384,11 +1397,17 @@ byteidx({expr}, {nr})					*byteidx()*
 		If there are less than {nr} characters -1 is returned.
 		If there are exactly {nr} characters the length of the string
 		in bytes is returned.
-
+		See |charidx()| and |utf16idx()| for getting the character and
+		UTF-16 index respectively from the byte index.
+		Examples: >
+			echo byteidx('a😊😊', 2)	returns 5
+			echo byteidx('a😊😊', 2, 1)	returns 1
+			echo byteidx('a😊😊', 3, 1)	returns 5
+<
 		Can also be used as a |method|: >
 			GetName()->byteidx(idx)
 
-byteidxcomp({expr}, {nr})					*byteidxcomp()*
+byteidxcomp({expr}, {nr} [, {utf16}])			*byteidxcomp()*
 		Like byteidx(), except that a composing character is counted
 		as a separate character.  Example: >
 			let s = 'e' .. nr2char(0x301)
@@ -1493,27 +1512,36 @@ charcol({expr} [, {winid}])				*charcol(
 			GetPos()->col()
 <
 							*charidx()*
-charidx({string}, {idx} [, {countcc}])
+charidx({string}, {idx} [, {countcc} [, {utf16}]])
 		Return the character index of the byte at {idx} in {string}.
 		The index of the first character is zero.
 		If there are no multibyte characters the returned value is
 		equal to {idx}.
+
 		When {countcc} is omitted or |FALSE|, then composing characters
-		are not counted separately, their byte length is
-		added to the preceding base character.
+		are not counted separately, their byte length is added to the
+		preceding base character.
 		When {countcc} is |TRUE|, then composing characters are
 		counted as separate characters.
+
+		When {utf16} is present and TRUE, {idx} is used as the UTF-16
+		index in the String {string} instead of as the byte index.
+
 		Returns -1 if the arguments are invalid or if {idx} is greater
 		than the index of the last byte in {string}.  An error is
 		given if the first argument is not a string, the second
 		argument is not a number or when the third argument is present
 		and is not zero or one.
+
 		See |byteidx()| and |byteidxcomp()| for getting the byte index
-		from the character index.
+		from the character index and |utf16idx()| for getting the
+		UTF-16 index from the character index.
+		Refer to |string-offset-encoding| for more information.
 		Examples: >
 			echo charidx('áb́ć', 3)		returns 1
 			echo charidx('áb́ć', 6, 1)	returns 4
 			echo charidx('áb́ć', 16)		returns -1
+			echo charidx('a😊😊', 4, 0, 1)	returns 2
 <
 		Can also be used as a |method|: >
 			GetName()->charidx(idx)
@@ -9244,6 +9272,28 @@ strtrans({string})					*strtrans()*
 		Can also be used as a |method|: >
 			GetString()->strtrans()
 
+strutf16len({string} [, {countcc}])			*strutf16len()*
+		The result is a Number, which is the number of UTF-16 code
+		units in String {string} (after converting it to UTF-16).
+
+		When {countcc} is TRUE, composing characters are counted
+		separately.
+		When {countcc} is omitted or FALSE, composing characters are
+		ignored.
+
+		Returns zero on error.
+
+		Also see |strlen()| and |strcharlen()|.
+		Examples: >
+		    echo strutf16len('a')		returns 1
+		    echo strutf16len('©')		returns 1
+		    echo strutf16len('😊')		returns 2
+		    echo strutf16len('ą́')		returns 1
+		    echo strutf16len('ą́', v:true)	returns 3
+
+		Can also be used as a |method|: >
+			GetText()->strutf16len()
+<
 strwidth({string})					*strwidth()*
 		The result is a Number, which is the number of display cells
 		String {string} occupies.  A Tab character is counted as one
@@ -10059,6 +10109,34 @@ uniq({list} [, {func} [, {dict}]])			*un
 
 		Can also be used as a |method|: >
 			mylist->uniq()
+<
+							*utf16idx()*
+utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
+		Same as |charidx()| but returns the UTF-16 index of the byte
+		at {idx} in {string} (after converting it to UTF-16).
+
+		When {charidx} is present and TRUE, {idx} is used as the
+		character index in the String {string} instead of as the byte
+		index.
+		An {idx} in the middle of a UTF-8 sequence is rounded upwards
+		to the end of that sequence.
+
+		See |byteidx()| and |byteidxcomp()| for getting the byte index
+		from the UTF-16 index and |charidx()| for getting the
+		character index from the UTF-16 index.
+		Refer to |string-offset-encoding| for more information.
+		Examples: >
+			echo utf16idx('a😊😊', 3)	returns 2
+			echo utf16idx('a😊😊', 7)	returns 4
+			echo utf16idx('a😊😊', 1, 0, 1)	returns 2
+			echo utf16idx('a😊😊', 2, 0, 1)	returns 4
+			echo utf16idx('aą́c', 6)		returns 2
+			echo utf16idx('aą́c', 6, 1)	returns 4
+			echo utf16idx('a😊😊', 9)	returns -1
+<
+		Can also be used as a |method|: >
+			GetName()->utf16idx(idx)
+
 
 values({dict})						*values()*
 		Return a |List| with all the values of {dict}.  The |List| is
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -1580,6 +1580,33 @@ Examples: >
 	echo $"The square root of {{9}} is {sqrt(9)}"
 <	The square root of {9} is 3.0 ~
 
+						*string-offset-encoding*
+A string consists of multiple characters.  How the characters are stored
+depends on 'encoding'.  Most common is UTF-8, which uses one byte for ASCII
+characters, two bytes for other Latin characters and more bytes for other
+characters.
+
+A string offset can count characters or bytes.  Other programs may use
+UTF-16 encoding (16-bit words) and an offset of UTF-16 words.  Some functions
+use byte offsets, usually for UTF-8 encoding.  Other functions use character
+offsets, in which case the encoding doesn't matter.
+
+The different offsets for the string "a©😊" are below:
+
+  UTF-8 offsets:
+      [0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
+  UTF-16 offsets:
+      [0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
+  UTF-32 (character) offsets:
+      [0]: 00000061, [1]: 000000A9, [2]: 0001F60A
+
+You can use the "g8" and "ga" commands on a character to see the
+decimal/hex/octal values.
+
+The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
+between these indices.  The functions |strlen()|, |strutf16len()| and
+|strcharlen()| return the number of bytes, UTF-16 code units and characters in
+a string respectively.
 
 option						*expr-option* *E112* *E113*
 ------
--- a/runtime/doc/usr_41.txt
+++ b/runtime/doc/usr_41.txt
@@ -754,6 +754,7 @@ String manipulation:					*string-functio
 	strlen()		length of a string in bytes
 	strcharlen()		length of a string in characters
 	strchars()		number of characters in a string
+	strutf16len()		number of UTF-16 code units in a string
 	strwidth()		size of string when displayed
 	strdisplaywidth()	size of string when displayed, deals with tabs
 	setcellwidths()		set character cell width overrides
@@ -771,6 +772,7 @@ String manipulation:					*string-functio
 	byteidx()		byte index of a character in a string
 	byteidxcomp()		like byteidx() but count composing characters
 	charidx()		character index of a byte in a string
+	utf16idx()		UTF-16 index of a byte in a string
 	repeat()		repeat a string multiple times
 	eval()			evaluate a string expression
 	execute()		execute an Ex command and get the output
--- a/src/evalfunc.c
+++ b/src/evalfunc.c
@@ -1751,9 +1751,9 @@ static funcentry_T global_functions[] =
 			ret_number,	    f_bufwinnr},
     {"byte2line",	1, 1, FEARG_1,	    arg1_number,
 			ret_number,	    f_byte2line},
-    {"byteidx",		2, 2, FEARG_1,	    arg2_string_number,
+    {"byteidx",		2, 3, FEARG_1,	    arg3_string_number_bool,
 			ret_number,	    f_byteidx},
-    {"byteidxcomp",	2, 2, FEARG_1,	    arg2_string_number,
+    {"byteidxcomp",	2, 3, FEARG_1,	    arg3_string_number_bool,
 			ret_number,	    f_byteidxcomp},
     {"call",		2, 3, FEARG_1,	    arg3_any_list_dict,
 			ret_any,	    f_call},
@@ -1803,7 +1803,7 @@ static funcentry_T global_functions[] =
 			ret_number,	    f_charclass},
     {"charcol",		1, 2, FEARG_1,	    arg2_string_or_list_number,
 			ret_number,	    f_charcol},
-    {"charidx",		2, 3, FEARG_1,	    arg3_string_number_bool,
+    {"charidx",		2, 4, FEARG_1,	    arg3_string_number_bool,
 			ret_number,	    f_charidx},
     {"chdir",		1, 1, FEARG_1,	    arg1_string,
 			ret_string,	    f_chdir},
@@ -2601,6 +2601,8 @@ static funcentry_T global_functions[] =
 			ret_number,	    f_strridx},
     {"strtrans",	1, 1, FEARG_1,	    arg1_string,
 			ret_string,	    f_strtrans},
+    {"strutf16len",	1, 2, FEARG_1,	    arg2_string_bool,
+			ret_number,	    f_strutf16len},
     {"strwidth",	1, 1, FEARG_1,	    arg1_string,
 			ret_number,	    f_strwidth},
     {"submatch",	1, 2, FEARG_1,	    arg2_number_bool,
@@ -2785,6 +2787,8 @@ static funcentry_T global_functions[] =
 			ret_dict_any,	    f_undotree},
     {"uniq",		1, 3, FEARG_1,	    arg13_sortuniq,
 			ret_first_arg,	    f_uniq},
+    {"utf16idx",	2, 4, FEARG_1,	    arg3_string_number_bool,
+			ret_number,	    f_utf16idx},
     {"values",		1, 1, FEARG_1,	    arg1_dict_any,
 			ret_list_member,    f_values},
     {"virtcol",		1, 2, FEARG_1,	    arg2_string_or_list_bool,
--- a/src/proto/strings.pro
+++ b/src/proto/strings.pro
@@ -36,12 +36,14 @@ void f_string(typval_T *argvars, typval_
 void f_strlen(typval_T *argvars, typval_T *rettv);
 void f_strcharlen(typval_T *argvars, typval_T *rettv);
 void f_strchars(typval_T *argvars, typval_T *rettv);
+void f_strutf16len(typval_T *argvars, typval_T *rettv);
 void f_strdisplaywidth(typval_T *argvars, typval_T *rettv);
 void f_strwidth(typval_T *argvars, typval_T *rettv);
 void f_strcharpart(typval_T *argvars, typval_T *rettv);
 void f_strpart(typval_T *argvars, typval_T *rettv);
 void f_strridx(typval_T *argvars, typval_T *rettv);
 void f_strtrans(typval_T *argvars, typval_T *rettv);
+void f_utf16idx(typval_T *argvars, typval_T *rettv);
 void f_tolower(typval_T *argvars, typval_T *rettv);
 void f_toupper(typval_T *argvars, typval_T *rettv);
 void f_tr(typval_T *argvars, typval_T *rettv);
--- a/src/strings.c
+++ b/src/strings.c
@@ -1006,10 +1006,6 @@ string_reduce(
     static void
 byteidx(typval_T *argvars, typval_T *rettv, int comp UNUSED)
 {
-    char_u	*t;
-    char_u	*str;
-    varnumber_T	idx;
-
     rettv->vval.v_number = -1;
 
     if (in_vim9script()
@@ -1017,20 +1013,42 @@ byteidx(typval_T *argvars, typval_T *ret
 		|| check_for_number_arg(argvars, 1) == FAIL))
 	return;
 
-    str = tv_get_string_chk(&argvars[0]);
-    idx = tv_get_number_chk(&argvars[1], NULL);
+    char_u *str = tv_get_string_chk(&argvars[0]);
+    varnumber_T	idx = tv_get_number_chk(&argvars[1], NULL);
     if (str == NULL || idx < 0)
 	return;
 
-    t = str;
+    varnumber_T	utf16idx = FALSE;
+    if (argvars[2].v_type != VAR_UNKNOWN)
+    {
+	utf16idx = tv_get_bool(&argvars[2]);
+	if (utf16idx < 0 || utf16idx > 1)
+	{
+	    semsg(_(e_using_number_as_bool_nr), utf16idx);
+	    return;
+	}
+    }
+
+    int (*ptr2len)(char_u *);
+    if (enc_utf8 && comp)
+	ptr2len = utf_ptr2len;
+    else
+	ptr2len = mb_ptr2len;
+
+    char_u *t = str;
     for ( ; idx > 0; idx--)
     {
 	if (*t == NUL)		// EOL reached
 	    return;
-	if (enc_utf8 && comp)
-	    t += utf_ptr2len(t);
-	else
-	    t += (*mb_ptr2len)(t);
+	if (utf16idx)
+	{
+	    int clen = ptr2len(t);
+	    int c = (clen > 1) ? utf_ptr2char(t) : *t;
+	    if (c > 0xFFFF)
+		idx--;
+	}
+	if (idx > 0)
+	    t += ptr2len(t);
     }
     rettv->vval.v_number = (varnumber_T)(t - str);
 }
@@ -1059,42 +1077,49 @@ f_byteidxcomp(typval_T *argvars, typval_
     void
 f_charidx(typval_T *argvars, typval_T *rettv)
 {
-    char_u	*str;
-    varnumber_T	idx;
-    varnumber_T	countcc = FALSE;
-    char_u	*p;
-    int		len;
-    int		(*ptr2len)(char_u *);
-
     rettv->vval.v_number = -1;
 
-    if ((check_for_string_arg(argvars, 0) == FAIL
+    if (check_for_string_arg(argvars, 0) == FAIL
 		|| check_for_number_arg(argvars, 1) == FAIL
-		|| check_for_opt_bool_arg(argvars, 2) == FAIL))
+		|| check_for_opt_bool_arg(argvars, 2) == FAIL
+		|| (argvars[2].v_type != VAR_UNKNOWN
+		    && check_for_opt_bool_arg(argvars, 3) == FAIL))
 	return;
 
-    str = tv_get_string_chk(&argvars[0]);
-    idx = tv_get_number_chk(&argvars[1], NULL);
+    char_u *str = tv_get_string_chk(&argvars[0]);
+    varnumber_T	idx = tv_get_number_chk(&argvars[1], NULL);
     if (str == NULL || idx < 0)
 	return;
 
+    varnumber_T	countcc = FALSE;
+    varnumber_T	utf16idx = FALSE;
     if (argvars[2].v_type != VAR_UNKNOWN)
+    {
 	countcc = tv_get_bool(&argvars[2]);
-    if (countcc < 0 || countcc > 1)
-    {
-	semsg(_(e_using_number_as_bool_nr), countcc);
-	return;
+	if (argvars[3].v_type != VAR_UNKNOWN)
+	    utf16idx = tv_get_bool(&argvars[3]);
     }
 
+    int (*ptr2len)(char_u *);
     if (enc_utf8 && countcc)
 	ptr2len = utf_ptr2len;
     else
 	ptr2len = mb_ptr2len;
 
-    for (p = str, len = 0; p <= str + idx; len++)
+    char_u	*p;
+    int		len;
+    for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++)
     {
 	if (*p == NUL)
 	    return;
+	if (utf16idx)
+	{
+	    idx--;
+	    int clen = ptr2len(p);
+	    int c = (clen > 1) ? utf_ptr2char(p) : *p;
+	    if (c > 0xFFFF)
+		idx--;
+	}
 	p += ptr2len(p);
     }
 
@@ -1359,6 +1384,38 @@ f_strchars(typval_T *argvars, typval_T *
 }
 
 /*
+ * "strutf16len()" function
+ */
+    void
+f_strutf16len(typval_T *argvars, typval_T *rettv)
+{
+    rettv->vval.v_number = -1;
+
+    if (check_for_string_arg(argvars, 0) == FAIL
+	    || check_for_opt_bool_arg(argvars, 1) == FAIL)
+	return;
+
+    varnumber_T countcc = FALSE;
+    if (argvars[1].v_type != VAR_UNKNOWN)
+	countcc = tv_get_bool(&argvars[1]);
+
+    char_u		*s = tv_get_string(&argvars[0]);
+    varnumber_T		len = 0;
+    int			(*func_mb_ptr2char_adv)(char_u **pp);
+    int			ch;
+
+    func_mb_ptr2char_adv = countcc ? mb_cptr2char_adv : mb_ptr2char_adv;
+    while (*s != NUL)
+    {
+	ch = func_mb_ptr2char_adv(&s);
+	if (ch > 0xFFFF)
+	    ++len;
+	++len;
+    }
+    rettv->vval.v_number = len;
+}
+
+/*
  * "strdisplaywidth()" function
  */
     void
@@ -1619,6 +1676,61 @@ f_strtrans(typval_T *argvars, typval_T *
     rettv->vval.v_string = transstr(tv_get_string(&argvars[0]));
 }
 
+
+/*
+ * "utf16idx()" function
+ * Set "rettv" to the UTF-16 index for the byte or char index argvars[1].
+ */
+    void
+f_utf16idx(typval_T *argvars, typval_T *rettv)
+{
+    rettv->vval.v_number = -1;	// result for every early return below
+
+    if (check_for_string_arg(argvars, 0) == FAIL
+	    || check_for_opt_number_arg(argvars, 1) == FAIL
+	    || check_for_opt_bool_arg(argvars, 2) == FAIL
+	    || (argvars[2].v_type != VAR_UNKNOWN
+		    && check_for_opt_bool_arg(argvars, 3) == FAIL))
+	    return;
+
+    char_u *str = tv_get_string_chk(&argvars[0]);
+    varnumber_T	idx = tv_get_number_chk(&argvars[1], NULL);
+    if (str == NULL || idx < 0)	// a negative index is rejected
+	return;
+
+    varnumber_T	countcc = FALSE;
+    varnumber_T	charidx = FALSE;
+    if (argvars[2].v_type != VAR_UNKNOWN)
+    {
+	countcc = tv_get_bool(&argvars[2]);
+	if (argvars[3].v_type != VAR_UNKNOWN)	// only read when [2] is given
+	    charidx = tv_get_bool(&argvars[3]);
+    }
+
+    int (*ptr2len)(char_u *);	// returns the byte length used to step "p"
+    if (enc_utf8 && countcc)
+	ptr2len = utf_ptr2len;
+    else
+	ptr2len = mb_ptr2len;
+
+    char_u	*p;
+    int		len;		// UTF-16 code units counted so far
+    for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++)
+    {
+	if (*p == NUL)		// index is beyond the string: keep -1
+	    return;
+	int clen = ptr2len(p);
+	int c = (clen > 1) ? utf_ptr2char(p) : *p;
+	if (c > 0xFFFF)		// does not fit in one 16-bit unit
+	    len++;
+	p += ptr2len(p);
+	if (charidx)		// "idx" counts characters in this mode
+	    idx--;
+    }
+
+    rettv->vval.v_number = len > 0 ? len - 1 : 0;  // count to 0-based index
+}
+
 /*
  * "tolower(string)" function
  */
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -1192,19 +1192,14 @@ func Test_byte2line_line2byte()
   bw!
 endfunc
 
-" Test for byteidx() and byteidxcomp() functions
+" Test for byteidx() using a character index
 func Test_byteidx()
   let a = '.茅.' " one char of two bytes
   call assert_equal(0, byteidx(a, 0))
-  call assert_equal(0, byteidxcomp(a, 0))
   call assert_equal(1, byteidx(a, 1))
-  call assert_equal(1, byteidxcomp(a, 1))
   call assert_equal(3, byteidx(a, 2))
-  call assert_equal(3, byteidxcomp(a, 2))
   call assert_equal(4, byteidx(a, 3))
-  call assert_equal(4, byteidxcomp(a, 3))
   call assert_equal(-1, byteidx(a, 4))
-  call assert_equal(-1, byteidxcomp(a, 4))
 
   let b = '.e虂.' " normal e with composing char
   call assert_equal(0, b->byteidx(0))
@@ -1212,18 +1207,184 @@ func Test_byteidx()
   call assert_equal(4, b->byteidx(2))
   call assert_equal(5, b->byteidx(3))
   call assert_equal(-1, b->byteidx(4))
+
+  " string with multiple composing characters
+  let str = '-a台虂-a台虂'
+  call assert_equal(0, byteidx(str, 0))
+  call assert_equal(1, byteidx(str, 1))
+  call assert_equal(6, byteidx(str, 2))
+  call assert_equal(7, byteidx(str, 3))
+  call assert_equal(12, byteidx(str, 4))
+  call assert_equal(-1, byteidx(str, 5))
+
+  " empty string
+  call assert_equal(0, byteidx('', 0))
+  call assert_equal(-1, byteidx('', 1))
+
+  " error cases
   call assert_fails("call byteidx([], 0)", 'E730:')
-
+  call assert_fails("call byteidx('abc', [])", 'E745:')
+endfunc
+
+" Test for byteidxcomp() using a character index
+func Test_byteidxcomp()
+  let a = '.茅.' " one char of two bytes
+  call assert_equal(0, byteidxcomp(a, 0))
+  call assert_equal(1, byteidxcomp(a, 1))
+  call assert_equal(3, byteidxcomp(a, 2))
+  call assert_equal(4, byteidxcomp(a, 3))
+  call assert_equal(-1, byteidxcomp(a, 4))
+
+  let b = '.e虂.' " normal e with composing char
   call assert_equal(0, b->byteidxcomp(0))
   call assert_equal(1, b->byteidxcomp(1))
   call assert_equal(2, b->byteidxcomp(2))
   call assert_equal(4, b->byteidxcomp(3))
   call assert_equal(5, b->byteidxcomp(4))
   call assert_equal(-1, b->byteidxcomp(5))
+
+  " string with multiple composing characters
+  let str = '-a台虂-a台虂'
+  call assert_equal(0, byteidxcomp(str, 0))
+  call assert_equal(1, byteidxcomp(str, 1))
+  call assert_equal(2, byteidxcomp(str, 2))
+  call assert_equal(4, byteidxcomp(str, 3))
+  call assert_equal(6, byteidxcomp(str, 4))
+  call assert_equal(7, byteidxcomp(str, 5))
+  call assert_equal(8, byteidxcomp(str, 6))
+  call assert_equal(10, byteidxcomp(str, 7))
+  call assert_equal(12, byteidxcomp(str, 8))
+  call assert_equal(-1, byteidxcomp(str, 9))
+
+  " empty string
+  call assert_equal(0, byteidxcomp('', 0))
+  call assert_equal(-1, byteidxcomp('', 1))
+
+  " error cases
   call assert_fails("call byteidxcomp([], 0)", 'E730:')
+  call assert_fails("call byteidxcomp('abc', [])", 'E745:')
 endfunc
 
-" Test for charidx()
+" Test for byteidx() using a UTF-16 index
+func Test_byteidx_from_utf16_index()
+  " string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, byteidx(str, i, v:true))
+  endfor
+  call assert_equal(3, byteidx(str, 3, v:true))
+  call assert_equal(-1, byteidx(str, 4, v:true))
+
+  " string with two byte characters
+  let str = "a漏漏b"
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(3, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(6, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " string with four byte characters
+  let str = "a馃槉馃槉b"
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(1, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(5, byteidx(str, 4, v:true))
+  call assert_equal(9, byteidx(str, 5, v:true))
+  call assert_equal(10, byteidx(str, 6, v:true))
+  call assert_equal(-1, byteidx(str, 7, v:true))
+
+  " string with composing characters
+  let str = '-a虂-b虂'
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(4, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(8, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " string with multiple composing characters
+  let str = '-a台虂-a台虂'
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(6, byteidx(str, 2, v:true))
+  call assert_equal(7, byteidx(str, 3, v:true))
+  call assert_equal(12, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " empty string
+  call assert_equal(0, byteidx('', 0, v:true))
+  call assert_equal(-1, byteidx('', 1, v:true))
+
+  " error cases
+  call assert_fails('call byteidx(str, 0, [])', 'E745:')
+endfunc
+
+" Test for byteidxcomp() using a UTF-16 index
+func Test_byteidxcomp_from_utf16_index()
+  " string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, byteidxcomp(str, i, v:true))
+  endfor
+  call assert_equal(3, byteidxcomp(str, 3, v:true))
+  call assert_equal(-1, byteidxcomp(str, 4, v:true))
+
+  " string with two byte characters
+  let str = "a漏漏b"
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(3, byteidxcomp(str, 2, v:true))
+  call assert_equal(5, byteidxcomp(str, 3, v:true))
+  call assert_equal(6, byteidxcomp(str, 4, v:true))
+  call assert_equal(-1, byteidxcomp(str, 5, v:true))
+
+  " string with four byte characters
+  let str = "a馃槉馃槉b"
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(1, byteidxcomp(str, 2, v:true))
+  call assert_equal(5, byteidxcomp(str, 3, v:true))
+  call assert_equal(5, byteidxcomp(str, 4, v:true))
+  call assert_equal(9, byteidxcomp(str, 5, v:true))
+  call assert_equal(10, byteidxcomp(str, 6, v:true))
+  call assert_equal(-1, byteidxcomp(str, 7, v:true))
+
+  " string with composing characters
+  let str = '-a虂-b虂'
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(2, byteidxcomp(str, 2, v:true))
+  call assert_equal(4, byteidxcomp(str, 3, v:true))
+  call assert_equal(5, byteidxcomp(str, 4, v:true))
+  call assert_equal(6, byteidxcomp(str, 5, v:true))
+  call assert_equal(8, byteidxcomp(str, 6, v:true))
+  call assert_equal(-1, byteidxcomp(str, 7, v:true))
+  call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
+
+  " string with multiple composing characters
+  let str = '-a台虂-a台虂'
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(2, byteidxcomp(str, 2, v:true))
+  call assert_equal(4, byteidxcomp(str, 3, v:true))
+  call assert_equal(6, byteidxcomp(str, 4, v:true))
+  call assert_equal(7, byteidxcomp(str, 5, v:true))
+  call assert_equal(8, byteidxcomp(str, 6, v:true))
+  call assert_equal(10, byteidxcomp(str, 7, v:true))
+  call assert_equal(12, byteidxcomp(str, 8, v:true))
+  call assert_equal(-1, byteidxcomp(str, 9, v:true))
+
+  " empty string
+  call assert_equal(0, byteidxcomp('', 0, v:true))
+  call assert_equal(-1, byteidxcomp('', 1, v:true))
+
+  " error cases
+  call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
+endfunc
+
+" Test for charidx() using a byte index
 func Test_charidx()
   let a = 'xa虂b虂y'
   call assert_equal(0, charidx(a, 0))
@@ -1232,17 +1393,20 @@ func Test_charidx()
   call assert_equal(3, charidx(a, 7))
   call assert_equal(-1, charidx(a, 8))
   call assert_equal(-1, charidx(a, -1))
-  call assert_equal(-1, charidx('', 0))
-  call assert_equal(-1, charidx(test_null_string(), 0))
 
   " count composing characters
-  call assert_equal(0, charidx(a, 0, 1))
-  call assert_equal(2, charidx(a, 2, 1))
-  call assert_equal(3, charidx(a, 4, 1))
-  call assert_equal(5, charidx(a, 7, 1))
-  call assert_equal(-1, charidx(a, 8, 1))
+  call assert_equal(0, a->charidx(0, 1))
+  call assert_equal(2, a->charidx(2, 1))
+  call assert_equal(3, a->charidx(4, 1))
+  call assert_equal(5, a->charidx(7, 1))
+  call assert_equal(-1, a->charidx(8, 1))
+
+  " empty string
+  call assert_equal(-1, charidx('', 0))
   call assert_equal(-1, charidx('', 0, 1))
 
+  " error cases
+  call assert_equal(-1, charidx(test_null_string(), 0))
   call assert_fails('let x = charidx([], 1)', 'E1174:')
   call assert_fails('let x = charidx("abc", [])', 'E1210:')
   call assert_fails('let x = charidx("abc", 1, [])', 'E1212:')
@@ -1250,6 +1414,237 @@ func Test_charidx()
   call assert_fails('let x = charidx("abc", 1, 2)', 'E1212:')
 endfunc
 
+" Test for charidx() using a UTF-16 index
+func Test_charidx_from_utf16_index()
+  " string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 3, v:false, v:true))
+
+  " string with two byte characters
+  let str = "a©©b"
+  call assert_equal(0, charidx(str, 0, v:false, v:true))
+  call assert_equal(1, charidx(str, 1, v:false, v:true))
+  call assert_equal(2, charidx(str, 2, v:false, v:true))
+  call assert_equal(3, charidx(str, 3, v:false, v:true))
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+
+  " string with four byte characters
+  let str = "a😊😊b"
+  call assert_equal(0, charidx(str, 0, v:false, v:true))
+  call assert_equal(1, charidx(str, 1, v:false, v:true))
+  call assert_equal(1, charidx(str, 2, v:false, v:true))
+  call assert_equal(2, charidx(str, 3, v:false, v:true))
+  call assert_equal(2, charidx(str, 4, v:false, v:true))
+  call assert_equal(3, charidx(str, 5, v:false, v:true))
+  call assert_equal(-1, charidx(str, 6, v:false, v:true))
+
+  " string with composing characters
+  let str = '-á-b́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, charidx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 6, v:true, v:true))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, charidx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 8, v:true, v:true))
+
+  " empty string
+  call assert_equal(-1, charidx('', 0, v:false, v:true))
+  call assert_equal(-1, charidx('', 0, v:true, v:true))
+
+  " error cases
+  call assert_equal(-1, charidx('', 0, v:false, v:true))
+  call assert_equal(-1, charidx('', 0, v:true, v:true))
+  call assert_equal(-1, charidx(test_null_string(), 0, v:false, v:true))
+  call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:')
+  call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:')
+endfunc
+
+" Test for utf16idx() using a byte index
+func Test_utf16idx_from_byteidx()
+  " UTF-16 index of a string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, utf16idx(str, i))
+  endfor
+  call assert_equal(-1, utf16idx(str, 3))
+
+  " UTF-16 index of a string with two byte characters
+  let str = 'a©©b'
+  call assert_equal(0, str->utf16idx(0))
+  call assert_equal(1, str->utf16idx(1))
+  call assert_equal(1, str->utf16idx(2))
+  call assert_equal(2, str->utf16idx(3))
+  call assert_equal(2, str->utf16idx(4))
+  call assert_equal(3, str->utf16idx(5))
+  call assert_equal(-1, str->utf16idx(6))
+
+  " UTF-16 index of a string with four byte characters
+  let str = 'a😊😊b'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(2, utf16idx(str, 1))
+  call assert_equal(2, utf16idx(str, 2))
+  call assert_equal(2, utf16idx(str, 3))
+  call assert_equal(2, utf16idx(str, 4))
+  call assert_equal(4, utf16idx(str, 5))
+  call assert_equal(4, utf16idx(str, 6))
+  call assert_equal(4, utf16idx(str, 7))
+  call assert_equal(4, utf16idx(str, 8))
+  call assert_equal(5, utf16idx(str, 9))
+  call assert_equal(-1, utf16idx(str, 10))
+
+  " UTF-16 index of a string with composing characters
+  let str = '-á-b́'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(1, utf16idx(str, 1))
+  call assert_equal(1, utf16idx(str, 2))
+  call assert_equal(1, utf16idx(str, 3))
+  call assert_equal(2, utf16idx(str, 4))
+  call assert_equal(3, utf16idx(str, 5))
+  call assert_equal(3, utf16idx(str, 6))
+  call assert_equal(3, utf16idx(str, 7))
+  call assert_equal(-1, utf16idx(str, 8))
+  call assert_equal(0, utf16idx(str, 0, v:true))
+  call assert_equal(1, utf16idx(str, 1, v:true))
+  call assert_equal(2, utf16idx(str, 2, v:true))
+  call assert_equal(2, utf16idx(str, 3, v:true))
+  call assert_equal(3, utf16idx(str, 4, v:true))
+  call assert_equal(4, utf16idx(str, 5, v:true))
+  call assert_equal(5, utf16idx(str, 6, v:true))
+  call assert_equal(5, utf16idx(str, 7, v:true))
+  call assert_equal(-1, utf16idx(str, 8, v:true))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(1, utf16idx(str, 1))
+  call assert_equal(1, utf16idx(str, 2))
+  call assert_equal(1, utf16idx(str, 3))
+  call assert_equal(1, utf16idx(str, 4))
+  call assert_equal(1, utf16idx(str, 5))
+  call assert_equal(2, utf16idx(str, 6))
+  call assert_equal(3, utf16idx(str, 7))
+  call assert_equal(3, utf16idx(str, 8))
+  call assert_equal(3, utf16idx(str, 9))
+  call assert_equal(3, utf16idx(str, 10))
+  call assert_equal(3, utf16idx(str, 11))
+  call assert_equal(-1, utf16idx(str, 12))
+  call assert_equal(0, utf16idx(str, 0, v:true))
+  call assert_equal(1, utf16idx(str, 1, v:true))
+  call assert_equal(2, utf16idx(str, 2, v:true))
+  call assert_equal(2, utf16idx(str, 3, v:true))
+  call assert_equal(3, utf16idx(str, 4, v:true))
+  call assert_equal(3, utf16idx(str, 5, v:true))
+  call assert_equal(4, utf16idx(str, 6, v:true))
+  call assert_equal(5, utf16idx(str, 7, v:true))
+  call assert_equal(6, utf16idx(str, 8, v:true))
+  call assert_equal(6, utf16idx(str, 9, v:true))
+  call assert_equal(7, utf16idx(str, 10, v:true))
+  call assert_equal(7, utf16idx(str, 11, v:true))
+  call assert_equal(-1, utf16idx(str, 12, v:true))
+
+  " empty string
+  call assert_equal(-1, utf16idx('', 0))
+  call assert_equal(-1, utf16idx('', 0, v:true))
+
+  " error cases
+  call assert_equal(-1, utf16idx("", 0))
+  call assert_equal(-1, utf16idx("abc", -1))
+  call assert_equal(-1, utf16idx(test_null_string(), 0))
+  call assert_fails('let l = utf16idx([], 0)', 'E1174:')
+  call assert_fails('let l = utf16idx("ab", [])', 'E1210:')
+  call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:')
+endfunc
+
+" Test for utf16idx() using a character index
+func Test_utf16idx_from_charidx()
+  let str = "abc"
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 3, v:false, v:true))
+
+  " UTF-16 index of a string with two byte characters
+  let str = "a©©b"
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+
+  " UTF-16 index of a string with four byte characters
+  let str = "a😊😊b"
+  call assert_equal(0, utf16idx(str, 0, v:false, v:true))
+  call assert_equal(2, utf16idx(str, 1, v:false, v:true))
+  call assert_equal(4, utf16idx(str, 2, v:false, v:true))
+  call assert_equal(5, utf16idx(str, 3, v:false, v:true))
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+
+  " UTF-16 index of a string with composing characters
+  let str = '-á-b́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, utf16idx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 6, v:true, v:true))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, utf16idx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 8, v:true, v:true))
+
+  " empty string
+  call assert_equal(-1, utf16idx('', 0, v:false, v:true))
+  call assert_equal(-1, utf16idx('', 0, v:true, v:true))
+
+  " error cases
+  call assert_equal(-1, utf16idx(test_null_string(), 0, v:true, v:true))
+  call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:')
+endfunc
+
+" Test for strutf16len()
+func Test_strutf16len()
+  call assert_equal(3, strutf16len('abc'))
+  call assert_equal(3, 'abc'->strutf16len(v:true))
+  call assert_equal(4, strutf16len('a©©b'))
+  call assert_equal(4, strutf16len('a©©b', v:true))
+  call assert_equal(6, strutf16len('a😊😊b'))
+  call assert_equal(6, strutf16len('a😊😊b', v:true))
+  call assert_equal(4, strutf16len('-á-b́'))
+  call assert_equal(6, strutf16len('-á-b́', v:true))
+  call assert_equal(4, strutf16len('-ą́-ą́'))
+  call assert_equal(8, strutf16len('-ą́-ą́', v:true))
+  call assert_equal(0, strutf16len(''))
+
+  " error cases
+  call assert_fails('let l = strutf16len([])', 'E1174:')
+  call assert_fails('let l = strutf16len("a", [])', 'E1212:')
+  call assert_equal(0, strutf16len(test_null_string()))
+endfunc
+
 func Test_count()
   let l = ['a', 'a', 'A', 'b']
   call assert_equal(2, count(l, 'a'))
@@ -3074,5 +3469,4 @@ func Test_delfunc_while_listing()
   call StopVimInTerminal(buf)
 endfunc
 
-
 " vim: shiftwidth=2 sts=2 expandtab
--- a/src/version.c
+++ b/src/version.c
@@ -696,6 +696,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    1485,
+/**/
     1484,
 /**/
     1483,