changeset 22770:3e4981de5636 v8.2.1933

patch 8.2.1933: cannot sort using locale ordering Commit: https://github.com/vim/vim/commit/55e29611d20bca14fa5efc61385bc8a6b7acd9e2 Author: Bram Moolenaar <Bram@vim.org> Date: Sun Nov 1 13:57:44 2020 +0100 patch 8.2.1933: cannot sort using locale ordering Problem: Cannot sort using locale ordering. Solution: Add a flag for :sort and sort() to use the locale. (Dominique Pell?, closes #7237)
author Bram Moolenaar <Bram@vim.org>
date Sun, 01 Nov 2020 14:00:03 +0100
parents b03c250a83e7
children 84263588f5be
files runtime/doc/change.txt runtime/doc/eval.txt src/ex_cmds.c src/list.c src/testdir/test_sort.vim src/version.c
diffstat 6 files changed, 112 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/runtime/doc/change.txt
+++ b/runtime/doc/change.txt
@@ -1801,7 +1801,7 @@ Vim has a sorting function and a sorting
 found here: |sort()|, |uniq()|.
 
 							*:sor* *:sort*
-:[range]sor[t][!] [b][f][i][n][o][r][u][x] [/{pattern}/]
+:[range]sor[t][!] [b][f][i][l][n][o][r][u][x] [/{pattern}/]
 			Sort lines in [range].  When no range is given all
 			lines are sorted.
 
@@ -1809,6 +1809,14 @@ found here: |sort()|, |uniq()|.
 
 			With [i] case is ignored.
 
+			With [l] sort uses the current locale. See
+			`language collate` to check or set the locale used
+			for ordering. For example, with "en_US.UTF8",
+			Ö will be ordered after O and before P,
+			whereas with the Swedish locale "sv_SE.UTF8",
+			it will be after Z.
+			Case is typically ignored by the locale.
+
 			Options [n][f][x][o][b] are mutually exclusive.
 
 			With [n] sorting is done on the first decimal number
@@ -1875,8 +1883,7 @@ found here: |sort()|, |uniq()|.
 Note that using `:sort` with `:global` doesn't sort the matching lines, it's
 quite useless.
 
-The details about sorting depend on the library function used.  There is no
-guarantee that sorting obeys the current locale.  You will have to try it out.
+`:sort` does not use the current locale unless the l flag is used.
 Vim does do a "stable" sort.
 
 The sorting can be interrupted, but if you interrupt it too late in the
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -9700,6 +9700,13 @@ sort({list} [, {func} [, {dict}]])			*so
 		When {func} is given and it is '1' or 'i' then case is
 		ignored.
 
+		When {func} is given and it is 'l' then the current locale
+		is used for ordering. See `language collate` to check or set
+		the locale used for ordering.  For example, with "en_US.UTF8",
+		Ö will be ordered after O and before P, whereas with the
+		Swedish locale "sv_SE.UTF8", it will be after Z.
+		Case is typically ignored by the locale.
+
 		When {func} is given and it is 'n' then all items will be
 		sorted numerical (Implementation detail: This uses the
 		strtod() function to parse numbers, Strings, Lists, Dicts and
--- a/src/ex_cmds.c
+++ b/src/ex_cmds.c
@@ -277,6 +277,7 @@ linelen(int *has_tab)
 static char_u	*sortbuf1;
 static char_u	*sortbuf2;
 
+static int	sort_lc;	// sort using locale
 static int	sort_ic;	// ignore case
 static int	sort_nr;	// sort on number
 static int	sort_rx;	// sort on regex instead of skipping it
@@ -307,7 +308,13 @@ typedef struct
     } st_u;
 } sorti_T;
 
-static int sort_compare(const void *s1, const void *s2);
+    static int
+string_compare(const void *s1, const void *s2)
+{
+    if (sort_lc)
+	return strcoll((char *)s1, (char *)s2);
+    return sort_ic ? STRICMP(s1, s2) : STRCMP(s1, s2);
+}
 
     static int
 sort_compare(const void *s1, const void *s2)
@@ -350,8 +357,7 @@ sort_compare(const void *s1, const void 
 		     l2.st_u.line.end_col_nr - l2.st_u.line.start_col_nr + 1);
 	sortbuf2[l2.st_u.line.end_col_nr - l2.st_u.line.start_col_nr] = 0;
 
-	result = sort_ic ? STRICMP(sortbuf1, sortbuf2)
-						 : STRCMP(sortbuf1, sortbuf2);
+	result = string_compare(sortbuf1, sortbuf2);
     }
 
     // If two lines have the same value, preserve the original line order.
@@ -398,7 +404,7 @@ ex_sort(exarg_T *eap)
     if (nrs == NULL)
 	goto sortend;
 
-    sort_abort = sort_ic = sort_rx = sort_nr = 0;
+    sort_abort = sort_ic = sort_lc = sort_rx = sort_nr = 0;
 #ifdef FEAT_FLOAT
     sort_flt = 0;
 #endif
@@ -409,6 +415,8 @@ ex_sort(exarg_T *eap)
 	    ;
 	else if (*p == 'i')
 	    sort_ic = TRUE;
+	else if (*p == 'l')
+	    sort_lc = TRUE;
 	else if (*p == 'r')
 	    sort_rx = TRUE;
 	else if (*p == 'n')
@@ -614,8 +622,7 @@ ex_sort(exarg_T *eap)
 	    change_occurred = TRUE;
 
 	s = ml_get(get_lnum);
-	if (!unique || i == 0
-		|| (sort_ic ? STRICMP(s, sortbuf1) : STRCMP(s, sortbuf1)) != 0)
+	if (!unique || i == 0 || string_compare(s, sortbuf1) != 0)
 	{
 	    // Copy the line into a buffer, it may become invalid in
 	    // ml_append(). And it's needed for "unique".
--- a/src/list.c
+++ b/src/list.c
@@ -1516,6 +1516,7 @@ typedef struct
 typedef struct
 {
     int		item_compare_ic;
+    int		item_compare_lc;
     int		item_compare_numeric;
     int		item_compare_numbers;
 #ifdef FEAT_FLOAT
@@ -1594,10 +1595,10 @@ item_compare(const void *s1, const void 
 	p2 = (char_u *)"";
     if (!sortinfo->item_compare_numeric)
     {
-	if (sortinfo->item_compare_ic)
-	    res = STRICMP(p1, p2);
+	if (sortinfo->item_compare_lc)
+	    res = strcoll((char *)p1, (char *)p2);
 	else
-	    res = STRCMP(p1, p2);
+	    res = sortinfo->item_compare_ic ? STRICMP(p1, p2): STRCMP(p1, p2);
     }
     else
     {
@@ -1706,6 +1707,7 @@ do_sort_uniq(typval_T *argvars, typval_T
 	    goto theend;	// short list sorts pretty quickly
 
 	info.item_compare_ic = FALSE;
+	info.item_compare_lc = FALSE;
 	info.item_compare_numeric = FALSE;
 	info.item_compare_numbers = FALSE;
 #ifdef FEAT_FLOAT
@@ -1773,6 +1775,11 @@ do_sort_uniq(typval_T *argvars, typval_T
 			info.item_compare_func = NULL;
 			info.item_compare_ic = TRUE;
 		    }
+		    else if (STRCMP(info.item_compare_func, "l") == 0)
+		    {
+			info.item_compare_func = NULL;
+			info.item_compare_lc = TRUE;
+		    }
 		}
 	    }
 
--- a/src/testdir/test_sort.vim
+++ b/src/testdir/test_sort.vim
@@ -15,6 +15,25 @@ func Test_sort_strings()
   " numbers compared as strings
   call assert_equal([1, 2, 3], sort([3, 2, 1]))
   call assert_equal([13, 28, 3], sort([3, 28, 13]))
+
+  call assert_equal(['A', 'O', 'P', 'a', 'o', 'p', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ'],
+  \            sort(['A', 'O', 'P', 'a', 'o', 'p', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ']))
+
+  call assert_equal(['A', 'a', 'o', 'O', 'p', 'P', 'Ä', 'Ô', 'ä', 'ô', 'œ', 'œ'],
+  \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'i'))
+
+  let lc = execute('language collate')
+  " With the following locales, the accentuated letters are ordered
+  " similarly to the non-accentuated letters...
+  if lc =~? '"\(en\|es\|de\|fr\|it\|nl\).*\.utf-\?8"'
+    call assert_equal(['a', 'A', 'ä', 'Ä', 'o', 'O', 'ô', 'Ô', 'œ', 'œ', 'p', 'P'],
+    \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'l'))
+  " ... whereas with a Swedish locale, the accentuated letters are ordered
+  " after Z.
+  elseif lc =~? '"sv.*utf-\?8"'
+    call assert_equal(['a', 'A', 'o', 'O', 'p', 'P', 'ä', 'Ä', 'œ', 'œ', 'ô', 'Ô'],
+    \            sort(['A', 'a', 'o', 'O', 'œ', 'œ', 'p', 'P', 'Ä', 'ä', 'ô', 'Ô'], 'l'))
+  endif
 endfunc
 
 func Test_sort_numeric()
@@ -1204,6 +1223,57 @@ func Test_sort_cmd()
 	\ },
 	\ ]
 
+    " With the following locales, the accentuated letters are ordered
+    " similarly to the non-accentuated letters...
+    let lc = execute('language collate')
+    if lc =~? '"\(en\|es\|de\|fr\|it\|nl\).*\.utf-\?8"'
+      let tests += [
+	\ {
+	\    'name' : 'sort with locale',
+	\    'cmd' : '%sort l',
+	\    'input' : [
+	\	'A',
+	\	'E',
+	\	'O',
+	\	'À',
+	\	'È',
+	\	'É',
+	\	'Ô',
+	\	'Œ',
+	\	'Z',
+	\	'a',
+	\	'e',
+	\	'o',
+	\	'à',
+	\	'è',
+	\	'é',
+	\	'ô',
+	\	'œ',
+	\	'z'
+	\    ],
+	\    'expected' : [
+	\	'a',
+	\	'A',
+	\	'à',
+	\	'À',
+	\	'e',
+	\	'E',
+	\	'é',
+	\	'É',
+	\	'è',
+	\	'È',
+	\	'o',
+	\	'O',
+	\	'ô',
+	\	'Ô',
+	\	'œ',
+	\	'Œ',
+	\	'z',
+	\	'Z'
+	\    ]
+	\ },
+	\ ]
+  endif
   if has('float')
     let tests += [
           \ {
--- a/src/version.c
+++ b/src/version.c
@@ -751,6 +751,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    1933,
+/**/
     1932,
 /**/
     1931,