view src/arabic.c @ 33566:e1e3805fcd96 v9.0.2028

patch 9.0.2028: confusing build dependencies Commit: https://github.com/vim/vim/commit/5d03525cdef5db1b1cedfa26c6f8a21aaa207ec0 Author: Yee Cheng Chin <ychin.git@gmail.com> Date: Sun Oct 15 09:50:53 2023 +0200 patch 9.0.2028: confusing build dependencies Problem: confusing build dependencies Solution: clean them up, make them parallelizable Separate vim binary and unittest dependencies, make them parallelizable Clean up make dependencies so Vim and unit test binaries only depend on the object files they need. This fixes an existing issue where after running unit tests, the Vim binary would be invalidated, which results in it having to be linked again when running script tests, even though Vim was already previously built. Make link.sh (script we use to link those binaries) generate namespaced temporary files for each app to avoid them colliding with each other. This allows `unittesttargets` to be built in parallel. These fixes are useful when using link-time-optimization as the link phase could now take minutes rather than a few seconds. closes: #13344 Signed-off-by: Christian Brabandt <cb@256bit.org> Co-authored-by: Yee Cheng Chin <ychin.git@gmail.com>
author Christian Brabandt <cb@256bit.org>
date Sun, 15 Oct 2023 10:00:03 +0200
parents 7334bf933510
children
line wrap: on
line source

/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * VIM - Vi IMproved    by Bram Moolenaar
 *
 * Do ":help uganda"  in Vim to read copying and usage conditions.
 * Do ":help credits" in Vim to see a list of people who contributed.
 * See README.txt for an overview of the Vim source code.
 */

/*
 * arabic.c: functions for Arabic language
 *
 * Author: Nadim Shaikli & Isam Bayazidi
 * Farsi support and restructuring to make adding new letters easier by Ali
 * Gholami Rudi.  Further work by Ameretat Reith.
 */

/*
 * Sorted list of unicode Arabic characters.  Each entry holds the
 * presentation forms of a letter.
 *
 * Arabic characters are categorized into following types:
 *
 * Isolated	- iso-8859-6 form
 * Initial	- unicode form-B start
 * Medial	- unicode form-B middle
 * Final	- unicode form-B final
 * Stand-Alone	- unicode form-B isolated
 */

#include "vim.h"

#if defined(FEAT_ARABIC) || defined(PROTO)

// Unicode values for Arabic characters.
#define a_HAMZA				0x0621
#define a_ALEF_MADDA			0x0622
#define a_ALEF_HAMZA_ABOVE		0x0623
#define a_WAW_HAMZA			0x0624
#define a_ALEF_HAMZA_BELOW		0x0625
#define a_YEH_HAMZA			0x0626
#define a_ALEF				0x0627
#define a_BEH				0x0628
#define a_TEH_MARBUTA			0x0629
#define a_TEH				0x062a
#define a_THEH				0x062b
#define a_JEEM				0x062c
#define a_HAH				0x062d
#define a_KHAH				0x062e
#define a_DAL				0x062f
#define a_THAL				0x0630
#define a_REH				0x0631
#define a_ZAIN				0x0632
#define a_SEEN				0x0633
#define a_SHEEN				0x0634
#define a_SAD				0x0635
#define a_DAD				0x0636
#define a_TAH				0x0637
#define a_ZAH				0x0638
#define a_AIN				0x0639
#define a_GHAIN				0x063a
#define a_TATWEEL			0x0640
#define a_FEH				0x0641
#define a_QAF				0x0642
#define a_KAF				0x0643
#define a_LAM				0x0644
#define a_MEEM				0x0645
#define a_NOON				0x0646
#define a_HEH				0x0647
#define a_WAW				0x0648
#define a_ALEF_MAKSURA			0x0649
#define a_YEH				0x064a
#define a_FATHATAN			0x064b
#define a_DAMMATAN			0x064c
#define a_KASRATAN			0x064d
#define a_FATHA				0x064e
#define a_DAMMA				0x064f
#define a_KASRA				0x0650
#define a_SHADDA			0x0651
#define a_SUKUN				0x0652
#define a_MADDA_ABOVE			0x0653
#define a_HAMZA_ABOVE			0x0654
#define a_HAMZA_BELOW			0x0655

#define a_PEH				0x067e
#define a_TCHEH				0x0686
#define a_JEH				0x0698
#define a_FKAF				0x06a9
#define a_GAF				0x06af
#define a_FYEH				0x06cc

#define a_s_LAM_ALEF_MADDA_ABOVE	0xfef5
#define a_f_LAM_ALEF_MADDA_ABOVE	0xfef6
#define a_s_LAM_ALEF_HAMZA_ABOVE	0xfef7
#define a_f_LAM_ALEF_HAMZA_ABOVE	0xfef8
#define a_s_LAM_ALEF_HAMZA_BELOW	0xfef9
#define a_f_LAM_ALEF_HAMZA_BELOW	0xfefa
#define a_s_LAM_ALEF			0xfefb
#define a_f_LAM_ALEF			0xfefc

static struct achar {
    unsigned c;
    unsigned isolated;
    unsigned initial;
    unsigned medial;
    unsigned final;
} achars[] = {
    {a_HAMZA, 0xfe80, 0, 0, 0},
    {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82},
    {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84},
    {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86},
    {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88},
    {a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
    {a_ALEF, 0xfe8d, 0, 0, 0xfe8e},
    {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},
    {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94},
    {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96},
    {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
    {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
    {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2},
    {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6},
    {a_DAL, 0xfea9, 0, 0, 0xfeaa},
    {a_THAL, 0xfeab, 0, 0, 0xfeac},
    {a_REH, 0xfead, 0, 0, 0xfeae},
    {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0},
    {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
    {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
    {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},
    {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe},
    {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2},
    {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6},
    {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca},
    {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece},
    {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640},
    {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2},
    {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6},
    {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda},
    {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede},
    {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2},
    {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6},
    {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},
    {a_WAW, 0xfeed, 0, 0, 0xfeee},
    {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0},
    {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2},
    {a_FATHATAN, 0xfe70, 0, 0, 0},
    {a_DAMMATAN, 0xfe72, 0, 0, 0},
    {a_KASRATAN, 0xfe74, 0, 0, 0},
    {a_FATHA, 0xfe76, 0, 0xfe77, 0},
    {a_DAMMA, 0xfe78, 0, 0xfe79, 0},
    {a_KASRA, 0xfe7a, 0, 0xfe7b, 0},
    {a_SHADDA, 0xfe7c, 0, 0xfe7c, 0},
    {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0},
    {a_MADDA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_BELOW, 0, 0, 0, 0},
    {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57},
    {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
    {a_JEH, 0xfb8a, 0, 0, 0xfb8b},
    {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
    {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93},
    {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
};

#define a_BYTE_ORDER_MARK		0xfeff

/*
 * Find the struct achar pointer to the given Arabic char.
 * Returns NULL if not found.
 */
    static struct achar *
find_achar(int c)
{
    int h, m, l;

    // using binary search to find c
    h = ARRAY_LENGTH(achars);
    l = 0;
    while (l < h)
    {
	m = (h + l) / 2;
	if (achars[m].c == (unsigned)c)
	    return &achars[m];
	if ((unsigned)c < achars[m].c)
	    h = m;
	else
	    l = m + 1;
    }
    return NULL;
}

/*
 * Change shape - from Combination (2 char) to an Isolated
 */
    static int
chg_c_laa2i(int hid_c)
{
    int tempc;

    switch (hid_c)
    {
	case a_ALEF_MADDA:
	    tempc = a_s_LAM_ALEF_MADDA_ABOVE;
	    break;
	case a_ALEF_HAMZA_ABOVE:
	    tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
	    break;
	case a_ALEF_HAMZA_BELOW:
	    tempc = a_s_LAM_ALEF_HAMZA_BELOW;
	    break;
	case a_ALEF:
	    tempc = a_s_LAM_ALEF;
	    break;
	default:
	    tempc = 0;
    }

    return tempc;
}

/*
 * Change shape - from Combination-Isolated to Final
 */
    static int
chg_c_laa2f(int hid_c)
{
    int tempc;

    switch (hid_c)
    {
	case a_ALEF_MADDA:
	    tempc = a_f_LAM_ALEF_MADDA_ABOVE;
	    break;
	case a_ALEF_HAMZA_ABOVE:
	    tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
	    break;
	case a_ALEF_HAMZA_BELOW:
	    tempc = a_f_LAM_ALEF_HAMZA_BELOW;
	    break;
	case a_ALEF:
	    tempc = a_f_LAM_ALEF;
	    break;
	default:
	    tempc = 0;
    }

    return tempc;
}

/*
 * Returns whether it is possible to join the given letters
 */
    static int
can_join(int c1, int c2)
{
    struct achar *a1 = find_achar(c1);
    struct achar *a2 = find_achar(c2);

    return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
}

/*
 * Check whether we are dealing with a character that could be regarded as an
 * Arabic combining character, need to check the character before this.
 */
    int
arabic_maycombine(int two)
{
    if (p_arshape && !p_tbidi)
	return (two == a_ALEF_MADDA
		    || two == a_ALEF_HAMZA_ABOVE
		    || two == a_ALEF_HAMZA_BELOW
		    || two == a_ALEF);
    return FALSE;
}

/*
 * Check whether we are dealing with Arabic combining characters.
 * Note: these are NOT really composing characters!
 */
    int
arabic_combine(
    int		one,	    // first character
    int		two)	    // character just after "one"
{
    if (one == a_LAM)
	return arabic_maycombine(two);
    return FALSE;
}

/*
 * A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character
 *		(alphabet/number/punctuation)
 */
    static int
A_is_iso(int c)
{
    return find_achar(c) != NULL;
}

/*
 * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
 */
    static int
A_is_ok(int c)
{
    return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
}

/*
 * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
 *		with some exceptions/exclusions
 */
    static int
A_is_valid(int c)
{
    return (A_is_ok(c) && c != a_HAMZA);
}

/*
 * Do Arabic shaping on character "c".  Returns the shaped character.
 * out:    "ccp" points to the first byte of the character to be shaped.
 * in/out: "c1p" points to the first composing char for "c".
 * in:     "prev_c"  is the previous character (not shaped)
 * in:     "prev_c1" is the first composing char for the previous char
 *		     (not shaped)
 * in:     "next_c"  is the next character (not shaped).
 */
    int
arabic_shape(
    int		c,
    int		*ccp,
    int		*c1p,
    int		prev_c,
    int		prev_c1,
    int		next_c)
{
    int		curr_c;
    int		curr_laa;
    int		prev_laa;

    // Deal only with Arabic characters, pass back all others
    if (!A_is_ok(c))
	return c;

    curr_laa = arabic_combine(c, *c1p);
    prev_laa = arabic_combine(prev_c, prev_c1);

    if (curr_laa)
    {
	if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
	    curr_c = chg_c_laa2f(*c1p);
	else
	    curr_c = chg_c_laa2i(*c1p);

	// Remove the composing character
	*c1p = 0;
    }
    else
    {
	struct achar *curr_a = find_achar(c);
	int backward_combine = !prev_laa && can_join(prev_c, c);
	int forward_combine = can_join(c, next_c);

	if (backward_combine)
	{
	    if (forward_combine)
		curr_c = curr_a->medial;
	    else
		curr_c = curr_a->final;
	}
	else
	{
	    if (forward_combine)
		curr_c = curr_a->initial;
	    else
		curr_c = curr_a->isolated;
	}
    }

    // Character missing from the table means using original character.
    if (curr_c == NUL)
	curr_c = c;

    if (curr_c != c && ccp != NULL)
    {
	char_u buf[MB_MAXBYTES + 1];

	// Update the first byte of the character.
	(*mb_char2bytes)(curr_c, buf);
	*ccp = buf[0];
    }

    // Return the shaped character
    return curr_c;
}
#endif // FEAT_ARABIC