view src/arabic.c @ 34681:bfb2f74221b9

Added tag v9.1.0221 for changeset 2fd4ce2a50586046d91c7dff36a7fe5eabfaf113
author Christian Brabandt <cb@256bit.org>
date Thu, 28 Mar 2024 12:00:04 +0100
parents 7334bf933510
children
line wrap: on
line source

/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * VIM - Vi IMproved    by Bram Moolenaar
 *
 * Do ":help uganda"  in Vim to read copying and usage conditions.
 * Do ":help credits" in Vim to see a list of people who contributed.
 * See README.txt for an overview of the Vim source code.
 */

/*
 * arabic.c: functions for Arabic language
 *
 * Author: Nadim Shaikli & Isam Bayazidi
 * Farsi support and restructuring to make adding new letters easier by Ali
 * Gholami Rudi.  Further work by Ameretat Reith.
 */

/*
 * Sorted list of unicode Arabic characters.  Each entry holds the
 * presentation forms of a letter.
 *
 * Arabic characters are categorized into following types:
 *
 * Isolated	- iso-8859-6 form
 * Initial	- unicode form-B start
 * Medial	- unicode form-B middle
 * Final	- unicode form-B final
 * Stand-Alone	- unicode form-B isolated
 */

#include "vim.h"

#if defined(FEAT_ARABIC) || defined(PROTO)

// Unicode values for Arabic characters.
#define a_HAMZA				0x0621
#define a_ALEF_MADDA			0x0622
#define a_ALEF_HAMZA_ABOVE		0x0623
#define a_WAW_HAMZA			0x0624
#define a_ALEF_HAMZA_BELOW		0x0625
#define a_YEH_HAMZA			0x0626
#define a_ALEF				0x0627
#define a_BEH				0x0628
#define a_TEH_MARBUTA			0x0629
#define a_TEH				0x062a
#define a_THEH				0x062b
#define a_JEEM				0x062c
#define a_HAH				0x062d
#define a_KHAH				0x062e
#define a_DAL				0x062f
#define a_THAL				0x0630
#define a_REH				0x0631
#define a_ZAIN				0x0632
#define a_SEEN				0x0633
#define a_SHEEN				0x0634
#define a_SAD				0x0635
#define a_DAD				0x0636
#define a_TAH				0x0637
#define a_ZAH				0x0638
#define a_AIN				0x0639
#define a_GHAIN				0x063a
#define a_TATWEEL			0x0640
#define a_FEH				0x0641
#define a_QAF				0x0642
#define a_KAF				0x0643
#define a_LAM				0x0644
#define a_MEEM				0x0645
#define a_NOON				0x0646
#define a_HEH				0x0647
#define a_WAW				0x0648
#define a_ALEF_MAKSURA			0x0649
#define a_YEH				0x064a
#define a_FATHATAN			0x064b
#define a_DAMMATAN			0x064c
#define a_KASRATAN			0x064d
#define a_FATHA				0x064e
#define a_DAMMA				0x064f
#define a_KASRA				0x0650
#define a_SHADDA			0x0651
#define a_SUKUN				0x0652
#define a_MADDA_ABOVE			0x0653
#define a_HAMZA_ABOVE			0x0654
#define a_HAMZA_BELOW			0x0655

#define a_PEH				0x067e
#define a_TCHEH				0x0686
#define a_JEH				0x0698
#define a_FKAF				0x06a9
#define a_GAF				0x06af
#define a_FYEH				0x06cc

#define a_s_LAM_ALEF_MADDA_ABOVE	0xfef5
#define a_f_LAM_ALEF_MADDA_ABOVE	0xfef6
#define a_s_LAM_ALEF_HAMZA_ABOVE	0xfef7
#define a_f_LAM_ALEF_HAMZA_ABOVE	0xfef8
#define a_s_LAM_ALEF_HAMZA_BELOW	0xfef9
#define a_f_LAM_ALEF_HAMZA_BELOW	0xfefa
#define a_s_LAM_ALEF			0xfefb
#define a_f_LAM_ALEF			0xfefc

static struct achar {
    unsigned c;
    unsigned isolated;
    unsigned initial;
    unsigned medial;
    unsigned final;
} achars[] = {
    {a_HAMZA, 0xfe80, 0, 0, 0},
    {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82},
    {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84},
    {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86},
    {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88},
    {a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
    {a_ALEF, 0xfe8d, 0, 0, 0xfe8e},
    {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},
    {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94},
    {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96},
    {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
    {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
    {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2},
    {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6},
    {a_DAL, 0xfea9, 0, 0, 0xfeaa},
    {a_THAL, 0xfeab, 0, 0, 0xfeac},
    {a_REH, 0xfead, 0, 0, 0xfeae},
    {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0},
    {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
    {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
    {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},
    {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe},
    {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2},
    {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6},
    {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca},
    {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece},
    {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640},
    {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2},
    {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6},
    {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda},
    {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede},
    {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2},
    {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6},
    {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},
    {a_WAW, 0xfeed, 0, 0, 0xfeee},
    {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0},
    {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2},
    {a_FATHATAN, 0xfe70, 0, 0, 0},
    {a_DAMMATAN, 0xfe72, 0, 0, 0},
    {a_KASRATAN, 0xfe74, 0, 0, 0},
    {a_FATHA, 0xfe76, 0, 0xfe77, 0},
    {a_DAMMA, 0xfe78, 0, 0xfe79, 0},
    {a_KASRA, 0xfe7a, 0, 0xfe7b, 0},
    {a_SHADDA, 0xfe7c, 0, 0xfe7c, 0},
    {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0},
    {a_MADDA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_ABOVE, 0, 0, 0, 0},
    {a_HAMZA_BELOW, 0, 0, 0, 0},
    {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57},
    {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
    {a_JEH, 0xfb8a, 0, 0, 0xfb8b},
    {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
    {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93},
    {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
};

#define a_BYTE_ORDER_MARK		0xfeff

/*
 * Find the struct achar pointer to the given Arabic char.
 * Returns NULL if not found.
 */
    static struct achar *
find_achar(int c)
{
    int h, m, l;

    // using binary search to find c
    h = ARRAY_LENGTH(achars);
    l = 0;
    while (l < h)
    {
	m = (h + l) / 2;
	if (achars[m].c == (unsigned)c)
	    return &achars[m];
	if ((unsigned)c < achars[m].c)
	    h = m;
	else
	    l = m + 1;
    }
    return NULL;
}

/*
 * Change shape - from Combination (2 char) to an Isolated
 */
    static int
chg_c_laa2i(int hid_c)
{
    int tempc;

    switch (hid_c)
    {
	case a_ALEF_MADDA:
	    tempc = a_s_LAM_ALEF_MADDA_ABOVE;
	    break;
	case a_ALEF_HAMZA_ABOVE:
	    tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
	    break;
	case a_ALEF_HAMZA_BELOW:
	    tempc = a_s_LAM_ALEF_HAMZA_BELOW;
	    break;
	case a_ALEF:
	    tempc = a_s_LAM_ALEF;
	    break;
	default:
	    tempc = 0;
    }

    return tempc;
}

/*
 * Change shape - from Combination-Isolated to Final
 */
    static int
chg_c_laa2f(int hid_c)
{
    int tempc;

    switch (hid_c)
    {
	case a_ALEF_MADDA:
	    tempc = a_f_LAM_ALEF_MADDA_ABOVE;
	    break;
	case a_ALEF_HAMZA_ABOVE:
	    tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
	    break;
	case a_ALEF_HAMZA_BELOW:
	    tempc = a_f_LAM_ALEF_HAMZA_BELOW;
	    break;
	case a_ALEF:
	    tempc = a_f_LAM_ALEF;
	    break;
	default:
	    tempc = 0;
    }

    return tempc;
}

/*
 * Returns whether it is possible to join the given letters
 */
    static int
can_join(int c1, int c2)
{
    struct achar *a1 = find_achar(c1);
    struct achar *a2 = find_achar(c2);

    return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
}

/*
 * Check whether we are dealing with a character that could be regarded as an
 * Arabic combining character, need to check the character before this.
 */
    int
arabic_maycombine(int two)
{
    if (p_arshape && !p_tbidi)
	return (two == a_ALEF_MADDA
		    || two == a_ALEF_HAMZA_ABOVE
		    || two == a_ALEF_HAMZA_BELOW
		    || two == a_ALEF);
    return FALSE;
}

/*
 * Check whether we are dealing with Arabic combining characters.
 * Note: these are NOT really composing characters!
 */
    int
arabic_combine(
    int		one,	    // first character
    int		two)	    // character just after "one"
{
    if (one == a_LAM)
	return arabic_maycombine(two);
    return FALSE;
}

/*
 * A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character
 *		(alphabet/number/punctuation)
 */
    static int
A_is_iso(int c)
{
    return find_achar(c) != NULL;
}

/*
 * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
 */
    static int
A_is_ok(int c)
{
    return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
}

/*
 * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
 *		with some exceptions/exclusions
 */
    static int
A_is_valid(int c)
{
    return (A_is_ok(c) && c != a_HAMZA);
}

/*
 * Do Arabic shaping on character "c".  Returns the shaped character.
 * out:    "ccp" points to the first byte of the character to be shaped.
 * in/out: "c1p" points to the first composing char for "c".
 * in:     "prev_c"  is the previous character (not shaped)
 * in:     "prev_c1" is the first composing char for the previous char
 *		     (not shaped)
 * in:     "next_c"  is the next character (not shaped).
 */
    int
arabic_shape(
    int		c,
    int		*ccp,
    int		*c1p,
    int		prev_c,
    int		prev_c1,
    int		next_c)
{
    int		curr_c;
    int		curr_laa;
    int		prev_laa;

    // Deal only with Arabic characters, pass back all others
    if (!A_is_ok(c))
	return c;

    curr_laa = arabic_combine(c, *c1p);
    prev_laa = arabic_combine(prev_c, prev_c1);

    if (curr_laa)
    {
	if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
	    curr_c = chg_c_laa2f(*c1p);
	else
	    curr_c = chg_c_laa2i(*c1p);

	// Remove the composing character
	*c1p = 0;
    }
    else
    {
	struct achar *curr_a = find_achar(c);
	int backward_combine = !prev_laa && can_join(prev_c, c);
	int forward_combine = can_join(c, next_c);

	if (backward_combine)
	{
	    if (forward_combine)
		curr_c = curr_a->medial;
	    else
		curr_c = curr_a->final;
	}
	else
	{
	    if (forward_combine)
		curr_c = curr_a->initial;
	    else
		curr_c = curr_a->isolated;
	}
    }

    // Character missing from the table means using original character.
    if (curr_c == NUL)
	curr_c = c;

    if (curr_c != c && ccp != NULL)
    {
	char_u buf[MB_MAXBYTES + 1];

	// Update the first byte of the character.
	(*mb_char2bytes)(curr_c, buf);
	*ccp = buf[0];
    }

    // Return the shaped character
    return curr_c;
}
#endif // FEAT_ARABIC