diff src/arabic.c @ 16066:473fbdb2717c v8.1.1038

patch 8.1.1038: Arabic support excludes Farsi

commit https://github.com/vim/vim/commit/dc4fa190e7b9d6ba49416ce875d2192c4444d3eb
Author: Bram Moolenaar <Bram@vim.org>
Date: Fri Mar 22 16:33:15 2019 +0100

    patch 8.1.1038: Arabic support excludes Farsi
    Problem: Arabic support excludes Farsi.
    Solution: Add Farsi support to the Arabic support. (Ali Gholami Rudi, Ameretat Reith)
author Bram Moolenaar <Bram@vim.org>
date Fri, 22 Mar 2019 16:45:05 +0100
parents 78faa25f9698
children 061cf939f7ce
line wrap: on
line diff
--- a/src/arabic.c
+++ b/src/arabic.c
@@ -11,541 +11,311 @@
  * arabic.c: functions for Arabic language
  *
  * Author: Nadim Shaikli & Isam Bayazidi
+ * Farsi support and restructuring to make adding new letters easier by Ali
+ * Gholami Rudi.  Further work by Ameretat Reith.
+ */
+
+/*
+ * Sorted list of unicode Arabic characters.  Each entry holds the
+ * presentation forms of a letter.
+ *
+ * Arabic characters are categorized into following types:
+ *
+ * Isolated	- iso-8859-6 form
+ * Initial	- unicode form-B start
+ * Medial	- unicode form-B middle
+ * Final	- unicode form-B final
+ * Stand-Alone	- unicode form-B isolated
  */
 
 #include "vim.h"
 
 #if defined(FEAT_ARABIC) || defined(PROTO)
 
-static int  A_firstc_laa(int c1, int c);
-static int  A_is_harakat(int c);
-static int  A_is_iso(int c);
-static int  A_is_formb(int c);
-static int  A_is_ok(int c);
-static int  A_is_valid(int c);
-static int  A_is_special(int c);
-
+// Unicode values for Arabic characters.
+#define a_HAMZA				0x0621
+#define a_ALEF_MADDA			0x0622
+#define a_ALEF_HAMZA_ABOVE		0x0623
+#define a_WAW_HAMZA			0x0624
+#define a_ALEF_HAMZA_BELOW		0x0625
+#define a_YEH_HAMZA			0x0626
+#define a_ALEF				0x0627
+#define a_BEH				0x0628
+#define a_TEH_MARBUTA			0x0629
+#define a_TEH				0x062a
+#define a_THEH				0x062b
+#define a_JEEM				0x062c
+#define a_HAH				0x062d
+#define a_KHAH				0x062e
+#define a_DAL				0x062f
+#define a_THAL				0x0630
+#define a_REH				0x0631
+#define a_ZAIN				0x0632
+#define a_SEEN				0x0633
+#define a_SHEEN				0x0634
+#define a_SAD				0x0635
+#define a_DAD				0x0636
+#define a_TAH				0x0637
+#define a_ZAH				0x0638
+#define a_AIN				0x0639
+#define a_GHAIN				0x063a
+#define a_TATWEEL			0x0640
+#define a_FEH				0x0641
+#define a_QAF				0x0642
+#define a_KAF				0x0643
+#define a_LAM				0x0644
+#define a_MEEM				0x0645
+#define a_NOON				0x0646
+#define a_HEH				0x0647
+#define a_WAW				0x0648
+#define a_ALEF_MAKSURA			0x0649
+#define a_YEH				0x064a
+#define a_FATHATAN			0x064b
+#define a_DAMMATAN			0x064c
+#define a_KASRATAN			0x064d
+#define a_FATHA				0x064e
+#define a_DAMMA				0x064f
+#define a_KASRA				0x0650
+#define a_SHADDA			0x0651
+#define a_SUKUN				0x0652
+#define a_MADDA_ABOVE			0x0653
+#define a_HAMZA_ABOVE			0x0654
+#define a_HAMZA_BELOW			0x0655
 
-/*
- * Returns True if c is an ISO-8859-6 shaped ARABIC letter (user entered)
- */
-    static int
-A_is_a(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_HAMZA:
-	case a_ALEF_MADDA:
-	case a_ALEF_HAMZA_ABOVE:
-	case a_WAW_HAMZA:
-	case a_ALEF_HAMZA_BELOW:
-	case a_YEH_HAMZA:
-	case a_ALEF:
-	case a_BEH:
-	case a_TEH_MARBUTA:
-	case a_TEH:
-	case a_THEH:
-	case a_JEEM:
-	case a_HAH:
-	case a_KHAH:
-	case a_DAL:
-	case a_THAL:
-	case a_REH:
-	case a_ZAIN:
-	case a_SEEN:
-	case a_SHEEN:
-	case a_SAD:
-	case a_DAD:
-	case a_TAH:
-	case a_ZAH:
-	case a_AIN:
-	case a_GHAIN:
-	case a_TATWEEL:
-	case a_FEH:
-	case a_QAF:
-	case a_KAF:
-	case a_LAM:
-	case a_MEEM:
-	case a_NOON:
-	case a_HEH:
-	case a_WAW:
-	case a_ALEF_MAKSURA:
-	case a_YEH:
-	    return TRUE;
-    }
+#define a_PEH				0x067e
+#define a_TCHEH				0x0686
+#define a_JEH				0x0698
+#define a_FKAF				0x06a9
+#define a_GAF				0x06af
+#define a_FYEH				0x06cc
 
-    return FALSE;
-}
-
+#define a_s_LAM_ALEF_MADDA_ABOVE	0xfef5
+#define a_f_LAM_ALEF_MADDA_ABOVE	0xfef6
+#define a_s_LAM_ALEF_HAMZA_ABOVE	0xfef7
+#define a_f_LAM_ALEF_HAMZA_ABOVE	0xfef8
+#define a_s_LAM_ALEF_HAMZA_BELOW	0xfef9
+#define a_f_LAM_ALEF_HAMZA_BELOW	0xfefa
+#define a_s_LAM_ALEF			0xfefb
+#define a_f_LAM_ALEF			0xfefc
 
-/*
- * Returns True if c is an Isolated Form-B ARABIC letter
- */
-    static int
-A_is_s(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_s_HAMZA:
-	case a_s_ALEF_MADDA:
-	case a_s_ALEF_HAMZA_ABOVE:
-	case a_s_WAW_HAMZA:
-	case a_s_ALEF_HAMZA_BELOW:
-	case a_s_YEH_HAMZA:
-	case a_s_ALEF:
-	case a_s_BEH:
-	case a_s_TEH_MARBUTA:
-	case a_s_TEH:
-	case a_s_THEH:
-	case a_s_JEEM:
-	case a_s_HAH:
-	case a_s_KHAH:
-	case a_s_DAL:
-	case a_s_THAL:
-	case a_s_REH:
-	case a_s_ZAIN:
-	case a_s_SEEN:
-	case a_s_SHEEN:
-	case a_s_SAD:
-	case a_s_DAD:
-	case a_s_TAH:
-	case a_s_ZAH:
-	case a_s_AIN:
-	case a_s_GHAIN:
-	case a_s_FEH:
-	case a_s_QAF:
-	case a_s_KAF:
-	case a_s_LAM:
-	case a_s_MEEM:
-	case a_s_NOON:
-	case a_s_HEH:
-	case a_s_WAW:
-	case a_s_ALEF_MAKSURA:
-	case a_s_YEH:
-	    return TRUE;
-    }
-
-    return FALSE;
-}
-
+static struct achar {
+    unsigned c;		// base code point; search key for find_achar()
+    unsigned isolated;	// form when joined on neither side (0: none)
+    unsigned initial;	// form when joined only to the next char (0: none)
+    unsigned medial;	// form when joined on both sides (0: none)
+    unsigned final;	// form when joined only to the previous char (0: none)
+} achars[] = {	// must stay sorted by "c": find_achar() does a binary search
+    {a_HAMZA, 0xfe80, 0, 0, 0},
+    {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82},
+    {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84},
+    {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86},
+    {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88},
+    {a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
+    {a_ALEF, 0xfe8d, 0, 0, 0xfe8e},
+    {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90},
+    {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94},
+    {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96},
+    {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
+    {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
+    {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2},
+    {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6},
+    {a_DAL, 0xfea9, 0, 0, 0xfeaa},
+    {a_THAL, 0xfeab, 0, 0, 0xfeac},
+    {a_REH, 0xfead, 0, 0, 0xfeae},
+    {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0},
+    {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
+    {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
+    {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba},
+    {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe},
+    {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2},
+    {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6},
+    {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca},
+    {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece},
+    {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640},
+    {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2},
+    {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6},
+    {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda},
+    {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede},
+    {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2},
+    {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6},
+    {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea},
+    {a_WAW, 0xfeed, 0, 0, 0xfeee},
+    {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0},
+    {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2},
+    {a_FATHATAN, 0xfe70, 0, 0, 0},
+    {a_DAMMATAN, 0xfe72, 0, 0, 0},
+    {a_KASRATAN, 0xfe74, 0, 0, 0},
+    {a_FATHA, 0xfe76, 0, 0xfe77, 0},
+    {a_DAMMA, 0xfe78, 0, 0xfe79, 0},
+    {a_KASRA, 0xfe7a, 0, 0xfe7b, 0},
+    {a_SHADDA, 0xfe7c, 0, 0xfe7d, 0},	// U+FE7D is SHADDA MEDIAL FORM
+    {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0},
+    {a_MADDA_ABOVE, 0, 0, 0, 0},
+    {a_HAMZA_ABOVE, 0, 0, 0, 0},
+    {a_HAMZA_BELOW, 0, 0, 0, 0},
+    {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57},
+    {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
+    {a_JEH, 0xfb8a, 0, 0, 0xfb8b},
+    {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
+    {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93},
+    {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
+};
 
-/*
- * Returns True if c is a Final shape of an ARABIC letter
- */
-    static int
-A_is_f(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_f_ALEF_MADDA:
-	case a_f_ALEF_HAMZA_ABOVE:
-	case a_f_WAW_HAMZA:
-	case a_f_ALEF_HAMZA_BELOW:
-	case a_f_YEH_HAMZA:
-	case a_f_ALEF:
-	case a_f_BEH:
-	case a_f_TEH_MARBUTA:
-	case a_f_TEH:
-	case a_f_THEH:
-	case a_f_JEEM:
-	case a_f_HAH:
-	case a_f_KHAH:
-	case a_f_DAL:
-	case a_f_THAL:
-	case a_f_REH:
-	case a_f_ZAIN:
-	case a_f_SEEN:
-	case a_f_SHEEN:
-	case a_f_SAD:
-	case a_f_DAD:
-	case a_f_TAH:
-	case a_f_ZAH:
-	case a_f_AIN:
-	case a_f_GHAIN:
-	case a_f_FEH:
-	case a_f_QAF:
-	case a_f_KAF:
-	case a_f_LAM:
-	case a_f_MEEM:
-	case a_f_NOON:
-	case a_f_HEH:
-	case a_f_WAW:
-	case a_f_ALEF_MAKSURA:
-	case a_f_YEH:
-	case a_f_LAM_ALEF_MADDA_ABOVE:
-	case a_f_LAM_ALEF_HAMZA_ABOVE:
-	case a_f_LAM_ALEF_HAMZA_BELOW:
-	case a_f_LAM_ALEF:
-	    return TRUE;
-    }
-    return FALSE;
-}
+#define a_BYTE_ORDER_MARK		0xfeff
 
-
-/*
- * Change shape - from ISO-8859-6/Isolated to Form-B Isolated
- */
-    static int
-chg_c_a2s(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_HAMZA: return a_s_HAMZA;
-	case a_ALEF_MADDA: return a_s_ALEF_MADDA;
-	case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE;
-	case a_WAW_HAMZA: return a_s_WAW_HAMZA;
-	case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW;
-	case a_YEH_HAMZA: return a_s_YEH_HAMZA;
-	case a_ALEF: return a_s_ALEF;
-	case a_TEH_MARBUTA: return a_s_TEH_MARBUTA;
-	case a_DAL: return a_s_DAL;
-	case a_THAL: return a_s_THAL;
-	case a_REH: return a_s_REH;
-	case a_ZAIN: return a_s_ZAIN;
-	case a_TATWEEL: return cur_c;	/* exceptions */
-	case a_WAW: return a_s_WAW;
-	case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA;
-	case a_BEH: return a_s_BEH;
-	case a_TEH: return a_s_TEH;
-	case a_THEH: return a_s_THEH;
-	case a_JEEM: return a_s_JEEM;
-	case a_HAH: return a_s_HAH;
-	case a_KHAH: return a_s_KHAH;
-	case a_SEEN: return a_s_SEEN;
-	case a_SHEEN: return a_s_SHEEN;
-	case a_SAD: return a_s_SAD;
-	case a_DAD: return a_s_DAD;
-	case a_TAH: return a_s_TAH;
-	case a_ZAH: return a_s_ZAH;
-	case a_AIN: return a_s_AIN;
-	case a_GHAIN: return a_s_GHAIN;
-	case a_FEH: return a_s_FEH;
-	case a_QAF: return a_s_QAF;
-	case a_KAF: return a_s_KAF;
-	case a_LAM: return a_s_LAM;
-	case a_MEEM: return a_s_MEEM;
-	case a_NOON: return a_s_NOON;
-	case a_HEH: return a_s_HEH;
-	case a_YEH: return a_s_YEH;
-    }
-    return 0;
-}
-
+#define ARRAY_SIZE(a)		(sizeof(a) / sizeof((a)[0]))
 
 /*
- * Change shape - from ISO-8859-6/Isolated to Initial
+ * Find the struct achar pointer to the given Arabic char.
+ * Returns NULL if not found.
  */
-    static int
-chg_c_a2i(int cur_c)
+    static struct achar *
+find_achar(int c)
 {
-    switch (cur_c)
-    {
-	case a_YEH_HAMZA: return a_i_YEH_HAMZA;
-	case a_HAMZA:			/* exceptions */
-	    return a_s_HAMZA;
-	case a_ALEF_MADDA:		/* exceptions */
-	    return a_s_ALEF_MADDA;
-	case a_ALEF_HAMZA_ABOVE:	/* exceptions */
-	    return a_s_ALEF_HAMZA_ABOVE;
-	case a_WAW_HAMZA:		/* exceptions */
-	    return a_s_WAW_HAMZA;
-	case a_ALEF_HAMZA_BELOW:	/* exceptions */
-	    return a_s_ALEF_HAMZA_BELOW;
-	case a_ALEF:			/* exceptions */
-	    return a_s_ALEF;
-	case a_TEH_MARBUTA:		/* exceptions */
-	    return a_s_TEH_MARBUTA;
-	case a_DAL:			/* exceptions */
-	    return a_s_DAL;
-	case a_THAL:			/* exceptions */
-	    return a_s_THAL;
-	case a_REH:			/* exceptions */
-	    return a_s_REH;
-	case a_ZAIN:			/* exceptions */
-	    return a_s_ZAIN;
-	case a_TATWEEL:			/* exceptions */
-	    return cur_c;
-	case a_WAW:			/* exceptions */
-	    return a_s_WAW;
-	case a_ALEF_MAKSURA:		/* exceptions */
-	    return a_s_ALEF_MAKSURA;
-	case a_BEH: return a_i_BEH;
-	case a_TEH: return a_i_TEH;
-	case a_THEH: return a_i_THEH;
-	case a_JEEM: return a_i_JEEM;
-	case a_HAH: return a_i_HAH;
-	case a_KHAH: return a_i_KHAH;
-	case a_SEEN: return a_i_SEEN;
-	case a_SHEEN: return a_i_SHEEN;
-	case a_SAD: return a_i_SAD;
-	case a_DAD: return a_i_DAD;
-	case a_TAH: return a_i_TAH;
-	case a_ZAH: return a_i_ZAH;
-	case a_AIN: return a_i_AIN;
-	case a_GHAIN: return a_i_GHAIN;
-	case a_FEH: return a_i_FEH;
-	case a_QAF: return a_i_QAF;
-	case a_KAF: return a_i_KAF;
-	case a_LAM: return a_i_LAM;
-	case a_MEEM: return a_i_MEEM;
-	case a_NOON: return a_i_NOON;
-	case a_HEH: return a_i_HEH;
-	case a_YEH: return a_i_YEH;
-    }
-    return 0;
-}
-
+    int h, m, l;
 
-/*
- * Change shape - from ISO-8859-6/Isolated to Medial
- */
-    static int
-chg_c_a2m(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_HAMZA: return a_s_HAMZA;	/* exception */
-	case a_ALEF_MADDA: return a_f_ALEF_MADDA;	/* exception */
-	case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE;	/* exception */
-	case a_WAW_HAMZA: return a_f_WAW_HAMZA;	/* exception */
-	case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW;	/* exception */
-	case a_YEH_HAMZA: return a_m_YEH_HAMZA;
-	case a_ALEF: return a_f_ALEF;	/* exception */
-	case a_BEH: return a_m_BEH;
-	case a_TEH_MARBUTA: return a_f_TEH_MARBUTA;	/* exception */
-	case a_TEH: return a_m_TEH;
-	case a_THEH: return a_m_THEH;
-	case a_JEEM: return a_m_JEEM;
-	case a_HAH: return a_m_HAH;
-	case a_KHAH: return a_m_KHAH;
-	case a_DAL: return a_f_DAL;	/* exception */
-	case a_THAL: return a_f_THAL;	/* exception */
-	case a_REH: return a_f_REH;	/* exception */
-	case a_ZAIN: return a_f_ZAIN;	/* exception */
-	case a_SEEN: return a_m_SEEN;
-	case a_SHEEN: return a_m_SHEEN;
-	case a_SAD: return a_m_SAD;
-	case a_DAD: return a_m_DAD;
-	case a_TAH: return a_m_TAH;
-	case a_ZAH: return a_m_ZAH;
-	case a_AIN: return a_m_AIN;
-	case a_GHAIN: return a_m_GHAIN;
-	case a_TATWEEL: return cur_c;	/* exception */
-	case a_FEH: return a_m_FEH;
-	case a_QAF: return a_m_QAF;
-	case a_KAF: return a_m_KAF;
-	case a_LAM: return a_m_LAM;
-	case a_MEEM: return a_m_MEEM;
-	case a_NOON: return a_m_NOON;
-	case a_HEH: return a_m_HEH;
-	case a_WAW: return a_f_WAW;	/* exception */
-	case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA;	/* exception */
-	case a_YEH: return a_m_YEH;
-    }
-    return 0;
-}
-
-
-/*
- * Change shape - from ISO-8859-6/Isolated to final
- */
-    static int
-chg_c_a2f(int cur_c)
-{
-    /* NOTE: these encodings need to be accounted for
-     * a_f_ALEF_MADDA;
-     * a_f_ALEF_HAMZA_ABOVE;
-     * a_f_ALEF_HAMZA_BELOW;
-     * a_f_LAM_ALEF_MADDA_ABOVE;
-     * a_f_LAM_ALEF_HAMZA_ABOVE;
-     * a_f_LAM_ALEF_HAMZA_BELOW;
-     */
-    switch (cur_c)
+    // using binary search to find c
+    h = ARRAY_SIZE(achars);
+    l = 0;
+    while (l < h)
     {
-	case a_HAMZA: return a_s_HAMZA;	/* exception */
-	case a_ALEF_MADDA: return a_f_ALEF_MADDA;
-	case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE;
-	case a_WAW_HAMZA: return a_f_WAW_HAMZA;
-	case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW;
-	case a_YEH_HAMZA: return a_f_YEH_HAMZA;
-	case a_ALEF: return a_f_ALEF;
-	case a_BEH: return a_f_BEH;
-	case a_TEH_MARBUTA: return a_f_TEH_MARBUTA;
-	case a_TEH: return a_f_TEH;
-	case a_THEH: return a_f_THEH;
-	case a_JEEM: return a_f_JEEM;
-	case a_HAH: return a_f_HAH;
-	case a_KHAH: return a_f_KHAH;
-	case a_DAL: return a_f_DAL;
-	case a_THAL: return a_f_THAL;
-	case a_REH: return a_f_REH;
-	case a_ZAIN: return a_f_ZAIN;
-	case a_SEEN: return a_f_SEEN;
-	case a_SHEEN: return a_f_SHEEN;
-	case a_SAD: return a_f_SAD;
-	case a_DAD: return a_f_DAD;
-	case a_TAH: return a_f_TAH;
-	case a_ZAH: return a_f_ZAH;
-	case a_AIN: return a_f_AIN;
-	case a_GHAIN: return a_f_GHAIN;
-	case a_TATWEEL:	return cur_c;	/* exception */
-	case a_FEH: return a_f_FEH;
-	case a_QAF: return a_f_QAF;
-	case a_KAF: return a_f_KAF;
-	case a_LAM: return a_f_LAM;
-	case a_MEEM: return a_f_MEEM;
-	case a_NOON: return a_f_NOON;
-	case a_HEH: return a_f_HEH;
-	case a_WAW: return a_f_WAW;
-	case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA;
-	case a_YEH: return a_f_YEH;
+	m = (h + l) / 2;
+	if (achars[m].c == (unsigned)c)
+	    return &achars[m];
+	if ((unsigned)c < achars[m].c)
+	    h = m;
+	else
+	    l = m + 1;
     }
-    return 0;
+    return NULL;
 }
 
-
-/*
- * Change shape - from Initial to Medial
- * This code is unreachable, because for the relevant characters ARABIC_CHAR()
- * is FALSE;
- */
-#if 0
-    static int
-chg_c_i2m(int cur_c)
-{
-    switch (cur_c)
-    {
-	case a_i_YEH_HAMZA: return a_m_YEH_HAMZA;
-	case a_i_BEH: return a_m_BEH;
-	case a_i_TEH: return a_m_TEH;
-	case a_i_THEH: return a_m_THEH;
-	case a_i_JEEM: return a_m_JEEM;
-	case a_i_HAH: return a_m_HAH;
-	case a_i_KHAH: return a_m_KHAH;
-	case a_i_SEEN: return a_m_SEEN;
-	case a_i_SHEEN: return a_m_SHEEN;
-	case a_i_SAD: return a_m_SAD;
-	case a_i_DAD: return a_m_DAD;
-	case a_i_TAH: return a_m_TAH;
-	case a_i_ZAH: return a_m_ZAH;
-	case a_i_AIN: return a_m_AIN;
-	case a_i_GHAIN: return a_m_GHAIN;
-	case a_i_FEH: return a_m_FEH;
-	case a_i_QAF: return a_m_QAF;
-	case a_i_KAF: return a_m_KAF;
-	case a_i_LAM: return a_m_LAM;
-	case a_i_MEEM: return a_m_MEEM;
-	case a_i_NOON: return a_m_NOON;
-	case a_i_HEH: return a_m_HEH;
-	case a_i_YEH: return a_m_YEH;
-    }
-    return 0;
-}
-#endif
-
-
-/*
- * Change shape - from Final to Medial
- */
-    static int
-chg_c_f2m(int cur_c)
-{
-    switch (cur_c)
-    {
-	/* NOTE: these encodings are multi-positional, no ?
-	 * case a_f_ALEF_MADDA:
-	 * case a_f_ALEF_HAMZA_ABOVE:
-	 * case a_f_ALEF_HAMZA_BELOW:
-	 */
-	case a_f_YEH_HAMZA: return a_m_YEH_HAMZA;
-	case a_f_WAW_HAMZA:		/* exceptions */
-	case a_f_ALEF:
-	case a_f_TEH_MARBUTA:
-	case a_f_DAL:
-	case a_f_THAL:
-	case a_f_REH:
-	case a_f_ZAIN:
-	case a_f_WAW:
-	case a_f_ALEF_MAKSURA:
-		return cur_c;
-	case a_f_BEH: return a_m_BEH;
-	case a_f_TEH: return a_m_TEH;
-	case a_f_THEH: return a_m_THEH;
-	case a_f_JEEM: return a_m_JEEM;
-	case a_f_HAH: return a_m_HAH;
-	case a_f_KHAH: return a_m_KHAH;
-	case a_f_SEEN: return a_m_SEEN;
-	case a_f_SHEEN: return a_m_SHEEN;
-	case a_f_SAD: return a_m_SAD;
-	case a_f_DAD: return a_m_DAD;
-	case a_f_TAH: return a_m_TAH;
-	case a_f_ZAH: return a_m_ZAH;
-	case a_f_AIN: return a_m_AIN;
-	case a_f_GHAIN: return a_m_GHAIN;
-	case a_f_FEH: return a_m_FEH;
-	case a_f_QAF: return a_m_QAF;
-	case a_f_KAF: return a_m_KAF;
-	case a_f_LAM: return a_m_LAM;
-	case a_f_MEEM: return a_m_MEEM;
-	case a_f_NOON: return a_m_NOON;
-	case a_f_HEH: return a_m_HEH;
-	case a_f_YEH: return a_m_YEH;
-
-	/* NOTE: these encodings are multi-positional, no ?
-	 * case a_f_LAM_ALEF_MADDA_ABOVE:
-	 * case a_f_LAM_ALEF_HAMZA_ABOVE:
-	 * case a_f_LAM_ALEF_HAMZA_BELOW:
-	 * case a_f_LAM_ALEF:
-	 */
-    }
-    return 0;
-}
-
-
 /*
  * Change shape - from Combination (2 char) to an Isolated
  */
     static int
 chg_c_laa2i(int hid_c)
 {
+    int tempc;
+
     switch (hid_c)
     {
-	case a_ALEF_MADDA: return a_s_LAM_ALEF_MADDA_ABOVE;
-	case a_ALEF_HAMZA_ABOVE: return a_s_LAM_ALEF_HAMZA_ABOVE;
-	case a_ALEF_HAMZA_BELOW: return a_s_LAM_ALEF_HAMZA_BELOW;
-	case a_ALEF: return a_s_LAM_ALEF;
+	case a_ALEF_MADDA:
+	    tempc = a_s_LAM_ALEF_MADDA_ABOVE;
+	    break;
+	case a_ALEF_HAMZA_ABOVE:
+	    tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
+	    break;
+	case a_ALEF_HAMZA_BELOW:
+	    tempc = a_s_LAM_ALEF_HAMZA_BELOW;
+	    break;
+	case a_ALEF:
+	    tempc = a_s_LAM_ALEF;
+	    break;
+	default:
+	    tempc = 0;
     }
-    return 0;
+
+    return tempc;
 }
 
-
 /*
  * Change shape - from Combination-Isolated to Final
  */
     static int
 chg_c_laa2f(int hid_c)
 {
+    int tempc;
+
     switch (hid_c)
     {
-	case a_ALEF_MADDA: return a_f_LAM_ALEF_MADDA_ABOVE;
-	case a_ALEF_HAMZA_ABOVE: return a_f_LAM_ALEF_HAMZA_ABOVE;
-	case a_ALEF_HAMZA_BELOW: return a_f_LAM_ALEF_HAMZA_BELOW;
-	case a_ALEF: return a_f_LAM_ALEF;
+	case a_ALEF_MADDA:
+	    tempc = a_f_LAM_ALEF_MADDA_ABOVE;
+	    break;
+	case a_ALEF_HAMZA_ABOVE:
+	    tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
+	    break;
+	case a_ALEF_HAMZA_BELOW:
+	    tempc = a_f_LAM_ALEF_HAMZA_BELOW;
+	    break;
+	case a_ALEF:
+	    tempc = a_f_LAM_ALEF;
+	    break;
+	default:
+	    tempc = 0;
     }
-    return 0;
+
+    return tempc;
+}
+
+/*
+ * Returns whether it is possible to join the given letters
+ */
+    static int
+can_join(int c1, int c2)
+{
+    struct achar *a1 = find_achar(c1);
+    struct achar *a2 = find_achar(c2);
+
+    return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
 }
 
 /*
- * Do "half-shaping" on character "c".  Return zero if no shaping.
+ * Check whether we are dealing with a character that could be regarded as an
+ * Arabic combining character, need to check the character before this.
+ */
+    int
+arabic_maycombine(int two)
+{
+    if (p_arshape && !p_tbidi)
+	return (two == a_ALEF_MADDA
+		    || two == a_ALEF_HAMZA_ABOVE
+		    || two == a_ALEF_HAMZA_BELOW
+		    || two == a_ALEF);
+    return FALSE;
+}
+
+/*
+ * Check whether we are dealing with Arabic combining characters.
+ * Note: these are NOT really composing characters!
+ */
+    int
+arabic_combine(
+    int		one,	    // first character
+    int		two)	    // character just after "one"
+{
+    if (one == a_LAM)
+	return arabic_maycombine(two);
+    return FALSE;
+}
+
+/*
+ * A_is_iso returns true if 'c' is a character listed in the achars[] table
+ *		(Arabic and Farsi letters, tatweel and harakat)
  */
     static int
-half_shape(int c)
+A_is_iso(int c)
+{
+    return find_achar(c) != NULL;
+}
+
+/*
+ * A_is_ok returns true if 'c' is in the achars[] table or is the BOM (0xfeff)
+ */
+    static int
+A_is_ok(int c)
 {
-    if (A_is_a(c))
-	return chg_c_a2i(c);
-    if (A_is_valid(c) && A_is_f(c))
-	return chg_c_f2m(c);
-    return 0;
+    return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
+}
+
+/*
+ * A_is_valid returns true if 'c' passes A_is_ok() and is not a_HAMZA
+ *		(a_HAMZA is the only exclusion)
+ */
+    static int
+A_is_valid(int c)
+{
+    return (A_is_ok(c) && c != a_HAMZA);
 }
 
 /*
@@ -567,48 +337,44 @@ arabic_shape(
     int		next_c)
 {
     int		curr_c;
-    int		shape_c;
     int		curr_laa;
     int		prev_laa;
 
-    /* Deal only with Arabic character, pass back all others */
+    // Deal only with Arabic characters, pass back all others
     if (!A_is_ok(c))
 	return c;
 
-    /* half-shape current and previous character */
-    shape_c = half_shape(prev_c);
-
-    curr_laa = A_firstc_laa(c, *c1p);
-    prev_laa = A_firstc_laa(prev_c, prev_c1);
+    curr_laa = arabic_combine(c, *c1p);
+    prev_laa = arabic_combine(prev_c, prev_c1);
 
     if (curr_laa)
     {
-	if (A_is_valid(prev_c) && !A_is_f(shape_c)
-					 && !A_is_s(shape_c) && !prev_laa)
-	    curr_c = chg_c_laa2f(curr_laa);
+	if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
+	    curr_c = chg_c_laa2f(*c1p);
 	else
-	    curr_c = chg_c_laa2i(curr_laa);
+	    curr_c = chg_c_laa2i(*c1p);
 
-	/* Remove the composing character */
+	// Remove the composing character
 	*c1p = 0;
     }
-    else if (!A_is_valid(prev_c) && A_is_valid(next_c))
-	curr_c = chg_c_a2i(c);
-    else if (!shape_c || A_is_f(shape_c) || A_is_s(shape_c) || prev_laa)
-	curr_c = A_is_valid(next_c) ? chg_c_a2i(c) : chg_c_a2s(c);
-    else if (A_is_valid(next_c))
-#if 0
-	curr_c = A_is_iso(c) ? chg_c_a2m(c) : chg_c_i2m(c);
-#else
-	curr_c = A_is_iso(c) ? chg_c_a2m(c) : 0;
-#endif
-    else if (A_is_valid(prev_c))
-	curr_c = chg_c_a2f(c);
     else
-	curr_c = chg_c_a2s(c);
+    {
+	struct achar *curr_a = find_achar(c);
+	int backward_combine = !prev_laa && can_join(prev_c, c);
+	int forward_combine = can_join(c, next_c);
 
-    /* Sanity check -- curr_c should, in the future, never be 0.
-     * We should, in the future, insert a fatal error here. */
+	if (backward_combine && forward_combine)
+	    curr_c = curr_a->medial;
+	if (backward_combine && !forward_combine)
+	    curr_c = curr_a->final;
+	if (!backward_combine && forward_combine)
+	    curr_c = curr_a->initial;
+	if (!backward_combine && !forward_combine)
+	    curr_c = curr_a->isolated;
+    }
+
+    // Sanity check -- curr_c should, in the future, never be 0.
+    // We should, in the future, insert a fatal error here.
     if (curr_c == NUL)
 	curr_c = c;
 
@@ -616,97 +382,12 @@ arabic_shape(
     {
 	char_u buf[MB_MAXBYTES + 1];
 
-	/* Update the first byte of the character. */
+	// Update the first byte of the character.
 	(*mb_char2bytes)(curr_c, buf);
 	*ccp = buf[0];
     }
 
-    /* Return the shaped character */
+    // Return the shaped character
     return curr_c;
 }
-
-
-/*
- * A_firstc_laa returns first character of LAA combination if it exists
- */
-    static int
-A_firstc_laa(
-    int c,	/* base character */
-    int c1)	/* first composing character */
-{
-    if (c1 != NUL && c == a_LAM && !A_is_harakat(c1))
-	return c1;
-    return 0;
-}
-
-
-/*
- * A_is_harakat returns TRUE if 'c' is an Arabic Harakat character
- *		(harakat/tanween)
- */
-    static int
-A_is_harakat(int c)
-{
-    return (c >= a_FATHATAN && c <= a_SUKUN);
-}
-
-
-/*
- * A_is_iso returns TRUE if 'c' is an Arabic ISO-8859-6 character
- *		(alphabet/number/punctuation)
- */
-    static int
-A_is_iso(int c)
-{
-    return ((c >= a_HAMZA && c <= a_GHAIN)
-	    || (c >= a_TATWEEL && c <= a_HAMZA_BELOW)
-	    || c == a_MINI_ALEF);
-}
-
-
-/*
- * A_is_formb returns TRUE if 'c' is an Arabic 10646-1 FormB character
- *		(alphabet/number/punctuation)
- */
-    static int
-A_is_formb(int c)
-{
-    return ((c >= a_s_FATHATAN && c <= a_s_DAMMATAN)
-	    || c == a_s_KASRATAN
-	    || (c >= a_s_FATHA && c <= a_f_LAM_ALEF)
-	    || c == a_BYTE_ORDER_MARK);
-}
-
-
-/*
- * A_is_ok returns TRUE if 'c' is an Arabic 10646 (8859-6 or Form-B)
- */
-    static int
-A_is_ok(int c)
-{
-    return (A_is_iso(c) || A_is_formb(c));
-}
-
-
-/*
- * A_is_valid returns TRUE if 'c' is an Arabic 10646 (8859-6 or Form-B)
- *		with some exceptions/exclusions
- */
-    static int
-A_is_valid(int c)
-{
-    return (A_is_ok(c) && !A_is_special(c));
-}
-
-
-/*
- * A_is_special returns TRUE if 'c' is not a special Arabic character.
- *		Specials don't adhere to most of the rules.
- */
-    static int
-A_is_special(int c)
-{
-    return (c == a_HAMZA || c == a_s_HAMZA);
-}
-
-#endif /* FEAT_ARABIC */
+#endif // FEAT_ARABIC