comparison src/charset.c @ 258:f93df7322443

updated for version 7.0070
author vimboss
date Sat, 23 Apr 2005 20:52:00 +0000
parents 4707450c2b33
children b3c0268f7815
comparison
equal deleted inserted replaced
257:51a4d1c2a95b 258:f93df7322443
913 return mb_get_class(p) >= 2; 913 return mb_get_class(p) >= 2;
914 # endif 914 # endif
915 return (GET_CHARTAB(buf, *p) != 0); 915 return (GET_CHARTAB(buf, *p) != 0);
916 } 916 }
917 917
918 static char spell_chartab[256]; 918 /*
919 * The tables used for spelling. These are only used for the first 256
920 * characters.
921 */
922 typedef struct spelltab_S
923 {
924 char_u st_isw[256]; /* flags: is word char */
925 char_u st_isu[256]; /* flags: is uppercase char */
926 char_u st_fold[256]; /* chars: folded case */
927 } spelltab_T;
928
929 static spelltab_T spelltab;
930 static int did_set_spelltab;
931
932 #define SPELL_ISWORD 1
933 #define SPELL_ISUPPER 2
934
935 static void clear_spell_chartab __ARGS((spelltab_T *sp));
936 static int set_spell_finish __ARGS((spelltab_T *new_st));
937
938 /*
939 * Init the chartab used for spelling for ASCII.
940 * EBCDIC is not supported!
941 */
942 static void
943 clear_spell_chartab(sp)
944 spelltab_T *sp;
945 {
946 int i;
947
948 /* Init everything to FALSE. */
949 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
950 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
951 for (i = 0; i < 256; ++i)
952 sp->st_fold[i] = i;
953
954 /* We include digits. A word shouldn't start with a digit, but handling
955 * that is done separately. */
956 for (i = '0'; i <= '9'; ++i)
957 sp->st_isw[i] = TRUE;
958 for (i = 'A'; i <= 'Z'; ++i)
959 {
960 sp->st_isw[i] = TRUE;
961 sp->st_isu[i] = TRUE;
962 sp->st_fold[i] = i + 0x20;
963 }
964 for (i = 'a'; i <= 'z'; ++i)
965 sp->st_isw[i] = TRUE;
966 }
919 967
920 /* 968 /*
921 * Init the chartab used for spelling. Only depends on 'encoding'. 969 * Init the chartab used for spelling. Only depends on 'encoding'.
922 * Called once while starting up and when 'encoding' was changed. 970 * Called once while starting up and when 'encoding' changes.
923 * Unfortunately, we can't use isalpha() here, since the current locale may 971 * The default is to use isalpha(), but the spell file should define the word
924 * differ from 'encoding'. 972 * characters to make it possible that 'encoding' differs from the current
973 * locale.
925 */ 974 */
926 void 975 void
927 init_spell_chartab() 976 init_spell_chartab()
928 { 977 {
929 int i; 978 int i;
930 979
931 /* ASCII is always the same, no matter what 'encoding' is used. 980 did_set_spelltab = FALSE;
932 * EBCDIC is not supported! */ 981 clear_spell_chartab(&spelltab);
933 for (i = 0; i < '0'; ++i) 982
934 spell_chartab[i] = FALSE;
935 /* We include numbers. A word shouldn't start with a number, but handling
936 * that is done separately. */
937 for ( ; i <= '9'; ++i)
938 spell_chartab[i] = TRUE;
939 for ( ; i < 'A'; ++i)
940 spell_chartab[i] = FALSE;
941 for ( ; i <= 'Z'; ++i)
942 spell_chartab[i] = TRUE;
943 for ( ; i < 'a'; ++i)
944 spell_chartab[i] = FALSE;
945 for ( ; i <= 'z'; ++i)
946 spell_chartab[i] = TRUE;
947 #ifdef FEAT_MBYTE 983 #ifdef FEAT_MBYTE
948 if (enc_dbcs) 984 if (enc_dbcs)
949 { 985 {
950 /* DBCS: assume double-wide characters are word characters. */ 986 /* DBCS: assume double-wide characters are word characters. */
951 for ( ; i <= 255; ++i) 987 for (i = 128; i <= 255; ++i)
952 if (MB_BYTE2LEN(i) == 2) 988 if (MB_BYTE2LEN(i) == 2)
953 spell_chartab[i] = TRUE; 989 spelltab.st_isw[i] = TRUE;
954 else
955 spell_chartab[i] = FALSE;
956 }
957 else if (STRCMP(p_enc, "cp850") == 0)
958 #endif
959 #if defined(MSDOS) || defined(FEAT_MBYTE)
960 {
961 /* cp850, MS-DOS */
962 for ( ; i < 128; ++i)
963 spell_chartab[i] = FALSE;
964 for ( ; i <= 0x9a; ++i)
965 spell_chartab[i] = TRUE;
966 for ( ; i < 0xa0; ++i)
967 spell_chartab[i] = FALSE;
968 for ( ; i <= 0xa5; ++i)
969 spell_chartab[i] = TRUE;
970 for ( ; i <= 255; ++i)
971 spell_chartab[i] = FALSE;
972 }
973 #endif
974 #ifdef FEAT_MBYTE
975 else if (STRCMP(p_enc, "iso-8859-2") == 0)
976 {
977 /* latin2 */
978 for ( ; i <= 0xa0; ++i)
979 spell_chartab[i] = FALSE;
980 for ( ; i <= 255; ++i)
981 spell_chartab[i] = TRUE;
982 spell_chartab[0xa4] = FALSE; /* currency sign */
983 spell_chartab[0xa7] = FALSE; /* paragraph sign */
984 spell_chartab[0xad] = FALSE; /* dash */
985 spell_chartab[0xb0] = FALSE; /* degrees */
986 spell_chartab[0xf7] = FALSE; /* divide-by */
987 } 990 }
988 else 991 else
989 #endif 992 #endif
990 #if defined(FEAT_MBYTE) || !defined(MSDOS) 993 {
991 { 994 /* Rough guess: use isalpha() for characters above 128. */
992 /* Rough guess: anything we don't recognize assumes word characters 995 for (i = 128; i < 256; ++i)
993 * like latin1. */ 996 {
994 for ( ; i < 0xc0; ++i) 997 spelltab.st_isw[i] = isalpha(i);
995 spell_chartab[i] = FALSE; 998 if (isupper(i))
996 for ( ; i <= 255; ++i) 999 {
997 spell_chartab[i] = TRUE; 1000 spelltab.st_isu[i] = TRUE;
998 # ifdef FEAT_MBYTE 1001 spelltab.st_fold[i] = tolower(i);
999 if (STRCMP(p_enc, "latin1") == 0) 1002 }
1000 # endif 1003 }
1001 spell_chartab[0xf7] = FALSE; /* divide-by */ 1004 }
1002 } 1005 }
1003 #endif 1006
1004 } 1007 static char *e_affform = N_("E761: Format error in affix file FOL, LOW or UPP");
1005 1008 static char *e_affrange = N_("E762: Character in FOL, LOW or UPP is out of range");
1006 /* 1009
1007 * Return TRUE if "p" points to a word character. 1010 /*
1008 * This only depends on 'encoding', not on 'iskeyword'. 1011 * Set the spell character tables from strings in the affix file.
1012 */
1013 int
1014 set_spell_chartab(fol, low, upp)
1015 char_u *fol;
1016 char_u *low;
1017 char_u *upp;
1018 {
1019 /* We build the new tables here first, so that we can compare with the
1020 * previous one. */
1021 spelltab_T new_st;
1022 char_u *pf = fol, *pl = low, *pu = upp;
1023 int f, l, u;
1024
1025 clear_spell_chartab(&new_st);
1026
1027 while (*pf != NUL)
1028 {
1029 if (*pl == NUL || *pu == NUL)
1030 {
1031 EMSG(_(e_affform));
1032 return FAIL;
1033 }
1034 #ifdef FEAT_MBYTE
1035 f = mb_ptr2char_adv(&pf);
1036 l = mb_ptr2char_adv(&pl);
1037 u = mb_ptr2char_adv(&pu);
1038 #else
1039 f = *pf++;
1040 l = *pl++;
1041 u = *pu++;
1042 #endif
1043 /* Every character that appears is a word character. */
1044 if (f < 256)
1045 new_st.st_isw[f] = TRUE;
1046 if (l < 256)
1047 new_st.st_isw[l] = TRUE;
1048 if (u < 256)
1049 new_st.st_isw[u] = TRUE;
1050
1051 /* if "LOW" and "FOL" are not the same the "LOW" char needs
1052 * case-folding */
1053 if (l < 256 && l != f)
1054 {
1055 if (f >= 256)
1056 {
1057 EMSG(_(e_affrange));
1058 return FAIL;
1059 }
1060 new_st.st_fold[l] = f;
1061 }
1062
1063 /* if "UPP" and "FOL" are not the same the "UPP" char needs
1064 * case-folding and it's upper case. */
1065 if (u < 256 && u != f)
1066 {
1067 if (f >= 256)
1068 {
1069 EMSG(_(e_affrange));
1070 return FAIL;
1071 }
1072 new_st.st_fold[u] = f;
1073 new_st.st_isu[u] = TRUE;
1074 }
1075 }
1076
1077 if (*pl != NUL || *pu != NUL)
1078 {
1079 EMSG(_(e_affform));
1080 return FAIL;
1081 }
1082
1083 return set_spell_finish(&new_st);
1084 }
1085
1086 /*
1087 * Set the spell character tables from strings in the .spl file.
1088 */
1089 int
1090 set_spell_charflags(flags, cnt, upp)
1091 char_u *flags;
1092 int cnt;
1093 char_u *upp;
1094 {
1095 /* We build the new tables here first, so that we can compare with the
1096 * previous one. */
1097 spelltab_T new_st;
1098 int i;
1099 char_u *p = upp;
1100
1101 clear_spell_chartab(&new_st);
1102
1103 for (i = 0; i < cnt; ++i)
1104 {
1105 new_st.st_isw[i + 128] = (flags[i] & SPELL_ISWORD) != 0;
1106 new_st.st_isu[i + 128] = (flags[i] & SPELL_ISUPPER) != 0;
1107
1108 if (*p == NUL)
1109 return FAIL;
1110 #ifdef FEAT_MBYTE
1111 new_st.st_fold[i + 128] = mb_ptr2char_adv(&p);
1112 #else
1113 new_st.st_fold[i + 128] = *p++;
1114 #endif
1115 }
1116
1117 return set_spell_finish(&new_st);
1118 }
1119
1120 static int
1121 set_spell_finish(new_st)
1122 spelltab_T *new_st;
1123 {
1124 int i;
1125
1126 if (did_set_spelltab)
1127 {
1128 /* check that it's the same table */
1129 for (i = 0; i < 256; ++i)
1130 {
1131 if (spelltab.st_isw[i] != new_st->st_isw[i]
1132 || spelltab.st_isu[i] != new_st->st_isu[i]
1133 || spelltab.st_fold[i] != new_st->st_fold[i])
1134 {
1135 EMSG(_("E763: Word characters differ between spell files"));
1136 return FAIL;
1137 }
1138 }
1139 }
1140 else
1141 {
1142 /* copy the new spelltab into the one being used */
1143 spelltab = *new_st;
1144 did_set_spelltab = TRUE;
1145 }
1146
1147 return OK;
1148 }
1149
1150 #if defined(FEAT_MBYTE) || defined(PROTO)
1151 /*
1152 * Write the current tables into the .spl file.
1153 */
1154 void
1155 write_spell_chartab(fd)
1156 FILE *fd;
1157 {
1158 char_u charbuf[256 * 4];
1159 int len = 0;
1160 int flags;
1161 int i;
1162
1163 if (!did_set_spelltab)
1164 {
1165 /* No character table specified, write zero counts. */
1166 fputc(0, fd);
1167 fputc(0, fd);
1168 fputc(0, fd);
1169 return;
1170 }
1171
1172 fputc(128, fd); /* <charflagslen> */
1173 for (i = 128; i < 256; ++i)
1174 {
1175 flags = 0;
1176 if (spelltab.st_isw[i])
1177 flags |= SPELL_ISWORD;
1178 if (spelltab.st_isu[i])
1179 flags |= SPELL_ISUPPER;
1180 fputc(flags, fd); /* <charflags> */
1181
1182 len += mb_char2bytes(spelltab.st_fold[i], charbuf + len);
1183 }
1184
1185 put_bytes(fd, (long_u)len, 2); /* <fcharlen> */
1186 fwrite(charbuf, (size_t)len, (size_t)1, fd); /* <fchars> */
1187 }
1188 #endif
1189
1190 /*
1191 * Return TRUE if "p" points to a word character for spelling.
1009 */ 1192 */
1010 int 1193 int
1011 spell_iswordc(p) 1194 spell_iswordc(p)
1012 char_u *p; 1195 char_u *p;
1013 { 1196 {
1014 # ifdef FEAT_MBYTE 1197 # ifdef FEAT_MBYTE
1015 if (has_mbyte && MB_BYTE2LEN(*p) > 1) 1198 if (has_mbyte && MB_BYTE2LEN(*p) > 1)
1016 return mb_get_class(p) >= 2; 1199 return mb_get_class(p) >= 2;
1017 # endif 1200 # endif
1018 return spell_chartab[*p]; 1201 return spelltab.st_isw[*p];
1019 } 1202 }
1020 #endif 1203
1204 /*
1205 * Return TRUE if "c" is an upper-case character for spelling.
1206 */
1207 int
1208 spell_isupper(c)
1209 int c;
1210 {
1211 # ifdef FEAT_MBYTE
1212 if (enc_utf8)
1213 {
1214 /* For Unicode we can call utf_isupper(), but don't do that for ASCII,
1215 * because we don't want to use 'casemap' here. */
1216 if (c >= 128)
1217 return utf_isupper(c);
1218 }
1219 else if (has_mbyte && c > 256)
1220 {
1221 /* For characters above 255 we don't have something specfied.
1222 * Fall back to locale-dependent iswupper(). If not available
1223 * simply return FALSE. */
1224 # ifdef HAVE_ISWUPPER
1225 return iswupper(c);
1226 # else
1227 return FALSE;
1228 # endif
1229 }
1230 # endif
1231 return spelltab.st_isu[c];
1232 }
1233
1234 /*
1235 * case-fold "p[len]" into "buf[buflen]". Used for spell checking.
1236 * Returns FAIL when something wrong.
1237 */
1238 int
1239 spell_casefold(p, len, buf, buflen)
1240 char_u *p;
1241 int len;
1242 char_u *buf;
1243 int buflen;
1244 {
1245 int i;
1246
1247 if (len >= buflen)
1248 {
1249 buf[0] = NUL;
1250 return FAIL; /* result will not fit */
1251 }
1252
1253 #ifdef FEAT_MBYTE
1254 if (has_mbyte)
1255 {
1256 int c;
1257 int outi = 0;
1258
1259 /* Fold one character at a time. */
1260 for (i = 0; i < len; i += mb_ptr2len_check(p + i))
1261 {
1262 c = mb_ptr2char(p + i);
1263 if (enc_utf8)
1264 /* For Unicode case folding is always the same, no need to use
1265 * the table from the spell file. */
1266 c = utf_fold(c);
1267 else if (c < 256)
1268 /* Use the table from the spell file. */
1269 c = spelltab.st_fold[c];
1270 # ifdef HAVE_TOWLOWER
1271 else
1272 /* We don't know what to do, fall back to towlower(), it
1273 * depends on the current locale. */
1274 c = towlower(c);
1275 # endif
1276 if (outi + MB_MAXBYTES > buflen)
1277 {
1278 buf[outi] = NUL;
1279 return FAIL;
1280 }
1281 outi += mb_char2bytes(c, buf + outi);
1282 }
1283 buf[outi] = NUL;
1284 }
1285 else
1286 #endif
1287 {
1288 /* Be quick for non-multibyte encodings. */
1289 for (i = 0; i < len; ++i)
1290 buf[i] = spelltab.st_fold[p[i]];
1291 buf[i] = NUL;
1292 }
1293
1294 return OK;
1295 }
1296
1297 #endif /* FEAT_SYN_HL */
1021 1298
1022 /* 1299 /*
1023 * return TRUE if 'c' is a valid file-name character 1300 * return TRUE if 'c' is a valid file-name character
1024 * Assume characters above 0x100 are valid (multi-byte). 1301 * Assume characters above 0x100 are valid (multi-byte).
1025 */ 1302 */