Mercurial > vim
comparison src/fileio.c @ 595:fea48f63efc8
updated for version 7.0169
author | vimboss |
---|---|
date | Tue, 13 Dec 2005 20:02:15 +0000 |
parents | 6a91f35b354d |
children | ba54311bc43e |
comparison
equal
deleted
inserted
replaced
594:35cef95a6b76 | 595:fea48f63efc8 |
---|---|
122 }; | 122 }; |
123 | 123 |
124 static int buf_write_bytes __ARGS((struct bw_info *ip)); | 124 static int buf_write_bytes __ARGS((struct bw_info *ip)); |
125 | 125 |
126 #ifdef FEAT_MBYTE | 126 #ifdef FEAT_MBYTE |
127 static linenr_T readfile_linenr __ARGS((linenr_T linecnt, char_u *p, char_u *endp)); | |
127 static int ucs2bytes __ARGS((unsigned c, char_u **pp, int flags)); | 128 static int ucs2bytes __ARGS((unsigned c, char_u **pp, int flags)); |
128 static int same_encoding __ARGS((char_u *a, char_u *b)); | 129 static int same_encoding __ARGS((char_u *a, char_u *b)); |
129 static int get_fio_flags __ARGS((char_u *ptr)); | 130 static int get_fio_flags __ARGS((char_u *ptr)); |
130 static char_u *check_for_bom __ARGS((char_u *p, long size, int *lenp, int flags)); | 131 static char_u *check_for_bom __ARGS((char_u *p, long size, int *lenp, int flags)); |
131 static int make_bom __ARGS((char_u *buf, char_u *name)); | 132 static int make_bom __ARGS((char_u *buf, char_u *name)); |
135 # ifdef MACOS_X | 136 # ifdef MACOS_X |
136 static int get_mac_fio_flags __ARGS((char_u *ptr)); | 137 static int get_mac_fio_flags __ARGS((char_u *ptr)); |
137 # endif | 138 # endif |
138 #endif | 139 #endif |
139 static int move_lines __ARGS((buf_T *frombuf, buf_T *tobuf)); | 140 static int move_lines __ARGS((buf_T *frombuf, buf_T *tobuf)); |
141 | |
140 | 142 |
141 void | 143 void |
142 filemess(buf, name, s, attr) | 144 filemess(buf, name, s, attr) |
143 buf_T *buf; | 145 buf_T *buf; |
144 char_u *name; | 146 char_u *name; |
255 int try_dos = (vim_strchr(p_ffs, 'd') != NULL); | 257 int try_dos = (vim_strchr(p_ffs, 'd') != NULL); |
256 int try_unix = (vim_strchr(p_ffs, 'x') != NULL); | 258 int try_unix = (vim_strchr(p_ffs, 'x') != NULL); |
257 int file_rewind = FALSE; | 259 int file_rewind = FALSE; |
258 #ifdef FEAT_MBYTE | 260 #ifdef FEAT_MBYTE |
259 int can_retry; | 261 int can_retry; |
260 int conv_error = FALSE; /* conversion error detected */ | 262 linenr_T conv_error = 0; /* line nr with conversion error */ |
263 linenr_T illegal_byte = 0; /* line nr with illegal byte */ | |
261 int keep_dest_enc = FALSE; /* don't retry when char doesn't fit | 264 int keep_dest_enc = FALSE; /* don't retry when char doesn't fit |
262 in destination encoding */ | 265 in destination encoding */ |
263 linenr_T illegal_byte = 0; /* line nr with illegal byte */ | 266 int bad_char_behavior = BAD_REPLACE; |
267 /* BAD_KEEP, BAD_DROP or character to | |
268 * replace with */ | |
264 char_u *tmpname = NULL; /* name of 'charconvert' output file */ | 269 char_u *tmpname = NULL; /* name of 'charconvert' output file */ |
265 int fio_flags = 0; | 270 int fio_flags = 0; |
266 char_u *fenc; /* fileencoding to use */ | 271 char_u *fenc; /* fileencoding to use */ |
267 int fenc_alloced; /* fenc_next is in allocated memory */ | 272 int fenc_alloced; /* fenc_next is in allocated memory */ |
268 char_u *fenc_next = NULL; /* next item in 'fencs' or NULL */ | 273 char_u *fenc_next = NULL; /* next item in 'fencs' or NULL */ |
752 * fileformat, and after the autocommands, which may change them. | 757 * fileformat, and after the autocommands, which may change them. |
753 */ | 758 */ |
754 linecnt = curbuf->b_ml.ml_line_count; | 759 linecnt = curbuf->b_ml.ml_line_count; |
755 | 760 |
756 #ifdef FEAT_MBYTE | 761 #ifdef FEAT_MBYTE |
762 /* "++bad=" argument. */ | |
763 if (eap != NULL && eap->bad_char != 0) | |
764 bad_char_behavior = eap->bad_char; | |
765 | |
757 /* | 766 /* |
758 * Decide which 'encoding' to use first. | 767 * Decide which 'encoding' to use or use first. |
759 */ | 768 */ |
760 if (eap != NULL && eap->force_enc != 0) | 769 if (eap != NULL && eap->force_enc != 0) |
761 { | 770 { |
762 fenc = enc_canonize(eap->cmd + eap->force_enc); | 771 fenc = enc_canonize(eap->cmd + eap->force_enc); |
763 fenc_alloced = TRUE; | 772 fenc_alloced = TRUE; |
773 keep_dest_enc = TRUE; | |
764 } | 774 } |
765 else if (curbuf->b_p_bin) | 775 else if (curbuf->b_p_bin) |
766 { | 776 { |
767 fenc = (char_u *)""; /* binary: don't convert */ | 777 fenc = (char_u *)""; /* binary: don't convert */ |
768 fenc_alloced = FALSE; | 778 fenc_alloced = FALSE; |
862 ml_delete(lnum--, FALSE); | 872 ml_delete(lnum--, FALSE); |
863 file_rewind = FALSE; | 873 file_rewind = FALSE; |
864 #ifdef FEAT_MBYTE | 874 #ifdef FEAT_MBYTE |
865 if (newfile) | 875 if (newfile) |
866 curbuf->b_p_bomb = FALSE; | 876 curbuf->b_p_bomb = FALSE; |
867 conv_error = FALSE; | 877 conv_error = 0; |
868 #endif | 878 #endif |
869 } | 879 } |
870 | 880 |
871 /* | 881 /* |
872 * When retrying with another "fenc" and the first time "fileformat" | 882 * When retrying with another "fenc" and the first time "fileformat" |
906 if (eap != NULL && eap->force_enc != 0) | 916 if (eap != NULL && eap->force_enc != 0) |
907 { | 917 { |
908 /* Conversion given with "++cc=" wasn't possible, read | 918 /* Conversion given with "++cc=" wasn't possible, read |
909 * without conversion. */ | 919 * without conversion. */ |
910 notconverted = TRUE; | 920 notconverted = TRUE; |
911 conv_error = FALSE; | 921 conv_error = 0; |
912 if (fenc_alloced) | 922 if (fenc_alloced) |
913 vim_free(fenc); | 923 vim_free(fenc); |
914 fenc = (char_u *)""; | 924 fenc = (char_u *)""; |
915 fenc_alloced = FALSE; | 925 fenc_alloced = FALSE; |
916 } | 926 } |
1041 goto retry; | 1051 goto retry; |
1042 } | 1052 } |
1043 } | 1053 } |
1044 } | 1054 } |
1045 | 1055 |
1046 /* Set can_retry when it's possible to rewind the file and try with | 1056 /* Set "can_retry" when it's possible to rewind the file and try with |
1047 * another "fenc" value. It's FALSE when no other "fenc" to try, reading | 1057 * another "fenc" value. It's FALSE when no other "fenc" to try, reading |
1048 * stdin or "fenc" was specified with "++enc=". */ | 1058 * stdin or fixed at a specific encoding. */ |
1049 can_retry = (*fenc != NUL && !read_stdin | 1059 can_retry = (*fenc != NUL && !read_stdin && !keep_dest_enc); |
1050 && (eap == NULL || eap->force_enc == 0)); | |
1051 #endif | 1060 #endif |
1052 | 1061 |
1053 if (!skip_read) | 1062 if (!skip_read) |
1054 { | 1063 { |
1055 linerest = 0; | 1064 linerest = 0; |
1227 { | 1236 { |
1228 if (size < 0) /* read error */ | 1237 if (size < 0) /* read error */ |
1229 error = TRUE; | 1238 error = TRUE; |
1230 #ifdef FEAT_MBYTE | 1239 #ifdef FEAT_MBYTE |
1231 else if (conv_restlen > 0) | 1240 else if (conv_restlen > 0) |
1232 /* some trailing bytes unconverted */ | 1241 { |
1233 conv_error = TRUE; | 1242 /* Reached end-of-file but some trailing bytes could |
1243 * not be converted. Truncated file? */ | |
1244 if (conv_error == 0) | |
1245 conv_error = linecnt; | |
1246 if (bad_char_behavior != BAD_DROP) | |
1247 { | |
1248 fio_flags = 0; /* don't convert this */ | |
1249 if (bad_char_behavior == BAD_KEEP) | |
1250 { | |
1251 /* Keep the trailing bytes as-is. */ | |
1252 size = conv_restlen; | |
1253 ptr -= conv_restlen; | |
1254 } | |
1255 else | |
1256 { | |
1257 /* Replace the trailing bytes with the | |
1258 * replacement character. */ | |
1259 size = 1; | |
1260 *--ptr = bad_char_behavior; | |
1261 } | |
1262 conv_restlen = 0; | |
1263 } | |
1264 } | |
1234 #endif | 1265 #endif |
1235 } | 1266 } |
1236 | 1267 |
1237 #ifdef FEAT_CRYPT | 1268 #ifdef FEAT_CRYPT |
1238 /* | 1269 /* |
1347 while ((iconv(iconv_fd, (void *)&fromp, &from_size, | 1378 while ((iconv(iconv_fd, (void *)&fromp, &from_size, |
1348 &top, &to_size) | 1379 &top, &to_size) |
1349 == (size_t)-1 && ICONV_ERRNO != ICONV_EINVAL) | 1380 == (size_t)-1 && ICONV_ERRNO != ICONV_EINVAL) |
1350 || from_size > CONV_RESTLEN) | 1381 || from_size > CONV_RESTLEN) |
1351 { | 1382 { |
1352 if (!keep_dest_enc && can_retry) | 1383 if (can_retry) |
1353 goto rewind_retry; | 1384 goto rewind_retry; |
1354 if (!keep_dest_enc) | 1385 if (conv_error == 0) |
1355 conv_error = TRUE; | 1386 conv_error = readfile_linenr(linecnt, |
1356 | 1387 ptr, (char_u *)top); |
1357 /* Ignore a byte and try again. */ | 1388 |
1389 /* Deal with a bad byte and continue with the next. */ | |
1358 ++fromp; | 1390 ++fromp; |
1359 --from_size; | 1391 --from_size; |
1360 *top++ = '?'; | 1392 if (bad_char_behavior == BAD_KEEP) |
1361 --to_size; | 1393 { |
1394 *top++ = *(fromp - 1); | |
1395 --to_size; | |
1396 } | |
1397 else if (bad_char_behavior != BAD_DROP) | |
1398 { | |
1399 *top++ = bad_char_behavior; | |
1400 --to_size; | |
1401 } | |
1362 } | 1402 } |
1363 | 1403 |
1364 if (from_size > 0) | 1404 if (from_size > 0) |
1365 { | 1405 { |
1366 /* Some remaining characters, keep them for the next | 1406 /* Some remaining characters, keep them for the next |
1377 # endif | 1417 # endif |
1378 | 1418 |
1379 # ifdef WIN3264 | 1419 # ifdef WIN3264 |
1380 if (fio_flags & FIO_CODEPAGE) | 1420 if (fio_flags & FIO_CODEPAGE) |
1381 { | 1421 { |
1422 char_u *src, *dst; | |
1423 int u8c; | |
1424 WCHAR ucs2buf[3]; | |
1425 int ucs2len; | |
1426 int codepage = FIO_GET_CP(fio_flags); | |
1427 int bytelen; | |
1428 int found_bad; | |
1429 char replstr[2]; | |
1430 | |
1382 /* | 1431 /* |
1383 * Conversion from an MS-Windows codepage or UTF-8 to UTF-8 or | 1432 * Conversion from an MS-Windows codepage or UTF-8 to UTF-8 or |
1384 * a codepage, using standard MS-Windows functions. | 1433 * a codepage, using standard MS-Windows functions. This |
1385 * 1. find out how many ucs-2 characters there are. | 1434 * requires two steps: |
1386 * 2. convert from 'fileencoding' to ucs-2 | 1435 * 1. convert from 'fileencoding' to ucs-2 |
1387 * 3. convert from ucs-2 to 'encoding' | 1436 * 2. convert from ucs-2 to 'encoding' |
1437 * | |
1438 * Because there may be illegal bytes AND an incomplete byte | |
1439 * sequence at the end, we may have to do the conversion one | |
1440 * character at a time to get it right. | |
1388 */ | 1441 */ |
1389 char_u *ucsp; | 1442 |
1390 size_t from_size = size; | 1443 /* Replacement string for WideCharToMultiByte(). */ |
1391 int needed; | 1444 if (bad_char_behavior > 0) |
1392 char_u *p; | 1445 replstr[0] = bad_char_behavior; |
1393 int u8c; | 1446 else |
1447 replstr[0] = '?'; | |
1448 replstr[1] = NUL; | |
1394 | 1449 |
1395 /* | 1450 /* |
1396 * 1. find out how many ucs-2 characters there are. | 1451 * Move the bytes to the end of the buffer, so that we have |
1452 * room to put the result at the start. | |
1397 */ | 1453 */ |
1454 src = ptr + real_size - size; | |
1455 mch_memmove(src, ptr, size); | |
1456 | |
1457 /* | |
1458 * Do the conversion. | |
1459 */ | |
1460 dst = ptr; | |
1461 size = size; | |
1462 while (size > 0) | |
1463 { | |
1464 found_bad = FALSE; | |
1465 | |
1398 # ifdef CP_UTF8 /* VC 4.1 doesn't define CP_UTF8 */ | 1466 # ifdef CP_UTF8 /* VC 4.1 doesn't define CP_UTF8 */ |
1399 if (FIO_GET_CP(fio_flags) == CP_UTF8) | 1467 if (codepage == CP_UTF8) |
1468 { | |
1469 /* Handle CP_UTF8 input ourselves to be able to handle | |
1470 * trailing bytes properly. | |
1471 * Get one UTF-8 character from src. */ | |
1472 bytelen = utf_ptr2len_len(src, size); | |
1473 if (bytelen > size) | |
1474 { | |
1475 /* Only got some bytes of a character. Normally | |
1476 * it's put in "conv_rest", but if it's too long | |
1477 * deal with it as if they were illegal bytes. */ | |
1478 if (bytelen <= CONV_RESTLEN) | |
1479 break; | |
1480 | |
1481 /* weird overlong byte sequence */ | |
1482 bytelen = size; | |
1483 found_bad = TRUE; | |
1484 } | |
1485 else | |
1486 { | |
1487 u8c = utf_ptr2char(src); | |
1488 if (u8c > 0xffff) | |
1489 found_bad = TRUE; | |
1490 ucs2buf[0] = u8c; | |
1491 ucs2len = 1; | |
1492 } | |
1493 } | |
1494 else | |
1495 # endif | |
1496 { | |
1497 /* We don't know how long the byte sequence is, try | |
1498 * from one to three bytes. */ | |
1499 for (bytelen = 1; bytelen <= size && bytelen <= 3; | |
1500 ++bytelen) | |
1501 { | |
1502 ucs2len = MultiByteToWideChar(codepage, | |
1503 MB_ERR_INVALID_CHARS, | |
1504 (LPCSTR)src, bytelen, | |
1505 ucs2buf, 3); | |
1506 if (ucs2len > 0) | |
1507 break; | |
1508 } | |
1509 if (ucs2len == 0) | |
1510 { | |
1511 /* If we have only one byte then it's probably an | |
1512 * incomplete byte sequence. Otherwise discard | |
1513 * one byte as a bad character. */ | |
1514 if (size == 1) | |
1515 break; | |
1516 found_bad = TRUE; | |
1517 bytelen = 1; | |
1518 } | |
1519 } | |
1520 | |
1521 if (!found_bad) | |
1522 { | |
1523 int i; | |
1524 | |
1525 /* Convert "ucs2buf[ucs2len]" to 'enc' in "dst". */ | |
1526 if (enc_utf8) | |
1527 { | |
1528 /* From UCS-2 to UTF-8. Cannot fail. */ | |
1529 for (i = 0; i < ucs2len; ++i) | |
1530 dst += utf_char2bytes(ucs2buf[i], dst); | |
1531 } | |
1532 else | |
1533 { | |
1534 BOOL bad = FALSE; | |
1535 int dstlen; | |
1536 | |
1537 /* From UCS-2 to "enc_codepage". If the | |
1538 * conversion uses the default character "?", | |
1539 * the data doesn't fit in this encoding. */ | |
1540 dstlen = WideCharToMultiByte(enc_codepage, 0, | |
1541 (LPCWSTR)ucs2buf, ucs2len, | |
1542 (LPSTR)dst, (src - dst), | |
1543 replstr, &bad); | |
1544 if (bad) | |
1545 found_bad = TRUE; | |
1546 else | |
1547 dst += dstlen; | |
1548 } | |
1549 } | |
1550 | |
1551 if (found_bad) | |
1552 { | |
1553 /* Deal with bytes we can't convert. */ | |
1554 if (can_retry) | |
1555 goto rewind_retry; | |
1556 if (conv_error == 0) | |
1557 conv_error = readfile_linenr(linecnt, ptr, dst); | |
1558 if (bad_char_behavior != BAD_DROP) | |
1559 { | |
1560 if (bad_char_behavior == BAD_KEEP) | |
1561 { | |
1562 mch_memmove(dst, src, bytelen); | |
1563 dst += bytelen; | |
1564 } | |
1565 else | |
1566 *dst++ = bad_char_behavior; | |
1567 } | |
1568 } | |
1569 | |
1570 src += bytelen; | |
1571 size -= bytelen; | |
1572 } | |
1573 | |
1574 if (size > 0) | |
1400 { | 1575 { |
1401 int l, flen; | 1576 /* An incomplete byte sequence remaining. */ |
1402 | 1577 mch_memmove(conv_rest, src, size); |
1403 /* Handle CP_UTF8 ourselves to be able to handle trailing | 1578 conv_restlen = size; |
1404 * bytes properly. First find out the number of | |
1405 * characters and check for trailing bytes. */ | |
1406 needed = 0; | |
1407 p = ptr; | |
1408 for (flen = from_size; flen > 0; flen -= l) | |
1409 { | |
1410 l = utf_ptr2len_len(p, flen); | |
1411 if (l > flen) /* incomplete char */ | |
1412 { | |
1413 if (l > CONV_RESTLEN) | |
1414 /* weird overlong byte sequence */ | |
1415 goto rewind_retry; | |
1416 mch_memmove(conv_rest, p, flen); | |
1417 conv_restlen = flen; | |
1418 from_size -= flen; | |
1419 break; | |
1420 } | |
1421 if (l == 1 && *p >= 0x80) /* illegal byte */ | |
1422 goto rewind_retry; | |
1423 ++needed; | |
1424 p += l; | |
1425 } | |
1426 } | 1579 } |
1427 else | 1580 |
1428 # endif | 1581 /* The new size is equal to how much "dst" was advanced. */ |
1429 { | 1582 size = dst - ptr; |
1430 /* We can't tell if the last byte of an MBCS string is | |
1431 * valid and MultiByteToWideChar() returns zero if it | |
1432 * isn't. Try the whole string, and if that fails, bump | |
1433 * the last byte into conv_rest and try again. */ | |
1434 needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), | |
1435 MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, | |
1436 NULL, 0); | |
1437 if (needed == 0) | |
1438 { | |
1439 conv_rest[0] = ptr[from_size - 1]; | |
1440 conv_restlen = 1; | |
1441 --from_size; | |
1442 needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), | |
1443 MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, | |
1444 NULL, 0); | |
1445 } | |
1446 | |
1447 /* If there really is a conversion error, try using another | |
1448 * conversion. */ | |
1449 if (needed == 0) | |
1450 goto rewind_retry; | |
1451 } | |
1452 | |
1453 /* | |
1454 * 2. convert from 'fileencoding' to ucs-2 | |
1455 * | |
1456 * Put the result of conversion to UCS-2 at the end of the | |
1457 * buffer, then convert from UCS-2 to UTF-8 or "enc_codepage" | |
1458 * into the start of the buffer. If there is not enough space | |
1459 * just fail, there is probably something wrong. | |
1460 */ | |
1461 ucsp = ptr + real_size - (needed * sizeof(WCHAR)); | |
1462 if (ucsp < ptr + size) | |
1463 goto rewind_retry; | |
1464 | |
1465 # ifdef CP_UTF8 /* VC 4.1 doesn't define CP_UTF8 */ | |
1466 if (FIO_GET_CP(fio_flags) == CP_UTF8) | |
1467 { | |
1468 int l, flen; | |
1469 | |
1470 /* Convert from utf-8 to ucs-2. */ | |
1471 needed = 0; | |
1472 p = ptr; | |
1473 for (flen = from_size; flen > 0; flen -= l) | |
1474 { | |
1475 l = utf_ptr2len_len(p, flen); | |
1476 u8c = utf_ptr2char(p); | |
1477 ucsp[needed * 2] = (u8c & 0xff); | |
1478 ucsp[needed * 2 + 1] = (u8c >> 8); | |
1479 ++needed; | |
1480 p += l; | |
1481 } | |
1482 } | |
1483 else | |
1484 # endif | |
1485 needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), | |
1486 MB_ERR_INVALID_CHARS, (LPCSTR)ptr, | |
1487 from_size, (LPWSTR)ucsp, needed); | |
1488 | |
1489 /* | |
1490 * 3. convert from ucs-2 to 'encoding' | |
1491 */ | |
1492 if (enc_utf8) | |
1493 { | |
1494 /* From UCS-2 to UTF-8. Cannot fail. */ | |
1495 p = ptr; | |
1496 for (; needed > 0; --needed) | |
1497 { | |
1498 u8c = *ucsp++; | |
1499 u8c += (*ucsp++ << 8); | |
1500 p += utf_char2bytes(u8c, p); | |
1501 } | |
1502 size = p - ptr; | |
1503 } | |
1504 else | |
1505 { | |
1506 BOOL bad = FALSE; | |
1507 | |
1508 /* From UCS-2 to "enc_codepage". If the conversion uses | |
1509 * the default character "?", the data doesn't fit in this | |
1510 * encoding, so fail (unless forced). */ | |
1511 size = WideCharToMultiByte(enc_codepage, 0, | |
1512 (LPCWSTR)ucsp, needed, | |
1513 (LPSTR)ptr, real_size, "?", &bad); | |
1514 if (bad && !keep_dest_enc) | |
1515 goto rewind_retry; | |
1516 } | |
1517 } | 1583 } |
1518 else | 1584 else |
1519 # endif | 1585 # endif |
1520 # ifdef MACOS_X | 1586 # ifdef MACOS_X |
1521 if (fio_flags & FIO_MACROMAN) | 1587 if (fio_flags & FIO_MACROMAN) |
1626 if (p == ptr) | 1692 if (p == ptr) |
1627 { | 1693 { |
1628 /* Missing leading word. */ | 1694 /* Missing leading word. */ |
1629 if (can_retry) | 1695 if (can_retry) |
1630 goto rewind_retry; | 1696 goto rewind_retry; |
1631 conv_error = TRUE; | 1697 if (conv_error == 0) |
1698 conv_error = readfile_linenr(linecnt, | |
1699 ptr, p); | |
1700 if (bad_char_behavior == BAD_DROP) | |
1701 continue; | |
1702 if (bad_char_behavior != BAD_KEEP) | |
1703 u8c = bad_char_behavior; | |
1632 } | 1704 } |
1633 | 1705 |
1634 /* found second word of double-word, get the first | 1706 /* found second word of double-word, get the first |
1635 * word and compute the resulting character */ | 1707 * word and compute the resulting character */ |
1636 if (fio_flags & FIO_ENDIAN_L) | 1708 if (fio_flags & FIO_ENDIAN_L) |
1641 else | 1713 else |
1642 { | 1714 { |
1643 u16c = *--p; | 1715 u16c = *--p; |
1644 u16c += (*--p << 8); | 1716 u16c += (*--p << 8); |
1645 } | 1717 } |
1718 u8c = 0x10000 + ((u16c & 0x3ff) << 10) | |
1719 + (u8c & 0x3ff); | |
1720 | |
1646 /* Check if the word is indeed a leading word. */ | 1721 /* Check if the word is indeed a leading word. */ |
1647 if (u16c < 0xd800 || u16c > 0xdbff) | 1722 if (u16c < 0xd800 || u16c > 0xdbff) |
1648 { | 1723 { |
1649 if (can_retry) | 1724 if (can_retry) |
1650 goto rewind_retry; | 1725 goto rewind_retry; |
1651 conv_error = TRUE; | 1726 if (conv_error == 0) |
1727 conv_error = readfile_linenr(linecnt, | |
1728 ptr, p); | |
1729 if (bad_char_behavior == BAD_DROP) | |
1730 continue; | |
1731 if (bad_char_behavior != BAD_KEEP) | |
1732 u8c = bad_char_behavior; | |
1652 } | 1733 } |
1653 u8c = 0x10000 + ((u16c & 0x3ff) << 10) | |
1654 + (u8c & 0x3ff); | |
1655 } | 1734 } |
1656 } | 1735 } |
1657 else if (fio_flags & FIO_UCS4) | 1736 else if (fio_flags & FIO_UCS4) |
1658 { | 1737 { |
1659 if (fio_flags & FIO_ENDIAN_L) | 1738 if (fio_flags & FIO_ENDIAN_L) |
1676 if (*--p < 0x80) | 1755 if (*--p < 0x80) |
1677 u8c = *p; | 1756 u8c = *p; |
1678 else | 1757 else |
1679 { | 1758 { |
1680 len = utf_head_off(ptr, p); | 1759 len = utf_head_off(ptr, p); |
1760 p -= len; | |
1761 u8c = utf_ptr2char(p); | |
1681 if (len == 0) | 1762 if (len == 0) |
1682 { | 1763 { |
1683 /* Not a valid UTF-8 character, retry with | 1764 /* Not a valid UTF-8 character, retry with |
1684 * another fenc when possible, otherwise just | 1765 * another fenc when possible, otherwise just |
1685 * report the error. */ | 1766 * report the error. */ |
1686 if (can_retry) | 1767 if (can_retry) |
1687 goto rewind_retry; | 1768 goto rewind_retry; |
1688 conv_error = TRUE; | 1769 if (conv_error == 0) |
1770 conv_error = readfile_linenr(linecnt, | |
1771 ptr, p); | |
1772 if (bad_char_behavior == BAD_DROP) | |
1773 continue; | |
1774 if (bad_char_behavior != BAD_KEEP) | |
1775 u8c = bad_char_behavior; | |
1689 } | 1776 } |
1690 p -= len; | |
1691 u8c = utf_ptr2char(p); | |
1692 } | 1777 } |
1693 } | 1778 } |
1694 if (enc_utf8) /* produce UTF-8 */ | 1779 if (enc_utf8) /* produce UTF-8 */ |
1695 { | 1780 { |
1696 dest -= utf_char2len(u8c); | 1781 dest -= utf_char2len(u8c); |
1702 if (u8c >= 0x100) | 1787 if (u8c >= 0x100) |
1703 { | 1788 { |
1704 /* character doesn't fit in latin1, retry with | 1789 /* character doesn't fit in latin1, retry with |
1705 * another fenc when possible, otherwise just | 1790 * another fenc when possible, otherwise just |
1706 * report the error. */ | 1791 * report the error. */ |
1707 if (can_retry && !keep_dest_enc) | 1792 if (can_retry) |
1708 goto rewind_retry; | 1793 goto rewind_retry; |
1709 *dest = 0xBF; | 1794 if (conv_error == 0) |
1710 conv_error = TRUE; | 1795 conv_error = readfile_linenr(linecnt, ptr, p); |
1796 if (bad_char_behavior == BAD_DROP) | |
1797 ++dest; | |
1798 else if (bad_char_behavior == BAD_KEEP) | |
1799 *dest = u8c; | |
1800 else if (eap != NULL && eap->bad_char != 0) | |
1801 *dest = bad_char_behavior; | |
1802 else | |
1803 *dest = 0xBF; | |
1711 } | 1804 } |
1712 else | 1805 else |
1713 *dest = u8c; | 1806 *dest = u8c; |
1714 } | 1807 } |
1715 } | 1808 } |
1718 line_start = dest - linerest; | 1811 line_start = dest - linerest; |
1719 mch_memmove(line_start, buffer, (size_t)linerest); | 1812 mch_memmove(line_start, buffer, (size_t)linerest); |
1720 size = (long)((ptr + real_size) - dest); | 1813 size = (long)((ptr + real_size) - dest); |
1721 ptr = dest; | 1814 ptr = dest; |
1722 } | 1815 } |
1723 else if (enc_utf8 && !conv_error && !curbuf->b_p_bin) | 1816 else if (enc_utf8 && conv_error == 0 && !curbuf->b_p_bin) |
1724 { | 1817 { |
1725 /* Reading UTF-8: Check if the bytes are valid UTF-8. | 1818 /* Reading UTF-8: Check if the bytes are valid UTF-8. |
1726 * Need to start before "ptr" when part of the character was | 1819 * Need to start before "ptr" when part of the character was |
1727 * read in the previous read() call. */ | 1820 * read in the previous read() call. */ |
1728 for (p = ptr - utf_head_off(buffer, ptr); p < ptr + size; ++p) | 1821 for (p = ptr - utf_head_off(buffer, ptr); ; ++p) |
1729 { | 1822 { |
1823 int todo = (ptr + size) - p; | |
1824 int l; | |
1825 | |
1826 if (todo <= 0) | |
1827 break; | |
1730 if (*p >= 0x80) | 1828 if (*p >= 0x80) |
1731 { | 1829 { |
1732 len = utf_ptr2len(p); | |
1733 /* A length of 1 means it's an illegal byte. Accept | 1830 /* A length of 1 means it's an illegal byte. Accept |
1734 * an incomplete character at the end though, the next | 1831 * an incomplete character at the end though, the next |
1735 * read() will get the next bytes, we'll check it | 1832 * read() will get the next bytes, we'll check it |
1736 * then. */ | 1833 * then. */ |
1737 if (len == 1) | 1834 l = utf_ptr2len_len(p, todo); |
1835 if (l > todo) | |
1738 { | 1836 { |
1739 p += utf_byte2len(*p) - 1; | 1837 /* Incomplete byte sequence, the next read() |
1838 * should get them and check the bytes. */ | |
1839 p += todo; | |
1740 break; | 1840 break; |
1741 } | 1841 } |
1742 p += len - 1; | 1842 if (l == 1) |
1843 { | |
1844 /* Illegal byte. If we can try another encoding | |
1845 * do that. */ | |
1846 if (can_retry) | |
1847 break; | |
1848 | |
1849 /* Remember the first linenr with an illegal byte */ | |
1850 if (illegal_byte == 0) | |
1851 illegal_byte = readfile_linenr(linecnt, ptr, p); | |
1852 # ifdef USE_ICONV | |
1853 /* When we did a conversion report an error. */ | |
1854 if (iconv_fd != (iconv_t)-1 && conv_error == 0) | |
1855 conv_error = readfile_linenr(linecnt, ptr, p); | |
1856 # endif | |
1857 | |
1858 /* Drop, keep or replace the bad byte. */ | |
1859 if (bad_char_behavior == BAD_DROP) | |
1860 { | |
1861 mch_memmove(p, p+1, todo - 1); | |
1862 --p; | |
1863 --size; | |
1864 } | |
1865 else if (bad_char_behavior != BAD_KEEP) | |
1866 *p = bad_char_behavior; | |
1867 } | |
1868 p += l - 1; | |
1743 } | 1869 } |
1744 } | 1870 } |
1745 if (p < ptr + size) | 1871 if (p < ptr + size) |
1746 { | 1872 { |
1747 /* Detected a UTF-8 error. */ | 1873 /* Detected a UTF-8 error. */ |
1748 if (can_retry) | |
1749 { | |
1750 rewind_retry: | 1874 rewind_retry: |
1751 /* Retry reading with another conversion. */ | 1875 /* Retry reading with another conversion. */ |
1752 # if defined(FEAT_EVAL) && defined(USE_ICONV) | 1876 # if defined(FEAT_EVAL) && defined(USE_ICONV) |
1753 if (*p_ccv != NUL && iconv_fd != (iconv_t)-1) | 1877 if (*p_ccv != NUL && iconv_fd != (iconv_t)-1) |
1754 /* iconv() failed, try 'charconvert' */ | 1878 /* iconv() failed, try 'charconvert' */ |
1755 did_iconv = TRUE; | 1879 did_iconv = TRUE; |
1756 else | |
1757 # endif | |
1758 /* use next item from 'fileencodings' */ | |
1759 advance_fenc = TRUE; | |
1760 file_rewind = TRUE; | |
1761 goto retry; | |
1762 } | |
1763 | |
1764 /* There is no alternative fenc, just report the error. */ | |
1765 # ifdef USE_ICONV | |
1766 if (iconv_fd != (iconv_t)-1) | |
1767 conv_error = TRUE; | |
1768 else | 1880 else |
1769 # endif | 1881 # endif |
1770 if (illegal_byte == 0) /* Keep the first linenr */ | 1882 /* use next item from 'fileencodings' */ |
1771 { | 1883 advance_fenc = TRUE; |
1772 char_u *s; | 1884 file_rewind = TRUE; |
1773 | 1885 goto retry; |
1774 /* Estimate the line number. */ | |
1775 illegal_byte = curbuf->b_ml.ml_line_count - linecnt + 1; | |
1776 for (s = ptr; s < p; ++s) | |
1777 if (*s == '\n') | |
1778 ++illegal_byte; | |
1779 } | |
1780 } | 1886 } |
1781 } | 1887 } |
1782 #endif | 1888 #endif |
1783 | 1889 |
1784 /* count the number of characters (after conversion!) */ | 1890 /* count the number of characters (after conversion!) */ |
2157 STRCAT(IObuff, _("[crypted]")); | 2263 STRCAT(IObuff, _("[crypted]")); |
2158 c = TRUE; | 2264 c = TRUE; |
2159 } | 2265 } |
2160 #endif | 2266 #endif |
2161 #ifdef FEAT_MBYTE | 2267 #ifdef FEAT_MBYTE |
2162 if (conv_error) | 2268 if (conv_error != 0) |
2163 { | 2269 { |
2164 STRCAT(IObuff, _("[CONVERSION ERROR]")); | 2270 sprintf((char *)IObuff + STRLEN(IObuff), |
2271 _("[CONVERSION ERROR in line %ld]"), (long)conv_error); | |
2165 c = TRUE; | 2272 c = TRUE; |
2166 } | 2273 } |
2167 else if (illegal_byte > 0) | 2274 else if (illegal_byte > 0) |
2168 { | 2275 { |
2169 sprintf((char *)IObuff + STRLEN(IObuff), | 2276 sprintf((char *)IObuff + STRLEN(IObuff), |
2213 } | 2320 } |
2214 | 2321 |
2215 /* with errors writing the file requires ":w!" */ | 2322 /* with errors writing the file requires ":w!" */ |
2216 if (newfile && (error | 2323 if (newfile && (error |
2217 #ifdef FEAT_MBYTE | 2324 #ifdef FEAT_MBYTE |
2218 || conv_error | 2325 || conv_error != 0 |
2219 #endif | 2326 #endif |
2220 )) | 2327 )) |
2221 curbuf->b_p_ro = TRUE; | 2328 curbuf->b_p_ro = TRUE; |
2222 | 2329 |
2223 u_clearline(); /* cannot use "U" command after adding lines */ | 2330 u_clearline(); /* cannot use "U" command after adding lines */ |
2294 | 2401 |
2295 if (recoverymode && error) | 2402 if (recoverymode && error) |
2296 return FAIL; | 2403 return FAIL; |
2297 return OK; | 2404 return OK; |
2298 } | 2405 } |
2406 | |
2407 #ifdef FEAT_MBYTE | |
2408 | |
2409 /* | |
2410 * From the current line count and characters read after that, estimate the | |
2411 * line number where we are now. | |
2412 * Used for error messages that include a line number. | |
2413 */ | |
2414 static linenr_T | |
2415 readfile_linenr(linecnt, p, endp) | |
2416 linenr_T linecnt; /* line count before reading more bytes */ | |
2417 char_u *p; /* start of more bytes read */ | |
2418 char_u *endp; /* end of more bytes read */ | |
2419 { | |
2420 char_u *s; | |
2421 linenr_T lnum; | |
2422 | |
2423 lnum = curbuf->b_ml.ml_line_count - linecnt + 1; | |
2424 for (s = p; s < endp; ++s) | |
2425 if (*s == '\n') | |
2426 ++lnum; | |
2427 return lnum; | |
2428 } | |
2429 #endif | |
2299 | 2430 |
2300 /* | 2431 /* |
2301 * Fill "*eap" to force the 'fileencoding' and 'fileformat' to be equal to the | 2432 * Fill "*eap" to force the 'fileencoding' and 'fileformat' to be equal to the |
2302 * buffer "buf". Used for calling readfile(). | 2433 * buffer "buf". Used for calling readfile(). |
2303 * Returns OK or FAIL. | 2434 * Returns OK or FAIL. |