Mercurial > vim
comparison src/regexp.c @ 24:8ff7fd162d3c v7.0016
updated for version 7.0016
author | vimboss |
---|---|
date | Mon, 13 Sep 2004 20:26:32 +0000 |
parents | 3fc0f57ecb91 |
children | ac33b7c03fac |
comparison
equal
deleted
inserted
replaced
23:3f44e9abe4ec | 24:8ff7fd162d3c |
---|---|
31 * | 31 * |
32 * Beware that some of this code is subtly aware of the way operator | 32 * Beware that some of this code is subtly aware of the way operator |
33 * precedence is structured in regular expressions. Serious changes in | 33 * precedence is structured in regular expressions. Serious changes in |
34 * regular-expression syntax might require a total rethink. | 34 * regular-expression syntax might require a total rethink. |
35 * | 35 * |
36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb | 36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert |
37 * and Bram Moolenaar. | 37 * Webb, Ciaran McCreesh and Bram Moolenaar. |
38 * Named character class support added by Walter Briscoe (1998 Jul 01) | 38 * Named character class support added by Walter Briscoe (1998 Jul 01) |
39 */ | 39 */ |
40 | 40 |
41 #include "vim.h" | 41 #include "vim.h" |
42 | 42 |
374 * \n - New line (NL). | 374 * \n - New line (NL). |
375 * \r - Carriage Return (CR). | 375 * \r - Carriage Return (CR). |
376 * \t - Tab (TAB). | 376 * \t - Tab (TAB). |
377 * \e - Escape (ESC). | 377 * \e - Escape (ESC). |
378 * \b - Backspace (Ctrl_H). | 378 * \b - Backspace (Ctrl_H). |
379 * \d - Character code in decimal, eg \d123 | |
380 * \o - Character code in octal, eg \o40 | |
381 * \x - Character code in hex, eg \x4a | |
382 * \u - Multibyte character code, eg \u20ac | |
383 * \U - Long multibyte character code, eg \U12345678 | |
379 */ | 384 */ |
380 static char_u REGEXP_INRANGE[] = "]^-n\\"; | 385 static char_u REGEXP_INRANGE[] = "]^-n\\"; |
381 static char_u REGEXP_ABBR[] = "nrteb"; | 386 static char_u REGEXP_ABBR[] = "nrtebdoxuU"; |
382 | 387 |
383 static int backslash_trans __ARGS((int c)); | 388 static int backslash_trans __ARGS((int c)); |
384 static int skip_class_name __ARGS((char_u **pp)); | 389 static int skip_class_name __ARGS((char_u **pp)); |
385 static char_u *skip_anyof __ARGS((char_u *p)); | 390 static char_u *skip_anyof __ARGS((char_u *p)); |
386 static void init_class_tab __ARGS((void)); | 391 static void init_class_tab __ARGS((void)); |
679 static int getchr __ARGS((void)); | 684 static int getchr __ARGS((void)); |
680 static void skipchr_keepstart __ARGS((void)); | 685 static void skipchr_keepstart __ARGS((void)); |
681 static int peekchr __ARGS((void)); | 686 static int peekchr __ARGS((void)); |
682 static void skipchr __ARGS((void)); | 687 static void skipchr __ARGS((void)); |
683 static void ungetchr __ARGS((void)); | 688 static void ungetchr __ARGS((void)); |
689 static int gethexchrs __ARGS((int maxinputlen)); | |
690 static int getoctchrs __ARGS((void)); | |
691 static int getdecchrs __ARGS((void)); | |
692 static int coll_get_char __ARGS((void)); | |
684 static void regcomp_start __ARGS((char_u *expr, int flags)); | 693 static void regcomp_start __ARGS((char_u *expr, int flags)); |
685 static char_u *reg __ARGS((int, int *)); | 694 static char_u *reg __ARGS((int, int *)); |
686 static char_u *regbranch __ARGS((int *flagp)); | 695 static char_u *regbranch __ARGS((int *flagp)); |
687 static char_u *regconcat __ARGS((int *flagp)); | 696 static char_u *regconcat __ARGS((int *flagp)); |
688 static char_u *regpiece __ARGS((int *)); | 697 static char_u *regpiece __ARGS((int *)); |
1720 } | 1729 } |
1721 *flagp &= ~HASWIDTH; | 1730 *flagp &= ~HASWIDTH; |
1722 break; | 1731 break; |
1723 } | 1732 } |
1724 | 1733 |
1734 case 'd': /* %d123 decimal */ | |
1735 case 'o': /* %o123 octal */ | |
1736 case 'x': /* %xab hex 2 */ | |
1737 case 'u': /* %uabcd hex 4 */ | |
1738 case 'U': /* %U1234abcd hex 8 */ | |
1739 { | |
1740 int i; | |
1741 | |
1742 switch (c) | |
1743 { | |
1744 case 'd': i = getdecchrs(); break; | |
1745 case 'o': i = getoctchrs(); break; | |
1746 case 'x': i = gethexchrs(2); break; | |
1747 case 'u': i = gethexchrs(4); break; | |
1748 case 'U': i = gethexchrs(8); break; | |
1749 default: i = -1; break; | |
1750 } | |
1751 | |
1752 if (i < 0) | |
1753 EMSG_M_RET_NULL( | |
1754 _("E678: Invalid character after %s%%[dxouU]"), | |
1755 reg_magic == MAGIC_ALL); | |
1756 ret = regnode(EXACTLY); | |
1757 if (i == 0) | |
1758 regc(0x0a); | |
1759 else | |
1760 #ifdef FEAT_MBYTE | |
1761 regmbc(i); | |
1762 #else | |
1763 regc(i); | |
1764 #endif | |
1765 regc(NUL); | |
1766 *flagp |= HASWIDTH; | |
1767 break; | |
1768 } | |
1769 | |
1725 default: | 1770 default: |
1726 if (VIM_ISDIGIT(c) || c == '<' || c == '>') | 1771 if (VIM_ISDIGIT(c) || c == '<' || c == '>') |
1727 { | 1772 { |
1728 long_u n = 0; | 1773 long_u n = 0; |
1729 int cmp; | 1774 int cmp; |
1814 if (has_mbyte) | 1859 if (has_mbyte) |
1815 endc = mb_ptr2char_adv(&regparse); | 1860 endc = mb_ptr2char_adv(&regparse); |
1816 else | 1861 else |
1817 #endif | 1862 #endif |
1818 endc = *regparse++; | 1863 endc = *regparse++; |
1864 | |
1865 /* Handle \o40, \x20 and \u20AC style sequences */ | |
1866 if (endc == '\\' && !cpo_lit) | |
1867 endc = coll_get_char(); | |
1868 | |
1819 if (startc > endc) | 1869 if (startc > endc) |
1820 EMSG_RET_NULL(_(e_invrange)); | 1870 EMSG_RET_NULL(_(e_invrange)); |
1821 #ifdef FEAT_MBYTE | 1871 #ifdef FEAT_MBYTE |
1822 if (has_mbyte && ((*mb_char2len)(startc) > 1 | 1872 if (has_mbyte && ((*mb_char2len)(startc) > 1 |
1823 || (*mb_char2len)(endc) > 1)) | 1873 || (*mb_char2len)(endc) > 1)) |
1872 /* else: must have had a \n already */ | 1922 /* else: must have had a \n already */ |
1873 } | 1923 } |
1874 *flagp |= HASNL; | 1924 *flagp |= HASNL; |
1875 regparse++; | 1925 regparse++; |
1876 startc = -1; | 1926 startc = -1; |
1927 } | |
1928 else if (*regparse == 'd' | |
1929 || *regparse == 'o' | |
1930 || *regparse == 'x' | |
1931 || *regparse == 'u' | |
1932 || *regparse == 'U') | |
1933 { | |
1934 startc = coll_get_char(); | |
1935 if (startc == 0) | |
1936 regc(0x0a); | |
1937 else | |
1938 #ifdef FEAT_MBYTE | |
1939 regmbc(startc); | |
1940 #else | |
1941 regc(startc); | |
1942 #endif | |
1877 } | 1943 } |
1878 else | 1944 else |
1879 { | 1945 { |
1880 startc = backslash_trans(*regparse++); | 1946 startc = backslash_trans(*regparse++); |
1881 regc(startc); | 1947 regc(startc); |
2515 * getchr(). */ | 2581 * getchr(). */ |
2516 regparse -= prevchr_len; | 2582 regparse -= prevchr_len; |
2517 } | 2583 } |
2518 | 2584 |
2519 /* | 2585 /* |
2586 * get and return the value of the hex string immediately after the current | |
2587 * position. Return -1 for invalid, or the character code for valid. Position is updated: | |
2588 * blahblah\%x20asdf | |
2589 * before-^ ^-after | |
2590 * The parameter controls the maximum number of input characters. This will be | |
2591 * 2 when reading a \%x20 sequence, 4 for \%u20AC and 8 for \%U12345678. | |
2592 */ | |
2593 static int | |
2594 gethexchrs(maxinputlen) | |
2595 int maxinputlen; | |
2596 { | |
2597 int nr = 0; | |
2598 int c; | |
2599 int i; | |
2600 | |
2601 for (i = 0; i < maxinputlen; ++i) | |
2602 { | |
2603 c = regparse[0]; | |
2604 if (!vim_isxdigit(c)) | |
2605 break; | |
2606 nr <<= 4; | |
2607 nr |= hex2nr(c); | |
2608 ++regparse; | |
2609 } | |
2610 | |
2611 if (i == 0) | |
2612 return -1; | |
2613 return nr; | |
2614 } | |
2615 | |
2616 /* | |
2617 * get and return the value of the decimal string immediately after the | |
2618 * current position. Return -1 for invalid. Consumes all digits. | |
2619 */ | |
2620 static int | |
2621 getdecchrs() | |
2622 { | |
2623 int nr = 0; | |
2624 int c; | |
2625 int i; | |
2626 | |
2627 for (i = 0; ; ++i) | |
2628 { | |
2629 c = regparse[0]; | |
2630 if (c < '0' || c > '9') | |
2631 break; | |
2632 nr *= 10; | |
2633 nr += c - '0'; | |
2634 ++regparse; | |
2635 } | |
2636 | |
2637 if (i == 0) | |
2638 return -1; | |
2639 return nr; | |
2640 } | |
2641 | |
2642 /* | |
2643 * get and return the value of the octal string immediately after the current | |
2644 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle | |
2645 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't | |
2646 * treat 8 or 9 as recognised characters. Position is updated: | |
2647 * blahblah\%o210asdf | |
2648 * before-^ ^-after | |
2649 */ | |
2650 static int | |
2651 getoctchrs() | |
2652 { | |
2653 int nr = 0; | |
2654 int c; | |
2655 int i; | |
2656 | |
2657 for (i = 0; i < 3 && nr < 040; ++i) | |
2658 { | |
2659 c = regparse[0]; | |
2660 if (c < '0' || c > '7') | |
2661 break; | |
2662 nr <<= 3; | |
2663 nr |= hex2nr(c); | |
2664 ++regparse; | |
2665 } | |
2666 | |
2667 if (i == 0) | |
2668 return -1; | |
2669 return nr; | |
2670 } | |
2671 | |
2672 /* | |
2673 * Get a number after a backslash that is inside []. | |
2674 * When nothing is recognized return a backslash. | |
2675 */ | |
2676 static int | |
2677 coll_get_char() | |
2678 { | |
2679 int nr = -1; | |
2680 | |
2681 switch (*regparse++) | |
2682 { | |
2683 case 'd': nr = getdecchrs(); break; | |
2684 case 'o': nr = getoctchrs(); break; | |
2685 case 'x': nr = gethexchrs(2); break; | |
2686 case 'u': nr = gethexchrs(4); break; | |
2687 case 'U': nr = gethexchrs(8); break; | |
2688 } | |
2689 if (nr < 0) | |
2690 { | |
2691 /* If getting the number fails be backwards compatible: the character | |
2692 * is a backslash. */ | |
2693 --regparse; | |
2694 nr = '\\'; | |
2695 } | |
2696 return nr; | |
2697 } | |
2698 | |
2699 /* | |
2520 * read_limits - Read two integers to be taken as a minimum and maximum. | 2700 * read_limits - Read two integers to be taken as a minimum and maximum. |
2521 * If the first character is '-', then the range is reversed. | 2701 * If the first character is '-', then the range is reversed. |
2522 * Should end with 'end'. If minval is missing, zero is default, if maxval is | 2702 * Should end with 'end'. If minval is missing, zero is default, if maxval is |
2523 * missing, a very big number is the default. | 2703 * missing, a very big number is the default. |
2524 */ | 2704 */ |