comparison src/regexp.c @ 24:8ff7fd162d3c v7.0016

updated for version 7.0016
author vimboss
date Mon, 13 Sep 2004 20:26:32 +0000
parents 3fc0f57ecb91
children ac33b7c03fac
comparison
equal deleted inserted replaced
23:3f44e9abe4ec 24:8ff7fd162d3c
31 * 31 *
32 * Beware that some of this code is subtly aware of the way operator 32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in 33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink. 34 * regular-expression syntax might require a total rethink.
35 * 35 *
36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb 36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * and Bram Moolenaar. 37 * Webb, Ciaran McCreesh and Bram Moolenaar.
38 * Named character class support added by Walter Briscoe (1998 Jul 01) 38 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */ 39 */
40 40
41 #include "vim.h" 41 #include "vim.h"
42 42
374 * \n - New line (NL). 374 * \n - New line (NL).
375 * \r - Carriage Return (CR). 375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB). 376 * \t - Tab (TAB).
377 * \e - Escape (ESC). 377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H). 378 * \b - Backspace (Ctrl_H).
379 * \d - Character code in decimal, eg \d123
380 * \o - Character code in octal, eg \o40
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
379 */ 384 */
380 static char_u REGEXP_INRANGE[] = "]^-n\\"; 385 static char_u REGEXP_INRANGE[] = "]^-n\\";
381 static char_u REGEXP_ABBR[] = "nrteb"; 386 static char_u REGEXP_ABBR[] = "nrtebdoxuU";
382 387
383 static int backslash_trans __ARGS((int c)); 388 static int backslash_trans __ARGS((int c));
384 static int skip_class_name __ARGS((char_u **pp)); 389 static int skip_class_name __ARGS((char_u **pp));
385 static char_u *skip_anyof __ARGS((char_u *p)); 390 static char_u *skip_anyof __ARGS((char_u *p));
386 static void init_class_tab __ARGS((void)); 391 static void init_class_tab __ARGS((void));
679 static int getchr __ARGS((void)); 684 static int getchr __ARGS((void));
680 static void skipchr_keepstart __ARGS((void)); 685 static void skipchr_keepstart __ARGS((void));
681 static int peekchr __ARGS((void)); 686 static int peekchr __ARGS((void));
682 static void skipchr __ARGS((void)); 687 static void skipchr __ARGS((void));
683 static void ungetchr __ARGS((void)); 688 static void ungetchr __ARGS((void));
689 static int gethexchrs __ARGS((int maxinputlen));
690 static int getoctchrs __ARGS((void));
691 static int getdecchrs __ARGS((void));
692 static int coll_get_char __ARGS((void));
684 static void regcomp_start __ARGS((char_u *expr, int flags)); 693 static void regcomp_start __ARGS((char_u *expr, int flags));
685 static char_u *reg __ARGS((int, int *)); 694 static char_u *reg __ARGS((int, int *));
686 static char_u *regbranch __ARGS((int *flagp)); 695 static char_u *regbranch __ARGS((int *flagp));
687 static char_u *regconcat __ARGS((int *flagp)); 696 static char_u *regconcat __ARGS((int *flagp));
688 static char_u *regpiece __ARGS((int *)); 697 static char_u *regpiece __ARGS((int *));
1720 } 1729 }
1721 *flagp &= ~HASWIDTH; 1730 *flagp &= ~HASWIDTH;
1722 break; 1731 break;
1723 } 1732 }
1724 1733
1734 case 'd': /* %d123 decimal */
1735 case 'o': /* %o123 octal */
1736 case 'x': /* %xab hex 2 */
1737 case 'u': /* %uabcd hex 4 */
1738 case 'U': /* %U1234abcd hex 8 */
1739 {
1740 int i;
1741
1742 switch (c)
1743 {
1744 case 'd': i = getdecchrs(); break;
1745 case 'o': i = getoctchrs(); break;
1746 case 'x': i = gethexchrs(2); break;
1747 case 'u': i = gethexchrs(4); break;
1748 case 'U': i = gethexchrs(8); break;
1749 default: i = -1; break;
1750 }
1751
1752 if (i < 0)
1753 EMSG_M_RET_NULL(
1754 _("E678: Invalid character after %s%%[dxouU]"),
1755 reg_magic == MAGIC_ALL);
1756 ret = regnode(EXACTLY);
1757 if (i == 0)
1758 regc(0x0a);
1759 else
1760 #ifdef FEAT_MBYTE
1761 regmbc(i);
1762 #else
1763 regc(i);
1764 #endif
1765 regc(NUL);
1766 *flagp |= HASWIDTH;
1767 break;
1768 }
1769
1725 default: 1770 default:
1726 if (VIM_ISDIGIT(c) || c == '<' || c == '>') 1771 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1727 { 1772 {
1728 long_u n = 0; 1773 long_u n = 0;
1729 int cmp; 1774 int cmp;
1814 if (has_mbyte) 1859 if (has_mbyte)
1815 endc = mb_ptr2char_adv(&regparse); 1860 endc = mb_ptr2char_adv(&regparse);
1816 else 1861 else
1817 #endif 1862 #endif
1818 endc = *regparse++; 1863 endc = *regparse++;
1864
1865 /* Handle \o40, \x20 and \u20AC style sequences */
1866 if (endc == '\\' && !cpo_lit)
1867 endc = coll_get_char();
1868
1819 if (startc > endc) 1869 if (startc > endc)
1820 EMSG_RET_NULL(_(e_invrange)); 1870 EMSG_RET_NULL(_(e_invrange));
1821 #ifdef FEAT_MBYTE 1871 #ifdef FEAT_MBYTE
1822 if (has_mbyte && ((*mb_char2len)(startc) > 1 1872 if (has_mbyte && ((*mb_char2len)(startc) > 1
1823 || (*mb_char2len)(endc) > 1)) 1873 || (*mb_char2len)(endc) > 1))
1872 /* else: must have had a \n already */ 1922 /* else: must have had a \n already */
1873 } 1923 }
1874 *flagp |= HASNL; 1924 *flagp |= HASNL;
1875 regparse++; 1925 regparse++;
1876 startc = -1; 1926 startc = -1;
1927 }
1928 else if (*regparse == 'd'
1929 || *regparse == 'o'
1930 || *regparse == 'x'
1931 || *regparse == 'u'
1932 || *regparse == 'U')
1933 {
1934 startc = coll_get_char();
1935 if (startc == 0)
1936 regc(0x0a);
1937 else
1938 #ifdef FEAT_MBYTE
1939 regmbc(startc);
1940 #else
1941 regc(startc);
1942 #endif
1877 } 1943 }
1878 else 1944 else
1879 { 1945 {
1880 startc = backslash_trans(*regparse++); 1946 startc = backslash_trans(*regparse++);
1881 regc(startc); 1947 regc(startc);
2515 * getchr(). */ 2581 * getchr(). */
2516 regparse -= prevchr_len; 2582 regparse -= prevchr_len;
2517 } 2583 }
2518 2584
2519 /* 2585 /*
2586 * get and return the value of the hex string immediately after the current
2587 * position. Return -1 for invalid, or the parsed value for valid (this can exceed 255 when more than 2 digits are allowed). Position is updated:
2588 * blahblah\%x20asdf
2589 * before-^ ^-after
2590 * The parameter controls the maximum number of input characters. This will be
2591 * 2 when reading a \%x20 sequence, 4 for a \%u20AC sequence and 8 for a \%U1234abcd sequence.
2592 */
2593 static int
2594 gethexchrs(maxinputlen)
2595 int maxinputlen;
2596 {
2597 int nr = 0;
2598 int c;
2599 int i;
2600
2601 for (i = 0; i < maxinputlen; ++i)
2602 {
2603 c = regparse[0];
2604 if (!vim_isxdigit(c))
2605 break;
2606 nr <<= 4;
2607 nr |= hex2nr(c);
2608 ++regparse;
2609 }
2610
2611 if (i == 0)
2612 return -1;
2613 return nr;
2614 }
2615
2616 /*
2617 * get and return the value of the decimal string immediately after the
2618 * current position. Return -1 for invalid. Consumes all digits.
2619 */
2620 static int
2621 getdecchrs()
2622 {
2623 int nr = 0;
2624 int c;
2625 int i;
2626
2627 for (i = 0; ; ++i)
2628 {
2629 c = regparse[0];
2630 if (c < '0' || c > '9')
2631 break;
2632 nr *= 10;
2633 nr += c - '0';
2634 ++regparse;
2635 }
2636
2637 if (i == 0)
2638 return -1;
2639 return nr;
2640 }
2641
2642 /*
2643 * get and return the value of the octal string immediately after the current
2644 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2645 * numbers > 377 correctly (for example, 400 is read as 40, leaving the final 0 unconsumed) and doesn't
2646 * treat 8 or 9 as recognised characters. Position is updated:
2647 * blahblah\%o210asdf
2648 * before-^ ^-after
2649 */
2650 static int
2651 getoctchrs()
2652 {
2653 int nr = 0;
2654 int c;
2655 int i;
2656
2657 for (i = 0; i < 3 && nr < 040; ++i)
2658 {
2659 c = regparse[0];
2660 if (c < '0' || c > '7')
2661 break;
2662 nr <<= 3;
2663 nr |= hex2nr(c);
2664 ++regparse;
2665 }
2666
2667 if (i == 0)
2668 return -1;
2669 return nr;
2670 }
2671
2672 /*
2673 * Get a number after a backslash that is inside [].
2674 * When nothing is recognized return a backslash.
2675 */
2676 static int
2677 coll_get_char()
2678 {
2679 int nr = -1;
2680
2681 switch (*regparse++)
2682 {
2683 case 'd': nr = getdecchrs(); break;
2684 case 'o': nr = getoctchrs(); break;
2685 case 'x': nr = gethexchrs(2); break;
2686 case 'u': nr = gethexchrs(4); break;
2687 case 'U': nr = gethexchrs(8); break;
2688 }
2689 if (nr < 0)
2690 {
2691 /* If getting the number fails be backwards compatible: the character
2692 * is a backslash. */
2693 --regparse;
2694 nr = '\\';
2695 }
2696 return nr;
2697 }
2698
2699 /*
2520 * read_limits - Read two integers to be taken as a minimum and maximum. 2700 * read_limits - Read two integers to be taken as a minimum and maximum.
2521 * If the first character is '-', then the range is reversed. 2701 * If the first character is '-', then the range is reversed.
2522 * Should end with 'end'. If minval is missing, zero is default, if maxval is 2702 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2523 * missing, a very big number is the default. 2703 * missing, a very big number is the default.
2524 */ 2704 */