src/lib/regex/front/perl-regex-parser-g.pkg

15.4.864 src/lib/regex/front/perl-regex-parser-g.pkg

# perl-regex-parser-g.pkg
#
#   This module implements a subset of Perl regular expression syntax.
#
#   WARNING: there is no locale support!
#
#   The meta characters are:
#     "\" "^" "$" "." "[" "]" "|" "(" ")" "*" "+" "?"
#
#           \   Quote the next metacharacter
#           ^   Match the beginning of the line
#           .   Match any character (except newline)
#           $   Match the end of the line (or before newline at the end)
#           |   Alternation
#           ()  Grouping
#           []  Character class
#
#
#       The following standard quantifiers are recognized:
#
#           *      Match 0 or more times
#           +      Match 1 or more times
#           ?      Match 1 or 0 times
#           { n }    Match exactly n times
#           { n,}   Match at least n times
#           { n, m }  Match at least n but not more than m times
#
#          \033        octal char (think of a PDP-11)
#           \x1B        hex char
#           \x { 263a }    wide hex char         (Unicode SMILEY)
#           \c[         control char
#           \N { name }    named char
#           \l          lowercase next char (think vi)
#           \u          uppercase next char (think vi)
#           \L          lowercase till \E (think vi)
#           \U          uppercase till \E (think vi)
#           \E          end case modification (think vi)
#           \Q          quote (disable) pattern metacharacters till \E
#
#           \pP Match P, named property.  Use \p { Prop } for longer names.
#           \PP Match non-P
#           \C  Match a single C char (octet) even under utf8.
#
#               Perl defines the following zero-width assertions:
#
#           \b  Match a word boundary
#           \B  Match a non-(word boundary)
#           \A  Match only at beginning of string
#           \Z  Match only at end of string, or before newline at the end
#           \z  Match only at end of string

# Compiled by:
#     src/lib/std/standard.lib

# Invoked by:
#     src/lib/regex/front/perl-regex-parser.pkg

generic package perl_regex_parser_g (
    r:  Abstract_Regular_Expression
)
: (weak)
api {
    include api Generalized_Regular_Expression_Parser
                where
                   r == r;

   #  With user supplied error handler
   scan':  ((X, String) -> Null_Or ((r::Abstract_Regular_Expression, X)) )
        -> number_string::Reader (Char, X)
        -> number_string::Reader (r::Abstract_Regular_Expression, X);
}
=
package {
    package r = r;

    package s
        =
        generic_regular_expression_syntax_g (

            package r = r;
            package s = r::char_set;

            fun char c
                =
                r::char::from_int (char::to_int c);

            fun make_set s
                =
                s::add_list (s::empty, map char (string::explode s));

            dangling_modifiers = FALSE;         #  Don't allow things like /?/

            Escape
                = CHAR         r::char::Char
                | MATCH_SET    s::Set
                | NONMATCH_SET s::Set
                | REGEXP       r::Abstract_Regular_Expression
                | CHARCODE     number_string::Radix
                | CTRL
                | BACKREF     ((String -> String), Int)
                | ERROR       String
                ;

            Context =  IN_CHARSET | IN_REGEXP;

            #           \t          tab                   (HT, TAB)
            #           \n          newline               (LF, NL)
            #           \r          return                (CR)
            #           \f          form feed             (FF)
            #           \a          alarm (bell)          (BEL)
            #           \e          escape (think troff)  (ESC)

            tab = CHAR (char '\t');
            nl  = CHAR (char '\n');
            cr  = CHAR (char '\r');
            ff  = CHAR (char '\f');
            bel = CHAR (char '\a');
            esc = CHAR (char '\x1b'); #  NOTE: SML uses decimal, but Mythryl follows C in using octal for \ddd

            #  IMPORTANT NOTE: perl's . also matches \0
            dot = r::NONMATCH_SET (make_set "\n");

            # What perl means:

            fun is_word c
                =
                r::char::is_alphanumeric c
                or
                r::char::to_int c == 95;

            #
            #      \w  Match a "word" character (alphanumeric plus "_")
            #      \W  Match a non-word character
            #      \s  Match a whitespace character
            #      \S  Match a non-whitespace character
            #      \d  Match a digit character
            #      \D  Match a non-digit character

            fun every p
                =
                {   fun iter (i, s)
                        =
                        {   s =   if (p i)   i ! s;
                                  else           s;
                                  fi;

                            if   (r::char::(>=) (i, r::char::max_char))

                                 s;
                            else
                                 iter (r::char::next i, s);
                            fi;
                        };

                    s::add_list (   s::empty,
                                   iter (r::char::min_char, [])
                               );
                };

            words       = r::add_range (s::empty, char 'a', char 'z');
            words       = r::add_range (words, char 'A', char 'Z');
            words       = r::add_range (words, char '0', char '9');
            words       = s::add (words, char '_');
            word        = MATCH_SET words;
            nonword     = NONMATCH_SET words;

            spaces      = every r::char::is_space;
            space       = MATCH_SET spaces;
            nonspace    = NONMATCH_SET spaces;

            digits      = r::add_range (s::empty, char '0', char '9');
            digit       = MATCH_SET digits;
            nondigit    = NONMATCH_SET digits;

            hex         = CHARCODE number_string::HEX;
            oct         = CHARCODE number_string::OCTAL;

            # Perl defines the following zero-width assertions:
            #
            #      \b  Match a word boundary
            #      \B  Match a non-(word boundary)
            #      \A  Match only at beginning of string
            #      \Z  Match only at end of string, or before newline at the end
            #      \z  Match only at end of string

                       #  word at begining of string

            fun is_word_boundary { prev=>NULL,  this=>THE c, next } =>  is_word c;                      #  word at end of string
                is_word_boundary { prev=>THE c, this=>NULL,  next } =>  is_word c;
                is_word_boundary { prev=>NULL,  this=>NULL,  next } =>  FALSE;                          #  empty string
                is_word_boundary { prev=>THE x, this=>THE y, next }
                    =>
                    if (is_word x)   not (is_word y);
                    else                 (is_word y);
                    fi;
            end;

            fun is_start_of_string { prev=>NULL, this, next } =>  TRUE;
                is_start_of_string _                          =>  FALSE;
            end;

            fun is_end_of_string { this=>NULL, prev, next } =>  TRUE;
                is_end_of_string _                          =>  FALSE;
            end;

            fun is_end_of_string'{ this=>NULL , prev, next      } =>  TRUE;
                is_end_of_string'{ this=>THE c, next=>NULL, ... } =>  r::char::to_int c == 10;
                is_end_of_string' _ => FALSE;
            end;

            word_b    = REGEXP (r::BOUNDARY is_word_boundary);
            nonword_b = REGEXP (r::BOUNDARY (not o is_word_boundary));
            begin_b   = REGEXP (r::BOUNDARY is_start_of_string);
            end_b     = REGEXP (r::BOUNDARY is_end_of_string);
            end_b'    = REGEXP (r::BOUNDARY is_end_of_string');

            Callbackdata = Void;

            # Handle quoting of
            #   \Q ... \E
            #   \U ... \E      to upper case
            #   \L ... \E      to lower case
            # All meta characters are treated as normal within these.
            # I think the semantics here is the same as perl's.
            #
            fun quote transform context getc s
                =
                {   fun loop (s, chars)
                        =
                        case (getc s)

                             NULL
                                 =>
                                 THE (ERROR "missing \\E", s);

                             THE (c as '\\', s)
                                 =>
                                 case (getc s)

                                      THE ('E', s)
                                          =>
                                          done (chars, s);

                                      _   =>
                                          loop (s, char (transform c) ! chars);
                                 esac;

                             THE (c, s)
                                 =>
                                 loop (s, char (transform c) ! chars);
                        esac

                    also
                    fun done (chars, s)
                        =
                        case (context, chars)

                             (_, [c])         =>   THE (CHAR c, s);
                             (IN_REGEXP,  cs) =>   THE (REGEXP (r::CONCAT (reverse (map r::CHAR cs))), s);
                             (IN_CHARSET, cs) =>   THE (MATCH_SET (s::add_list (s::empty, cs)), s);
                        esac;

                    loop (s, []);
                };

            #  Callback for escape sequences:
            #
            fun escape data context getc s
                =
                case (getc s, context)
                    #
                    (NULL, _) => NULL;

                    # Simple escapes:
                    #
                    (THE('t', s), _) => THE (tab, s);
                    (THE('n', s), _) => THE (nl, s);
                    (THE('r', s), _) => THE (cr, s);
                    (THE('f', s), _) => THE (ff, s);
                    (THE('a', s), _) => THE (bel, s);
                    (THE('e', s), _) => THE (esc, s);

                    # Character codes; no unicode support yet!
                    #
                    (THE('0', s), _) => THE (oct, s);
                    (THE('x', s), _) => THE (hex, s);
                    (THE('c', s), _) => THE (CTRL, s);

                    #  Not yet supported
                    (THE('N', s), _) => THE (ERROR "named character",     s);
                    (THE('l', s), _) => THE (ERROR "lowercase next char", s);
                    (THE('u', s), _) => THE (ERROR "uppercase next char", s);

                    # Character set abbreviations:
                    #
                    (THE('w', s), _) => THE (word, s);
                    (THE('W', s), _) => THE (nonword, s);
                    (THE('s', s), _) => THE (space, s);
                    (THE('S', s), _) => THE (nonspace, s);
                    (THE('d', s), _) => THE (digit, s);
                    (THE('D', s), _) => THE (nondigit, s);

                    # Quoting:
                    #
                    (THE('Q', s), _) => quote (\\ x=x) context getc s;
                    (THE('L', s), _) => quote char::to_lower context getc s;
                    (THE('U', s), _) => quote char::to_upper context getc s;

                    # Boundary operators; these cannot appear in a char set:
                    #
                    (THE('b', s), IN_REGEXP) => THE (word_b, s);
                    (THE('B', s), IN_REGEXP) => THE (nonword_b, s);
                    (THE('A', s), IN_REGEXP) => THE (begin_b, s);
                    (THE('Z', s), IN_REGEXP) => THE (end_b', s);
                    (THE('z', s), IN_REGEXP) => THE (end_b, s);

                    # Property
                 /* What are these?
                  | (THE('p', s), _) => (THE PROPERTY, s)
                  | (THE('P', s), _) => (THE NONPROPERTY, s)
                  */

                    (THE (c, s'), _)
                        =>
                        if (char::is_digit c and context == IN_REGEXP)

                            # It is a back reference.

                            # BUG: int::scan is too greedy.
                            # How to handle things like \11 where  XXX BUGGO FIXME
                            #    is it \1 and the character 1?

                            case (int::scan number_string::DECIMAL getc s)
                                #
                                THE (i, s) =>  THE (BACKREF (\\ x = x, i), s);
                                NULL       =>  THE (ERROR "back reference error", s);
                            esac;

                        else
                            #  By default just treat the character literally
                            THE (CHAR (char c), s');
                        fi;
                esac;
    );  # package s

    fun scan' err getc
        =
        s::scan { data=>(), backslash=> '\\', error=>err } getc;

    fun scan getc
         =
         scan' (\\ _ =  raise exception r::CANNOT_PARSE) getc;
};

Comments and suggestions to: bugs@mythryl.org