PreviousUpNext

15.4.858  src/lib/regex/backend/perl-regex-engine-g.pkg

## perl-regex-engine-g.pkg
#
# Implements a perl-like regular expression matcher.
# This module is based on backtracking search.
#
# TODO:
# 1. Compile suitable subexpressions into DFA
# 2. Lookahead optimizations when scanning for substring match.
#        --- Allen Leung Leung
#            (leunga@{ cs.nyu.edu, dorsai.org } )

# Compiled by:
#     src/lib/std/standard.lib


###                 "There is a great satisfaction in building
###                  good tools for other people to use."
###
###                                      -- Freeman Dyson



stipulate

   # Set this to TRUE if you need a representation that is thread-safe; 
   # When this is TRUE the backreferences table will be allocated anew
   # every time a new match is performed.

   thread_safe = TRUE;

herein

    generic package perl_regex_engine_g (
        #
        r:  Char_Abstract_Regular_Expression                                    # Char_Abstract_Regular_Expression      is from   src/lib/regex/front/abstract-regular-expression.api
    )                                                                           # abstract_regular_expression           is from   src/lib/regex/front/abstract-regular-expression.pkg
    : (weak)
    Perl_Regular_Expression_Engine                                              # Perl_Regular_Expression_Engine        is from   src/lib/regex/backend/perl-regex-engine.api
    {

        package r =  r;
        package s =  r;
#       package d =  dfa_engine;                                                # dfa_engine                            is from   src/lib/regex/backend/dfa-engine.pkg
        package m =  regex_match_result;                                        # regex_match_result                    is from   src/lib/regex/glue/regex-match-result.pkg
        package v =  rw_vector;                                                 # rw_vector                             is from   src/lib/std/src/rw-vector.pkg

        Compiled_Regular_Expression
            =
            COMPILED_REGULAR_EXPRESSION {
                backref_var_count:  Int,                                        # Number of backreference variables. 
                references:         v::Rw_Vector( String ),                     # References table.
                regexp:             s::Abstract_Regular_Expression,
                multiline:          Bool,                                       # ^ and $ match '\n' iff this is TRUE.
                begin_only:         Bool,                                       # Can only match at beginning of line? 
                lookahead:  Null_Or(  s::Abstract_Regular_Expression )          # A simple lookahead test
            };


        exception BACKTRACK;
        exception BUG;


        fun multiline (COMPILED_REGULAR_EXPRESSION { backref_var_count, references, regexp, begin_only, lookahead, ... } )
            = 
            COMPILED_REGULAR_EXPRESSION { backref_var_count, references, regexp, begin_only, lookahead, multiline=>TRUE };


        fun singleline (COMPILED_REGULAR_EXPRESSION { backref_var_count, references, regexp, begin_only, 
                             lookahead, ... } )
            = 
            COMPILED_REGULAR_EXPRESSION { backref_var_count, references, regexp, begin_only, lookahead, multiline=>FALSE };


        # It would be worth looking at the
        # set of optimizations implemented
        # by the Perl5 regex optimizaer for
        # ideas here -- it is a mature
        # implementation with lots of good
        # ideas.  Note that some optimizations
        # such as star-sequence collapse are
        # already implemented in
        #     src/lib/regex/front/generic-regular-expression-syntax-g.pkg
        # so there is no point replicating that
        # effort here:
        #
        fun optimize re
            =
            re;         #  Not implemented yet.



        # Analyse a regular expression
        # for three properties:
        #
        #   1) Will it match only at the beginning of a string?
        #   2) "must be empty"?
        #   3) Length needed for backref vector.
        #      This is one more than max backref
        #      variable used:  \1 \2 \3 ...
        #
        fun collect_info re
            =
            {   (analyse_regex re) ->  (begin_only, is_empty);

                (begin_only, is_empty, *n+1);
            }   
            where
                n = REF -1;

                fun track_max_backref v
                    =
                    if (*n <  v)
                         n := v;
                    fi;


                # regex -> (begin_only, is_empty):
                #
                fun analyse_regex (s::CHAR         _) =>  (FALSE, FALSE);
                    analyse_regex (s::MATCH_SET    _) =>  (FALSE, FALSE);
                    analyse_regex (s::NONMATCH_SET _) =>  (FALSE, FALSE);

                    analyse_regex (s::PLUS     re) =>   analyse_regex re;
                    analyse_regex (s::STAR     re) =>   analyse_regex re;
                    analyse_regex (s::OPTION   re) =>   analyse_regex re;
                    analyse_regex (s::GROUP    re) =>   analyse_regex re;
                    analyse_regex (s::GUARD(_, re))=>   analyse_regex re;

                    analyse_regex (s::BEGIN) =>   (TRUE, TRUE);
                    analyse_regex (s::END  ) =>   (FALSE, TRUE);

                    analyse_regex (s::CONCAT es) =>   analyse_concat es;
                    analyse_regex (s::ALT    es) =>   analyse_alt es;

                    analyse_regex (s::INTERVAL (re, min, THE 0)) => (FALSE, TRUE);
                    analyse_regex (s::INTERVAL (re, min, max)) => analyse_regex re;

                    analyse_regex (s::ASSIGN   (v, _, re)) => { track_max_backref v;   analyse_regex re; };
                    analyse_regex (s::BACK_REF (_, v))     => { track_max_backref v;   (FALSE, FALSE); };
                    analyse_regex (s::BOUNDARY _)          => (FALSE, TRUE);
                end

                # Analyse a concatenation of patterns:
                #
                also
                fun analyse_concat [] => (FALSE, TRUE);

                    analyse_concat (e ! es)
                        =>
                        {   (analyse_regex  e ) ->  (begin_only,  is_empty );
                            (analyse_concat es) ->  (begin_only', is_empty');

                            ( begin_only or is_empty and begin_only',
                              is_empty and is_empty'
                            );
                        };
                end

                # Analyse an alternation of patterns:
                #
                also
                fun analyse_alt []  =>  (FALSE, FALSE);          #  Can never match anything 

                    analyse_alt [e] =>  analyse_regex e;

                    analyse_alt (e ! es)
                        =>
                        {   (analyse_regex e ) -> (begin_only , is_empty );
                            (analyse_alt   es) -> (begin_only', is_empty');

                            ( begin_only and begin_only', 
                              is_empty   and is_empty'
                            );
                        };
                end;
            end;                        # where

        fun compile regexp
            =
            {   (collect_info regexp)
                    ->
                    (begin_only, is_empty, backref_var_count);

                COMPILED_REGULAR_EXPRESSION { backref_var_count,
                        begin_only,
                        regexp,
                        multiline  =>  FALSE,
                        lookahead  =>  NULL,
                        references =>  thread_safe  ??   v::make_rw_vector (0,                 "")
                                                    ::   v::make_rw_vector (backref_var_count, "")
                      };
            };

        fun scan (COMPILED_REGULAR_EXPRESSION { regexp, multiline, ... }, references, getc, start_pos, stream, last)
            =
            {
                fun lesseq (i, NULL   ) =>   TRUE;
                    lesseq (i, THE max) =>   i <= max;
                end;

                fun less   (i, NULL   ) =>   TRUE;
                    less   (i, THE max) =>   i < max;
                end;

                # Get n characters from the stream.
                #
                # There must be at least n characters
                # in the stream:
                #
                fun get_n_chars (n, s)
                    = 
                    {   fun f (0, s)
                                =>
                                [];

                            f (n, s)
                                =>
                                case (getc s)
                                  
                                     THE (c, s) =>   c ! f (n - 1, s);
                                     NULL       =>   raise exception BUG;
                                esac;
                        end;

                        string::implode (f (n, s));
                    };


                # Get and set back-references:
                #
                fun get_backref  v     =  v::get (references, v);
                fun set_backref (v, s) =  v::set (references, v, s);


                # At the beginning of the line? 
                #
                fun at_start (0, last         ) =>   TRUE;
                    at_start (_, NULL         ) =>   TRUE;
                    at_start (_, THE('\n', _) ) =>   multiline;
                    at_start (_, _            ) =>   FALSE;
                end;

                # This function constructs an
                # empty match, used when the
                # appropriate alternative is
                # not followed at all:
                #
                fun empty_alternative (s::GROUP e ) =>   [m::REGEX_MATCH_RESULT (NULL, empty_alternative e)];

                    empty_alternative (s::ALT l   ) =>   list::cat (map empty_alternative l);
                    empty_alternative (s::CONCAT l) =>   list::cat (map empty_alternative l);

                    empty_alternative (s::INTERVAL (e, _, _)) =>   empty_alternative e;
                    empty_alternative (s::OPTION e)           =>   empty_alternative e;
                    empty_alternative (s::STAR   e)           =>   empty_alternative e;
                    empty_alternative (s::PLUS   e)           =>   empty_alternative e;

                    empty_alternative (s::ASSIGN (v, _, e))
                        =>
                        {   set_backref (v, "");                 #  reset reference 
                            empty_alternative e;
                        };

                    empty_alternative (s::GUARD(_, e))
                        =>
                        empty_alternative e;

                    empty_alternative _ => [];
                end;

                ################################################################################
                #                  Overview of Match Engine Core
                #
                # Following is the core of the Perl5-compatible
                # regex engine, consisting of two functions:
                #
                #     match_regex 
                #     pop_stack_and_continue
                #
                # Together these implement a simple finite state
                # machine (fsm) which marches through the regular
                # expression matching process via a sequence
                # of tail-recursive calls.
                #
                # (Since the Mythryl compiler promises to implement
                # tail-recursive calls "properly", which is to say
                # without pushing anything on the stack, instead
                # doing just a jump with parameters, tail recursive
                # calls are a great way in Mythryl to do finite
                # state matchine transitions.  Don't do this in
                # C -- you'll overflow your stack.)
                #
                # These two functions use "fate-passing style"
                # which means essentially that we maintain our return
                # stack explicitly by hand as a datastructure rather
                # than depending on the language's implicit compiler-
                # implemented call stack.
                #
                # As coded, the parameters to   match_regex
                # are the state of our fsm and
                # the tail-recursive calls to   match_regex
                # are our state transitions.
                #
                # The  match_regex  args (and thus fsm state variables) are:
                #
                #     re              = The part of current regex subexpression not yet matched.
                #     pos             = Current integer position within the string we're matching against.
                #     this_char       = NULL or else the current char being matched, as a (char, rest-of-string) pair.
                #     last_char       = Last value of 'this_char', for doing boundary matches like \b.
                #     rest_of_string  = Remaining part of string being matched, as a stream of chars.
                #     matches_found   = List of regex matches already found, in reverse order.
                #     stack           = Return_Stack -- remaining work to do after current sub-regex match is done.
                #
                # Our regex-matching algorithm requires backtracking
                # at times;  we implement this by raising a BACKTRACK
                # exception and trapping it at the appropriate resumption
                # point. 
                #
                # Note that our Abstract_Regular_Expression
                # regex representation supports operations
                # not defined in the surface Perl5 syntax,
                # and that our regex engine here implements
                # them.
                #
                # In particular, in backrefs we support
                # application of an arbitrary user-defined
                # function for transforming the matched text
                # before matching (it might map to uppercase
                # or reverse the string, say), and we support
                # arbitrary user-defined GUARD predicates which
                # must match at a given point.
                #
                # Since the Perl5 surface syntax provides
                # no way to access these capabilities, they
                # are currently irrelevant to end-users.
                #
                #
                ################################################################################



                # Define the explicitly-maintained
                # "return stack" (fate) used
                # to remember where we are in outer
                # regex matching while processing
                # an inner nested regex match:

                Return_Stack(X)

                  = GROUP    ( List( s::Abstract_Regular_Expression ),          # Regular expressions to match.
                               X,                                               # String to match against.
                               Int,                                             # Position in string.
                               List( Regex_Match_Result(X) ),                   # List of matches found so far, most recent first.
                               Return_Stack(X)                                  # 
                             )

                  | ASSIGN   ( Int,                                             # Backref var, as int offset into vector.
                              (String -> String),                               # String transform, I think unused at present.
                               List( s::Abstract_Regular_Expression ),          # Regular expressions to match.
                               X,                                               # String to match against.
                               Int,                                             # Position in string.
                               List( Regex_Match_Result(X) ),                   # List of matches found so far, most recent first.
                               Return_Stack(X)
                             )

                  | GUARD    ((String -> Bool),                                 # Predicate to test matched substring.
                               List( s::Abstract_Regular_Expression ),          # Regular expressions to match.
                               X,                                               # String to match against.
                               Int,                                             # Position in string.
                               List( Regex_Match_Result(X) ),                   # List of matches found so far, most recent first.
                               Return_Stack(X)
                             )

                  | CONCAT   ( List( s::Abstract_Regular_Expression ),          # Regular expressions to match.
                               Return_Stack(X)
                             )

                  | REPEAT   ( s::Abstract_Regular_Expression,                  # Regex to match repeatedly.
                               Int,                                             # Value of 'pos' before last match of preceding.
                               Int,                                             # Minimum number of matches.
                               Null_Or( Int ),                                  # Maximum number of matches (NULL == infinity).
                               Int,                                             # Number of matches so far.
                               List( s::Abstract_Regular_Expression ),          # Regular expressions to match.
                               List( Regex_Match_Result(X) ),                   # List of matches found so far, most recent first.
                               Return_Stack(X)
                             )

                  | RETURN

                withtype Regex_Match_Result(X)
                         =
                         m::Regex_Match_Result ( Null_Or { match_position: X,  match_length: Int } )

                    also Buf(X)
                         =
                         Null_Or ((Char, X));



                # Here we define our core
                # finite state machine
                # transition function.
                #
                # The arguments define our
                # fsm state -- see comments above:
                #
                fun match_regex ([], pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We have successfully matched
                        # a regex subexpression, so pop
                        # the stack and continue with
                        # enclosing regex matches.
                        #
                        # (If 'stack' is empty, we're done.)
                        #
                        pop_stack_and_continue (pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::CHAR c ! re, pos, this_char as THE (c', rest_of_string'), last_char, rest_of_string, matches_found, stack)
                        =>
                        # Match a simple constant character c:
                        #
                        if (c == c')   match_regex (re, pos+1, NULL, this_char, rest_of_string', matches_found, stack);
                        else
                             raise exception BACKTRACK;
                        fi;


                    match_regex (s::MATCH_SET set ! re, pos, this_char as THE (c', rest_of_string'), last_char, rest_of_string, matches_found, stack)
                        =>
                        # Match this_char against a
                        # character set like [A-Za-z]:
                        #
                        if  (s::char_set::member (set, c'))   match_regex (re, pos+1, NULL, this_char, rest_of_string', matches_found, stack);
                        else 
                             raise exception BACKTRACK;
                        fi;


                    match_regex (s::NONMATCH_SET set ! re, pos, this_char as THE (c', rest_of_string'), last_char, rest_of_string, matches_found, stack)
                        =>
                        # Match this_char against a negated
                        # character set like [^A-Za-z]:
                        #
                        if (not (s::char_set::member (set, c')))   match_regex (re, pos+1, NULL, this_char, rest_of_string', matches_found, stack);
                        else 
                             raise exception BACKTRACK;
                        fi;


                    match_regex (s::GROUP x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # Match a parenthesized subexpression
                        # by pushing our current state on the
                        # stack and continuing:
                        #
                        match_regex ([x], pos, this_char, last_char, rest_of_string, [], GROUP (re, rest_of_string, pos, matches_found, stack));


                    match_regex (s::ALT [] ! _, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We've failed to match any of
                        # the alternatives in a
                        #     (foo|bar|zot)
                        # type construct, so it is time
                        # to backtrack:
                        # 
                        raise exception BACKTRACK;


                    match_regex (s::ALT [x] ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We've arrived at the last
                        # alternative in a
                        #     (foo|bar|zot)
                        # type construct.  The only
                        # difference between this and
                        # the next case is that we do
                        # not bother trapping BACKTRACK,
                        # since we have no remaining
                        # alternatives to try:
                        # 
                        match_regex (x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::ALT (x ! xs) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We've arrived at a non-final
                        # alternative in a
                        #     (foo|bar|zot)
                        # type construct.  We try matching
                        # the next untried alternative;
                        # if it is forced to BACKTRACK,
                        # we iterate to the next alternative:
                        # 
                        match_regex (x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        except
                            BACKTRACK
                                =
                                match_regex (s::ALT xs ! re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::CONCAT x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match a sequence of
                        # two or more regular expressions.
                        # We call ourself to match the first
                        # one, pushing the rest on the stack:
                        #
                        match_regex (x, pos, this_char, last_char, rest_of_string, matches_found, CONCAT (re, stack));


                    match_regex (s::STAR x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match  foo*
                        # for some regex foo.
                        #
                        # We treat this as a special
                        # case of the general INTERVAL
                        # operator foo{m,n}:
                        #
                        match_regex (s::INTERVAL (x, 0, NULL) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::PLUS x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match  foo+
                        # for some regex foo.
                        #
                        # We treat this as a special
                        # case of the general INTERVAL
                        # operator foo{m,n}:
                        #
                        match_regex (s::INTERVAL (x, 1, NULL) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::OPTION x ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match  foo?
                        # for some regex foo.
                        #
                        # We treat this as a special
                        # case of the general INTERVAL
                        # operator foo{m,n}:
                        #
                        match_regex (s::INTERVAL (x, 0, THE 1) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    match_regex (s::INTERVAL (x, min, max) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match  foo{m,n}
                        # for some regex foo,
                        # where n might be
                        # infinite (represented by
                        # NULL) or we might have
                        # m>n, which always fails.
                        #
                        # In general we handle this
                        # by calling ourself recursively
                        # to match foo after pushing an
                        # appropriate REPEAT(foo) on the
                        # stack to handle the required
                        # remaining number of matches:
                        #
                        {   fun empty_match ()
                                = 
                                match_regex (re, pos, this_char, last_char, rest_of_string, 
                                       list::reverse_and_prepend (empty_alternative x, matches_found), stack);

                            fun try_at_least_one ()
                                = 
                                match_regex ([x], pos, this_char, last_char, rest_of_string, [], 
                                        REPEAT (x, pos, min, max, 1, re, matches_found, stack));

                            if (lesseq (min, max))
                                 
                                 if (min > 0)
                                     
                                      try_at_least_one ();
                                 else
                                      try_at_least_one ()
                                      except
                                          BACKTRACK = empty_match ();
                                 fi;
                            else
                                empty_match ();          # The range is empty.
                            fi;
                        };


                    match_regex (s::BEGIN ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match   ^
                        # the start-of-string char:
                        #
                        if (at_start (pos, last_char))   match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);
                        else
                            raise exception BACKTRACK;
                        fi;


                    match_regex (s::END ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # We're trying to match   $
                        # the end-of-string char:
                        #
                        case (getc rest_of_string)
                          
                             NULL
                                 =>
                                 match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);

                             last_char' as THE (c', rest_of_string')
                                 => 
                                 if (c' == '\n')

                                     if multiline

                                          # In multi-line  mode
                                          #  '$' matches a newline:
                                          #
                                          match_regex (re, pos+1, NULL, last_char', rest_of_string', matches_found, stack);
                                     else
                        
                                          # Even in single-line mode
                                          # '$' matches a newline
                                          # before the end of string:
                                          #
                                          case (getc rest_of_string')

                                              NULL =>  match_regex (re, pos, this_char, last_char, rest_of_string', matches_found, stack);
                                              _    =>  raise exception BACKTRACK;
                                          esac;         
                                     fi;

                                 else

                                     raise exception BACKTRACK;
                                 fi;
                        esac;


                    # Save a group match in a backref variable.
                    # The generic_regular_expression_syntax_g
                    # logic only generates this if there is a
                    # backref referencing the group:
                    #
                    match_regex (s::ASSIGN (v, f, x) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        match_regex ([x], pos, this_char, last_char, rest_of_string, [], ASSIGN (v, f, re, rest_of_string, pos, matches_found, stack));


                    # Match a back reference:
                    #
                    match_regex (s::BACK_REF (f, v) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        match_backref (0, last_char, rest_of_string)
                        where
                            text =  f (get_backref v);

                            n =   size text;

                            fun match_backref (i, last_char, rest_of_string)
                                = 
                                if   (i >= n)
                                     
                                     match_regex (re, pos+n, NULL, last_char, rest_of_string, matches_found, stack);
                                else
                                     case (getc rest_of_string)
                                       
                                          last_char as THE (c', rest_of_string')
                                              => 
                                              if (c' == string::get_byte_as_char (text, i))   match_backref (i+1, last_char, rest_of_string');
                                              else
                                                  raise exception BACKTRACK;
                                              fi;

                                          NULL => raise exception BACKTRACK;
                                     esac;
                              fi;
                        end;


                    match_regex (s::GUARD (predicate, x) ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # Handle a guard predicate.  (There is currently
                        # no way to invoke this from Perl5 syntax.)  At
                        # this point all we do is push a GUARD entry on
                        # our return stack;  the actual check will be done
                        # on return:
                        #
                        match_regex([x], pos, this_char, last_char, rest_of_string, [], GUARD (predicate, re, rest_of_string, pos, matches_found, stack));


                    # Generalized boundary operator 
                    #
                    match_regex (s::BOUNDARY ok ! re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                        =>
                        # Here we handle matching of
                        # the zero-length constructs
                        #
                        #    ^ $ \A \b \B \z \Z
                        #
                        # The 'ok' argument is one of
                        # the predicate functions
                        #
                        #     is_word_boundary
                        #     is_start_of_string
                        #     is_end_of_string
                        #     is_end_of_string'
                        #            ...
                        #
                        # defined and passed to us by
                        #
                        #     src/lib/regex/front/perl-regex-parser-g.pkg
                        #
                        {   prev =   case last_char
                                       
                                          THE (c, _) =>   THE c;
                                          NULL       =>   NULL;
                                     esac;

                            my  (this, next)
                                = 
                                case (getc rest_of_string)
                                  
                                     NULL => (NULL, NULL);

                                     THE (c, rest_of_string')
                                         => 
                                         case (getc rest_of_string')
                                           
                                              NULL        =>   (THE c, NULL  );
                                              THE (c', _) =>   (THE c, THE c');
                                         esac;
                                esac;


                            if (ok { prev, this, next })   match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);
                            else
                                 raise exception BACKTRACK;
                            fi;
                        };


                    # Fetch next char from string
                    # and pass it as this_char:
                    #
                    match_regex (re, pos, NULL, last_char, rest_of_string, matches_found, stack)
                        =>
                        case (getc rest_of_string)
                             NULL =>  raise exception BACKTRACK;
                             this_char =>  match_regex (re, pos, this_char, NULL, rest_of_string, matches_found, stack);
                        esac;
                end



                also
                fun pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, nested,

                          GROUP (re, rest_of_string', pos', siblings, stack)
                        )
                        =>
                        # Finished matching a group,
                        # so note the matched string
                        # in our matches_found slot
                        # and continue:
                        #
                        {   matches_found
                                =
                                m::REGEX_MATCH_RESULT
                                  (
                                    THE { match_position => rest_of_string',
                                          match_length   => pos - pos'
                                        },
                                    reverse nested
                                  )
                                !
                                siblings;

                            match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);
                        };


                    pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, matches_found2,

                          ASSIGN (backref_var, f, re, rest_of_string', pos', matches_found1, stack)
                        )
                        =>
                        # Note in indicated backreference variable the
                        # string matched since the ASSIGN was pushed
                        # on the stack, which is to say the part of
                        # the input string in the range (pos', pos).
                        #
                        # We have to be able to undo this when backtracking,
                        # so we also remember the old value of the var:
                        #
                        {   saved_value =   get_backref backref_var;

                            matches_found =   matches_found2 @ matches_found1;

                            set_backref (backref_var, f (get_n_chars (pos - pos', rest_of_string')));

                            match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack)
                            except
                                # Restore saved backref value
                                # when backtracking:
                                #
                                BACKTRACK
                                    =
                                    {   set_backref (backref_var, saved_value);
                                        raise exception BACKTRACK;
                                    };

                        };


                    pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, matches_found2,

                          GUARD (predicate, re, rest_of_string', pos', matches_found1, stack)
                        )
                        =>
                        # Call the given predicate with the
                        # substring matched since GUARD was
                        # pushed on the stack, which is to
                        # say the range (pos', pos).  If the
                        # predicate returns FALSE, fail this
                        # match try and backtrack.
                        #
                        # Our Perl5 surface syntax does not
                        # provide any way to specify such a
                        # predicate, so we aren't going to
                        # be doing this too often at present:
                        #
                        {   matches_found =   matches_found2 @ matches_found1;

                            if   (predicate (get_n_chars (pos - pos', rest_of_string')))
                                
                                 match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);
                            else
                                 raise exception BACKTRACK;
                            fi;
                        };


                    pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, matches_found,

                          CONCAT (re, stack)
                        )
                        # We're matching two regexes
                        # in sequence and just finished
                        # the first one, so pop the second
                        # off the stack and carry on:
                        #
                        =>
                        match_regex (re, pos, this_char, last_char, rest_of_string, matches_found, stack);


                    # We're handling a match of one of
                    #     foo*
                    #     foo+
                    #     foo?
                    #     foo{m,n}  
                    # for some regex 'foo'. (We use
                    # the same logic to handle all
                    # four constructs.)  We just
                    # completed a match of foo.
                    #
                    pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, matches_found2,

                          REPEAT
                            ( foo,              # Regex being matched repeatedly.
                              pos',             # Value of 'pos' before last match of 'foo'.
                              min,              # Minimum number of 'foo' matches needed.
                              max,              # Maximum number of 'foo' matches allowed. (NULL == "infinity".)
                              i,                # Actual  mumber of 'foo' matches now done.
                              re,               # Remaining regex to match when done.
                              matches_found1,   # Substrings previously matched while iterating over 'foo'.
                              stack
                            )
                        )
                        =>
                        if (i > 1 and pos == pos')

                            # This check keeps cases like
                            #
                            #     "abc" =~ ./(.?)*/
                            #
                            # from looping infinitely due to
                            # the fact that '*' repeats until
                            # the subexpression fails but that
                            # (without this check) (.?) will
                            # never fail:
                            #
                            raise exception BACKTRACK;

                        elif (i < min)
                            
                             # We haven't yet made our nut,
                             # so keep on iterating:
                             #  
                             match_regex
                               ( [foo], pos, this_char, last_char, rest_of_string, [],
                                 REPEAT (foo, pos, min, max, i+1, re, matches_found1, stack)
                               );

                        elif (less (i, max))

                             # We've made our nut.  We keep
                             # on iterating 'foo' so as to
                             # favor a "maximum-munch" solution
                             # but we have a legal number of
                             # 'foo' matches now, so if the
                             # next 'foo' match fails we trap
                             # the BACKTRACK and continue the
                             # rest of the global match with
                             # just the current number of 'foo'
                             # matches:
                             #    
                             match_regex
                               ( [foo], pos, this_char, last_char, rest_of_string, [],
                                 REPEAT (foo, pos, min, max, i+1, re, matches_found1, stack)
                               )
                             except
                                 BACKTRACK
                                     =
                                     match_regex (re, pos, this_char, last_char, rest_of_string, matches_found2 @ matches_found1, stack);
                        else 
                                     match_regex (re, pos, this_char, last_char, rest_of_string, matches_found2 @ matches_found1, stack);
                                     #
                                     # We've reached our 'foo' repeats limit
                                     # so carry on with the rest of the global
                                     # regex match.
                        fi;

                    pop_stack_and_continue
                        ( pos, this_char, last_char, rest_of_string, matches_found,

                          RETURN
                        )
                        =>
                        # We've successfully completed the entire
                        # requested regex match on the string, so
                        # reverse the match results to put them
                        # in the caller-expected left-to-right order,
                        # and we're done:
                        #
                        (pos, this_char, last_char, rest_of_string, reverse matches_found);
                end;


                # Do the complete requested regex match.
                # If it doesn't succeed, we'll wind up
                # at the "except BACKTRACK" below;  if
                # we return normally, the match succeeded:
                #
                my (pos, this_char, last_char, rest_of_string, matches_found)
                    = 
                    match_regex ([regexp], start_pos, NULL, last, stream, [], RETURN);

                # By convention "group 0" match is the
                # entire matched (sub)string, so push
                # that on the front of the list of
                # match results and return the lot:
                #
                THE ( m::REGEX_MATCH_RESULT
                        ( THE { match_position =>  stream,
                                match_length   =>  pos - start_pos
                              },

                          matches_found
                        ),

                        rest_of_string
                    );
            }                                           # fun scan
            except
                BACKTRACK = NULL;


        fun allot_refs (COMPILED_REGULAR_EXPRESSION { backref_var_count, references, ... } )
            = 
            if   (thread_safe   and   backref_var_count > 0) 
                
                 v::make_rw_vector (backref_var_count, "");
            else
                 references;
            fi;


        fun prefix regexp getc stream
            = 
            scan (regexp, allot_refs regexp, getc, 0, stream, NULL);


        fun find   (regexp as COMPILED_REGULAR_EXPRESSION { begin_only, multiline, ... } )   getc   stream
            =  
            {
                refs =   allot_refs regexp;

                #  Most general and slowest 
                #
                fun loop1 (pos, stream, last)
                    = 
                    case (scan (regexp, refs, getc, pos, stream, last))
                      
                         NULL
                             =>
                             case (getc stream)
                               
                                  NULL => NULL;

                                  last as THE (c, s)
                                       =>
                                       loop1 (pos+1, s, last);
                             esac;

                         x => x;
                    esac;


                # Multiline pattern that only
                # matches at start of line:
                #
                fun loop2 (pos, stream, last)
                    = 
                    case (scan (regexp, refs, getc, pos, stream, last))
                      
                          NULL =>   skip_to_next_line (pos, stream);
                          x    =>   x;
                    esac

                also
                fun skip_to_next_line (pos, stream)
                    =
                    case (getc stream)
                      
                         NULL => NULL;
                         last as THE('\n', s) =>   loop2 (pos+1, s, last);
                         last as THE(_,    s) =>   skip_to_next_line (pos+1, s);
                    esac;

                case (begin_only, multiline)
                  
                     (TRUE, FALSE) =>   scan (regexp, refs, getc, 0, stream, NULL);
                     (FALSE, _   ) =>   loop1 (0, stream, NULL);
                     (TRUE, TRUE ) =>   loop2 (0, stream, NULL);
                esac;
            };


        # Execute the longest match:
        #
        fun match rules
            = 
            do_it
            where
                # Precompile all patterns:
                #
                rules =  map  (\\ (re, act) =  (compile re, act))
                              rules;

                fun do_it getc stream
                    = 
                    find_best (rules, NULL)
                    where 
                        fun find_best ([], NULL)
                                =>
                                NULL;

                            find_best ([], THE (len, match, s, act))
                                =>
                                THE (act match, s);

                            find_best((re, act) ! rest, best)
                                =>
                                case (prefix re getc stream, best)
                                  
                                     (NULL, best)
                                         =>
                                         find_best (rest, best);


                                     ( THE (m as m::REGEX_MATCH_RESULT (THE { match_length, ... }, _), s),
                                       NULL
                                     )
                                         => 
                                         find_best (rest, THE (match_length, m, s, act));


                                     ( THE (m as m::REGEX_MATCH_RESULT (THE { match_length, ... }, _), s), 
                                       THE (len', _, _, _)
                                     )
                                         =>
                                         if (match_length > len')   find_best (rest, THE (match_length, m, s, act));
                                         else                       find_best (rest, best);
                                         fi;

                                      _ => raise exception BUG;
                                esac;
                        end;                            # fun find_best
                    end;                                # where/do_it
            end;                                        # where/match

    };

end;                                                    # stipulate


Comments and suggestions to: bugs@mythryl.org

PreviousUpNext