PreviousUpNext

15.4.854  src/lib/regex/backend/bt-engine.pkg

## bt-engine.pkg

# Compiled by:
#     src/lib/std/standard.lib

# A regular expressions matcher based on a backtracking search.


###              "Technology has advanced more
###               in the last thirty years than
###               in the previous two thousand.
###               The exponential increase in
###               advancement will only continue."
###
###                      -- Niels Bohr



package   backtrack_engine
: (weak)  Regular_Expression_Engine                                     # Regular_Expression_Engine     is from   src/lib/regex/backend/regular-expression-engine.api
{
    exception ERROR;

    package r = abstract_regular_expression;                            # abstract_regular_expression   is from   src/lib/regex/front/abstract-regular-expression.pkg
    package s = r;
    package m = regex_match_result;                                     # regex_match_result            is from   src/lib/regex/glue/regex-match-result.pkg

    Regex = s::Syntax;

    fun compile r = r;

    fun scan (regexp, getc, pos, stream)
        =
        {   fun getc' s
                =
                case (getc s)
                  
                     THE v =>  v;
                     NULL  =>  raise exception INDEX_OUT_OF_BOUNDS;
                esac;

            # This function gets an empty match structure,
            # for when the appropriate alternative is not
            # followed at all:
            #
            fun get_match_structure (s::GROUP e)            =>  [m::MATCH (NULL, get_match_structure e)];
                get_match_structure (s::ALT l)              =>  list::cat (map get_match_structure l);
                get_match_structure (s::CONCAT l)           =>  list::cat (map get_match_structure l);

                get_match_structure (s::INTERVAL (e, _, _)) =>  get_match_structure e;
                get_match_structure (s::OPTION e)           =>  get_match_structure e;
                get_match_structure (s::STAR e)             =>  get_match_structure e;
                get_match_structure (s::PLUS e)             =>  get_match_structure e;

                get_match_structure (_)                     =>  [];
            end;


            # Walk a regular expression in fate-passing style
            # The fate is simply a list of all this is left to do
            # Fates only seem to arise when concatenation are considered
            # 
            # Walk returns the boolean status of the beast, and a match_tree
            # containing the match information.
            # Also: the last position scanned and the remainder stream
            # MODIFICATION: walk returns a list of matches
            # (because we need to extract the longest match)
            #
            fun max []       sel
                    =>
                    raise exception ERROR;

                max (x . xs) sel
                    => 
                    {   fun max' [] curr curr_sel => curr;

                            max' (x . xs) curr curr_sel
                                =>
                                {   x_sel = sel x;
                                    if (x_sel > curr_sel   )   max' xs x x_sel;
                                                          else   max' xs curr curr_sel;   fi;
                                };
                        end;

                        max' xs x (sel x);
                    };
            end;


            fun longest l
                =
                max l (#3: (X, Y, Int, Z) -> Int);

            fun opt_minus1 (THE i) =>  THE (i - 1);
                opt_minus1 NULL    =>  NULL;
            end;

            fun walk (s::GROUP e, fate, p, inits)
                    => 
                    case (walk (e,[], p, inits) )
                      
                         [] => [(FALSE,[], p, inits)];

                         ((b, matches, last, s) . ls)
                             => 
                             {   fun loop [] c_last 1 c_cont c_list
                                         =>
                                         {   my [(b,  matches,  last,  s )] =  c_list;
                                             my [(b', matches', last', s')] =  c_cont;

                                             [(b', (m::MATCH (THE { pos=>inits, len=>last-p }, 
                                                             matches)) . matches', last', s')];
                                         };

                                     loop [] c_last n c_cont c_list
                                         =>
                                         raise exception ERROR;

                                     loop ((b, matches, last, s) . es) c_len c_num c_cont c_list
                                         =>
                                         {  my v as (_, _, last', _)
                                                =
                                                longest (walk (s::CONCAT [], fate, last, s));

                                             if   (last' > c_len)
                                                 
                                                  loop es last' 1 [v] [(b, matches, last, s)];
                                             else
                                                  if   (last' == c_len)
                                                      
                                                       loop es c_len (c_num+1) (v . c_cont) 
                                                                ((b, matches, last, s) . c_list);
                                                  else
                                                       loop es c_len c_num c_cont c_list;
                                                  fi;
                                              fi;
                                         };
                                 end;

                                 loop ls last 1 [longest (walk (s::CONCAT [], fate, last, s))] 
                                      [(b, matches, last, s)];
                             };
                    esac;

                walk (s::ALT [],[], p, inits) =>  [(TRUE,[], p, inits)];
                walk (s::ALT [], (c . cs), p, inits) =>  walk (c, cs, p, inits);
                walk (s::ALT l, fate, p, inits)
                     => 
                     loop l
                     where
                         fun loop [] => [];

                             loop (e . es)
                                 =>
                                 {   g = longest (walk (e, fate, p, inits));

                                     if   (#1 g)
                                          g . (loop es);
                                     else     (loop es);       fi;
                                 };
                         end;
                     end;

                walk (s::CONCAT [],[], p, inits) =>  [(TRUE,[], p, inits)];
                walk (s::CONCAT [], (c . cs), p, inits) =>  walk (c, cs, p, inits);
                walk (s::CONCAT (e . es), fate, p, inits) =>  walk (e, (es@fate), p, inits);
                walk (s::INTERVAL (e, 0, THE 0),[], p, inits) =>  [(TRUE,[], p, inits)];
                walk (s::INTERVAL (e, 0, THE 0), (c . cs), p, inits) =>  walk (c, cs, p, inits);
                walk (s::INTERVAL (e, 0, k), fate, p, inits)
                     => 
                     {   my (b', matches', last', s') = longest (walk (s::CONCAT [],          fate, p, inits));
                         my (b,  matches,  last,  s ) = longest (walk (s::INTERVAL (e, 1, k), fate, p, inits));

                         if   ((b and b' and last >= last') or (b and not b'))
                             
                              [(b, matches, last, s)];
                         else
                              if ((b' and b and last' > last) or (b' and not b))
                                  
                                   [(b', (get_match_structure e)@matches', last', s')];
                              else
                                   [(FALSE,[], p, inits)];
                              fi;
                         fi;
                     };

                walk (s::INTERVAL (e, 1, THE 1), fate, p, inits) => walk (e, fate, p, inits);
                walk (s::INTERVAL (e, 1, k), fate, p, inits)
                    => 
                    {   my (b', matches', last', s') = longest (walk (e,[], p, inits));  #  need to match 1 

                        if   (not b')
                            
                             [(FALSE, [], p, inits)];
                        else
                             my (b,   matches,   last,   s  ) = longest (walk (s::INTERVAL (e, 1, opt_minus1 k), fate, last', s'));
                             my (b'', matches'', last'', s'') = longest (walk (s::CONCAT [],                     fate, last', s'));

                             if   (b and b'' and last'' >= last)
                                 
                                  [(b'', matches'@matches'', last'', s'')];
                             else
                                  if b   [(b, matches, last, s)];
                                  else   [(b'', matches'@matches'', last'', s'')];      fi;
                             fi;

                        fi;
                    };

                walk (s::INTERVAL (e, n1, k), fate, p, inits)
                    =>
                    walk (s::CONCAT [e, s::INTERVAL (e, n1 - 1, opt_minus1 k)], fate, p, inits);

                walk (s::OPTION e, fate, p, inits)
                    =>
                    walk (s::INTERVAL (e, 0, THE 1), fate, p, inits);

                walk (s::STAR e, fate, p, inits)
                    =>
                    walk (s::INTERVAL (e, 0, NULL), fate, p, inits);

                walk (s::PLUS e, fate, p, inits)
                    =>
                    walk (s::INTERVAL (e, 1, NULL), fate, p, inits);

                walk (s::MATCH_SET set,[], p, inits)
                    =>
                    if   (s::char_set::is_empty set)
                        
                         [(TRUE,[], p, inits)];
                    else 
                         case (getc inits)
                           
                              THE (chr, s)
                                  => 
                                  {   b = s::char_set::member (set, chr);

                                      [(b,[], p+(b ?? 1 :: 0), (b ?? s :: inits))];
                                  };

                              NULL => [(FALSE,[], p, inits)];
                         esac;
                    fi;

                walk (s::MATCH_SET set, (c . cs), p, inits)
                    =>
                    if   (s::char_set::is_empty set)
                        
                         walk (c, cs, p, inits);
                    else
                         case (getc inits)
                           
                              THE (chr, s)
                                  => 
                                  if   (s::char_set::member (set, chr))
                                      
                                       walk (c, cs, (p+1), s);
                                  else
                                       [(FALSE,[], p, inits)];
                                  fi;

                              NULL => [(FALSE,[], p, inits)];
                         esac;
                    fi;

                walk (s::NONMATCH_SET set,[], p, inits)
                    =>
                    case (getc inits)
                      
                         THE (chr, s)
                             => 
                             {   b = not (s::char_set::member (set, chr));

                                 [(b, [], p+(b ?? 1 :: 0), (b ?? s :: inits))];
                             };

                         NULL
                             =>
                             [(FALSE,[], p, inits)];
                   esac;

                walk (s::NONMATCH_SET set, (c . cs), p, inits)
                    => 
                    case (getc inits)
                      
                         THE (chr, s)
                             =>
                             if   (s::char_set::member (set, chr))
                                 
                                  [(FALSE,[], p, inits)];
                             else
                                  walk (c, cs, (p+1), s);
                             fi;

                         NULL => [(FALSE,[], p, inits)];
                    esac;

                walk (s::CHAR ch,[], p, inits)
                    =>
                     case (getc inits)
                       
                          THE (chr, s)
                              => 
                              {   b = (chr == ch);

                                  [(b, [], p+(b ?? 1 :: 0), (b ?? s :: inits))];
                              };

                          NULL => [(FALSE,[], p, inits)];
                     esac;

                walk (s::CHAR ch, (c . cs), p, inits)
                    =>
                    case (getc inits)
                      
                         THE (chr, s)
                             =>
                             if   (chr == ch)
                                 
                                  walk (c, cs, (p+1), s) ;
                             else
                                  [(FALSE,[], p, inits)];
                             fi;

                         NULL => [(FALSE,[], p, inits)];
                    esac;

                walk (s::BEGIN,[], p, inits)
                    =>
                    [(p==0,[], p, inits)];

                walk (s::BEGIN, (c . cs), p, inits)
                    =>
                    if   (p==0)
                        
                         walk (c, cs, p, inits);
                    else
                         [(FALSE,[], p, inits)];
                    fi;

                walk (s::END,[], p, inits)
                    =>
                    [(not (null_or::not_null (getc (inits))),[], p, inits)];

                walk (s::END, (c . cs), p, inits)
                    =>
                    if   (null_or::not_null (getc (inits)))
                        
                         [(FALSE,[], p, inits)];
                    else
                         walk (c, cs, p, inits);
                    fi;

                walk _
                    =>
                    raise exception ERROR;
            end;

            l = walk (regexp,[], pos, stream)
                except
                    INDEX_OUT_OF_BOUNDS = [(FALSE,[], pos, stream)];

            my v as (result, matches, last, s')
                =
                longest l
                except
                    _ = (FALSE,[], pos, stream);

            if result   THE (m::MATCH (THE { pos=>stream, len=>last-pos }, matches), s');
            else        NULL;   fi;
        };                              # fun scan


    fun prefix regexp getc stream
        =
        scan (regexp, getc, 0, stream);

    fun find regexp getc stream
        =       
        {   fun loop (p, s)
                =
                case (scan (regexp, getc, p, s))
                  
                     NULL
                         =>
                         case (getc s)
                           
                              THE (_, s') =>  loop (p+1, s');
                              NULL        =>  NULL;
                         esac;

                     THE v => THE v;
                esac;

            loop (0, stream);
        };


    fun match [] getc stream
            =>
            NULL;

        match l  getc stream
            =>
            {   m = map
                        (\\ (r, f) =  (prefix r getc stream, f))
                        l;


                #  Find the longest THE 

                fun loop ([], max, len)
                        =>
                        max;

                    loop ((NULL, _) . xs, max, maxlen)
                        =>
                        loop (xs, max, maxlen);

                    loop ((THE (m, cs), f) . xs, max, maxlen)
                        =>
                        {   my (THE { pos, len } )
                               =
                               match_tree::nth (m, 0);

                            if   (len > maxlen)
                                 loop (xs, (THE (m, cs), f), len);
                            else loop (xs, max, maxlen);  fi;
                        };
                end;

                my (max, f)
                    =
                    loop (tail m, head m, -1);

                case max 
                  
                     NULL => NULL;
                     THE (m, cs) => THE (f m, cs);
                esac;
            };
    end;

};






Comments and suggestions to: bugs@mythryl.org

PreviousUpNext