PreviousUpNext

15.4.575  src/lib/compiler/front/parser/raw-syntax/regex-to-raw-syntax.pkg

## regex-to-raw-syntax.pkg

# Compiled by:
#     src/lib/compiler/front/parser/parser.sublib

# This was an early 'Perl7' idea to speed up regex
# execution speed by compiling hard code for it.
# The results were unimpressive.  I think that the
# fact that the input string winds up being scanned
# one character at a time (typically) means that
# there is very little win in practice from
# eliminating the interpretation step of the
# regex itself.
#
# This file and related stuff should probably be deleted
# unless it can be repurposed for something.
#
#                 -- 2009-10-30 CrT

# Compiled by:
#     src/lib/compiler/front/parser/parser.sublib



###               "Mathematics, rightly viewed, possesses not only truth,
###                but supreme beauty -- a beauty cold and austere,
###                like that of sculpture, without appeal to any part
###                of our weaker nature, without the gorgeous trappings
###                of paintings or music, yet sublimely pure and capable
###                of a stern perfection such as only the greatest art can show."
###
###                                             -- Bertrand Russell



package   regex_to_raw_syntax
: (weak)  Regex_To_Raw_Syntax                           # Regex_To_Raw_Syntax   is from   src/lib/compiler/front/parser/raw-syntax/regex-to-raw-syntax.api
{
    include package   raw_syntax;
    include package   error_message;
    include package   symbol;
    include package   fast_symbol;
    include package   raw_syntax_junk;
    include package   fixity;

    # A simple syntax tree for regular expressions:

    Regular_Expression
      = REGEX_STRING  String
      | REGEX_DOT
      | REGEX_STAR  Regular_Expression
      ;

    exception REGEX_CODE_BROKEN;

    fun regex_to_raw_syntax (expression, regular_expressions, expressionleft, expressionright, regular_expressionsright)
        =
        {   eqeq_as_rawsym   =   raw_symbol (eqeq_hash, eqeq_string);

            (make_value_and_fixity_symbols   eqeq_as_rawsym)
                ->      
                (v, f);

            eqeq_as_aexp =  { item               => VARIABLE_IN_EXPRESSION [v],
                              source_code_region => (expressionleft, regular_expressionsright),
                              fixity             => THE f
                            };


            fun make_raw   name_string
                =
                raw_symbol (hash_string::hash_string name_string, name_string);



            fun pat_and_aexp_syms   name_string
                =
                {   raw_sym = make_raw name_string;

                    my (v, f) =   make_value_and_fixity_symbols   raw_sym;

                    (   {   item               => VARIABLE_IN_PATTERN [v], 
                            source_code_region => (expressionleft, regular_expressionsright),
                            fixity             => THE f
                        },
                        {   item               => VARIABLE_IN_EXPRESSION [v], 
                            source_code_region => (expressionleft, regular_expressionsright),
                            fixity             => THE f
                        }
                    );
                };

            fun vb_val_naming   (apat, expression)
                =
                NAMED_VALUE {
                    expression,
                    pattern    => PRE_FIXITY_PATTERN [ apat ],
                    is_lazy    => FALSE
                };

            fun aexp_val_naming   (apat, expression)
                =
                {   vb =   vb_val_naming   (apat, expression);

                    VALUE_DECLARATIONS ([ vb ], NIL);
                };

            fun aexp_let   (declaration, expression)
                =
                LET_EXPRESSION { declaration, expression };

            fun to_fixity_item   item
                =
                {   item,
                    source_code_region => (expressionleft, expressionright),
                    fixity             => NULL
                };

            fun dot_exp_let   (declaration, expression)
                =
                [   {   item => aexp_let (declaration, expression),
                        source_code_region => (expressionleft, expressionright),
                        fixity             => NULL
                    }
                ];

            fun expr_let   (declaration, expression)
                =
                PRE_FIXITY_EXPRESSION (dot_exp_let (declaration, expression));

            fun aexp_package_part   (package_name, id_name)
                =
                {    p =   make_package_symbol (make_raw package_name);
                     i =   make_value_symbol   (make_raw id_name);

                     VARIABLE_IN_EXPRESSION [p, i];
                };



            my (substrate_as_apat, substrate_as_aexp) = pat_and_aexp_syms  "substrate";
            my (subscript_as_apat, subscript_as_aexp) = pat_and_aexp_syms  "INDEX_OUT_OF_BOUNDS";
            my (    false_as_apat,     false_as_aexp) = pat_and_aexp_syms  "FALSE";
            my (    deref_as_apat,     deref_as_aexp) = pat_and_aexp_syms  "deref";
            my (        c_as_apat,         c_as_aexp) = pat_and_aexp_syms  "c";
            my (        i_as_apat,         i_as_aexp) = pat_and_aexp_syms  "i";
            my (     loop_as_apat,      loop_as_aexp) = pat_and_aexp_syms  "loop";
            my (    match_as_apat,     match_as_aexp) = pat_and_aexp_syms  "match";
            my (   match2_as_apat,    match2_as_aexp) = pat_and_aexp_syms  "match2";
            my (match_end_as_apat, match_end_as_aexp) = pat_and_aexp_syms  "match_end";
            my (        s_as_apat,         s_as_aexp) = pat_and_aexp_syms  "s";
            my (      ref_as_apat,       ref_as_aexp) = pat_and_aexp_syms  "REF";
            my (     nada_as_apat,      nada_as_aexp) = pat_and_aexp_syms  "_";
            my (     plus_as_apat,      plus_as_aexp) = pat_and_aexp_syms  "+";
            my (   bangeq_as_apat,    bangeq_as_aexp) = pat_and_aexp_syms  "!=";
            my (  coloneq_as_apat,   coloneq_as_aexp) = pat_and_aexp_syms  ":=";

            my ( try_match_at_all_offsets_as_apat, try_match_at_all_offsets_as_aexp)
                =
                pat_and_aexp_syms  "try_match_at_all_offsets";


            fun dot_exp_int   i
                =
                [   to_fixity_item (INT_CONSTANT_IN_EXPRESSION i)   ];



            # Synthesize the raw syntax for a
            #
            #    fun name i = body
            #
            # declaration:
            #
            fun make_fun (name_as_apat, body)
                =
                FUNCTION_DECLARATIONS (
                    [   NAMED_FUNCTION {

                            pattern_clauses
                                =>
                                [
                                    PATTERN_CLAUSE {

                                        result_type =>   NULL,
                                        patterns    =>   [ name_as_apat, i_as_apat ],
                                        expression  =>   body
                                    }
                                ],

                            kind    => PLAIN_FUN,
                            is_lazy => FALSE,

                            null_or_type => NULL
                        }
                    ],
                    NIL
                );


            substrate_eq_expression =   aexp_val_naming ( substrate_as_apat, expression );
            i_eq_zero               =   aexp_val_naming ( i_as_apat, PRE_FIXITY_EXPRESSION (dot_exp_int 0) );

            match_end_eq_ref_zero
                =
                aexp_val_naming (
                    match_end_as_apat,
                    PRE_FIXITY_EXPRESSION [
                        ref_as_aexp,
                        to_fixity_item (INT_CONSTANT_IN_EXPRESSION 0)
                    ]
                );


            # We match amy single character but newline.
            #
            # Here we define a parse-time function to
            # generate the raw syntax declaring a function
            # 'match' like one of the two below:
            #
            #    # Final case: 
            #    fun match i
            #        =
            #        let c = string::get_byte_as_char ( substrate, i )
            #        in
            #            c != '\n';
            #        end 
            #
            #    # Non-final case: 
            #    fun match i
            #        =
            #        let fun match i = ...
            #            c = string::get_byte_as_char ( substrate, i )
            #            i = i + 1
            #        in
            #            c != '\n'
            #            and
            #            match i;
            #        end 
            #
            fun make_dot_match_fn   (i, fate_or_null)
                =
                if (fate_or_null == NULL)
                    
                    # This is the "end of target_string" case,
                    # with no further recursive calls needed:
                    #
                    #        let c = string::get_byte_as_char ( substrate, i )
                    #            i = i + 1
                    #            (match_end := i);      # Publish location of end of match (plus one).
                    #        in
                    #            c != '\n';
                    #        end 

                    expr_let (

                        SEQUENTIAL_DECLARATIONS [

                            # "c = string::get_byte_as_char ( substrate, i );"
                            aexp_val_naming (
                                c_as_apat,
                                PRE_FIXITY_EXPRESSION [
                                    to_fixity_item (aexp_package_part ("string", "sub") ),
                                    to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
                                ]
                            ),

                            #  i = i + 1
                            aexp_val_naming (
                                i_as_apat,
                                PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
                            ),

                            #  (match_end := i)
                            aexp_val_naming (
                                nada_as_apat,
                                PRE_FIXITY_EXPRESSION [ match_end_as_aexp, coloneq_as_aexp, i_as_aexp ]
                            )
                        ],

                        # c != '\n'
                        PRE_FIXITY_EXPRESSION [
                            c_as_aexp,
                            bangeq_as_aexp,
                            to_fixity_item (
                                CHAR_CONSTANT_IN_EXPRESSION "\n"
                            )
                        ]
                    );


                else

                    # This is the "before end of target_string" case,
                    # with further recursive calls needed for full match:
                    #
                    #        let fun match i = ...
                    #            c = string::get_byte_as_char ( substrate, i )
                    #            i = i + 1
                    #        in
                    #            c != '\n'
                    #            and
                    #            match i;
                    #        end 

                    expr_let (

                        SEQUENTIAL_DECLARATIONS [

                            # Compile in declaration of our fate fn as
                            #
                            #     fun match i = ...
                            #
                            case fate_or_null

                                THE fate
                                     =>
                                     make_fun( match_as_apat, fate );

                                NULL => raise exception REGEX_CODE_BROKEN;
                            esac,

                            # "c = string::get_byte_as_char ( substrate, i );"
                            aexp_val_naming (
                                c_as_apat,
                                PRE_FIXITY_EXPRESSION [
                                    to_fixity_item (aexp_package_part ("string", "sub") ),
                                    to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
                                ]
                            ),

                            #  i = i + 1
                            aexp_val_naming (
                                i_as_apat,
                                PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
                            )
                        ],

                        # c != '\n'
                        # and
                        # match i;
                        AND_EXPRESSION (
                            PRE_FIXITY_EXPRESSION [
                                c_as_aexp,
                                bangeq_as_aexp,
                                to_fixity_item (
                                    CHAR_CONSTANT_IN_EXPRESSION "\n"
                                )
                            ],
                            PRE_FIXITY_EXPRESSION [ match_as_aexp, i_as_aexp ]
                        )

                    );fi;






            # We match a length-N constant string using
            # N simple functions, each of which checks
            # one character and then either gives up or
            # calls the next function in the chain. The
            # last such function is a special case, so
            # we need to be able to compile two sorts of
            # these functions.
            #
            # Here we define a parse-time function to
            # generate the raw syntax declaring a function
            # 'match' like one of the two below, where
            # 'h' and 'e' are illustrative character
            # constants from the regular expression string:
            #
            #    # Final case: 
            #    fun match i
            #        =
            #        let c = string::get_byte_as_char ( substrate, i )
            #        in
            #            c == 'e';
            #        end 
            #
            #    # Non-final case: 
            #    fun match i
            #        =
            #        let fun match i = ...
            #            c = string::get_byte_as_char ( substrate, i )
            #            i = i + 1
            #        in
            #            c == 'h'
            #            and
            #            match i;
            #        end 
            #
            fun make_string_match_fn   (pattern_string, i, fate_or_null)
                =
                if ( (i + 1)      ==   (size pattern_string)
                     and
                     fate_or_null ==   NULL
                )
                    # This is the "end of target_string" case,
                    # with no further recursive calls needed:
                    #
                    #        let c = string::get_byte_as_char ( substrate, i )
                    #            i = i + 1
                    #            (match_end := i);      # Publish location of end of match (plus one).
                    #        in
                    #            c == 'e';
                    #        end 

                    expr_let (

                        SEQUENTIAL_DECLARATIONS [

                            # "c = string::get_byte_as_char ( substrate, i );"
                            aexp_val_naming (
                                c_as_apat,
                                PRE_FIXITY_EXPRESSION [
                                    to_fixity_item (aexp_package_part ("string", "sub") ),
                                    to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
                                ]
                            ),

                            #  i = i + 1
                            aexp_val_naming (
                                i_as_apat,
                                PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
                            ),

                            #  (match_end := i)
                            aexp_val_naming (
                                nada_as_apat,
                                PRE_FIXITY_EXPRESSION [ match_end_as_aexp, coloneq_as_aexp, i_as_aexp ]
                            )
                        ],

                        # c == 'h'
                        PRE_FIXITY_EXPRESSION [
                            c_as_aexp,
                            eqeq_as_aexp,
                            to_fixity_item (
                                CHAR_CONSTANT_IN_EXPRESSION (
                                    string::substring (pattern_string, i, 1)
                                )
                            )
                        ]
                    );


                else
                    # This is the "before end of target_string" case,
                    # with further recursive calls needed for full match:
                    #
                    #        let fun match i = ...
                    #            c = string::get_byte_as_char ( substrate, i )
                    #            i = i + 1
                    #        in
                    #            c == 'h'
                    #            and
                    #            match i;
                    #        end 

                    expr_let (

                        SEQUENTIAL_DECLARATIONS [

                            # Here we need to define our fate fn as
                            #
                            #     fun match i = ...
                            #
                            # for some suitable '...'.
                            #
                            # If we have not yet reached the end of the pattern_string
                            # which we are assigned to match, then we need a fate
                            # function which checks the next character in pattern_string.
                            # Otherwise, our 'match' fn here will be the fate fn
                            # passed in by our caller, which will try to match the next
                            # regular expression element after our pattern_string.
                            #
                            if   ((i + 1)      <   (size pattern_string))
                                
                                 make_fun (
                                     match_as_apat,
                                     make_string_match_fn   (pattern_string,   i + 1,   fate_or_null)
                                 );
                            else
                                 case fate_or_null

                                     THE fate
                                         =>
                                         make_fun( match_as_apat, fate );

                                     NULL => raise exception REGEX_CODE_BROKEN;
                                 esac;
                            fi,

                            # "c = string::get_byte_as_char ( substrate, i );"
                            aexp_val_naming (
                                c_as_apat,
                                PRE_FIXITY_EXPRESSION [
                                    to_fixity_item (aexp_package_part ("string", "sub") ),
                                    to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
                                ]
                            ),

                            #  i = i + 1
                            aexp_val_naming (
                                i_as_apat,
                                PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
                            )
                        ],

                        # c == 'h'
                        # and
                        # match i;
                        AND_EXPRESSION (
                            PRE_FIXITY_EXPRESSION [
                                c_as_aexp,
                                eqeq_as_aexp,
                                to_fixity_item (
                                    CHAR_CONSTANT_IN_EXPRESSION (
                                        string::substring (pattern_string, i, 1)
                                    )
                                )
                            ],
                            PRE_FIXITY_EXPRESSION [ match_as_aexp, i_as_aexp ]
                        )
                    );fi;



            # Here we set up directly by hand the raw syntax
            # defining the below function.  This syntax will
            # be included in a let...in...end statement. The
            # purpose of this function is to try matching a
            # given constant string (implicitly defined by
            # fun 'match') at offsets i, i+1, i+2... until either
            # a successful match is found or a INDEX_OUT_OF_BOUNDS exception
            # is thrown, meaning that we've run out of input string.
            #
            #    fun try_match_at_all_offsets (i)
            #        =
            #        match i
            #        or  
            #        try_match_at_all_offsets (i+1)
            #
            fun_try_match_at_all_offsets_i
                =
                make_fun (

                    try_match_at_all_offsets_as_apat,

                    OR_EXPRESSION (
                        PRE_FIXITY_EXPRESSION [
                            match_as_aexp,
                            i_as_aexp
                        ],
                        PRE_FIXITY_EXPRESSION [
                            try_match_at_all_offsets_as_aexp,
                            to_fixity_item (
                                PRE_FIXITY_EXPRESSION [
                                    i_as_aexp,
                                    plus_as_aexp,
                                    to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1)
                                ]
                            )
                        ]
                    )
                );



            # We want to generate raw syntax for a function
            # which finds as many matches of 'match'
            # as possible in a row followed by a match
            # of 'match2'.
            # 
            # If 'match' and 'match2' always returned a
            # boolean value, we could use the code:
            #
            #     fun x i
            #         =
            #         let fun match i = ...  # Pattern to repeat.
            #             fun match2 i = ... # Fate.
            #
            #             fun loop i
            #                 =
            #                 (match i   and   loop *match_end)
            #                 or                 
            #                 (match2 i)
            #         in
            #             loop i
            #         end
            #
            # In practice, 'match' and 'match2' may throw a
            # INDEX_OUT_OF_BOUNDS exception, so we need to trap those
            # and treat them the same as FALSE:
            #
            #
            #     fun x i
            #         =
            #         let fun match i = ...  # Pattern to repeat.
            #             fun match2 i = ... # Fate.
            #
            #             fun loop i
            #                 =
            #                 (   (match i   and   loop *match_end)
            #                     except
            #                         INDEX_OUT_OF_BOUNDS = FALSE
            #                 )
            #                 or                 
            #                 (match2 i)
            #         in
            #             loop i
            #         end
            #
            fun make_star_match_fn   (regex, fate_or_null)
                =
                case fate_or_null

                     NULL => raise exception REGEX_CODE_BROKEN;

                    THE fate
                         =>
                         expr_let (

                             SEQUENTIAL_DECLARATIONS [

                                 make_fun (
                                     match_as_apat,
                                     regex_to_raw_syntax (regex, NULL)
                                 ),

                                 make_fun (
                                     match2_as_apat,
                                     fate
                                 ),

                                 make_fun (
                                     loop_as_apat,

                                     OR_EXPRESSION (

                                         #      (match i   and   loop *match_end)
                                         #      except
                                         #          INDEX_OUT_OF_BOUNDS = FALSE
                                         #             
                                         EXCEPT_EXPRESSION {

                                             expression
                                                 =>
                                                 AND_EXPRESSION (

                                                     # match i
                                                     PRE_FIXITY_EXPRESSION [
                                                         match_as_aexp,
                                                         i_as_aexp
                                                     ],

                                                     # loop *match_end
                                                     PRE_FIXITY_EXPRESSION [
                                                         loop_as_aexp,

                                                         to_fixity_item (
                                                             PRE_FIXITY_EXPRESSION [
                                                                 deref_as_aexp,
                                                                 match_end_as_aexp
                                                             ]
                                                         )
                                                     ]
                                                 ),

                                             rules
                                                 =>
                                                 [   CASE_RULE {
                                                         pattern => subscript_as_apat.item, 
                                                         expression => false_as_aexp.item
                                                     }
                                                 ]
                                         },

                                         # match2 i
                                         PRE_FIXITY_EXPRESSION [
                                             match2_as_aexp,
                                             i_as_aexp
                                         ]
                                     )
                                 )
                             ],


                             # loop i
                             PRE_FIXITY_EXPRESSION [
                                 loop_as_aexp,
                                 i_as_aexp
                             ]
                         );
                esac


           also
           fun regex_to_raw_syntax (
                    regex,              # Regular expression element to translate
                    fate_or_null        # NULL or else Raw syntax for code call (at runtime) if we succeed in matching 'regex'
                )
                =
                case regex

                     REGEX_STAR regex
                         =>
                         {   fun_match_i = make_star_match_fn   (regex, fate_or_null);

                             fun_match_i;
                         };

                    REGEX_STRING s
                         =>
                         {
                             # Generate raw syntax for a set of
                             # nested functions which collectively
                             # check for the presence of target
                             # string ('s') at a given offset
                             # in the substrate string.  Later we'll
                             # include this early in a let...in...end statement:

                             fun_match_i =   make_string_match_fn (s, 0, fate_or_null);

                             fun_match_i;
                         };

                    REGEX_DOT
                         =>
                         fun_match_i
                         where
                             fun_match_i
                                 =
                                 make_dot_match_fn (0, fate_or_null);

                         end;
                esac;



            # Root compile-one-regular-expression fn.
            #
            fun regex_list_to_raw_syntax (
                    regex_list, # Remaining regular expression to translate
                    fate_or_null        # NULL or else Raw syntax for code call (at runtime) if we succeed in matching 'regex'
                )
                =
                case regex_list

                    [ regex ]
                         =>
                         regex_to_raw_syntax( regex, fate_or_null );



                    regex ! regexes
                         =>
                         {
                             # Generate raw syntax for a set of
                             # nested functions which collectively
                             # check for the presence of target
                             # string ('s') at a given offset
                             # in the substrate string.  Later we'll
                             # include this early in a let...in...end statement:
                             #
                             fate
                                 =
                                 regex_list_to_raw_syntax( regexes, fate_or_null );

                             regex_to_raw_syntax( regex, THE fate );

                         };



                    _ => raise exception REGEX_CODE_BROKEN;
                esac;



            fun make_outer_wrapper_expression (
                    regex_list  # Remaining regular expression to translate
                )
                =
                {   fate
                         =
                         regex_list_to_raw_syntax (
                             regular_expressions,
                             NULL
                         );


                    EXCEPT_EXPRESSION {

                        expression
                            =>
                            expr_let (

                                SEQUENTIAL_DECLARATIONS [
                                    substrate_eq_expression,
                                    i_eq_zero,
                                    match_end_eq_ref_zero,
                                    make_fun( match_as_apat, fate ),
                                    fun_try_match_at_all_offsets_i
                                ],

                                # match 0
                                PRE_FIXITY_EXPRESSION [
#                                   match_as_aexp,                      # For  anchored matches.
                                    try_match_at_all_offsets_as_aexp,   # For unanchors matches.
                                    to_fixity_item (INT_CONSTANT_IN_EXPRESSION 0)
                                ]
                            ),

                        rules
                            =>
                            [   CASE_RULE {
                                    pattern => subscript_as_apat.item, 
                                    expression => false_as_aexp.item
                                }
                            ]
                    };
                };


            result_expression
                 =
                 make_outer_wrapper_expression (
                     regular_expressions
                 );

            result_expression;
        };

 
};              #  package 




Comments and suggestions to: bugs@mythryl.org

PreviousUpNext