## regex-to-raw-syntax.pkg
# Compiled by:
#
src/lib/compiler/front/parser/parser.sublib# This was an early 'Perl7' idea to speed up regex
# execution speed by compiling hard code for it.
# The results were unimpressive. I think that the
# fact that the input string winds up being scanned
# one character at a time (typically) means that
# there is very little win in practice from
# eliminating the interpretation step of the
# regex itself.
#
# This file and related stuff should probably be deleted
# unless it can be repurposed for something.
#
# -- 2009-10-30 CrT
# Compiled by:
#
src/lib/compiler/front/parser/parser.sublib### "Mathematics, rightly viewed, possesses not only truth,
### but supreme beauty -- a beauty cold and austere,
### like that of sculpture, without appeal to any part
### of our weaker nature, without the gorgeous trappings
### of paintings or music, yet sublimely pure and capable
### of a stern perfection such as only the greatest art can show."
###
### -- Bertrand Russell
package regex_to_raw_syntax
: (weak) Regex_To_Raw_Syntax # Regex_To_Raw_Syntax is from
src/lib/compiler/front/parser/raw-syntax/regex-to-raw-syntax.api{
include package raw_syntax;
include package error_message;
include package symbol;
include package fast_symbol;
include package raw_syntax_junk;
include package fixity;
# A simple syntax tree for regular expressions:
Regular_Expression
= REGEX_STRING String
| REGEX_DOT
| REGEX_STAR Regular_Expression
;
exception REGEX_CODE_BROKEN;
fun regex_to_raw_syntax (expression, regular_expressions, expressionleft, expressionright, regular_expressionsright)
=
{ eqeq_as_rawsym = raw_symbol (eqeq_hash, eqeq_string);
(make_value_and_fixity_symbols eqeq_as_rawsym)
->
(v, f);
eqeq_as_aexp = { item => VARIABLE_IN_EXPRESSION [v],
source_code_region => (expressionleft, regular_expressionsright),
fixity => THE f
};
fun make_raw name_string
=
raw_symbol (hash_string::hash_string name_string, name_string);
fun pat_and_aexp_syms name_string
=
{ raw_sym = make_raw name_string;
my (v, f) = make_value_and_fixity_symbols raw_sym;
( { item => VARIABLE_IN_PATTERN [v],
source_code_region => (expressionleft, regular_expressionsright),
fixity => THE f
},
{ item => VARIABLE_IN_EXPRESSION [v],
source_code_region => (expressionleft, regular_expressionsright),
fixity => THE f
}
);
};
fun vb_val_naming (apat, expression)
=
NAMED_VALUE {
expression,
pattern => PRE_FIXITY_PATTERN [ apat ],
is_lazy => FALSE
};
fun aexp_val_naming (apat, expression)
=
{ vb = vb_val_naming (apat, expression);
VALUE_DECLARATIONS ([ vb ], NIL);
};
fun aexp_let (declaration, expression)
=
LET_EXPRESSION { declaration, expression };
fun to_fixity_item item
=
{ item,
source_code_region => (expressionleft, expressionright),
fixity => NULL
};
fun dot_exp_let (declaration, expression)
=
[ { item => aexp_let (declaration, expression),
source_code_region => (expressionleft, expressionright),
fixity => NULL
}
];
fun expr_let (declaration, expression)
=
PRE_FIXITY_EXPRESSION (dot_exp_let (declaration, expression));
fun aexp_package_part (package_name, id_name)
=
{ p = make_package_symbol (make_raw package_name);
i = make_value_symbol (make_raw id_name);
VARIABLE_IN_EXPRESSION [p, i];
};
my (substrate_as_apat, substrate_as_aexp) = pat_and_aexp_syms "substrate";
my (subscript_as_apat, subscript_as_aexp) = pat_and_aexp_syms "INDEX_OUT_OF_BOUNDS";
my ( false_as_apat, false_as_aexp) = pat_and_aexp_syms "FALSE";
my ( deref_as_apat, deref_as_aexp) = pat_and_aexp_syms "deref";
my ( c_as_apat, c_as_aexp) = pat_and_aexp_syms "c";
my ( i_as_apat, i_as_aexp) = pat_and_aexp_syms "i";
my ( loop_as_apat, loop_as_aexp) = pat_and_aexp_syms "loop";
my ( match_as_apat, match_as_aexp) = pat_and_aexp_syms "match";
my ( match2_as_apat, match2_as_aexp) = pat_and_aexp_syms "match2";
my (match_end_as_apat, match_end_as_aexp) = pat_and_aexp_syms "match_end";
my ( s_as_apat, s_as_aexp) = pat_and_aexp_syms "s";
my ( ref_as_apat, ref_as_aexp) = pat_and_aexp_syms "REF";
my ( nada_as_apat, nada_as_aexp) = pat_and_aexp_syms "_";
my ( plus_as_apat, plus_as_aexp) = pat_and_aexp_syms "+";
my ( bangeq_as_apat, bangeq_as_aexp) = pat_and_aexp_syms "!=";
my ( coloneq_as_apat, coloneq_as_aexp) = pat_and_aexp_syms ":=";
my ( try_match_at_all_offsets_as_apat, try_match_at_all_offsets_as_aexp)
=
pat_and_aexp_syms "try_match_at_all_offsets";
fun dot_exp_int i
=
[ to_fixity_item (INT_CONSTANT_IN_EXPRESSION i) ];
# Synthesize the raw syntax for a
#
# fun name i = body
#
# declaration:
#
fun make_fun (name_as_apat, body)
=
FUNCTION_DECLARATIONS (
[ NAMED_FUNCTION {
pattern_clauses
=>
[
PATTERN_CLAUSE {
result_type => NULL,
patterns => [ name_as_apat, i_as_apat ],
expression => body
}
],
kind => PLAIN_FUN,
is_lazy => FALSE,
null_or_type => NULL
}
],
NIL
);
substrate_eq_expression = aexp_val_naming ( substrate_as_apat, expression );
i_eq_zero = aexp_val_naming ( i_as_apat, PRE_FIXITY_EXPRESSION (dot_exp_int 0) );
match_end_eq_ref_zero
=
aexp_val_naming (
match_end_as_apat,
PRE_FIXITY_EXPRESSION [
ref_as_aexp,
to_fixity_item (INT_CONSTANT_IN_EXPRESSION 0)
]
);
# We match amy single character but newline.
#
# Here we define a parse-time function to
# generate the raw syntax declaring a function
# 'match' like one of the two below:
#
# # Final case:
# fun match i
# =
# let c = string::get_byte_as_char ( substrate, i )
# in
# c != '\n';
# end
#
# # Non-final case:
# fun match i
# =
# let fun match i = ...
# c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# in
# c != '\n'
# and
# match i;
# end
#
fun make_dot_match_fn (i, fate_or_null)
=
if (fate_or_null == NULL)
# This is the "end of target_string" case,
# with no further recursive calls needed:
#
# let c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# (match_end := i); # Publish location of end of match (plus one).
# in
# c != '\n';
# end
expr_let (
SEQUENTIAL_DECLARATIONS [
# "c = string::get_byte_as_char ( substrate, i );"
aexp_val_naming (
c_as_apat,
PRE_FIXITY_EXPRESSION [
to_fixity_item (aexp_package_part ("string", "sub") ),
to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
]
),
# i = i + 1
aexp_val_naming (
i_as_apat,
PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
),
# (match_end := i)
aexp_val_naming (
nada_as_apat,
PRE_FIXITY_EXPRESSION [ match_end_as_aexp, coloneq_as_aexp, i_as_aexp ]
)
],
# c != '\n'
PRE_FIXITY_EXPRESSION [
c_as_aexp,
bangeq_as_aexp,
to_fixity_item (
CHAR_CONSTANT_IN_EXPRESSION "\n"
)
]
);
else
# This is the "before end of target_string" case,
# with further recursive calls needed for full match:
#
# let fun match i = ...
# c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# in
# c != '\n'
# and
# match i;
# end
expr_let (
SEQUENTIAL_DECLARATIONS [
# Compile in declaration of our fate fn as
#
# fun match i = ...
#
case fate_or_null
THE fate
=>
make_fun( match_as_apat, fate );
NULL => raise exception REGEX_CODE_BROKEN;
esac,
# "c = string::get_byte_as_char ( substrate, i );"
aexp_val_naming (
c_as_apat,
PRE_FIXITY_EXPRESSION [
to_fixity_item (aexp_package_part ("string", "sub") ),
to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
]
),
# i = i + 1
aexp_val_naming (
i_as_apat,
PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
)
],
# c != '\n'
# and
# match i;
AND_EXPRESSION (
PRE_FIXITY_EXPRESSION [
c_as_aexp,
bangeq_as_aexp,
to_fixity_item (
CHAR_CONSTANT_IN_EXPRESSION "\n"
)
],
PRE_FIXITY_EXPRESSION [ match_as_aexp, i_as_aexp ]
)
);fi;
# We match a length-N constant string using
# N simple functions, each of which checks
# one character and then either gives up or
# calls the next function in the chain. The
# last such function is a special case, so
# we need to be able to compile two sorts of
# these functions.
#
# Here we define a parse-time function to
# generate the raw syntax declaring a function
# 'match' like one of the two below, where
# 'h' and 'e' are illustrative character
# constants from the regular expression string:
#
# # Final case:
# fun match i
# =
# let c = string::get_byte_as_char ( substrate, i )
# in
# c == 'e';
# end
#
# # Non-final case:
# fun match i
# =
# let fun match i = ...
# c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# in
# c == 'h'
# and
# match i;
# end
#
fun make_string_match_fn (pattern_string, i, fate_or_null)
=
if ( (i + 1) == (size pattern_string)
and
fate_or_null == NULL
)
# This is the "end of target_string" case,
# with no further recursive calls needed:
#
# let c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# (match_end := i); # Publish location of end of match (plus one).
# in
# c == 'e';
# end
expr_let (
SEQUENTIAL_DECLARATIONS [
# "c = string::get_byte_as_char ( substrate, i );"
aexp_val_naming (
c_as_apat,
PRE_FIXITY_EXPRESSION [
to_fixity_item (aexp_package_part ("string", "sub") ),
to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
]
),
# i = i + 1
aexp_val_naming (
i_as_apat,
PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
),
# (match_end := i)
aexp_val_naming (
nada_as_apat,
PRE_FIXITY_EXPRESSION [ match_end_as_aexp, coloneq_as_aexp, i_as_aexp ]
)
],
# c == 'h'
PRE_FIXITY_EXPRESSION [
c_as_aexp,
eqeq_as_aexp,
to_fixity_item (
CHAR_CONSTANT_IN_EXPRESSION (
string::substring (pattern_string, i, 1)
)
)
]
);
else
# This is the "before end of target_string" case,
# with further recursive calls needed for full match:
#
# let fun match i = ...
# c = string::get_byte_as_char ( substrate, i )
# i = i + 1
# in
# c == 'h'
# and
# match i;
# end
expr_let (
SEQUENTIAL_DECLARATIONS [
# Here we need to define our fate fn as
#
# fun match i = ...
#
# for some suitable '...'.
#
# If we have not yet reached the end of the pattern_string
# which we are assigned to match, then we need a fate
# function which checks the next character in pattern_string.
# Otherwise, our 'match' fn here will be the fate fn
# passed in by our caller, which will try to match the next
# regular expression element after our pattern_string.
#
if ((i + 1) < (size pattern_string))
make_fun (
match_as_apat,
make_string_match_fn (pattern_string, i + 1, fate_or_null)
);
else
case fate_or_null
THE fate
=>
make_fun( match_as_apat, fate );
NULL => raise exception REGEX_CODE_BROKEN;
esac;
fi,
# "c = string::get_byte_as_char ( substrate, i );"
aexp_val_naming (
c_as_apat,
PRE_FIXITY_EXPRESSION [
to_fixity_item (aexp_package_part ("string", "sub") ),
to_fixity_item (TUPLE_EXPRESSION [ substrate_as_aexp.item, i_as_aexp.item ] )
]
),
# i = i + 1
aexp_val_naming (
i_as_apat,
PRE_FIXITY_EXPRESSION [ i_as_aexp, plus_as_aexp, to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1) ]
)
],
# c == 'h'
# and
# match i;
AND_EXPRESSION (
PRE_FIXITY_EXPRESSION [
c_as_aexp,
eqeq_as_aexp,
to_fixity_item (
CHAR_CONSTANT_IN_EXPRESSION (
string::substring (pattern_string, i, 1)
)
)
],
PRE_FIXITY_EXPRESSION [ match_as_aexp, i_as_aexp ]
)
);fi;
# Here we set up directly by hand the raw syntax
# defining the below function. This syntax will
# be included in a let...in...end statement. The
# purpose of this function is to try matching a
# given constant string (implicitly defined by
# fun 'match') at offsets i, i+1, i+2... until either
# a successful match is found or a INDEX_OUT_OF_BOUNDS exception
# is thrown, meaning that we've run out of input string.
#
# fun try_match_at_all_offsets (i)
# =
# match i
# or
# try_match_at_all_offsets (i+1)
#
fun_try_match_at_all_offsets_i
=
make_fun (
try_match_at_all_offsets_as_apat,
OR_EXPRESSION (
PRE_FIXITY_EXPRESSION [
match_as_aexp,
i_as_aexp
],
PRE_FIXITY_EXPRESSION [
try_match_at_all_offsets_as_aexp,
to_fixity_item (
PRE_FIXITY_EXPRESSION [
i_as_aexp,
plus_as_aexp,
to_fixity_item (INT_CONSTANT_IN_EXPRESSION 1)
]
)
]
)
);
# We want to generate raw syntax for a function
# which finds as many matches of 'match'
# as possible in a row followed by a match
# of 'match2'.
#
# If 'match' and 'match2' always returned a
# boolean value, we could use the code:
#
# fun x i
# =
# let fun match i = ... # Pattern to repeat.
# fun match2 i = ... # Fate.
#
# fun loop i
# =
# (match i and loop *match_end)
# or
# (match2 i)
# in
# loop i
# end
#
# In practice, 'match' and 'match2' may throw a
# INDEX_OUT_OF_BOUNDS exception, so we need to trap those
# and treat them the same as FALSE:
#
#
# fun x i
# =
# let fun match i = ... # Pattern to repeat.
# fun match2 i = ... # Fate.
#
# fun loop i
# =
# ( (match i and loop *match_end)
# except
# INDEX_OUT_OF_BOUNDS = FALSE
# )
# or
# (match2 i)
# in
# loop i
# end
#
fun make_star_match_fn (regex, fate_or_null)
=
case fate_or_null
NULL => raise exception REGEX_CODE_BROKEN;
THE fate
=>
expr_let (
SEQUENTIAL_DECLARATIONS [
make_fun (
match_as_apat,
regex_to_raw_syntax (regex, NULL)
),
make_fun (
match2_as_apat,
fate
),
make_fun (
loop_as_apat,
OR_EXPRESSION (
# (match i and loop *match_end)
# except
# INDEX_OUT_OF_BOUNDS = FALSE
#
EXCEPT_EXPRESSION {
expression
=>
AND_EXPRESSION (
# match i
PRE_FIXITY_EXPRESSION [
match_as_aexp,
i_as_aexp
],
# loop *match_end
PRE_FIXITY_EXPRESSION [
loop_as_aexp,
to_fixity_item (
PRE_FIXITY_EXPRESSION [
deref_as_aexp,
match_end_as_aexp
]
)
]
),
rules
=>
[ CASE_RULE {
pattern => subscript_as_apat.item,
expression => false_as_aexp.item
}
]
},
# match2 i
PRE_FIXITY_EXPRESSION [
match2_as_aexp,
i_as_aexp
]
)
)
],
# loop i
PRE_FIXITY_EXPRESSION [
loop_as_aexp,
i_as_aexp
]
);
esac
also
fun regex_to_raw_syntax (
regex, # Regular expression element to translate
fate_or_null # NULL or else Raw syntax for code call (at runtime) if we succeed in matching 'regex'
)
=
case regex
REGEX_STAR regex
=>
{ fun_match_i = make_star_match_fn (regex, fate_or_null);
fun_match_i;
};
REGEX_STRING s
=>
{
# Generate raw syntax for a set of
# nested functions which collectively
# check for the presence of target
# string ('s') at a given offset
# in the substrate string. Later we'll
# include this early in a let...in...end statement:
fun_match_i = make_string_match_fn (s, 0, fate_or_null);
fun_match_i;
};
REGEX_DOT
=>
fun_match_i
where
fun_match_i
=
make_dot_match_fn (0, fate_or_null);
end;
esac;
# Root compile-one-regular-expression fn.
#
fun regex_list_to_raw_syntax (
regex_list, # Remaining regular expression to translate
fate_or_null # NULL or else Raw syntax for code call (at runtime) if we succeed in matching 'regex'
)
=
case regex_list
[ regex ]
=>
regex_to_raw_syntax( regex, fate_or_null );
regex ! regexes
=>
{
# Generate raw syntax for a set of
# nested functions which collectively
# check for the presence of target
# string ('s') at a given offset
# in the substrate string. Later we'll
# include this early in a let...in...end statement:
#
fate
=
regex_list_to_raw_syntax( regexes, fate_or_null );
regex_to_raw_syntax( regex, THE fate );
};
_ => raise exception REGEX_CODE_BROKEN;
esac;
fun make_outer_wrapper_expression (
regex_list # Remaining regular expression to translate
)
=
{ fate
=
regex_list_to_raw_syntax (
regular_expressions,
NULL
);
EXCEPT_EXPRESSION {
expression
=>
expr_let (
SEQUENTIAL_DECLARATIONS [
substrate_eq_expression,
i_eq_zero,
match_end_eq_ref_zero,
make_fun( match_as_apat, fate ),
fun_try_match_at_all_offsets_i
],
# match 0
PRE_FIXITY_EXPRESSION [
# match_as_aexp, # For anchored matches.
try_match_at_all_offsets_as_aexp, # For unanchors matches.
to_fixity_item (INT_CONSTANT_IN_EXPRESSION 0)
]
),
rules
=>
[ CASE_RULE {
pattern => subscript_as_apat.item,
expression => false_as_aexp.item
}
]
};
};
result_expression
=
make_outer_wrapper_expression (
regular_expressions
);
result_expression;
};
}; # package