# perl-regex-parser-g.pkg
#
# This module implements a subset of Perl regular expression syntax.
#
# WARNING: there is no locale support!
#
# The meta characters are:
# "\" "^" "$" "." "[" "]" "
|" "(" ")" "*" "+" "?"
#
# \ Quote the next metacharacter
# ^ Match the beginning of the line
# . Match any character (except newline)
# $ Match the end of the line (or before newline at the end)
#
| Alternation
# () Grouping
# [] Character class
#
#
# The following standard quantifiers are recognized:
#
# * Match 0 or more times
# + Match 1 or more times
# ? Match 1 or 0 times
# { n } Match exactly n times
# { n,} Match at least n times
# { n, m } Match at least n but not more than m times
#
# \033 octal char (think of a PDP-11)
# \x1B hex char
# \x { 263a } wide hex char (Unicode SMILEY)
# \c[ control char
# \N { name } named char
# \l lowercase next char (think vi)
# \u uppercase next char (think vi)
# \L lowercase till \E (think vi)
# \U uppercase till \E (think vi)
# \E end case modification (think vi)
# \Q quote (disable) pattern metacharacters till \E
#
# \pP Match P, named property. Use \p { Prop } for longer names.
# \PP Match non-P
# \C Match a single C char (octet) even under utf8.
#
# Perl defines the following zero-width assertions:
#
# \b Match a word boundary
# \B Match a non-(word boundary)
# \A Match only at beginning of string
# \Z Match only at end of string, or before newline at the end
# \z Match only at end of string
# Compiled by:
#
src/lib/std/standard.lib# Invoked by:
#
src/lib/regex/front/perl-regex-parser.pkggeneric package perl_regex_parser_g (
r: Abstract_Regular_Expression
)
: (weak)
api {
include api Generalized_Regular_Expression_Parser
where
r == r;
# With user supplied error handler
scan': ((X, String) -> Null_Or ((r::Abstract_Regular_Expression, X)) )
-> number_string::Reader (Char, X)
-> number_string::Reader (r::Abstract_Regular_Expression, X);
}
=
package {
package r = r;
package s
=
generic_regular_expression_syntax_g (
package r = r;
package s = r::char_set;
fun char c
=
r::char::from_int (char::to_int c);
fun make_set s
=
s::add_list (s::empty, map char (string::explode s));
dangling_modifiers = FALSE; # Don't allow things like /?/
Escape
= CHAR r::char::Char
| MATCH_SET s::Set
| NONMATCH_SET s::Set
| REGEXP r::Abstract_Regular_Expression
| CHARCODE number_string::Radix
| CTRL
| BACKREF ((String -> String), Int)
| ERROR String
;
Context = IN_CHARSET
| IN_REGEXP;
# \t tab (HT, TAB)
# \n newline (LF, NL)
# \r return (CR)
# \f form feed (FF)
# \a alarm (bell) (BEL)
# \e escape (think troff) (ESC)
tab = CHAR (char '\t');
nl = CHAR (char '\n');
cr = CHAR (char '\r');
ff = CHAR (char '\f');
bel = CHAR (char '\a');
esc = CHAR (char '\x1b'); # NOTE: SML uses decimal, but Mythryl follows C in using octal for \ddd
# IMPORTANT NOTE: perl's . also matches \0
dot = r::NONMATCH_SET (make_set "\n");
# What perl means:
fun is_word c
=
r::char::is_alphanumeric c
or
r::char::to_int c == 95;
#
# \w Match a "word" character (alphanumeric plus "_")
# \W Match a non-word character
# \s Match a whitespace character
# \S Match a non-whitespace character
# \d Match a digit character
# \D Match a non-digit character
fun every p
=
{ fun iter (i, s)
=
{ s = if (p i) i ! s;
else s;
fi;
if (r::char::(>=) (i, r::char::max_char))
s;
else
iter (r::char::next i, s);
fi;
};
s::add_list ( s::empty,
iter (r::char::min_char, [])
);
};
words = r::add_range (s::empty, char 'a', char 'z');
words = r::add_range (words, char 'A', char 'Z');
words = r::add_range (words, char '0', char '9');
words = s::add (words, char '_');
word = MATCH_SET words;
nonword = NONMATCH_SET words;
spaces = every r::char::is_space;
space = MATCH_SET spaces;
nonspace = NONMATCH_SET spaces;
digits = r::add_range (s::empty, char '0', char '9');
digit = MATCH_SET digits;
nondigit = NONMATCH_SET digits;
hex = CHARCODE number_string::HEX;
oct = CHARCODE number_string::OCTAL;
# Perl defines the following zero-width assertions:
#
# \b Match a word boundary
# \B Match a non-(word boundary)
# \A Match only at beginning of string
# \Z Match only at end of string, or before newline at the end
# \z Match only at end of string
# word at begining of string
fun is_word_boundary { prev=>NULL, this=>THE c, next } => is_word c; # word at end of string
is_word_boundary { prev=>THE c, this=>NULL, next } => is_word c;
is_word_boundary { prev=>NULL, this=>NULL, next } => FALSE; # empty string
is_word_boundary { prev=>THE x, this=>THE y, next }
=>
if (is_word x) not (is_word y);
else (is_word y);
fi;
end;
fun is_start_of_string { prev=>NULL, this, next } => TRUE;
is_start_of_string _ => FALSE;
end;
fun is_end_of_string { this=>NULL, prev, next } => TRUE;
is_end_of_string _ => FALSE;
end;
fun is_end_of_string'{ this=>NULL , prev, next } => TRUE;
is_end_of_string'{ this=>THE c, next=>NULL, ... } => r::char::to_int c == 10;
is_end_of_string' _ => FALSE;
end;
word_b = REGEXP (r::BOUNDARY is_word_boundary);
nonword_b = REGEXP (r::BOUNDARY (not o is_word_boundary));
begin_b = REGEXP (r::BOUNDARY is_start_of_string);
end_b = REGEXP (r::BOUNDARY is_end_of_string);
end_b' = REGEXP (r::BOUNDARY is_end_of_string');
Callbackdata = Void;
# Handle quoting of
# \Q ... \E
# \U ... \E to upper case
# \L ... \E to lower case
# All meta characters are treated as normal within these.
# I think the semantics here is the same as perl's.
#
fun quote transform context getc s
=
{ fun loop (s, chars)
=
case (getc s)
NULL
=>
THE (ERROR "missing \\E", s);
THE (c as '\\', s)
=>
case (getc s)
THE ('E', s)
=>
done (chars, s);
_ =>
loop (s, char (transform c) ! chars);
esac;
THE (c, s)
=>
loop (s, char (transform c) ! chars);
esac
also
fun done (chars, s)
=
case (context, chars)
(_, [c]) => THE (CHAR c, s);
(IN_REGEXP, cs) => THE (REGEXP (r::CONCAT (reverse (map r::CHAR cs))), s);
(IN_CHARSET, cs) => THE (MATCH_SET (s::add_list (s::empty, cs)), s);
esac;
loop (s, []);
};
# Callback for escape sequences:
#
fun escape data context getc s
=
case (getc s, context)
#
(NULL, _) => NULL;
# Simple escapes:
#
(THE('t', s), _) => THE (tab, s);
(THE('n', s), _) => THE (nl, s);
(THE('r', s), _) => THE (cr, s);
(THE('f', s), _) => THE (ff, s);
(THE('a', s), _) => THE (bel, s);
(THE('e', s), _) => THE (esc, s);
# Character codes; no unicode support yet!
#
(THE('0', s), _) => THE (oct, s);
(THE('x', s), _) => THE (hex, s);
(THE('c', s), _) => THE (CTRL, s);
# Not yet supported
(THE('N', s), _) => THE (ERROR "named character", s);
(THE('l', s), _) => THE (ERROR "lowercase next char", s);
(THE('u', s), _) => THE (ERROR "uppercase next char", s);
# Character set abbreviations:
#
(THE('w', s), _) => THE (word, s);
(THE('W', s), _) => THE (nonword, s);
(THE('s', s), _) => THE (space, s);
(THE('S', s), _) => THE (nonspace, s);
(THE('d', s), _) => THE (digit, s);
(THE('D', s), _) => THE (nondigit, s);
# Quoting:
#
(THE('Q', s), _) => quote (\\ x=x) context getc s;
(THE('L', s), _) => quote char::to_lower context getc s;
(THE('U', s), _) => quote char::to_upper context getc s;
# Boundary operators; these cannot appear in a char set:
#
(THE('b', s), IN_REGEXP) => THE (word_b, s);
(THE('B', s), IN_REGEXP) => THE (nonword_b, s);
(THE('A', s), IN_REGEXP) => THE (begin_b, s);
(THE('Z', s), IN_REGEXP) => THE (end_b', s);
(THE('z', s), IN_REGEXP) => THE (end_b, s);
# Property
/* What are these?
| (THE('p', s), _) => (THE PROPERTY, s)
| (THE('P', s), _) => (THE NONPROPERTY, s)
*/
(THE (c, s'), _)
=>
if (char::is_digit c and context == IN_REGEXP)
# It is a back reference.
# BUG: int::scan is too greedy.
# How to handle things like \11 where XXX BUGGO FIXME
# is it \1 and the character 1?
case (int::scan number_string::DECIMAL getc s)
#
THE (i, s) => THE (BACKREF (\\ x = x, i), s);
NULL => THE (ERROR "back reference error", s);
esac;
else
# By default just treat the character literally
THE (CHAR (char c), s');
fi;
esac;
); # package s
fun scan' err getc
=
s::scan { data=>(), backslash=> '\\', error=>err } getc;
fun scan getc
=
scan' (\\ _ = raise exception r::CANNOT_PARSE) getc;
};