## regular-expression.pkg
# Compiled by:
#
src/app/future-lex/src/lexgen.lib# Regular expression representation and manipulation.
#
# The main points here are to:
# (1) Make it easy for an RE parser to construct
# RE expressions
# (2) Canonicalize REs for effective comparison
# (3) Implement the RE derivatives algorithm
#
# See the implementation notes for details on the derivatives
# algorithm and the canonicalization strategy.
### "Ludwig Boltzmann, who spent much of
### his life studying statistical mechanics,
### died in 1906, by his own hand.
###
### "Paul Ehrenfest, carrying on the work,
### died similarly in 1933.
###
### "Now it is our turn to study statistical mechanics."
###
### --David L. Goodstein, States of Matter
#DO set_control "compiler::trap_int_overflow" "TRUE";
stipulate
package u1w = one_word_unt; # one_word_unt is from
src/lib/std/one-word-unt.pkg package vec = vector; # vector is from
src/lib/std/src/vector.pkgherein
package regular_expression
: (weak) Regular_Expression # Regular_Expression is from
src/app/future-lex/src/regular-expression.api {
# Symbols (i.e., words)
package sym
=
package {
Point = u1w::Unt;
compare = u1w::compare;
my min_pt: u1w::Unt = 0u0;
max_pt = u1w::bitwise_not 0u0;
fun next (w: u1w::Unt) =
if (w == u1w::bitwise_not 0u0 ) w;
else w + 0u1;fi;
fun prior (w: u1w::Unt) =
if (w == 0u0 ) w;
else w - 0u1;fi;
fun is_succ (w1, w2)
=
(next w1 == w2);
};
package symbol_set
=
interval_set_g( sym ); # interval_set_g is from
src/lib/src/interval-set-g.pkg Symbol = sym::Point;
Symbol_Set = symbol_set::Set;
package sis = symbol_set;
# REs
Re
= EPSILON # Matches the empty string.
| ANY
# Matches any single symbol.
| NONE
# Matches nothing (i.e. the empty language).
| SYM_SET Symbol_Set
| CONCAT List( Re )
| CLOSURE Re
| OP ((Rator, List( Re )) )
# list length != 1 and in sorted order
| NOT Re
also Rator = OR
| AND | XOR;
# we give a total order to REs; this is useful for canonicalization
fun compare (re1, re2)
=
{ fun compare_op (OR, OR ) => EQUAL;
compare_op (OR, _ ) => LESS;
compare_op (_, OR ) => GREATER;
compare_op (AND, AND) => EQUAL;
compare_op (AND, _ ) => LESS;
compare_op (_, AND) => GREATER;
compare_op (XOR, XOR) => EQUAL;
end;
fun compare_list (result1, result2)
=
list::compare_sequences compare (result1, result2);
case (re1, re2)
#
(EPSILON, EPSILON) => EQUAL;
(EPSILON, _) => LESS;
(_, EPSILON) => GREATER;
(ANY, ANY) => EQUAL;
(ANY, _) => LESS;
(_, ANY) => GREATER;
(NONE, NONE) => EQUAL;
(NONE, _) => LESS;
(_, NONE) => GREATER;
(SYM_SET a, SYM_SET b) => sis::compare (a, b);
(SYM_SET a, _) => LESS;
(_, SYM_SET b) => GREATER;
(CONCAT a, CONCAT b) => compare_list (a, b);
(CONCAT a, _) => LESS;
(_, CONCAT b) => GREATER;
(CLOSURE a, CLOSURE b) => compare (a, b);
(CLOSURE a, _) => LESS;
(_, CLOSURE b) => GREATER;
(OP (op1, result1), OP (op2, result2))
=>
case (compare_op (op1, op2))
#
EQUAL => compare_list (result1, result2);
order => order;
esac;
(OP _, _) => LESS;
(_, OP _) => GREATER;
(NOT a, NOT b) => compare (a, b);
esac;
};
# sort = list_mergesort::sort (\\ (re1, re2) => compare (re1, re2) = LESS)
# primitive REs
any = ANY;
none = NONE;
epsilon = EPSILON;
# Canonical constructors
fun make_symbol_set c
=
if (sis::is_empty c) NONE;
elif (sis::is_universe c) ANY;
else SYM_SET c;
fi;
fun make_symbol symbol
=
make_symbol_set (sis::singleton symbol);
fun make_meld (re1, re2)
=
case (re1, re2)
(EPSILON, re2) => re2;
(re1, EPSILON) => re1;
(NONE, _) => NONE;
(_, NONE) => NONE;
(CONCAT result1, CONCAT result2) => CONCAT (result1@result2);
(re1, CONCAT result2) => CONCAT (re1 ! result2);
(CONCAT result1, re2) => CONCAT (result1 @ [re2]);
_ => CONCAT [re1, re2];
esac;
fun make_meld_list [] => EPSILON;
make_meld_list (re ! result) => make_meld (re, make_meld_list result);
end;
fun make_closure EPSILON => EPSILON;
make_closure NONE => EPSILON;
make_closure (re as CLOSURE _) => re;
make_closure re => CLOSURE re;
end;
fun merge_sis (in_res, mop)
=
{ fun is_sis (SYM_SET _) => TRUE;
is_sis _ => FALSE;
end;
my (siss, result)
=
list::partition is_sis in_res;
case siss
#
[] => in_res;
[re] => in_res;
sis ! siss'
=>
reinsert (merged, result)
where
fun wrapmop (SYM_SET s1, SYM_SET s2)
=>
SYM_SET (mop (s1, s2));
wrapmop _
=>
raise exception DIE "BUG: wrapmop: SymSet expected";
end;
merged = list::fold_forward wrapmop sis siss';
fun reinsert (re1, [])
=>
[re1];
reinsert (re1, re ! result)
=>
case (compare (re1, re))
LESS => re1 ! re ! result;
EQUAL => raise exception DIE "BUG: mergeSIS: only one SymSet expected";
GREATER => re ! (reinsert (re1, result));
esac;
end;
end;
esac;
};
fun make_or (re1, re2)
=
{ fun merge ([], result2) => result2;
merge (result1, []) => result1;
merge (re1 ! r1, re2 ! r2)
=>
case ( compare (re1, re2))
LESS => re1 ! merge (r1, re2 ! r2);
GREATER => re2 ! merge (re1 ! r1, r2);
EQUAL => merge (re1 ! r1, r2);
esac;
end;
fun mk (a, b)
=
case (merge_sis (merge (a, b), sis::union))
[] => NONE;
[re] => re;
result => OP (OR, result);
esac;
case (re1, re2)
(NONE, _) => re2;
(_, NONE) => re1;
(SYM_SET s1, SYM_SET s2)
=>
make_symbol_set (sis::union (s1, s2));
(OP (OR, result1), OP (OR, result2)) => mk (result1, result2);
(OP (OR, result1), _) => mk (result1, [re2]);
(_, OP (OR, result2)) => mk([re1], result2);
(re1, re2)
=>
case (compare (re1, re2))
LESS => OP (OR, [re1, re2]);
GREATER => OP (OR, [re2, re1]);
EQUAL => re1;
esac;
esac;
};
fun make_and (re1, re2)
=
{ fun merge ([], result2) => result2;
merge (result1, []) => result1;
merge (re1 ! r1, re2 ! r2)
=>
case (compare (re1, re2))
LESS => re1 ! merge (r1, re2 ! r2);
GREATER => re2 ! merge (re1 ! r1, r2);
EQUAL => merge (re1 ! r1, r2);
esac;
end;
fun mk (a, b)
=
case (merge_sis (merge (a, b), sis::intersect))
[] => NONE;
[re] => re;
result => OP (AND, result);
esac;
case (re1, re2)
(NONE, _) => NONE;
(_, NONE) => NONE;
(SYM_SET s1, SYM_SET s2)
=>
make_symbol_set (sis::intersect (s1, s2));
(OP (AND, result1), OP (AND, result2))
=>
mk (result1, result2);
(OP (AND, result1), _) => mk (result1, [re2]);
(_, OP (AND, result2)) => mk([re1], result2);
(re1, re2)
=>
case (compare (re1, re2))
LESS => OP (AND, [re1, re2]);
GREATER => OP (AND, [re2, re1]);
EQUAL => re1;
esac;
esac;
};
fun make_xor (re1, re2)
=
{ fun merge ([], result2) => result2;
merge (result1, []) => result1;
merge (re1 ! r1, re2 ! r2)
=>
case (compare (re1, re2))
LESS => re1 ! merge (r1, re2 ! r2);
EQUAL => merge (r1, r2);
GREATER => re2 ! merge (re1 ! r1, r2);
esac;
end;
fun mk (a, b)
=
case (merge (a, b))
[] => NONE;
[re] => re;
result => OP (XOR, result);
esac;
case (re1, re2)
(NONE, _) => re2;
(_, NONE) => re1;
(SYM_SET s1, SYM_SET s2)
=>
make_symbol_set (
sis::intersect (
sis::union (s1, s2),
sis::complement (sis::intersect (s1, s2))
)
);
(OP (XOR, result1), OP (XOR, result2)) => mk (result1, result2);
(OP (XOR, result1), _) => mk (result1, [re2]);
(_, OP (XOR, result2)) => mk([re1], result2);
(re1, re2)
=>
case ( compare (re1, re2))
LESS => OP (XOR, [re1, re2]);
GREATER => OP (XOR, [re2, re1]);
EQUAL => NONE; # XXX BUGGO FIXME is this right?
esac;
esac;
};
fun mk_op (OR, re1, re2) => make_or (re1, re2);
mk_op (AND, re1, re2) => make_and (re1, re2);
mk_op (XOR, re1, re2) => make_xor (re1, re2);
end;
fun make_not (NOT re) => re;
make_not NONE => make_closure ANY;
make_not re => NOT re;
end;
fun make_option re
=
make_or (EPSILON, re);
fun make_repetition (re, low, high)
=
{ fun low_reps 0 => EPSILON;
low_reps 1 => re;
low_reps n => make_meld (re, low_reps (n - 1));
end;
fun high_reps 0 => EPSILON;
high_reps 1 => make_option re;
high_reps n => make_meld (make_option re, high_reps (n - 1));
end;
if (high < low) raise exception INDEX_OUT_OF_BOUNDS; fi;
make_meld (low_reps low, high_reps (high - low));
};
fun make_at_least (re, 0) => make_closure re;
make_at_least (re, n) => make_meld (re, make_at_least (re, n - 1));
end;
fun is_none NONE => TRUE;
is_none _ => FALSE;
end;
fun symbol_to_string w
=
"#\"" + (char::to_string (char::from_int (u1w::to_int w))) + "\""
except
OVERFLOW = raise exception DIE "(BUG) regular_expression: symbol_to_string on a nonascii character";
fun sisto_string s
=
{ fun c2s c
=
if (c < 0u128)
char::to_string (char::from_int (u1w::to_int c));
else
string::cat ["\\u", u1w::to_string c];
fi;
fun f (a, b)
=
if (a == b)
#
c2s a;
else
cat [c2s a, "-", c2s b];
fi;
# We want to describe the interval set as concisely as possible,
# so we compare the number of intervals in the set to the number
# of intervals in its complement, and use the smaller of the two.
intervals = sis::intervals s;
intervals' = sis::intervals (sis::complement s);
my (neg, rngs)
=
if (list::length intervals < list::length intervals')
("", intervals);
else ("^", intervals');
fi;
str = neg + (string::cat (list::map f rngs));
if (string::length_in_bytes str <= 1)
str;
else
"[" + str + "]";
fi;
};
fun to_string re
=
string::cat (to_s (re, []))
where
fun op_to_string OR => "
|";
op_to_string AND => "&";
op_to_string XOR => "^";
end;
fun op_prec OR => 0;
op_prec AND => 2;
op_prec XOR => 1;
end;
fun prec ANY => 6;
prec NONE => 6;
prec EPSILON => 6;
prec (SYM_SET _) => 6;
prec (CONCAT []) => 6;
prec (CONCAT _) => 3;
prec (CLOSURE _) => 5;
prec (OP(_, [])) => 6;
prec (OP(_, [re])) => prec re;
prec (OP (operator, _)) => op_prec operator;
prec (NOT _) => 4;
end;
fun to_s (ANY, l) => "{ any }" ! l;
to_s (NONE, l) => "{ none }" ! l;
to_s (EPSILON, l) => "{ epsilon }" ! l;
to_s (SYM_SET s, l) => sisto_string s ! l;
to_s (CONCAT [], l) => "" ! l;
to_s (CONCAT [re], l) => to_s (re, l);
to_s (CONCAT result, l) => to_s'(result, 3, "", l);
to_s (CLOSURE re, l) => paren (5, re, "*" ! l);
to_s (OP (_, []), l) => "{}" ! l;
to_s (OP (operator, [re]), l) => to_s (re, l);
to_s (OP (operator, result), l) => to_s'(result, op_prec operator, op_to_string operator, l);
to_s (NOT re, l) => "!" ! paren (4, re, l);
end
also
fun to_s' ([], p, operator, l) => raise exception DIE "empty";
to_s' (re ! r, p, operator, l)
=>
paren (p, re, list::fold_backward
(\\ (re, l) => operator ! paren (p, re, l); end )
l r);
end
also
fun paren (p, re, l)
=
if (p <= prec re)
to_s (re, l);
else
"(" ! to_s (re, ")" ! l);
fi;
end;
# TRUE iff epsilon is in the language recognized by the RE
fun nullable ANY => FALSE;
nullable NONE => FALSE;
nullable EPSILON => TRUE;
nullable (SYM_SET _) => FALSE;
nullable (CLOSURE _) => TRUE;
nullable (CONCAT result) => list::all nullable result;
nullable (OP (OR, result)) => list::exists nullable result;
nullable (OP (AND, result)) => list::all nullable result;
nullable (OP (XOR, re ! r))
=>
(nullable re and not (list::exists nullable r))
or nullable (OP (XOR, r));
nullable (OP (XOR, []))
=>
raise exception DIE "(BUG) RegExpression: RE operator has no operands";
nullable (NOT re)
=>
not (nullable re);
end;
fun delta re
=
if (nullable re)
EPSILON;
else
NONE;
fi;
# Compute derivative w.r.t. a symbol
fun derivative a
=
da
where
fun da ANY => EPSILON;
da NONE => NONE;
da EPSILON => NONE;
da (SYM_SET s)
=>
if (sis::member (s, a)) EPSILON;
else NONE;
fi;
da (re as CLOSURE re')
=>
make_meld (da re', re);
da (CONCAT []) => NONE;
da (CONCAT [re]) => da re;
da (CONCAT (re ! result))
=>
make_or(
make_meld_list((da re) ! result),
make_meld (delta re, da (CONCAT result))
);
da (OP(_, [])) => raise exception DIE "(BUG) RegExpression: RE operator has no operands";
da (OP (operator, [re])) => da re;
da (OP (operator, re ! result)) => mk_op (operator, da re, da (OP (operator, result)));
da (NOT re) => make_not (da re);
end;
end;
package map
=
red_black_map_g ( # red_black_map_g is from
src/lib/src/red-black-map-g.pkg package {
Key = vec::Vector( Re );
compare = vec::compare_sequences compare;
}
);
# Find the smallest partitioning of the alphabet that
# "respects" the given sets. If S is one of the sets
# returned by compress, then it must be either disjoint
# with or a subset of each of the sets in the sets
# parameter. see the implementation notes for more detail.
#
fun compress sets
=
list::fold_forward part1 [] (sis::universe ! sets)
where
# Do partition of a set againt a list of sets,
# assuming the list of sets is pairwise disjoint:
#
fun part1 (set, [])
=>
if (sis::is_empty set ) [];
else [set];
fi;
part1 (set1, set2 ! ss)
=>
if (sis::is_empty set1 )
set2 ! ss;
else
i = sis::intersect (set1, set2);
if (sis::is_empty i )
(set2 ! (part1 (set1, ss)));
else
s1 = sis::difference (set1, i);
s2 = sis::difference (set2, i);
ss' = if (sis::is_empty s1) ss;
else part1 (s1, ss);
fi;
if (sis::is_empty s2 )
(i ! ss');
else
(i ! s2 ! ss');
fi;
fi;
fi;
end;
end;
fun derivatives (result: vec::Vector( Re ))
=
ilks (sets', map::empty)
where
# Ds is the "factoring function"
fun ds ANY => [sis::universe];
ds NONE => [];
ds EPSILON => [];
ds (SYM_SET s) => [s];
ds (CLOSURE re) => ds re;
ds (CONCAT []) => [];
ds (CONCAT [re]) => ds re;
ds (CONCAT (re ! result))
=>
if (nullable re)
(ds re) @ (ds (CONCAT result));
else
ds re;
fi;
ds (OP (operator, result))
=>
list::cat (map ds result);
ds (NOT re)
=>
ds re;
end;
sets = vec::fold_forward
(\\ (re, sets) => (ds re) @ sets; end )
[] result;
sets' = compress sets;
fun ilks ([], ilk_map)
=>
map::keyvals_list ilk_map;
ilks (set ! sets, ilk_map)
=>
{ # use first element as representative of the equiv ilk
my (rep, _)
=
list::head (sis::intervals set);
derivs = vec::map (derivative rep) result;
case (map::get (ilk_map, derivs))
#
NULL => ilks (sets, map::set (ilk_map, derivs, set));
THE set'
=>
{ map' = map::set (ilk_map,
derivs,
sis::union (set, set'));
ilks (sets, map');
};
esac;
};
end;
end;
};
end;
## COPYRIGHT (c) 2005 John Reppy (http://www.cs.uchicago.edu/~jhr) Aaron Turon (adrassi@gmail.com)
## Subsequent changes by Jeff Prothero Copyright (c) 2010-2015,
## released per terms of SMLNJ-COPYRIGHT.