## protostring.pkg
# Compiled by:
# src/lib/core/init/init.cmi
# Some common operations that are used by
# both the 'string' and 'substring' packages.
stipulate
package it = inline_t; # inline_t is from
src/lib/core/init/built-in.pkg package rt = runtime; # runtime is from src/lib/core/init/built-in.pkg.
#
include package base_types; # base_types is from
src/lib/core/init/built-in.pkg #
infix my 80 * / % mod div ;
infix my 70 $ ^ + - ;
infixr my 60 . ! @ << >> >>> ;
infix my 50 > < >= <= == != ;
infix my 40 := o ;
infix my 10 then ;
#
herein
# This package is referenced in:
#
# src/lib/core/init/init.cmi
#
package protostring {
# ===========
#
stipulate
#
include package proto_pervasive; # proto_pervasive is from
src/lib/core/init/proto-pervasive.pkg package c = it::char; # inline_t is from
src/lib/core/init/built-in.pkg (+) = it::default_int::(+);
(-) = it::default_int::(-);
(*) = it::default_int::(*);
(quot) = it::default_int::quot;
(<) = it::default_int::(<);
(<=) = it::default_int::(<=);
(>) = it::default_int::(>);
(>=) = it::default_int::(>=);
# (==) = it::(==);
unsafe_get = it::vector_of_chars::get_byte_as_char;
unsafe_set = it::vector_of_chars::set_char_as_byte;
unsafe_create = rt::asm::make_string; # "rt" == "runtime"
max_size = core::maximum_vector_length;
size = it::vector_of_chars::length;
herein
# Allocate an uninitialized string of given length (with a size check)
fun create n
=
if (it::default_int::ltu (max_size, n))
raise exception core::SIZE;
else
unsafe_create n;
fi;
# A vector of single character strings # NB: 'chars' is (also) used as ps::chars by fun str in
src/lib/core/init/pervasive.pkg #
chars = rt::asm::make_typeagnostic_ro_vector (c::max_ord+1, next 0) # "rt" == "runtime".
where
fun next i
=
if (i <= c::max_ord)
#
s = unsafe_create 1;
unsafe_set (s, 0, c::chr i);
s ! next (i+1);
else
[];
fi;
end;
fun unsafe_substring (_, _, 0)
=>
"";
unsafe_substring (s, i, 1)
=>
it::poly_vector::get (chars, it::cast (unsafe_get (s, i)));
unsafe_substring (s, i, n)
=>
ss
where
ss = unsafe_create n;
fun copy j
=
if (j != n)
#
unsafe_set (ss, j, unsafe_get (s, i+j));
copy (j+1);
fi;
copy 0;
end;
end;
# Concatenate a pair of non-empty strings:
#
fun meld2 (x, y)
=
ss
where
xl = size x;
yl = size y;
ss = create (xl+yl);
fun copyx n
=
if (n != xl)
#
unsafe_set (ss, n, unsafe_get (x, n));
copyx (n+1);
fi;
fun copyy n
=
if (n != yl)
#
unsafe_set (ss, xl+n, unsafe_get (y, n));
copyy (n+1);
fi;
copyx 0;
copyy 0;
end;
# Given a reverse order list of strings
# and a total length, return the
# concatenation of the list.
#
fun rev_meld (0, _)
=>
"";
rev_meld (1, lst)
=>
find lst
where
fun find ("" ! r) => find r;
find (s ! _) => s;
find _ => ""; # * impossible *
end;
end;
rev_meld (tot_len, lst)
=>
ss
where
ss = create tot_len;
fun copy ([], _)
=>
();
copy (s ! r, i)
=>
{ len = size s;
i = i - len;
fun copy' j
=
if (j != len)
#
unsafe_set (ss, i+j, unsafe_get (s, j));
copy'(j+1);
fi;
copy' 0;
copy (r, i);
};
end;
copy (lst, tot_len);
end;
end;
# Map a translation function across
# the characters of a substring:
#
fun translate (tr, s, i, n)
=
map_list (i, 0, [])
where
stop = i+n;
fun map_list (j, result_length, result_list)
=
if (j < stop)
s' = tr (unsafe_get (s, j));
map_list (j + 1, result_length + size s', s' ! result_list);
else
rev_meld (result_length, result_list);
fi;
end;
# Implode a non-empty list of characters
# into a string where both the length and
# list are given as arguments:
#
fun implode (len, cl)
=
ss
where
ss = create len;
#
fun copy ([], _) => ();
#
copy (c ! r, i)
=>
{ it::vector_of_chars::set_char_as_byte (ss, i, c);
#
copy (r, i+1);
};
end;
copy (cl, 0);
end;
# Implode a reversed non-empty list of characters
# into a string where both the length and list
# are given as arguments:
#
fun rev_implode (len, cl)
=
ss
where
ss = create len;
#
fun copy ([], _) => ();
#
copy (c ! r, i) => { it::vector_of_chars::set_char_as_byte (ss, i, c);
#
copy (r, i - 1);
};
end;
copy (cl, len - 1);
end;
fun is_prefix (s1, s2, i2, n2)
=
(n2 >= n1 and eq (0, i2))
where
n1 = size s1;
fun eq (i, j)
=
i >= n1
or
( unsafe_get (s1, i) == unsafe_get (s2, j)
and
eq (i+1, j+1)
);
end;
fun compare_sequences compare_fn (s1, i1, n1, s2, i2, n2)
=
compare' 0
where
my (n, order)
=
if (n1 == n2) (n1, EQUAL);
elif (n1 < n2) (n1, LESS);
else (n2, GREATER);
fi;
fun compare' i
=
if (i == n)
order;
else
c1 = unsafe_get (s1, i1+i);
c2 = unsafe_get (s2, i2+i);
case (compare_fn (c1, c2))
EQUAL => compare' (i+1);
order => order;
esac;
fi;
end;
fun compare (s1, i1, n1, s2, i2, n2)
=
compare_sequences compare_fn (s1, i1, n1, s2, i2, n2)
where
fun compare_fn (c1, c2)
=
if (c1 == c2) EQUAL;
elif (c::(>) (c1, c2)) GREATER;
else LESS;
fi;
end;
# Knuth-Morris-Pratt String Matching
#
# knuth_morris_pratt_string_match: String -> (String, Int, Int) -> Int
#
# Find the first string within the second, starting at and
# ending before the given positions.
# Return the starting position of the match
# or the given ending position if there is no match.
#
# For background see (e.g.)
#
# https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
#
fun knuth_morris_pratt_string_match pattern
=
{ psz = size pattern;
next = it::poly_rw_vector::make_nonempty_rw_vector (psz, -1);
fun pat x = unsafe_get (pattern, x);
fun nxt x = it::poly_rw_vector::get (next, x);
fun setnxt (i, x)
=
it::poly_rw_vector::set (next, i, x);
# We're trying to fill 'next' at position p (> 0) and higher;
# invariants: x >= 0
# pattern[0..x) == pattern[p-x..p)
# for i in [0..p) :
# pattern[i] <> pattern[next[i]]
# pattern[0..next[i]) = pattern[i-next[i]..i)
fun fill (p, x)
=
if (p >= psz) (); # Reached end of 'pattern' so we're done.
elif (pat x == pat p) dnxt (p, nxt x, x + 1);
else dnxt (p, x, nxt x + 1);
fi
also
fun dnxt (p, x, y)
=
{ setnxt (p, x);
#
fill (p + 1, y);
};
# Once 'next' has been initialized, it serves the following purpose:
#
# Suppose we are looking at text position t and pattern position p.
#
# This means that all pattern positions < p have already
# matched the text positions that directly precede t.
#
# Now, if the text[t] matches pattern[p], then we simply advance
# both t and p and try again.
#
# However, if the two do not match, then we simply
# try t again, but this time with the pattern position
# given by next[p].
#
# Success is when p reaches the end of the pattern.
# Failure is when t reaches the end of the text
# without p having reached the end of the pattern.
#
fun search (text, start, stop)
=
loop (0, start)
where
fun txt x
=
unsafe_get (text, x);
fun loop (p, t)
=
if (p >= psz ) t-psz; # SUCCESS: Complete pattern matched. Return byte offset of start of patternmatch in 'text'.
elif (t >= stop ) stop; # FAILURE: Reached end of 'text' (or at least the part of it we're allowed to search) without finding a match. Return ending position of search in 'text'.
elif (p < 0 or pat p == txt t) loop (p+1, t+1); # Pattern[p] == text[t] so advance p and t by one.
else loop (nxt p, t); # Pattern[p] != text[t] so try next possible match of pattern against text[t].
fi;
end;
fill (1, 0);
search;
};
fun knuth_morris_pratt_string_match_backward pattern # Same as above, but searching backward instead of forward.
= # We build/use 'next' identically to the above, but tweak 'pat' and 'txt' to read 'pattern' and 'text' back-to-front instead of front-to-back.
{ psz = size pattern;
next = it::poly_rw_vector::make_nonempty_rw_vector (psz, -1);
psz1 = psz - 1;
fun pat x = unsafe_get (pattern, psz1 - x); # Change from above: 'psz1 - x' replaces 'x'.
fun nxt x = it::poly_rw_vector::get (next, x);
fun setnxt (i, x)
=
it::poly_rw_vector::set (next, i, x);
fun fill (p, x)
=
if (p >= psz) ();
elif (pat x == pat p) dnxt (p, nxt x, x + 1);
else dnxt (p, x, nxt x + 1);
fi
also
fun dnxt (p, x, y)
=
{ setnxt (p, x);
#
fill (p + 1, y);
};
fun search (text, start, stop) # Here we expect to have start > stop because we're searching BACKWARD through 'text' from 'start' to 'stop'.
=
loop (0, start)
where
tsz = size text;
tsz1 = tsz - 1;
start = tsz1 - start - psz1; # Change from above: 'tsz1 - start - psz1' replaces 'start'.
stop = tsz1 - stop; # Change from above: 'tsz1 - stop' replaces 'stop'.
fun txt x
=
unsafe_get (text, tsz1 - x); # Change from above: 'tsz1 - x' replaces 'x'.
fun loop (p, t)
=
if (p >= psz ) tsz - t; # Change from above: 'tsz - t' replaces '(t-psz)'. (psz goes away because we still want to return the index of the leftmost byte in the match.)
elif (t >= stop ) tsz1 - stop; # Change from above: 'tsz1 - (t-stop)' replaces 'stop'.
elif (p < 0 or pat p == txt t) loop (p+1, t+1);
else loop (nxt p, t);
fi;
end;
fill (1, 0);
search;
};
end; # stipulate
}; # package protostring
end;