## string-guts.pkg
#
# Basic string ops.
#
# See also:
#
#
src/lib/std/src/string-junk.pkg# Compiled by:
#
src/lib/std/src/standard-core.sublib### "Harp not on that string."
###
### -- William Shakespeare, "Henry VI"
stipulate
package chr = char; # char is from
src/lib/std/src/char.pkg package it = inline_t; # inline_t is from
src/lib/core/init/built-in.pkg package c = it::char; # inline_t is from
src/lib/core/init/built-in.pkg package ps = protostring; # protostring is from
src/lib/std/src/protostring.pkg package rt = runtime; # runtime is from
src/lib/core/init/runtime.pkg package g2d = exceptions_guts; # exceptions_guts is from
src/lib/std/src/exceptions-guts.pkg nb = log::note_on_stderr; # log is from
src/lib/std/src/log.pkg # Note: sprintf etc are not available at this level, so if you need to debug this file try stuff like
# nb {. (cat [ "utf8_to_ucs2/LUP: i=", (tagged_int_guts::to_string i) ]); };
herein
package string_guts
: (weak) String # String is from
src/lib/std/src/string.api {
(+) = it::default_int::(+);
(-) = it::default_int::(-);
(<) = it::default_int::(<);
(<=) = it::default_int::(<=);
(>) = it::default_int::(>);
(>=) = it::default_int::(>=);
(&) = it::default_int::bitwise_and;
# (==) = it::(==);
unsafe_get = it::vector_of_chars::get_byte_as_char;
unsafe_set = it::vector_of_chars::set_char_as_byte;
unsafe_get_byte = it::vector_of_chars::get_byte;
unsafe_set_byte = it::vector_of_chars::set_byte;
# These are not used in production code, but it is a
# good idea to test new code with them before switching
# over to the above 'unsafe_set' ops:
#
fun safe_set_byte (s: String, i: Int, b: Int): Void
=
{ len = it::vector_of_chars::length s;
#
if (i < 0 or i >= len)
#
nb {. (cat [ "safe_set_byte: error: i=", (tagged_int_guts::to_string i), " len=", (tagged_int_guts::to_string len), " s='", s, "'\n" ]); };
else
unsafe_set_byte (s, i, b);
fi;
};
fun safe_set (s: String, i: Int, c: Char): Void
=
{ len = it::vector_of_chars::length s;
#
if (i < 0 or i >= len)
#
nb {. (cat [ "safe_set: error: i=", (tagged_int_guts::to_string i), " len=", (tagged_int_guts::to_string len), " s='", s, "'\n" ]); };
else
unsafe_set (s, i, c);
fi;
};
# List reverse # A local copy may run faster than the global one, due inlining currently not worrking cross-package.
#
fun reverse ([], l) => l;
reverse (x ! r, l) => reverse (r, x ! l);
end;
Char = Char;
String = String;
maximum_vector_length = core::maximum_vector_length;
# The lengths of a string:
#
length_in_bytes = it::vector_of_chars::length;
fun length_in_chars string # Intended for use on 7-bit ascii and UTF-8. Counts number of bytes not matching 10xxxxxx.
=
{ len = length_in_bytes string;
#
count_chars (0, 0) # Over all bytes in string
where
fun count_chars (i: Int, charcount: Int)
=
if (i == len) charcount; # If we've checked all bytes, return result.
else
c = unsafe_get (string, i); # Get i-th byte as a char.
c = char::to_int c; # Convert char to int.
if (c & 0xC0 == 0x80) count_chars (i+1, charcount ); # This is a non-initial byte in a utf-8 multibyte char sequence, so do not increment charcount.
else count_chars (i+1, charcount+1); # This is the first byte of monobyte or multibyte char sequence, so increment charcount.
fi;
fi;
end;
};
fun prefix_length_in_bytes # Given string and prefix length in chars, return prefix length in bytes.
(
string: String,
prefix_length_in_chars: Int
)
=
{ bytelen = length_in_bytes string;
#
count_chars (0, 0) # Over all bytes in string
where
fun count_chars
(
byteoffset: Int, # Current byte offset into 'string'.
chars_so_far: Int # Number chars in string for which we have seen at least the first byte.
)
=
if (byteoffset == bytelen) bytelen; # Caller may have specified a prefix-length-in-chars longer than the string? Anyhow, just return the string length-in-bytes.
else
c = unsafe_get (string, byteoffset); # Get our byte as a char.
c = char::to_int c; # Convert char to int.
if (c & 0xC0 == 0x80) # If this is a continuation byte in a utf-8 multibyte char,
# # then
count_chars (byteoffset+1, chars_so_far); # do not increment count of chars seen.
elif (chars_so_far == prefix_length_in_chars) # This is not a continuation byte of a multibyte char,
# # so if we've seen the required number of chars, then
byteoffset; # we're at the end of the requested prefix -- return its length-in-bytes. Since 'byteoffset' points to first byte of next char, it is the length-in-bytes of the required prefix.
else
count_chars (byteoffset+1, chars_so_far+1); # This is the first byte of monobyte or multibyte char sequence, so increment count of chars seen.
fi;
fi;
end;
};
unsafe_create
=
rt::asm::make_string;
# Allocate an uninitialized string of given length
#
fun create n
=
if (it::default_int::ltu (maximum_vector_length, n))
#
raise exception g2d::SIZE;
else
rt::asm::make_string n;
fi;
# Convert a character into a single character string
#
fun from_char (c: chr::Char) : String
=
it::poly_vector::get (ps::chars, it::cast c);
# Get a byte from a string and return it as a character:
#
get_byte_as_char
=
it::vector_of_chars::get_byte_as_char_with_boundscheck: (String, Int) -> Char;
# Get a byte from a string:
#
get_byte
=
it::vector_of_chars::get_byte_with_boundscheck: (String, Int) -> Int;
# Get a (possibly UTF-8 encoded) char from a string.
#
# Currently we return this as an int because in
#
src/lib/core/init/built-in.pkg # we have
# package char {
# #
# max_ord = 255;
# and changing that will be nontrivial, so returning
# values > 255 as a Char is currently problematic:
#
fun get_char_as_int (s: String, i: Int): (Int, Int) # For UTF-8 background see (e.g.) http://www.cl.cam.ac.uk/~mgk25/ucs/man-utf-8.html
=
{ len = length_in_bytes s;
#
if (i >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
c = unsafe_get_byte (s, i);
if (c & 0x80 == 0) # Single-byte case?
#
(c, i+1);
#
elif (c & 0xE0 == 0xC0) # Two-byte case?
#
if (i+1 >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
#
c = ((c & 0x1F) << 6)
+ (unsafe_get_byte(s, i+1) & 0x3F); # Second byte should have form 10xxxxx -- we don't check this.
(c, i+2);
elif (c & 0xF0 == 0xE0) # Three-byte case?
#
if (i+2 >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
#
c = ((c & 0x0F) << 12)
+ ((unsafe_get_byte(s, i+1) & 0x3F) << 6) # Second byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+2) & 0x3F) ); # Third byte should have form 10xxxxx -- we don't check this.
(c, i+3);
elif (c & 0xF8 == 0xF0) # Four-byte case?
#
if (i+3 >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
#
c = ((c & 0x07) << 18)
+ ((unsafe_get_byte(s, i+1) & 0x3F) << 12) # Second byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+2) & 0x3F) << 6) # Third byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+3) & 0x3F) ); # Fourth byte should have form 10xxxxx -- we don't check this.
(c, i+4);
elif (c & 0xFC == 0xF8) # Five-byte case?
#
if (i+4 >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
#
c = ((c & 0x03) << 24)
+ ((unsafe_get_byte(s, i+1) & 0x3F) << 18) # Second byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+2) & 0x3F) << 12) # Third byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+3) & 0x3F) << 6) # Fourth byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+4) & 0x3F) ); # Fifth byte should have form 10xxxxx -- we don't check this.
(c, i+5);
elif (c & 0xFE == 0xFC) # Six-byte case?
#
if (i+5 >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
#
c = ((c & 0x01) << 30)
+ ((unsafe_get_byte(s, i+1) & 0x3F) << 24) # Second byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+2) & 0x3F) << 18) # Third byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+3) & 0x3F) << 12) # Fourth byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+4) & 0x3F) << 6) # Fifth byte should have form 10xxxxx -- we don't check this.
+ ((unsafe_get_byte(s, i+5) & 0x3F) ); # Six h byte should have form 10xxxxx -- we don't check this.
(c, i+6);
else
(c, i+1); # Not a legal UTF-8 encoding. Should maybe log an error or raise an exception or something, but it's probably just some old 8-bit ascii encoding -- kinder to just accept it.
fi;
};
# Return number of bytes (1-6) used to encode char at given byte offset in string.
# This is just a dumbed-down version of the previous.
#
fun get_char_bytecount (s: String, i: Int): Int # For UTF-8 background see (e.g.) http://www.cl.cam.ac.uk/~mgk25/ucs/man-utf-8.html
=
{ len = length_in_bytes s;
#
if (i >= len) raise exception core::INDEX_OUT_OF_BOUNDS; fi;
c = unsafe_get_byte (s, i);
if (c & 0x80 == 0) # Single-byte case?
#
1;
elif (c & 0xE0 == 0xC0) # Two-byte case?
#
2;
elif (c & 0xF0 == 0xE0) # Three-byte case?
#
3;
elif (c & 0xF8 == 0xF0) # Four-byte case?
#
4;
elif (c & 0xFC == 0xF8) # Five-byte case?
#
5;
elif (c & 0xFE == 0xFC) # Six-byte case?
#
6;
else
1; # Not a legal UTF-8 encoding. Should maybe log an error or raise an exception or something, but it's probably just some old 8-bit ascii encoding, probably kinder to just accept it.
fi;
};
fun byte_offset_of_ith_char (s: String, i: Int) # Intended for use on 7-bit ascii and UTF-8.
=
{ len = length_in_bytes s;
#
walk_string (0, 0) # Over all bytes in string
where
fun walk_string (byte_offset: Int, charcount: Int)
=
if (charcount == i) THE byte_offset; # Found desired char.
elif (byte_offset == len) NULL; # String has less than 'i' chars, cannot fulfill request.
else
bytes = get_char_bytecount (s, byte_offset);
walk_string (byte_offset + bytes, charcount + 1);
fi;
end;
};
fun utf8_to_ucs2 (input: String): String # Return a string in which each char is encoded using exactly two bytes, most-significant first. Intended primarily for use with w2x::x::POLY_TEXT16 in
src/lib/x-kit/widget/xkit/app/guishim-imp-for-x.pkg =
{ charlen = length_in_chars input;
bytelen = length_in_bytes input;
#
outbytes = charlen * 2;
result = rt::asm::make_string outbytes;
lup (0, 0, 0)
where
fun lup (from: Int, to: Int, i: Int) # 'i' is just for debugging.
=
if (from < bytelen) # Make sure we have input remaining.
#
bytecount = get_char_bytecount (input, from); # sprintf etc are not available at this level, so if you need to debug this file try stuff like
if (to + 2 <= outbytes) # This is the safest termination condition -- we won't overrun our output buffer no matter how corrupt the input is.
# and from + bytecount <= bytelen) # We could check this too if we were being totally anal.
#
(get_char_as_int (input, from))
->
(char, from);
lobyte = (char ) & 0xFF;
hibyte = (char >> 8) & 0xFF;
unsafe_set_byte (result, to, hibyte);
unsafe_set_byte (result, to+1, lobyte);
lup (from, to + 2, i + 1); # Note that 'from' was updated by 'get_char_as_int' above.
fi;
fi;
end;
result;
};
# The (_[]) enables 'vec[index]' notation; # Gave up on this because with utf8 we need to distinguish clearly between bytes and chars, which this notation does not do. -- 2015-05-27 CrT
#
# my (_[]): (String, Int) -> Char
# =
# it::vector_of_chars::get_byte_as_char_with_boundscheck;
# Return the n-character substring of s starting at position i.
# NOTE: we use unts to check the right bound so as to avoid
# raising overflow.
#
stipulate
package w = it::default_unt; # inline_t is from
src/lib/core/init/built-in.pkg herein
fun substring (s, i, n)
=
if (((i < 0) or (n < 0)
or
w::(<) (w::from_int (size s), w::(+) (w::from_int i, w::from_int n)))
)
raise exception g2d::INDEX_OUT_OF_BOUNDS; # exceptions_guts is from
src/lib/std/src/exceptions-guts.pkg else
ps::unsafe_substring (s, i, n);
fi;
end;
fun extract (v, base, opt_len)
=
{ len = size v;
#
fun new_vec n
=
{ new_v = rt::asm::make_string n;
#
fun fill i
=
if (i < n)
#
unsafe_set (new_v, i, unsafe_get (v, base+i));
fill (i+1);
fi;
fill 0;
new_v;
};
case (base, opt_len)
#
(0, NULL) => v;
(_, THE 0)
=>
if (base < 0 or len < base)
#
raise exception g2d::INDEX_OUT_OF_BOUNDS;
else "";
fi;
(_, NULL)
=>
{ if (base < 0 or len < base) raise exception g2d::INDEX_OUT_OF_BOUNDS; fi;
#
if (base == len) "";
else new_vec (len - base);
fi;
};
(_, THE 1)
=>
{ if (base < 0 or len < base+1) raise exception g2d::INDEX_OUT_OF_BOUNDS; fi;
#
str (unsafe_get (v, base));
};
(_, THE n)
=>
{ if (base < 0 or n < 0 or len < base+n) raise exception g2d::INDEX_OUT_OF_BOUNDS; fi;
#
new_vec n;
};
esac;
};
# Concatenate a list of strings:
#
fun cat [ string ]
=>
string;
cat (sl: List( String ))
=>
{ fun length (i, [])
=>
i;
length (i, s ! rest)
=>
length (i+size s, rest);
end;
case (length (0, sl))
#
0 => "";
1 => find sl
where
fun find ("" ! r) => find r;
find ( s ! _) => s;
find _ => ""; # Impossible.
end;
end;
tot_len
=>
{ ss = create tot_len;
fun copy ([], _)
=>
();
copy (s ! r, i)
=>
{ len = size s;
fun copy' j
=
if (j != len)
#
unsafe_set (ss, i+j, unsafe_get (s, j));
copy'(j+1);
fi;
copy' 0;
copy (r, i+len);
};
end;
copy (sl, 0);
ss;
};
esac;
};
end; # cat
# Concatenate a list of strings using the
# given separator string, so
# join " " ["an", "example"]
# ->
# "an example"
#
fun join _ [] => "";
join _ [x] => x;
join sep (h ! t)
=>
cat (
reverse (
fold_forward
(\\ (x, l) = x ! sep ! l)
[h]
t,
[]
)
);
end;
# As above, with null delimiters:
fun implode [] => ""; # Implode a list of characters into a string.
#
implode cl
=>
{ fun length ([], n) => n;
length (_ ! r, n) => length (r, n+1);
end;
ps::implode (length (cl, 0), cl);
};
end;
fun explode s # Explode a string into a list of characters.
=
f (NIL, size s - 1)
where
fun f (l, -1) => l;
f (l, i) => f (unsafe_get (s, i) ! l, i - 1);
end;
end;
fun map f vec
=
case (size vec)
#
0 => "";
#
len => { new_vec = rt::asm::make_string len;
#
mapf 0
where
fun mapf i
=
if (i < len)
#
unsafe_set (new_vec, i, f (unsafe_get (vec, i)));
mapf (i+1);
fi;
end;
new_vec;
};
esac;
# Map a translation function across the characters of a string
#
fun translate tr s
=
ps::translate (tr, s, 0, size s);
fun tokens is_delimiter s # Tokenize a string using the given predicate
= # to define the delimiter characters.
reverse (scan_token (0, 0, []), [])
where
n = size s;
#
fun substr (i, j, tokens)
=
if (i == j) tokens;
else ps::unsafe_substring (s, i, j-i) ! tokens;
fi;
fun scan_token (i, j, tokens)
=
if (j < n)
#
if (is_delimiter (unsafe_get (s, j))) skip_delimiters (j+1, substr (i, j, tokens));
else scan_token (i, j+1, tokens);
fi;
else
substr (i, j, tokens);
fi
also
fun skip_delimiters (j, tokens)
=
if (j < n)
#
if (is_delimiter (unsafe_get (s, j))) skip_delimiters (j+1, tokens);
else scan_token (j, j+1, tokens);
fi;
else
tokens;
fi;
end;
fun fields is_delimiter s
=
{ n = size s;
#
reverse (scan_field (0, 0, []), [])
where
fun scan_field (i, j, fields)
=
if (j < n)
#
if (is_delimiter (unsafe_get (s, j))) scan_field (j+1, j+1, substr (i, j, fields));
else scan_field (i, j+1, fields);
fi;
else
substr (i, j, fields);
fi
where
fun substr (i, j, fields)
=
ps::unsafe_substring(s, i, j-i) ! fields;
end;
end;
};
fun lines s # Split 's' into lines at '\n' chars and return resulting list of strings. We leave the '\n's at the ends of the lines, so doing a 'cat' on the result recreates our input. (You can use 'map chomp lines' to remove the newlines.)
=
{ n = size s;
#
reverse (scan_line (0, 0, []), [])
where
fun scan_line (i, j, lines)
=
if (j < n)
#
if ((unsafe_get(s,j)) == '\n') scan_line (j+1, j+1, substr (i, j+1, lines));
else scan_line (i, j+1, lines);
fi;
else
substr (i, j, lines);
fi
where
fun substr (i, j, lines)
=
if (i < j) ps::unsafe_substring(s, i, j-i) ! lines;
else lines; # This case avoids an unwanted empty string if input terminates with a newline.
fi;
end;
end;
};
fun repeat (s: String, i: Int) # Return result of concatenating 'i' copies of 's'.
=
repeat' (i, [""])
where
fun repeat' (i, result)
=
if (i <= 0) cat result;
else repeat' (i - 1, s ! result);
fi;
end;
fun expand_tabs_and_control_chars # Expands tabs (on 8-char tabstops) into blanks and control chars (and DEL) into ^A notation.
{
utf8text: String, # String to be expanded, assumed to be utf8-encoded.
startcol: Int, # Screen col to assume for first char in 'text'. Normally 0 for left-justified text. Useful when expanding multiple strings within a single screen line.
screencol1: Int, # Query byte-extent of this screeen column in input and output strings.
screencol2: Int, # Query byte-extent of this screeen column in input and output strings. Having both screencol1 and screencol2 is helpful when displaying the selected region in
src/lib/x-kit/widget/edit/screenline.pkg utf8byte: Int # Query screen-column of this byte offset into 'utfxtext'.
}
: { screentext: String, # Resulting tab-expanded string.
startcol: Int, # Screen col to assume for first char in any text following 'text'.
#
screentext_length_in_screencols: Int,
screencol1_byteoffset_in_utf8text: Int,
screencol1_bytescount_in_utf8text: Int,
#
screencol1_byteoffset_in_screentext: Int,
screencol1_bytescount_in_screentext: Int,
#
screencol1_firstcol_on_screen: Int,
screencol1_colcount_on_screen: Int,
screencol2_byteoffset_in_utf8text: Int,
screencol2_bytescount_in_utf8text: Int,
#
screencol2_byteoffset_in_screentext: Int,
screencol2_bytescount_in_screentext: Int,
#
screencol2_firstcol_on_screen: Int,
screencol2_colcount_on_screen: Int,
utf8byte_firstcol_on_screen: Int,
utf8byte_colcount_on_screen: Int
}
=
{ utf8_len_in_bytes = length_in_bytes utf8text;
screentext_length_in_screencols = REF 0;
screencol1_byteoffset_in_utf8text = REF 0;
screencol1_bytescount_in_utf8text = REF 0;
screencol1_byteoffset_in_screentext = REF 0;
screencol1_bytescount_in_screentext = REF 0;
screencol1_firstcol_on_screen = REF 0;
screencol1_colcount_on_screen = REF 0;
screencol2_byteoffset_in_utf8text = REF 0;
screencol2_bytescount_in_utf8text = REF 0;
screencol2_byteoffset_in_screentext = REF 0;
screencol2_bytescount_in_screentext = REF 0;
screencol2_firstcol_on_screen = REF 0;
screencol2_colcount_on_screen = REF 0;
utf8byte_firstcol_on_screen = REF 0;
utf8byte_colcount_on_screen = REF 0;
screentext_len_in_bytes
=
outlen (0, startcol, 0)
where
fun outlen # Compute number of bytes needed for output string. Tabs expand into 1-8 blanks, control chars (and DEL) into ^A ^B ^C ... and everything else gets copied over unchanged, including multibyte UTF-8 chars.
(
from: Int, # Byte offset in input string.
col: Int, # Visual column on output string.
to: Int # Byte offset in result string.
)
=
if (from >= utf8_len_in_bytes)
#
my (to, col) # If needed, add enough trailing blanks to 'screentext' to ensure that 'screencol1_byteoffset_in_screentext' will be a valid offset.
=
if (col > screencol1) (to, col);
else (to + (screencol1 - col) + 1, screencol1 + 1);
fi;
my (to, col) # If needed, add enough trailing blanks to 'screentext' to ensure that 'screencol2_byteoffset_in_screentext' will be a valid offset.
=
if (col > screencol2) (to, col);
else (to + (screencol2 - col) + 1, screencol2 + 1);
fi;
screentext_length_in_screencols := col;
to;
else
charlen = get_char_bytecount (utf8text, from);
#
charlen = if (from + charlen > utf8_len_in_bytes) utf8_len_in_bytes - from; # Invalid UTF-8 encoding: requires more bytes than remain. Silently copy only as many as actually available.
else charlen; # Normal case.
fi;
my { from_bump, col_bump, to_bump }
=
if (charlen > 1)
#
{ from_bump => charlen,
col_bump => 1,
to_bump => charlen
};
else
cols = case (get_byte_as_char (utf8text, from))
#
'\^@' => 2; # We could code this more cleverly, but I like simple and easy to understand at a glance.
'\^A' => 2;
'\^B' => 2;
'\^C' => 2;
'\^D' => 2;
'\^E' => 2;
'\^F' => 2;
'\^G' => 2;
'\^H' => 2;
'\^I' => 8 - (col & 7);
'\^J' => 2;
'\^K' => 2;
'\^L' => 2;
'\^M' => 2;
'\^N' => 2;
'\^O' => 2;
'\^P' => 2;
'\^Q' => 2;
'\^R' => 2;
'\^S' => 2;
'\^T' => 2;
'\^U' => 2;
'\^V' => 2;
'\^W' => 2;
'\^X' => 2;
'\^Y' => 2;
'\^Z' => 2;
'\^[' => 2;
'\^\' => 2;
'\^]' => 2;
'\^_' => 2;
'\x7F'=> 2; # DEL char.
#
_ => 1;
esac;
{ from_bump => 1,
col_bump => cols,
to_bump => cols
};
fi;
outlen (from + from_bump, col + col_bump, to + to_bump);
fi;
end;
fun n_blanks (result, to, count) # Write 'count' blanks into string 'result' starting at offset 'to'.
=
if (count > 0)
#
unsafe_set (result, to, ' ');
n_blanks (result, to + 1, count - 1);
fi;
screentext = rt::asm::make_string screentext_len_in_bytes;
col = fillstring (0, startcol, 0) # Copy 'utf8text' string to 'screentext' string, expanding tabs and control chars as we go.
where
fun fillstring
(
from: Int, # Byte offset in utf8text string.
col: Int, # Visual column on output string.
to: Int # Byte offset in result string.
)
=
if (to >= screentext_len_in_bytes)
#
col;
else
my (charlen, input, fromoffset)
=
if (from < utf8_len_in_bytes)
#
charlen = get_char_bytecount (utf8text, from);
charlen = if (from + charlen > utf8_len_in_bytes) utf8_len_in_bytes - from; # Invalid UTF-8 encoding: requires more bytes than remain. Silently copy only as many as actually available.
else charlen; # Normal case.
fi;
(charlen, utf8text, 0);
else
(1, " ", -from); # We're past the actual end of 'utf8text', but adding trailing blanks as padding to 'screentext' to ensure that screencol1 and screencol2 correspond to chars in 'screentext'.
fi;
my { from_bump, col_bump, to_bump }
=
if (charlen > 1) # For now at least we'll copy multibyte utf8 chars through unchanged.
#
case charlen
#
2 => { unsafe_set_byte (screentext, to , unsafe_get_byte (input, from + fromoffset));
unsafe_set_byte (screentext, to+1, unsafe_get_byte (input, from+1 + fromoffset));
};
3 => { unsafe_set_byte (screentext, to , unsafe_get_byte (input, from + fromoffset));
unsafe_set_byte (screentext, to+1, unsafe_get_byte (input, from+1 + fromoffset));
unsafe_set_byte (screentext, to+2, unsafe_get_byte (input, from+2 + fromoffset));
};
4 => { unsafe_set_byte (screentext, to , unsafe_get_byte (input, from + fromoffset));
unsafe_set_byte (screentext, to+1, unsafe_get_byte (input, from+1 + fromoffset));
unsafe_set_byte (screentext, to+2, unsafe_get_byte (input, from+2 + fromoffset));
unsafe_set_byte (screentext, to+3, unsafe_get_byte (input, from+3 + fromoffset));
};
5 => { unsafe_set_byte (screentext, to , unsafe_get_byte (input, from + fromoffset));
unsafe_set_byte (screentext, to+1, unsafe_get_byte (input, from+1 + fromoffset));
unsafe_set_byte (screentext, to+2, unsafe_get_byte (input, from+2 + fromoffset));
unsafe_set_byte (screentext, to+3, unsafe_get_byte (input, from+3 + fromoffset));
unsafe_set_byte (screentext, to+4, unsafe_get_byte (input, from+4 + fromoffset));
};
6 => { unsafe_set_byte (screentext, to , unsafe_get_byte (input, from + fromoffset));
unsafe_set_byte (screentext, to+1, unsafe_get_byte (input, from+1 + fromoffset));
unsafe_set_byte (screentext, to+2, unsafe_get_byte (input, from+2 + fromoffset));
unsafe_set_byte (screentext, to+3, unsafe_get_byte (input, from+3 + fromoffset));
unsafe_set_byte (screentext, to+4, unsafe_get_byte (input, from+4 + fromoffset));
unsafe_set_byte (screentext, to+5, unsafe_get_byte (input, from+5 + fromoffset));
};
_ => (); # Impossible -- UTF-8 encodings are only defined for lengths 1-6.
esac;
{ from_bump => charlen,
col_bump => 1,
to_bump => charlen
};
else
c = get_byte_as_char (input, from + fromoffset);
#
cols = case c
#
'\^@' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, '@'); 2; };
'\^A' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'A'); 2; };
'\^B' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'B'); 2; };
'\^C' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'C'); 2; };
'\^D' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'D'); 2; };
'\^E' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'E'); 2; };
'\^F' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'F'); 2; };
'\^G' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'G'); 2; };
'\^H' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'H'); 2; };
'\^I' => { blanks = 8 - (col & 7); n_blanks (screentext, to,blanks); blanks; };
'\^J' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'J'); 2; };
'\^K' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'K'); 2; };
'\^L' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'L'); 2; };
'\^M' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'M'); 2; };
'\^N' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'N'); 2; };
'\^O' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'O'); 2; };
'\^P' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'P'); 2; };
'\^Q' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'Q'); 2; };
'\^R' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'R'); 2; };
'\^S' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'S'); 2; };
'\^T' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'T'); 2; };
'\^U' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'U'); 2; };
'\^V' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'V'); 2; };
'\^W' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'W'); 2; };
'\^X' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'X'); 2; };
'\^Y' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'Y'); 2; };
'\^Z' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, 'Z'); 2; };
'\^[' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, '['); 2; };
'\^\' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, '\\'); 2; };
'\^]' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, ']'); 2; };
'\^_' => { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, '_'); 2; };
'\x7F'=> { unsafe_set (screentext, to, '^'); unsafe_set (screentext, to+1, '?'); 2; }; # DEL char. ^? seems to be as standard a representation as any.
#
_ => { unsafe_set (screentext, to, c ); 1; };
esac;
{ from_bump => 1,
col_bump => cols,
to_bump => cols
};
fi;
if (col <= screencol1 # If we're crossing over screen column where screencol1 is, note its location in utf8text and screentext strings.
and col + col_bump > screencol1)
#
screencol1_byteoffset_in_utf8text := from;
screencol1_bytescount_in_utf8text := from_bump;
screencol1_byteoffset_in_screentext := to;
screencol1_bytescount_in_screentext := to_bump;
screencol1_firstcol_on_screen := col;
screencol1_colcount_on_screen := col_bump;
fi;
if (col <= screencol2 # If we're crossing over screen column where screencol2 is, note its location in utf8text and screentext strings.
and col + col_bump > screencol2)
#
screencol2_byteoffset_in_utf8text := from;
screencol2_bytescount_in_utf8text := from_bump;
screencol2_byteoffset_in_screentext := to;
screencol2_bytescount_in_screentext := to_bump;
screencol2_firstcol_on_screen := col;
screencol2_colcount_on_screen := col_bump;
fi;
if (from <= utf8byte
and from + from_bump > utf8byte)
#
utf8byte_firstcol_on_screen := col;
utf8byte_colcount_on_screen := col_bump;
fi;
fillstring ( from + from_bump,
col + col_bump,
to + to_bump
);
fi;
end;
{ screentext,
startcol => col,
#
screentext_length_in_screencols => *screentext_length_in_screencols,
screencol1_byteoffset_in_utf8text => *screencol1_byteoffset_in_utf8text,
screencol1_bytescount_in_utf8text => *screencol1_bytescount_in_utf8text,
#
screencol1_byteoffset_in_screentext => *screencol1_byteoffset_in_screentext,
screencol1_bytescount_in_screentext => *screencol1_bytescount_in_screentext,
#
screencol1_firstcol_on_screen => *screencol1_firstcol_on_screen,
screencol1_colcount_on_screen => *screencol1_colcount_on_screen,
screencol2_byteoffset_in_utf8text => *screencol2_byteoffset_in_utf8text,
screencol2_bytescount_in_utf8text => *screencol2_bytescount_in_utf8text,
#
screencol2_byteoffset_in_screentext => *screencol2_byteoffset_in_screentext,
screencol2_bytescount_in_screentext => *screencol2_bytescount_in_screentext,
#
screencol2_firstcol_on_screen => *screencol2_firstcol_on_screen,
screencol2_colcount_on_screen => *screencol2_colcount_on_screen,
utf8byte_firstcol_on_screen => *utf8byte_firstcol_on_screen,
utf8byte_colcount_on_screen => *utf8byte_colcount_on_screen
};
};
fun longest_common_prefix
(
s1: String,
s2: String
)
=
{ len1 = length_in_bytes s1;
len2 = length_in_bytes s2;
len = min (len1, len2);
prefix_len = scan 0
where
fun scan i
=
if (i == len)
#
len;
elif ( unsafe_get_byte (s1, i)
== unsafe_get_byte (s2, i)
)
scan (i+1);
else
i;
fi;
end;
substring (s1, 0, prefix_len);
};
fun drop_leading_whitespace (s: String)
=
{ len = length_in_bytes s;
#
prefix_len = scan 0
where
fun scan i
=
if (i == len)
#
len;
elif (char::is_space( unsafe_get(s, i) ))
scan (i+1);
else
i;
fi;
end;
extract (s, prefix_len, NULL);
};
fun drop_trailing_whitespace (s: String)
=
{ len = length_in_bytes s;
#
prefix_len = scan (len - 1)
where
fun scan i
=
if (i == -1)
#
0;
elif (char::is_space( unsafe_get(s, i) ))
scan (i - 1);
else
i+1;
fi;
end;
substring (s, 0, prefix_len);
};
# String comparisons
#
fun is_prefix s1 s2
=
ps::is_prefix (s1, s2, 0, size s2);
fun is_suffix s1 s2
=
{ sz2 = size s2;
#
ps::is_prefix (s1, s2, sz2 - size s1, sz2);
};
fun is_substring s
=
{ stringsearch = ps::knuth_morris_pratt_string_match s;
#
fun search s'
=
{ endpos = size s';
#
stringsearch (s', 0, endpos) < endpos;
};
search;
};
fun find_substring s
=
{ stringsearch = ps::knuth_morris_pratt_string_match s;
#
fun search s'
=
{ endpos = size s';
#
result = stringsearch (s', 0, endpos);
if (result < endpos) THE result;
else NULL;
fi;
};
search;
};
fun find_substring' s
=
{ stringsearch = ps::knuth_morris_pratt_string_match s;
#
fun search (s', start)
=
{ endpos = size s';
#
result = stringsearch (s', start, endpos);
#
if (result < endpos) THE result;
else NULL;
fi;
};
search;
};
fun find_substring_backward s
=
{ stringsearch = ps::knuth_morris_pratt_string_match_backward s;
#
fun search s'
=
{ endpos = size s';
#
result = stringsearch (s', endpos - 1, -1);
if (result >= 0) THE result;
else NULL;
fi;
};
search;
};
fun find_substring_backward' s
=
{ stringsearch = ps::knuth_morris_pratt_string_match_backward s;
#
fun search (s', start)
=
{ endpos = size s';
#
result = stringsearch (s', start, -1);
#
if (result >= 0) THE result;
else NULL;
fi;
};
search;
};
fun compare (a, b)
=
ps::compare (a, 0, size a, b, 0, size b);
fun compare_sequences compare_g (a, b)
=
ps::compare_sequences compare_g (a, 0, size a, b, 0, size b);
fun has_alpha string = list::exists chr::is_alpha (explode string); # For efficiency, should really have string::exists and string::all someday. XXX SUCKO FIXME.
fun has_upper string = list::exists chr::is_upper (explode string);
fun has_lower string = list::exists chr::is_lower (explode string);
fun is_alpha string = length_in_bytes string > 0 and list::all chr::is_alpha (explode string);
fun is_upper string = length_in_bytes string > 0 and list::all chr::is_upper (explode string);
fun is_lower string = length_in_bytes string > 0 and list::all chr::is_lower (explode string);
fun is_mixed string = is_alpha string and has_upper string and has_lower string;
fun is_ascii string # TRUE iff all bytes have high bit equal to zero.
=
{ len = length_in_bytes string;
#
check_bytes 0 # Over all bytes in string
where
fun check_bytes (i: Int)
=
if (i == len) TRUE; # If we've checked all bytes, is_ascii is TRUE.
else
c = unsafe_get (string, i); # Get i-th byte as a char.
c = char::to_int c; # Convert char to int.
if (c & 0x80 == 0x80) FALSE; # If high bit of byte is 1, is_ascii is FALSE.
else check_bytes (i+1); # Check rest of bytes in string.
fi;
fi;
end;
};
# String greater or equal
#
fun string_gt (a, b)
=
compare 0
where
al = size a;
bl = size b;
n = if (al < bl) al;
else bl;
fi;
fun compare i
=
if (i == n)
#
al > bl;
else
ai = unsafe_get (a, i);
bi = unsafe_get (b, i);
chr::(>) (ai, bi)
or
( (ai == bi)
and
compare (i+1)
);
fi;
end;
fun (<=) (a, b) = if (string_gt (a, b) ) FALSE; else TRUE; fi;
fun (<) (a, b) = string_gt (b, a);
fun (>=) (a, b)
=
b <= a;
my (>) = string_gt;
fun from_string' scan_char s
=
accum (0, [])
where
len = size s;
fun getc i
=
if (it::default_int::(<) (i, len))
#
THE (unsafe_get (s, i), i+1);
else
NULL;
fi;
scan_char = scan_char getc;
fun accum (i, chars)
=
case (scan_char i)
#
NULL
=>
if (it::default_int::(<) (i, len)) NULL; # Bad format
else THE (implode (list::reverse chars));
fi;
#
THE (c, i')
=>
accum (i', c ! chars);
esac;
end;
fun (+) ("", s) => s;
(+) (s, "") => s;
(+) (x, y) => ps::meld2 (x, y);
end;
# Concatenate a list of strings using the
# given separator and delimiter strings, so
# join' "(" " " ")" ["an", "example"]
# ->
# "(an example)"
#
fun join' _ _ _ [] => "";
#
join' start _ stop [x] => start + x + stop;
#
join' start sep stop (h ! t)
=>
cat (
start
!
h
!
fold_backward
(\\ (x, l) = sep ! x ! l)
[ stop ]
t
);
end;
# Drop trailing newline on string, if present:
#
fun chomp ""
=>
"";
chomp string
=>
{ len = length_in_bytes string;
#
if (get_byte_as_char (string, len - 1) != '\n') string;
else extract (string, 0, THE (len - 1));
fi;
};
end;
# There's a shorter definition of this fn in
src/lib/compiler/toplevel/interact/read-eval-print-loop-g.pkg # -- should we use it instead? XXX QUERO FIXME
# [2015-06-15 CrT: This should probably move to (just-created)
src/lib/std/src/string-junk.pkg to_lower = map chr::to_lower;
to_upper = map chr::to_upper;
fun to_mixed string # "THIS_is_tExt" -> "This_Is_Text"
=
to_mixed' (' ', explode string, [])
where
fun to_mixed' (_, [], chars)
=>
(implode (list::reverse chars));
to_mixed' (last, this ! rest, chars)
=>
if (not (chr::is_alpha this)) to_mixed' (this, rest, this ! chars);
elif (not (chr::is_alpha last)) to_mixed' (this, rest, chr::to_upper this ! chars);
else to_mixed' (this, rest, chr::to_lower this ! chars);
fi;
end;
end;
from_string = from_string' chr::scan;
to_string = translate chr::to_string;
from_cstring = from_string' chr::scan_c;
to_cstring = translate chr::to_cstring;
}; # package string
end;