## string.api
#
# Basic string ops.
#
# See also:
#
#     src/lib/std/src/string-junk.api
#
# Compiled by:
#
#     src/lib/std/src/standard-core.sublib
#
### "The string is a stark data package and
### everywhere it is passed there is duplication.
### It is a perfect vehicle for hiding information."
###
### -- Alan J Perlis
# This api is implemented in:
#
#     src/lib/std/src/string-guts.pkg
#
api String {
    #
    # Signature for the basic string package: byte-level and UTF-8
    # char-level access, slicing, concatenation, tokenizing, substring
    # search, case handling, comparison, C-string conversion, and
    # support for expanding raw text into screen-displayable form.
    #
    eqtype Char;
    eqtype String;

    maximum_vector_length: Int;

    length_in_bytes: String -> Int;                     # E.g.: length "abcdef" --> 6
    length_in_chars: String -> Int;                     # String should be 7-bit ascii or UTF-8. Returns number of bytes in string not matching 10xxxxxx.

    prefix_length_in_bytes:
        (String, Int) -> Int;                           # Given string and prefix length in chars, return prefix length in bytes.

    get_byte: (String, Int) -> Int;                     # E.g.: string::get_byte ("abcdef", 0) --> 97
    get_byte_as_char: (String, Int) -> Char;            # E.g.: string::get_byte_as_char ("abcdef", 0) --> 'a'

    # First result is utf-8 char starting at given byte offset in string (might occupy 1-6 bytes).
    # Second result is next byte offset to read (== original offset + char-length-in-bytes.)
    # For UTF-8 encoding background see (e.g.) http://www.cl.cam.ac.uk/~mgk25/ucs/man-utf-8.html
    #
    get_char_as_int: (String, Int) -> (Int, Int);

    get_char_bytecount: (String, Int) -> Int;           # Returns number of bytes used to encode UTF-8 char at given byte offset in string.

    # (_[]): (String, Int) -> Char; # Note: The (_[]) enables 'vec[index]' notation;

    extract: (String, Int, Null_Or( Int )) -> String;   # E.g.: extract ("abcdef", 2, NULL) --> "cdef"
                                                        # E.g.: extract ("abcdef", 2, THE 1) --> "c" # Int args are (byteoffset, length).
    substring: (String, Int, Int) -> String;            # E.g.: substring ("abcdef", 1, 4); --> "bcde" # Int args are (byteoffset, length).

    + : (String, String) -> String;                     # E.g.: "abc" + "def" --> "abcdef"
    cat: List( String ) -> String;                      # E.g.: cat ["an", "example"] --> "anexample"
    join: String -> List( String ) -> String;           # E.g.: join " " ["an", "example"] --> "an example"
    join': String -> String -> String -> List(String) -> String;   # E.g.: join' "(" ", " ")" ["an", "example"] --> "(an, example)"

    from_char: Char -> String;                          # E.g.: from_char 'a' --> "a"
    implode: List(Char) -> String;                      # E.g.: implode ['a', 'b', 'c'] --> "abc"
    explode: String -> List(Char);                      # E.g.: explode "abc" --> ['a', 'b', 'c']
    chomp: String -> String;                            # E.g.: chomp "abc\n" --> "abc" (Drops trailing newline if present, else is a no-op.)
    map: (Char -> Char) -> String -> String;            # E.g.: string::map char::to_upper "abc" --> "ABC"
    repeat: (String, Int) -> String;                    # E.g.: repeat ("abc", 2) --> "abcabc"
    translate: (Char -> String) -> String -> String;    # NOTE(review): presumably replaces each char by the string the given fn returns for it -- confirm in string-guts.pkg.

    tokens: (Char -> Bool) -> String -> List(String);   # E.g.: string::tokens {. #c == ','; } "a,b,,c"; --> ["a", "b", "c"]
    fields: (Char -> Bool) -> String -> List(String);   # E.g.: string::fields {. #c == ','; } "a,b,,c"; --> ["a", "b", "", "c"]
    lines: String -> List(String);                      # E.g.: string::lines "a\nb\n\nc"; --> ["a\n", "b\n", "\n", "c"]

    longest_common_prefix: (String, String) -> String;  # Return the longest common prefix of two strings.

    drop_leading_whitespace: String -> String;          # Drop all leading chars which satisfy char::is_space().
    drop_trailing_whitespace: String -> String;         # Drop all trailing chars which satisfy char::is_space().
                                                        # See also trim in   src/lib/std/src/string-junk.api

    is_prefix: String -> String -> Bool;                # Is String1 a prefix of String2?
    is_substring: String -> String -> Bool;
    is_suffix: String -> String -> Bool;

    find_substring: String -> String -> Null_Or( Int );             # Knuth-Morris-Pratt string search. Find first string arg in second. Return byte offset of match else NULL.
    find_substring': String -> (String, Int) -> Null_Or( Int );     # Knuth-Morris-Pratt string search. Find first string arg in second, starting at given byte offset. Return byte offset of match else NULL.
    # NB: The curried form of the above two fns allows the setup work for the pattern string to be re-used over multiple search strings.

    find_substring_backward: String -> String -> Null_Or( Int );            # These two are just like previous two, but searching backward instead of forward.
    find_substring_backward': String -> (String, Int) -> Null_Or( Int );    #

    compare: (String, String) -> Order;
    compare_sequences: ((Char, Char) -> Order) -> (String, String) -> Order;

    to_lower: String -> String;                         # "THIS_is_tExt" -> "this_is_text"
    to_upper: String -> String;                         # "THIS_is_tExt" -> "THIS_IS_TEXT"
    to_mixed: String -> String;                         # "THIS_is_tExt" -> "This_Is_Text"

    has_alpha: String -> Bool;                          # fun has_alpha string = list::exists char::is_alpha (explode string);
    has_lower: String -> Bool;                          # fun has_lower string = list::exists char::is_lower (explode string);
    has_upper: String -> Bool;                          # fun has_upper string = list::exists char::is_upper (explode string);

    is_alpha: String -> Bool;                           # fun is_alpha string = length string > 0 and list::all char::is_alpha (explode string);
    is_upper: String -> Bool;                           # fun is_upper string = length string > 0 and list::all char::is_upper (explode string);
    is_lower: String -> Bool;                           # fun is_lower string = length string > 0 and list::all char::is_lower (explode string);
    is_mixed: String -> Bool;                           # fun is_mixed string = is_alpha string and has_upper string and has_lower string;
    is_ascii: String -> Bool;                           # TRUE iff all bytes in string have high bit equal to zero.

    # For is_alpha/is_space/etc predicates on
    # individual chars in a string see:
    #
    #     src/lib/std/src/string-chartype.api
    #
    < : (String, String) -> Bool;
    <= : (String, String) -> Bool;
    > : (String, String) -> Bool;
    >= : (String, String) -> Bool;

    from_string: String -> Null_Or( String );
    to_string: String -> String;
    from_cstring: String -> Null_Or( String );
    to_cstring: String -> String;

    byte_offset_of_ith_char: (String, Int) -> Null_Or(Int);     # Scan down utf-8 encoded string looking for byte offset of first byte of i-th char. Return NULL if string contains fewer than 'i' chars.

    # The following should perhaps be in a separate edit-support pkg, but
    # for the moment keeping them in the string pkg is easy and convenient,
    # because they use low-level unsafe ops not easily available above the
    #
    #     src/lib/std/src/string-guts.pkg
    #
    # level.

    # Return a string in which each char is encoded using exactly two bytes,
    # most-significant first. Intended primarily for use with w2x::x::POLY_TEXT16 in
    #
    #     src/lib/x-kit/widget/xkit/app/guishim-imp-for-x.pkg
    #
    utf8_to_ucs2: String -> String;

    # We need this to convert raw line String into something viewable on screen in
    #
    #     src/lib/x-kit/widget/edit/screenline.pkg
    #
    # Expands tabs (on 8-char tabstops) into blanks.
    # Expands control chars (and DEL) into ^A notation.
    # If necessary, blank-pad end of string so both screencol1 and screencol2
    # correspond to valid offsets within 'screentext'.
    #
    expand_tabs_and_control_chars:
        { utf8text: String,
          startcol: Int,        # Screen col for first char of 'text'. (Normally 0 for left-justified string.)
          screencol1: Int,      # Query byte-extent of this screen column in input and output strings. Use -1 if you don't care.
          screencol2: Int,      # Query byte-extent of this screen column in input and output strings. Having both screencol1 and screencol2 is helpful when displaying the selected region in   src/lib/x-kit/widget/edit/screenline.pkg
          utf8byte: Int         # Query screen-column of this byte offset into 'utf8text'. Use -1 if you don't care.
        }
        ->
        { screentext: String,   # Expanded text.
          startcol: Int,        # Screen col for first char of any text following 'text'. Useful when expanding multiple strings within a single line.
          #
          screentext_length_in_screencols: Int,         # Length in screen columns of screentext. Because utf8 chars may occupy 1-6 bytes in utf8text but only a single screen column, tabs 1 byte in utf8text but 1-8 screen columns and control chars 1 byte in utf8text but 2 screen columns, computing this is nontrivial.
          # NB: screencol1_byteoffset_in_utf8text is not guaranteed to be a valid offset into utf8text, because screencol1 is allowed to be beyond the end of the displayed text corresponding to utf8text.
          screencol1_byteoffset_in_utf8text: Int,       # Byte offset in input 'text' corresponding to screencol1. Because utf8 chars occupy one screen column but 1-6 bytes in input, control chars 2 screen columns but one byte in input and tabs 1-8 screen columns but one byte in input, computing this is nontrivial.
          screencol1_bytescount_in_utf8text: Int,       # Byte length of input 'text' char corresponding to screencol1. This will be 1 except for multibyte utf8 chars.
          #
          screencol1_byteoffset_in_screentext: Int,     # Byte offset in output 'text' corresponding to screencol1.
          screencol1_bytescount_in_screentext: Int,     # Byte length of output 'text' char corresponding to screencol1. This will be 1-8 for tabs, 2 for control chars, otherwise 1 except for multibyte utf8 chars.
          #
          screencol1_firstcol_on_screen: Int,           # Screen column at which char under cursor begins. Note that screencol1 may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
          screencol1_colcount_on_screen: Int,           # Length in screen columns of char under screencol1.
          # NB: screencol2_byteoffset_in_utf8text is not guaranteed to be a valid offset into utf8text, because screencol2 is allowed to be beyond the end of the displayed text corresponding to utf8text.
          screencol2_byteoffset_in_utf8text: Int,       # Byte offset in input 'text' corresponding to screencol2. Because utf8 chars occupy one screen column but 1-6 bytes in input, control chars 2 screen columns but one byte in input and tabs 1-8 screen columns but one byte in input, computing this is nontrivial.
          screencol2_bytescount_in_utf8text: Int,       # Byte length of input 'text' char corresponding to screencol2. This will be 1 except for multibyte utf8 chars.
          #
          screencol2_byteoffset_in_screentext: Int,     # Byte offset in output 'text' corresponding to screencol2.
          screencol2_bytescount_in_screentext: Int,     # Byte length of output 'text' char corresponding to screencol2. This will be 1-8 for tabs, 2 for control chars, otherwise 1 except for multibyte utf8 chars.
          #
          screencol2_firstcol_on_screen: Int,           # Screen column at which char under cursor begins. Note that screencol2 may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
          screencol2_colcount_on_screen: Int,           # Length in screen columns of char under screencol2.
          utf8byte_firstcol_on_screen: Int,             # Screen column at which utf8text byteoffset 'utf8byte' begins. Note that utf8byte may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
          utf8byte_colcount_on_screen: Int              # Length in screen columns of char at utf8text byteoffset 'utf8byte'.
        };
};
## COPYRIGHT (c) 1995 AT&T Bell Laboratories.
## Subsequent changes by Jeff Prothero Copyright (c) 2010-2015,
## released per terms of SMLNJ-COPYRIGHT.