src/lib/std/src/string.api

15.3.549 src/lib/std/src/string.api

## string.api
#
# Basic string ops.
#
# See also:
#     src/lib/std/src/string-junk.api

# Compiled by:
#     src/lib/std/src/standard-core.sublib

###     "The string is a stark data package and
###      everywhere it is passed there is duplication.
###      It is a perfect vehicle for hiding information."
###
###                               -- Alan J Perlis

# This api is implemented in:
#
#     src/lib/std/src/string-guts.pkg
#
api String {
    #
    # Signature of the basic string package:  byte-level and UTF-8-aware access,
    # construction, splitting, searching, comparison and case conversion, plus
    # tab/control-char expansion support used when displaying text on screen.
    #
    eqtype Char;
    eqtype String;

    maximum_vector_length:  Int;

    length_in_bytes:     String -> Int;                                         # E.g.:   length_in_bytes "abcdef"                  -->  6
    length_in_chars:     String -> Int;                                         # String should be 7-bit ascii or UTF-8. Returns number of bytes in string not matching 10xxxxxx.
    prefix_length_in_bytes:
                        (String, Int) -> Int;                                   # Given string and prefix length in chars, return prefix length in bytes.

    get_byte:           (String, Int) -> Int;                                   # E.g.:   string::get_byte         ("abcdef", 0)    -->  97
    get_byte_as_char:   (String, Int) -> Char;                                  # E.g.:   string::get_byte_as_char ("abcdef", 0)    -->  'a'
    get_char_as_int:    (String, Int) -> (Int, Int);                            # First result is utf-8 char starting at given byte offset in string (might occupy 1-6 bytes).  Second result is next byte offset to read (== original offset + char-length-in-bytes.) For UTF-8 encoding background see (e.g.) http://www.cl.cam.ac.uk/~mgk25/ucs/man-utf-8.html
    get_char_bytecount: (String, Int) -> Int;                                   # Returns number of bytes used to encode UTF-8 char at given byte offset in string.
#   (_[]):              (String, Int) -> Char;                                  # Note:   The (_[])   enables   'vec[index]'  notation;

    extract:            (String, Int, Null_Or( Int )) -> String;                # E.g.:   extract ("abcdef", 2, NULL)               -->   "cdef"
                                                                                # E.g.:   extract ("abcdef", 2, THE 1)              -->  "c"                    # Int args are (byteoffset, length).
    substring:          (String, Int, Int) -> String;                           # E.g.:   substring ("abcdef", 1, 4);               -->   "bcde"                # Int args are (byteoffset, length).

    +  :                (String, String) -> String;                             # E.g.:   "abc" + "def"                             -->   "abcdef"
    cat:                List( String ) -> String;                               # E.g.:   cat                   ["an", "example"]   -->   "anexample"
    join:               String -> List( String ) -> String;                     # E.g.:   join  " "             ["an", "example"]   -->   "an example"
    join':              String -> String -> String -> List(String) -> String;   # E.g.:   join'  "("  ", "  ")" ["an", "example"]   -->   "(an, example)"
    from_char:          Char   -> String;                                       # E.g.:   from_char 'a'                             -->   "a"
    implode:            List(Char) -> String;                                   # E.g.:   implode ['a', 'b', 'c']                   -->   "abc"
    explode:            String -> List(Char);                                   # E.g.:   explode "abc"                             -->   ['a', 'b', 'c']
    chomp:              String -> String;                                       # E.g.:   chomp "abc\n"                             -->   "abc"    (Drops trailing newline if present, else is a no-op.)
    map:                (Char -> Char) -> String -> String;                     # E.g.:   string::map char::to_upper "abc"          -->   "ABC"
    repeat:             (String, Int) -> String;                                # E.g.:   repeat ("abc", 2)                         -->   "abcabc"

    translate:          (Char -> String) -> String -> String;                   # NOTE(review): presumably replaces each char by the string the fn returns for it and concatenates the results, as in SML Basis String.translate -- confirm in src/lib/std/src/string-guts.pkg
    tokens:             (Char -> Bool) -> String -> List(String);               # E.g.:   string::tokens {. #c == ','; } "a,b,,c";  -->   ["a", "b", "c"]
    fields:             (Char -> Bool) -> String -> List(String);               # E.g.:   string::fields {. #c == ','; } "a,b,,c";  -->   ["a", "b", "", "c"]
    lines:                                String -> List(String);               # E.g.:   string::lines                  "a\nb\n\nc";  -->   ["a\n", "b\n", "\n", "c"]

    longest_common_prefix:      (String, String) -> String;                     # Return the longest common prefix of two strings.

    drop_leading_whitespace:    String -> String;                               # Drop all leading  chars which satisfy char::is_space().
    drop_trailing_whitespace:   String -> String;                               # Drop all trailing chars which satisfy char::is_space().
                                                                                # See also  trim  in   src/lib/std/src/string-junk.api

    is_prefix:          String -> String -> Bool;                               # Is String1 a prefix of String2?
    is_substring:       String -> String -> Bool;                               # NOTE(review): presumably "Is String1 a substring of String2?", by analogy with is_prefix -- confirm argument order in string-guts.pkg
    is_suffix:          String -> String -> Bool;                               # NOTE(review): presumably "Is String1 a suffix of String2?", by analogy with is_prefix -- confirm argument order in string-guts.pkg

    find_substring:     String -> String        -> Null_Or( Int );              # Knuth-Morris-Pratt string search.  Find first string arg in second.                                 Return byte offset of match else NULL.
    find_substring':    String -> (String, Int) -> Null_Or( Int );              # Knuth-Morris-Pratt string search.  Find first string arg in second, starting at given byte offset.  Return byte offset of match else NULL.
                                                                                # NB: The curried form of the above two fns allows the setup work for the pattern string to be re-used over multiple search strings.
    find_substring_backward:    String -> String        -> Null_Or( Int );      # These two are just like previous two, but searching backward instead of forward.
    find_substring_backward':   String -> (String, Int) -> Null_Or( Int );      #

    compare:            (String, String) -> Order;
    compare_sequences:  ((Char, Char) -> Order) -> (String, String) -> Order;

    to_lower:           String -> String;                                       # "THIS_is_tExt" -> "this_is_text"
    to_upper:           String -> String;                                       # "THIS_is_tExt" -> "THIS_IS_TEXT"
    to_mixed:           String -> String;                                       # "THIS_is_tExt" -> "This_Is_Text"

    has_alpha:          String -> Bool;                                         # fun has_alpha string =   list::exists  char::is_alpha  (explode string);
    has_lower:          String -> Bool;                                         # fun has_lower string =   list::exists  char::is_lower  (explode string);
    has_upper:          String -> Bool;                                         # fun has_upper string =   list::exists  char::is_upper  (explode string);

    is_alpha:           String -> Bool;                                         # fun is_alpha  string =   length string > 0   and   list::all  char::is_alpha  (explode string);
    is_upper:           String -> Bool;                                         # fun is_upper  string =   length string > 0   and   list::all  char::is_upper  (explode string);
    is_lower:           String -> Bool;                                         # fun is_lower  string =   length string > 0   and   list::all  char::is_lower  (explode string);
    is_mixed:           String -> Bool;                                         # fun is_mixed  string =   is_alpha string  and  has_upper string  and  has_lower string;
    is_ascii:           String -> Bool;                                         # TRUE iff all bytes in string have high bit equal to zero.

    # For is_alpha/is_space/etc predicates on
    # individual chars in a string see:
    #
    #     src/lib/std/src/string-chartype.api

    <  : (String, String) -> Bool;
    <= : (String, String) -> Bool;
    >  : (String, String) -> Bool;
    >= : (String, String) -> Bool;

    from_string:        String -> Null_Or( String );
    to_string:          String ->          String;
    from_cstring:       String -> Null_Or( String );
    to_cstring:         String ->          String;

    byte_offset_of_ith_char: (String, Int) -> Null_Or(Int);                     # Scan down utf-8 encoded string looking for byte offset of first byte of i-th char.  Return NULL if string contains fewer than 'i' chars.

    # The following should perhaps be in a separate edit-support pkg, but
    # for the moment keeping them in the string pkg is easy and convenient,
    # because they use low-level unsafe ops not easily available above the
    #     src/lib/std/src/string-guts.pkg
    # level.

    utf8_to_ucs2:       String -> String;                                       # Return a string in which each char is encoded using exactly two bytes, most-significant first.  Intended primarily for use with  w2x::x::POLY_TEXT16  in  src/lib/x-kit/widget/xkit/app/guishim-imp-for-x.pkg

    expand_tabs_and_control_chars                                               # We need this to convert raw line String into something viewable on screen in   src/lib/x-kit/widget/edit/screenline.pkg
      :                                                                         # Expands tabs (on 8-char tabstops) into blanks. Expands control chars (and DEL) into ^A notation.  If necessary, blank-pad end of string so both screencol1 and screencol2 correspond to valid offsets within 'screentext'.
      { utf8text:       String,
        startcol:       Int,                                                    # Screen col for first char of 'text'. (Normally 0 for left-justified string.)
        screencol1:     Int,                                                    # Query byte-extent of this screen column in input and output strings. Use -1 if you don't care.
        screencol2:     Int,                                                    # Query byte-extent of this screen column in input and output strings. Having both screencol1 and screencol2 is helpful when displaying the selected region in  src/lib/x-kit/widget/edit/screenline.pkg
        utf8byte:       Int                                                     # Query screen-column of this byte offset into 'utf8text'.              Use -1 if you don't care.
      }
      ->
      { screentext:     String,                                                 # Expanded text.
        startcol:       Int,                                                    # Screen col for first char of any text following 'text'.  Useful when expanding multiple strings within a single line.
        #
        screentext_length_in_screencols:        Int,                            # Length in screen columns of screentext.  Because utf8 chars may occupy 1-6 bytes in utf8text but only a single screen column, tabs 1 byte in utf8text but 1-8 screen columns and control chars 1 byte in utf8text but 2 screen columns, computing this is nontrivial.

                                                                                # NB: screencol1_byteoffset_in_utf8text is not guaranteed to be a valid offset into utf8text, because screencol1 is allowed to be beyond the end of the displayed text corresponding to utf8text.
        screencol1_byteoffset_in_utf8text:      Int,                            # Byte offset in  input 'text' corresponding to screencol1.  Because utf8 chars occupy one screen column but 1-6 bytes in input, control chars 2 screen columns but one byte in input and tabs 1-8 screen columns but one byte in input, computing this is nontrivial.
        screencol1_bytescount_in_utf8text:      Int,                            # Byte length of  input 'text' char corresponding to screencol1.  This will be 1 except for multibyte utf8 chars.
        #
        screencol1_byteoffset_in_screentext:    Int,                            # Byte offset in output 'text' corresponding to screencol1.
        screencol1_bytescount_in_screentext:    Int,                            # Byte length of output 'text' char corresponding to screencol1.  This will be 1-8 for tabs, 2 for control chars, otherwise 1 except for multibyte utf8 chars.
        #
        screencol1_firstcol_on_screen:          Int,                            # Screen column at which char under cursor begins.  Note that screencol1 may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
        screencol1_colcount_on_screen:          Int,                            # Length in screen columns of char under screencol1.

                                                                                # NB: screencol2_byteoffset_in_utf8text is not guaranteed to be a valid offset into utf8text, because screencol2 is allowed to be beyond the end of the displayed text corresponding to utf8text.
        screencol2_byteoffset_in_utf8text:      Int,                            # Byte offset in  input 'text' corresponding to screencol2.  Because utf8 chars occupy one screen column but 1-6 bytes in input, control chars 2 screen columns but one byte in input and tabs 1-8 screen columns but one byte in input, computing this is nontrivial.
        screencol2_bytescount_in_utf8text:      Int,                            # Byte length of  input 'text' char corresponding to screencol2.  This will be 1 except for multibyte utf8 chars.
        #
        screencol2_byteoffset_in_screentext:    Int,                            # Byte offset in output 'text' corresponding to screencol2.
        screencol2_bytescount_in_screentext:    Int,                            # Byte length of output 'text' char corresponding to screencol2.  This will be 1-8 for tabs, 2 for control chars, otherwise 1 except for multibyte utf8 chars.
        #
        screencol2_firstcol_on_screen:          Int,                            # Screen column at which char under cursor begins.  Note that screencol2 may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
        screencol2_colcount_on_screen:          Int,                            # Length in screen columns of char under screencol2.

        utf8byte_firstcol_on_screen:            Int,                            # Screen column at which utf8text byteoffset 'utf8byte' begins.  Note that utf8byte may be (e.g.) somewhere in the middle of a tab, so computing this value is nontrivial.
        utf8byte_colcount_on_screen:            Int                             # Length in screen columns of char at utf8text byteoffset 'utf8byte'.
      };
};

## COPYRIGHT (c) 1995 AT&T Bell Laboratories.
## Subsequent changes by Jeff Prothero Copyright (c) 2010-2015,
## released per terms of SMLNJ-COPYRIGHT.

Comments and suggestions to: bugs@mythryl.org