src/lib/core/init/protostring.pkg

15.4.726 src/lib/core/init/protostring.pkg

## protostring.pkg

# Compiled by:
#     src/lib/core/init/init.cmi

# Some common operations that are used by
# both the 'string' and 'substring' packages.

stipulate
    package it  =  inline_t;                                    # inline_t              is from   src/lib/core/init/built-in.pkg
    package rt  =  runtime;                                     # runtime               is from   src/lib/core/init/built-in.pkg.
    #
    include package   base_types;                                       # base_types            is from   src/lib/core/init/built-in.pkg
    #
    infix  my 80  * / %  mod  div ;
    infix  my 70 $ ^ + - ;
    infixr my 60 . ! @ << >> >>> ;
    infix  my 50 > < >= <= == != ;
    infix  my 40 := o ;
    infix  my 10 then ;
    #
herein

    # This package is referenced in:
    #
    #     src/lib/core/init/init.cmi
    #
    package protostring {
        #   ===========
        #
        stipulate
            #
            include package   proto_pervasive;                  # proto_pervasive       is from   src/lib/core/init/proto-pervasive.pkg

            package c = it::char;                               # inline_t              is from   src/lib/core/init/built-in.pkg

            (+)    = it::default_int::(+);
            (-)    = it::default_int::(-);
            (*)    = it::default_int::(*);
            (quot) = it::default_int::quot;
            (<)    = it::default_int::(<);
            (<=)   = it::default_int::(<=);
            (>)    = it::default_int::(>);
            (>=)   = it::default_int::(>=);
      #     (==)   = it::(==);

            unsafe_get = it::vector_of_chars::get_byte_as_char;
            unsafe_set = it::vector_of_chars::set_char_as_byte;
            unsafe_create = rt::asm::make_string;               # "rt" == "runtime"

            max_size =  core::maximum_vector_length;

            size = it::vector_of_chars::length;

        herein

            # Allocate an uninitialized string of given length (with a size check)

            fun create n
                =
                if (it::default_int::ltu (max_size, n))
                    raise exception core::SIZE;
                else
                    unsafe_create n;
                fi;

            # A vector of single character strings                                              # NB: 'chars' is (also) used as ps::chars by   fun str  in  src/lib/core/init/pervasive.pkg
            #
            chars = rt::asm::make_typeagnostic_ro_vector  (c::max_ord+1,  next 0)               # "rt" == "runtime".
                    where
                        fun next i
                            =
                            if (i <= c::max_ord)
                                #
                                s = unsafe_create 1;

                                unsafe_set (s, 0, c::chr i);

                                s ! next (i+1);
                            else
                                [];
                            fi;
                    end;

            fun unsafe_substring (_, _, 0)
                    =>
                    "";

                unsafe_substring (s, i, 1)
                    =>
                    it::poly_vector::get (chars, it::cast (unsafe_get (s, i)));

                unsafe_substring (s, i, n)
                    =>
                    ss
                    where
                        ss =  unsafe_create  n;

                        fun copy j
                            =
                            if (j != n)
                                #
                                unsafe_set (ss, j, unsafe_get (s, i+j));
                                copy (j+1);
                            fi;

                        copy 0;
                    end;
            end;

            # Concatenate a pair of non-empty strings:
            #
            fun meld2 (x, y)
                =
                ss
                where
                    xl = size x;
                    yl = size y;
                    ss = create (xl+yl);

                    fun copyx n
                        =
                        if (n != xl)
                            #
                            unsafe_set (ss, n, unsafe_get (x, n));
                            copyx (n+1);
                        fi;

                    fun copyy n
                        =
                        if (n != yl)
                            #
                            unsafe_set (ss, xl+n, unsafe_get (y, n));
                            copyy (n+1);
                        fi;

                    copyx 0;
                    copyy 0;
                end;

            # Given a reverse order list of strings
            # and a total length, return the
            # concatenation of the list.
            #
            fun rev_meld (0, _)
                    =>
                    "";

                rev_meld (1, lst)
                    =>
                    find lst
                    where
                        fun find ("" ! r) =>   find r;
                            find (s !  _) =>   s;
                            find _        =>   "";                      # * impossible *
                        end;
                    end;

                rev_meld (tot_len, lst)
                    =>
                    ss
                    where

                        ss = create tot_len;

                        fun copy ([], _)
                                =>
                                ();

                           copy (s ! r, i)
                                =>
                                {   len = size s;
                                    i = i - len;

                                    fun copy' j
                                        =
                                        if (j != len)
                                            #
                                            unsafe_set (ss, i+j, unsafe_get (s, j));
                                            copy'(j+1);
                                        fi;

                                    copy' 0;
                                    copy (r, i);
                                };
                        end;

                        copy (lst, tot_len);
                    end;
            end;

            # Map a translation function across
            # the characters of a substring:
            #
            fun translate (tr, s, i, n)
                =
                map_list (i, 0, [])
                where
                    stop = i+n;

                    fun map_list (j, result_length, result_list)
                        =
                        if   (j < stop)

                             s' = tr (unsafe_get (s, j));

                             map_list   (j + 1,   result_length + size s',   s' ! result_list);
                        else
                             rev_meld (result_length, result_list);
                        fi;
                  end;

            # Implode a non-empty list of characters
            # into a string where both the length and
            # list are given as arguments:
            #
            fun implode (len, cl)
                =
                ss
                where
                    ss = create len;
                    #
                    fun copy ([],    _) =>   ();
                        #
                        copy (c ! r, i)
                            =>
                            {   it::vector_of_chars::set_char_as_byte (ss, i, c);
                                #
                                copy (r, i+1);
                            };
                    end;

                    copy (cl, 0);
                end;

            # Implode a reversed non-empty list of characters
            # into a string where both the length and list
            # are given as arguments:
            #
            fun rev_implode (len, cl)
                =
                ss
                where
                    ss =  create len;
                    #
                    fun copy ([],    _) =>  ();
                        #
                        copy (c ! r, i) =>  {   it::vector_of_chars::set_char_as_byte (ss, i, c);
                                                #
                                                copy (r, i - 1);
                                            };
                    end;

                    copy (cl, len - 1);

                end;

            fun is_prefix (s1, s2, i2, n2)
                =
                (n2 >= n1  and  eq (0, i2))
                where
                    n1 = size s1;

                    fun eq (i, j)
                        =
                        i >= n1
                        or
                        (   unsafe_get (s1, i) == unsafe_get (s2, j)
                            and
                            eq (i+1, j+1)
                        );

                end;

            fun compare_sequences compare_fn (s1, i1, n1, s2, i2, n2)
                =
                compare' 0
                where
                    my (n, order)
                        =
                        if   (n1 == n2)  (n1, EQUAL);
                        elif (n1 <  n2)  (n1, LESS);
                        else             (n2, GREATER);
                        fi;

                    fun compare' i
                        =
                        if   (i == n)
                             order;
                        else
                             c1 = unsafe_get (s1, i1+i);
                             c2 = unsafe_get (s2, i2+i);

                             case (compare_fn (c1, c2))

                                  EQUAL =>   compare' (i+1);
                                  order =>   order;
                             esac;
                        fi;

                end;

            fun compare (s1, i1, n1, s2, i2, n2)
                =
                compare_sequences compare_fn (s1, i1, n1, s2, i2, n2)
                where
                    fun compare_fn (c1, c2)
                        =
                        if   (c1 == c2)         EQUAL;
                        elif (c::(>) (c1, c2))  GREATER;
                        else                    LESS;
                        fi;

                end;

            # Knuth-Morris-Pratt String Matching
            #
            # knuth_morris_pratt_string_match:  String -> (String, Int, Int) -> Int
            #
            # Find the first string within the second, starting at and
            # ending before the given positions.
            # Return the starting position of the match
            # or the given ending position if there is no match.
            #
            # For background see (e.g.)
            #
            #     https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
            #
            fun knuth_morris_pratt_string_match  pattern
                =
                {   psz  =  size pattern;
                    next =  it::poly_rw_vector::make_nonempty_rw_vector (psz, -1);

                    fun pat x =  unsafe_get (pattern, x);
                    fun nxt x =  it::poly_rw_vector::get (next, x);

                    fun setnxt (i, x)
                        =
                        it::poly_rw_vector::set (next, i, x);

                    # We're trying to fill 'next' at position p (> 0) and higher;
                    # invariants: x >= 0
                    #             pattern[0..x) == pattern[p-x..p)
                    #             for i in [0..p) :
                    #                 pattern[i] <> pattern[next[i]]
                    #                 pattern[0..next[i]) = pattern[i-next[i]..i)
                    fun fill (p, x)
                        =
                        if   (p >= psz)        ();                                              # Reached end of 'pattern' so we're done.
                        elif (pat x == pat p)  dnxt (p,  nxt x,      x + 1);
                        else                   dnxt (p,      x,  nxt x + 1);
                        fi

                    also
                    fun dnxt (p, x, y)
                        =
                        {   setnxt (p, x);
                            #
                            fill (p + 1, y);
                        };

                    # Once 'next' has been initialized, it serves the following purpose:
                    #
                    # Suppose we are looking at text position t and pattern position p.
                    #
                    # This means that all pattern positions < p have already
                    # matched the text positions that directly precede t.
                    #
                    # Now, if the text[t] matches pattern[p], then we simply advance
                    # both t and p and try again.
                    #
                    # However, if the two do not match, then we simply
                    # try t again, but this time with the pattern position
                    # given by next[p].
                    #
                    # Success is when p reaches the end of the pattern.
                    # Failure is when t reaches the end of the text
                    # without p  having reached the end of the pattern.
                    #
                    fun search (text, start, stop)
                        =
                        loop (0, start)
                        where
                            fun txt x
                                =
                                unsafe_get (text, x);

                            fun loop (p, t)
                                =
                                if   (p >= psz               )   t-psz;                         # SUCCESS:  Complete pattern matched.  Return byte offset of start of patternmatch in 'text'.
                                elif (t >= stop              )   stop;                          # FAILURE:  Reached end of 'text' (or at least the part of it we're allowed to search) without finding a match.  Return ending position of search in 'text'.
                                elif (p < 0 or pat p == txt t)   loop (p+1, t+1);               # Pattern[p] == text[t] so advance p and t by one.
                                else                             loop (nxt p, t);               # Pattern[p] != text[t] so try next possible match of pattern against text[t].
                                fi;
                        end;

                   fill (1, 0);

                   search;
                };

            fun knuth_morris_pratt_string_match_backward  pattern                               # Same as above, but searching backward instead of forward.
                =                                                                               # We build/use 'next' identically to the above, but tweak 'pat' and 'txt' to read 'pattern' and 'text' back-to-front instead of front-to-back.
                {   psz  =  size pattern;
                    next =  it::poly_rw_vector::make_nonempty_rw_vector (psz, -1);

                    psz1 =  psz - 1;

                    fun pat x =  unsafe_get (pattern, psz1 - x);                                # Change from above:  'psz1 - x' replaces 'x'.
                    fun nxt x =  it::poly_rw_vector::get (next, x);

                    fun setnxt (i, x)
                        =
                        it::poly_rw_vector::set (next, i, x);

                    fun fill (p, x)
                        =
                        if   (p >= psz)        ();
                        elif (pat x == pat p)  dnxt (p,  nxt x,      x + 1);
                        else                   dnxt (p,      x,  nxt x + 1);
                        fi

                    also
                    fun dnxt (p, x, y)
                        =
                        {   setnxt (p, x);
                            #
                            fill (p + 1, y);
                        };

                    fun search (text, start, stop)                                              # Here we expect to have  start > stop  because we're searching BACKWARD through 'text' from 'start' to 'stop'.
                        =
                        loop (0, start)
                        where
                            tsz   = size text;
                            tsz1  = tsz - 1;

                            start = tsz1 - start - psz1;                                        # Change from above:  'tsz1 - start - psz1' replaces 'start'.
                            stop  = tsz1 - stop;                                                # Change from above:  'tsz1 - stop'         replaces 'stop'.

                            fun txt x
                                =
                                unsafe_get (text, tsz1 - x);                                    # Change from above:  'tsz1 - x' replaces 'x'.

                            fun loop (p, t)
                                =
                                if   (p >= psz               )   tsz  - t;                      # Change from above:  'tsz  - t'        replaces '(t-psz)'.    (psz goes away because we still want to return the index of the leftmost byte in the match.)
                                elif (t >= stop              )   tsz1 - stop;                   # Change from above:  'tsz1 - (t-stop)' replaces 'stop'.
                                elif (p < 0 or pat p == txt t)   loop (p+1, t+1);
                                else                             loop (nxt p, t);
                                fi;

                        end;

                   fill (1, 0);

                   search;
                };

        end;                                                            # stipulate
    };                                                                  # package protostring
end;

Comments and suggestions to: bugs@mythryl.org