# unpickler.pkg
# Compiled by:
#
src/lib/compiler/src/library/pickle.lib# This is the new "generic" unpickle facility.
#
# It replaces Andrew Appel's original "shareread" module.
#
# The main difference is that instead of using a "universal"
# type together with numerous injections and projections
# it uses separate maps.
#
# This approach proves to be a lot more lightweight.
#
# The benefits are:
#
# - No projections, hence no "match nonexhaustive" warnings and...
#
# - No additional run-time overhead (all checking is done during
# the map membership test which is common to both implementations)
#
# - No necessity for those many tiny "fn"-functions that litter the
# original code, resulting in...
#
# - A more "natural" style for writing the actual unpickling code
# that makes for shorter source code
#
# - A lot less generated machine code (less than 50% of the original
# version)
#
# - Slightly faster operation (around 15% speedup)
# (My guess is that it is a combination of fewer projections and
# fewer generated thunks that make the code run faster.)
#
# July 1999, Matthias Blume
#
# We now use the high bit in char codes of shareable nodes to indicate
# that actual sharing has occurred. If the high bit is not set, we
# no longer bother to insert the node into its sharing map. This
# improves unpickling speed (e.g., for autoloading) by about 25% and
# saves tons of memory.
#
# October 2000, Matthias Blume
### "If we wish to count lines of code,
### we should not regard them as lines
### *produced* but as lines *spent*."
###
### -- Edsger J Dijkstra
api Unpickler {
#
exception FORMAT;
Sharemap(V); # One for each type. THIS IS MUTABLE STATE!
Unpickler; # Encapsulates unpickling state.
# Make a type-specific sharing map using "make_map".
#
# Be sure to create such maps only locally, otherwise you have a
# space leak.
#
# The Mythryl type system will prevent you from accidentially using the
# same map for different types, so don't worry. But using TOO MANY
# maps (i.e., more than one map for the same type) will likely
# cause problems because the unpickler might try to get for a
# back reference that is in a different map than the one where the
# value is actually registered.
#
# By the way, this warning is not unique to the many-maps approach.
# The same thing would happen with the original "universal domain"
# unpickler if you declare two different constructors for the
# same type. Given that there are about 100 types (and thus
# 100 constructors or maps) in the Mythryl dictionary pickler,
# the possibility for such a mistake is not to be dismissed.
make_sharemap: Void -> Sharemap(V);
Pickle_Reader(V)
=
Void -> V;
# A "charstream" is the mechanism that gets actual characters from
# the pickle. For ordinary pickles, the unpickler will never call
# "seek". Moreover, the same is true if you read the pickles created
# by pickleN sequentially from the first to the last (i.e., not
# "out-of-order"). "currentPosition" determines the current position
# and must be implemented.
Charstream
=
{ getchar: Void -> Char, # Read and return next char, advancing charstream cursor.
peekchar: Int -> Char, # Read ith-next char, leaving charstream cursor unchanged.
seek: Int -> Void, # Set charstream cursor to given value.
tell: Void -> Int # Return charstream cursor.
};
# Construct a Charstream which on successive calls
# returns successive chars from given string:
#
make_charstream_for_string
:
String -> Charstream; # The string is the pickle string.
# make_enhanced_charstream_for_string is a souped-up version of make_charstream_for_string:
# It takes a function to produce (and re-produce) the pickle string
# on demand and returns the actual charstream together with a
# "deleter" -- a function to let go of the pickle string.
# When suspended unpickling resumes after the string got deleted,
# the charstream will automatically re-fetch the pickle string
# using the function provided:
make_enhanced_charstream_for_string
:
( Null_Or( String ),
(Void -> String)
)
->
{ charstream: Charstream,
clear_pickle_cache: Void -> Void
};
# Open the unpickling session - everything is parameterized by this;
# the charstream provides the bytes of the pickle:
make_unpickler
:
Charstream -> Unpickler;
# The typical style is to write a "read_<my_type>" function for each type.
#
# The read function uses a local read_value_of_my_type function which takes the
# first character of a pickle (this is usually the discriminator that
# was given to @@@ or % in the pickler) and returns the unpickled
# value. The function recursively calls other "reader" functions.
#
# To actually get the value from the pickle, pass the read_value_of_my_type
# to 'read_sharable_value' -- together with the current unpickler and the
# type-appropriate sharemap. 'read_sharable_value' will take care of
# back-references (using the sharemap) and pass the first character to your read_value_of_my_type
# when necessary. The standard pattern for writing a "read_<my_type>"
# therefore is:
#
# unpickler = unpickler::make_unpickler pickle
#
# fun read_sharable_value sharemap read_value
# =
# unpickler::read_sharable_value unpickler sharemap read_value
#
# <my_type>_sharemap = unpickler::make_sharemap ()
#
# fun read_<my_type> ()
# =
# read_sharable_value <my_type>_sharemap read_<my_type>'
# where
# fun read_<my_type>' 'a' => ... # Case a
# read_<my_type>' 'b' => ... # Case b
# ...
# _ => raise unpickler::FORMAT
# end;
# end;
#
read_sharable_value
:
Unpickler -> Sharemap(V) -> (Char -> V) -> V;
# If you know that you don't need a map because there can be no
# sharing, then you can use "read_unsharable_value" instead of "read_sharable_value".
read_unsharable_value
:
Unpickler -> (Char -> V) -> V;
# Making readers for some common types:
read_int: Unpickler -> Pickle_Reader( Int );
read_int1: Unpickler -> Pickle_Reader( one_word_int::Int );
read_unt: Unpickler -> Pickle_Reader( Unt );
read_unt1: Unpickler -> Pickle_Reader( one_word_unt::Unt );
read_bool: Unpickler -> Pickle_Reader( Bool );
read_string: Unpickler -> Pickle_Reader( String );
# Readers for parametric types need their own map:
#
read_list: Unpickler -> Sharemap( List( V ) ) -> Pickle_Reader(V) -> Pickle_Reader( List(V) );
read_null_or: Unpickler -> Sharemap( Null_Or(V) ) -> Pickle_Reader(V) -> Pickle_Reader( Null_Or(V) );
read_pair: Unpickler -> Sharemap( (X, Y) ) -> (Pickle_Reader(X), Pickle_Reader(Y)) -> Pickle_Reader( (X, Y) );
# The laziness generated here is in the unpickling.
# In other words unpickling state is not discarded
# until the last lazy value has been forced.
read_lazy: Unpickler -> Pickle_Reader(X) -> Pickle_Reader(Void -> X);
};
stipulate
package im = int_red_black_map; # int_red_black_map is from
src/lib/src/int-red-black-map.pkgherein
package unpickler
: Unpickler # Unpickler is from
src/lib/compiler/src/library/unpickler.pkg {
exception FORMAT;
Sharemap(V)
=
Ref( im::Map( (V, Int) ) );
Pickle_Reader(V) = Void -> V;
Charstream
=
{ getchar: Void -> Char,
peekchar: Int -> Char,
seek: Int -> Void,
tell: Void -> Int
};
Unpickler
=
{ string_sharemap: Sharemap( String ),
charstream: Charstream
};
#
fun make_sharemap ()
=
REF im::empty;
#
fun make_charstream_for_string pickle_string
=
# Construct a Charstream which on successive calls
# returns successive chars from given string:
#
{ getchar, peekchar, seek, tell }
where
position = REF 0;
#
fun getchar ()
=
{ old_position = *position;
position := old_position + 1;
string::get_byte_as_char (pickle_string, old_position)
except
INDEX_OUT_OF_BOUNDS = raise exception FORMAT;
};
#
fun peekchar i
=
string::get_byte_as_char (pickle_string, *position + i)
except
INDEX_OUT_OF_BOUNDS = raise exception FORMAT;
#
fun seek new_position
=
position := new_position;
#
fun tell ()
=
*position;
end;
#
fun make_enhanced_charstream_for_string # This fn is called (only) from:
src/app/makelib/freezefile/freezefile-g.pkg ( initial_pickle,
fetch_string
)
=
{ clear_pickle_cache,
#
charstream => { getchar, peekchar, seek, tell }
}
where
position = REF 0; # Tracks next char to read within initial_pickle.
pickle_string_ref
=
REF initial_pickle;
#
fun grab_string ()
=
case *pickle_string_ref
#
THE string => string;
#
NULL =>
{ string = fetch_string ();
pickle_string_ref := THE string;
string;
};
esac;
#
fun clear_pickle_cache ()
=
pickle_string_ref := NULL;
#
fun getchar ()
=
{ s = grab_string ();
p = *position;
position := p + 1;
string::get_byte_as_char (s, p)
except
INDEX_OUT_OF_BOUNDS = raise exception FORMAT;
};
#
fun peekchar i
=
{ s = grab_string ();
p = *position;
string::get_byte_as_char (s, p+i)
except
INDEX_OUT_OF_BOUNDS = raise exception FORMAT;
};
#
fun seek new_position
=
position := new_position;
#
fun tell ()
=
*position;
end; # fun make_enhanced_charstream_for_string
stipulate
fun from_any_int byte_source ()
#
# Read from byte_source an int encoded
# in our indefinite-precision expanding-opcode
# style bytestream encoding. The basic idea
# here is:
#
# o The high bit is 1 on all bytes except the last.
# o On the last byte the second-highest bit gives the sign.
# o The remaining bits all get spliced together in sequence
# to generate the int result, last byte holding lowest bits.
=
loop 0u0
where
& = one_byte_unt::bitwise_and;
#
infix my & ;
to_large = one_byte_unt::to_large_unt;
# one_byte_unt is from
src/lib/std/one-byte-unt.pkg # byte is from
src/lib/std/src/byte.pkg #
fun loop n
=
{ w8 = byte::char_to_byte (byte_source ());
#
if ((w8 & 0u128) == 0u0) (n * (one_word_unt::from_int 64) + to_large (w8 & 0u63), (w8 & 0u64) != 0u0);
else loop (n * (one_word_unt::from_int 128) + to_large (w8 & 0u127));
fi;
};
end;
#
fun from_large_unt convert byte_source ()
=
case (from_any_int byte_source ())
#
(w, FALSE) => convert w except _ = raise exception FORMAT;
_ => raise exception FORMAT;
esac;
#
fun from_large_int convert byte_source ()
=
{ my (wpos, negative)
=
from_any_int byte_source ();
# Negation must be done in unsigned domain
# to prevent overflow on min_int.
#
# For the same reason we must
# then use to_int_x.
w = negative ?? 0u0 - wpos
:: wpos;
i = large_unt::to_multiword_int_x w;
convert i except _ = raise exception FORMAT;
};
herein
from_int = from_large_int int::from_multiword_int;
from_int1 = from_large_int one_word_int::from_multiword_int;
from_unt = from_large_unt unt::from_large_unt;
from_unt1 = from_large_unt one_word_unt::from_large_unt;
end;
#
fun make_unpickler charstream
=
{ string_sharemap => make_sharemap (),
charstream
}
: Unpickler;
#
fun read_sharable_value
(unpickler: Unpickler)
shared_values_of_my_type
read_value_of_my_type
=
# The main duty of 'read_sharable_value' is to factor
# off from the rest of the unpickling code the job
# of dealing with references to shared values
# -- values referenced by more than one pointer
# in the pickled datastructure.
#
# Such shared values are flagged in the pickle
# bytestream by a 0xFF byte (255 decimal)
# followed by an integer backreference to the
# offset in the pickle at which the actual
# value may be found.
#
# The first time we unpickle such a shared value,
# we salt it away in our shared_values_of_my_type.
#
# When we encounter subsequent backreferences to
# that particular value, we can then just fish
# the previously-unpickled value out of our
# shared_values_of_my_type and return it, thus
# re-establishing the desired value sharing.
#
{
fun first_time (position, c)
=
# Caller guarantees that character 'c'
# has the high bit set:
#
{ v = read_value_of_my_type (char::from_int (char::to_int c - 128));
position' = unpickler.charstream.tell ();
shared_values_of_my_type
:=
im::set (*shared_values_of_my_type, position, (v, position'));
v;
};
c = unpickler.charstream.getchar ();
if (char::to_int c < 128)
#
# High-bit is not set, so this is not a shared node.
# Therefore, it can't possibly be in the map, and
# we can call read_value_of_my_type directly.
read_value_of_my_type c;
else
if (c == '\xff')
#
position = from_int unpickler.charstream.getchar ();
case (im::get (*shared_values_of_my_type, position))
#
THE (v, _) => v;
#
NULL
=>
{ here = unpickler.charstream.tell ();
unpickler.charstream.seek position;
# It is ok to use "getchar" here because
# there won't be back-references that jump
# to other back-references.
# (Since we are jumping to something that
# was shared, it has the high-bit set, so
# calling "firstTime" is ok.)
first_time (position, unpickler.charstream.getchar())
then
unpickler.charstream.seek here;
};
esac;
else
position = unpickler.charstream.tell () - 1; # Must subtract one to get back in front of c.
case (im::get (*shared_values_of_my_type, position))
#
THE (v, position')
=>
{ unpickler.charstream.seek position';
v;
};
NULL => first_time (position, c);
esac;
fi;
fi;
};
# "read_unsharable_value" gets around backref detection. Certain integer
# encodings may otherwise be mis-identified as back references.
# Moreover, unlike in the case of "read_sharable_value" we don't need a map
# for "read_unsharable_value". This could be used as an optimization for
# types that are known to never be shared anyway (Bool, etc.).
#
fun read_unsharable_value (unpickler: Unpickler) f
=
f (unpickler.charstream.getchar ());
stipulate
fun f2r from_x (unpickler: Unpickler) # "f2r" is probably "from_to_read" (converts a 'from_x' fn to 'read_x' fn).
=
from_x unpickler.charstream.getchar;
herein
read_int = f2r from_int;
read_int1 = f2r from_int1;
read_unt = f2r from_unt;
read_unt1 = f2r from_unt1;
end;
# read_lazy advances 'currentPosition' as though it
# had read the next value, but does not actually
# do so. Instead, it returns a memoized thunk
# which will do so at need:
#
fun read_lazy unpickler read_value_of_my_type ()
=
{ memo = REF (\\ () = raise exception DIE "unpickler::readLazy");
unpickler -> { charstream => { tell, seek, ... }, ... };
# The size may have leading 0s because of padding,
# so loop reading integers until we get a nonzero
# value to return:
#
fun get_size ()
=
{ size = read_int unpickler ();
if (size == 0 ) get_size ();
else size; fi;
};
# Read size of value to
# read lazily:
size = get_size (); # size of v
# Remember position at which
# lazily read value starts,
# so we can come back later
# and actually read it:
start = tell (); # start of v
# Construct a thunk to do the real
# (lazily delayed) read of the
# value, when it comes time to do so.
#
# To do this we need to 'seek' back to
# the proper position in the stream,
# but to maintain sanity (since this
# will happen at an unpredictable
# time) we write it to save and
# restore the 'tell' ('current_position') value
# in force at the time of thunk
# invocation.
#
# We also memo-ize our thunk to cache
# the value lazily read, so that on
# second and subsequent thunk invocations,
# we can immediately return the cached
# value rather than actually having to
# do the complete read again.
#
fun thunk ()
=
{ initial_position = tell (); # Remember prevailing stream position.
seek start; # Reset stream to start of our lazily-read value.
v = read_value_of_my_type (); # Do the actual read of lazily-read value.
#
seek initial_position; # Restore prevailing stream position.
memo := (\\ () = v); # Set memo to return cached value instead of re-reading.
v; # Return lazily read value.
};
memo := thunk; # Set up lazy read of value on initial thunk invocation.
seek (start + size); # Advance stream position if though we had read the value
{. *memo (); }; # Return thunk that will do the lazy read on request.
};
#
fun read_list unpickler sharemap_for_my_type read_value_of_my_type ()
=
read_sharable_value unpickler sharemap_for_my_type readlist
where
fun read_chops ()
=
read_sharable_value unpickler sharemap_for_my_type readchopslist
where
fun readchopslist 'N' => [];
readchopslist 'C' => read_value_of_my_type ()
! read_value_of_my_type ()
! read_value_of_my_type ()
! read_value_of_my_type ()
! read_value_of_my_type ()
! read_chops ();
readchopslist _ => raise exception FORMAT;
end;
end;
#
fun readlist '0' => [];
readlist '1' => [read_value_of_my_type ()];
readlist '2' => [read_value_of_my_type (), read_value_of_my_type ()];
readlist '3' => [read_value_of_my_type (), read_value_of_my_type (), read_value_of_my_type ()];
readlist '4' => [read_value_of_my_type (), read_value_of_my_type (), read_value_of_my_type (), read_value_of_my_type ()];
readlist '5' => read_chops ();
readlist '6' => read_value_of_my_type () ! read_chops ();
readlist '7' => read_value_of_my_type () ! read_value_of_my_type () ! read_chops ();
readlist '8' => read_value_of_my_type () ! read_value_of_my_type () ! read_value_of_my_type () ! read_chops ();
readlist '9' => read_value_of_my_type () ! read_value_of_my_type () ! read_value_of_my_type () ! read_value_of_my_type () ! read_chops ();
readlist _ => raise exception FORMAT;
end;
end;
#
fun read_null_or unpickler sharemap_for_my_type read_value_of_my_type ()
=
read_sharable_value unpickler sharemap_for_my_type read_null_or'
where
fun read_null_or' 'n' => NULL;
read_null_or' 's' => THE (read_value_of_my_type ());
read_null_or' _ => raise exception FORMAT;
end;
end;
#
fun read_pair unpickler sharemap_for_my_type (read_a, read_b) ()
=
read_sharable_value unpickler sharemap_for_my_type read_pair'
where
fun read_pair' 'p' => (read_a (), read_b ());
read_pair' _ => raise exception FORMAT;
end;
end;
#
fun read_bool unpickler ()
=
read_unsharable_value unpickler read_bool'
where
fun read_bool' 't' => TRUE;
read_bool' 'f' => FALSE;
read_bool' _ => raise exception FORMAT;
end;
end;
#
fun read_string unpickler ()
=
read_sharable_value unpickler string_sharemap read_string'
where
unpickler
->
{ string_sharemap,
charstream => { getchar, ... }
};
#
fun read_string' '"'
=>
loop ([], getchar ())
where
fun loop (l, '"') => string::implode (reverse l);
loop (l, '\\') => loop (getchar () ! l, getchar ());
loop (l, c) => loop (c ! l, getchar ());
end;
# This is a bloody st00pid way to un/pickle strings! The '\' stuff is
# fine for human-readable/human-editable sourcecode, but for mechanical
# processing it is much better to just put a length up front and then
# read that many bytes at one go. XXX BUGGO FIXME. -- 2011-01-18 CrT
end;
read_string' _
=>
raise exception FORMAT;
end;
end;
};
end;