## pickler.pkg
#
# Pickle facility.
#
#
# OVERVIEW
# ========
#
# This module contains the core functionality used for 'pickling',
# which is to say, encoding datastructures as bytestrings suitable
# for saving in disk files, sending over network connections,
# or computing message digest hashcodes.
#
# In general, our pickled representation looks a lot like code
# in a portable bytecode instruction set. It consists of opcodes
# identifying what to do (construct a particular kind of value)
# followed by databytes supplying the information needed for
# that particular operation.
#
# For simple types like integer and boolean, pickling reduces to
# just writing an opcode-like typetag byte identifying the type,
# followed by a string of one or more bytes identifying the value.
#
# For a boolean, the typetag is '-9' and the value is "t" or "f".
#
# For an integer, the typetag is '-1' and for the value we use an
# expanding-opcode style encoding in which bit 7 (the high bit)
# gives the sign and bit 6 is a flag indicating whether more data
# bytes follow. This encodes small integers in a a single byte,
# lets us deal gracefully with word length differences between
# machines, and also cleanly supports indefinite-precision integers.
#
# For a constructor value, the typetag uniquely identifies the
# sumtype, a following one-byte discriminator identifies the
# particular constructor of that sumtype, and the associated
# constructor arguments, if any, follow that.
#
# Our biggest design challenge is to deal properly with sharing,
# which is to say, with multiple pointers to a single value.
# These can be due to pointer cycles in the datastructure or
# to shared nodes in a tree package.
#
# For pure values, handling sharing properly is 'merely' a space
# optimization which in the worst case prevents exponential explosions
# in space consumption during unpickling.
#
# For impure values, handling sharing properly is essential
# to preserving correct semantics: If the unpickler replicates
# stateful chunks, so that changes made to one copy are no longer
# visible in other copies, we will have wild breakage all through
# the code of the unpickled program.
#
# Within the pickle bytestring, sharing is implemented by "back references".
# Logically, a back reference is a pointer to already-pickled value
# appearing somewhere earlier in the pickle bytestring.
# Physically, a back reference consists of an all-ones 0xFF byte
# (255 decimal) followed by an integer encoding giving the byte
# offset of the already-pickled value within the pickle bytestring.
# The special 0xFF value is reserved for flagging backreferences.
#
#
#
# DATA STRUCTURES
# ===============
#
# Our core pickling datastructre is our 'Funtree_To_Stringtree_State'
# record, which contains five components:
#
# pickleloc map
# forwarding map
# ad hoc sharing map
# pickle bytesize
# sharing map
#
# The pickleloc map maintains a mapping between already pickled
# values and their byte address (offset) within the pickle bytestring.
# Whenever we are about to append a new value to the accumulating
# pickle bytestring, we first check the pickleloc map to see if
# the value already exists somewhere within the bytestring, and if
# so we simply write a backreference to the pre-existing representation.
#
# The forwarding map ensures that we never encode a backreference to
# a backreference, or something like that, I think... ?
#
# The ad hoc sharing map is actually a parameter to this module
# supplied by the client, which allows additional sharing to be
# implemented above and beyond what the basic sharing algorithm
# would implement.
#
# The "pickle bytesize" tracks the current size-in-bytes of the
# (eventual) pickle. Its value is used to supply the offsets
# entered into (in particular) the pickleloc map. As we will
# see below, at the time such entries are made, we don't have
# an actual simple pickle string, but rather an abstract tree
# representation whose total length is not easily computed by
# direct means.
#
# Finally, the sharing map tracks all pickled values which are
# referenced by backpointers -- which is to say, all shared
# values. We flag these values specially for the unpickler.
# As a result, the unpickler need only keep a table of all
# actually shared values rather than all potentially shared
# values, which saves it a lot of space and a significant
# amount of computation time.
# On the reasonable assumption that a pickle is read more
# times than it is written, this results in significant overall
# time savings.
#
#
#
#
# ALGORITHM
# =========
#
# Our pickling algorithm proceeds by three phases.
#
# PHASE 1: Funtree construction.
# ----------------------------------
#
# In the first phase, we recursively construct a tree
# of opaque closures.
# Each closure contains the information for one
# datastructure value or record.
# Each closure is a function accepting a single argument
# consisting of the above-described state tuple.
# This representation has the advantage of extreme
# generality since our clients can always add new kinds of closures
# to the tree to explicitly encode knowledge about new kinds of
# datastructures (arrays, say), without affecting any of the
# code in this package.
# This representation has the corresponding disadvantage
# of being completely opaque: There is no way to traverse,
# inspect, or update the resulting tree package. All you
# can do is evaluate it by calling the root closure with a
# Funtree_To_Stringtree_State tuple.
#
# PHASE 2: Stringtree construction.
# ----------------------------------
#
# Evaluating the phase-one Funtree in this fashion (by
# supplying a Funtree_To_Stringtree_State tuple) results in the
# phase-two representation of the pickle, a Stringtree binary tree
# consisting entirely of two kinds of nodes:
# LEAF nodes containing a bytestring.
# NODE nodes containing two subtrees.
# This representation has advantages and disadvantages inverse
# to those the closure tree: It is trivial to traverse, inspect
# and modify, but contains no explicit knowledge about different
# datastructure types.
#
# PHASE 3: Flattening the Stringtree to a list of strings.
# -----------------------------------------
#
# Our final phase is a linear-time pass over the string tree
# reducing it to a simple list of strings, which are then
# collapsed down into a single final picklestring using the
# standard library 'cat' function.
# During this phase we also set the 'sharing' bits which
# tell the unpickler which values are actually shared, and
# thus must be entered into its backreference-resolution map.
#
#
#
#
# HISTORICAL NOTES
# ================
#
# This is the new "generic" pickle utility which replaces Andrew Appel's
# original "sharewrite" module. Aside from formal differences, this
# new module ended up not being any different from Andrew's. However,
# it ties in with its "unpickle" counterpart which is a *lot* better than
# its predecessor.
#
# Generated pickles tend to be a little bit smaller, which can
# probably be explained by the slightly more compact (in the common case,
# i.e. for small absolute values) integer representation.
#
# July 1999, Matthias Blume
#
# Addendum: This module now also marks as "actually being shared" those
# nodes where actual sharing has been detected. Marking is done by
# setting the high bit in the char code of the node. This means that
# char codes must be in the range [0, 126] to avoid conflicts. (127
# cannot be used because setting the high bit there results in 255 --
# which is the backref code.)
# This improves unpickling time by about 25% and also reduces memory
# usage because much fewer sharing map entries have to be made during
# unpickling.
#
# October 2000, Matthias Blume
# Compiled by:
#
src/lib/compiler/src/library/pickle.lib#
# By the way, there is no point in trying to internally use
# vector_of_one_byte_unts::Vector instead of string for now.
# These strings participate in order comparisons (which makes
# vector_of_one_byte_unts::Vector unsuitable). Moreover, conversion between
# string and vector_of_one_byte_unts::Vector is currently just a cast, so it
# does not cost anything in the end.
api Pickler {
Id;
Datatype_Tag = Int; # negative numbers are reserved!
#
# Type info. Use a different number for each type constructor.
Funtree( A_adhoc_map );
To_Funtree (A_adhoc_map, V) = V -> Funtree( A_adhoc_map );
make_funtree_node: Datatype_Tag -> String -> List( Funtree(A_adhoc_map) ) -> Funtree(A_adhoc_map);
#
# make_funtree_node produces the Funtree for one case (constructor) of a sumtype.
# The string must be one character long and the argument
# should be the list of Funtree encodings for the constructor's arguments.
# Use the same datatype_tag for all constructors of the same sumtype
# and different datatype_tags for constructors of different types.
#
# The latter is really only important if there are constructors
# of different type who have identical argument types and use the
# same make_funtree_node identificaton string. In this case the pickler might
# equate two values of different types, and as a result the
# unpickler will be very unhappy.
#
# On the other hand, if you use different datatype_tags for the same type,
# then nothing terrible will happen. You might lose some sharing,
# though.
#
# The string argument could theoretically be more than one character
# long. In this case the corresponding unpickling function must
# be sure to get all those characters out of the input stream.
# We actually do exploit this "feature" internally.
adhoc_share: { find: (A_adhoc_map, V) -> Null_Or( Id ),
insert: (A_adhoc_map, V, Id) -> A_adhoc_map
}
-> To_Funtree (A_adhoc_map, V)
-> To_Funtree (A_adhoc_map, V);
#
# "adhoc_share" is used to specify potential for "ad-hoc" sharing
# using the user-supplied map.
# Ad-hoc sharing is used to identify parts of the value that the
# hash-conser cannot automatically identify but which should be
# identified nevertheless, or to identify those parts that would be
# too expensive to be left to the hash-conser.
# Generating funtree nodes for values of some basic types:
#
wrap_bool: To_Funtree (A_adhoc_map, Bool);
wrap_int: To_Funtree (A_adhoc_map, Int);
wrap_unt: To_Funtree (A_adhoc_map, Unt);
wrap_int1: To_Funtree (A_adhoc_map, one_word_int::Int);
wrap_unt1: To_Funtree (A_adhoc_map, one_word_unt::Unt);
wrap_string: To_Funtree (A_adhoc_map, String);
# Generating pickles for some parameterized types
# (given a pickler for the parameter):
#
wrap_list: To_Funtree (A_adhoc_map, X) -> To_Funtree (A_adhoc_map, List(X) );
wrap_null_or: To_Funtree (A_adhoc_map, X) -> To_Funtree (A_adhoc_map, Null_Or(X) );
wrap_pair: (To_Funtree (A_adhoc_map, X), To_Funtree (A_adhoc_map, Y)) -> To_Funtree (A_adhoc_map, (X, Y));
wrap_thunk: To_Funtree (A_adhoc_map, X) -> To_Funtree (A_adhoc_map, Void -> X);
#
# Pickling a thunk. The thunk will be forced
# by the pickler. Unpickling is lazy again; but, of course, that
# laziness is unrelated to the laziness of the original value.
funtree_to_pickle: A_adhoc_map -> Funtree(A_adhoc_map) -> String;
#
# Convert the Funtree into an actual String pickle.
# The xxx_lifter stuff is here to allow picklers to be "patched
# together". If you already have a pickler that uses a sharing map
# of type B and you want to use it as part of a bigger pickler that
# uses a sharing map of type A, you must write a (B, A) map_lifter
# which then lets you lift the existing pickler to one that uses
# type A maps instead of its own type B maps.
#
# The idea is that B maps are really part of A maps. They can be
# extracted for the duration of using the existing pickler. Then,
# when that pickler is done, we can patch the resulting new B map
# back into the original A map to obtain a new A map.
Map_Lifter (B_adhoc_map, A_adhoc_map)
=
{ extract: A_adhoc_map -> B_adhoc_map,
patchback: (A_adhoc_map, B_adhoc_map) -> A_adhoc_map
};
lift_funtree_maker
:
Map_Lifter (B_adhoc_map, A_adhoc_map)
-> To_Funtree (B_adhoc_map, V)
-> To_Funtree (A_adhoc_map, V);
};
stipulate
package is = int_red_black_set; # int_red_black_set is from
src/lib/src/int-red-black-set.pkg package tag = pickler_sumtype_tags; # pickler_sumtype_tags is from
src/lib/compiler/src/library/pickler-sumtype-tags.pkgherein
package pickler
: Pickler # Pickler is from
src/lib/compiler/src/library/pickler.pkg {
Pickle_Bytesize = Int;
Id = Int;
Codes = List( Id );
Datatype_Tag = Int;
Shared_Value_Offsets = is::Set;
shared_value_offsets_empty = is::empty;
shared_value_offsets_add = is::add;
shared_value_offsets_list = is::vals_list;
package plm # "plm" == "pickleloc map"
=
red_black_map_g
(
Key = (String, Datatype_Tag, Codes);
#
# Our pickleloc map keys consist of a triple containing the
# three data needed to uniquely identify one node/value in
# the datastructure being pickled, namely:
# o A string holding the pickled value/contents of the node proper.
# o A typetag distinguishing, for example, the string "t" from
# the boolean value "t".
# o A list of offsets within the pickle of the pickled children
# of the node.
# In other words, for purposes of our base pickling algorithm,
# two nodes are identical if they have the same type, the same
# immediate values, and the same child nodes.
# Define an ordering over the above Key type.
# The only purpose of this is to allow us to store
# and retrieve keys from a binary tree, so the
# particular ordering relation implemented is noncritical:
#
fun compare ((contents, typetag, kidlist), (contents', typetag', kidlist'))
=
{ fun codes_cmp ( [], []) => EQUAL;
codes_cmp (_ ! _, []) => GREATER;
codes_cmp ([], _ ! _) => LESS;
codes_cmp (h ! t, h' ! t')
=>
if (h < h') LESS;
elif (h > h') GREATER;
else codes_cmp (t, t');
fi;
end;
if (typetag < typetag') LESS;
elif (typetag > typetag') GREATER;
else
case (string::compare (contents, contents'))
#
EQUAL => codes_cmp (kidlist, kidlist');
unequal => unequal;
esac;
fi;
};
);
package fwm # "fwm" == "forwarding_map"
=
int_red_black_map; # int_red_black_map is from
src/lib/src/int-red-black-map.pkg Stringtree = LEAF String
| NODE (Stringtree, Stringtree);
#
# The Stringtree binary-tree type provides a convenient
# intermediate pickle representation.
#
# When we're done inserting, deleting and
# appending, we can flatten a Stringtree
# to an actual pickle string in linear time.
fun total_string_bytes (LEAF s) => size s;
total_string_bytes (NODE (p, p')) => total_string_bytes p + total_string_bytes p';
end;
backref_escape_string = LEAF "\xff";
#
# Within the pickle string, the appearance of a 0xFF (255)
# value signals that the following value is a backreference
# rather than a literal. The value 255 is hardwired into
# the decode/encode logic in various ways, so don't try
# changing it unless you know exactly what you're doing.
backref_bytesize = 1; # Size in bytes of backref_escape_string.
nullbytes = LEAF "";
Pickleloc_Map = plm::Map( Id );
Forwarding_Map = fwm::Map( Id );
Funtree_To_Stringtree_State(A_adhoc_map)
=
{ pickleloc_map: Pickleloc_Map,
forwarding_map: Forwarding_Map,
adhoc_map: A_adhoc_map,
pickle_bytesize: Pickle_Bytesize,
shared_value_offsets: Shared_Value_Offsets
};
Funtree(A_adhoc_map)
=
Funtree_To_Stringtree_State(A_adhoc_map)
->
(Codes, Stringtree, Funtree_To_Stringtree_State(A_adhoc_map));
#
# As discussed, a Funtree is an opaque tree of closures
# which when invoked with a Funtree_To_Stringtree_State produces a
# Stringtree (plus other debris).
To_Funtree(A_adhoc_map, V)
=
V -> Funtree( A_adhoc_map );
infix my 40 @@@ ;
infixr my 50 & ;
# When partially applied, '&' combines two Funtree
# nodes/subtrees into a single new Funtree.
#
# As with any Funtree node, when our partially applied
# result is then applied to a 'Funtree_To_Stringtree_State' tuple,
# it converts itself into Stringtree form.
#
fun ( (f: Funtree(A_adhoc_map))
& (g: Funtree(A_adhoc_map))
)
(state: Funtree_To_Stringtree_State(A_adhoc_map))
=
{ (f state ) -> (shared_value_offsets_list_f, stringtree_f, state' );
(g state') -> (shared_value_offsets_list_g, stringtree_g, state'');
#
( shared_value_offsets_list_f @ shared_value_offsets_list_g,
NODE (stringtree_f, stringtree_g),
state''
);
};
# Combine a List(Funtree) into a single new Funtree.
#
fun funtrees_to_funtree (first, second ! rest) => first & funtrees_to_funtree (second, rest);
funtrees_to_funtree (result, []) => result;
end;
fun large_unt_to_bytestring'
( n: large_unt::Unt,
negative: Bool
)
=
{ # Encode unt 'n' as a variable-length sequence of bytes.
# Each nonfinal byte has the high bit set and holds seven
# bits of number. The final byte has the high bit clear,
# the sign bit comes next, and the lower six hold the
# last six bits of the unt:
#
# --------------------Z--------------------
#
|1xxxxxxx|1xxxxxxx|...|1xxxxxxx|0Sxxxxxx|
# --------------------Z--------------------
#
# This is essentially the same mechanism used in
#
src/lib/compiler/execution/compiledfile/compiledfile.pkg # -- maybe we should share it: # XXX BUGGO FIXME
#
// = large_unt::(/);
%% = large_unt::(%);
!! = large_unt::bitwise_or;
#
infix my // %% !! ;
last_digit = n %% 0u64; # Least significant six bits.
last_byte = if negative last_digit !! 0u64; # Set our sign bit (bit 6).
else last_digit;
fi;
byte::bytes_to_string (unt_to_bytestring' (n // 0u64, [ to_unt8 last_byte ])) # Process remaining bits.
where
to_unt8 = one_byte_unt::from_large_unt;
# Eat 7 bits/loop from least-significant
# end of unt. We set high bit on each
# byte to 1 to signify that this is a
# nonfinal byte:
#
fun unt_to_bytestring' (0u0, result_bytes) => vector_of_one_byte_unts::from_list result_bytes;
unt_to_bytestring' ( n, result_bytes) => unt_to_bytestring' (n // 0u128, to_unt8 ((n %% 0u128) !! 0u128) ! result_bytes);
end;
end;
};
fun large_unt_to_bytestring n
=
large_unt_to_bytestring' (n, FALSE);
fun multiword_int_to_bytestring i
=
if (i >= 0) large_unt_to_bytestring' ( large_unt::from_multiword_int i, FALSE);
else large_unt_to_bytestring' (0u0 - large_unt::from_multiword_int i, TRUE ); # Negate in the unsigned domain.
fi;
unt1_to_bytestring = large_unt_to_bytestring o one_word_unt::to_large_unt;
unt_to_bytestring = large_unt_to_bytestring o unt::to_large_unt;
int1_to_bytestring = multiword_int_to_bytestring o one_word_int::to_multiword_int;
int_to_bytestring = multiword_int_to_bytestring o int::to_multiword_int;
# '%%%' is a helper function which constructs Funtree nodes
# for childless input datastructure nodes.
#
# Curried application of %%% to its first two
# arguments produces the Funtree node.
#
# As with all Funtreee nodes, application to a
# Funtree_To_Stringtree_State value then yields
# a Stringtree node.
#
# The closure we generate will enter the given (char,typetag)
# pair into the backreference map unless it is already
# there, in which case it will instead enter into the forwarding
# table a pointer from its pickle-offset to the backref's
# pickle-offset.
#
# This fn accepts a three-element input argument containing:
#
# o typetag: -1 for integer ... -9 for booleans etc.
# Note that this doesn't get written to the pickle:
# The information is implicit in the type graph and
# the de/pickling routines. It mainly serves to keep
# our pickleloc table from confusing, say, the
# string value "f" with the boolean value "f".
#
# o 'c': discriminator within the type. For example for booleans, it will be "t" or "f".
#
# o Third argument is our Funtree_To_Stringtree_State value.
#
# The return value consists of a triple containing:
# o A pickle offset for the pickleloc map.
# o A 'LEAF c' node for the stringtree.
# o Our updated 'state' tuple.
#
fun make_funtree_leaf type_tag c { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize, shared_value_offsets }
=
{ key = (c, type_tag, []);
case (plm::get (pickleloc_map, key)) # Have we already seen the value with this key?
#
THE pickleloc # Yes, at this offset in existing pickle.
=>
( [ pickleloc ],
LEAF c,
{ pickleloc_map,
forwarding_map => fwm::set (forwarding_map, pickle_bytesize, pickleloc), # Map backref loc to its target loc.
adhoc_map,
pickle_bytesize => pickle_bytesize + size c,
shared_value_offsets
}
);
NULL # No, we haven't seen this key before in pickle.
=>
( [ pickle_bytesize ],
LEAF c,
{ pickleloc_map => plm::set (pickleloc_map, key, pickle_bytesize), # Map key to its location in pickle.
forwarding_map,
adhoc_map,
pickle_bytesize => pickle_bytesize + size c,
shared_value_offsets
}
);
esac;
};
# When partially applied, 'make_funtree_node' creates Funtree
# nodes. When these nodes are in turn applied to
# our usual 'Funtree_To_Stringtree_State' tuple argument,
# they evaluate to Stringtree nodes.
#
# Arguments are:
#
# o datatype_tag: An integer identifying the type of node.
#
# o (c, [childClosuretreeNodes])
# This pair contains the actual useful information
# content of the node. The 'c' string encodes the
# information content of the node proper. The
# the [childClosuretreeNodes] list has one entry
# for each child node.
#
# o 'Funtree_To_Stringtree_State' tuple. This gets applied only later,
# during conversion from Funtree to Stringtree form.
#
fun make_funtree_node datatype_tag c [] state
=>
make_funtree_leaf datatype_tag c state;
make_funtree_node datatype_tag c (firstkid ! morekids) { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize, shared_value_offsets }
=>
{ funtree = funtrees_to_funtree (firstkid, morekids);
my ( kidoffsets,
stringtree,
{ pickleloc_map => pickleloc_map', forwarding_map => forwarding_map', adhoc_map => adhoc_map', pickle_bytesize => pickle_bytesize', shared_value_offsets => shared_value_offsets' }
)
=
funtree { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize => pickle_bytesize + size c, shared_value_offsets };
key = (c, datatype_tag, kidoffsets);
case (plm::get (pickleloc_map, key))
#
THE offset => {
back_ref_num = int_to_bytestring offset;
( [offset],
NODE (backref_escape_string, LEAF back_ref_num),
{ pickleloc_map,
forwarding_map => fwm::set (forwarding_map, pickle_bytesize, offset),
adhoc_map,
pickle_bytesize => pickle_bytesize + backref_bytesize + size back_ref_num,
shared_value_offsets => shared_value_offsets_add (shared_value_offsets', offset)
}
);
};
#
NULL => ( [pickle_bytesize],
NODE (LEAF c, stringtree),
{ pickleloc_map => plm::set (pickleloc_map', key, pickle_bytesize),
forwarding_map => forwarding_map',
adhoc_map => adhoc_map',
pickle_bytesize => pickle_bytesize',
shared_value_offsets => shared_value_offsets'
}
);
esac;
};
end;
fun adhoc_share { find, insert } w v { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize, shared_value_offsets }
=
case (find (adhoc_map, v))
#
NULL => w v { pickleloc_map, forwarding_map, adhoc_map => insert (adhoc_map, v, pickle_bytesize), pickle_bytesize, shared_value_offsets };
#
THE i0 => { backref_offset = the_else (fwm::get (forwarding_map, i0), i0);
back_ref_num = int_to_bytestring backref_offset;
( [backref_offset],
#
NODE (backref_escape_string, LEAF back_ref_num),
#
{ pickleloc_map,
forwarding_map,
adhoc_map,
pickle_bytesize => pickle_bytesize + backref_bytesize + size back_ref_num,
shared_value_offsets => shared_value_offsets_add (shared_value_offsets, backref_offset)
}
);
};
esac;
fun wrap_thunk w thunk { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize, shared_value_offsets }
=
{ v = thunk ();
# The larger the value of trialStart, the smaller the chance that
# the loop (see below) will run more than once. However, some
# space may be wasted. 3 should avoid this most of the time.
# (Experience shows: 2 doesn't.)
#
trial_start = 3;
# This loop is ugly, but we don't expect it to run very often.
# It is needed because we must first pickle the length of the
# encoding of the thunk's value, but that encoding depends
# on the length (or rather: on the length of the length).
#
fun loop (nxt, ilen)
=
{ my (kidoffsets, stringtree, state)
=
w v { pickleloc_map, forwarding_map, adhoc_map, pickle_bytesize => nxt, shared_value_offsets };
size' = total_string_bytes stringtree;
ie = int_to_bytestring size';
iesz = size ie;
# Padding in front is better because the unpickler can
# simply discard all leading 0s and does not need to know
# about the pickler's setting of "trialStart".
#
null = LEAF "\000";
fun pad (stringtree, n)
=
if (n == 0) stringtree;
else pad (NODE (null, stringtree), n - 1);
fi;
if (ilen < iesz) loop (nxt + 1, ilen + 1);
else (kidoffsets, NODE (pad (LEAF ie, ilen - iesz), stringtree), state);
fi;
};
loop (pickle_bytesize + trial_start, trial_start);
};
# Note that even though the encoding could start with the
# backref_escape_code character (0xFF), we know that it isn't
# actually a backref because make_funtree_leaf suppresses back-references.
# Of course, this must be taken care of by
src/lib/compiler/src/library/unpickler.pkg #
fun wrap_int i = make_funtree_leaf tag::int ( int_to_bytestring i );
fun wrap_unt u = make_funtree_leaf tag::unt ( unt_to_bytestring u );
fun wrap_int1 i32 = make_funtree_leaf tag::one_word_int (int1_to_bytestring i32);
fun wrap_unt1 u32 = make_funtree_leaf tag::one_word_unt (unt1_to_bytestring u32);
fun wrap_pair (wrap_a, wrap_b) (a, b)
=
make_funtree_node tag::pair "p" [ wrap_a a,
wrap_b b
];
fun wrap_null_or wrap_a_value
=
wrap_null_or' wrap_a_value
where
fun wrap_null_or' wrap_a_value (THE value) => make_funtree_node tag::null_or "s" [wrap_a_value value]; # "s" for "some"
wrap_null_or' wrap_a_value NULL => make_funtree_leaf tag::null_or "n"; # "n" for "none" -- the old SML nomenclature for THE/NULL.
end;
end;
fun wrap_list wrap_one_list_element list_to_pickle
=
# We buy space and time efficiency
# (at the cost of code complexity)
# by processing the list contents
# five at a time:
#
case (chop_into_quints list_to_pickle)
#
([], [] ) => make_funtree_leaf tag::list "0";
([a], [] ) => make_funtree_node tag::list "1" [p a];
([a, b], [] ) => make_funtree_node tag::list "2" [p a, p b];
([a, b, c], [] ) => make_funtree_node tag::list "3" [p a, p b, p c];
([a, b, c, d], [] ) => make_funtree_node tag::list "4" [p a, p b, p c, p d];
#
([], quints) => make_funtree_node tag::list "5" [ wrap_quints quints];
([a], quints) => make_funtree_node tag::list "6" [p a, wrap_quints quints];
([a, b], quints) => make_funtree_node tag::list "7" [p a, p b, wrap_quints quints];
([a, b, c], quints) => make_funtree_node tag::list "8" [p a, p b, p c, wrap_quints quints];
([a, b, c, d], quints) => make_funtree_node tag::list "9" [p a, p b, p c, p d, wrap_quints quints];
#
_ => raise exception DIE "pickler::wrap_list: impossible chop";
esac
where
p = wrap_one_list_element; # Local abbreviation;
# Pickle list elements five-at-a-time:
#
fun wrap_quints [] => make_funtree_leaf tag::list "N";
wrap_quints ((a, b, c, d, e) ! quints) => make_funtree_node tag::list "C" [p a, p b, p c, p d, p e, wrap_quints quints];
end;
fun chop_into_quints list_to_chop
=
# Chop a list into a list of 5-tuples -- "quints".
# return (leftovers, quints). Example:
#
# chop_int_quints [a,b,c,d,e,f,g,h,i,j,k,l]
# ->
# ( [a,b], # Leftovers.
# [(c,d,e,f,g), (h,i,j,k,l)] # Quints.
# )
#
# The leftovers come from the
# start of list_to_chop, the remaining
# elements are in original order, regrouped.
#
chop5 (reverse list_to_chop, [])
where
fun chop5 (e ! d ! c ! b ! a ! rest, cl)
=>
chop5 (rest, (a, b, c, d, e) ! cl);
chop5 (rest, cl)
=>
(reverse rest, cl);
end;
end;
end;
fun wrap_string string
=
make_funtree_node tag::string string' [dummy_pickle]
where
# The dummy_pickle is a hack to get strings to be shared
# automatically. They don't have "natural" children, so normally
# make_funtree_leaf would suppress the backref. The dummy pickle produces no
# codes and no output, but it is there to make make_funtree_node believe that
# there are children.
#
fun dummy_pickle state
=
([], nullbytes, state);
fun esc '\\' => "\\\\"; # Why are we doing this? Just giving the length first is usually faster and easier. XXX BUGGO FIXME.
esc '"' => "\\\"";
esc '\xff' => "\\\xff"; # Must escape backref char.
esc c => string::from_char c;
end;
string' = cat ["\"", string::translate esc string, "\""];
end;
fun wrap_bool TRUE => make_funtree_leaf tag::bool "t";
wrap_bool FALSE => make_funtree_leaf tag::bool "f";
end;
stipulate
fun stringtree_to_string (
stringtree,
pickle_length_in_bytes,
shared_value_offsets
)
=
{ # 'add' is a utility routine for 'flatten' (see below)
# which prepends a string to our accumulating result
# list of strings.
#
# This would be completely trivial except that we must
# also set the high bit in the first byte of the string
# if it corresponds to a shared value, as a signal to
# the unpickler to save this value in its backreference
# table.
#
# To make this possible, we are given a sorted list of
# byte offsets within the pickle corresponding to shared
# values.
#
# We also maintain a 'byte_offset_within_pickle' state variable
# giving our current offset within the pickle, which decreases
# monotonically because we are building up the pickle
# stringlist back-to-front.
#
# So if our 'byte_offset_within_pickle' state variable is equal
# to the top entry on our shared_value_offsets list, we are at
# a shared value and must set its high bit.
#
fun add ("", byte_offset_within_pickle, shared_value_offset, more_shared_value_offsets, stringlist)
=>
(byte_offset_within_pickle, shared_value_offset ! more_shared_value_offsets, stringlist);
add (string, byte_offset_within_pickle, shared_value_offset, more_shared_value_offsets, stringlist)
=>
{
string_length = size string;
new_byte_offset_within_pickle
= byte_offset_within_pickle - string_length;
# If this string is shared (that is, if there
# is a backreference to it somewhere) then we
# flag this fact for the benefit of the unpickler
# by setting the high bit in the first byte of
# the string.
#
# Otherwise, we can just add it to our result
# stringlist as is:
if (new_byte_offset_within_pickle != shared_value_offset) # Is this a shared string?
#
( new_byte_offset_within_pickle, # Not a shared string.
shared_value_offset ! more_shared_value_offsets,
string ! stringlist
);
else
new_first_byte # A shared string -- set high bit in first byte.
=
string::from_char
(char::from_int
(string::get_byte (string, 0) + 128));
fun ret stringlist
=
(new_byte_offset_within_pickle, more_shared_value_offsets, stringlist);
# If it is a one-byte string we can just prepend our
# just-computed high byte to result stringlist,
# otherwise we need to prepend both first-byte
# and rest-of-string:
#
if (string_length > 1) ret (new_first_byte ! string::extract (string, 1, NULL) ! stringlist);
else ret (new_first_byte ! stringlist);
fi;
fi;
};
end;
# fast_flatten is a faster, simpler version of 'flatten'
# (see below) which we switch to once we are out of
# shared codes.
#
fun fast_flatten (LEAF string, results: List(String)) # A leaf is easy.
=>
string ! results;
fast_flatten (NODE (x, LEAF string), result) # A node with a right-child leaf is almost as easy.
=>
fast_flatten (x, string ! result);
fast_flatten (NODE (a, NODE (b, c)), result) # Rotate until we reduce to above case.
=>
fast_flatten (NODE (NODE (a, b), c), result);
end;
# 'flatten' converts a stringtree into a list of
# strings in one linear-time pass, which stringlist
# is then 'cat'-ed to produce the final pickle
# string.
#
# We build up the stringlist back-to-front since it
# is easier to prepend values to a list than to append
# them.
#
# During this pass, we also set the high bits in
# those bytecodes that correspond to shared nodes.
# The positions of these codes are given by our
# sharedCodes argument, which is a high-to-low
# sorted list of integers.
# First argument is the stringtree to flatten.
# Second argument is a triple consisting of:
# byte_offset_within_pickle: Monotonically decreasing intra-pickle address.
# shared_value_offsets: List of offsets within the pickle which
# correspond to shared values (== values
# with backreferences). We pass this info
# on to the unpickler via high-bit flags;
# This allows the unpickler to avoid entering
# into its backreference table values which
# do not have any backreferences.
# stringlist: The accumulating result list of strings
# which together constitute the result pickle.
#
fun flatten (stringtree, (_, [], results: List(String)))
=>
fast_flatten (stringtree, results);
flatten (LEAF string, (byte_offset_within_pickle, shared_value_offset ! more_shared_value_offsets, results))
=>
#3 (add (string, byte_offset_within_pickle, shared_value_offset, more_shared_value_offsets, results));
flatten (NODE (stringtree, LEAF string), (byte_offset_within_pickle, shared_value_offset ! more_shared_value_offsets, results))
=>
flatten (stringtree, add (string, byte_offset_within_pickle, shared_value_offset, more_shared_value_offsets, results));
flatten (NODE (stringtree_a, NODE (stringtree_b, stringtree_c)), arg_triple)
=>
flatten (NODE (NODE (stringtree_a, stringtree_b), stringtree_c), arg_triple);
end;
# Flatten the stringtree into a list of strings,
# and then concatenate that list to produce the
# final pickle string:
cat (flatten (stringtree, (pickle_length_in_bytes, reverse (shared_value_offsets_list shared_value_offsets), [])));
};
herein
# Convert a Funtree to a Stringtree and thence
# to a single String -- the result pickle:
#
fun funtree_to_pickle adhoc_map funtree
=
{ (funtree
{ pickleloc_map => plm::empty,
forwarding_map => fwm::empty,
adhoc_map,
pickle_bytesize => 0,
shared_value_offsets => shared_value_offsets_empty
})
->
(_, stringtree, { pickle_bytesize, shared_value_offsets, pickleloc_map => _, forwarding_map => _, adhoc_map => _ });
stringtree_to_string (stringtree, pickle_bytesize, shared_value_offsets);
};
end;
Map_Lifter (B_adhoc_map, A_adhoc_map)
=
{ extract: A_adhoc_map -> B_adhoc_map,
patchback: (A_adhoc_map, B_adhoc_map) -> A_adhoc_map
};
fun lift_funtree_maker { extract, patchback } wb b { pickleloc_map, forwarding_map, adhoc_map => a_adhoc_map, pickle_bytesize, shared_value_offsets }
=
{ b_adhoc_map = extract a_adhoc_map;
(wb b { adhoc_map => b_adhoc_map, pickleloc_map, forwarding_map, pickle_bytesize, shared_value_offsets })
->
(kidoffsets, stringtree, { adhoc_map => b_adhoc_map', pickleloc_map, forwarding_map, pickle_bytesize, shared_value_offsets });
a_adhoc_map' = patchback (a_adhoc_map, b_adhoc_map');
(kidoffsets, stringtree, { adhoc_map => a_adhoc_map', pickleloc_map, forwarding_map, pickle_bytesize, shared_value_offsets });
};
# For export:
#
# make_funtree_node = make_funtree_node;
};
end;