## univariate-sample.pkg
#
# Some statistical functions on unweighted univariate samples.
# Compiled by:
#
src/lib/std/standard.lib### "It's kind of fun to do the impossible."
###
### -- Walt Disney
stipulate
package mth = math; # math is from
src/lib/std/src/bind-math-32.pkg package rs = random_sample; # random_sample is from
src/lib/src/random-sample.pkg package rwv = rw_vector ; # rw_vector is from
src/lib/std/src/rw-vector.pkgherein
package univariate_sample : api {
#
# We distinguish between two kinds of samples. Only the "heavy"
# kind permits calculation of average deviation and median.
# It is also considerably more expensive because it keeps an
# rw_vector of all points while the "light" variety is constant-size.
Light;
Heavy;
Sample(X); # Light or heavy.
Evaluation(X); # Light or heavy.
# Samples are built incrementally by adding points
# to an initially empty sample:
lempty: Sample( Light );
hempty: Void -> Sample( Heavy );
ladd: (Float, Sample( Light )) -> Sample( Light ); # Constant
hadd: (Float, Sample( Heavy )) -> Sample( Heavy ); # Amortized constant
# Evaluate the sample.
# This completes all the expensive work except
# for things that depend on "heavy" samples:
#
evaluate: Sample(X) -> Evaluation(X); # Constant
# Extracting "cheap" information (constant-time):
#
nn: Evaluation(X) -> Int;
n: Evaluation(X) -> Float; # nn as float
mean: Evaluation(X) -> Float;
variance: Evaluation(X) -> Float;
standard_deviation: Evaluation(X) -> Float;
skew: Evaluation(X) -> Float;
kurtosis: Evaluation(X) -> Float;
# Extracting "expensive" information:
#
median: Evaluation( Heavy ) -> Float; # randomized linear
average_deviation: Evaluation( Heavy ) -> Float; # linear
}
{
infix my 90 @@@ ; my (@@@) = unsafe::rw_vector::get;
infix my 40 <- ; fun (a, i) <- x = unsafe::rw_vector::set (a, i, x);
# Two kinds of "extra info":
#
Light = Void; # Nothing
Heavy = (Rw_Vector( Float ), Int); # Rubber rw_vector of points, size
Sample(X) = (X, Int, Float, Float, Float, Float); # A sample: (extra info, nn, sum x^4, sum x^3, sum x^2, sum x).
# An evaluation: (extra info, nn, nn as float,
# mean, variance, standard deviation,
# skew, kurtosis) :
Evaluation(X)
=
EVALUATION {
extra_info: X, # Extra info
ni: Int, # Number of points
nr: Float, # Number of points (as float)
mean: Float,
sd2: Float, # sd * sd = variance
sd: Float, # standard deviation
skew: Float,
kurtosis: Float
};
min_size = 1024; # minimum allocated size of heavy rw_vector
lempty = ((), 0, 0.0, 0.0, 0.0, 0.0);
fun hempty ()
=
((rwv::make_rw_vector (min_size, 0.0), min_size), 0, 0.0, 0.0, 0.0, 0.0);
fun ladd (x: Float, ((), n, sx4, sx3, sx2, sx1))
=
{ x2 = x*x: Float;
x3 = x2*x: Float;
x4 = x2*x2: Float;
((), n+1, sx4+x4, sx3+x3, sx2+x2, sx1+x);
};
fun hadd (x: Float, ((a, size), n, sx4, sx3, sx2, sx1))
=
{ x2 = x*x: Float;
x3 = x2*x: Float;
x4 = x2*x2: Float;
my (a, size)
=
if (n < size)
(a, size);
else
size = size+size;
#
b = rwv::from_fn ( size,
#
\\ i = if (i < n) a @@@ i;
else 0.0;
fi
);
(b, size);
fi;
(a, n) <- x;
((a, size), n+1, sx4+x4, sx3+x3, sx2+x2, sx1+x);
};
fun evaluate (extra_info, ni, sx4, sx3, sx2, sx1)
=
{ n = float(ni);
n' = n - 1.0;
m = sx1 / n;
m2 = m*m;
m3 = m2*m;
sd2 = (sx2 + m*(n*m - 2.0*sx1)) / n';
sd = mth::sqrt sd2;
(sd*sd2, sd2*sd2) -> (sd3, sd4);
sk = (sx3-m*(3.0*(sx2-sx1*m)+n*m2)) / (n*sd3);
k = ((sx4+m*(6.0*sx2*m - 4.0*(sx3+sx1*m2)+n*m3)) / (n*sd4)) - 3.0;
EVALUATION {
extra_info,
ni,
nr => n,
mean => m,
sd2,
sd,
skew => sk,
kurtosis => k
};
};
fun un_e (EVALUATION r)
=
r;
fun nn e = .ni (un_e e);
fun n e = .nr (un_e e);
fun mean e = .mean (un_e e);
fun variance e = .sd2 (un_e e);
fun standard_deviation e = .sd (un_e e);
fun skew e = .skew (un_e e);
fun kurtosis e = .kurtosis (un_e e);
fun median (EVALUATION { extra_info => (a, _), ni, ... } )
=
rs::median' (rw_vector_slice::make_slice (a, 0, THE ni));
fun average_deviation (EVALUATION { extra_info => (a, _), ni, nr, mean => m, ... } )
=
ad (0, 0.0)
where
fun ad (i, ds)
=
if ( i>= ni) ds / nr;
else ad (i+1, ds + abs (a@@@i-m));
fi;
end;
};
end;