15.4.993  src/lib/src/univariate-sample.pkg

`## univariate-sample.pkg`
`#`
`# Some statistical functions on unweighted univariate samples.`

`# Compiled by:`
`#     `src/lib/std/standard.lib

`###             "It's kind of fun to do the impossible."`
`###`
`###                             -- Walt Disney`

`stipulate`
`    package mth =  math;                                                        # math                  is from   `src/lib/std/src/bind-math-32.pkg
`    package rs  =  random_sample;                                               # random_sample         is from   `src/lib/src/random-sample.pkg
`    package rwv =  rw_vector ;                                                  # rw_vector             is from   `src/lib/std/src/rw-vector.pkg
`herein`

`    package univariate_sample : api {`
`        #`
`        # We distinguish between two kinds of samples.  Only the "heavy"`
`        # kind permits calculation of average deviation and median.`
`        # It is also considerably more expensive because it keeps an`
`        # rw_vector of all points while the "light" variety is constant-size.`

`        Light;`
`        Heavy;`

`        Sample(X);              # Light or heavy. `
`        Evaluation(X);          # Light or heavy.`

`        # Samples are built incrementally by adding points`
`        # to an initially empty sample:`

`        lempty:          Sample( Light );`
`        hempty:  Void -> Sample( Heavy );`

`        ladd:    (Float, Sample( Light )) -> Sample( Light );   #  Constant `
`        hadd:    (Float, Sample( Heavy )) -> Sample( Heavy );   #  Amortized constant `

`        # Evaluate the sample.`
`        # This completes all the expensive work except`
`        # for things that depend on "heavy" samples:`
`        #`
`        evaluate:  Sample(X) -> Evaluation(X);                  # Constant `

`        # Extracting "cheap" information (constant-time): `
`        #`
`        nn:                 Evaluation(X) -> Int;`
`        n:                  Evaluation(X) -> Float;             #  nn as float`
`        mean:               Evaluation(X) -> Float;`
`        variance:           Evaluation(X) -> Float;`
`        standard_deviation: Evaluation(X) -> Float;`
`        skew:               Evaluation(X) -> Float;`
`        kurtosis:           Evaluation(X) -> Float;`

`        # Extracting "expensive" information: `
`        #`
`        median:             Evaluation( Heavy ) -> Float;       #  randomized linear `
`        average_deviation:  Evaluation( Heavy ) -> Float;       #  linear `

`    }`
`    {`
`        infix my  90 @@@ ;   my (@@@)        =  unsafe::rw_vector::get;`
`        infix my  40 <-  ;   fun (a, i) <- x =  unsafe::rw_vector::set (a, i, x);`

`        # Two kinds of "extra info": `
`        #`
`        Light = Void;                           # Nothing `
`        Heavy = (Rw_Vector( Float ), Int);      # Rubber rw_vector of points, size `

`        Sample(X) = (X, Int, Float, Float, Float, Float);       # A sample:  (extra info, nn, sum x^4, sum x^3, sum x^2, sum x).`

`        # An evaluation: (extra info, nn, nn as real,`
`        #                             mean, variance, standard deviation,`
`        #                             skew, kurtosis) :`
`        Evaluation(X)`
`            =`
`            EVALUATION  {`
`              extra_info: X,            #  Extra info `
`              ni:       Int,            #  Number of points `
`              nr:       Float,          #  Number of points (as real) `
`              mean:     Float,`
`              sd2:      Float,          #  sd * sd = variance `
`              sd:       Float,          #  standard deviation `
`              skew:     Float,`
`              kurtosis: Float`
`            };`

`        min_size = 1024;                #  minimum allocated size of heavy rw_vector `

`        lempty =   ((), 0, 0.0, 0.0, 0.0, 0.0);`

`        fun hempty ()`
`            =`
`            ((rwv::make_rw_vector (min_size, 0.0), min_size), 0, 0.0, 0.0, 0.0, 0.0);`

`        fun ladd (x: Float, ((), n, sx4, sx3, sx2, sx1))`
`            =`
`            {   x2 =  x*x;`
`                #`
`                (x2*x, x2*x2) ->   (x3, x4);`

`                ((), n+1, sx4+x4, sx3+x3, sx2+x2, sx1+x);`
`            };`

`        fun hadd (x: Float, ((a, size), n, sx4, sx3, sx2, sx1))`
`            =`
`            {   x2 = x*x;`
`                #`
`                (x2*x, x2*x2) ->   (x3, x4);`

`                my (a, size)`
`                    =`
`                    if  (n < size)`
`                        (a, size);`
`                    else `
`                        size = size+size;`
`                        #`
`                        b = rwv::from_fn  ( size,`
`                                            #`
`                                            fn i =  if (i < n)   a @@@ i;`
`                                                    else         0.0;`
`                                                    fi`
`                                          );`

`                        (b, size);`
`                    fi;`

`                (a, n) <- x;`

`                ((a, size), n+1, sx4+x4, sx3+x3, sx2+x2, sx1+x);`
`            };`

`        fun evaluate (extra_info, ni, sx4, sx3, sx2, sx1)`
`            =`
`            {   n   = real ni;`
`                n'  = n - 1.0;`

`                m   = sx1 // n;`
`                m2  = m*m;`
`                m3  = m2*m;`

`                sd2 =  (sx2 + m*(n*m - 2.0*sx1)) // n';`
`                sd  =  mth::sqrt sd2;`

`                (sd*sd2, sd2*sd2) ->   (sd3, sd4);`

`                sk = (sx3-m*(3.0*(sx2-sx1*m)+n*m2)) // (n*sd3);`
`                k  = ((sx4+m*(6.0*sx2*m - 4.0*(sx3+sx1*m2)+n*m3)) // (n*sd4)) - 3.0;`

`                EVALUATION {`
`                  extra_info,`
`                  ni,`
`                  nr => n,`
`                  mean => m,`
`                  sd2,`
`                  sd,`
`                  skew => sk,`
`                  kurtosis => k`
`                };`
`            };`

`        fun un_e (EVALUATION r)`
`            =`
`            r;`

`        fun nn                 e =  .ni       (un_e e);`
`        fun n                  e =  .nr       (un_e e);`
`        fun mean               e =  .mean     (un_e e);`
`        fun variance           e =  .sd2      (un_e e);`
`        fun standard_deviation e =  .sd       (un_e e);`
`        fun skew               e =  .skew     (un_e e);`
`        fun kurtosis           e =  .kurtosis (un_e e);`

`        fun median (EVALUATION { extra_info => (a, _), ni, ... } )`
`            =`
`            rs::median' (rw_vector_slice::make_slice (a, 0, THE ni));`

`        fun average_deviation (EVALUATION { extra_info => (a, _), ni, nr, mean => m, ... } )`
`            =`
`            ad (0, 0.0)`
`            where`
`                fun ad (i, ds)`
`                    =`
`                    if ( i>= ni)  ds // nr;`
`                    else          ad (i+1, ds + abs (a@@@i-m));`
`                    fi;`

`            end;`
`    };`
`end;`