Posts for the month of October 2015

Radar-shaped statistic to check the uniformity of a design of experiments

Here is a Python translation of a R function allowing to check the uniformity of a design of experiments obtained by OpenTURNS (as a NumericalSample object). It is an implementation of the "Radar-shaped statistic" method detailed in the PhD thesis of J. Franco (2008).

The program requires a C shared library 'unif_radar_lib.so' that can be compiled from a source file 'C_rss.c', displayed below. For example the following commands can be used in Linux console with the gcc compiler:

gcc -c -Wall -Werror -fpic -std=c99 C_rss.c
gcc -shared -o unif_radar.so C_rss.o

Then the 'unif_radar.py' file can be executed directly or called from another user-defined function. Calling directly the enclosed version will run an example with a design based on a low-discrepancy sequence.

File 'unif_radar.py'

#!/usr/bin/env python
# coding = utf-8

"""
Check the uniformity of a design of experiments using the radar-shaped 
statistic.

The uniformity of a multidimensional design is checked in each plane defined 
by a couple (Xi, Xj) of input parameters. In every plane, the distribution of 
the orthogonal projections of the design points onto an axis with an angle 
'theta' with respect to the Xi and Xj axes is studied. The projection 
distribution is compared to the uniform distribution by means of a given 
distance, e.g. the Kolmogorov-Smirnov one. The greatest discrepancy is 
retained and a statistical test is performed to challenge the uniformity 
assumption. Moreover, the design is shown in the "worst" plane (Xi, Xj).

The method was designed by J. Franco (2008). This code is a Python translation 
of a R function in the R library DiceDesign (Franco et al., 2015). 

References
----------
Franco J. (2008). Planification d'experiences numeriques en phase exploratoire 
pour la simulation des phenomenes complexes, PhD Thesis, Ecole Nationale 
Superieure des Mines de Saint-Etienne.

Franco J., Dupuy D., Roustant O. (2015). DiceDesign: Designs of Computer 
Experiments. R package version 1.7, 
URL http://CRAN.R-project.org/package=DiceDesign.
"""

import numpy as np
import ctypes
import matplotlib.pyplot as plt
import openturns as ot


def scale_data(x):
    """Transform the data into uniformly distributed values over [-1,1]"""
    x = ot.NumericalSample(x)
    xs = []
    for i in range(x.getDimension()):
        xi = x[:,i]
        xs.append( [2.*xi.computeEmpiricalCDF(xij)-1. for xij in xi] )
    return ot.NumericalSample(np.array(xs).T)


def ks_stat(x):
    """Evaluate the Kolmogorov-Smirnov statistic"""
    n = float(len(x))
    ordered_sample = np.sort(x)
    Dplus = np.max( (1.+np.arange(n))/n - ordered_sample )
    Dminus = np.max( ordered_sample - np.arange(n)/n )
    stat = np.max([Dplus, Dminus])
    return stat
    
    
def ks_quantile(n, alpha = 0.05, eps=np.finfo(float).eps):
    """Evaluate the Kolmogorov-Smirnov alpha-quantile"""
    index, = np.where(np.abs([0.1, 0.05, 0.025, 0.01] - alpha*np.ones(4)) < eps)
    q = [1.224, 1.358, 1.48, 1.628][index]/(np.sqrt(n)+0.12+0.11/np.sqrt(n))
    
    return q


def check_design(x, n_theta=360, alpha=0.95, c_lib='./unif_radar_lib.so', \
                 eps=np.finfo(float).eps):
    """
    Check the uniformity of the data using the radar-shaped statistic.
    
    Plots showing the greatest distance to the uniformity in the "worst" plane 
    of inputs (Xi, Xj) are created. The results of the statistical test are 
    printed.
    
    Parameters
    ----------
    x: OpenTURNS NumericalSample
        Design of experiments.
    
    n_theta: integer, optional
        Number of angular positions for the projection axis (default is 360).
    
    alpha: float, optional
        Level of the statistical uniformity test (default is 0.95).
    
    c_lib: string, optional
        Name of the C shared library used to speed up the iterations through 
        the pairs of input variables (default is the location in the current 
        working directory './unif_radar_lib.so').
    
    eps: float, optional
        Tolerance for the small axis orientation angle (default is the numpy 
        value 'np.finfo(float).eps')
    """
    
    print("")
    print("2D radial scanning statistic (RSS) using Kolmogorov-Smirnov statistic")
    print("Discretization step (in degree): %.1f" %(180./n_theta))
    print("Maximum of RSS values (global statistic) per pair of dimensions\n")
    
    design = np.array(scale_data(x))
    n, d = design.shape
    
    # Discretization of the angular position
    theta_degree = np.linspace(0., 180., n_theta+1)[0:-1]
    theta = theta_degree*np.pi/180. # deg -> rad    
    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
    subdiv_halfcircle = np.vstack((cos_theta, sin_theta)).T
    
    # Specify the C code library and types
    my_library = ctypes.CDLL(c_lib)
    my_library.C_rss2Dloop.argtypes = \
        [ctypes.POINTER(ctypes.c_double)]*4 + [ctypes.c_int]*2
    my_library.C_rss2Dloop.restype = \
        ctypes.POINTER(ctypes.c_double)
    
    global_stat = np.zeros((d, d))
    global_stat_max = 0
    
    # Iterate through pairs of input parameters
    
    for i in range(d-1):
        x1 = design[:,i]
        
        for j in range(i+1, d):
            
            x2 = design[:,j]
            
            x1_c = (ctypes.c_double * n)()
            for index, value in enumerate(x1):
                x1_c[index] = value                
            x2_c = (ctypes.c_double * n)()
            for index, value in enumerate(x2):
                x2_c[index] = value                
            ax_c = (ctypes.c_double * n_theta)()
            for index, value in enumerate(cos_theta):
                ax_c[index] = value                
            ay_c = (ctypes.c_double * n_theta)()
            for index, value in enumerate(sin_theta):
                ay_c[index] = value                
            n_c = ctypes.c_int(n)          
            n_theta_c = ctypes.c_int(n_theta)
            
            out = my_library.C_rss2Dloop(x1_c, x2_c, ax_c, ay_c, n_c, n_theta_c)
            
            projections2 = np.array(out[:n*n_theta])
            projections2 = projections2.reshape((n, n_theta), order="F")
            
            # Iterate through the angle positions
            
            anglewise_stat_ij = np.zeros(n_theta)
            for i_theta in range(n_theta):
                anglewise_stat_ij[i_theta] = ks_stat(projections2[:,i_theta])
    
            # Store the worst pvalue found for the parameters (Xi, Xj)
            global_stat_ij = np.max(anglewise_stat_ij)
            global_stat[i,j] = global_stat_ij
            global_stat[j,i] = global_stat_ij
            
            # Update the worst pvalue found for all parameter couples
            if global_stat_ij > global_stat_max:
                global_stat_max = global_stat_ij.copy()
                pair_worst = [i,j]
                anglewise_stat = anglewise_stat_ij.copy()
            
            print("Plane (%d,%d)  - Greatest observed Kolmogorov distance: %.4f" \
                  % (i+1, j+1, global_stat_ij))

    # Plot results
    
    rss_curve_x = anglewise_stat*subdiv_halfcircle[:,0]
    rss_curve_y = anglewise_stat*subdiv_halfcircle[:,1]
    
    gof_test_stat = ks_quantile(n, 1.-alpha)
    
    design_worst = design[:,pair_worst]
    anglewise_stat_max = np.max(anglewise_stat)
    index_max = np.argmax(anglewise_stat)
    cos_theta_max = subdiv_halfcircle[index_max,0]
    sin_theta_max = subdiv_halfcircle[index_max,1]
    dir_max = subdiv_halfcircle[index_max,:]
    
    plt.figure()
    plt.subplot(111, aspect='equal')
    plt.plot(design_worst[:,0], design_worst[:,1], 'ob')
    plt.xlabel(r"$X_{"+str(pair_worst[0]+1)+"}$", size=16)
    plt.ylabel(r"$X_{"+str(pair_worst[1]+1)+"}$", size=16)
    plt.title("Worst plane", size=16)
    
    plt.figure()
    plt.subplot(111, aspect='equal')
    rx = np.hstack((rss_curve_x, -rss_curve_x, rss_curve_x[0]))
    ry = np.hstack((rss_curve_y, -rss_curve_y, rss_curve_y[0]))
    plt.plot(rx, ry, '-b', label="KS statistic")
    theta_aux = np.linspace(0., 2.*np.pi+0.1, 80)
    plt.plot(gof_test_stat*np.cos(theta_aux), gof_test_stat*np.sin(theta_aux), \
             '-k', label="KS rejection threshold")
    plt.legend(loc="best")
    plt.title(r"Plane (X%d , X%d) - Worst angle: %d$^{\circ}$" \
              % (pair_worst[0]+1, pair_worst[1]+1, theta_degree[index_max]), \
              size = 16)
    
    plt.figure()
    plt.subplot(111, aspect='equal')
    plt.plot(design_worst[:,0], design_worst[:,1], 'ob')
    projections = np.dot(design_worst, dir_max)
    plt.plot(projections*dir_max[0], projections*dir_max[1], 'or')
    if np.abs(cos_theta_max) < eps:
        plt.plot([0,0], [-1,1], "-r")
    else:
        plt.plot([-1,1], np.array([-1,1])*sin_theta_max/cos_theta_max, "-r")
    for i in range(n):
        plt.plot([design_worst[i,0], projections[i]*cos_theta_max], \
                 [design_worst[i,1], projections[i]*sin_theta_max], 'r:')
    plt.xlabel(r"$X_{"+str(pair_worst[0]+1)+"}$", size=16)
    plt.ylabel(r"$X_{"+str(pair_worst[1]+1)+"}$", size=16)
    plt.xlim([-1,1])
    plt.ylim([-1,1])
    plt.title(r"$\theta=%d^{\circ}$" % (theta_degree[index_max]), size=20)
    
    print("\n\nKolmogorov-Smirnov quantile: %.4f\n" % global_stat_max)
    print("Kolmogorov-Smirnov threshold: %.2f\n" % gof_test_stat)
    if global_stat_max < gof_test_stat:
        print("Uniformity hypothesis NOT REJECTED")
    else:
        print("Uniformity hypothesis REJECTED at level %.2f\n" % alpha)


if __name__ == "__main__":
    
    """Test with a low discrepancy sequence"""
    
    plt.ion() # interactive mode ON
    
    n = 60
    d = 20
    
    mySobolSeq = ot.SobolSequence(d)
    design = 2*np.array( mySobolSeq.generate(n) )-1.
    
    check_design(design)

File 'C_rss.c'

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define precision 0.000000000001

#define SQUARE(x) ((x)*(x))
#define CUBE(x) ((x)*(x)*(x))
#define MAX(a,b) (a<b ? b:a)


double C_uniform_cdf(const double *p) {

	return(0.5 * (MAX(*p + 1., 0.) - MAX(*p - 1., 0.)));

}


double C_sumof2uniforms_cdf(const double *p, const double *ax, const double *ay, const int *k) {
	
	double s = 0.;
	
	if ((fabs(ax[*k]) > precision) && (fabs(ay[*k]) > precision)) {
		s += SQUARE(MAX(*p - ax[*k] - ay[*k], 0.));
		s += SQUARE(MAX(*p + ax[*k] + ay[*k], 0.));
		s -= SQUARE(MAX(*p - ax[*k] + ay[*k], 0.));
		s -= SQUARE(MAX(*p + ax[*k] - ay[*k], 0.));
		return(s / (8 * ax[*k] * ay[*k]));
	} 
	else return(C_uniform_cdf(p));

}


double C_sumof3uniforms_cdf(const double *p, const double *ax, const double *ay, const double *az, const int *k) {
	
	double s = 0.;
	
	if ((fabs(ax[*k]) > precision) && (fabs(ay[*k]) > precision) && (fabs(az[*k]) > precision)) {
		s += CUBE(MAX(*p + ax[*k] + ay[*k] + az[*k], 0.));
		s += CUBE(MAX(*p + ax[*k] - ay[*k] - az[*k], 0.));
		s += CUBE(MAX(*p - ax[*k] + ay[*k] - az[*k], 0.));
		s += CUBE(MAX(*p - ax[*k] - ay[*k] + az[*k], 0.));
		s -= CUBE(MAX(*p - ax[*k] - ay[*k] - az[*k], 0.));
		s -= CUBE(MAX(*p - ax[*k] + ay[*k] + az[*k], 0.));
		s -= CUBE(MAX(*p + ax[*k] - ay[*k] + az[*k], 0.));
		s -= CUBE(MAX(*p + ax[*k] + ay[*k] - az[*k], 0.));
		return(s / (48 * ax[*k] * ay[*k] * az[*k]));
	} 
	else if ((fabs(ax[*k]) <= precision)) return(C_sumof2uniforms_cdf(p, ay, az, k));
	else if ((fabs(ay[*k]) <= precision)) return(C_sumof2uniforms_cdf(p, ax, az, k));
	else return(C_sumof2uniforms_cdf(p, ax, ay, k));
}


void C_rss2Dloop(const double *x, const double *y, const double *ax, const double *ay, const int *n, const int *n_theta, double *ans) {
	
	double p;
	
	for (int i_theta=0; i_theta < *n_theta; i_theta++) {
			
		for (int i=0; i < *n; i++) {
			p = x[i]*ax[i_theta] + y[i]*ay[i_theta];
			ans[i + *n * i_theta] = C_sumof2uniforms_cdf(&p, ax, ay, &i_theta);
		}
		
	}

}


void C_rss3Dloop(const double *x, const double *y, const double *z, const double *ax, const double *ay, const double *az, const int *n, const int *n_theta, const int *n_phi, const int *i_phi, double *ans) {
	
	double p;
	int k;
	
	for (int i_theta=0; i_theta < *n_theta; i_theta++) {
			
		for (int i=0; i < *n; i++) {
			k = i_theta + *n_theta * *i_phi;
			p = x[i]*ax[k] + y[i]*ay[k] + z[i]*az[k];
			ans[i + *n * i_theta] = C_sumof3uniforms_cdf(&p, ax, ay, az, &k);
		}
		
	}
}