Posts in category use

Python Function for Finding the best Continuous Distribution among all OT Distributions

Hy hereafter, you will find the code of a file which contains a python function which is very usefull for finding the best distribution from a numerical sample. Its use is :

> FindBestContinuousDistribution(NS) # where NS is a NumericalSample


This function returns the continuous distribution in OT which best fits the emprirical distribution of the NumericalSample NS. Some distributions may have been not tested due to the nature of the data (distributions which requires positive value for example) or due to the estimation of the parameters which could fail (see the warnings which are printed during execution)

ps: I have not find an easy way to share a file, that's why i have made a copy of the whole file ...

import openturns as ot
import numpy as np
import warnings




def ListAllFactoryName():
    """ Return the name of all the DistributionFactory in OT """
    allDist = dir(ot.dist)
    AllFactory = []
    for elt in allDist:
        if ('Factory' in elt) and ('_' not in elt) and ('Histogram' not in elt):
            AllFactory.append(elt)
    return(AllFactory)



def ListAllFactory():
    """ Return a dictionnary with DistributionFactory objects of OT with the distinction
    'AllContinuousFactory' and 'AllDiscreteFactory' """
    result = {}
    AllContinuousFactoryName = []
    AllContinuousFactory =[]
    AllDiscreteFactoryName = []
    AllDiscreteFactory =[]    
    AllFactoryName = ListAllFactoryName()    
    for factoryName in AllFactoryName:
        factoryName = 'ot.' + factoryName
        StrDist = factoryName.replace('Factory','()')
        Dist = eval(StrDist)
        if Dist.isContinuous():
            factoryName = factoryName.replace('Factory','Factory()')            
            AllContinuousFactoryName.append(factoryName)
            factory = eval(factoryName)
            AllContinuousFactory.append(factory)
        else :
            factoryName = factoryName.replace('Factory','Factory()')            
            AllDiscreteFactoryName.append(factoryName)
            factory = eval(factoryName)
            AllDiscreteFactory.append(factory)            
    result['AllContinuousFactoryName']=AllContinuousFactoryName
    result['AllContinuousFactory']=AllContinuousFactory
    result['AllDiscreteFactoryName']=AllDiscreteFactoryName
    result['AllDiscreteFactory']=AllDiscreteFactory
    return(result)


def FindBestContinuousDistribution(NS):
    """ Return the continuous distribution in OT which best fits the emprirical distribution of the NumericalSample NS.
    Some distributions may have been not tested due to the nature of the data (distributions which requires positive value for example)
    or due to the estimation of the parameters which could fail (see the warnings)."""
    print '\n Return the continuous distribution in OT which best fits the emprirical distribution of the NumericalSample NS.'
    print 'Some distributions may have been not tested due to the nature of the data (distributions which requires positive value for example)'
    print 'or due to the estimation of the parameters which could fail (see the warnings).'
    if not isinstance(NS,ot.statistics.NumericalSample):
        warnings.warn( '-- The argument is not NumericalSample')
        return
    if NS.getDimension()>1:
        warnings.warn( '-- The NumericalSample must be of dimension 1')
        return
    AllFactory = ListAllFactory()
    ContinuousFactory = AllFactory['AllContinuousFactory']
    ContinuousFactoryName = AllFactory['AllContinuousFactoryName']

    DistTestedOK =[]
    DistTestedNOK = []
    ListFactoryOK = []
    ListFactoryNOK = []
    TestResu = []
    TestPValue = []    
    i = 0
    
    print('\n-------------- NOT TESTED DISTRIBUTIONS ----------------\n')
    for factory in ContinuousFactory:
        Name = ContinuousFactoryName[i]
        Name = Name.replace('Factory()','')
        Name = Name.replace('ot.','')
        try:
            EstimatedDistribution = factory.build(NS)
            ListFactoryOK.append(factory)
            DistTestedOK.append(Name)
            Test = ot.FittingTest.Kolmogorov(NS,EstimatedDistribution)
            TestResu.append(Test.getBinaryQualityMeasure())
            TestPValue.append(Test.getPValue())
        except Exception as e:
            ListFactoryNOK.append(factory)
            DistTestedNOK.append(Name)
            print(Name + ' Distribution Not tested')
            print(e)
            print('----\n')
            #pass
        i = i+1    
    print(DistTestedNOK)
    print('-----------------------------------------------------------------------\n')
    
    print('\n---------------- TESTED DISTRIBUTIONS  -------------------------------')    
    print(DistTestedOK)
    print('--------------------------------------\n')

    print('\n------- ACCEPTED DISTRIBUTIONS WITH KS TEST (at level 5%) -----------\n')
    DistTestedOK = np.array(DistTestedOK)
    DistAccepted = DistTestedOK[np.array(TestResu)==True]
    TestPValue = np.array(TestPValue)
    DistAcceptedPValue = TestPValue[np.array(TestResu)==True]
    Tab = [DistAccepted,DistAcceptedPValue]
    print 'KS-pvalue' + '\t\t', 'Ditribution' 
    print('------------------------------------------')
    for i in range(len(DistAccepted)):
      print DistAcceptedPValue[i],'\t\t',DistAccepted[i]
    #print(np.array(Tab).transpose())
    print('\n--------------------------------------\n')
    
    
    collectionFactory = ot.DistributionFactoryCollection(ListFactoryOK)
    bestDistributionKolmogorov = ot.FittingTest.BestModelKolmogorov(NS,collectionFactory)
    bestDistributionBIC =  ot.FittingTest_BestModelBIC(NS,collectionFactory)
    print("\n--- Best Distribution with Kolmogorov Test ----------------------------")
    print(bestDistributionKolmogorov)
    print('-------------------------------------------------------------------------\n')    

    print("\n--- Best Distribution with BIC ----------------------------------------")
    print(bestDistributionBIC)
    print('-------------------------------------------------------------------------\n')    


    return bestDistributionBIC