comparison trafficintelligence/utils.py @ 1156:f7fbe624fff7

added helper functions for categorical variables
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 25 Sep 2020 11:56:59 -0400
parents 342701cdac30
children d71a4d174b1a
comparison
equal deleted inserted replaced
1155:fd729e8f073c 1156:f7fbe624fff7
6 from pathlib import Path 6 from pathlib import Path
7 from math import sqrt, ceil, floor 7 from math import sqrt, ceil, floor
8 from copy import deepcopy, copy 8 from copy import deepcopy, copy
9 from collections import Counter 9 from collections import Counter
10 10
11 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t 11 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t, chi2_contingency
12 from scipy.spatial import distance 12 from scipy.spatial import distance
13 from scipy.sparse import dok_matrix 13 from scipy.sparse import dok_matrix
14 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit, float as npfloat 14 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit, float as npfloat
15 from numpy.random import random_sample, permutation as nppermutation 15 from numpy.random import random_sample, permutation as nppermutation
16 from pandas import DataFrame, concat 16 from pandas import DataFrame, concat, crosstab
17 import matplotlib.pyplot as plt 17 import matplotlib.pyplot as plt
18 18
19 datetimeFormat = "%Y-%m-%d %H:%M:%S" 19 datetimeFormat = "%Y-%m-%d %H:%M:%S"
20 20
21 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 21 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720
652 while s < maxSum: 652 while s < maxSum:
653 x = d.rvs() 653 x = d.rvs()
654 sample.append(x) 654 sample.append(x)
655 s += x 655 s += x
656 return sample 656 return sample
657 657
def cramers_v(x, y):
    """Returns the bias-corrected Cramer's V statistic
    measuring the association between two categorical variables

    x and y are the paired observations of the two variables
    (they are cross-tabulated with pandas crosstab)

    Uses the bias correction from Wicher Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
    https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792

    Note: there is no guard for degenerate tables, the denominator is zero
    if one of the variables has a single category
    """
    contingencyTable = crosstab(x, y)
    nRows, nColumns = contingencyTable.shape
    # first element of the chi2_contingency result is the chi-square statistic
    chi2Statistic = chi2_contingency(contingencyTable)[0]
    nObservations = contingencyTable.sum().sum()
    # corrected phi squared is truncated at 0 so that the square root is defined
    phi2Corrected = max(0, chi2Statistic/nObservations-((nColumns-1)*(nRows-1))/(nObservations-1))
    # corrected numbers of rows and columns
    nRowsCorrected = nRows-((nRows-1)**2)/(nObservations-1)
    nColumnsCorrected = nColumns-((nColumns-1)**2)/(nObservations-1)
    denominator = min((nColumnsCorrected-1), (nRowsCorrected-1))
    return sqrt(phi2Corrected/denominator)
674
def categoricalCorrelationMatrix(data, categoricalVariables):
    '''Returns the correlation matrix for the categorical variables

    data is indexed by variable name (eg a pandas DataFrame)
    categoricalVariables is the list of the names of the variables to compare

    The result is a symmetric square numpy array
    (one row and column per variable, in the order of categoricalVariables),
    with ones on the diagonal and the Cramer's V statistic
    of each pair of variables (see cramers_v) elsewhere'''
    nVariables = len(categoricalVariables)
    corr = ones((nVariables, nVariables))
    for i in range(nVariables):
        # only the lower triangle is computed, the matrix is symmetric
        for j in range(i):
            corr[i,j] = cramers_v(data[categoricalVariables[i]], data[categoricalVariables[j]])
            corr[j,i] = corr[i,j]
    return corr
683
658 ######################### 684 #########################
659 # regression analysis using statsmodels (and pandas) 685 # regression analysis using statsmodels (and pandas)
660 ######################### 686 #########################
661 687
662 # TODO make class for experiments? 688 # TODO make class for experiments?