Mercurial > hg > nsaunier > traffic-intelligence
comparison trafficintelligence/utils.py @ 1156:f7fbe624fff7
added helper functions for categorical variables
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Fri, 25 Sep 2020 11:56:59 -0400 |
| parents | 342701cdac30 |
| children | d71a4d174b1a |
comparison
equal
deleted
inserted
replaced
| 1155:fd729e8f073c | 1156:f7fbe624fff7 |
|---|---|
| 6 from pathlib import Path | 6 from pathlib import Path |
| 7 from math import sqrt, ceil, floor | 7 from math import sqrt, ceil, floor |
| 8 from copy import deepcopy, copy | 8 from copy import deepcopy, copy |
| 9 from collections import Counter | 9 from collections import Counter |
| 10 | 10 |
| 11 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t | 11 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t, chi2_contingency |
| 12 from scipy.spatial import distance | 12 from scipy.spatial import distance |
| 13 from scipy.sparse import dok_matrix | 13 from scipy.sparse import dok_matrix |
| 14 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit, float as npfloat | 14 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit, float as npfloat |
| 15 from numpy.random import random_sample, permutation as nppermutation | 15 from numpy.random import random_sample, permutation as nppermutation |
| 16 from pandas import DataFrame, concat | 16 from pandas import DataFrame, concat, crosstab |
| 17 import matplotlib.pyplot as plt | 17 import matplotlib.pyplot as plt |
| 18 | 18 |
| 19 datetimeFormat = "%Y-%m-%d %H:%M:%S" | 19 datetimeFormat = "%Y-%m-%d %H:%M:%S" |
| 20 | 20 |
| 21 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 | 21 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 |
| 652 while s < maxSum: | 652 while s < maxSum: |
| 653 x = d.rvs() | 653 x = d.rvs() |
| 654 sample.append(x) | 654 sample.append(x) |
| 655 s += x | 655 s += x |
| 656 return sample | 656 return sample |
| 657 | 657 |
def cramers_v(x, y):
    """Calculate the bias-corrected Cramer's V statistic for categorical-categorical association.

    Uses the bias correction from Bergsma, Wicher (2013),
    "A bias-correction for Cramer's V and Tschuprow's T",
    Journal of the Korean Statistical Society 42 (2013): 323-328
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
    https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792

    x and y are the two samples of categorical values, cross-tabulated
    with pandas crosstab (presumably pandas Series of equal length).

    Returns a float: 0 means no association, 1 means perfect association.
    Returns NaN when the statistic is undefined, i.e. when there is at most
    one observation, or when one of the variables has a single category or
    as many categories as observations (the corrected denominator is zero).

    Note: chi2_contingency is called with its default arguments, so
    Yates' continuity correction is applied for 2x2 tables.
    """
    confusionMatrix = crosstab(x, y)
    chi2 = chi2_contingency(confusionMatrix)[0]
    n = confusionMatrix.sum().sum()
    if n <= 1:
        # the bias correction divides by n-1
        return float('nan')
    phi2 = chi2/n
    r, k = confusionMatrix.shape
    # bias-corrected phi^2 and numbers of rows/columns
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    denominator = min(kcorr-1, rcorr-1)
    if denominator <= 0:
        # degenerate table: avoid dividing by zero
        return float('nan')
    return sqrt(phi2corr/denominator)
| 674 | |
def categoricalCorrelationMatrix(data, categoricalVariables):
    '''Returns the correlation matrix for the categorical variables

    data is indexed by variable name to get each sample
    (presumably a pandas DataFrame - confirm against callers)
    categoricalVariables is the sequence of variable names to compare

    The result is a symmetric square numpy array, in the order of
    categoricalVariables, with ones on the diagonal and the value of
    cramers_v for each pair of variables elsewhere'''
    nVariables = len(categoricalVariables)
    corr = ones((nVariables, nVariables))
    for i in range(nVariables):
        # fill the lower triangle and mirror it
        for j in range(i):
            corr[i,j] = cramers_v(data[categoricalVariables[i]], data[categoricalVariables[j]])
            corr[j,i] = corr[i,j]
    return corr
| 683 | |
| 658 ######################### | 684 ######################### |
| 659 # regression analysis using statsmodels (and pandas) | 685 # regression analysis using statsmodels (and pandas) |
| 660 ######################### | 686 ######################### |
| 661 | 687 |
| 662 # TODO make class for experiments? | 688 # TODO make class for experiments? |
