nsaunier/traffic-intelligence: python/utils.py comparison

comparison python/utils.py @ 668:f8dcf483b296

code to prepare regression variables (remove correlated variables) and record dataset size in experimnets

author	Nicolas Saunier <nicolas.saunier@polymtl.ca>
date	Tue, 26 May 2015 11:19:03 +0200
parents	179b81faa1f8
children	df6be882f325

comparison

equal deleted inserted replaced

-:179b81faa1f8
+:f8dcf483b296
 #! /usr/bin/env python
 ''' Generic utilities.'''
+import matplotlib.pyplot as plt
 from datetime import time, datetime
 from math import sqrt
+from scipy.stats import kruskal, shapiro
 datetimeFormat = "%Y-%m-%d %H:%M:%S"
 #########################
 # Enumerations
 def linearRegression(x, y, deg = 1, plotData = False):
 '''returns the least square estimation of the linear regression of y = ax+b
 as well as the plot'''
 from numpy.lib.polynomial import polyfit
-from matplotlib.pyplot import plot
 from numpy.core.multiarray import arange
 coef = polyfit(x, y, deg)
 if plotData:
 def poly(x):
 result = 0
 for i in range(len(coef)):
 result += coef[i]*x**(len(coef)-i-1)
 return result
-plot(x, y, 'x')
+plt.plot(x, y, 'x')
 xx = arange(min(x), max(x),(max(x)-min(x))/1000)
-plot(xx, [poly(z) for z in xx])
+plt.plot(xx, [poly(z) for z in xx])
 return coef
+def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
+'''Computes (and displays) the correlation matrix for a pandas DataFrame'''
+c=data.corr(correlationMethod)
+if plotFigure:
+fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
+fig.add_subplot(1,1,1)
+#plt.imshow(np.fabs(c), interpolation='none')
+plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
+colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
+#correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
+plt.xticks(range(len(colnames)), colnames, rotation=90)
+plt.yticks(range(len(colnames)), colnames)
+plt.tick_params('both', length=0)
+plt.subplots_adjust(bottom = 0.29)
+plt.colorbar()
+plt.title('Correlation ({})'.format(correlationMethod))
+plt.tight_layout()
+if figureFilename is not None:
+plt.savefig(figureFilename, dpi = 150, transparent = True)
+return c
+def addDummies(data, variables, allVariables = True):
+'''Add binary dummy variables for each value of a nominal variable
+in a pandas DataFrame'''
+from numpy import NaN, dtype
+newVariables = []
+for var in variables:
+if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
+values = data[var].unique()
+if not allVariables:
+values = values[:-1]
+for val in values:
+if val is not NaN:
+newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
+data[newVariable] = (data[var] == val)
+newVariables.append(newVariable)
+return newVariables
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
+'''Studies the influence of (nominal) independent variable over the dependent variable
+Makes tests if the conditional distributions are normal
+using the Shapiro-Wilk test (in which case ANOVA could be used)
+Implements uses the non-parametric Kruskal Wallis test'''
+tmp = data[data[independentVariable].notnull()]
+independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
+if len(independentVariableValues) >= 2:
+for x in independentVariableValues:
+print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
+if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
+print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])
+if plotFigure:
+plt.figure()
+plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+#q25, q75 = tmp[dependentVariable].quantile([.25, .75])
+#plt.ylim(ymax = q75+1.5*(q75-q25))
+plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
+plt.title('{} vs {}'.format(dependentVariable, independentVariable))
+if figureFilenamePrefix is not None:
+plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
+#else:
+# TODO formatter le tableau (html?)
+print tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
+return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+else:
+return None
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
+'''Removes variables from candidate independent variables if
+- if two independent variables are correlated (> maxCorrelationThreshold), one is removed
+- if an independent variable is not correlated with the dependent variable (p>maxCorrelationP)
+Returns the remaining non-correlated variables, correlated with the dependent variable
+correlationFunc is spearmanr or pearsonr from scipy.stats
+TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
+from numpy import dtype
+from copy import copy
+result = copy(independentVariables)
+for v1 in independentVariables:
+if v1 in correlations.index:
+for v2 in independentVariables:
+if v2 != v1 and v2 in correlations.index:
+if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
+if v1 in result and v2 in result:
+print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1))
+result.remove(v2)
+#regressionIndependentVariables = result
+for var in copy(result):
+if data.dtypes[var] != dtype('O'):
+cor, p = correlationFunc(data[dependentVariable], data[var])
+if p > maxCorrelationP:
+print('Removing {} (no correlation p={})'.format(var, p))
+result.remove(var)
+return result
 #########################
 # regression analysis using statsmodels (and pandas)
 #########################
 # TODO make class for experiments?
+# TODO add tests with public dataset downloaded from Internet (IRIS et al)
 def modelString(experiment, dependentVariable, independentVariables):
 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]])
 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
 import statsmodels.formula.api as smf
 if experiment[independentVariables].any():
 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols')
 experiments.loc[i,'r2adj'] = results.rsquared_adj
 experiments.loc[i,'condNum'] = results.condition_number
 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
+experiments.loc[i,'nobs'] = int(results.nobs)
 return experiments
 def generateExperiments(independentVariables):
 '''Generates all possible models for including or not each independent variable'''
 experiments = {}
 experiments[var] = pattern*(2**(nIndependentVariables-i-1))
 experiments = pd.DataFrame(experiments)
 experiments['r2adj'] = 0.
 experiments['condNum'] = np.nan
 experiments['shapiroP'] = -1
+experiments['nobs'] = -1
 return experiments
 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
 '''Generates all possible model with the independentVariables
 and runs them, saving the results in experiments
 model = modelFunc(modelStr, data = data)
 results = model.fit()
 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
 experiments.loc[rowIdx, 'condNum'] = results.condition_number
 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
+experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
 bestModel[currentVarNum] = True
 return experiments

Mercurial > hg > nsaunier > traffic-intelligence

comparison python/utils.py @ 668:f8dcf483b296