Mercurial > hg > nsaunier > traffic-intelligence
comparison python/utils.py @ 668:f8dcf483b296
code to prepare regression variables (remove correlated variables) and record dataset size in experimnets
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Tue, 26 May 2015 11:19:03 +0200 |
| parents | 179b81faa1f8 |
| children | df6be882f325 |
comparison
equal
deleted
inserted
replaced
| 667:179b81faa1f8 | 668:f8dcf483b296 |
|---|---|
| 1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
| 2 ''' Generic utilities.''' | 2 ''' Generic utilities.''' |
| 3 | 3 |
| 4 import matplotlib.pyplot as plt | |
| 4 from datetime import time, datetime | 5 from datetime import time, datetime |
| 5 from math import sqrt | 6 from math import sqrt |
| 6 | 7 from scipy.stats import kruskal, shapiro |
| 7 | 8 |
| 8 datetimeFormat = "%Y-%m-%d %H:%M:%S" | 9 datetimeFormat = "%Y-%m-%d %H:%M:%S" |
| 9 | 10 |
| 10 ######################### | 11 ######################### |
| 11 # Enumerations | 12 # Enumerations |
| 271 | 272 |
| 272 def linearRegression(x, y, deg = 1, plotData = False): | 273 def linearRegression(x, y, deg = 1, plotData = False): |
| 273 '''returns the least square estimation of the linear regression of y = ax+b | 274 '''returns the least square estimation of the linear regression of y = ax+b |
| 274 as well as the plot''' | 275 as well as the plot''' |
| 275 from numpy.lib.polynomial import polyfit | 276 from numpy.lib.polynomial import polyfit |
| 276 from matplotlib.pyplot import plot | |
| 277 from numpy.core.multiarray import arange | 277 from numpy.core.multiarray import arange |
| 278 coef = polyfit(x, y, deg) | 278 coef = polyfit(x, y, deg) |
| 279 if plotData: | 279 if plotData: |
| 280 def poly(x): | 280 def poly(x): |
| 281 result = 0 | 281 result = 0 |
| 282 for i in range(len(coef)): | 282 for i in range(len(coef)): |
| 283 result += coef[i]*x**(len(coef)-i-1) | 283 result += coef[i]*x**(len(coef)-i-1) |
| 284 return result | 284 return result |
| 285 plot(x, y, 'x') | 285 plt.plot(x, y, 'x') |
| 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) | 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) |
| 287 plot(xx, [poly(z) for z in xx]) | 287 plt.plot(xx, [poly(z) for z in xx]) |
| 288 return coef | 288 return coef |
| 289 | 289 |
| 290 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None): | |
| 291 '''Computes (and displays) the correlation matrix for a pandas DataFrame''' | |
| 292 c=data.corr(correlationMethod) | |
| 293 if plotFigure: | |
| 294 fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0])) | |
| 295 fig.add_subplot(1,1,1) | |
| 296 #plt.imshow(np.fabs(c), interpolation='none') | |
| 297 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm | |
| 298 colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()] | |
| 299 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename) | |
| 300 plt.xticks(range(len(colnames)), colnames, rotation=90) | |
| 301 plt.yticks(range(len(colnames)), colnames) | |
| 302 plt.tick_params('both', length=0) | |
| 303 plt.subplots_adjust(bottom = 0.29) | |
| 304 plt.colorbar() | |
| 305 plt.title('Correlation ({})'.format(correlationMethod)) | |
| 306 plt.tight_layout() | |
| 307 if figureFilename is not None: | |
| 308 plt.savefig(figureFilename, dpi = 150, transparent = True) | |
| 309 return c | |
| 310 | |
| 311 def addDummies(data, variables, allVariables = True): | |
| 312 '''Add binary dummy variables for each value of a nominal variable | |
| 313 in a pandas DataFrame''' | |
| 314 from numpy import NaN, dtype | |
| 315 newVariables = [] | |
| 316 for var in variables: | |
| 317 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2: | |
| 318 values = data[var].unique() | |
| 319 if not allVariables: | |
| 320 values = values[:-1] | |
| 321 for val in values: | |
| 322 if val is not NaN: | |
| 323 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | |
| 324 data[newVariable] = (data[var] == val) | |
| 325 newVariables.append(newVariable) | |
| 326 return newVariables | |
| 327 | |
| 328 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'): | |
| 329 '''Studies the influence of (nominal) independent variable over the dependent variable | |
| 330 | |
| 331 Makes tests if the conditional distributions are normal | |
| 332 using the Shapiro-Wilk test (in which case ANOVA could be used) | |
| 333 Implements uses the non-parametric Kruskal Wallis test''' | |
| 334 tmp = data[data[independentVariable].notnull()] | |
| 335 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) | |
| 336 if len(independentVariableValues) >= 2: | |
| 337 for x in independentVariableValues: | |
| 338 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) | |
| 339 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: | |
| 340 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) | |
| 341 if plotFigure: | |
| 342 plt.figure() | |
| 343 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | |
| 344 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) | |
| 345 #plt.ylim(ymax = q75+1.5*(q75-q25)) | |
| 346 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) | |
| 347 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) | |
| 348 if figureFilenamePrefix is not None: | |
| 349 plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) | |
| 350 #else: | |
| 351 # TODO formatter le tableau (html?) | |
| 352 print tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) | |
| 353 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | |
| 354 else: | |
| 355 return None | |
| 356 | |
| 357 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): | |
| 358 '''Removes variables from candidate independent variables if | |
| 359 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed | |
| 360 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP) | |
| 361 Returns the remaining non-correlated variables, correlated with the dependent variable | |
| 362 | |
| 363 correlationFunc is spearmanr or pearsonr from scipy.stats | |
| 364 | |
| 365 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' | |
| 366 from numpy import dtype | |
| 367 from copy import copy | |
| 368 result = copy(independentVariables) | |
| 369 for v1 in independentVariables: | |
| 370 if v1 in correlations.index: | |
| 371 for v2 in independentVariables: | |
| 372 if v2 != v1 and v2 in correlations.index: | |
| 373 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold: | |
| 374 if v1 in result and v2 in result: | |
| 375 print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1)) | |
| 376 result.remove(v2) | |
| 377 #regressionIndependentVariables = result | |
| 378 for var in copy(result): | |
| 379 if data.dtypes[var] != dtype('O'): | |
| 380 cor, p = correlationFunc(data[dependentVariable], data[var]) | |
| 381 if p > maxCorrelationP: | |
| 382 print('Removing {} (no correlation p={})'.format(var, p)) | |
| 383 result.remove(var) | |
| 384 return result | |
| 385 | |
| 290 | 386 |
| 291 ######################### | 387 ######################### |
| 292 # regression analysis using statsmodels (and pandas) | 388 # regression analysis using statsmodels (and pandas) |
| 293 ######################### | 389 ######################### |
| 294 | 390 |
| 295 # TODO make class for experiments? | 391 # TODO make class for experiments? |
| 392 # TODO add tests with public dataset downloaded from Internet (IRIS et al) | |
| 296 def modelString(experiment, dependentVariable, independentVariables): | 393 def modelString(experiment, dependentVariable, independentVariables): |
| 297 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) | 394 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) |
| 298 | 395 |
| 299 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): | 396 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): |
| 300 import statsmodels.formula.api as smf | 397 import statsmodels.formula.api as smf |
| 319 if experiment[independentVariables].any(): | 416 if experiment[independentVariables].any(): |
| 320 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') | 417 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') |
| 321 experiments.loc[i,'r2adj'] = results.rsquared_adj | 418 experiments.loc[i,'r2adj'] = results.rsquared_adj |
| 322 experiments.loc[i,'condNum'] = results.condition_number | 419 experiments.loc[i,'condNum'] = results.condition_number |
| 323 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] | 420 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] |
| 421 experiments.loc[i,'nobs'] = int(results.nobs) | |
| 324 return experiments | 422 return experiments |
| 325 | 423 |
| 326 def generateExperiments(independentVariables): | 424 def generateExperiments(independentVariables): |
| 327 '''Generates all possible models for including or not each independent variable''' | 425 '''Generates all possible models for including or not each independent variable''' |
| 328 experiments = {} | 426 experiments = {} |
| 337 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) | 435 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) |
| 338 experiments = pd.DataFrame(experiments) | 436 experiments = pd.DataFrame(experiments) |
| 339 experiments['r2adj'] = 0. | 437 experiments['r2adj'] = 0. |
| 340 experiments['condNum'] = np.nan | 438 experiments['condNum'] = np.nan |
| 341 experiments['shapiroP'] = -1 | 439 experiments['shapiroP'] = -1 |
| 440 experiments['nobs'] = -1 | |
| 342 return experiments | 441 return experiments |
| 343 | 442 |
| 344 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): | 443 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): |
| 345 '''Generates all possible model with the independentVariables | 444 '''Generates all possible model with the independentVariables |
| 346 and runs them, saving the results in experiments | 445 and runs them, saving the results in experiments |
| 383 model = modelFunc(modelStr, data = data) | 482 model = modelFunc(modelStr, data = data) |
| 384 results = model.fit() | 483 results = model.fit() |
| 385 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj | 484 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj |
| 386 experiments.loc[rowIdx, 'condNum'] = results.condition_number | 485 experiments.loc[rowIdx, 'condNum'] = results.condition_number |
| 387 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] | 486 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] |
| 487 experiments.loc[rowIdx, 'nobs'] = int(results.nobs) | |
| 388 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: | 488 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: |
| 389 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] | 489 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] |
| 390 bestModel[currentVarNum] = True | 490 bestModel[currentVarNum] = True |
| 391 return experiments | 491 return experiments |
| 392 | 492 |
