Mercurial > hg > nsaunier > traffic-intelligence
comparison python/utils.py @ 667:179b81faa1f8
added regression analysis functions
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Mon, 25 May 2015 13:54:29 +0200 |
| parents | 15e244d2a1b5 |
| children | f8dcf483b296 |
comparison
equal
deleted
inserted
replaced
| 666:93633ce122c3 | 667:179b81faa1f8 |
|---|---|
| 284 return result | 284 return result |
| 285 plot(x, y, 'x') | 285 plot(x, y, 'x') |
| 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) | 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) |
| 287 plot(xx, [poly(z) for z in xx]) | 287 plot(xx, [poly(z) for z in xx]) |
| 288 return coef | 288 return coef |
| 289 | |
| 290 | |
| 291 ######################### | |
| 292 # regression analysis using statsmodels (and pandas) | |
| 293 ######################### | |
| 294 | |
| 295 # TODO make class for experiments? | |
| 296 def modelString(experiment, dependentVariable, independentVariables): | |
| 297 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) | |
| 298 | |
| 299 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): | |
| 300 import statsmodels.formula.api as smf | |
| 301 modelStr = modelString(experiment, dependentVariable, independentVariables) | |
| 302 if regressionType == 'ols': | |
| 303 model = smf.ols(modelStr, data = data) | |
| 304 elif regressionType == 'gls': | |
| 305 model = smf.gls(modelStr, data = data) | |
| 306 elif regressionType == 'rlm': | |
| 307 model = smf.rlm(modelStr, data = data) | |
| 308 else: | |
| 309 print('Unknown regression type {}. Exiting'.format(regressionType)) | |
| 310 import sys | |
| 311 sys.exit() | |
| 312 return model.fit() | |
| 313 | |
| 314 def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'): | |
| 315 '''Runs several models and stores 3 statistics | |
| 316 adjusted R2, condition number (should be small, eg < 1000) | |
| 317 and p-value for Shapiro-Wilk test of residual normality''' | |
| 318 for i,experiment in experiments.iterrows(): | |
| 319 if experiment[independentVariables].any(): | |
| 320 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') | |
| 321 experiments.loc[i,'r2adj'] = results.rsquared_adj | |
| 322 experiments.loc[i,'condNum'] = results.condition_number | |
| 323 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] | |
| 324 return experiments | |
| 325 | |
| 326 def generateExperiments(independentVariables): | |
| 327 '''Generates all possible models for including or not each independent variable''' | |
| 328 experiments = {} | |
| 329 nIndependentVariables = len(independentVariables) | |
| 330 if nIndependentVariables != len(np.unique(independentVariables)): | |
| 331 print("Duplicate variables. Exiting") | |
| 332 import sys | |
| 333 sys.exit() | |
| 334 nModels = 2**nIndependentVariables | |
| 335 for i,var in enumerate(independentVariables): | |
| 336 pattern = [False]*(2**i)+[True]*(2**i) | |
| 337 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) | |
| 338 experiments = pd.DataFrame(experiments) | |
| 339 experiments['r2adj'] = 0. | |
| 340 experiments['condNum'] = np.nan | |
| 341 experiments['shapiroP'] = -1 | |
| 342 return experiments | |
| 343 | |
| 344 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): | |
| 345 '''Generates all possible model with the independentVariables | |
| 346 and runs them, saving the results in experiments | |
| 347 with multiprocess option''' | |
| 348 experiments = generateExperiments(independentVariables) | |
| 349 nModels = len(experiments) | |
| 350 print("Running {} models with {} processes".format(nModels, nProcesses)) | |
| 351 if nProcesses == 1: | |
| 352 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) | |
| 353 else: | |
| 354 pool = Pool(processes = nProcesses) | |
| 355 chunkSize = int(np.ceil(nModels/nProcesses)) | |
| 356 jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)] | |
| 357 return pd.concat([job.get() for job in jobs]) | |
| 358 | |
| 359 def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None): | |
| 360 '''Forward search for best model (based on adjusted R2) | |
| 361 Randomly starting with one variable and adding randomly variables | |
| 362 if they improve the model | |
| 363 | |
| 364 The results are added to experiments if provided as argument | |
| 365 Storing in experiment relies on the index being the number equal | |
| 366 to the binary code derived from the independent variables''' | |
| 367 if experiments is None: | |
| 368 experiments = generateExperiments(independentVariables) | |
| 369 nIndependentVariables = len(independentVariables) | |
| 370 permutation = np.random.permutation(range(nIndependentVariables)).tolist() | |
| 371 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} | |
| 372 print('Tested variables '+', '.join([variableMapping[i] for i in xrange(nIndependentVariables)])) | |
| 373 bestModel = [False]*nIndependentVariables | |
| 374 currentVarNum = 0 | |
| 375 currentR2Adj = 0. | |
| 376 for currentVarNum in xrange(nIndependentVariables): | |
| 377 currentModel = [i for i in bestModel] | |
| 378 currentModel[currentVarNum] = True | |
| 379 rowIdx = sum([0]+[2**i for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) | |
| 380 #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) | |
| 381 if experiments.loc[rowIdx, 'shapiroP'] < 0: | |
| 382 modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables) | |
| 383 model = modelFunc(modelStr, data = data) | |
| 384 results = model.fit() | |
| 385 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj | |
| 386 experiments.loc[rowIdx, 'condNum'] = results.condition_number | |
| 387 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] | |
| 388 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: | |
| 389 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] | |
| 390 bestModel[currentVarNum] = True | |
| 391 return experiments | |
| 392 | |
| 393 def displayModelResults(results, model = None): | |
| 394 import statsmodels.api as sm | |
| 395 '''Displays some model results''' | |
| 396 print results.summary() | |
| 397 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) | |
| 398 if model is not None: | |
| 399 plt.figure() | |
| 400 plt.plot(results.predict(), model.endog, 'x') | |
| 401 x=plt.xlim() | |
| 402 y=plt.ylim() | |
| 403 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') | |
| 404 plt.title('true vs predicted') | |
| 405 plt.figure() | |
| 406 plt.plot(results.predict(), results.resid, 'x') | |
| 407 plt.title('residuals vs predicted') | |
| 408 sm.qqplot(results.resid, fit = True, line = '45') | |
| 409 | |
| 289 | 410 |
| 290 ######################### | 411 ######################### |
| 291 # iterable section | 412 # iterable section |
| 292 ######################### | 413 ######################### |
| 293 | 414 |
