Mercurial > hg > nsaunier > traffic-intelligence
comparison python/utils.py @ 674:01b89182891a
corrected bug for intersection of lines (thanks to Paul for finding)
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Tue, 26 May 2015 18:16:51 +0200 |
| parents | 5473b7460375 |
| children | ab3fdff42624 |
comparison
equal
deleted
inserted
replaced
| 673:5505f9dbb28e | 674:01b89182891a |
|---|---|
| 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') |
| 317 data[newVariable] = (data[var] == val) | 317 data[newVariable] = (data[var] == val) |
| 318 newVariables.append(newVariable) | 318 newVariables.append(newVariable) |
| 319 return newVariables | 319 return newVariables |
| 320 | 320 |
| 321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'): | 321 def frenchify(s, displayNames): |
| | 322 return s |
| | 323 |
| | 324 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False): |
| 322 '''Studies the influence of (nominal) independent variable over the dependent variable | 325 '''Studies the influence of (nominal) independent variable over the dependent variable |
| 323 | 326 |
| 324 Makes tests if the conditional distributions are normal | 327 Makes tests if the conditional distributions are normal |
| 325 using the Shapiro-Wilk test (in which case ANOVA could be used) | 328 using the Shapiro-Wilk test (in which case ANOVA could be used) |
| 326 Implements uses the non-parametric Kruskal Wallis test''' | 329 Implements uses the non-parametric Kruskal Wallis test''' |
| 327 tmp = data[data[independentVariable].notnull()] | 330 tmp = data[data[independentVariable].notnull()] |
| 328 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) | 331 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) |
| 329 if len(independentVariableValues) >= 2: | 332 if len(independentVariableValues) >= 2: |
| | 333 if saveLatex: |
| | 334 from storage import openCheck |
| | 335 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') |
| 330 for x in independentVariableValues: | 336 for x in independentVariableValues: |
| 331 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) | 337 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) |
| 332 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: | 338 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: |
| 333 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) | 339 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) |
| 334 if plotFigure: | 340 if plotFigure: |
| 336 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | 342 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) |
| 337 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) | 343 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) |
| 338 #plt.ylim(ymax = q75+1.5*(q75-q25)) | 344 #plt.ylim(ymax = q75+1.5*(q75-q25)) |
| 339 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) | 345 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) |
| 340 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) | 346 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) |
| 341 if figureFilenamePrefix is not None: | 347 if filenamePrefix is not None: |
| 342 plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) | 348 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) |
| 343 #else: | 349 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) |
| 344 # TODO formatter le tableau (html?) | 350 if saveLatex: |
| 345 print tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) | 351 out.write('\begin{table}[htp!]') |
| | 352 out.write(frenchify(table.to_latex(), displayNames)) |
| | 353 out.write('\end{table}[htp!]') |
| | 354 else: |
| | 355 print table |
| 346 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | 356 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) |
| 347 else: | 357 else: |
| 348 return None | 358 return None |
| 349 | 359 |
| 350 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): | 360 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): |
| 442 from pandas import concat | 452 from pandas import concat |
| 443 from multiprocessing import Pool | 453 from multiprocessing import Pool |
| 444 experiments = generateExperiments(independentVariables) | 454 experiments = generateExperiments(independentVariables) |
| 445 nModels = len(experiments) | 455 nModels = len(experiments) |
| 446 print("Running {} models with {} processes".format(nModels, nProcesses)) | 456 print("Running {} models with {} processes".format(nModels, nProcesses)) |
| | 457 print("IndependentVariables: {}".format(independentVariables)) |
| 447 if nProcesses == 1: | 458 if nProcesses == 1: |
| 448 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) | 459 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) |
| 449 else: | 460 else: |
| 450 pool = Pool(processes = nProcesses) | 461 pool = Pool(processes = nProcesses) |
| 451 chunkSize = int(ceil(nModels/nProcesses)) | 462 chunkSize = int(ceil(nModels/nProcesses)) |
