Mercurial > hg > nsaunier > traffic-intelligence
comparison trafficintelligence/utils.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Fri, 15 Jun 2018 11:19:10 -0400 |
| parents | python/utils.py@a13f47c8931d |
| children | c6cf75a2ed08 |
comparison
equal
deleted
inserted
replaced
| 1027:6129296848d3 | 1028:cc5cb04b04b0 |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 ''' Generic utilities.''' | |
| 4 | |
| 5 import matplotlib.pyplot as plt | |
| 6 from datetime import time, datetime | |
| 7 from argparse import ArgumentTypeError | |
| 8 from pathlib import Path | |
| 9 from math import sqrt, ceil, floor | |
| 10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm | |
| 11 from scipy.spatial import distance | |
| 12 from scipy.sparse import dok_matrix | |
| 13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log | |
| 14 | |
| 15 | |
| 16 datetimeFormat = "%Y-%m-%d %H:%M:%S" | |
| 17 | |
| 18 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 | |
| 19 | |
| 20 ######################### | |
| 21 # Strings | |
| 22 ######################### | |
| 23 | |
| 24 def upperCaseFirstLetter(s): | |
| 25 words = s.split(' ') | |
| 26 lowerWords = [w[0].upper()+w[1:].lower() for w in words] | |
| 27 return ' '.join(lowerWords) | |
| 28 | |
| 29 class TimeConverter: | |
| 30 def __init__(self, datetimeFormat = datetimeFormat): | |
| 31 self.datetimeFormat = datetimeFormat | |
| 32 | |
| 33 def convert(self, s): | |
| 34 try: | |
| 35 return datetime.strptime(s, self.datetimeFormat) | |
| 36 except ValueError: | |
| 37 msg = "Not a valid date: '{0}'.".format(s) | |
| 38 raise ArgumentTypeError(msg) | |
| 39 | |
| 40 ######################### | |
| 41 # Enumerations | |
| 42 ######################### | |
| 43 | |
| 44 def inverseEnumeration(l): | |
| 45 'Returns the dictionary that provides for each element in the input list its index in the input list' | |
| 46 result = {} | |
| 47 for i,x in enumerate(l): | |
| 48 result[x] = i | |
| 49 return result | |
| 50 | |
| 51 ######################### | |
| 52 # Simple statistics | |
| 53 ######################### | |
| 54 | |
| 55 def logNormalMeanVar(loc, scale): | |
| 56 '''location and scale are respectively the mean and standard deviation of the normal in the log-normal distribution | |
| 57 https://en.wikipedia.org/wiki/Log-normal_distribution | |
| 58 | |
| 59 same as lognorm.stats(scale, 0, exp(loc))''' | |
| 60 mean = exp(loc+(scale**2)/2) | |
| 61 var = (exp(scale**2)-1)*exp(2*loc+scale**2) | |
| 62 return mean, var | |
| 63 | |
| 64 def fitLogNormal(x): | |
| 65 'returns the fitted location and scale of the lognormal (general definition)' | |
| 66 shape, loc, scale = lognorm.fit(x, floc=0.) | |
| 67 return log(scale), shape | |
| 68 | |
| 69 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False): | |
| 70 from scipy.stats.distributions import norm | |
| 71 if nRoundingDigits is None: | |
| 72 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200. | |
| 73 else: | |
| 74 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits) | |
| 75 stdev = round(stdev, nRoundingDigits) | |
| 76 tolerance = round(tolerance, nRoundingDigits) | |
| 77 if printLatex: | |
| 78 print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.format(0.5+percentConfidence/200.,k, stdev, tolerance)) | |
| 79 return (k*stdev/tolerance)**2 | |
| 80 | |
| 81 def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd = True, printLatex = False): | |
| 82 '''if trueStd, use normal distribution, otherwise, Student | |
| 83 | |
| 84 Use otherwise t.interval or norm.interval for the boundaries | |
| 85 ex: norm.interval(0.95) | |
| 86 t.interval(0.95, nSamples-1)''' | |
| 87 from scipy.stats.distributions import norm, t | |
| 88 if trueStd: | |
| 89 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) | |
| 90 else: # use Student | |
| 91 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2) | |
| 92 e = k*stdev/sqrt(nSamples) | |
| 93 if printLatex: | |
| 94 print('${0} \pm {1}\\frac{{{2}}}{{\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples)) | |
| 95 return mean-e, mean+e | |
| 96 | |
| 97 def computeChi2(expected, observed): | |
| 98 '''Returns the Chi2 statistics''' | |
| 99 return sum([((e-o)*(e-o))/float(e) for e, o in zip(expected, observed)]) | |
| 100 | |
| 101 class generateDistribution(rv_continuous): | |
| 102 def __init__(self, values, probabilities): | |
| 103 '''The values (and corresponding probabilities) are supposed to be sorted by value | |
| 104 for v, p in zip(values, probabilities): P(X<=v)=p''' | |
| 105 assert probabilities[0]==0 | |
| 106 self.values = values | |
| 107 self.probabilities = probabilities | |
| 108 | |
| 109 def _cdf(self, x): | |
| 110 if x < self.values[0]: | |
| 111 return self.probabilities[0] | |
| 112 else: | |
| 113 i=0 | |
| 114 while i+1<len(self.values) and self.values[i+1] < x: | |
| 115 i += 1 | |
| 116 if i == len(self.values)-1: | |
| 117 return self.probabilities[-1] | |
| 118 else: | |
| 119 return (self.probabilities[i+1]-self.probabilities[i])/(self.values[i+1]-self.values[i]) | |
| 120 | |
| 121 class DistributionSample(object): | |
| 122 def nSamples(self): | |
| 123 return sum(self.counts) | |
| 124 | |
| 125 def cumulativeDensityFunction(sample, normalized = False): | |
| 126 '''Returns the cumulative density function of the sample of a random variable''' | |
| 127 xaxis = sorted(sample) | |
| 128 counts = arange(1,len(sample)+1) # dtype = float | |
| 129 if normalized: | |
| 130 counts /= float(len(sample)) | |
| 131 return xaxis, counts | |
| 132 | |
| 133 class DiscreteDistributionSample(DistributionSample): | |
| 134 '''Class to represent a sample of a distribution for a discrete random variable''' | |
| 135 def __init__(self, categories, counts): | |
| 136 self.categories = categories | |
| 137 self.counts = counts | |
| 138 | |
| 139 def mean(self): | |
| 140 result = [float(x*y) for x,y in zip(self.categories, self.counts)] | |
| 141 return npsum(result)/self.nSamples() | |
| 142 | |
| 143 def var(self, mean = None): | |
| 144 if not mean: | |
| 145 m = self.mean() | |
| 146 else: | |
| 147 m = mean | |
| 148 result = 0. | |
| 149 squares = [float((x-m)*(x-m)*y) for x,y in zip(self.categories, self.counts)] | |
| 150 return npsum(squares)/(self.nSamples()-1) | |
| 151 | |
| 152 def referenceCounts(self, probability): | |
| 153 '''probability is a function that returns the probability of the random variable for the category values''' | |
| 154 refProba = [probability(c) for c in self.categories] | |
| 155 refProba[-1] = 1-npsum(refProba[:-1]) | |
| 156 refCounts = [r*self.nSamples() for r in refProba] | |
| 157 return refCounts, refProba | |
| 158 | |
| 159 class ContinuousDistributionSample(DistributionSample): | |
| 160 '''Class to represent a sample of a distribution for a continuous random variable | |
| 161 with the number of observations for each interval | |
| 162 intervals (categories variable) are defined by their left limits, the last one being the right limit | |
| 163 categories contain therefore one more element than the counts''' | |
| 164 def __init__(self, categories, counts): | |
| 165 # todo add samples for initialization and everything to None? (or setSamples?) | |
| 166 self.categories = categories | |
| 167 self.counts = counts | |
| 168 | |
| 169 @staticmethod | |
| 170 def generate(sample, categories): | |
| 171 if min(sample) < min(categories): | |
| 172 print('Sample has lower min than proposed categories ({}, {})'.format(min(sample), min(categories))) | |
| 173 if max(sample) > max(categories): | |
| 174 print('Sample has higher max than proposed categories ({}, {})'.format(max(sample), max(categories))) | |
| 175 dist = ContinuousDistributionSample(sorted(categories), [0]*(len(categories)-1)) | |
| 176 for s in sample: | |
| 177 i = 0 | |
| 178 while i<len(dist.categories) and dist.categories[i] <= s: | |
| 179 i += 1 | |
| 180 if i <= len(dist.counts): | |
| 181 dist.counts[i-1] += 1 | |
| 182 #print('{} in {} {}'.format(s, dist.categories[i-1], dist.categories[i])) | |
| 183 else: | |
| 184 print('Element {} is not in the categories'.format(s)) | |
| 185 return dist | |
| 186 | |
| 187 def mean(self): | |
| 188 result = 0. | |
| 189 for i in range(len(self.counts)-1): | |
| 190 result += self.counts[i]*(self.categories[i]+self.categories[i+1])/2 | |
| 191 return result/self.nSamples() | |
| 192 | |
| 193 def var(self, mean = None): | |
| 194 if not mean: | |
| 195 m = self.mean() | |
| 196 else: | |
| 197 m = mean | |
| 198 result = 0. | |
| 199 for i in range(len(self.counts)-1): | |
| 200 mid = (self.categories[i]+self.categories[i+1])/2 | |
| 201 result += self.counts[i]*(mid - m)*(mid - m) | |
| 202 return result/(self.nSamples()-1) | |
| 203 | |
| 204 def referenceCounts(self, cdf): | |
| 205 '''cdf is a cumulative distribution function | |
| 206 returning the probability of the variable being less that x''' | |
| 207 # refCumulativeCounts = [0]#[cdf(self.categories[0][0])] | |
| 208 # for inter in self.categories: | |
| 209 # refCumulativeCounts.append(cdf(inter[1])) | |
| 210 refCumulativeCounts = [cdf(x) for x in self.categories[1:-1]] | |
| 211 | |
| 212 refProba = [refCumulativeCounts[0]] | |
| 213 for i in xrange(1,len(refCumulativeCounts)): | |
| 214 refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1]) | |
| 215 refProba.append(1-refCumulativeCounts[-1]) | |
| 216 refCounts = [p*self.nSamples() for p in refProba] | |
| 217 | |
| 218 return refCounts, refProba | |
| 219 | |
| 220 def printReferenceCounts(self, refCounts=None): | |
| 221 if refCounts: | |
| 222 ref = refCounts | |
| 223 else: | |
| 224 ref = self.referenceCounts | |
| 225 for i in xrange(len(ref[0])): | |
| 226 print('{0}-{1} & {2:0.3} & {3:0.3} \\\\'.format(self.categories[i],self.categories[i+1],ref[1][i], ref[0][i])) | |
| 227 | |
| 228 | |
| 229 ######################### | |
| 230 # maths section | |
| 231 ######################### | |
| 232 | |
| 233 # def kernelSmoothing(sampleX, X, Y, weightFunc, halfwidth): | |
| 234 # '''Returns a smoothed weighted version of Y at the predefined values of sampleX | |
| 235 # Sum_x weight(sample_x,x) * y(x)''' | |
| 236 # from numpy import zeros, array | |
| 237 # smoothed = zeros(len(sampleX)) | |
| 238 # for i,x in enumerate(sampleX): | |
| 239 # weights = array([weightFunc(x,xx, halfwidth) for xx in X]) | |
| 240 # if sum(weights)>0: | |
| 241 # smoothed[i] = sum(weights*Y)/sum(weights) | |
| 242 # else: | |
| 243 # smoothed[i] = 0 | |
| 244 # return smoothed | |
| 245 | |
| 246 def kernelSmoothing(x, X, Y, weightFunc, halfwidth): | |
| 247 '''Returns the smoothed estimate of (X,Y) at x | |
| 248 Sum_x weight(sample_x,x) * y(x)''' | |
| 249 weights = array([weightFunc(x,observedx, halfwidth) for observedx in X]) | |
| 250 if sum(weights)>0: | |
| 251 return sum(weights*Y)/sum(weights) | |
| 252 else: | |
| 253 return 0 | |
| 254 | |
| 255 def uniform(center, x, halfwidth): | |
| 256 if abs(center-x)<halfwidth: | |
| 257 return 1. | |
| 258 else: | |
| 259 return 0. | |
| 260 | |
| 261 def gaussian(center, x, halfwidth): | |
| 262 return exp(-((center-x)/halfwidth)**2/2) | |
| 263 | |
| 264 def epanechnikov(center, x, halfwidth): | |
| 265 diff = abs(center-x) | |
| 266 if diff<halfwidth: | |
| 267 return 1.-(diff/halfwidth)**2 | |
| 268 else: | |
| 269 return 0. | |
| 270 | |
| 271 def triangular(center, x, halfwidth): | |
| 272 diff = abs(center-x) | |
| 273 if diff<halfwidth: | |
| 274 return 1.-abs(diff/halfwidth) | |
| 275 else: | |
| 276 return 0. | |
| 277 | |
| 278 def medianSmoothing(x, X, Y, halfwidth): | |
| 279 '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]''' | |
| 280 return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth]) | |
| 281 | |
| 282 def argmaxDict(d): | |
| 283 return max(d, key=d.get) | |
| 284 | |
| 285 def deltaFrames(t1, t2, frameRate): | |
| 286 '''Returns the number of frames between t1 and t2 | |
| 287 positive if t1<=t2, negative otherwise''' | |
| 288 if t1 > t2: | |
| 289 return -(t1-t2).seconds*frameRate | |
| 290 else: | |
| 291 return (t2-t1).seconds*frameRate | |
| 292 | |
| 293 def framesToTime(nFrames, frameRate, initialTime = time()): | |
| 294 '''returns a datetime.time for the time in hour, minutes and seconds | |
| 295 initialTime is a datetime.time''' | |
| 296 seconds = int(floor(float(nFrames)/float(frameRate))+initialTime.hour*3600+initialTime.minute*60+initialTime.second) | |
| 297 h = int(floor(seconds/3600.)) | |
| 298 seconds = seconds - h*3600 | |
| 299 m = int(floor(seconds/60)) | |
| 300 seconds = seconds - m*60 | |
| 301 return time(h, m, seconds) | |
| 302 | |
| 303 def timeToFrames(t, frameRate): | |
| 304 return frameRate*(t.hour*3600+t.minute*60+t.second) | |
| 305 | |
| 306 def sortXY(X,Y): | |
| 307 'returns the sorted (x, Y(x)) sorted on X' | |
| 308 D = {} | |
| 309 for x, y in zip(X,Y): | |
| 310 D[x]=y | |
| 311 xsorted = sorted(D.keys()) | |
| 312 return xsorted, [D[x] for x in xsorted] | |
| 313 | |
| 314 def compareLengthForSort(i, j): | |
| 315 if len(i) < len(j): | |
| 316 return -1 | |
| 317 elif len(i) == len(j): | |
| 318 return 0 | |
| 319 else: | |
| 320 return 1 | |
| 321 | |
| 322 def sortByLength(instances, reverse = False): | |
| 323 '''Returns a new list with the instances sorted by length (method __len__) | |
| 324 reverse is passed to sorted''' | |
| 325 return sorted(instances, key = len, reverse = reverse) | |
| 326 | |
| 327 def ceilDecimals(v, nDecimals): | |
| 328 '''Rounds the number at the nth decimal | |
| 329 eg 1.23 at 0 decimal is 2, at 1 decimal is 1.3''' | |
| 330 tens = 10**nDecimals | |
| 331 return ceil(v*tens)/tens | |
| 332 | |
| 333 def inBetween(bound1, bound2, x): | |
| 334 'useful if one does not know the order of bound1/bound2' | |
| 335 return bound1 <= x <= bound2 or bound2 <= x <= bound1 | |
| 336 | |
| 337 def pointDistanceL2(x1,y1,x2,y2): | |
| 338 ''' Compute point-to-point distance (L2 norm, ie Euclidean distance)''' | |
| 339 return sqrt((x2-x1)**2+(y2-y1)**2) | |
| 340 | |
| 341 def crossProduct(l1, l2): | |
| 342 return l1[0]*l2[1]-l1[1]*l2[0] | |
| 343 | |
| 344 def cat_mvgavg(cat_list, halfWidth): | |
| 345 ''' Return a list of categories/values smoothed according to a window. | |
| 346 halfWidth is the search radius on either side''' | |
| 347 from copy import deepcopy | |
| 348 smoothed = deepcopy(cat_list) | |
| 349 for point in range(len(cat_list)): | |
| 350 lower_bound_check = max(0,point-halfWidth) | |
| 351 upper_bound_check = min(len(cat_list)-1,point+halfWidth+1) | |
| 352 window_values = cat_list[lower_bound_check:upper_bound_check] | |
| 353 smoothed[point] = max(set(window_values), key=window_values.count) | |
| 354 return smoothed | |
| 355 | |
| 356 def filterMovingWindow(inputSignal, halfWidth): | |
| 357 '''Returns an array obtained after the smoothing of the input by a moving average | |
| 358 The first and last points are copied from the original.''' | |
| 359 width = float(halfWidth*2+1) | |
| 360 win = ones(width,'d') | |
| 361 result = convolve(win/width,array(inputSignal),'same') | |
| 362 result[:halfWidth] = inputSignal[:halfWidth] | |
| 363 result[-halfWidth:] = inputSignal[-halfWidth:] | |
| 364 return result | |
| 365 | |
| 366 def linearRegression(x, y, deg = 1, plotData = False): | |
| 367 '''returns the least square estimation of the linear regression of y = ax+b | |
| 368 as well as the plot''' | |
| 369 from numpy.lib.polynomial import polyfit | |
| 370 from numpy.core.multiarray import arange | |
| 371 coef = polyfit(x, y, deg) | |
| 372 if plotData: | |
| 373 def poly(x): | |
| 374 result = 0 | |
| 375 for i in range(len(coef)): | |
| 376 result += coef[i]*x**(len(coef)-i-1) | |
| 377 return result | |
| 378 plt.plot(x, y, 'x') | |
| 379 xx = arange(min(x), max(x),(max(x)-min(x))/1000) | |
| 380 plt.plot(xx, [poly(z) for z in xx]) | |
| 381 return coef | |
| 382 | |
| 383 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None): | |
| 384 '''Computes (and displays) the correlation matrix for a pandas DataFrame''' | |
| 385 columns = data.columns.tolist() | |
| 386 for var in data.columns: | |
| 387 uniqueValues = data[var].unique() | |
| 388 if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan | |
| 389 columns.remove(var) | |
| 390 c=data[columns].corr(correlationMethod) | |
| 391 if plotFigure: | |
| 392 fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0])) | |
| 393 fig.add_subplot(1,1,1) | |
| 394 #plt.imshow(np.fabs(c), interpolation='none') | |
| 395 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm | |
| 396 if displayNames is not None: | |
| 397 colnames = [displayNames.get(s.strip(), s.strip()) for s in columns] | |
| 398 else: | |
| 399 colnames = columns | |
| 400 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename) | |
| 401 plt.xticks(range(len(colnames)), colnames, rotation=90) | |
| 402 plt.yticks(range(len(colnames)), colnames) | |
| 403 plt.tick_params('both', length=0) | |
| 404 plt.subplots_adjust(bottom = 0.29) | |
| 405 plt.colorbar() | |
| 406 plt.title('Correlation ({})'.format(correlationMethod)) | |
| 407 plt.tight_layout() | |
| 408 if len(colnames) > 50: | |
| 409 plt.subplots_adjust(left=.06) | |
| 410 if figureFilename is not None: | |
| 411 plt.savefig(figureFilename, dpi = 150, transparent = True) | |
| 412 return c | |
| 413 | |
| 414 def addDummies(data, variables, allVariables = True): | |
| 415 '''Add binary dummy variables for each value of a nominal variable | |
| 416 in a pandas DataFrame''' | |
| 417 newVariables = [] | |
| 418 for var in variables: | |
| 419 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2: | |
| 420 values = data[var].unique() | |
| 421 if not allVariables: | |
| 422 values = values[:-1] | |
| 423 for val in values: | |
| 424 if val is not NaN: | |
| 425 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | |
| 426 data[newVariable] = (data[var] == val) | |
| 427 newVariables.append(newVariable) | |
| 428 return newVariables | |
| 429 | |
| 430 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = ''): | |
| 431 '''Studies the influence of (nominal) independent variable over the dependent variable | |
| 432 | |
| 433 Makes tests if the conditional distributions are normal | |
| 434 using the Shapiro-Wilk test (in which case ANOVA could be used) | |
| 435 Implements uses the non-parametric Kruskal Wallis test''' | |
| 436 tmp = data[data[independentVariable].notnull()] | |
| 437 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) | |
| 438 if len(independentVariableValues) >= 2: | |
| 439 if saveLatex: | |
| 440 from storage import openCheck | |
| 441 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') | |
| 442 for x in independentVariableValues: | |
| 443 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) | |
| 444 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: | |
| 445 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])) | |
| 446 if plotFigure: | |
| 447 plt.figure() | |
| 448 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | |
| 449 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) | |
| 450 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) | |
| 451 if filenamePrefix is not None: | |
| 452 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) | |
| 453 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) | |
| 454 table['count'] = table['count'].astype(int) | |
| 455 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | |
| 456 if saveLatex: | |
| 457 out.write('\\begin{minipage}{\\linewidth}\n' | |
| 458 +'\\centering\n' | |
| 459 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n' | |
| 460 +table.to_latex(float_format = lambda x: '{:.3f}'.format(x)).encode('ascii')+'\n' | |
| 461 +'\\end{minipage}\n' | |
| 462 +'\\ \\vspace{0.5cm}\n') | |
| 463 else: | |
| 464 print(table) | |
| 465 return testResult | |
| 466 else: | |
| 467 return None | |
| 468 | |
| 469 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''): | |
| 470 '''Removes variables from candidate independent variables if | |
| 471 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed | |
| 472 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP) | |
| 473 Returns the remaining non-correlated variables, correlated with the dependent variable | |
| 474 | |
| 475 correlationFunc is spearmanr or pearsonr from scipy.stats | |
| 476 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise | |
| 477 | |
| 478 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' | |
| 479 from copy import copy | |
| 480 from pandas import DataFrame | |
| 481 result = copy(independentVariables) | |
| 482 table1 = '' | |
| 483 table2 = {} | |
| 484 # constant variables | |
| 485 for var in independentVariables: | |
| 486 uniqueValues = data[var].unique() | |
| 487 if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1): | |
| 488 print(stdoutText[0].format(var, uniqueValues)) | |
| 489 if saveFiles: | |
| 490 table1 += latexTable[0].format(var, *uniqueValues) | |
| 491 result.remove(var) | |
| 492 # correlated variables | |
| 493 for v1 in copy(result): | |
| 494 if v1 in correlations.index: | |
| 495 for v2 in copy(result): | |
| 496 if v2 != v1 and v2 in correlations.index: | |
| 497 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold: | |
| 498 if v1 in result and v2 in result: | |
| 499 if saveFiles: | |
| 500 table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2]) | |
| 501 print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2])) | |
| 502 result.remove(v2) | |
| 503 # not correlated with dependent variable | |
| 504 table2['Correlations'] = [] | |
| 505 table2['Valeurs p'] = [] | |
| 506 for var in copy(result): | |
| 507 if data.dtypes[var] != dtype('O'): | |
| 508 cor, p = correlationFunc(data[dependentVariable], data[var]) | |
| 509 if p > maxCorrelationP: | |
| 510 if saveFiles: | |
| 511 table1 += latexTable[2].format(var, cor, p) | |
| 512 print(stdoutText[2].format(var, cor, p)) | |
| 513 result.remove(var) | |
| 514 else: | |
| 515 table2['Correlations'].append(cor) | |
| 516 table2['Valeurs p'].append(p) | |
| 517 | |
| 518 if saveFiles: | |
| 519 from storage import openCheck | |
| 520 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w') | |
| 521 out.write(latexHeader) | |
| 522 out.write(table1) | |
| 523 out.write(latexFooter) | |
| 524 out.close() | |
| 525 out = openCheck(filenamePrefix+'-correlations.html', 'w') | |
| 526 table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')] | |
| 527 out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False)) | |
| 528 out.close() | |
| 529 return result | |
| 530 | |
| 531 def saveDokMatrix(filename, m, lowerTriangle = False): | |
| 532 'Saves a dok_matrix using savez' | |
| 533 if lowerTriangle: | |
| 534 keys = [k for k in m if k[0] > k[1]] | |
| 535 savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys]) | |
| 536 else: | |
| 537 savez(filename, shape = m.shape, keys = list(m.keys()), values = list(m.values())) | |
| 538 | |
| 539 def loadDokMatrix(filename): | |
| 540 'Loads a dok_matrix saved using the above saveDokMatrix' | |
| 541 data = npload(filename) | |
| 542 m = dok_matrix(tuple(data['shape'])) | |
| 543 for k, v in zip(data['keys'], data['values']): | |
| 544 m[tuple(k)] = v | |
| 545 return m | |
| 546 | |
| 547 def aggregationFunction(funcStr, centile = 50): | |
| 548 '''return the numpy function corresponding to funcStr | |
| 549 centile can be a list of centiles to compute at once, eg [25, 50, 75] for the 3 quartiles''' | |
| 550 if funcStr == 'median': | |
| 551 return median | |
| 552 elif funcStr == 'mean': | |
| 553 return mean | |
| 554 elif funcStr == 'centile': | |
| 555 return lambda x: percentile(x, centile) | |
| 556 elif funcStr == '85centile': | |
| 557 return lambda x: percentile(x, 85) | |
| 558 else: | |
| 559 print('Unknown aggregation method: {}'.format(funcStr)) | |
| 560 return None | |
| 561 | |
| 562 ######################### | |
| 563 # regression analysis using statsmodels (and pandas) | |
| 564 ######################### | |
| 565 | |
| 566 # TODO make class for experiments? | |
| 567 # TODO add tests with public dataset downloaded from Internet (IRIS et al) | |
| 568 def modelString(experiment, dependentVariable, independentVariables): | |
| 569 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) | |
| 570 | |
| 571 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): | |
| 572 import statsmodels.formula.api as smf | |
| 573 modelStr = modelString(experiment, dependentVariable, independentVariables) | |
| 574 if regressionType == 'ols': | |
| 575 model = smf.ols(modelStr, data = data) | |
| 576 elif regressionType == 'gls': | |
| 577 model = smf.gls(modelStr, data = data) | |
| 578 elif regressionType == 'rlm': | |
| 579 model = smf.rlm(modelStr, data = data) | |
| 580 else: | |
| 581 print('Unknown regression type {}. Exiting'.format(regressionType)) | |
| 582 import sys | |
| 583 sys.exit() | |
| 584 return model.fit() | |
| 585 | |
| 586 def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'): | |
| 587 '''Runs several models and stores 3 statistics | |
| 588 adjusted R2, condition number (should be small, eg < 1000) | |
| 589 and p-value for Shapiro-Wilk test of residual normality''' | |
| 590 for i,experiment in experiments.iterrows(): | |
| 591 if experiment[independentVariables].any(): | |
| 592 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') | |
| 593 experiments.loc[i,'r2adj'] = results.rsquared_adj | |
| 594 experiments.loc[i,'condNum'] = results.condition_number | |
| 595 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] | |
| 596 experiments.loc[i,'nobs'] = int(results.nobs) | |
| 597 return experiments | |
| 598 | |
| 599 def generateExperiments(independentVariables): | |
| 600 '''Generates all possible models for including or not each independent variable''' | |
| 601 from pandas import DataFrame | |
| 602 experiments = {} | |
| 603 nIndependentVariables = len(independentVariables) | |
| 604 if nIndependentVariables != len(set(independentVariables)): | |
| 605 print("Duplicate variables. Exiting") | |
| 606 import sys | |
| 607 sys.exit() | |
| 608 nModels = 2**nIndependentVariables | |
| 609 for i,var in enumerate(independentVariables): | |
| 610 pattern = [False]*(2**i)+[True]*(2**i) | |
| 611 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) | |
| 612 experiments = DataFrame(experiments) | |
| 613 experiments['r2adj'] = 0. | |
| 614 experiments['condNum'] = NaN | |
| 615 experiments['shapiroP'] = -1 | |
| 616 experiments['nobs'] = -1 | |
| 617 return experiments | |
| 618 | |
| 619 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): | |
| 620 '''Generates all possible model with the independentVariables | |
| 621 and runs them, saving the results in experiments | |
| 622 with multiprocess option''' | |
| 623 from pandas import concat | |
| 624 from multiprocessing import Pool | |
| 625 experiments = generateExperiments(independentVariables) | |
| 626 nModels = len(experiments) | |
| 627 print("Running {} models with {} processes".format(nModels, nProcesses)) | |
| 628 print("IndependentVariables: {}".format(independentVariables)) | |
| 629 if nProcesses == 1: | |
| 630 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) | |
| 631 else: | |
| 632 pool = Pool(processes = nProcesses) | |
| 633 chunkSize = int(ceil(nModels/nProcesses)) | |
| 634 jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)] | |
| 635 return concat([job.get() for job in jobs]) | |
| 636 | |
| 637 def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None): | |
| 638 '''Forward search for best model (based on adjusted R2) | |
| 639 Randomly starting with one variable and adding randomly variables | |
| 640 if they improve the model | |
| 641 | |
| 642 The results are added to experiments if provided as argument | |
| 643 Storing in experiment relies on the index being the number equal | |
| 644 to the binary code derived from the independent variables''' | |
| 645 from numpy.random import permutation as nppermutation | |
| 646 if experiments is None: | |
| 647 experiments = generateExperiments(independentVariables) | |
| 648 nIndependentVariables = len(independentVariables) | |
| 649 permutation = nppermutation(list(range(nIndependentVariables))) | |
| 650 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} | |
| 651 print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)])) | |
| 652 bestModel = [False]*nIndependentVariables | |
| 653 currentVarNum = 0 | |
| 654 currentR2Adj = 0. | |
| 655 for currentVarNum in range(nIndependentVariables): | |
| 656 currentModel = [i for i in bestModel] | |
| 657 currentModel[currentVarNum] = True | |
| 658 rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]]) | |
| 659 #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in range(nIndependentVariables) if currentModel[permutation[i]]]) | |
| 660 if experiments.loc[rowIdx, 'shapiroP'] < 0: | |
| 661 modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables) | |
| 662 model = modelFunc(modelStr, data = data) | |
| 663 results = model.fit() | |
| 664 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj | |
| 665 experiments.loc[rowIdx, 'condNum'] = results.condition_number | |
| 666 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] | |
| 667 experiments.loc[rowIdx, 'nobs'] = int(results.nobs) | |
| 668 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: | |
| 669 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] | |
| 670 bestModel[currentVarNum] = True | |
| 671 return experiments | |
| 672 | |
| 673 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}): | |
| 674 import statsmodels.api as sm | |
| 675 '''Displays some model results | |
| 676 | |
| 677 3 graphics, true-predicted, residuals-predicted, ''' | |
| 678 print(results.summary()) | |
| 679 shapiroResult = shapiro(results.resid) | |
| 680 print(shapiroResult) | |
| 681 if plotFigures: | |
| 682 fig = plt.figure(figsize=(7,6.3*(2+int(model is not None)))) | |
| 683 if model is not None: | |
| 684 ax = fig.add_subplot(3,1,1) | |
| 685 plt.plot(results.predict(), model.endog, 'x') | |
| 686 x=plt.xlim() | |
| 687 y=plt.ylim() | |
| 688 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') | |
| 689 #plt.axis('equal') | |
| 690 if text is not None: | |
| 691 plt.title(text['title-shapiro'].format(*shapiroResult)) | |
| 692 #plt.title(text['true-predicted.title']) | |
| 693 plt.xlabel(text['true-predicted.xlabel']) | |
| 694 plt.ylabel(text['true-predicted.ylabel']) | |
| 695 fig.add_subplot(3,1,2, sharex = ax) | |
| 696 plt.plot(results.predict(), results.resid, 'x') | |
| 697 nextSubplotNum = 3 | |
| 698 else: | |
| 699 fig.add_subplot(2,1,1) | |
| 700 plt.plot(results.predict(), results.resid, 'x') | |
| 701 nextSubplotNum = 2 | |
| 702 if text is not None: | |
| 703 if model is None: | |
| 704 plt.title(text['title-shapiro'].format(*shapiroResult)) | |
| 705 plt.xlabel(text['residuals-predicted.xlabel']) | |
| 706 plt.ylabel(text['residuals-predicted.ylabel']) | |
| 707 qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum) | |
| 708 sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx) | |
| 709 plt.axis('equal') | |
| 710 if text is not None and 'qqplot.xlabel' in text: | |
| 711 plt.xlabel(text['qqplot.xlabel']) | |
| 712 plt.ylabel(text['qqplot.ylabel']) | |
| 713 plt.tight_layout() | |
| 714 if filenamePrefix is not None: | |
| 715 from storage import openCheck | |
| 716 out = openCheck(filenamePrefix+'-coefficients.html', 'w') | |
| 717 out.write(results.summary().as_html()) | |
| 718 plt.savefig(filenamePrefix+'-model-results.'+figureFileType) | |
| 719 | |
| 720 ######################### | |
| 721 # iterable section | |
| 722 ######################### | |
| 723 | |
| 724 def mostCommon(L): | |
| 725 '''Returns the most frequent element in a iterable | |
| 726 | |
| 727 taken from http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list''' | |
| 728 from itertools import groupby | |
| 729 from operator import itemgetter | |
| 730 # get an iterable of (item, iterable) pairs | |
| 731 SL = sorted((x, i) for i, x in enumerate(L)) | |
| 732 # print 'SL:', SL | |
| 733 groups = groupby(SL, key=itemgetter(0)) | |
| 734 # auxiliary function to get "quality" for an item | |
| 735 def _auxfun(g): | |
| 736 item, iterable = g | |
| 737 count = 0 | |
| 738 min_index = len(L) | |
| 739 for _, where in iterable: | |
| 740 count += 1 | |
| 741 min_index = min(min_index, where) | |
| 742 # print 'item %r, count %r, minind %r' % (item, count, min_index) | |
| 743 return count, -min_index | |
| 744 # pick the highest-count/earliest item | |
| 745 return max(groups, key=_auxfun)[0] | |
| 746 | |
| 747 ######################### | |
| 748 # sequence section | |
| 749 ######################### | |
| 750 | |
| 751 class LCSS(object): | |
| 752 '''Class that keeps the LCSS parameters | |
| 753 and puts together the various computations | |
| 754 | |
| 755 the methods with names starting with _ are not to be shadowed | |
| 756 in child classes, who will shadow the other methods, | |
| 757 ie compute and computeXX methods''' | |
| 758 def __init__(self, similarityFunc = None, metric = None, epsilon = None, delta = float('inf'), aligned = False, lengthFunc = min): | |
| 759 '''One should provide either a similarity function | |
| 760 that indicates (return bool) whether elements in the compares lists are similar | |
| 761 | |
| 762 eg distance(p1, p2) < epsilon | |
| 763 | |
| 764 or a type of metric usable in scipy.spatial.distance.cdist with an epsilon''' | |
| 765 if similarityFunc is None and metric is None: | |
| 766 print("No way to compute LCSS, similarityFunc and metric are None. Exiting") | |
| 767 import sys | |
| 768 sys.exit() | |
| 769 elif metric is not None and epsilon is None: | |
| 770 print("Please provide a value for epsilon if using a cdist metric. Exiting") | |
| 771 import sys | |
| 772 sys.exit() | |
| 773 else: | |
| 774 if similarityFunc is None and metric is not None and not isinf(delta): | |
| 775 print('Warning: you are using a cdist metric and a finite delta, which will make probably computation slower than using the equivalent similarityFunc (since all pairwise distances will be computed by cdist).') | |
| 776 self.similarityFunc = similarityFunc | |
| 777 self.metric = metric | |
| 778 self.epsilon = epsilon | |
| 779 self.aligned = aligned | |
| 780 self.delta = delta | |
| 781 self.lengthFunc = lengthFunc | |
| 782 self.subSequenceIndices = [(0,0)] | |
| 783 | |
| 784 def similarities(self, l1, l2, jshift=0): | |
| 785 n1 = len(l1) | |
| 786 n2 = len(l2) | |
| 787 self.similarityTable = zeros((n1+1,n2+1), dtype = npint) | |
| 788 if self.similarityFunc is not None: | |
| 789 for i in range(1,n1+1): | |
| 790 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): | |
| 791 if self.similarityFunc(l1[i-1], l2[j-1]): | |
| 792 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 | |
| 793 else: | |
| 794 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) | |
| 795 elif self.metric is not None: | |
| 796 similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon | |
| 797 for i in range(1,n1+1): | |
| 798 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): | |
| 799 if similarElements[i-1, j-1]: | |
| 800 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 | |
| 801 else: | |
| 802 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) | |
| 803 | |
| 804 | |
| 805 def subSequence(self, i, j): | |
| 806 '''Returns the subsequence of two sequences | |
| 807 http://en.wikipedia.org/wiki/Longest_common_subsequence_problem''' | |
| 808 if i == 0 or j == 0: | |
| 809 return [] | |
| 810 elif self.similarityTable[i][j] == self.similarityTable[i][j-1]: | |
| 811 return self.subSequence(i, j-1) | |
| 812 elif self.similarityTable[i][j] == self.similarityTable[i-1][j]: | |
| 813 return self.subSequence(i-1, j) | |
| 814 else: | |
| 815 return self.subSequence(i-1, j-1) + [(i-1,j-1)] | |
| 816 | |
| 817 def _compute(self, _l1, _l2, computeSubSequence = False): | |
| 818 '''returns the longest common subsequence similarity | |
| 819 l1 and l2 should be the right format | |
| 820 eg list of tuple points for cdist | |
| 821 or elements that can be compare using similarityFunc | |
| 822 | |
| 823 if aligned, returns the best matching if using a finite delta by shifting the series alignments | |
| 824 ''' | |
| 825 if len(_l2) < len(_l1): # l1 is the shortest | |
| 826 l1 = _l2 | |
| 827 l2 = _l1 | |
| 828 revertIndices = True | |
| 829 else: | |
| 830 l1 = _l1 | |
| 831 l2 = _l2 | |
| 832 revertIndices = False | |
| 833 n1 = len(l1) | |
| 834 n2 = len(l2) | |
| 835 | |
| 836 if self.aligned: | |
| 837 lcssValues = {} | |
| 838 similarityTables = {} | |
| 839 for i in range(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2 | |
| 840 self.similarities(l1, l2, i) | |
| 841 lcssValues[i] = self.similarityTable.max() | |
| 842 similarityTables[i] = self.similarityTable | |
| 843 #print self.similarityTable | |
| 844 alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance | |
| 845 self.similarityTable = similarityTables[alignmentShift] | |
| 846 else: | |
| 847 alignmentShift = 0 | |
| 848 self.similarities(l1, l2) | |
| 849 | |
| 850 # threshold values for the useful part of the similarity table are n2-n1-delta and n1-n2-delta | |
| 851 self.similarityTable = self.similarityTable[:min(n1, n2+alignmentShift+self.delta)+1, :min(n2, n1-alignmentShift+self.delta)+1] | |
| 852 | |
| 853 if computeSubSequence: | |
| 854 self.subSequenceIndices = self.subSequence(self.similarityTable.shape[0]-1, self.similarityTable.shape[1]-1) | |
| 855 if revertIndices: | |
| 856 self.subSequenceIndices = [(j,i) for i,j in self.subSequenceIndices] | |
| 857 return self.similarityTable[-1,-1] | |
| 858 | |
| 859 def compute(self, l1, l2, computeSubSequence = False): | |
| 860 '''get methods are to be shadowed in child classes ''' | |
| 861 return self._compute(l1, l2, computeSubSequence) | |
| 862 | |
| 863 def computeAlignment(self): | |
| 864 return mean([j-i for i,j in self.subSequenceIndices]) | |
| 865 | |
| 866 def _computeNormalized(self, l1, l2, computeSubSequence = False): | |
| 867 ''' compute the normalized LCSS | |
| 868 ie, the LCSS divided by the min or mean of the indicator lengths (using lengthFunc) | |
| 869 lengthFunc = lambda x,y:float(x,y)/2''' | |
| 870 return float(self._compute(l1, l2, computeSubSequence))/self.lengthFunc(len(l1), len(l2)) | |
| 871 | |
| 872 def computeNormalized(self, l1, l2, computeSubSequence = False): | |
| 873 return self._computeNormalized(l1, l2, computeSubSequence) | |
| 874 | |
| 875 def _computeDistance(self, l1, l2, computeSubSequence = False): | |
| 876 ''' compute the LCSS distance''' | |
| 877 return 1-self._computeNormalized(l1, l2, computeSubSequence) | |
| 878 | |
| 879 def computeDistance(self, l1, l2, computeSubSequence = False): | |
| 880 return self._computeDistance(l1, l2, computeSubSequence) | |
| 881 | |
| 882 ######################### | |
| 883 # plotting section | |
| 884 ######################### | |
| 885 | |
| 886 def plotPolygon(poly, options = '', **kwargs): | |
| 887 'Plots shapely polygon poly' | |
| 888 from matplotlib.pyplot import plot | |
| 889 x,y = poly.exterior.xy | |
| 890 plot(x, y, options, **kwargs) | |
| 891 | |
| 892 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1): | |
| 893 '''for each value in X, increment by increment the initial count | |
| 894 returns the lists that can be plotted | |
| 895 to obtain a step plot increasing by one for each value in x, from first to last value | |
| 896 firstX and lastX should be respectively smaller and larger than all elements in X''' | |
| 897 | |
| 898 sortedX = [] | |
| 899 counts = [initialCount] | |
| 900 for x in sorted(X): | |
| 901 sortedX += [x,x] | |
| 902 counts.append(counts[-1]) | |
| 903 counts.append(counts[-1]+increment) | |
| 904 counts.append(counts[-1]) | |
| 905 return [firstX]+sortedX+[lastX], counts | |
| 906 | |
| 907 class PlottingPropertyValues(object): | |
| 908 def __init__(self, values): | |
| 909 self.values = values | |
| 910 | |
| 911 def __getitem__(self, i): | |
| 912 return self.values[i%len(self.values)] | |
| 913 | |
| 914 markers = PlottingPropertyValues(['+', '*', ',', '.', 'x', 'D', 's', 'o']) | |
| 915 scatterMarkers = PlottingPropertyValues(['s','o','^','>','v','<','d','p','h','8','+','x']) | |
| 916 | |
| 917 linestyles = PlottingPropertyValues(['-', '--', '-.', ':']) | |
| 918 | |
| 919 colors = PlottingPropertyValues('brgmyck') # 'w' | |
| 920 | |
| 921 def monochromeCycler(withMarker = False): | |
| 922 from cycler import cycler | |
| 923 if withMarker: | |
| 924 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']) * cycler('marker', ['^',',', '.'])) | |
| 925 else: | |
| 926 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) | |
| 927 plt.rc('axes', prop_cycle=monochrome) | |
| 928 | |
| 929 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): | |
| 930 from matplotlib.pyplot import pcolor | |
| 931 coords = array(list(indicatorMap.keys())) | |
| 932 minX = min(coords[:,0]) | |
| 933 minY = min(coords[:,1]) | |
| 934 X = arange(minX, max(coords[:,0])+1.1)*squareSize | |
| 935 Y = arange(minY, max(coords[:,1])+1.1)*squareSize | |
| 936 C = defaultValue*ones((len(Y), len(X))) | |
| 937 for k,v in indicatorMap.items(): | |
| 938 C[k[1]-minY,k[0]-minX] = v | |
| 939 if masked: | |
| 940 pcolor(X, Y, ma.masked_where(C==defaultValue,C)) | |
| 941 else: | |
| 942 pcolor(X, Y, C) | |
| 943 | |
| 944 ######################### | |
| 945 # Data download | |
| 946 ######################### | |
| 947 | |
| 948 def downloadECWeather(stationID, years, months = [], outputDirectoryname = '.', english = True): | |
| 949 '''Downloads monthly weather data from Environment Canada | |
| 950 If month is provided (number 1 to 12), it means hourly data for the whole month | |
| 951 Otherwise, means the data for each day, for the whole year | |
| 952 | |
| 953 Example: MONTREAL MCTAVISH 10761 | |
| 954 MONTREALPIERRE ELLIOTT TRUDEAU INTL A 5415 | |
| 955 see ftp://[email protected]/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv | |
| 956 | |
| 957 To get daily data for 2010 and 2011, downloadECWeather(10761, [2010,2011], [], '/tmp') | |
| 958 To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp') | |
| 959 | |
| 960 for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done | |
| 961 for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done | |
| 962 ''' | |
| 963 import urllib.request | |
| 964 if english: | |
| 965 language = 'e' | |
| 966 else: | |
| 967 language = 'f' | |
| 968 if len(months) == 0: | |
| 969 timeFrame = 2 | |
| 970 months = [1] | |
| 971 else: | |
| 972 timeFrame = 1 | |
| 973 | |
| 974 for year in years: | |
| 975 for month in months: | |
| 976 outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year) | |
| 977 if timeFrame == 1: | |
| 978 outFilename += '-{}-hourly'.format(month) | |
| 979 else: | |
| 980 outFilename += '-daily' | |
| 981 outFilename += '.csv' | |
| 982 url = urllib.request.urlretrieve('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=Download+Data'.format(language, stationID, year, month, timeFrame), outFilename) | |
| 983 | |
| 984 ######################### | |
| 985 # File I/O | |
| 986 ######################### | |
| 987 | |
| 988 def removeExtension(filename, delimiter = '.'): | |
| 989 '''Returns the filename minus the extension (all characters after last .)''' | |
| 990 i = filename.rfind(delimiter) | |
| 991 if i>0: | |
| 992 return filename[:i] | |
| 993 else: | |
| 994 return filename | |
| 995 | |
| 996 def getExtension(filename, delimiter = '.'): | |
| 997 '''Returns the filename minus the extension (all characters after last .)''' | |
| 998 i = filename.rfind(delimiter) | |
| 999 if i>0: | |
| 1000 return filename[i+1:] | |
| 1001 else: | |
| 1002 return '' | |
| 1003 | |
| 1004 def cleanFilename(s): | |
| 1005 'cleans filenames obtained when contatenating figure characteristics' | |
| 1006 return s.replace(' ','-').replace('.','').replace('/','-').replace(',','') | |
| 1007 | |
| 1008 def getRelativeFilename(parentPath, filename): | |
| 1009 'Returns filename if absolute, otherwise parentPath/filename as string' | |
| 1010 filePath = Path(filename) | |
| 1011 if filePath.is_absolute(): | |
| 1012 return filename | |
| 1013 else: | |
| 1014 return str(parentPath/filePath) | |
| 1015 | |
| 1016 def listfiles(dirname, extension, remove = False): | |
| 1017 '''Returns the list of files with the extension in the directory dirname | |
| 1018 If remove is True, the filenames are stripped from the extension''' | |
| 1019 d = Path(dirname) | |
| 1020 if d.is_dir(): | |
| 1021 tmp = [str(f) for f in d.glob('*.extension')] | |
| 1022 if remove: | |
| 1023 return [removeExtension(f, extension) for f in tmp] | |
| 1024 else: | |
| 1025 return tmp | |
| 1026 else: | |
| 1027 print(dirname+' is not a directory') | |
| 1028 return [] | |
| 1029 | |
| 1030 def mkdir(dirname): | |
| 1031 'Creates a directory if it does not exist' | |
| 1032 p = Path(dirname) | |
| 1033 if not p.exists(): | |
| 1034 p.mkdir() | |
| 1035 else: | |
| 1036 print(dirname+' already exists') | |
| 1037 | |
| 1038 def removeFile(filename): | |
| 1039 '''Deletes the file while avoiding raising an error | |
| 1040 if the file does not exist''' | |
| 1041 f = Path(filename) | |
| 1042 if (f.exists()): | |
| 1043 f.unlink() | |
| 1044 else: | |
| 1045 print(filename+' does not exist') | |
| 1046 | |
| 1047 def line2Floats(l, separator=' '): | |
| 1048 '''Returns the list of floats corresponding to the string''' | |
| 1049 return [float(x) for x in l.split(separator)] | |
| 1050 | |
| 1051 def line2Ints(l, separator=' '): | |
| 1052 '''Returns the list of ints corresponding to the string''' | |
| 1053 return [int(x) for x in l.split(separator)] | |
| 1054 | |
| 1055 ######################### | |
| 1056 # Profiling | |
| 1057 ######################### | |
| 1058 | |
| 1059 def analyzeProfile(profileFilename, stripDirs = True): | |
| 1060 '''Analyze the file produced by cProfile | |
| 1061 | |
| 1062 obtained by for example: | |
| 1063 - call in script (for main() function in script) | |
| 1064 import cProfile, os | |
| 1065 cProfile.run('main()', os.path.join(os.getcwd(),'main.profile')) | |
| 1066 | |
| 1067 - or on the command line: | |
| 1068 python -m cProfile [-o profile.bin] [-s sort] scriptfile [arg]''' | |
| 1069 import pstats, os | |
| 1070 p = pstats.Stats(os.path.join(os.pardir, profileFilename)) | |
| 1071 if stripDirs: | |
| 1072 p.strip_dirs() | |
| 1073 p.sort_stats('time') | |
| 1074 p.print_stats(.2) | |
| 1075 #p.sort_stats('time') | |
| 1076 # p.print_callees(.1, 'int_prediction.py:') | |
| 1077 return p | |
| 1078 | |
| 1079 ######################### | |
| 1080 # running tests | |
| 1081 ######################### | |
| 1082 | |
| 1083 if __name__ == "__main__": | |
| 1084 import doctest | |
| 1085 import unittest | |
| 1086 suite = doctest.DocFileSuite('tests/utils.txt') | |
| 1087 #suite = doctest.DocTestSuite() | |
| 1088 unittest.TextTestRunner().run(suite) | |
| 1089 #doctest.testmod() | |
| 1090 #doctest.testfile("example.txt") |
