1
"""This file contains code for use with "Think Stats" and
2
"Think Bayes", both by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
"""This file contains class definitions for:
11
12
Hist: represents a histogram (map from values to integer frequencies).
13
14
Pmf: represents a probability mass function (map from values to probs).
15
16
_DictWrapper: private parent class for Hist and Pmf.
17
18
Cdf: represents a discrete cumulative distribution function
19
20
Pdf: represents a continuous probability density function
21
22
"""
23
24
import bisect
25
import copy
26
import logging
27
import math
28
import random
29
import re
30
31
from collections import Counter
32
from operator import itemgetter
33
34
import thinkplot
35
36
import numpy as np
37
import pandas
38
39
import scipy
40
from scipy import stats
41
from scipy import special
42
from scipy import ndimage
43
44
from io import open
45
46
ROOT2 = math.sqrt(2)
47
48
def RandomSeed(x):
49
"""Initialize the random and np.random generators.
50
51
x: int seed
52
"""
53
random.seed(x)
54
np.random.seed(x)
55
56
57
def Odds(p):
58
"""Computes odds for a given probability.
59
60
Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.
61
62
Note: when p=1, the formula for odds divides by zero, which is
63
normally undefined. But I think it is reasonable to define Odds(1)
64
to be infinity, so that's what this function does.
65
66
p: float 0-1
67
68
Returns: float odds
69
"""
70
if p == 1:
71
return float('inf')
72
return p / (1 - p)
73
74
75
def Probability(o):
76
"""Computes the probability corresponding to given odds.
77
78
Example: o=2 means 2:1 odds in favor, or 2/3 probability
79
80
o: float odds, strictly positive
81
82
Returns: float probability
83
"""
84
return o / (o + 1)
85
86
87
def Probability2(yes, no):
88
"""Computes the probability corresponding to given odds.
89
90
Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.
91
92
yes, no: int or float odds in favor
93
"""
94
return yes / (yes + no)
95
96
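
# Illustrative usage sketch (added for exposition; not part of Downey's
# original module). It shows that Odds and Probability are inverses and that
# Probability2 expresses the same odds as a pair of counts. The _example_*
# name is ours.
def _example_odds():
    """Round-trips a probability through odds form."""
    p = 0.75
    o = Odds(p)                          # 3.0, i.e. 3:1 in favor
    assert abs(Probability(o) - p) < 1e-9
    assert abs(Probability2(3, 1) - p) < 1e-9
    return o
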
97
class Interpolator(object):
98
"""Represents a mapping between sorted sequences; performs linear interp.
99
100
Attributes:
101
xs: sorted list
102
ys: sorted list
103
"""
104
105
def __init__(self, xs, ys):
106
self.xs = xs
107
self.ys = ys
108
109
def Lookup(self, x):
110
"""Looks up x and returns the corresponding value of y."""
111
return self._Bisect(x, self.xs, self.ys)
112
113
def Reverse(self, y):
114
"""Looks up y and returns the corresponding value of x."""
115
return self._Bisect(y, self.ys, self.xs)
116
117
def _Bisect(self, x, xs, ys):
118
"""Helper function."""
119
if x <= xs[0]:
120
return ys[0]
121
if x >= xs[-1]:
122
return ys[-1]
123
i = bisect.bisect(xs, x)
124
frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
125
y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
126
return y
127
128
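
# Illustrative usage sketch (added for exposition; not part of the original
# module). Interpolator maps between two sorted sequences in either
# direction: Lookup interpolates y from x, Reverse interpolates x from y.
def _example_interpolator():
    """Linear interpolation both ways on a two-point mapping."""
    interp = Interpolator(xs=[0, 10], ys=[0, 100])
    assert interp.Lookup(5) == 50.0
    assert interp.Reverse(50) == 5.0
    return interp
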
129
class _DictWrapper(object):
130
"""An object that contains a dictionary."""
131
132
def __init__(self, obj=None, label=None):
133
"""Initializes the distribution.
134
135
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
136
label: string label
137
"""
138
self.label = label if label is not None else '_nolegend_'
139
self.d = {}
140
141
# flag whether the distribution is under a log transform
142
self.log = False
143
144
if obj is None:
145
return
146
147
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
148
self.label = label if label is not None else obj.label
149
150
if isinstance(obj, dict):
151
self.d.update(obj.items())
152
elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
153
self.d.update(obj.Items())
154
elif isinstance(obj, pandas.Series):
155
            # Series.iteritems was removed in recent pandas; items() is equivalent
            self.d.update(obj.value_counts().items())
156
else:
157
# finally, treat it like a list
158
self.d.update(Counter(obj))
159
160
if len(self) > 0 and isinstance(self, Pmf):
161
self.Normalize()
162
163
def __hash__(self):
164
return id(self)
165
166
def __str__(self):
167
cls = self.__class__.__name__
168
return '%s(%s)' % (cls, str(self.d))
169
170
__repr__ = __str__
171
172
def __eq__(self, other):
173
return self.d == other.d
174
175
def __len__(self):
176
return len(self.d)
177
178
def __iter__(self):
179
return iter(self.d)
180
181
def iterkeys(self):
182
"""Returns an iterator over keys."""
183
return iter(self.d)
184
185
def __contains__(self, value):
186
return value in self.d
187
188
def __getitem__(self, value):
189
return self.d.get(value, 0)
190
191
def __setitem__(self, value, prob):
192
self.d[value] = prob
193
194
def __delitem__(self, value):
195
del self.d[value]
196
197
def Copy(self, label=None):
198
"""Returns a copy.
199
200
Make a shallow copy of d. If you want a deep copy of d,
201
use copy.deepcopy on the whole object.
202
203
label: string label for the new Hist
204
205
returns: new _DictWrapper with the same type
206
"""
207
new = copy.copy(self)
208
new.d = copy.copy(self.d)
209
new.label = label if label is not None else self.label
210
return new
211
212
def Scale(self, factor):
213
"""Multiplies the values by a factor.
214
215
factor: what to multiply by
216
217
Returns: new object
218
"""
219
new = self.Copy()
220
new.d.clear()
221
222
for val, prob in self.Items():
223
new.Set(val * factor, prob)
224
return new
225
226
def Log(self, m=None):
227
"""Log transforms the probabilities.
228
229
Removes values with probability 0.
230
231
Normalizes so that the largest logprob is 0.
232
"""
233
if self.log:
234
raise ValueError("Pmf/Hist already under a log transform")
235
self.log = True
236
237
if m is None:
238
m = self.MaxLike()
239
240
for x, p in self.d.items():
241
if p:
242
self.Set(x, math.log(p / m))
243
else:
244
self.Remove(x)
245
246
def Exp(self, m=None):
247
"""Exponentiates the probabilities.
248
249
m: how much to shift the ps before exponentiating
250
251
If m is None, normalizes so that the largest prob is 1.
252
"""
253
if not self.log:
254
raise ValueError("Pmf/Hist not under a log transform")
255
self.log = False
256
257
if m is None:
258
m = self.MaxLike()
259
260
for x, p in self.d.items():
261
self.Set(x, math.exp(p - m))
262
263
def GetDict(self):
264
"""Gets the dictionary."""
265
return self.d
266
267
def SetDict(self, d):
268
"""Sets the dictionary."""
269
self.d = d
270
271
def Values(self):
272
"""Gets an unsorted sequence of values.
273
274
Note: one source of confusion is that the keys of this
275
dictionary are the values of the Hist/Pmf, and the
276
values of the dictionary are frequencies/probabilities.
277
"""
278
return self.d.keys()
279
280
def Items(self):
281
"""Gets an unsorted sequence of (value, freq/prob) pairs."""
282
return self.d.items()
283
284
def Render(self, **options):
285
"""Generates a sequence of points suitable for plotting.
286
287
Note: options are ignored
288
289
Returns:
290
tuple of (sorted value sequence, freq/prob sequence)
291
"""
292
        # an identity check against np.nan is unreliable; test for NaN explicitly
        if any(isinstance(x, float) and math.isnan(x) for x in self.d.keys()):
            logging.warning('Hist: contains NaN, may not render correctly.')
294
295
return zip(*sorted(self.Items()))
296
297
def MakeCdf(self, label=None):
298
"""Makes a Cdf."""
299
label = label if label is not None else self.label
300
return Cdf(self, label=label)
301
302
def Print(self):
303
"""Prints the values and freqs/probs in ascending order."""
304
for val, prob in sorted(self.d.items()):
305
print(val, prob)
306
307
def Set(self, x, y=0):
308
"""Sets the freq/prob associated with the value x.
309
310
Args:
311
x: number value
312
y: number freq or prob
313
"""
314
self.d[x] = y
315
316
def Incr(self, x, term=1):
317
"""Increments the freq/prob associated with the value x.
318
319
Args:
320
x: number value
321
term: how much to increment by
322
"""
323
self.d[x] = self.d.get(x, 0) + term
324
325
def Mult(self, x, factor):
326
"""Scales the freq/prob associated with the value x.
327
328
Args:
329
x: number value
330
factor: how much to multiply by
331
"""
332
self.d[x] = self.d.get(x, 0) * factor
333
334
def Remove(self, x):
335
"""Removes a value.
336
337
Throws an exception if the value is not there.
338
339
Args:
340
x: value to remove
341
"""
342
del self.d[x]
343
344
def Total(self):
345
"""Returns the total of the frequencies/probabilities in the map."""
346
total = sum(self.d.values())
347
return total
348
349
def MaxLike(self):
350
"""Returns the largest frequency/probability in the map."""
351
return max(self.d.values())
352
353
def Largest(self, n=10):
354
"""Returns the largest n values, with frequency/probability.
355
356
n: number of items to return
357
"""
358
return sorted(self.d.items(), reverse=True)[:n]
359
360
def Smallest(self, n=10):
361
"""Returns the smallest n values, with frequency/probability.
362
363
n: number of items to return
364
"""
365
return sorted(self.d.items(), reverse=False)[:n]
366
367
368
class Hist(_DictWrapper):
369
"""Represents a histogram, which is a map from values to frequencies.
370
371
Values can be any hashable type; frequencies are integer counters.
372
"""
373
def Freq(self, x):
374
"""Gets the frequency associated with the value x.
375
376
Args:
377
x: number value
378
379
Returns:
380
int frequency
381
"""
382
return self.d.get(x, 0)
383
384
def Freqs(self, xs):
385
"""Gets frequencies for a sequence of values."""
386
return [self.Freq(x) for x in xs]
387
388
def IsSubset(self, other):
389
"""Checks whether the values in this histogram are a subset of
390
the values in the given histogram."""
391
for val, freq in self.Items():
392
if freq > other.Freq(val):
393
return False
394
return True
395
396
def Subtract(self, other):
397
"""Subtracts the values in the given histogram from this histogram."""
398
for val, freq in other.Items():
399
self.Incr(val, -freq)
400
401
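
# Illustrative usage sketch (added for exposition; not part of the original
# module). A Hist maps values to integer frequencies; missing values have
# frequency 0.
def _example_hist():
    """Counts letter frequencies with a Hist."""
    hist = Hist(['a', 'b', 'a', 'c', 'a'])
    assert hist.Freq('a') == 3
    assert hist.Freq('z') == 0
    return hist
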
402
class Pmf(_DictWrapper):
403
"""Represents a probability mass function.
404
405
Values can be any hashable type; probabilities are floating-point.
406
Pmfs are not necessarily normalized.
407
"""
408
409
def Prob(self, x, default=0):
410
"""Gets the probability associated with the value x.
411
412
Args:
413
x: number value
414
default: value to return if the key is not there
415
416
Returns:
417
float probability
418
"""
419
return self.d.get(x, default)
420
421
def Probs(self, xs):
422
"""Gets probabilities for a sequence of values."""
423
return [self.Prob(x) for x in xs]
424
425
def Percentile(self, percentage):
426
"""Computes a percentile of a given Pmf.
427
428
Note: this is not super efficient. If you are planning
429
to compute more than a few percentiles, compute the Cdf.
430
431
percentage: float 0-100
432
433
returns: value from the Pmf
434
"""
435
p = percentage / 100.0
436
total = 0
437
for val, prob in sorted(self.Items()):
438
total += prob
439
if total >= p:
440
return val
441
442
def ProbGreater(self, x):
443
"""Probability that a sample from this Pmf exceeds x.
444
445
x: number
446
447
returns: float probability
448
"""
449
if isinstance(x, _DictWrapper):
450
return PmfProbGreater(self, x)
451
else:
452
t = [prob for (val, prob) in self.d.items() if val > x]
453
return sum(t)
454
455
def ProbLess(self, x):
456
"""Probability that a sample from this Pmf is less than x.
457
458
x: number
459
460
returns: float probability
461
"""
462
if isinstance(x, _DictWrapper):
463
return PmfProbLess(self, x)
464
else:
465
t = [prob for (val, prob) in self.d.items() if val < x]
466
return sum(t)
467
468
def __lt__(self, obj):
469
"""Less than.
470
471
obj: number or _DictWrapper
472
473
returns: float probability
474
"""
475
return self.ProbLess(obj)
476
477
def __gt__(self, obj):
478
"""Greater than.
479
480
obj: number or _DictWrapper
481
482
returns: float probability
483
"""
484
return self.ProbGreater(obj)
485
486
def __ge__(self, obj):
487
"""Greater than or equal.
488
489
obj: number or _DictWrapper
490
491
returns: float probability
492
"""
493
return 1 - (self < obj)
494
495
def __le__(self, obj):
496
"""Less than or equal.
497
498
obj: number or _DictWrapper
499
500
returns: float probability
501
"""
502
return 1 - (self > obj)
503
504
def Normalize(self, fraction=1.0):
505
"""Normalizes this PMF so the sum of all probs is fraction.
506
507
Args:
508
fraction: what the total should be after normalization
509
510
Returns: the total probability before normalizing
511
"""
512
if self.log:
513
raise ValueError("Normalize: Pmf is under a log transform")
514
515
total = self.Total()
516
if total == 0.0:
517
raise ValueError('Normalize: total probability is zero.')
518
#logging.warning('Normalize: total probability is zero.')
519
#return total
520
521
factor = fraction / total
522
for x in self.d:
523
self.d[x] *= factor
524
525
return total
526
527
def Random(self):
528
"""Chooses a random element from this PMF.
529
530
Note: this is not very efficient. If you plan to call
531
this more than a few times, consider converting to a CDF.
532
533
Returns:
534
float value from the Pmf
535
"""
536
target = random.random()
537
total = 0.0
538
for x, p in self.d.items():
539
total += p
540
if total >= target:
541
return x
542
543
# we shouldn't get here
544
raise ValueError('Random: Pmf might not be normalized.')
545
546
def Mean(self):
547
"""Computes the mean of a PMF.
548
549
Returns:
550
float mean
551
"""
552
mean = 0.0
553
for x, p in self.d.items():
554
mean += p * x
555
return mean
556
557
def Var(self, mu=None):
558
"""Computes the variance of a PMF.
559
560
mu: the point around which the variance is computed;
561
if omitted, computes the mean
562
563
returns: float variance
564
"""
565
if mu is None:
566
mu = self.Mean()
567
568
var = 0.0
569
for x, p in self.d.items():
570
var += p * (x - mu) ** 2
571
return var
572
573
def Std(self, mu=None):
574
"""Computes the standard deviation of a PMF.
575
576
mu: the point around which the variance is computed;
577
if omitted, computes the mean
578
579
returns: float standard deviation
580
"""
581
var = self.Var(mu)
582
return math.sqrt(var)
583
584
def MaximumLikelihood(self):
585
"""Returns the value with the highest probability.
586
587
        Returns: the value with the highest probability
588
"""
589
_, val = max((prob, val) for val, prob in self.Items())
590
return val
591
592
def CredibleInterval(self, percentage=90):
593
"""Computes the central credible interval.
594
595
If percentage=90, computes the 90% CI.
596
597
Args:
598
percentage: float between 0 and 100
599
600
Returns:
601
sequence of two floats, low and high
602
"""
603
cdf = self.MakeCdf()
604
return cdf.CredibleInterval(percentage)
605
606
def __add__(self, other):
607
"""Computes the Pmf of the sum of values drawn from self and other.
608
609
other: another Pmf or a scalar
610
611
returns: new Pmf
612
"""
613
try:
614
return self.AddPmf(other)
615
except AttributeError:
616
return self.AddConstant(other)
617
618
def AddPmf(self, other):
619
"""Computes the Pmf of the sum of values drawn from self and other.
620
621
other: another Pmf
622
623
returns: new Pmf
624
"""
625
pmf = Pmf()
626
for v1, p1 in self.Items():
627
for v2, p2 in other.Items():
628
pmf.Incr(v1 + v2, p1 * p2)
629
return pmf
630
631
def AddConstant(self, other):
632
"""Computes the Pmf of the sum a constant and values from self.
633
634
other: a number
635
636
returns: new Pmf
637
"""
638
pmf = Pmf()
639
for v1, p1 in self.Items():
640
pmf.Set(v1 + other, p1)
641
return pmf
642
643
def __sub__(self, other):
644
"""Computes the Pmf of the diff of values drawn from self and other.
645
646
other: another Pmf
647
648
returns: new Pmf
649
"""
650
try:
651
return self.SubPmf(other)
652
except AttributeError:
653
return self.AddConstant(-other)
654
655
def SubPmf(self, other):
656
"""Computes the Pmf of the diff of values drawn from self and other.
657
658
other: another Pmf
659
660
returns: new Pmf
661
"""
662
pmf = Pmf()
663
for v1, p1 in self.Items():
664
for v2, p2 in other.Items():
665
pmf.Incr(v1 - v2, p1 * p2)
666
return pmf
667
668
def __mul__(self, other):
669
"""Computes the Pmf of the product of values drawn from self and other.
670
671
other: another Pmf
672
673
returns: new Pmf
674
"""
675
try:
676
return self.MulPmf(other)
677
except AttributeError:
678
return self.MulConstant(other)
679
680
def MulPmf(self, other):
681
"""Computes the Pmf of the diff of values drawn from self and other.
682
683
other: another Pmf
684
685
returns: new Pmf
686
"""
687
pmf = Pmf()
688
for v1, p1 in self.Items():
689
for v2, p2 in other.Items():
690
pmf.Incr(v1 * v2, p1 * p2)
691
return pmf
692
693
def MulConstant(self, other):
694
"""Computes the Pmf of the product of a constant and values from self.
695
696
other: a number
697
698
returns: new Pmf
699
"""
700
pmf = Pmf()
701
for v1, p1 in self.Items():
702
pmf.Set(v1 * other, p1)
703
return pmf
704
705
def __div__(self, other):
706
"""Computes the Pmf of the ratio of values drawn from self and other.
707
708
other: another Pmf
709
710
returns: new Pmf
711
"""
712
try:
713
return self.DivPmf(other)
714
except AttributeError:
715
return self.MulConstant(1/other)
716
717
__truediv__ = __div__
718
719
def DivPmf(self, other):
720
"""Computes the Pmf of the ratio of values drawn from self and other.
721
722
other: another Pmf
723
724
returns: new Pmf
725
"""
726
pmf = Pmf()
727
for v1, p1 in self.Items():
728
for v2, p2 in other.Items():
729
pmf.Incr(v1 / v2, p1 * p2)
730
return pmf
731
732
def Max(self, k):
733
"""Computes the CDF of the maximum of k selections from this dist.
734
735
k: int
736
737
returns: new Cdf
738
"""
739
cdf = self.MakeCdf()
740
return cdf.Max(k)
741
742
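
# Illustrative usage sketch (added for exposition; not part of the original
# module). Pmf.__add__ convolves two distributions via AddPmf: for the sum
# of two fair six-sided dice the mean is 7 and P(sum=7) is 6/36.
def _example_two_dice():
    """Builds the distribution of the sum of two dice."""
    die = Pmf(range(1, 7))               # a sequence of values is normalized
    total = die + die                    # convolution via AddPmf
    assert abs(total.Mean() - 7.0) < 1e-9
    assert abs(total.Prob(7) - 6.0 / 36) < 1e-9
    return total
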
743
class Joint(Pmf):
744
"""Represents a joint distribution.
745
746
The values are sequences (usually tuples)
747
"""
748
749
def Marginal(self, i, label=None):
750
"""Gets the marginal distribution of the indicated variable.
751
752
i: index of the variable we want
753
754
Returns: Pmf
755
"""
756
pmf = Pmf(label=label)
757
for vs, prob in self.Items():
758
pmf.Incr(vs[i], prob)
759
return pmf
760
761
def Conditional(self, i, j, val, label=None):
762
"""Gets the conditional distribution of the indicated variable.
763
764
Distribution of vs[i], conditioned on vs[j] = val.
765
766
i: index of the variable we want
767
j: which variable is conditioned on
768
val: the value the jth variable has to have
769
770
Returns: Pmf
771
"""
772
pmf = Pmf(label=label)
773
for vs, prob in self.Items():
774
if vs[j] != val:
775
continue
776
pmf.Incr(vs[i], prob)
777
778
pmf.Normalize()
779
return pmf
780
781
def MaxLikeInterval(self, percentage=90):
782
"""Returns the maximum-likelihood credible interval.
783
784
If percentage=90, computes a 90% CI containing the values
785
with the highest likelihoods.
786
787
percentage: float between 0 and 100
788
789
Returns: list of values from the suite
790
"""
791
interval = []
792
total = 0
793
794
t = [(prob, val) for val, prob in self.Items()]
795
t.sort(reverse=True)
796
797
for prob, val in t:
798
interval.append(val)
799
total += prob
800
if total >= percentage / 100.0:
801
break
802
803
return interval
804
805
806
def MakeJoint(pmf1, pmf2):
807
"""Joint distribution of values from pmf1 and pmf2.
808
809
Assumes that the PMFs represent independent random variables.
810
811
Args:
812
pmf1: Pmf object
813
pmf2: Pmf object
814
815
Returns:
816
Joint pmf of value pairs
817
"""
818
joint = Joint()
819
for v1, p1 in pmf1.Items():
820
for v2, p2 in pmf2.Items():
821
joint.Set((v1, v2), p1 * p2)
822
return joint
823
824
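
# Illustrative usage sketch (added for exposition; not part of the original
# module). MakeJoint multiplies probabilities of independent Pmfs; Marginal
# recovers the distribution of one coordinate.
def _example_joint():
    """Joint and marginal distributions of two independent fair coins."""
    coin = Pmf({'H': 0.5, 'T': 0.5})
    joint = MakeJoint(coin, coin)
    assert abs(joint[('H', 'T')] - 0.25) < 1e-9
    marginal = joint.Marginal(0)
    assert abs(marginal['H'] - 0.5) < 1e-9
    return joint
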
825
def MakeHistFromList(t, label=None):
826
"""Makes a histogram from an unsorted sequence of values.
827
828
Args:
829
t: sequence of numbers
830
label: string label for this histogram
831
832
Returns:
833
Hist object
834
"""
835
return Hist(t, label=label)
836
837
838
def MakeHistFromDict(d, label=None):
839
"""Makes a histogram from a map from values to frequencies.
840
841
Args:
842
d: dictionary that maps values to frequencies
843
label: string label for this histogram
844
845
Returns:
846
Hist object
847
"""
848
return Hist(d, label)
849
850
851
def MakePmfFromList(t, label=None):
852
"""Makes a PMF from an unsorted sequence of values.
853
854
Args:
855
t: sequence of numbers
856
label: string label for this PMF
857
858
Returns:
859
Pmf object
860
"""
861
return Pmf(t, label=label)
862
863
864
def MakePmfFromDict(d, label=None):
865
"""Makes a PMF from a map from values to probabilities.
866
867
Args:
868
d: dictionary that maps values to probabilities
869
label: string label for this PMF
870
871
Returns:
872
Pmf object
873
"""
874
return Pmf(d, label=label)
875
876
877
def MakePmfFromItems(t, label=None):
878
"""Makes a PMF from a sequence of value-probability pairs
879
880
Args:
881
t: sequence of value-probability pairs
882
label: string label for this PMF
883
884
Returns:
885
Pmf object
886
"""
887
return Pmf(dict(t), label=label)
888
889
890
def MakePmfFromHist(hist, label=None):
891
"""Makes a normalized PMF from a Hist object.
892
893
Args:
894
hist: Hist object
895
label: string label
896
897
Returns:
898
Pmf object
899
"""
900
if label is None:
901
label = hist.label
902
903
return Pmf(hist, label=label)
904
905
906
def MakeMixture(metapmf, label='mix'):
907
"""Make a mixture distribution.
908
909
Args:
910
metapmf: Pmf that maps from Pmfs to probs.
911
label: string label for the new Pmf.
912
913
Returns: Pmf object.
914
"""
915
mix = Pmf(label=label)
916
for pmf, p1 in metapmf.Items():
917
for x, p2 in pmf.Items():
918
mix.Incr(x, p1 * p2)
919
return mix
920
921
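
# Illustrative usage sketch (added for exposition; not part of the original
# module). MakeMixture takes a meta-Pmf whose values are themselves Pmfs and
# produces the weighted mixture.
def _example_mixture():
    """A 50/50 mixture of a 4-sided and a 6-sided die."""
    d4 = Pmf(range(1, 5))
    d6 = Pmf(range(1, 7))
    metapmf = Pmf({d4: 0.5, d6: 0.5})
    mix = MakeMixture(metapmf)
    assert abs(mix.Prob(1) - (0.5 / 4 + 0.5 / 6)) < 1e-9
    return mix
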
922
def MakeUniformPmf(low, high, n):
923
"""Make a uniform Pmf.
924
925
low: lowest value (inclusive)
926
    high: highest value (inclusive)
927
n: number of values
928
"""
929
pmf = Pmf()
930
for x in np.linspace(low, high, n):
931
pmf.Set(x, 1)
932
pmf.Normalize()
933
return pmf
934
935
936
class Cdf(object):
937
"""Represents a cumulative distribution function.
938
939
Attributes:
940
xs: sequence of values
941
ps: sequence of probabilities
942
label: string used as a graph label.
943
"""
944
def __init__(self, obj=None, ps=None, label=None):
945
"""Initializes.
946
947
If ps is provided, obj must be the corresponding list of values.
948
949
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
950
ps: list of cumulative probabilities
951
label: string label
952
"""
953
self.label = label if label is not None else '_nolegend_'
954
955
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
956
if not label:
957
self.label = label if label is not None else obj.label
958
959
if obj is None:
960
# caller does not provide obj, make an empty Cdf
961
self.xs = np.asarray([])
962
self.ps = np.asarray([])
963
if ps is not None:
964
logging.warning("Cdf: can't pass ps without also passing xs.")
965
return
966
else:
967
# if the caller provides xs and ps, just store them
968
if ps is not None:
969
if isinstance(ps, str):
970
logging.warning("Cdf: ps can't be a string")
971
972
self.xs = np.asarray(obj)
973
self.ps = np.asarray(ps)
974
return
975
976
# caller has provided just obj, not ps
977
if isinstance(obj, Cdf):
978
self.xs = copy.copy(obj.xs)
979
self.ps = copy.copy(obj.ps)
980
return
981
982
if isinstance(obj, _DictWrapper):
983
dw = obj
984
else:
985
dw = Hist(obj)
986
987
if len(dw) == 0:
988
self.xs = np.asarray([])
989
self.ps = np.asarray([])
990
return
991
992
xs, freqs = zip(*sorted(dw.Items()))
993
self.xs = np.asarray(xs)
994
        self.ps = np.cumsum(freqs, dtype=float)
995
self.ps /= self.ps[-1]
996
997
def __str__(self):
998
return 'Cdf(%s, %s)' % (str(self.xs), str(self.ps))
999
1000
__repr__ = __str__
1001
1002
def __len__(self):
1003
return len(self.xs)
1004
1005
def __getitem__(self, x):
1006
return self.Prob(x)
1007
1008
    def __setitem__(self, x, p):
1009
raise UnimplementedMethodException()
1010
1011
    def __delitem__(self, x):
1012
raise UnimplementedMethodException()
1013
1014
def __eq__(self, other):
1015
return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
1016
1017
def Copy(self, label=None):
1018
"""Returns a copy of this Cdf.
1019
1020
label: string label for the new Cdf
1021
"""
1022
if label is None:
1023
label = self.label
1024
return Cdf(list(self.xs), list(self.ps), label=label)
1025
1026
def MakePmf(self, label=None):
1027
"""Makes a Pmf."""
1028
if label is None:
1029
label = self.label
1030
return Pmf(self, label=label)
1031
1032
def Values(self):
1033
"""Returns a sorted list of values.
1034
"""
1035
return self.xs
1036
1037
def Items(self):
1038
"""Returns a sorted sequence of (value, probability) pairs.
1039
1040
Note: in Python3, returns an iterator.
1041
"""
1042
a = self.ps
1043
b = np.roll(a, 1)
1044
b[0] = 0
1045
return zip(self.xs, a-b)
1046
1047
def Shift(self, term):
1048
"""Adds a term to the xs.
1049
1050
term: how much to add
1051
"""
1052
new = self.Copy()
1053
# don't use +=, or else an int array + float yields int array
1054
new.xs = new.xs + term
1055
return new
1056
1057
def Scale(self, factor):
1058
"""Multiplies the xs by a factor.
1059
1060
factor: what to multiply by
1061
"""
1062
new = self.Copy()
1063
# don't use *=, or else an int array * float yields int array
1064
new.xs = new.xs * factor
1065
return new
1066
1067
def Prob(self, x):
1068
"""Returns CDF(x), the probability that corresponds to value x.
1069
1070
Args:
1071
x: number
1072
1073
Returns:
1074
float probability
1075
"""
1076
if x < self.xs[0]:
1077
return 0.0
1078
index = bisect.bisect(self.xs, x)
1079
p = self.ps[index-1]
1080
return p
1081
1082
def Probs(self, xs):
1083
"""Gets probabilities for a sequence of values.
1084
1085
xs: any sequence that can be converted to NumPy array
1086
1087
returns: NumPy array of cumulative probabilities
1088
"""
1089
xs = np.asarray(xs)
1090
index = np.searchsorted(self.xs, xs, side='right')
1091
ps = self.ps[index-1]
1092
ps[xs < self.xs[0]] = 0.0
1093
return ps
1094
1095
ProbArray = Probs
1096
1097
def Value(self, p):
1098
"""Returns InverseCDF(p), the value that corresponds to probability p.
1099
1100
Args:
1101
p: number in the range [0, 1]
1102
1103
Returns:
1104
number value
1105
"""
1106
if p < 0 or p > 1:
1107
raise ValueError('Probability p must be in range [0, 1]')
1108
1109
index = bisect.bisect_left(self.ps, p)
1110
return self.xs[index]
1111
1112
def ValueArray(self, ps):
1113
"""Returns InverseCDF(p), the value that corresponds to probability p.
1114
1115
Args:
1116
ps: NumPy array of numbers in the range [0, 1]
1117
1118
Returns:
1119
NumPy array of values
1120
"""
1121
ps = np.asarray(ps)
1122
if np.any(ps < 0) or np.any(ps > 1):
1123
raise ValueError('Probability p must be in range [0, 1]')
1124
1125
index = np.searchsorted(self.ps, ps, side='left')
1126
return self.xs[index]
1127
1128
def Percentile(self, p):
1129
"""Returns the value that corresponds to percentile p.
1130
1131
Args:
1132
p: number in the range [0, 100]
1133
1134
Returns:
1135
number value
1136
"""
1137
return self.Value(p / 100.0)
1138
1139
def PercentileRank(self, x):
1140
"""Returns the percentile rank of the value x.
1141
1142
x: potential value in the CDF
1143
1144
returns: percentile rank in the range 0 to 100
1145
"""
1146
return self.Prob(x) * 100.0
1147
1148
def Random(self):
1149
"""Chooses a random value from this distribution."""
1150
return self.Value(random.random())
1151
1152
def Sample(self, n):
1153
"""Generates a random sample from this distribution.
1154
1155
n: int length of the sample
1156
returns: NumPy array
1157
"""
1158
ps = np.random.random(n)
1159
return self.ValueArray(ps)
1160
1161
def Mean(self):
1162
"""Computes the mean of a CDF.
1163
1164
Returns:
1165
float mean
1166
"""
1167
old_p = 0
1168
total = 0.0
1169
for x, new_p in zip(self.xs, self.ps):
1170
p = new_p - old_p
1171
total += p * x
1172
old_p = new_p
1173
return total
1174
1175
def CredibleInterval(self, percentage=90):
1176
"""Computes the central credible interval.
1177
1178
If percentage=90, computes the 90% CI.
1179
1180
Args:
1181
percentage: float between 0 and 100
1182
1183
Returns:
1184
sequence of two floats, low and high
1185
"""
1186
prob = (1 - percentage / 100.0) / 2
1187
interval = self.Value(prob), self.Value(1 - prob)
1188
return interval
1189
1190
ConfidenceInterval = CredibleInterval
1191
1192
def _Round(self, multiplier=1000.0):
1193
"""
1194
An entry is added to the cdf only if the percentile differs
1195
from the previous value in a significant digit, where the number
1196
of significant digits is determined by multiplier. The
1197
default is 1000, which keeps log10(1000) = 3 significant digits.
1198
"""
1199
# TODO(write this method)
1200
raise UnimplementedMethodException()
1201
1202
def Render(self, **options):
1203
"""Generates a sequence of points suitable for plotting.
1204
1205
An empirical CDF is a step function; linear interpolation
1206
can be misleading.
1207
1208
Note: options are ignored
1209
1210
Returns:
1211
tuple of (xs, ps)
1212
"""
1213
def interleave(a, b):
1214
c = np.empty(a.shape[0] + b.shape[0])
1215
c[::2] = a
1216
c[1::2] = b
1217
return c
1218
1219
a = np.array(self.xs)
1220
xs = interleave(a, a)
1221
shift_ps = np.roll(self.ps, 1)
1222
shift_ps[0] = 0
1223
ps = interleave(shift_ps, self.ps)
1224
return xs, ps
1225
1226
def Max(self, k):
1227
"""Computes the CDF of the maximum of k selections from this dist.
1228
1229
k: int
1230
1231
returns: new Cdf
1232
"""
1233
cdf = self.Copy()
1234
cdf.ps **= k
1235
return cdf
1236
1237
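
# Illustrative usage sketch (added for exposition; not part of the original
# module). A Cdf built from raw values supports Prob (value -> cumulative
# probability) and Value/Percentile (probability -> value).
def _example_cdf():
    """Percentiles of a small sample."""
    cdf = Cdf([1, 2, 2, 3, 5])
    assert abs(cdf.Prob(2) - 0.6) < 1e-9     # 3 of 5 values are <= 2
    assert cdf.Value(0.5) == 2               # median
    assert cdf.Percentile(80) == 3
    return cdf
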
1238
def MakeCdfFromItems(items, label=None):
1239
"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1240
1241
Args:
1242
items: unsorted sequence of (value, frequency) pairs
1243
label: string label for this CDF
1244
1245
Returns:
1246
cdf: list of (value, fraction) pairs
1247
"""
1248
return Cdf(dict(items), label=label)
1249
1250
1251
def MakeCdfFromDict(d, label=None):
1252
"""Makes a CDF from a dictionary that maps values to frequencies.
1253
1254
Args:
1255
d: dictionary that maps values to frequencies.
1256
label: string label for the data.
1257
1258
Returns:
1259
Cdf object
1260
"""
1261
return Cdf(d, label=label)
1262
1263
1264
def MakeCdfFromList(seq, label=None):
1265
"""Creates a CDF from an unsorted sequence.
1266
1267
Args:
1268
seq: unsorted sequence of sortable values
1269
label: string label for the cdf
1270
1271
Returns:
1272
Cdf object
1273
"""
1274
return Cdf(seq, label=label)
1275
1276
1277
def MakeCdfFromHist(hist, label=None):
1278
"""Makes a CDF from a Hist object.
1279
1280
Args:
1281
hist: Pmf.Hist object
1282
label: string label for the data.
1283
1284
Returns:
1285
Cdf object
1286
"""
1287
if label is None:
1288
label = hist.label
1289
1290
return Cdf(hist, label=label)
1291
1292
1293
def MakeCdfFromPmf(pmf, label=None):
1294
"""Makes a CDF from a Pmf object.
1295
1296
Args:
1297
pmf: Pmf.Pmf object
1298
label: string label for the data.
1299
1300
Returns:
1301
Cdf object
1302
"""
1303
if label is None:
1304
label = pmf.label
1305
1306
return Cdf(pmf, label=label)
1307
1308
1309
class UnimplementedMethodException(Exception):
1310
"""Exception if someone calls a method that should be overridden."""
1311
1312
1313
class Suite(Pmf):
1314
"""Represents a suite of hypotheses and their probabilities."""
1315
1316
def Update(self, data):
1317
"""Updates each hypothesis based on the data.
1318
1319
data: any representation of the data
1320
1321
returns: the normalizing constant
1322
"""
1323
for hypo in self.Values():
1324
like = self.Likelihood(data, hypo)
1325
self.Mult(hypo, like)
1326
return self.Normalize()
1327
1328
def LogUpdate(self, data):
1329
"""Updates a suite of hypotheses based on new data.
1330
1331
Modifies the suite directly; if you want to keep the original, make
1332
a copy.
1333
1334
Note: unlike Update, LogUpdate does not normalize.
1335
1336
Args:
1337
data: any representation of the data
1338
"""
1339
for hypo in self.Values():
1340
like = self.LogLikelihood(data, hypo)
1341
self.Incr(hypo, like)
1342
1343
def UpdateSet(self, dataset):
1344
"""Updates each hypothesis based on the dataset.
1345
1346
This is more efficient than calling Update repeatedly because
1347
it waits until the end to Normalize.
1348
1349
Modifies the suite directly; if you want to keep the original, make
1350
a copy.
1351
1352
dataset: a sequence of data
1353
1354
returns: the normalizing constant
1355
"""
1356
for data in dataset:
1357
for hypo in self.Values():
1358
like = self.Likelihood(data, hypo)
1359
self.Mult(hypo, like)
1360
return self.Normalize()
1361
1362
def LogUpdateSet(self, dataset):
1363
"""Updates each hypothesis based on the dataset.
1364
1365
Modifies the suite directly; if you want to keep the original, make
1366
a copy.
1367
1368
dataset: a sequence of data
1369
1370
returns: None
1371
"""
1372
for data in dataset:
1373
self.LogUpdate(data)
1374
1375
def Likelihood(self, data, hypo):
1376
"""Computes the likelihood of the data under the hypothesis.
1377
1378
hypo: some representation of the hypothesis
1379
data: some representation of the data
1380
"""
1381
raise UnimplementedMethodException()
1382
1383
def LogLikelihood(self, data, hypo):
1384
"""Computes the log likelihood of the data under the hypothesis.
1385
1386
hypo: some representation of the hypothesis
1387
data: some representation of the data
1388
"""
1389
raise UnimplementedMethodException()
1390
1391
def Print(self):
1392
"""Prints the hypotheses and their probabilities."""
1393
for hypo, prob in sorted(self.Items()):
1394
print(hypo, prob)
1395
1396
def MakeOdds(self):
1397
"""Transforms from probabilities to odds.
1398
1399
Values with prob=0 are removed.
1400
"""
1401
for hypo, prob in self.Items():
1402
if prob:
1403
self.Set(hypo, Odds(prob))
1404
else:
1405
self.Remove(hypo)
1406
1407
def MakeProbs(self):
1408
"""Transforms from odds to probabilities."""
1409
for hypo, odds in self.Items():
1410
self.Set(hypo, Probability(odds))
1411
1412
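
# Illustrative usage sketch (added for exposition; not part of the original
# module). Suite.Likelihood must be overridden in a subclass; this is the
# familiar dice problem, where the hypotheses are the number of sides.
class _ExampleDice(Suite):
    """Hypotheses are numbers of sides; data is an observed roll."""

    def Likelihood(self, data, hypo):
        if data > hypo:
            return 0
        return 1.0 / hypo


def _example_suite():
    """Updates the dice Suite with a single roll of 6."""
    suite = _ExampleDice([4, 6, 8, 12])
    suite.Update(6)
    assert suite[4] == 0                     # a 4-sided die cannot roll a 6
    return suite
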
1413
def MakeSuiteFromList(t, label=None):
1414
"""Makes a suite from an unsorted sequence of values.
1415
1416
Args:
1417
t: sequence of numbers
1418
label: string label for this suite
1419
1420
Returns:
1421
Suite object
1422
"""
1423
hist = MakeHistFromList(t, label=label)
1424
d = hist.GetDict()
1425
return MakeSuiteFromDict(d)
1426
1427
1428
def MakeSuiteFromHist(hist, label=None):
1429
"""Makes a normalized suite from a Hist object.
1430
1431
Args:
1432
hist: Hist object
1433
label: string label
1434
1435
Returns:
1436
Suite object
1437
"""
1438
if label is None:
1439
label = hist.label
1440
1441
# make a copy of the dictionary
1442
d = dict(hist.GetDict())
1443
return MakeSuiteFromDict(d, label)
1444
1445
1446
def MakeSuiteFromDict(d, label=None):
1447
"""Makes a suite from a map from values to probabilities.
1448
1449
Args:
1450
d: dictionary that maps values to probabilities
1451
label: string label for this suite
1452
1453
Returns:
1454
Suite object
1455
"""
1456
suite = Suite(label=label)
1457
suite.SetDict(d)
1458
suite.Normalize()
1459
return suite
1460
1461
1462
class Pdf(object):
1463
"""Represents a probability density function (PDF)."""
1464
1465
def Density(self, x):
1466
"""Evaluates this Pdf at x.
1467
1468
Returns: float or NumPy array of probability density
1469
"""
1470
raise UnimplementedMethodException()
1471
1472
def GetLinspace(self):
1473
"""Get a linspace for plotting.
1474
1475
Not all subclasses of Pdf implement this.
1476
1477
Returns: numpy array
1478
"""
1479
raise UnimplementedMethodException()
1480
1481
def MakePmf(self, **options):
1482
"""Makes a discrete version of this Pdf.
1483
1484
options can include
1485
label: string
1486
low: low end of range
1487
high: high end of range
1488
n: number of places to evaluate
1489
1490
Returns: new Pmf
1491
"""
1492
label = options.pop('label', '')
1493
xs, ds = self.Render(**options)
1494
return Pmf(dict(zip(xs, ds)), label=label)
1495
1496
def Render(self, **options):
1497
"""Generates a sequence of points suitable for plotting.
1498
1499
If options includes low and high, it must also include n;
1500
        in that case the density is evaluated at n locations between
1501
low and high, including both.
1502
1503
        If options includes xs, the density is evaluated at those locations.
1504
1505
Otherwise, self.GetLinspace is invoked to provide the locations.
1506
1507
Returns:
1508
tuple of (xs, densities)
1509
"""
1510
low, high = options.pop('low', None), options.pop('high', None)
1511
if low is not None and high is not None:
1512
n = options.pop('n', 101)
1513
xs = np.linspace(low, high, n)
1514
else:
1515
xs = options.pop('xs', None)
1516
if xs is None:
1517
xs = self.GetLinspace()
1518
1519
ds = self.Density(xs)
1520
return xs, ds
1521
1522
def Items(self):
1523
"""Generates a sequence of (value, probability) pairs.
1524
"""
1525
return zip(*self.Render())
1526
1527
1528
class NormalPdf(Pdf):
1529
"""Represents the PDF of a Normal distribution."""
1530
1531
def __init__(self, mu=0, sigma=1, label=None):
1532
"""Constructs a Normal Pdf with given mu and sigma.
1533
1534
mu: mean
1535
sigma: standard deviation
1536
label: string
1537
"""
1538
self.mu = mu
1539
self.sigma = sigma
1540
self.label = label if label is not None else '_nolegend_'
1541
1542
def __str__(self):
1543
return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)
1544
1545
def GetLinspace(self):
1546
"""Get a linspace for plotting.
1547
1548
Returns: numpy array
1549
"""
1550
low, high = self.mu-3*self.sigma, self.mu+3*self.sigma
1551
return np.linspace(low, high, 101)
1552
1553
def Density(self, xs):
1554
"""Evaluates this Pdf at xs.
1555
1556
xs: scalar or sequence of floats
1557
1558
returns: float or NumPy array of probability density
1559
"""
1560
return stats.norm.pdf(xs, self.mu, self.sigma)
1561
1562
1563
class ExponentialPdf(Pdf):
1564
"""Represents the PDF of an exponential distribution."""
1565
1566
def __init__(self, lam=1, label=None):
1567
"""Constructs an exponential Pdf with given parameter.
1568
1569
lam: rate parameter
1570
label: string
1571
"""
1572
self.lam = lam
1573
self.label = label if label is not None else '_nolegend_'
1574
1575
def __str__(self):
1576
return 'ExponentialPdf(%f)' % (self.lam)
1577
1578
def GetLinspace(self):
1579
"""Get a linspace for plotting.
1580
1581
Returns: numpy array
1582
"""
1583
low, high = 0, 5.0/self.lam
1584
return np.linspace(low, high, 101)
1585
1586
def Density(self, xs):
1587
"""Evaluates this Pdf at xs.
1588
1589
xs: scalar or sequence of floats
1590
1591
returns: float or NumPy array of probability density
1592
"""
1593
return stats.expon.pdf(xs, scale=1.0/self.lam)
1594
1595
1596
class EstimatedPdf(Pdf):
1597
"""Represents a PDF estimated by KDE."""
1598
1599
def __init__(self, sample, label=None):
1600
"""Estimates the density function based on a sample.
1601
1602
sample: sequence of data
1603
label: string
1604
"""
1605
self.label = label if label is not None else '_nolegend_'
1606
self.kde = stats.gaussian_kde(sample)
1607
low = min(sample)
1608
high = max(sample)
1609
self.linspace = np.linspace(low, high, 101)
1610
1611
def __str__(self):
1612
return 'EstimatedPdf(label=%s)' % str(self.label)
1613
1614
def GetLinspace(self):
1615
"""Get a linspace for plotting.
1616
1617
Returns: numpy array
1618
"""
1619
return self.linspace
1620
1621
def Density(self, xs):
1622
"""Evaluates this Pdf at xs.
1623
1624
returns: float or NumPy array of probability density
1625
"""
1626
return self.kde.evaluate(xs)
1627
1628
1629
def CredibleInterval(pmf, percentage=90):
1630
"""Computes a credible interval for a given distribution.
1631
1632
If percentage=90, computes the 90% CI.
1633
1634
Args:
1635
pmf: Pmf object representing a posterior distribution
1636
percentage: float between 0 and 100
1637
1638
Returns:
1639
sequence of two floats, low and high
1640
"""
1641
cdf = pmf.MakeCdf()
1642
prob = (1 - percentage / 100.0) / 2
1643
interval = cdf.Value(prob), cdf.Value(1 - prob)
1644
return interval
1645
1646
1647
def PmfProbLess(pmf1, pmf2):
1648
"""Probability that a value from pmf1 is less than a value from pmf2.
1649
1650
Args:
1651
pmf1: Pmf object
1652
pmf2: Pmf object
1653
1654
Returns:
1655
float probability
1656
"""
1657
total = 0.0
1658
for v1, p1 in pmf1.Items():
1659
for v2, p2 in pmf2.Items():
1660
if v1 < v2:
1661
total += p1 * p2
1662
return total
1663
1664
1665
def PmfProbGreater(pmf1, pmf2):
1666
"""Probability that a value from pmf1 is less than a value from pmf2.
1667
1668
Args:
1669
pmf1: Pmf object
1670
pmf2: Pmf object
1671
1672
Returns:
1673
float probability
1674
"""
1675
total = 0.0
1676
for v1, p1 in pmf1.Items():
1677
for v2, p2 in pmf2.Items():
1678
if v1 > v2:
1679
total += p1 * p2
1680
return total
1681
1682
1683
def PmfProbEqual(pmf1, pmf2):
1684
"""Probability that a value from pmf1 equals a value from pmf2.
1685
1686
Args:
1687
pmf1: Pmf object
1688
pmf2: Pmf object
1689
1690
Returns:
1691
float probability
1692
"""
1693
total = 0.0
1694
for v1, p1 in pmf1.Items():
1695
for v2, p2 in pmf2.Items():
1696
if v1 == v2:
1697
total += p1 * p2
1698
return total
1699
1700
1701
def RandomSum(dists):
1702
"""Chooses a random value from each dist and returns the sum.
1703
1704
dists: sequence of Pmf or Cdf objects
1705
1706
returns: numerical sum
1707
"""
1708
total = sum(dist.Random() for dist in dists)
1709
return total
1710
1711
1712
def SampleSum(dists, n):
1713
"""Draws a sample of sums from a list of distributions.
1714
1715
dists: sequence of Pmf or Cdf objects
1716
n: sample size
1717
1718
returns: new Pmf of sums
1719
"""
1720
pmf = Pmf(RandomSum(dists) for i in range(n))
1721
return pmf
1722
1723
1724
def EvalNormalPdf(x, mu, sigma):
1725
"""Computes the unnormalized PDF of the normal distribution.
1726
1727
x: value
1728
mu: mean
1729
sigma: standard deviation
1730
1731
returns: float probability density
1732
"""
1733
return stats.norm.pdf(x, mu, sigma)
1734
1735
1736
def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
1737
"""Makes a PMF discrete approx to a Normal distribution.
1738
1739
mu: float mean
1740
sigma: float standard deviation
1741
num_sigmas: how many sigmas to extend in each direction
1742
n: number of values in the Pmf
1743
1744
returns: normalized Pmf
1745
"""
1746
pmf = Pmf()
1747
low = mu - num_sigmas * sigma
1748
high = mu + num_sigmas * sigma
1749
1750
for x in np.linspace(low, high, n):
1751
p = EvalNormalPdf(x, mu, sigma)
1752
pmf.Set(x, p)
1753
pmf.Normalize()
1754
return pmf
1755
1756
1757
def EvalBinomialPmf(k, n, p):
1758
"""Evaluates the binomial PMF.
1759
1760
    Returns the probability of k successes in n trials with probability p.
1761
"""
1762
return stats.binom.pmf(k, n, p)
1763
1764
1765
def EvalHypergeomPmf(k, N, K, n):
1766
"""Evaluates the hypergeometric PMF.
1767
1768
    Returns the probability of k successes in n trials from a population
1769
N with K successes in it.
1770
"""
1771
return stats.hypergeom.pmf(k, N, K, n)
1772
1773
1774
def EvalPoissonPmf(k, lam):
1775
"""Computes the Poisson PMF.
1776
1777
k: number of events
1778
lam: parameter lambda in events per unit time
1779
1780
returns: float probability
1781
"""
1782
# don't use the scipy function (yet). for lam=0 it returns NaN;
1783
# should be 0.0
1784
# return stats.poisson.pmf(k, lam)
1785
return lam ** k * math.exp(-lam) / special.gamma(k+1)
1786
1787
1788
def MakePoissonPmf(lam, high, step=1):
1789
"""Makes a PMF discrete approx to a Poisson distribution.
1790
1791
lam: parameter lambda in events per unit time
1792
high: upper bound of the Pmf
1793
1794
returns: normalized Pmf
1795
"""
1796
pmf = Pmf()
1797
for k in range(0, high + 1, step):
1798
p = EvalPoissonPmf(k, lam)
1799
pmf.Set(k, p)
1800
pmf.Normalize()
1801
return pmf
1802
1803
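
# Illustrative usage sketch (added for exposition; not part of the original
# module). MakePoissonPmf truncates the Poisson distribution at `high` and
# normalizes; the mode of Poisson(2.7) is 2.
def _example_poisson():
    """A truncated Poisson Pmf for lam=2.7 goals per game."""
    pmf = MakePoissonPmf(lam=2.7, high=10)
    assert abs(pmf.Total() - 1.0) < 1e-6
    assert pmf.MaximumLikelihood() == 2
    return pmf
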
1804
def EvalExponentialPdf(x, lam):
1805
"""Computes the exponential PDF.
1806
1807
x: value
1808
lam: parameter lambda in events per unit time
1809
1810
returns: float probability density
1811
"""
1812
return lam * math.exp(-lam * x)
1813
1814
1815
def EvalExponentialCdf(x, lam):
1816
"""Evaluates CDF of the exponential distribution with parameter lam."""
1817
return 1 - math.exp(-lam * x)
1818
1819
1820
def MakeExponentialPmf(lam, high, n=200):
1821
"""Makes a PMF discrete approx to an exponential distribution.
1822
1823
lam: parameter lambda in events per unit time
1824
high: upper bound
1825
n: number of values in the Pmf
1826
1827
returns: normalized Pmf
1828
"""
1829
pmf = Pmf()
1830
for x in np.linspace(0, high, n):
1831
p = EvalExponentialPdf(x, lam)
1832
pmf.Set(x, p)
1833
pmf.Normalize()
1834
return pmf
1835
1836
1837
def StandardNormalCdf(x):
1838
"""Evaluates the CDF of the standard Normal distribution.
1839
1840
See http://en.wikipedia.org/wiki/Normal_distribution
1841
#Cumulative_distribution_function
1842
1843
Args:
1844
x: float
1845
1846
Returns:
1847
float
1848
"""
1849
return (math.erf(x / ROOT2) + 1) / 2
1850
1851
1852
def EvalNormalCdf(x, mu=0, sigma=1):
1853
"""Evaluates the CDF of the normal distribution.
1854
1855
Args:
1856
x: float
1857
1858
mu: mean parameter
1859
1860
sigma: standard deviation parameter
1861
1862
Returns:
1863
float
1864
"""
1865
return stats.norm.cdf(x, loc=mu, scale=sigma)
1866
1867
1868
def EvalNormalCdfInverse(p, mu=0, sigma=1):
1869
"""Evaluates the inverse CDF of the normal distribution.
1870
1871
See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function
1872
1873
Args:
1874
p: float
1875
1876
mu: mean parameter
1877
1878
sigma: standard deviation parameter
1879
1880
Returns:
1881
float
1882
"""
1883
return stats.norm.ppf(p, loc=mu, scale=sigma)
1884
1885
1886
def EvalLognormalCdf(x, mu=0, sigma=1):
1887
"""Evaluates the CDF of the lognormal distribution.
1888
1889
x: float or sequence
1890
mu: mean parameter
1891
sigma: standard deviation parameter
1892
1893
Returns: float or sequence
1894
"""
1895
    # scipy's lognorm takes the shape parameter sigma and scale exp(mu)
    return stats.lognorm.cdf(x, sigma, loc=0, scale=np.exp(mu))
1896
1897
1898
def RenderExpoCdf(lam, low, high, n=101):
1899
"""Generates sequences of xs and ps for an exponential CDF.
1900
1901
lam: parameter
1902
low: float
1903
high: float
1904
n: number of points to render
1905
1906
returns: numpy arrays (xs, ps)
1907
"""
1908
xs = np.linspace(low, high, n)
1909
ps = 1 - np.exp(-lam * xs)
1910
#ps = stats.expon.cdf(xs, scale=1.0/lam)
1911
return xs, ps
1912
1913
1914
def RenderNormalCdf(mu, sigma, low, high, n=101):
1915
"""Generates sequences of xs and ps for a Normal CDF.
1916
1917
mu: parameter
1918
sigma: parameter
1919
low: float
1920
high: float
1921
n: number of points to render
1922
1923
returns: numpy arrays (xs, ps)
1924
"""
1925
xs = np.linspace(low, high, n)
1926
ps = stats.norm.cdf(xs, mu, sigma)
1927
return xs, ps
1928
1929
1930
def RenderParetoCdf(xmin, alpha, low, high, n=50):
1931
"""Generates sequences of xs and ps for a Pareto CDF.
1932
1933
xmin: parameter
1934
alpha: parameter
1935
low: float
1936
high: float
1937
n: number of points to render
1938
1939
returns: numpy arrays (xs, ps)
1940
"""
1941
if low < xmin:
1942
low = xmin
1943
xs = np.linspace(low, high, n)
1944
ps = 1 - (xs / xmin) ** -alpha
1945
#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
1946
return xs, ps
1947
1948
1949
class Beta(object):
1950
"""Represents a Beta distribution.
1951
1952
See http://en.wikipedia.org/wiki/Beta_distribution
1953
"""
1954
def __init__(self, alpha=1, beta=1, label=None):
1955
"""Initializes a Beta distribution."""
1956
self.alpha = alpha
1957
self.beta = beta
1958
self.label = label if label is not None else '_nolegend_'
1959
1960
def Update(self, data):
1961
"""Updates a Beta distribution.
1962
1963
data: pair of int (heads, tails)
1964
"""
1965
heads, tails = data
1966
self.alpha += heads
1967
self.beta += tails
1968
1969
def Mean(self):
1970
"""Computes the mean of this distribution."""
1971
return self.alpha / (self.alpha + self.beta)
1972
1973
def Random(self):
1974
"""Generates a random variate from this distribution."""
1975
return random.betavariate(self.alpha, self.beta)
1976
1977
def Sample(self, n):
1978
"""Generates a random sample from this distribution.
1979
1980
n: int sample size
1981
"""
1982
size = n,
1983
return np.random.beta(self.alpha, self.beta, size)
1984
1985
def EvalPdf(self, x):
1986
"""Evaluates the PDF at x."""
1987
return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
1988
1989
def MakePmf(self, steps=101, label=None):
1990
"""Returns a Pmf of this distribution.
1991
1992
Note: Normally, we just evaluate the PDF at a sequence
1993
of points and treat the probability density as a probability
1994
mass.
1995
1996
But if alpha or beta is less than one, we have to be
1997
more careful because the PDF goes to infinity at x=0
1998
and x=1. In that case we evaluate the CDF and compute
1999
differences.
2000
"""
2001
if self.alpha < 1 or self.beta < 1:
2002
cdf = self.MakeCdf()
2003
pmf = cdf.MakePmf()
2004
return pmf
2005
2006
xs = [i / (steps - 1.0) for i in range(steps)]
2007
probs = [self.EvalPdf(x) for x in xs]
2008
pmf = Pmf(dict(zip(xs, probs)), label=label)
2009
return pmf
2010
2011
def MakeCdf(self, steps=101):
2012
"""Returns the CDF of this distribution."""
2013
xs = [i / (steps - 1.0) for i in range(steps)]
2014
ps = [special.betainc(self.alpha, self.beta, x) for x in xs]
2015
cdf = Cdf(xs, ps)
2016
return cdf
2017
2018
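
# Illustrative usage sketch (added for exposition; not part of the original
# module). Beta.Update adds observed heads and tails to alpha and beta; the
# posterior mean is alpha / (alpha + beta).
def _example_beta():
    """Posterior Beta after 140 heads and 110 tails from a uniform prior."""
    beta = Beta(1, 1)
    beta.Update((140, 110))
    assert abs(beta.Mean() - 141.0 / 252) < 1e-9
    return beta
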
2019
class Dirichlet(object):
2020
"""Represents a Dirichlet distribution.
2021
2022
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2023
"""
2024
2025
def __init__(self, n, conc=1, label=None):
2026
"""Initializes a Dirichlet distribution.
2027
2028
n: number of dimensions
2029
conc: concentration parameter (smaller yields more concentration)
2030
label: string label
2031
"""
2032
if n < 2:
2033
raise ValueError('A Dirichlet distribution with '
2034
'n<2 makes no sense')
2035
2036
self.n = n
2037
        self.params = np.ones(n, dtype=float) * conc
2038
self.label = label if label is not None else '_nolegend_'
2039
2040
def Update(self, data):
2041
"""Updates a Dirichlet distribution.
2042
2043
data: sequence of observations, in order corresponding to params
2044
"""
2045
m = len(data)
2046
self.params[:m] += data
2047
2048
def Random(self):
2049
"""Generates a random variate from this distribution.
2050
2051
Returns: normalized vector of fractions
2052
"""
2053
p = np.random.gamma(self.params)
2054
return p / p.sum()
2055
2056
def Likelihood(self, data):
2057
"""Computes the likelihood of the data.
2058
2059
Selects a random vector of probabilities from this distribution.
2060
2061
Returns: float probability
2062
"""
2063
m = len(data)
2064
if self.n < m:
2065
return 0
2066
2067
x = data
2068
p = self.Random()
2069
q = p[:m] ** x
2070
return q.prod()
2071
2072
def LogLikelihood(self, data):
2073
"""Computes the log likelihood of the data.
2074
2075
Selects a random vector of probabilities from this distribution.
2076
2077
Returns: float log probability
2078
"""
2079
m = len(data)
2080
if self.n < m:
2081
return float('-inf')
2082
2083
x = self.Random()
2084
y = np.log(x[:m]) * data
2085
return y.sum()
2086
2087
def MarginalBeta(self, i):
2088
"""Computes the marginal distribution of the ith element.
2089
2090
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2091
#Marginal_distributions
2092
2093
i: int
2094
2095
Returns: Beta object
2096
"""
2097
alpha0 = self.params.sum()
2098
alpha = self.params[i]
2099
return Beta(alpha, alpha0 - alpha)
2100
2101
def PredictivePmf(self, xs, label=None):
2102
"""Makes a predictive distribution.
2103
2104
xs: values to go into the Pmf
2105
2106
Returns: Pmf that maps from x to the mean prevalence of x
2107
"""
2108
alpha0 = self.params.sum()
2109
ps = self.params / alpha0
2110
        # wrap in dict so the pairs become value -> probability mappings
        return Pmf(dict(zip(xs, ps)), label=label)
2111
2112
2113
def BinomialCoef(n, k):
2114
"""Compute the binomial coefficient "n choose k".
2115
2116
n: number of trials
2117
k: number of successes
2118
2119
Returns: float
2120
"""
2121
    # scipy.misc.comb was removed; the equivalent lives in scipy.special
    return special.comb(n, k)
2122
2123
2124
def LogBinomialCoef(n, k):
2125
"""Computes the log of the binomial coefficient.
2126
2127
http://math.stackexchange.com/questions/64716/
2128
approximating-the-logarithm-of-the-binomial-coefficient
2129
2130
n: number of trials
2131
k: number of successes
2132
2133
Returns: float
2134
"""
2135
return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)
2136
2137
2138
def NormalProbability(ys, jitter=0.0):
2139
"""Generates data for a normal probability plot.
2140
2141
ys: sequence of values
2142
jitter: float magnitude of jitter added to the ys
2143
2144
returns: numpy arrays xs, ys
2145
"""
2146
n = len(ys)
2147
xs = np.random.normal(0, 1, n)
2148
xs.sort()
2149
2150
if jitter:
2151
ys = Jitter(ys, jitter)
2152
else:
2153
ys = np.array(ys)
2154
ys.sort()
2155
2156
return xs, ys
2157
2158
2159
def Jitter(values, jitter=0.5):
2160
"""Jitters the values by adding a uniform variate in (-jitter, jitter).
2161
2162
values: sequence
2163
jitter: scalar magnitude of jitter
2164
2165
returns: new numpy array
2166
"""
2167
n = len(values)
2168
return np.random.uniform(-jitter, +jitter, n) + values
2169
2170
2171
def NormalProbabilityPlot(sample, fit_color='0.8', **options):
2172
"""Makes a normal probability plot with a fitted line.
2173
2174
sample: sequence of numbers
2175
fit_color: color string for the fitted line
2176
options: passed along to Plot
2177
"""
2178
xs, ys = NormalProbability(sample)
2179
mean, var = MeanVar(sample)
2180
std = math.sqrt(var)
2181
2182
fit = FitLine(xs, mean, std)
2183
thinkplot.Plot(*fit, color=fit_color, label='model')
2184
2185
xs, ys = NormalProbability(sample)
2186
thinkplot.Plot(xs, ys, **options)
2187
2188
2189
def Mean(xs):
2190
"""Computes mean.
2191
2192
xs: sequence of values
2193
2194
returns: float mean
2195
"""
2196
return np.mean(xs)
2197
2198
2199
def Var(xs, mu=None, ddof=0):
2200
"""Computes variance.
2201
2202
xs: sequence of values
2203
    mu: optional known mean
2204
ddof: delta degrees of freedom
2205
2206
returns: float
2207
"""
2208
xs = np.asarray(xs)
2209
2210
if mu is None:
2211
mu = xs.mean()
2212
2213
ds = xs - mu
2214
return np.dot(ds, ds) / (len(xs) - ddof)
2215
2216
2217
def Std(xs, mu=None, ddof=0):
2218
"""Computes standard deviation.
2219
2220
xs: sequence of values
2221
    mu: optional known mean
2222
ddof: delta degrees of freedom
2223
2224
returns: float
2225
"""
2226
var = Var(xs, mu, ddof)
2227
return math.sqrt(var)
2228
2229
2230
def MeanVar(xs, ddof=0):
2231
"""Computes mean and variance.
2232
2233
Based on http://stackoverflow.com/questions/19391149/
2234
numpy-mean-and-variance-from-single-function
2235
2236
xs: sequence of values
2237
ddof: delta degrees of freedom
2238
2239
returns: pair of float, mean and var
2240
"""
2241
xs = np.asarray(xs)
2242
mean = xs.mean()
2243
s2 = Var(xs, mean, ddof)
2244
return mean, s2
2245
2246
2247
def Trim(t, p=0.01):
2248
"""Trims the largest and smallest elements of t.
2249
2250
Args:
2251
t: sequence of numbers
2252
p: fraction of values to trim off each end
2253
2254
Returns:
2255
sequence of values
2256
"""
2257
n = int(p * len(t))
2258
    t = sorted(t)[n:len(t) - n]    # avoids an empty slice when n == 0
2259
return t
2260
2261
2262
def TrimmedMean(t, p=0.01):
2263
"""Computes the trimmed mean of a sequence of numbers.
2264
2265
Args:
2266
t: sequence of numbers
2267
p: fraction of values to trim off each end
2268
2269
Returns:
2270
float
2271
"""
2272
t = Trim(t, p)
2273
return Mean(t)
2274
2275
2276
def TrimmedMeanVar(t, p=0.01):
2277
"""Computes the trimmed mean and variance of a sequence of numbers.
2278
2279
Side effect: sorts the list.
2280
2281
Args:
2282
t: sequence of numbers
2283
p: fraction of values to trim off each end
2284
2285
Returns:
2286
        pair of floats (mean, variance)
2287
"""
2288
t = Trim(t, p)
2289
mu, var = MeanVar(t)
2290
return mu, var
2291
2292
2293
def CohenEffectSize(group1, group2):
2294
"""Compute Cohen's d.
2295
2296
group1: Series or NumPy array
2297
group2: Series or NumPy array
2298
2299
returns: float
2300
"""
2301
diff = group1.mean() - group2.mean()
2302
2303
n1, n2 = len(group1), len(group2)
2304
var1 = group1.var()
2305
var2 = group2.var()
2306
2307
pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
2308
d = diff / math.sqrt(pooled_var)
2309
return d
2310
2311
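
# Illustrative usage sketch (added for exposition; not part of the original
# module). Cohen's d is the difference in means divided by the pooled
# standard deviation.
def _example_cohen():
    """Effect size for two small arrays whose means differ by one unit."""
    group1 = np.array([2.0, 4.0, 6.0, 8.0])
    group2 = np.array([1.0, 3.0, 5.0, 7.0])
    d = CohenEffectSize(group1, group2)
    assert abs(d - 1.0 / math.sqrt(5.0)) < 1e-9
    return d
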
2312
def Cov(xs, ys, meanx=None, meany=None):
2313
"""Computes Cov(X, Y).
2314
2315
Args:
2316
xs: sequence of values
2317
ys: sequence of values
2318
meanx: optional float mean of xs
2319
meany: optional float mean of ys
2320
2321
Returns:
2322
Cov(X, Y)
2323
"""
2324
xs = np.asarray(xs)
2325
ys = np.asarray(ys)
2326
2327
if meanx is None:
2328
meanx = np.mean(xs)
2329
if meany is None:
2330
meany = np.mean(ys)
2331
2332
cov = np.dot(xs-meanx, ys-meany) / len(xs)
2333
return cov
2334
2335
2336
def Corr(xs, ys):
2337
"""Computes Corr(X, Y).
2338
2339
Args:
2340
xs: sequence of values
2341
ys: sequence of values
2342
2343
Returns:
2344
Corr(X, Y)
2345
"""
2346
xs = np.asarray(xs)
2347
ys = np.asarray(ys)
2348
2349
meanx, varx = MeanVar(xs)
2350
meany, vary = MeanVar(ys)
2351
2352
corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)
2353
2354
return corr
2355
2356
2357
def SerialCorr(series, lag=1):
    """Computes the serial correlation of a series.

    series: Series
    lag: integer number of intervals to shift

    returns: float correlation
    """
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = Corr(xs, ys)
    return corr


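# Illustrative usage sketch (comments only): serial correlation compares a
# pandas Series with a lagged copy of itself. Hypothetical random walk:
#
#     series = pandas.Series(np.random.normal(size=100).cumsum())
#     SerialCorr(series, lag=1)   # typically close to 1 for a random walk
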
def SpearmanCorr(xs, ys):
    """Computes Spearman's rank correlation.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        float Spearman's correlation
    """
    xranks = pandas.Series(xs).rank()
    yranks = pandas.Series(ys).rank()
    return Corr(xranks, yranks)


def MapToRanks(t):
    """Returns a list of ranks corresponding to the elements in t.

    Args:
        t: sequence of numbers

    Returns:
        list of integer ranks, starting at 1
    """
    # pair up each value with its index
    pairs = enumerate(t)

    # sort by value
    sorted_pairs = sorted(pairs, key=itemgetter(1))

    # pair up each pair with its rank
    ranked = enumerate(sorted_pairs)

    # sort by index
    resorted = sorted(ranked, key=lambda trip: trip[1][0])

    # extract the ranks
    ranks = [trip[0]+1 for trip in resorted]
    return ranks


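# Illustrative usage sketch (comments only): Spearman's correlation is the
# Pearson correlation of the ranks, so it captures monotonic but nonlinear
# relationships. Hypothetical data:
#
#     xs = [1, 2, 3, 4, 5]
#     ys = [1, 4, 9, 16, 25]    # nonlinear but strictly increasing
#     SpearmanCorr(xs, ys)      # 1.0
#     MapToRanks(ys)            # [1, 2, 3, 4, 5]
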
def LeastSquares(xs, ys):
    """Computes a linear least squares fit for ys as a function of xs.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    meanx, varx = MeanVar(xs)
    meany = Mean(ys)

    slope = Cov(xs, ys, meanx, meany) / varx
    inter = meany - slope * meanx

    return inter, slope


def FitLine(xs, inter, slope):
    """Fits a line to the given data.

    xs: sequence of x
    inter: float intercept
    slope: float slope

    returns: tuple of numpy arrays (sorted xs, fit ys)
    """
    fit_xs = np.sort(xs)
    fit_ys = inter + slope * fit_xs
    return fit_xs, fit_ys


def Residuals(xs, ys, inter, slope):
    """Computes residuals for a linear fit with parameters inter and slope.

    Args:
        xs: independent variable
        ys: dependent variable
        inter: float intercept
        slope: float slope

    Returns:
        list of residuals
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    res = ys - (inter + slope * xs)
    return res


def CoefDetermination(ys, res):
    """Computes the coefficient of determination (R^2) for given residuals.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    return 1 - Var(res) / Var(ys)


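# Illustrative usage sketch (comments only): a complete least squares
# workflow with hypothetical data: estimate the parameters, compute the
# residuals, and summarize the fit with R^2.
#
#     xs = [1, 2, 3, 4, 5]
#     ys = [2.1, 3.9, 6.2, 8.1, 9.8]
#     inter, slope = LeastSquares(xs, ys)
#     fit_xs, fit_ys = FitLine(xs, inter, slope)
#     res = Residuals(xs, ys, inter, slope)
#     r2 = CoefDetermination(ys, res)   # close to 1 for this nearly linear data
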
def CorrelatedGenerator(rho):
    """Generates standard normal variates with serial correlation.

    rho: target coefficient of correlation

    Returns: iterable
    """
    x = random.gauss(0, 1)
    yield x

    sigma = math.sqrt(1 - rho**2)
    while True:
        x = random.gauss(x * rho, sigma)
        yield x


def CorrelatedNormalGenerator(mu, sigma, rho):
    """Generates normal variates with serial correlation.

    mu: mean of variate
    sigma: standard deviation of variate
    rho: target coefficient of correlation

    Returns: iterable
    """
    for x in CorrelatedGenerator(rho):
        yield x * sigma + mu


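# Illustrative usage sketch (comments only): these generators yield an AR(1)
# sequence whose serial correlation is approximately rho. Using
# itertools.islice (standard library) to take a finite sample:
#
#     import itertools
#     gen = CorrelatedNormalGenerator(mu=10, sigma=2, rho=0.8)
#     xs = list(itertools.islice(gen, 1000))
#     SerialCorr(pandas.Series(xs))   # roughly 0.8
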
def RawMoment(xs, k):
    """Computes the kth raw moment of xs.
    """
    return sum(x**k for x in xs) / len(xs)


def CentralMoment(xs, k):
    """Computes the kth central moment of xs.
    """
    mean = RawMoment(xs, 1)
    return sum((x - mean)**k for x in xs) / len(xs)


def StandardizedMoment(xs, k):
    """Computes the kth standardized moment of xs.
    """
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    return CentralMoment(xs, k) / std**k


def Skewness(xs):
    """Computes skewness.
    """
    return StandardizedMoment(xs, 3)


def Median(xs):
    """Computes the median (50th percentile) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: float
    """
    cdf = Cdf(xs)
    return cdf.Value(0.5)


def IQR(xs):
    """Computes the interquartile range of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: pair of floats, the 25th and 75th percentiles
    """
    cdf = Cdf(xs)
    return cdf.Value(0.25), cdf.Value(0.75)


def PearsonMedianSkewness(xs):
    """Computes the Pearson median skewness.
    """
    median = Median(xs)
    mean = RawMoment(xs, 1)
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    gp = 3 * (mean - median) / std
    return gp


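# Illustrative usage sketch (comments only): comparing the moment-based
# skewness with the more robust Pearson median skewness on hypothetical
# right-skewed data.
#
#     xs = [1, 2, 2, 3, 3, 3, 4, 4, 5, 20]
#     Skewness(xs)                # positive: the long tail is on the right
#     PearsonMedianSkewness(xs)   # also positive, but less sensitive to the outlier
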
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pair of int
        # (use plain int; np.int is removed in recent NumPy releases)
        self.colspecs = self.colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs,
                             names=self.names,
                             **options)
        return df


def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = dict(byte=int, int=int, long=int, float=float, double=float)

    var_info = []
    for line in open(dct_file, **options):
        match = re.search(r'_column\(([^)]*)\)', line)
        if match:
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pandas.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct


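# Illustrative usage sketch (comments only): reading a Stata dictionary and
# the fixed width data file it describes. The filenames below are
# placeholders, not files shipped with this module.
#
#     dct = ReadStataDct('survey.dct')
#     df = dct.ReadFixedWidth('survey.dat')
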
def Resample(xs, n=None):
    """Draw a sample from xs with the same length as xs.

    xs: sequence
    n: sample size (default: len(xs))

    returns: NumPy array
    """
    if n is None:
        n = len(xs)
    return np.random.choice(xs, n, replace=True)


def SampleRows(df, nrows, replace=False):
    """Choose a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    sample = df.loc[indices]
    return sample


def ResampleRows(df):
    """Resamples rows from a DataFrame.

    df: DataFrame

    returns: DataFrame
    """
    return SampleRows(df, len(df), replace=True)


def ResampleRowsWeighted(df, column='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to given column.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    weights = df[column]
    cdf = Cdf(dict(weights))
    indices = cdf.Sample(len(weights))
    sample = df.loc[indices]
    return sample


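# Illustrative usage sketch (comments only): bootstrap resampling with these
# helpers; df is a hypothetical DataFrame and 'weight' a hypothetical column
# of sampling weights.
#
#     sample = Resample([1, 2, 3, 4, 5])      # 5 draws with replacement
#     boot = ResampleRows(df)                 # same number of rows, with replacement
#     weighted = ResampleRowsWeighted(df, 'weight')
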
def PercentileRow(array, p):
    """Selects the row from a sorted array that maps to percentile p.

    p: float 0--100

    returns: NumPy array (one row)
    """
    rows, cols = array.shape
    index = int(rows * p / 100)
    return array[index,]


def PercentileRows(ys_seq, percents):
    """Given a collection of lines, selects percentiles along vertical axis.

    For example, if ys_seq contains simulation results like ys as a
    function of time, and percents contains (5, 95), the result would
    be a 90% CI for each vertical slice of the simulation results.

    ys_seq: sequence of lines (y values)
    percents: list of percentiles (0-100) to select

    returns: list of NumPy arrays, one for each percentile
    """
    nrows = len(ys_seq)
    ncols = len(ys_seq[0])
    array = np.zeros((nrows, ncols))

    for i, ys in enumerate(ys_seq):
        array[i,] = ys

    array = np.sort(array, axis=0)

    rows = [PercentileRow(array, p) for p in percents]
    return rows


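# Illustrative usage sketch (comments only): building a pointwise 90% CI
# band from simulated trajectories (the simulation below is hypothetical).
#
#     ys_seq = [np.random.normal(size=50).cumsum() for _ in range(1000)]
#     low, high = PercentileRows(ys_seq, [5, 95])
#     # low[j] and high[j] bracket 90% of the simulated values at step j
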
def Smooth(xs, sigma=2, **options):
    """Smooths a NumPy array with a Gaussian filter.

    xs: sequence
    sigma: standard deviation of the filter
    """
    # ndimage.gaussian_filter1d is the current name; the older
    # ndimage.filters namespace is deprecated in recent SciPy.
    return ndimage.gaussian_filter1d(xs, sigma, **options)


class HypothesisTest(object):
    """Represents a hypothesis test."""

    def __init__(self, data):
        """Initializes.

        data: data in whatever form is relevant
        """
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)
        self.test_stats = None
        self.test_cdf = None

    def PValue(self, iters=1000):
        """Computes the distribution of the test statistic and p-value.

        iters: number of iterations

        returns: float p-value
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]
        self.test_cdf = Cdf(self.test_stats)

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def MaxTestStat(self):
        """Returns the largest test statistic seen during simulations.
        """
        return max(self.test_stats)

    def PlotCdf(self, label=None):
        """Draws a Cdf with a vertical line at the observed test stat.
        """
        def VertLine(x):
            """Draws a vertical line at x."""
            thinkplot.Plot([x, x], [0, 1], color='0.8')

        VertLine(self.actual)
        thinkplot.Cdf(self.test_cdf, label=label)

    def TestStatistic(self, data):
        """Computes the test statistic.

        data: data in whatever form is relevant
        """
        raise UnimplementedMethodException()

    def MakeModel(self):
        """Build a model of the null hypothesis.
        """
        pass

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        raise UnimplementedMethodException()


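# Illustrative sketch (comments only): one way to subclass HypothesisTest is
# a permutation test for a difference in means between two groups. This
# mirrors the pattern used in the book's examples; the class and data below
# are not part of this module.
#
#     class DiffMeansPermute(HypothesisTest):
#         def TestStatistic(self, data):
#             group1, group2 = data
#             return abs(group1.mean() - group2.mean())
#
#         def MakeModel(self):
#             group1, group2 = self.data
#             self.n, self.m = len(group1), len(group2)
#             self.pool = np.hstack((group1, group2))
#
#         def RunModel(self):
#             np.random.shuffle(self.pool)
#             return self.pool[:self.n], self.pool[self.n:]
#
#     data = np.random.normal(0, 1, 100), np.random.normal(0.5, 1, 100)
#     pvalue = DiffMeansPermute(data).PValue(iters=1000)
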
def main():
    pass


if __name__ == '__main__':
    main()