Repository for a workshop on Bayesian statistics

"""This file contains code for use with "Think Stats" and
"Think Bayes", both by Allen B. Downey, available from greenteapress.com

Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

"""This file contains class definitions for:

Hist: represents a histogram (map from values to integer frequencies).

Pmf: represents a probability mass function (map from values to probs).

_DictWrapper: private parent class for Hist and Pmf.

Cdf: represents a discrete cumulative distribution function

Pdf: represents a continuous probability density function
"""

import bisect
import copy
import logging
import math
import random
import re

from collections import Counter
from operator import itemgetter

import thinkplot

import numpy as np
import pandas

import scipy
from scipy import stats
from scipy import special
from scipy import ndimage

from scipy.special import gamma

from io import open

ROOT2 = math.sqrt(2)


def RandomSeed(x):
    """Initialize the random and np.random generators.

    x: int seed
    """
    random.seed(x)
    np.random.seed(x)


def Odds(p):
    """Computes odds for a given probability.

    Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.

    Note: when p=1, the formula for odds divides by zero, which is
    normally undefined. But I think it is reasonable to define Odds(1)
    to be infinity, so that's what this function does.

    p: float 0-1

    Returns: float odds
    """
    if p == 1:
        return float('inf')
    return p / (1 - p)


def Probability(o):
    """Computes the probability corresponding to given odds.

    Example: o=2 means 2:1 odds in favor, or 2/3 probability

    o: float odds, strictly positive

    Returns: float probability
    """
    return o / (o + 1)


def Probability2(yes, no):
    """Computes the probability corresponding to given odds.

    Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.

    yes, no: int or float odds in favor
    """
    return yes / (yes + no)
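
# Illustrative usage sketch (not part of the original module): round-trips a
# probability through the odds helpers above. The _demo_* name is arbitrary;
# only definitions from this file are used.
def _demo_odds():
    p = 0.75
    o = Odds(p)                          # 3.0, i.e. 3:1 in favor
    assert abs(Probability(o) - p) < 1e-12
    assert Probability2(3, 1) == p
    return o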
97
98
99
class Interpolator(object):
100
"""Represents a mapping between sorted sequences; performs linear interp.
101
102
Attributes:
103
xs: sorted list
104
ys: sorted list
105
"""
106
107
def __init__(self, xs, ys):
108
self.xs = xs
109
self.ys = ys
110
111
def Lookup(self, x):
112
"""Looks up x and returns the corresponding value of y."""
113
return self._Bisect(x, self.xs, self.ys)
114
115
def Reverse(self, y):
116
"""Looks up y and returns the corresponding value of x."""
117
return self._Bisect(y, self.ys, self.xs)
118
119
def _Bisect(self, x, xs, ys):
120
"""Helper function."""
121
if x <= xs[0]:
122
return ys[0]
123
if x >= xs[-1]:
124
return ys[-1]
125
i = bisect.bisect(xs, x)
126
frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
127
y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
128
return y
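
# Illustrative usage sketch (not part of the original module): Interpolator
# maps between two sorted sequences with linear interpolation in both
# directions. The _demo_* name is arbitrary.
def _demo_interpolator():
    interp = Interpolator(xs=[0, 10, 20], ys=[0, 100, 400])
    assert interp.Lookup(5) == 50.0      # halfway between ys[0] and ys[1]
    assert interp.Reverse(250) == 15.0   # halfway between xs[1] and xs[2]
    return interp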
129
130
131
# When we plot Hist, Pmf and Cdf objects, they don't appear in
132
# the legend unless we override the default label.
133
DEFAULT_LABEL = '_nolegend_'
134
135
136
class _DictWrapper(object):
137
"""An object that contains a dictionary."""
138
139
def __init__(self, obj=None, label=None):
140
"""Initializes the distribution.
141
142
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
143
label: string label
144
"""
145
self.label = label if label is not None else DEFAULT_LABEL
146
self.d = {}
147
148
# flag whether the distribution is under a log transform
149
self.log = False
150
151
if obj is None:
152
return
153
154
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
155
self.label = label if label is not None else obj.label
156
157
if isinstance(obj, dict):
158
self.d.update(obj.items())
159
elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
160
self.d.update(obj.Items())
161
elif isinstance(obj, pandas.Series):
162
            self.d.update(obj.value_counts().items())
163
else:
164
# finally, treat it like a list
165
self.d.update(Counter(obj))
166
167
if len(self) > 0 and isinstance(self, Pmf):
168
self.Normalize()
169
170
def __hash__(self):
171
return id(self)
172
173
def __str__(self):
174
cls = self.__class__.__name__
175
if self.label == DEFAULT_LABEL:
176
return '%s(%s)' % (cls, str(self.d))
177
else:
178
return self.label
179
180
def __repr__(self):
181
cls = self.__class__.__name__
182
if self.label == DEFAULT_LABEL:
183
return '%s(%s)' % (cls, repr(self.d))
184
else:
185
return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label))
186
187
def __eq__(self, other):
188
try:
189
return self.d == other.d
190
except AttributeError:
191
return False
192
193
def __len__(self):
194
return len(self.d)
195
196
def __iter__(self):
197
return iter(self.d)
198
199
def iterkeys(self):
200
"""Returns an iterator over keys."""
201
return iter(self.d)
202
203
def __contains__(self, value):
204
return value in self.d
205
206
def __getitem__(self, value):
207
return self.d.get(value, 0)
208
209
def __setitem__(self, value, prob):
210
self.d[value] = prob
211
212
def __delitem__(self, value):
213
del self.d[value]
214
215
def Copy(self, label=None):
216
"""Returns a copy.
217
218
Make a shallow copy of d. If you want a deep copy of d,
219
use copy.deepcopy on the whole object.
220
221
label: string label for the new Hist
222
223
returns: new _DictWrapper with the same type
224
"""
225
new = copy.copy(self)
226
new.d = copy.copy(self.d)
227
new.label = label if label is not None else self.label
228
return new
229
230
def Scale(self, factor):
231
"""Multiplies the values by a factor.
232
233
factor: what to multiply by
234
235
Returns: new object
236
"""
237
new = self.Copy()
238
new.d.clear()
239
240
for val, prob in self.Items():
241
new.Set(val * factor, prob)
242
return new
243
244
def Log(self, m=None):
245
"""Log transforms the probabilities.
246
247
Removes values with probability 0.
248
249
Normalizes so that the largest logprob is 0.
250
"""
251
if self.log:
252
raise ValueError("Pmf/Hist already under a log transform")
253
self.log = True
254
255
if m is None:
256
m = self.MaxLike()
257
258
for x, p in self.d.items():
259
if p:
260
self.Set(x, math.log(p / m))
261
else:
262
self.Remove(x)
263
264
def Exp(self, m=None):
265
"""Exponentiates the probabilities.
266
267
m: how much to shift the ps before exponentiating
268
269
If m is None, normalizes so that the largest prob is 1.
270
"""
271
if not self.log:
272
raise ValueError("Pmf/Hist not under a log transform")
273
self.log = False
274
275
if m is None:
276
m = self.MaxLike()
277
278
for x, p in self.d.items():
279
self.Set(x, math.exp(p - m))
280
281
def GetDict(self):
282
"""Gets the dictionary."""
283
return self.d
284
285
def SetDict(self, d):
286
"""Sets the dictionary."""
287
self.d = d
288
289
def Values(self):
290
"""Gets an unsorted sequence of values.
291
292
Note: one source of confusion is that the keys of this
293
dictionary are the values of the Hist/Pmf, and the
294
values of the dictionary are frequencies/probabilities.
295
"""
296
return self.d.keys()
297
298
def Items(self):
299
"""Gets an unsorted sequence of (value, freq/prob) pairs."""
300
return self.d.items()
301
302
def SortedItems(self):
303
"""Gets a sorted sequence of (value, freq/prob) pairs.
304
305
        If items are unsortable, the result is unsorted.
306
"""
307
def isnan(x):
308
try:
309
return math.isnan(x)
310
except TypeError:
311
return False
312
313
if any([isnan(x) for x in self.Values()]):
314
msg = 'Keys contain NaN, may not sort correctly.'
315
logging.warning(msg)
316
317
try:
318
return sorted(self.d.items())
319
except TypeError:
320
return self.d.items()
321
322
def Render(self, **options):
323
"""Generates a sequence of points suitable for plotting.
324
325
Note: options are ignored
326
327
Returns:
328
tuple of (sorted value sequence, freq/prob sequence)
329
"""
330
return zip(*self.SortedItems())
331
332
def MakeCdf(self, label=None):
333
"""Makes a Cdf."""
334
label = label if label is not None else self.label
335
return Cdf(self, label=label)
336
337
def Print(self):
338
"""Prints the values and freqs/probs in ascending order."""
339
for val, prob in self.SortedItems():
340
print(val, prob)
341
342
def Set(self, x, y=0):
343
"""Sets the freq/prob associated with the value x.
344
345
Args:
346
x: number value
347
y: number freq or prob
348
"""
349
self.d[x] = y
350
351
def Incr(self, x, term=1):
352
"""Increments the freq/prob associated with the value x.
353
354
Args:
355
x: number value
356
term: how much to increment by
357
"""
358
self.d[x] = self.d.get(x, 0) + term
359
360
def Mult(self, x, factor):
361
"""Scales the freq/prob associated with the value x.
362
363
Args:
364
x: number value
365
factor: how much to multiply by
366
"""
367
self.d[x] = self.d.get(x, 0) * factor
368
369
def Remove(self, x):
370
"""Removes a value.
371
372
Throws an exception if the value is not there.
373
374
Args:
375
x: value to remove
376
"""
377
del self.d[x]
378
379
def Total(self):
380
"""Returns the total of the frequencies/probabilities in the map."""
381
total = sum(self.d.values())
382
return total
383
384
def MaxLike(self):
385
"""Returns the largest frequency/probability in the map."""
386
return max(self.d.values())
387
388
def Largest(self, n=10):
389
"""Returns the largest n values, with frequency/probability.
390
391
n: number of items to return
392
"""
393
return sorted(self.d.items(), reverse=True)[:n]
394
395
def Smallest(self, n=10):
396
"""Returns the smallest n values, with frequency/probability.
397
398
n: number of items to return
399
"""
400
return sorted(self.d.items(), reverse=False)[:n]
401
402
403
class Hist(_DictWrapper):
404
"""Represents a histogram, which is a map from values to frequencies.
405
406
Values can be any hashable type; frequencies are integer counters.
407
"""
408
def Freq(self, x):
409
"""Gets the frequency associated with the value x.
410
411
Args:
412
x: number value
413
414
Returns:
415
int frequency
416
"""
417
return self.d.get(x, 0)
418
419
def Freqs(self, xs):
420
"""Gets frequencies for a sequence of values."""
421
return [self.Freq(x) for x in xs]
422
423
def IsSubset(self, other):
424
"""Checks whether the values in this histogram are a subset of
425
the values in the given histogram."""
426
for val, freq in self.Items():
427
if freq > other.Freq(val):
428
return False
429
return True
430
431
def Subtract(self, other):
432
"""Subtracts the values in the given histogram from this histogram."""
433
for val, freq in other.Items():
434
self.Incr(val, -freq)
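
# Illustrative usage sketch (not part of the original module): Hist counts
# hashable values; the constructor accepts any sequence, which is fed to a
# collections.Counter. The _demo_* name is arbitrary.
def _demo_hist():
    hist = Hist(['a', 'b', 'b', 'c', 'b'])
    assert hist.Freq('b') == 3
    assert hist.Freq('z') == 0           # missing values have frequency 0
    assert Hist(['b']).IsSubset(hist)
    return hist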
435
436
437
class Pmf(_DictWrapper):
438
"""Represents a probability mass function.
439
440
Values can be any hashable type; probabilities are floating-point.
441
Pmfs are not necessarily normalized.
442
"""
443
444
def Prob(self, x, default=0):
445
"""Gets the probability associated with the value x.
446
447
Args:
448
x: number value
449
default: value to return if the key is not there
450
451
Returns:
452
float probability
453
"""
454
return self.d.get(x, default)
455
456
def Probs(self, xs):
457
"""Gets probabilities for a sequence of values."""
458
return [self.Prob(x) for x in xs]
459
460
def Percentile(self, percentage):
461
"""Computes a percentile of a given Pmf.
462
463
Note: this is not super efficient. If you are planning
464
to compute more than a few percentiles, compute the Cdf.
465
466
percentage: float 0-100
467
468
returns: value from the Pmf
469
"""
470
p = percentage / 100
471
total = 0
472
for val, prob in sorted(self.Items()):
473
total += prob
474
if total >= p:
475
return val
476
477
def ProbGreater(self, x):
478
"""Probability that a sample from this Pmf exceeds x.
479
480
x: number
481
482
returns: float probability
483
"""
484
if isinstance(x, _DictWrapper):
485
return PmfProbGreater(self, x)
486
else:
487
t = [prob for (val, prob) in self.d.items() if val > x]
488
return sum(t)
489
490
def ProbLess(self, x):
491
"""Probability that a sample from this Pmf is less than x.
492
493
x: number
494
495
returns: float probability
496
"""
497
if isinstance(x, _DictWrapper):
498
return PmfProbLess(self, x)
499
else:
500
t = [prob for (val, prob) in self.d.items() if val < x]
501
return sum(t)
502
503
def ProbEqual(self, x):
504
"""Probability that a sample from this Pmf is exactly x.
505
506
x: number
507
508
returns: float probability
509
"""
510
if isinstance(x, _DictWrapper):
511
return PmfProbEqual(self, x)
512
else:
513
return self[x]
514
515
# NOTE: I've decided to remove the magic comparators because they
516
# have the side-effect of making Pmf sortable, but in fact they
517
# don't support sorting.
518
519
def Normalize(self, fraction=1):
520
"""Normalizes this PMF so the sum of all probs is fraction.
521
522
Args:
523
fraction: what the total should be after normalization
524
525
Returns: the total probability before normalizing
526
"""
527
if self.log:
528
raise ValueError("Normalize: Pmf is under a log transform")
529
530
total = self.Total()
531
if total == 0:
532
raise ValueError('Normalize: total probability is zero.')
533
534
factor = fraction / total
535
for x in self.d:
536
self.d[x] *= factor
537
538
return total
539
540
def Random(self):
541
"""Chooses a random element from this PMF.
542
543
Note: this is not very efficient. If you plan to call
544
this more than a few times, consider converting to a CDF.
545
546
Returns:
547
float value from the Pmf
548
"""
549
target = random.random()
550
total = 0
551
for x, p in self.d.items():
552
total += p
553
if total >= target:
554
return x
555
556
# we shouldn't get here
557
raise ValueError('Random: Pmf might not be normalized.')
558
559
def Sample(self, n):
560
"""Generates a random sample from this distribution.
561
562
n: int length of the sample
563
returns: NumPy array
564
"""
565
return self.MakeCdf().Sample(n)
566
567
def Mean(self):
568
"""Computes the mean of a PMF.
569
570
Returns:
571
float mean
572
"""
573
return sum(p * x for x, p in self.Items())
574
575
def Median(self):
576
"""Computes the median of a PMF.
577
578
Returns:
579
float median
580
"""
581
return self.MakeCdf().Percentile(50)
582
583
def Var(self, mu=None):
584
"""Computes the variance of a PMF.
585
586
mu: the point around which the variance is computed;
587
if omitted, computes the mean
588
589
returns: float variance
590
"""
591
if mu is None:
592
mu = self.Mean()
593
594
return sum(p * (x-mu)**2 for x, p in self.Items())
595
596
def Expect(self, func):
597
"""Computes the expectation of func(x).
598
599
Returns:
600
expectation
601
"""
602
        return sum(p * func(x) for x, p in self.Items())
603
604
def Std(self, mu=None):
605
"""Computes the standard deviation of a PMF.
606
607
mu: the point around which the variance is computed;
608
if omitted, computes the mean
609
610
returns: float standard deviation
611
"""
612
var = self.Var(mu)
613
return math.sqrt(var)
614
615
def Mode(self):
616
"""Returns the value with the highest probability.
617
618
Returns: float probability
619
"""
620
_, val = max((prob, val) for val, prob in self.Items())
621
return val
622
623
    # The mode of a posterior is the maximum a posteriori probability (MAP)
624
MAP = Mode
625
626
# If the distribution contains likelihoods only, the peak is the
627
# maximum likelihood estimator.
628
MaximumLikelihood = Mode
629
630
def CredibleInterval(self, percentage=90):
631
"""Computes the central credible interval.
632
633
If percentage=90, computes the 90% CI.
634
635
Args:
636
percentage: float between 0 and 100
637
638
Returns:
639
sequence of two floats, low and high
640
"""
641
cdf = self.MakeCdf()
642
return cdf.CredibleInterval(percentage)
643
644
def __add__(self, other):
645
"""Computes the Pmf of the sum of values drawn from self and other.
646
647
other: another Pmf or a scalar
648
649
returns: new Pmf
650
"""
651
try:
652
return self.AddPmf(other)
653
except AttributeError:
654
return self.AddConstant(other)
655
656
__radd__ = __add__
657
658
def AddPmf(self, other):
659
"""Computes the Pmf of the sum of values drawn from self and other.
660
661
other: another Pmf
662
663
returns: new Pmf
664
"""
665
pmf = Pmf()
666
for v1, p1 in self.Items():
667
for v2, p2 in other.Items():
668
pmf[v1 + v2] += p1 * p2
669
return pmf
670
671
def AddConstant(self, other):
672
"""Computes the Pmf of the sum a constant and values from self.
673
674
other: a number
675
676
returns: new Pmf
677
"""
678
if other == 0:
679
return self.Copy()
680
681
pmf = Pmf()
682
for v1, p1 in self.Items():
683
pmf.Set(v1 + other, p1)
684
return pmf
685
686
def __sub__(self, other):
687
"""Computes the Pmf of the diff of values drawn from self and other.
688
689
other: another Pmf
690
691
returns: new Pmf
692
"""
693
try:
694
return self.SubPmf(other)
695
except AttributeError:
696
return self.AddConstant(-other)
697
698
def SubPmf(self, other):
699
"""Computes the Pmf of the diff of values drawn from self and other.
700
701
other: another Pmf
702
703
returns: new Pmf
704
"""
705
pmf = Pmf()
706
for v1, p1 in self.Items():
707
for v2, p2 in other.Items():
708
pmf.Incr(v1 - v2, p1 * p2)
709
return pmf
710
711
def __mul__(self, other):
712
"""Computes the Pmf of the product of values drawn from self and other.
713
714
other: another Pmf
715
716
returns: new Pmf
717
"""
718
try:
719
return self.MulPmf(other)
720
except AttributeError:
721
return self.MulConstant(other)
722
723
def MulPmf(self, other):
724
"""Computes the Pmf of the diff of values drawn from self and other.
725
726
other: another Pmf
727
728
returns: new Pmf
729
"""
730
pmf = Pmf()
731
for v1, p1 in self.Items():
732
for v2, p2 in other.Items():
733
pmf.Incr(v1 * v2, p1 * p2)
734
return pmf
735
736
def MulConstant(self, other):
737
"""Computes the Pmf of the product of a constant and values from self.
738
739
other: a number
740
741
returns: new Pmf
742
"""
743
pmf = Pmf()
744
for v1, p1 in self.Items():
745
pmf.Set(v1 * other, p1)
746
return pmf
747
748
def __div__(self, other):
749
"""Computes the Pmf of the ratio of values drawn from self and other.
750
751
other: another Pmf
752
753
returns: new Pmf
754
"""
755
try:
756
return self.DivPmf(other)
757
except AttributeError:
758
return self.MulConstant(1/other)
759
760
__truediv__ = __div__
761
762
def DivPmf(self, other):
763
"""Computes the Pmf of the ratio of values drawn from self and other.
764
765
other: another Pmf
766
767
returns: new Pmf
768
"""
769
pmf = Pmf()
770
for v1, p1 in self.Items():
771
for v2, p2 in other.Items():
772
pmf.Incr(v1 / v2, p1 * p2)
773
return pmf
774
775
def Max(self, k):
776
"""Computes the CDF of the maximum of k selections from this dist.
777
778
k: int
779
780
returns: new Cdf
781
"""
782
cdf = self.MakeCdf()
783
cdf.ps **= k
784
return cdf
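
# Illustrative usage sketch (not part of the original module): Pmf arithmetic.
# __add__ convolves two Pmfs, so the distribution of the sum of two fair dice
# comes out directly. The _demo_* name is arbitrary.
def _demo_pmf_addition():
    die = Pmf(range(1, 7))               # normalized automatically
    two_dice = die + die                 # convolution via AddPmf
    assert abs(two_dice[7] - 6 / 36) < 1e-12
    assert abs(two_dice.Mean() - 7) < 1e-12
    return two_dice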
785
786
787
class Joint(Pmf):
788
"""Represents a joint distribution.
789
790
The values are sequences (usually tuples)
791
"""
792
793
def Marginal(self, i, label=None):
794
"""Gets the marginal distribution of the indicated variable.
795
796
i: index of the variable we want
797
798
Returns: Pmf
799
"""
800
pmf = Pmf(label=label)
801
for vs, prob in self.Items():
802
pmf.Incr(vs[i], prob)
803
return pmf
804
805
def Conditional(self, i, j, val, label=None):
806
"""Gets the conditional distribution of the indicated variable.
807
808
Distribution of vs[i], conditioned on vs[j] = val.
809
810
i: index of the variable we want
811
j: which variable is conditioned on
812
val: the value the jth variable has to have
813
814
Returns: Pmf
815
"""
816
pmf = Pmf(label=label)
817
for vs, prob in self.Items():
818
if vs[j] != val:
819
continue
820
pmf.Incr(vs[i], prob)
821
822
pmf.Normalize()
823
return pmf
824
825
def MaxLikeInterval(self, percentage=90):
826
"""Returns the maximum-likelihood credible interval.
827
828
If percentage=90, computes a 90% CI containing the values
829
with the highest likelihoods.
830
831
percentage: float between 0 and 100
832
833
Returns: list of values from the suite
834
"""
835
interval = []
836
total = 0
837
838
t = [(prob, val) for val, prob in self.Items()]
839
t.sort(reverse=True)
840
841
for prob, val in t:
842
interval.append(val)
843
total += prob
844
if total >= percentage / 100:
845
break
846
847
return interval
848
849
850
def MakeJoint(pmf1, pmf2):
851
"""Joint distribution of values from pmf1 and pmf2.
852
853
Assumes that the PMFs represent independent random variables.
854
855
Args:
856
pmf1: Pmf object
857
pmf2: Pmf object
858
859
Returns:
860
Joint pmf of value pairs
861
"""
862
joint = Joint()
863
for v1, p1 in pmf1.Items():
864
for v2, p2 in pmf2.Items():
865
joint.Set((v1, v2), p1 * p2)
866
return joint
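
# Illustrative usage sketch (not part of the original module): building a
# Joint from two independent Pmfs with MakeJoint and recovering a marginal.
# The _demo_* name is arbitrary.
def _demo_joint():
    pmf1 = Pmf([1, 2])                   # uniform over {1, 2}
    pmf2 = Pmf([10, 20, 30])             # uniform over {10, 20, 30}
    joint = MakeJoint(pmf1, pmf2)
    marginal = joint.Marginal(0)         # should recover pmf1
    assert abs(marginal[1] - 0.5) < 1e-12
    return joint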
867
868
869
def MakeHistFromList(t, label=None):
870
"""Makes a histogram from an unsorted sequence of values.
871
872
Args:
873
t: sequence of numbers
874
label: string label for this histogram
875
876
Returns:
877
Hist object
878
"""
879
return Hist(t, label=label)
880
881
882
def MakeHistFromDict(d, label=None):
883
"""Makes a histogram from a map from values to frequencies.
884
885
Args:
886
d: dictionary that maps values to frequencies
887
label: string label for this histogram
888
889
Returns:
890
Hist object
891
"""
892
return Hist(d, label)
893
894
895
def MakePmfFromList(t, label=None):
896
"""Makes a PMF from an unsorted sequence of values.
897
898
Args:
899
t: sequence of numbers
900
label: string label for this PMF
901
902
Returns:
903
Pmf object
904
"""
905
return Pmf(t, label=label)
906
907
908
def MakePmfFromDict(d, label=None):
909
"""Makes a PMF from a map from values to probabilities.
910
911
Args:
912
d: dictionary that maps values to probabilities
913
label: string label for this PMF
914
915
Returns:
916
Pmf object
917
"""
918
return Pmf(d, label=label)
919
920
921
def MakePmfFromItems(t, label=None):
922
"""Makes a PMF from a sequence of value-probability pairs
923
924
Args:
925
t: sequence of value-probability pairs
926
label: string label for this PMF
927
928
Returns:
929
Pmf object
930
"""
931
return Pmf(dict(t), label=label)
932
933
934
def MakePmfFromHist(hist, label=None):
935
"""Makes a normalized PMF from a Hist object.
936
937
Args:
938
hist: Hist object
939
label: string label
940
941
Returns:
942
Pmf object
943
"""
944
if label is None:
945
label = hist.label
946
947
return Pmf(hist, label=label)
948
949
950
def MakeMixture(metapmf, label='mix'):
951
"""Make a mixture distribution.
952
953
Args:
954
metapmf: Pmf that maps from Pmfs to probs.
955
label: string label for the new Pmf.
956
957
Returns: Pmf object.
958
"""
959
mix = Pmf(label=label)
960
for pmf, p1 in metapmf.Items():
961
for x, p2 in pmf.Items():
962
mix[x] += p1 * p2
963
return mix
964
965
966
def MakeUniformPmf(low, high, n):
967
"""Make a uniform Pmf.
968
969
low: lowest value (inclusive)
970
    high: highest value (inclusive)
971
n: number of values
972
"""
973
pmf = Pmf()
974
for x in np.linspace(low, high, n):
975
pmf.Set(x, 1)
976
pmf.Normalize()
977
return pmf
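
# Illustrative usage sketch (not part of the original module): MakeMixture
# takes a meta-Pmf whose values are themselves Pmfs; here two uniform Pmfs
# are mixed with weights 3:1. The _demo_* name is arbitrary.
def _demo_mixture():
    metapmf = Pmf()
    metapmf.Set(MakeUniformPmf(0, 1, 2), 0.75)    # values 0 and 1
    metapmf.Set(MakeUniformPmf(0, 3, 4), 0.25)    # values 0, 1, 2, 3
    mix = MakeMixture(metapmf)
    assert abs(mix[3] - 0.25 * 0.25) < 1e-12
    return mix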
978
979
980
class Cdf:
981
"""Represents a cumulative distribution function.
982
983
Attributes:
984
xs: sequence of values
985
ps: sequence of probabilities
986
label: string used as a graph label.
987
"""
988
def __init__(self, obj=None, ps=None, label=None):
989
"""Initializes.
990
991
If ps is provided, obj must be the corresponding list of values.
992
993
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
994
ps: list of cumulative probabilities
995
label: string label
996
"""
997
self.label = label if label is not None else DEFAULT_LABEL
998
999
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
1000
if not label:
1001
self.label = label if label is not None else obj.label
1002
1003
if obj is None:
1004
# caller does not provide obj, make an empty Cdf
1005
self.xs = np.asarray([])
1006
self.ps = np.asarray([])
1007
if ps is not None:
1008
logging.warning("Cdf: can't pass ps without also passing xs.")
1009
return
1010
else:
1011
# if the caller provides xs and ps, just store them
1012
if ps is not None:
1013
if isinstance(ps, str):
1014
logging.warning("Cdf: ps can't be a string")
1015
1016
self.xs = np.asarray(obj)
1017
self.ps = np.asarray(ps)
1018
return
1019
1020
# caller has provided just obj, not ps
1021
if isinstance(obj, Cdf):
1022
self.xs = copy.copy(obj.xs)
1023
self.ps = copy.copy(obj.ps)
1024
return
1025
1026
if isinstance(obj, _DictWrapper):
1027
dw = obj
1028
else:
1029
dw = Hist(obj)
1030
1031
if len(dw) == 0:
1032
self.xs = np.asarray([])
1033
self.ps = np.asarray([])
1034
return
1035
1036
xs, freqs = zip(*sorted(dw.Items()))
1037
self.xs = np.asarray(xs)
1038
        self.ps = np.cumsum(freqs, dtype=float)
1039
self.ps /= self.ps[-1]
1040
1041
def __str__(self):
1042
cls = self.__class__.__name__
1043
if self.label == DEFAULT_LABEL:
1044
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1045
else:
1046
return self.label
1047
1048
def __repr__(self):
1049
cls = self.__class__.__name__
1050
if self.label == DEFAULT_LABEL:
1051
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1052
else:
1053
return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps),
1054
repr(self.label))
1055
1056
def __len__(self):
1057
return len(self.xs)
1058
1059
def __getitem__(self, x):
1060
return self.Prob(x)
1061
1062
    def __setitem__(self, x, p):
        raise UnimplementedMethodException()

    def __delitem__(self, x):
        raise UnimplementedMethodException()
1067
1068
def __eq__(self, other):
1069
return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
1070
1071
def Print(self):
1072
"""Prints the values and freqs/probs in ascending order."""
1073
for val, prob in zip(self.xs, self.ps):
1074
print(val, prob)
1075
1076
def Copy(self, label=None):
1077
"""Returns a copy of this Cdf.
1078
1079
label: string label for the new Cdf
1080
"""
1081
if label is None:
1082
label = self.label
1083
return Cdf(list(self.xs), list(self.ps), label=label)
1084
1085
def MakePmf(self, label=None):
1086
"""Makes a Pmf."""
1087
if label is None:
1088
label = self.label
1089
return Pmf(self, label=label)
1090
1091
def Items(self):
1092
"""Returns a sorted sequence of (value, probability) pairs.
1093
1094
Note: in Python3, returns an iterator.
1095
"""
1096
a = self.ps
1097
b = np.roll(a, 1)
1098
b[0] = 0
1099
return zip(self.xs, a-b)
1100
1101
def Shift(self, term):
1102
"""Adds a term to the xs.
1103
1104
term: how much to add
1105
"""
1106
new = self.Copy()
1107
# don't use +=, or else an int array + float yields int array
1108
new.xs = new.xs + term
1109
return new
1110
1111
def Scale(self, factor):
1112
"""Multiplies the xs by a factor.
1113
1114
factor: what to multiply by
1115
"""
1116
new = self.Copy()
1117
# don't use *=, or else an int array * float yields int array
1118
new.xs = new.xs * factor
1119
return new
1120
1121
def Prob(self, x):
1122
"""Returns CDF(x), the probability that corresponds to value x.
1123
1124
Args:
1125
x: number
1126
1127
Returns:
1128
float probability
1129
"""
1130
if x < self.xs[0]:
1131
return 0
1132
index = bisect.bisect(self.xs, x)
1133
p = self.ps[index-1]
1134
return p
1135
1136
def Probs(self, xs):
1137
"""Gets probabilities for a sequence of values.
1138
1139
xs: any sequence that can be converted to NumPy array
1140
1141
returns: NumPy array of cumulative probabilities
1142
"""
1143
xs = np.asarray(xs)
1144
index = np.searchsorted(self.xs, xs, side='right')
1145
ps = self.ps[index-1]
1146
ps[xs < self.xs[0]] = 0
1147
return ps
1148
1149
ProbArray = Probs
1150
1151
def Value(self, p):
1152
"""Returns InverseCDF(p), the value that corresponds to probability p.
1153
1154
Args:
1155
p: number in the range [0, 1]
1156
1157
Returns:
1158
number value
1159
"""
1160
if p < 0 or p > 1:
1161
raise ValueError('Probability p must be in range [0, 1]')
1162
1163
index = bisect.bisect_left(self.ps, p)
1164
return self.xs[index]
1165
1166
def Values(self, ps=None):
1167
"""Returns InverseCDF(p), the value that corresponds to probability p.
1168
1169
If ps is not provided, returns all values.
1170
1171
Args:
1172
ps: NumPy array of numbers in the range [0, 1]
1173
1174
Returns:
1175
NumPy array of values
1176
"""
1177
if ps is None:
1178
return self.xs
1179
1180
ps = np.asarray(ps)
1181
if np.any(ps < 0) or np.any(ps > 1):
1182
raise ValueError('Probability p must be in range [0, 1]')
1183
1184
index = np.searchsorted(self.ps, ps, side='left')
1185
return self.xs[index]
1186
1187
ValueArray = Values
1188
1189
def Percentile(self, p):
1190
"""Returns the value that corresponds to percentile p.
1191
1192
Args:
1193
p: number in the range [0, 100]
1194
1195
Returns:
1196
number value
1197
"""
1198
return self.Value(p / 100)
1199
1200
def Percentiles(self, ps):
1201
"""Returns the value that corresponds to percentiles ps.
1202
1203
Args:
1204
ps: numbers in the range [0, 100]
1205
1206
Returns:
1207
array of values
1208
"""
1209
ps = np.asarray(ps)
1210
return self.Values(ps / 100)
1211
1212
def PercentileRank(self, x):
1213
"""Returns the percentile rank of the value x.
1214
1215
x: potential value in the CDF
1216
1217
returns: percentile rank in the range 0 to 100
1218
"""
1219
return self.Prob(x) * 100
1220
1221
def PercentileRanks(self, xs):
1222
"""Returns the percentile ranks of the values in xs.
1223
1224
xs: potential value in the CDF
1225
1226
returns: array of percentile ranks in the range 0 to 100
1227
"""
1228
        return self.Probs(xs) * 100
1229
1230
def Random(self):
1231
"""Chooses a random value from this distribution."""
1232
return self.Value(random.random())
1233
1234
def Sample(self, n):
1235
"""Generates a random sample from this distribution.
1236
1237
n: int length of the sample
1238
returns: NumPy array
1239
"""
1240
ps = np.random.random(n)
1241
return self.ValueArray(ps)
1242
1243
def Mean(self):
1244
"""Computes the mean of a CDF.
1245
1246
Returns:
1247
float mean
1248
"""
1249
old_p = 0
1250
total = 0
1251
for x, new_p in zip(self.xs, self.ps):
1252
p = new_p - old_p
1253
total += p * x
1254
old_p = new_p
1255
return total
1256
1257
def CredibleInterval(self, percentage=90):
1258
"""Computes the central credible interval.
1259
1260
If percentage=90, computes the 90% CI.
1261
1262
Args:
1263
percentage: float between 0 and 100
1264
1265
Returns:
1266
sequence of two floats, low and high
1267
"""
1268
prob = (1 - percentage / 100) / 2
1269
interval = self.Value(prob), self.Value(1 - prob)
1270
return interval
1271
1272
ConfidenceInterval = CredibleInterval
1273
1274
def _Round(self, multiplier=1000):
1275
"""
1276
An entry is added to the cdf only if the percentile differs
1277
from the previous value in a significant digit, where the number
1278
of significant digits is determined by multiplier. The
1279
default is 1000, which keeps log10(1000) = 3 significant digits.
1280
"""
1281
# TODO(write this method)
1282
raise UnimplementedMethodException()
1283
1284
def Render(self, **options):
1285
"""Generates a sequence of points suitable for plotting.
1286
1287
An empirical CDF is a step function; linear interpolation
1288
can be misleading.
1289
1290
Note: options are ignored
1291
1292
Returns:
1293
tuple of (xs, ps)
1294
"""
1295
def interleave(a, b):
1296
c = np.empty(a.shape[0] + b.shape[0])
1297
c[::2] = a
1298
c[1::2] = b
1299
return c
1300
1301
a = np.array(self.xs)
1302
xs = interleave(a, a)
1303
shift_ps = np.roll(self.ps, 1)
1304
shift_ps[0] = 0
1305
ps = interleave(shift_ps, self.ps)
1306
return xs, ps
1307
1308
def Max(self, k):
1309
"""Computes the CDF of the maximum of k selections from this dist.
1310
1311
k: int
1312
1313
returns: new Cdf
1314
"""
1315
cdf = self.Copy()
1316
cdf.ps **= k
1317
return cdf
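
# Illustrative usage sketch (not part of the original module): a Cdf built
# from raw values supports forward (Prob) and inverse (Value/Percentile)
# lookups. The _demo_* name is arbitrary.
def _demo_cdf():
    cdf = Cdf([1, 2, 2, 3, 5])
    assert abs(cdf.Prob(2) - 0.6) < 1e-12    # 3 of the 5 values are <= 2
    assert cdf.Value(0.5) == 2               # median
    assert cdf.Percentile(100) == 5
    return cdf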
1318
1319
1320
def MakeCdfFromItems(items, label=None):
1321
"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1322
1323
Args:
1324
items: unsorted sequence of (value, frequency) pairs
1325
label: string label for this CDF
1326
1327
Returns:
1328
cdf: list of (value, fraction) pairs
1329
"""
1330
return Cdf(dict(items), label=label)
1331
1332
1333
def MakeCdfFromDict(d, label=None):
1334
"""Makes a CDF from a dictionary that maps values to frequencies.
1335
1336
Args:
1337
d: dictionary that maps values to frequencies.
1338
label: string label for the data.
1339
1340
Returns:
1341
Cdf object
1342
"""
1343
return Cdf(d, label=label)
1344
1345
1346
def MakeCdfFromList(seq, label=None):
1347
"""Creates a CDF from an unsorted sequence.
1348
1349
Args:
1350
seq: unsorted sequence of sortable values
1351
label: string label for the cdf
1352
1353
Returns:
1354
Cdf object
1355
"""
1356
return Cdf(seq, label=label)
1357
1358
1359
def MakeCdfFromHist(hist, label=None):
1360
"""Makes a CDF from a Hist object.
1361
1362
Args:
1363
hist: Pmf.Hist object
1364
label: string label for the data.
1365
1366
Returns:
1367
Cdf object
1368
"""
1369
if label is None:
1370
label = hist.label
1371
1372
return Cdf(hist, label=label)
1373
1374
1375
def MakeCdfFromPmf(pmf, label=None):
1376
"""Makes a CDF from a Pmf object.
1377
1378
Args:
1379
pmf: Pmf.Pmf object
1380
label: string label for the data.
1381
1382
Returns:
1383
Cdf object
1384
"""
1385
if label is None:
1386
label = pmf.label
1387
1388
return Cdf(pmf, label=label)
1389
1390
1391
class UnimplementedMethodException(Exception):
1392
"""Exception if someone calls a method that should be overridden."""
1393
1394
1395
class Suite(Pmf):
1396
"""Represents a suite of hypotheses and their probabilities."""
1397
1398
def Update(self, data):
1399
"""Updates each hypothesis based on the data.
1400
1401
data: any representation of the data
1402
1403
returns: the normalizing constant
1404
"""
1405
for hypo in self.Values():
1406
like = self.Likelihood(data, hypo)
1407
self.Mult(hypo, like)
1408
return self.Normalize()
1409
1410
def LogUpdate(self, data):
1411
"""Updates a suite of hypotheses based on new data.
1412
1413
Modifies the suite directly; if you want to keep the original, make
1414
a copy.
1415
1416
Note: unlike Update, LogUpdate does not normalize.
1417
1418
Args:
1419
data: any representation of the data
1420
"""
1421
for hypo in self.Values():
1422
like = self.LogLikelihood(data, hypo)
1423
self.Incr(hypo, like)
1424
1425
def UpdateSet(self, dataset):
1426
"""Updates each hypothesis based on the dataset.
1427
1428
This is more efficient than calling Update repeatedly because
1429
it waits until the end to Normalize.
1430
1431
Modifies the suite directly; if you want to keep the original, make
1432
a copy.
1433
1434
dataset: a sequence of data
1435
1436
returns: the normalizing constant
1437
"""
1438
for data in dataset:
1439
for hypo in self.Values():
1440
like = self.Likelihood(data, hypo)
1441
self.Mult(hypo, like)
1442
return self.Normalize()
1443
1444
def LogUpdateSet(self, dataset):
1445
"""Updates each hypothesis based on the dataset.
1446
1447
Modifies the suite directly; if you want to keep the original, make
1448
a copy.
1449
1450
dataset: a sequence of data
1451
1452
returns: None
1453
"""
1454
for data in dataset:
1455
self.LogUpdate(data)
1456
1457
def Likelihood(self, data, hypo):
1458
"""Computes the likelihood of the data under the hypothesis.
1459
1460
hypo: some representation of the hypothesis
1461
data: some representation of the data
1462
"""
1463
raise UnimplementedMethodException()
1464
1465
def LogLikelihood(self, data, hypo):
1466
"""Computes the log likelihood of the data under the hypothesis.
1467
1468
hypo: some representation of the hypothesis
1469
data: some representation of the data
1470
"""
1471
raise UnimplementedMethodException()
1472
1473
def Print(self):
1474
"""Prints the hypotheses and their probabilities."""
1475
for hypo, prob in sorted(self.Items()):
1476
print(hypo, prob)
1477
1478
def MakeOdds(self):
1479
"""Transforms from probabilities to odds.
1480
1481
Values with prob=0 are removed.
1482
"""
1483
for hypo, prob in self.Items():
1484
if prob:
1485
self.Set(hypo, Odds(prob))
1486
else:
1487
self.Remove(hypo)
1488
1489
def MakeProbs(self):
1490
"""Transforms from odds to probabilities."""
1491
for hypo, odds in self.Items():
1492
self.Set(hypo, Probability(odds))
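
# Illustrative usage sketch (not part of the original module): the intended
# way to use Suite is to subclass it and define Likelihood. Here each
# hypothesis is an integer percentage for P(heads) of a biased coin. The
# _DemoCoin/_demo_* names are arbitrary.
class _DemoCoin(Suite):

    def Likelihood(self, data, hypo):
        # data is 'H' or 'T'; hypo is a percentage between 0 and 100
        p = hypo / 100
        return p if data == 'H' else 1 - p


def _demo_suite_update():
    suite = _DemoCoin(range(0, 101))     # uniform prior over 0..100
    for outcome in 'HHTH':
        suite.Update(outcome)            # multiply by likelihood, then normalize
    return suite.Mean()                  # posterior mean of P(heads), in percent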
1493
1494
1495
def MakeSuiteFromList(t, label=None):
1496
"""Makes a suite from an unsorted sequence of values.
1497
1498
Args:
1499
t: sequence of numbers
1500
label: string label for this suite
1501
1502
Returns:
1503
Suite object
1504
"""
1505
hist = MakeHistFromList(t, label=label)
1506
d = hist.GetDict()
1507
return MakeSuiteFromDict(d)
1508
1509
1510
def MakeSuiteFromHist(hist, label=None):
1511
"""Makes a normalized suite from a Hist object.
1512
1513
Args:
1514
hist: Hist object
1515
label: string label
1516
1517
Returns:
1518
Suite object
1519
"""
1520
if label is None:
1521
label = hist.label
1522
1523
# make a copy of the dictionary
1524
d = dict(hist.GetDict())
1525
return MakeSuiteFromDict(d, label)
1526
1527
1528
def MakeSuiteFromDict(d, label=None):
1529
"""Makes a suite from a map from values to probabilities.
1530
1531
Args:
1532
d: dictionary that maps values to probabilities
1533
label: string label for this suite
1534
1535
Returns:
1536
Suite object
1537
"""
1538
suite = Suite(label=label)
1539
suite.SetDict(d)
1540
suite.Normalize()
1541
return suite
1542
1543
1544
class Pdf(object):
1545
"""Represents a probability density function (PDF)."""
1546
1547
def Density(self, x):
1548
"""Evaluates this Pdf at x.
1549
1550
Returns: float or NumPy array of probability density
1551
"""
1552
raise UnimplementedMethodException()
1553
1554
def GetLinspace(self):
1555
"""Get a linspace for plotting.
1556
1557
Not all subclasses of Pdf implement this.
1558
1559
Returns: numpy array
1560
"""
1561
raise UnimplementedMethodException()
1562
1563
def MakePmf(self, **options):
1564
"""Makes a discrete version of this Pdf.
1565
1566
options can include
1567
label: string
1568
low: low end of range
1569
high: high end of range
1570
n: number of places to evaluate
1571
1572
Returns: new Pmf
1573
"""
1574
label = options.pop('label', '')
1575
xs, ds = self.Render(**options)
1576
return Pmf(dict(zip(xs, ds)), label=label)
1577
1578
def Render(self, **options):
1579
"""Generates a sequence of points suitable for plotting.
1580
1581
        If options includes low and high, it must also include n;
        in that case the density is evaluated at n locations between
        low and high, including both.

        If options includes xs, the density is evaluated at those locations.
1586
1587
Otherwise, self.GetLinspace is invoked to provide the locations.
1588
1589
Returns:
1590
tuple of (xs, densities)
1591
"""
1592
low, high = options.pop('low', None), options.pop('high', None)
1593
if low is not None and high is not None:
1594
n = options.pop('n', 101)
1595
xs = np.linspace(low, high, n)
1596
else:
1597
xs = options.pop('xs', None)
1598
if xs is None:
1599
xs = self.GetLinspace()
1600
1601
ds = self.Density(xs)
1602
return xs, ds
1603
1604
def Items(self):
1605
"""Generates a sequence of (value, probability) pairs.
1606
"""
1607
return zip(*self.Render())
1608
1609
1610
class NormalPdf(Pdf):
1611
"""Represents the PDF of a Normal distribution."""
1612
1613
def __init__(self, mu=0, sigma=1, label=None):
1614
"""Constructs a Normal Pdf with given mu and sigma.
1615
1616
mu: mean
1617
sigma: standard deviation
1618
label: string
1619
"""
1620
self.mu = mu
1621
self.sigma = sigma
1622
self.label = label if label is not None else '_nolegend_'
1623
1624
def __str__(self):
1625
return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)
1626
1627
def GetLinspace(self):
1628
"""Get a linspace for plotting.
1629
1630
Returns: numpy array
1631
"""
1632
low, high = self.mu-3*self.sigma, self.mu+3*self.sigma
1633
return np.linspace(low, high, 101)
1634
1635
def Density(self, xs):
1636
"""Evaluates this Pdf at xs.
1637
1638
xs: scalar or sequence of floats
1639
1640
returns: float or NumPy array of probability density
1641
"""
1642
return stats.norm.pdf(xs, self.mu, self.sigma)
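
# Illustrative usage sketch (not part of the original module): MakePmf
# discretizes a continuous Pdf by evaluating the density on a grid and
# normalizing. The _demo_* name is arbitrary.
def _demo_normal_pdf():
    pdf = NormalPdf(mu=0, sigma=1)
    pmf = pdf.MakePmf(low=-4, high=4, n=81)
    assert abs(pmf.Total() - 1) < 1e-9
    return pmf.Mean()                    # approximately 0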
1643
1644
1645
class ExponentialPdf(Pdf):
1646
"""Represents the PDF of an exponential distribution."""
1647
1648
def __init__(self, lam=1, label=None):
1649
"""Constructs an exponential Pdf with given parameter.
1650
1651
lam: rate parameter
1652
label: string
1653
"""
1654
self.lam = lam
1655
self.label = label if label is not None else '_nolegend_'
1656
1657
def __str__(self):
1658
return 'ExponentialPdf(%f)' % (self.lam)
1659
1660
def GetLinspace(self):
1661
"""Get a linspace for plotting.
1662
1663
Returns: numpy array
1664
"""
1665
low, high = 0, 5.0/self.lam
1666
return np.linspace(low, high, 101)
1667
1668
def Density(self, xs):
1669
"""Evaluates this Pdf at xs.
1670
1671
xs: scalar or sequence of floats
1672
1673
returns: float or NumPy array of probability density
1674
"""
1675
return stats.expon.pdf(xs, scale=1.0/self.lam)
1676
1677
1678
class EstimatedPdf(Pdf):
1679
"""Represents a PDF estimated by KDE."""
1680
1681
def __init__(self, sample, label=None):
1682
"""Estimates the density function based on a sample.
1683
1684
sample: sequence of data
1685
label: string
1686
"""
1687
self.label = label if label is not None else '_nolegend_'
1688
self.kde = stats.gaussian_kde(sample)
1689
low = min(sample)
1690
high = max(sample)
1691
self.linspace = np.linspace(low, high, 101)
1692
1693
def __str__(self):
1694
return 'EstimatedPdf(label=%s)' % str(self.label)
1695
1696
def GetLinspace(self):
1697
"""Get a linspace for plotting.
1698
1699
Returns: numpy array
1700
"""
1701
return self.linspace
1702
1703
def Density(self, xs):
1704
"""Evaluates this Pdf at xs.
1705
1706
returns: float or NumPy array of probability density
1707
"""
1708
return self.kde.evaluate(xs)
1709
1710
def Sample(self, n):
1711
"""Generates a random sample from the estimated Pdf.
1712
1713
n: size of sample
1714
"""
1715
# NOTE: we have to flatten because resample returns a 2-D
1716
# array for some reason.
1717
return self.kde.resample(n).flatten()
1718
1719
1720
def CredibleInterval(pmf, percentage=90):
1721
"""Computes a credible interval for a given distribution.
1722
1723
If percentage=90, computes the 90% CI.
1724
1725
Args:
1726
pmf: Pmf object representing a posterior distribution
1727
percentage: float between 0 and 100
1728
1729
Returns:
1730
sequence of two floats, low and high
1731
"""
1732
cdf = pmf.MakeCdf()
1733
prob = (1 - percentage / 100) / 2
1734
interval = cdf.Value(prob), cdf.Value(1 - prob)
1735
return interval
1736
1737
1738
def PmfProbLess(pmf1, pmf2):
1739
"""Probability that a value from pmf1 is less than a value from pmf2.
1740
1741
Args:
1742
pmf1: Pmf object
1743
pmf2: Pmf object
1744
1745
Returns:
1746
float probability
1747
"""
1748
total = 0
1749
for v1, p1 in pmf1.Items():
1750
for v2, p2 in pmf2.Items():
1751
if v1 < v2:
1752
total += p1 * p2
1753
return total
1754
1755
1756
def PmfProbGreater(pmf1, pmf2):
1757
"""Probability that a value from pmf1 is less than a value from pmf2.
1758
1759
Args:
1760
pmf1: Pmf object
1761
pmf2: Pmf object
1762
1763
Returns:
1764
float probability
1765
"""
1766
total = 0
1767
for v1, p1 in pmf1.Items():
1768
for v2, p2 in pmf2.Items():
1769
if v1 > v2:
1770
total += p1 * p2
1771
return total
1772
1773
1774
def PmfProbEqual(pmf1, pmf2):
1775
"""Probability that a value from pmf1 equals a value from pmf2.
1776
1777
Args:
1778
pmf1: Pmf object
1779
pmf2: Pmf object
1780
1781
Returns:
1782
float probability
1783
"""
1784
total = 0
1785
for v1, p1 in pmf1.Items():
1786
for v2, p2 in pmf2.Items():
1787
if v1 == v2:
1788
total += p1 * p2
1789
return total
1790
1791
1792
def RandomSum(dists):
1793
"""Chooses a random value from each dist and returns the sum.
1794
1795
dists: sequence of Pmf or Cdf objects
1796
1797
returns: numerical sum
1798
"""
1799
total = sum(dist.Random() for dist in dists)
1800
return total
1801
1802
1803
def SampleSum(dists, n):
1804
"""Draws a sample of sums from a list of distributions.
1805
1806
dists: sequence of Pmf or Cdf objects
1807
n: sample size
1808
1809
returns: new Pmf of sums
1810
"""
1811
pmf = Pmf(RandomSum(dists) for i in range(n))
1812
return pmf
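
# Illustrative usage sketch (not part of the original module): SampleSum draws
# one value from each distribution per trial and accumulates an empirical Pmf
# of the sums. The seed and the _demo_* name are arbitrary.
def _demo_sample_sum():
    RandomSeed(17)                       # for repeatability
    die = Pmf(range(1, 7))
    sums = SampleSum([die, die, die], n=1000)
    return sums.Mean()                   # close to 10.5 for three fair dice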
1813
1814
1815
def EvalNormalPdf(x, mu, sigma):
1816
"""Computes the unnormalized PDF of the normal distribution.
1817
1818
x: value
1819
mu: mean
1820
sigma: standard deviation
1821
1822
returns: float probability density
1823
"""
1824
return stats.norm.pdf(x, mu, sigma)
1825
1826
1827
def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
1828
"""Makes a PMF discrete approx to a Normal distribution.
1829
1830
mu: float mean
1831
sigma: float standard deviation
1832
num_sigmas: how many sigmas to extend in each direction
1833
n: number of values in the Pmf
1834
1835
returns: normalized Pmf
1836
"""
1837
pmf = Pmf()
1838
low = mu - num_sigmas * sigma
1839
high = mu + num_sigmas * sigma
1840
1841
for x in np.linspace(low, high, n):
1842
p = EvalNormalPdf(x, mu, sigma)
1843
pmf.Set(x, p)
1844
pmf.Normalize()
1845
return pmf
1846
1847
1848
def EvalBinomialPmf(k, n, p):
1849
"""Evaluates the binomial PMF.
1850
1851
    Returns the probability of k successes in n trials with probability p.
1852
"""
1853
return stats.binom.pmf(k, n, p)
1854
1855
1856
def MakeBinomialPmf(n, p):
1857
"""Evaluates the binomial PMF.
1858
1859
Returns the distribution of successes in n trials with probability p.
1860
"""
1861
pmf = Pmf()
1862
for k in range(n+1):
1863
pmf[k] = stats.binom.pmf(k, n, p)
1864
return pmf
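
# Illustrative usage sketch (not part of the original module): MakeBinomialPmf
# returns a full Pmf over 0..n successes, consistent with EvalBinomialPmf.
# The _demo_* name is arbitrary.
def _demo_binomial():
    pmf = MakeBinomialPmf(n=10, p=0.5)
    assert abs(pmf.Total() - 1) < 1e-9
    assert abs(pmf[5] - EvalBinomialPmf(5, 10, 0.5)) < 1e-12
    return pmf.MaximumLikelihood()       # 5, the most likely number of successes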
1865
1866
1867
def EvalGammaPdf(x, a):
1868
"""Computes the Gamma PDF.
1869
1870
x: where to evaluate the PDF
1871
a: parameter of the gamma distribution
1872
1873
returns: float probability
1874
"""
1875
return x**(a-1) * np.exp(-x) / gamma(a)
1876
1877
1878
def MakeGammaPmf(xs, a):
1879
"""Makes a PMF discrete approx to a Gamma distribution.
1880
1881
    xs: values where the Pmf is evaluated
    a: shape parameter of the gamma distribution
1883
1884
returns: normalized Pmf
1885
"""
1886
xs = np.asarray(xs)
1887
ps = EvalGammaPdf(xs, a)
1888
pmf = Pmf(dict(zip(xs, ps)))
1889
pmf.Normalize()
1890
return pmf
1891
1892
1893
def EvalGeometricPmf(k, p, loc=0):
1894
"""Evaluates the geometric PMF.
1895
1896
With loc=0: Probability of `k` trials to get one success.
1897
With loc=-1: Probability of `k` trials before first success.
1898
1899
k: number of trials
1900
p: probability of success on each trial
1901
"""
1902
return stats.geom.pmf(k, p, loc=loc)
1903
1904
1905
def MakeGeometricPmf(p, loc=0, high=10):
1906
"""Evaluates the binomial PMF.
1907
1908
With loc=0: PMF of trials to get one success.
1909
With loc=-1: PMF of trials before first success.
1910
1911
p: probability of success
1912
high: upper bound where PMF is truncated
1913
"""
1914
pmf = Pmf()
1915
for k in range(high):
1916
pmf[k] = stats.geom.pmf(k, p, loc=loc)
1917
pmf.Normalize()
1918
return pmf
1919
1920
1921
def EvalHypergeomPmf(k, N, K, n):
1922
"""Evaluates the hypergeometric PMF.
1923
1924
    Returns the probability of k successes in n trials from a population
1925
N with K successes in it.
1926
"""
1927
return stats.hypergeom.pmf(k, N, K, n)
1928
1929
1930
def EvalPoissonPmf(k, lam):
1931
"""Computes the Poisson PMF.
1932
1933
k: number of events
1934
lam: parameter lambda in events per unit time
1935
1936
returns: float probability
1937
"""
1938
return stats.poisson.pmf(k, lam)
1939
1940
1941
def MakePoissonPmf(lam, high, step=1):
1942
"""Makes a PMF discrete approx to a Poisson distribution.
1943
1944
lam: parameter lambda in events per unit time
1945
high: upper bound of the Pmf
1946
1947
returns: normalized Pmf
1948
"""
1949
pmf = Pmf()
1950
for k in range(0, high + 1, step):
1951
p = stats.poisson.pmf(k, lam)
1952
pmf.Set(k, p)
1953
pmf.Normalize()
1954
return pmf
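
# Illustrative usage sketch (not part of the original module): MakePoissonPmf
# truncates the distribution at `high`, so `high` should be large enough that
# the discarded tail is negligible. The _demo_* name is arbitrary.
def _demo_poisson():
    pmf = MakePoissonPmf(lam=3, high=20)
    assert abs(pmf.Mean() - 3) < 1e-3    # mean ~ lam once the tail is tiny
    return pmf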
1955
1956
1957
def EvalExponentialPdf(x, lam):
1958
"""Computes the exponential PDF.
1959
1960
x: value
1961
lam: parameter lambda in events per unit time
1962
1963
returns: float probability density
1964
"""
1965
return lam * math.exp(-lam * x)
1966
1967
1968
def EvalExponentialCdf(x, lam):
1969
"""Evaluates CDF of the exponential distribution with parameter lam."""
1970
return 1 - math.exp(-lam * x)
1971
1972
1973
def MakeExponentialPmf(lam, high, n=200):
1974
"""Makes a PMF discrete approx to an exponential distribution.
1975
1976
lam: parameter lambda in events per unit time
1977
high: upper bound
1978
n: number of values in the Pmf
1979
1980
returns: normalized Pmf
1981
"""
1982
pmf = Pmf()
1983
for x in np.linspace(0, high, n):
1984
p = EvalExponentialPdf(x, lam)
1985
pmf.Set(x, p)
1986
pmf.Normalize()
1987
return pmf
1988
1989
1990
def EvalWeibullPdf(x, lam, k):
1991
"""Computes the Weibull PDF.
1992
1993
x: value
1994
lam: parameter lambda in events per unit time
1995
k: parameter
1996
1997
returns: float probability density
1998
"""
1999
arg = (x / lam)
2000
return k / lam * arg**(k-1) * np.exp(-arg**k)
2001
2002
2003
def EvalWeibullCdf(x, lam, k):
2004
"""Evaluates CDF of the Weibull distribution."""
2005
arg = (x / lam)
2006
return 1 - np.exp(-arg**k)
2007
2008
2009
def MakeWeibullPmf(lam, k, high, n=200):
2010
"""Makes a PMF discrete approx to a Weibull distribution.
2011
2012
lam: parameter lambda in events per unit time
2013
k: parameter
2014
high: upper bound
2015
n: number of values in the Pmf
2016
2017
returns: normalized Pmf
2018
"""
2019
xs = np.linspace(0, high, n)
2020
ps = EvalWeibullPdf(xs, lam, k)
2021
return Pmf(dict(zip(xs, ps)))
2022
2023
2024
def EvalParetoPdf(x, xm, alpha):
2025
"""Computes the Pareto.
2026
2027
xm: minimum value (scale parameter)
2028
alpha: shape parameter
2029
2030
returns: float probability density
2031
"""
2032
return stats.pareto.pdf(x, alpha, scale=xm)
2033
2034
2035
def MakeParetoPmf(xm, alpha, high, num=101):
2036
"""Makes a PMF discrete approx to a Pareto distribution.
2037
2038
xm: minimum value (scale parameter)
2039
alpha: shape parameter
2040
high: upper bound value
2041
num: number of values
2042
2043
returns: normalized Pmf
2044
"""
2045
xs = np.linspace(xm, high, num)
2046
ps = stats.pareto.pdf(xs, alpha, scale=xm)
2047
pmf = Pmf(dict(zip(xs, ps)))
2048
return pmf
2049
2050
def StandardNormalCdf(x):
2051
"""Evaluates the CDF of the standard Normal distribution.
2052
2053
See http://en.wikipedia.org/wiki/Normal_distribution
2054
#Cumulative_distribution_function
2055
2056
Args:
2057
x: float
2058
2059
Returns:
2060
float
2061
"""
2062
return (math.erf(x / ROOT2) + 1) / 2
2063
2064
2065
def EvalNormalCdf(x, mu=0, sigma=1):
2066
"""Evaluates the CDF of the normal distribution.
2067
2068
Args:
2069
x: float
2070
2071
mu: mean parameter
2072
2073
sigma: standard deviation parameter
2074
2075
Returns:
2076
float
2077
"""
2078
return stats.norm.cdf(x, loc=mu, scale=sigma)
2079
2080
2081
def EvalNormalCdfInverse(p, mu=0, sigma=1):
2082
"""Evaluates the inverse CDF of the normal distribution.
2083
2084
See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function
2085
2086
Args:
2087
p: float
2088
2089
mu: mean parameter
2090
2091
sigma: standard deviation parameter
2092
2093
Returns:
2094
float
2095
"""
2096
return stats.norm.ppf(p, loc=mu, scale=sigma)
2097
2098
2099
def EvalLognormalCdf(x, mu=0, sigma=1):
2100
"""Evaluates the CDF of the lognormal distribution.
2101
2102
x: float or sequence
2103
mu: mean parameter
2104
sigma: standard deviation parameter
2105
2106
Returns: float or sequence
2107
"""
2108
return stats.lognorm.cdf(x, loc=mu, scale=sigma)
2109
2110
2111
def RenderExpoCdf(lam, low, high, n=101):
2112
"""Generates sequences of xs and ps for an exponential CDF.
2113
2114
lam: parameter
2115
low: float
2116
high: float
2117
n: number of points to render
2118
2119
returns: numpy arrays (xs, ps)
2120
"""
2121
xs = np.linspace(low, high, n)
2122
ps = 1 - np.exp(-lam * xs)
2123
#ps = stats.expon.cdf(xs, scale=1.0/lam)
2124
return xs, ps
2125
2126
2127
def RenderNormalCdf(mu, sigma, low, high, n=101):
2128
"""Generates sequences of xs and ps for a Normal CDF.
2129
2130
mu: parameter
2131
sigma: parameter
2132
low: float
2133
high: float
2134
n: number of points to render
2135
2136
returns: numpy arrays (xs, ps)
2137
"""
2138
xs = np.linspace(low, high, n)
2139
ps = stats.norm.cdf(xs, mu, sigma)
2140
return xs, ps
2141
2142
2143
def RenderParetoCdf(xmin, alpha, low, high, n=50):
2144
"""Generates sequences of xs and ps for a Pareto CDF.
2145
2146
xmin: parameter
2147
alpha: parameter
2148
low: float
2149
high: float
2150
n: number of points to render
2151
2152
returns: numpy arrays (xs, ps)
2153
"""
2154
if low < xmin:
2155
low = xmin
2156
xs = np.linspace(low, high, n)
2157
ps = 1 - (xs / xmin) ** -alpha
2158
#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
2159
return xs, ps
2160
2161
2162
class Beta:
2163
"""Represents a Beta distribution.
2164
2165
See http://en.wikipedia.org/wiki/Beta_distribution
2166
"""
2167
def __init__(self, alpha=1, beta=1, label=None):
2168
"""Initializes a Beta distribution."""
2169
self.alpha = alpha
2170
self.beta = beta
2171
self.label = label if label is not None else '_nolegend_'
2172
2173
def Update(self, data):
2174
"""Updates a Beta distribution.
2175
2176
data: pair of int (heads, tails)
2177
"""
2178
heads, tails = data
2179
self.alpha += heads
2180
self.beta += tails
2181
2182
def Mean(self):
2183
"""Computes the mean of this distribution."""
2184
return self.alpha / (self.alpha + self.beta)
2185
2186
def MAP(self):
2187
"""Computes the value with maximum a posteori probability."""
2188
a = self.alpha - 1
2189
b = self.beta - 1
2190
return a / (a + b)
2191
2192
def Random(self):
2193
"""Generates a random variate from this distribution."""
2194
return random.betavariate(self.alpha, self.beta)
2195
2196
def Sample(self, n):
2197
"""Generates a random sample from this distribution.
2198
2199
n: int sample size
2200
"""
2201
size = n,
2202
return np.random.beta(self.alpha, self.beta, size)
2203
2204
def EvalPdf(self, x):
2205
"""Evaluates the PDF at x."""
2206
return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
2207
2208
def MakePmf(self, steps=101, label=None):
2209
"""Returns a Pmf of this distribution.
2210
2211
Note: Normally, we just evaluate the PDF at a sequence
2212
of points and treat the probability density as a probability
2213
mass.
2214
2215
But if alpha or beta is less than one, we have to be
2216
more careful because the PDF goes to infinity at x=0
2217
and x=1. In that case we evaluate the CDF and compute
2218
differences.
2219
2220
The result is a little funny, because the values at 0 and 1
2221
are not symmetric. Nevertheless, it is a reasonable discrete
2222
model of the continuous distribution, and behaves well as
2223
the number of values increases.
2224
"""
2225
if label is None and self.label is not None:
2226
label = self.label
2227
2228
if self.alpha < 1 or self.beta < 1:
2229
cdf = self.MakeCdf()
2230
pmf = cdf.MakePmf()
2231
return pmf
2232
2233
xs = [i / (steps - 1.0) for i in range(steps)]
2234
probs = [self.EvalPdf(x) for x in xs]
2235
pmf = Pmf(dict(zip(xs, probs)), label=label)
2236
return pmf
2237
2238
def MakeCdf(self, steps=101):
2239
"""Returns the CDF of this distribution."""
2240
xs = [i / (steps - 1.0) for i in range(steps)]
2241
ps = special.betainc(self.alpha, self.beta, xs)
2242
cdf = Cdf(xs, ps)
2243
return cdf
2244
2245
def Percentile(self, ps):
2246
"""Returns the given percentiles from this distribution.
2247
2248
ps: scalar, array, or list of [0-100]
2249
"""
2250
ps = np.asarray(ps) / 100
2251
xs = special.betaincinv(self.alpha, self.beta, ps)
2252
return xs
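
# Illustrative usage sketch (not part of the original module): conjugate
# updating with the Beta class. Starting from a uniform Beta(1, 1) prior,
# observing 140 heads and 110 tails gives a Beta(141, 111) posterior. The
# _demo_* name is arbitrary.
def _demo_beta_update():
    beta = Beta(1, 1)
    beta.Update((140, 110))
    assert abs(beta.Mean() - 141 / 252) < 1e-12
    return beta.MakeCdf().Percentile(50)     # posterior median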
2253
2254
2255
class Dirichlet(object):
2256
"""Represents a Dirichlet distribution.
2257
2258
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2259
"""
2260
2261
def __init__(self, n, conc=1, label=None):
2262
"""Initializes a Dirichlet distribution.
2263
2264
n: number of dimensions
2265
conc: concentration parameter (smaller yields more concentration)
2266
label: string label
2267
"""
2268
if n < 2:
2269
raise ValueError('A Dirichlet distribution with '
2270
'n<2 makes no sense')
2271
2272
self.n = n
2273
        self.params = np.ones(n, dtype=float) * conc
2274
self.label = label if label is not None else '_nolegend_'
2275
2276
def Update(self, data):
2277
"""Updates a Dirichlet distribution.
2278
2279
data: sequence of observations, in order corresponding to params
2280
"""
2281
m = len(data)
2282
self.params[:m] += data
2283
2284
def Random(self):
2285
"""Generates a random variate from this distribution.
2286
2287
Returns: normalized vector of fractions
2288
"""
2289
p = np.random.gamma(self.params)
2290
return p / p.sum()
2291
2292
def Likelihood(self, data):
2293
"""Computes the likelihood of the data.
2294
2295
Selects a random vector of probabilities from this distribution.
2296
2297
Returns: float probability
2298
"""
2299
m = len(data)
2300
if self.n < m:
2301
return 0
2302
2303
x = data
2304
p = self.Random()
2305
q = p[:m] ** x
2306
return q.prod()
2307
2308
def LogLikelihood(self, data):
2309
"""Computes the log likelihood of the data.
2310
2311
Selects a random vector of probabilities from this distribution.
2312
2313
Returns: float log probability
2314
"""
2315
m = len(data)
2316
if self.n < m:
2317
return float('-inf')
2318
2319
x = self.Random()
2320
y = np.log(x[:m]) * data
2321
return y.sum()
2322
2323
def MarginalBeta(self, i):
2324
"""Computes the marginal distribution of the ith element.
2325
2326
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2327
#Marginal_distributions
2328
2329
i: int
2330
2331
Returns: Beta object
2332
"""
2333
alpha0 = self.params.sum()
2334
alpha = self.params[i]
2335
return Beta(alpha, alpha0 - alpha)
2336
2337
def PredictivePmf(self, xs, label=None):
2338
"""Makes a predictive distribution.
2339
2340
xs: values to go into the Pmf
2341
2342
Returns: Pmf that maps from x to the mean prevalence of x
2343
"""
2344
alpha0 = self.params.sum()
2345
ps = self.params / alpha0
2346
return Pmf(zip(xs, ps), label=label)
2347
2348
2349
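# Illustrative sketch (not part of the original module): a Dirichlet prior
# over three categories updated with observed counts.  It uses Update,
# MarginalBeta, and Random from the class above; Beta.Mean is assumed to be
# defined earlier in this file.
def _demo_dirichlet_usage():
    """Sketch: three-sided die observed to land (3, 2, 1) times per side."""
    dirichlet = Dirichlet(3)
    dirichlet.Update((3, 2, 1))
    marginal = dirichlet.MarginalBeta(0)    # Beta marginal for side 0
    print('mean prevalence of side 0', marginal.Mean())
    p = dirichlet.Random()                  # one random vector of category probabilities
    return p
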
def BinomialCoef(n, k):
    """Computes the binomial coefficient "n choose k".

    n: number of trials
    k: number of successes

    Returns: float
    """
    return special.comb(n, k)


def LogBinomialCoef(n, k):
    """Computes the log of the binomial coefficient.

    http://math.stackexchange.com/questions/64716/
    approximating-the-logarithm-of-the-binomial-coefficient

    n: number of trials
    k: number of successes

    Returns: float
    """
    return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)

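# Illustrative sketch (not part of the original module): LogBinomialCoef uses
# a crude entropy-style approximation, so it only roughly tracks the exact
# value log(BinomialCoef(n, k)); the gap comes from the dropped lower-order
# (Stirling) terms.
def _demo_log_binomial_coef():
    """Sketch: compare the approximation with the exact log-coefficient."""
    n, k = 100, 10
    exact = math.log(BinomialCoef(n, k))
    approx = LogBinomialCoef(n, k)
    print('exact %.2f, approx %.2f' % (exact, approx))
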
def NormalProbability(ys, jitter=0):
    """Generates data for a normal probability plot.

    ys: sequence of values
    jitter: float magnitude of jitter added to the ys

    returns: numpy arrays xs, ys
    """
    n = len(ys)
    xs = np.random.normal(0, 1, n)
    xs.sort()

    if jitter:
        ys = Jitter(ys, jitter)
    else:
        ys = np.array(ys)
    ys.sort()

    return xs, ys


def Jitter(values, jitter=0.5):
    """Jitters the values by adding Gaussian noise with standard deviation jitter.

    values: sequence
    jitter: scalar magnitude of jitter

    returns: new numpy array
    """
    n = len(values)
    return np.random.normal(0, jitter, n) + values


def NormalProbabilityPlot(sample, fit_color='0.8', **options):
    """Makes a normal probability plot with a fitted line.

    sample: sequence of numbers
    fit_color: color string for the fitted line
    options: passed along to Plot
    """
    xs, ys = NormalProbability(sample)
    mean, var = MeanVar(sample)
    std = math.sqrt(var)

    fit = FitLine(xs, mean, std)
    thinkplot.Plot(*fit, color=fit_color, label='model')

    xs, ys = NormalProbability(sample)
    thinkplot.Plot(xs, ys, **options)

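# Illustrative sketch (not part of the original module): a normal probability
# plot compares sorted data against sorted standard-normal random values; if
# the data are roughly normal, the points fall near the fitted line.  It
# assumes thinkplot.Show exists alongside the thinkplot functions used above.
def _demo_normal_probability_plot():
    """Sketch: plot a normal sample against standard-normal values."""
    sample = np.random.normal(10, 2, size=500)
    NormalProbabilityPlot(sample, label='sample')
    thinkplot.Show(xlabel='standard normal values', ylabel='sample values')
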
def Mean(xs):
    """Computes mean.

    xs: sequence of values

    returns: float mean
    """
    return np.mean(xs)


def Var(xs, mu=None, ddof=0):
    """Computes variance.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    xs = np.asarray(xs)

    if mu is None:
        mu = xs.mean()

    ds = xs - mu
    return np.dot(ds, ds) / (len(xs) - ddof)


def Std(xs, mu=None, ddof=0):
    """Computes standard deviation.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    var = Var(xs, mu, ddof)
    return math.sqrt(var)


def MeanVar(xs, ddof=0):
    """Computes mean and variance.

    Based on http://stackoverflow.com/questions/19391149/
    numpy-mean-and-variance-from-single-function

    xs: sequence of values
    ddof: delta degrees of freedom

    returns: pair of float, mean and var
    """
    xs = np.asarray(xs)
    mean = xs.mean()
    s2 = Var(xs, mean, ddof)
    return mean, s2

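# Illustrative sketch (not part of the original module): ddof ("delta degrees
# of freedom") switches between the population variance (ddof=0) and the
# unbiased sample variance (ddof=1), matching NumPy's own convention.
def _demo_var_ddof():
    """Sketch: compare ddof=0 and ddof=1 against np.var."""
    xs = [1, 2, 3, 4, 5]
    print(Var(xs), np.var(xs))                  # population variance: 2.0
    print(Var(xs, ddof=1), np.var(xs, ddof=1))  # sample variance: 2.5
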
def Trim(t, p=0.01):
    """Trims the largest and smallest elements of t.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        sequence of values
    """
    n = int(p * len(t))
    t = sorted(t)[n:-n]
    return t


def TrimmedMean(t, p=0.01):
    """Computes the trimmed mean of a sequence of numbers.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        float
    """
    t = Trim(t, p)
    return Mean(t)


def TrimmedMeanVar(t, p=0.01):
    """Computes the trimmed mean and variance of a sequence of numbers.

    Side effect: sorts the list.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        pair of float, (mean, var)
    """
    t = Trim(t, p)
    mu, var = MeanVar(t)
    return mu, var


def CohenEffectSize(group1, group2):
    """Computes Cohen's d.

    group1: Series or NumPy array
    group2: Series or NumPy array

    returns: float
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / math.sqrt(pooled_var)
    return d

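# Illustrative sketch (not part of the original module): Cohen's d expresses
# the difference between two group means in units of their pooled standard
# deviation, so groups whose means are one standard deviation apart give
# |d| near 1.
def _demo_cohen_effect_size():
    """Sketch: two normal samples whose means differ by one standard deviation."""
    group1 = np.random.normal(0, 1, size=10000)
    group2 = np.random.normal(1, 1, size=10000)
    print('Cohen d, approximately -1:', CohenEffectSize(group1, group2))
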
def Cov(xs, ys, meanx=None, meany=None):
    """Computes Cov(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values
        meanx: optional float mean of xs
        meany: optional float mean of ys

    Returns:
        Cov(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    if meanx is None:
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov


def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    meanx, varx = MeanVar(xs)
    meany, vary = MeanVar(ys)

    corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)

    return corr


def SerialCorr(series, lag=1):
    """Computes the serial correlation of a series.

    series: Series
    lag: integer number of intervals to shift

    returns: float correlation
    """
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = Corr(xs, ys)
    return corr


def SpearmanCorr(xs, ys):
    """Computes Spearman's rank correlation.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        float Spearman's correlation
    """
    xranks = pandas.Series(xs).rank()
    yranks = pandas.Series(ys).rank()
    return Corr(xranks, yranks)


def MapToRanks(t):
    """Returns a list of ranks corresponding to the elements in t.

    Args:
        t: sequence of numbers

    Returns:
        list of integer ranks, starting at 1
    """
    # pair up each value with its index
    pairs = enumerate(t)

    # sort by value
    sorted_pairs = sorted(pairs, key=itemgetter(1))

    # pair up each pair with its rank
    ranked = enumerate(sorted_pairs)

    # sort by index
    resorted = sorted(ranked, key=lambda trip: trip[1][0])

    # extract the ranks
    ranks = [trip[0]+1 for trip in resorted]
    return ranks

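# Illustrative sketch (not part of the original module): Pearson correlation
# (Corr) measures linear association, while Spearman correlation works on
# ranks, so it is less sensitive to outliers and to monotonic nonlinearity.
def _demo_correlation():
    """Sketch: compare Pearson and Spearman correlation on a nonlinear relation."""
    xs = np.random.uniform(1, 10, size=1000)
    ys = xs ** 3 + np.random.normal(0, 1, size=1000)   # monotonic but nonlinear
    print('Pearson ', Corr(xs, ys))
    print('Spearman', SpearmanCorr(xs, ys))
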
def LeastSquares(xs, ys):
    """Computes a linear least squares fit for ys as a function of xs.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    meanx, varx = MeanVar(xs)
    meany = Mean(ys)

    slope = Cov(xs, ys, meanx, meany) / varx
    inter = meany - slope * meanx

    return inter, slope


def FitLine(xs, inter, slope):
    """Fits a line to the given data.

    xs: sequence of x
    inter: float intercept
    slope: float slope

    returns: tuple of numpy arrays (sorted xs, fit ys)
    """
    fit_xs = np.sort(xs)
    fit_ys = inter + slope * fit_xs
    return fit_xs, fit_ys


def Residuals(xs, ys, inter, slope):
    """Computes residuals for a linear fit with parameters inter and slope.

    Args:
        xs: independent variable
        ys: dependent variable
        inter: float intercept
        slope: float slope

    Returns:
        list of residuals
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    res = ys - (inter + slope * xs)
    return res


def CoefDetermination(ys, res):
    """Computes the coefficient of determination (R^2) for given residuals.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    return 1 - Var(res) / Var(ys)

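# Illustrative sketch (not part of the original module): the usual workflow
# with the functions above is fit, then residuals, then R^2.
def _demo_least_squares():
    """Sketch: fit y = 2 + 3x plus noise and report the fit quality."""
    xs = np.linspace(0, 10, 200)
    ys = 2 + 3 * xs + np.random.normal(0, 1, size=len(xs))
    inter, slope = LeastSquares(xs, ys)
    res = Residuals(xs, ys, inter, slope)
    r2 = CoefDetermination(ys, res)
    print('intercept %.2f, slope %.2f, R^2 %.3f' % (inter, slope, r2))
    return FitLine(xs, inter, slope)    # sorted xs and fitted ys, ready to plot
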
def CorrelatedGenerator(rho):
    """Generates standard normal variates with serial correlation.

    rho: target coefficient of correlation

    Returns: iterable
    """
    x = random.gauss(0, 1)
    yield x

    sigma = math.sqrt(1 - rho**2)
    while True:
        x = random.gauss(x * rho, sigma)
        yield x


def CorrelatedNormalGenerator(mu, sigma, rho):
    """Generates normal variates with serial correlation.

    mu: mean of variate
    sigma: standard deviation of variate
    rho: target coefficient of correlation

    Returns: iterable
    """
    for x in CorrelatedGenerator(rho):
        yield x * sigma + mu

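# Illustrative sketch (not part of the original module): each value from
# CorrelatedGenerator mixes the previous value with fresh Gaussian noise, so
# the lag-1 serial correlation of a long run should come out close to rho.
def _demo_correlated_generator(rho=0.8, n=10000):
    """Sketch: estimate the serial correlation of the generated sequence."""
    import itertools
    xs = list(itertools.islice(CorrelatedGenerator(rho), n))
    print('target %.2f, estimate %.2f' % (rho, Corr(xs[:-1], xs[1:])))
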
def RawMoment(xs, k):
    """Computes the kth raw moment of xs.
    """
    return sum(x**k for x in xs) / len(xs)


def CentralMoment(xs, k):
    """Computes the kth central moment of xs.
    """
    mean = RawMoment(xs, 1)
    return sum((x - mean)**k for x in xs) / len(xs)


def StandardizedMoment(xs, k):
    """Computes the kth standardized moment of xs.
    """
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    return CentralMoment(xs, k) / std**k


def Skewness(xs):
    """Computes skewness.
    """
    return StandardizedMoment(xs, 3)


def Median(xs):
    """Computes the median (50th percentile) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: float
    """
    cdf = Cdf(xs)
    return cdf.Value(0.5)


def IQR(xs):
    """Computes the interquartile range (25th and 75th percentiles) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: pair of floats
    """
    cdf = Cdf(xs)
    return cdf.Value(0.25), cdf.Value(0.75)


def PearsonMedianSkewness(xs):
    """Computes the Pearson median skewness.
    """
    median = Median(xs)
    mean = RawMoment(xs, 1)
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    gp = 3 * (mean - median) / std
    return gp

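# Illustrative sketch (not part of the original module): both statistics
# measure asymmetry; for a right-skewed sample like the one below both should
# come out positive, with Skewness (the standardized third moment) more
# sensitive to the extreme tail.
def _demo_skewness():
    """Sketch: compare sample skewness and Pearson median skewness."""
    xs = np.random.exponential(scale=1.0, size=10000)
    print('skewness               ', Skewness(xs))
    print('Pearson median skewness', PearsonMedianSkewness(xs))
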
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pair of int
        self.colspecs = self.colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs,
                             names=self.names,
                             **options)
        return df


def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = dict(byte=int, int=int, long=int, float=float, double=float)

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = re.search(r'_column\(([^)]*)\)', line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pandas.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct

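# Illustrative sketch (not part of the original module): ReadStataDct parses
# variable definitions of the form `_column(1)  str12  caseid  %12s  "..."`
# from a Stata dictionary (.dct) file, and the resulting FixedWidthVariables
# object then reads the matching fixed-width data file.  The filenames below
# are placeholders, not files shipped with this repository.
def _demo_read_stata_dct(dct_file='example.dct', dat_file='example.dat'):
    """Sketch: read a Stata dictionary and its fixed-width data file."""
    dct = ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file)
    return df
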
def Resample(xs, n=None):
    """Draws a sample from xs with the same length as xs.

    xs: sequence
    n: sample size (default: len(xs))

    returns: NumPy array
    """
    if n is None:
        n = len(xs)
    return np.random.choice(xs, n, replace=True)


def SampleRows(df, nrows, replace=False):
    """Chooses a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    sample = df.loc[indices]
    return sample


def ResampleRows(df):
    """Resamples rows from a DataFrame.

    df: DataFrame

    returns: DataFrame
    """
    return SampleRows(df, len(df), replace=True)


def ResampleRowsWeighted(df, column='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to given column.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    weights = df[column]
    cdf = Cdf(dict(weights))
    indices = cdf.Sample(len(weights))
    sample = df.loc[indices]
    return sample

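# Illustrative sketch (not part of the original module): bootstrap resampling
# with ResampleRows draws rows with replacement, so repeating it and
# recomputing a statistic gives a rough sampling distribution for that
# statistic.  The column name here is hypothetical.
def _demo_bootstrap_mean(df, column='weight', iters=1000):
    """Sketch: bootstrap a 90% confidence interval for a column mean."""
    means = [ResampleRows(df)[column].mean() for _ in range(iters)]
    cdf = Cdf(means)
    return cdf.Value(0.05), cdf.Value(0.95)
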
def PercentileRow(array, p):
    """Selects the row from a sorted array that maps to percentile p.

    p: float 0--100

    returns: NumPy array (one row)
    """
    rows, cols = array.shape
    index = int(rows * p / 100)
    return array[index,]


def PercentileRows(ys_seq, percents):
    """Given a collection of lines, selects percentiles along vertical axis.

    For example, if ys_seq contains simulation results like ys as a
    function of time, and percents contains (5, 95), the result would
    be a 90% CI for each vertical slice of the simulation results.

    ys_seq: sequence of lines (y values)
    percents: list of percentiles (0-100) to select

    returns: list of NumPy arrays, one for each percentile
    """
    nrows = len(ys_seq)
    ncols = len(ys_seq[0])
    array = np.zeros((nrows, ncols))

    for i, ys in enumerate(ys_seq):
        array[i,] = ys

    array = np.sort(array, axis=0)

    rows = [PercentileRow(array, p) for p in percents]
    return rows


def Smooth(xs, sigma=2, **options):
    """Smooths a NumPy array with a Gaussian filter.

    xs: sequence
    sigma: standard deviation of the filter
    """
    return ndimage.gaussian_filter1d(xs, sigma, **options)

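# Illustrative sketch (not part of the original module): PercentileRows turns
# a stack of simulated curves into pointwise percentile curves, which is how
# a prediction band is usually drawn around simulation results.
def _demo_percentile_rows(n_sims=100, n_points=50):
    """Sketch: 90% pointwise band around noisy copies of a line."""
    xs = np.linspace(0, 1, n_points)
    ys_seq = [xs + np.random.normal(0, 0.1, n_points) for _ in range(n_sims)]
    low, high = PercentileRows(ys_seq, [5, 95])
    return xs, low, high    # e.g. for plotting a band between low and high
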
class HypothesisTest(object):
    """Represents a hypothesis test."""

    def __init__(self, data):
        """Initializes.

        data: data in whatever form is relevant
        """
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)
        self.test_stats = None
        self.test_cdf = None

    def PValue(self, iters=1000):
        """Computes the distribution of the test statistic and p-value.

        iters: number of iterations

        returns: float p-value
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]
        self.test_cdf = Cdf(self.test_stats)

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def MaxTestStat(self):
        """Returns the largest test statistic seen during simulations.
        """
        return max(self.test_stats)

    def PlotCdf(self, label=None):
        """Draws a Cdf with vertical lines at the observed test stat.
        """
        def VertLine(x):
            """Draws a vertical line at x."""
            thinkplot.Plot([x, x], [0, 1], color='0.8')

        VertLine(self.actual)
        thinkplot.Cdf(self.test_cdf, label=label)

    def TestStatistic(self, data):
        """Computes the test statistic.

        data: data in whatever form is relevant
        """
        raise UnimplementedMethodException()

    def MakeModel(self):
        """Build a model of the null hypothesis.
        """
        pass

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        raise UnimplementedMethodException()

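# Illustrative sketch (not part of the original module): HypothesisTest is an
# abstract base; a subclass supplies TestStatistic and RunModel, and PValue
# then estimates how often the null model produces a test statistic at least
# as large as the observed one.
class _CoinTest(HypothesisTest):
    """Sketch: is a coin that lands heads/tails (140, 110) times fair?"""

    def TestStatistic(self, data):
        heads, tails = data
        return abs(heads - tails)

    def RunModel(self):
        heads, tails = self.data
        n = heads + tails
        # simulate n flips of a fair coin under the null hypothesis
        sample = [random.choice('HT') for _ in range(n)]
        heads = sample.count('H')
        return heads, n - heads


def _demo_coin_test():
    """Sketch: p-value for the observed (140, 110) split."""
    ct = _CoinTest((140, 110))
    print('p-value', ct.PValue(iters=1000))
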
def main():
    pass


if __name__ == '__main__':
    main()