Repository for a workshop on Bayesian statistics

"""This file contains code for use with "Think Stats" and
"Think Bayes", both by Allen B. Downey, available from greenteapress.com

Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

"""This file contains class definitions for:

Hist: represents a histogram (map from values to integer frequencies).

Pmf: represents a probability mass function (map from values to probs).

_DictWrapper: private parent class for Hist and Pmf.

Cdf: represents a discrete cumulative distribution function

Pdf: represents a continuous probability density function
"""

import bisect
import copy
import logging
import math
import random
import re

from collections import Counter
from operator import itemgetter

import thinkplot

import numpy as np
import pandas

import scipy
from scipy import stats
from scipy import special
from scipy import ndimage

from scipy.special import gamma

from io import open

ROOT2 = math.sqrt(2)


def RandomSeed(x):
    """Initialize the random and np.random generators.

    x: int seed
    """
    random.seed(x)
    np.random.seed(x)


def Odds(p):
    """Computes odds for a given probability.

    Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.

    Note: when p=1, the formula for odds divides by zero, which is
    normally undefined. But I think it is reasonable to define Odds(1)
    to be infinity, so that's what this function does.

    p: float 0-1

    Returns: float odds
    """
    if p == 1:
        return float('inf')
    return p / (1 - p)


def Probability(o):
    """Computes the probability corresponding to given odds.

    Example: o=2 means 2:1 odds in favor, or 2/3 probability

    o: float odds, strictly positive

    Returns: float probability
    """
    return o / (o + 1)


def Probability2(yes, no):
    """Computes the probability corresponding to given odds.

    Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.

    yes, no: int or float odds in favor
    """
    return yes / (yes + no)
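
# Illustrative usage sketch (not part of the original module): round-trips a
# probability through the odds helpers above. The _demo_* name is arbitrary;
# only definitions from this file are used.
def _demo_odds():
    p = 0.75
    o = Odds(p)                          # 3.0, i.e. 3:1 in favor
    assert abs(Probability(o) - p) < 1e-12
    assert Probability2(3, 1) == p
    return o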
97
98
99
class Interpolator(object):
100
"""Represents a mapping between sorted sequences; performs linear interp.
101
102
Attributes:
103
xs: sorted list
104
ys: sorted list
105
"""
106
107
def __init__(self, xs, ys):
108
self.xs = xs
109
self.ys = ys
110
111
def Lookup(self, x):
112
"""Looks up x and returns the corresponding value of y."""
113
return self._Bisect(x, self.xs, self.ys)
114
115
def Reverse(self, y):
116
"""Looks up y and returns the corresponding value of x."""
117
return self._Bisect(y, self.ys, self.xs)
118
119
def _Bisect(self, x, xs, ys):
120
"""Helper function."""
121
if x <= xs[0]:
122
return ys[0]
123
if x >= xs[-1]:
124
return ys[-1]
125
i = bisect.bisect(xs, x)
126
frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
127
y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
128
return y
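
# Illustrative usage sketch (not part of the original module): Interpolator
# maps between two sorted sequences with linear interpolation in both
# directions. The _demo_* name is arbitrary.
def _demo_interpolator():
    interp = Interpolator(xs=[0, 10, 20], ys=[0, 100, 400])
    assert interp.Lookup(5) == 50.0      # halfway between ys[0] and ys[1]
    assert interp.Reverse(250) == 15.0   # halfway between xs[1] and xs[2]
    return interp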
129
130
131
# When we plot Hist, Pmf and Cdf objects, they don't appear in
132
# the legend unless we override the default label.
133
DEFAULT_LABEL = '_nolegend_'
134
135
136
class _DictWrapper(object):
137
"""An object that contains a dictionary."""
138
139
def __init__(self, obj=None, label=None):
140
"""Initializes the distribution.
141
142
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
143
label: string label
144
"""
145
self.label = label if label is not None else DEFAULT_LABEL
146
self.d = {}
147
148
# flag whether the distribution is under a log transform
149
self.log = False
150
151
if obj is None:
152
return
153
154
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
155
self.label = label if label is not None else obj.label
156
157
if isinstance(obj, dict):
158
self.d.update(obj.items())
159
elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
160
self.d.update(obj.Items())
161
elif isinstance(obj, pandas.Series):
162
            self.d.update(obj.value_counts().items())
163
else:
164
# finally, treat it like a list
165
self.d.update(Counter(obj))
166
167
if len(self) > 0 and isinstance(self, Pmf):
168
self.Normalize()
169
170
def __hash__(self):
171
return id(self)
172
173
def __str__(self):
174
cls = self.__class__.__name__
175
if self.label == DEFAULT_LABEL:
176
return '%s(%s)' % (cls, str(self.d))
177
else:
178
return self.label
179
180
def __repr__(self):
181
cls = self.__class__.__name__
182
if self.label == DEFAULT_LABEL:
183
return '%s(%s)' % (cls, repr(self.d))
184
else:
185
return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label))
186
187
def __eq__(self, other):
188
try:
189
return self.d == other.d
190
except AttributeError:
191
return False
192
193
def __len__(self):
194
return len(self.d)
195
196
def __iter__(self):
197
return iter(self.d)
198
199
def iterkeys(self):
200
"""Returns an iterator over keys."""
201
return iter(self.d)
202
203
def __contains__(self, value):
204
return value in self.d
205
206
def __getitem__(self, value):
207
return self.d.get(value, 0)
208
209
def __setitem__(self, value, prob):
210
self.d[value] = prob
211
212
def __delitem__(self, value):
213
del self.d[value]
214
215
def Copy(self, label=None):
216
"""Returns a copy.
217
218
Make a shallow copy of d. If you want a deep copy of d,
219
use copy.deepcopy on the whole object.
220
221
label: string label for the new Hist
222
223
returns: new _DictWrapper with the same type
224
"""
225
new = copy.copy(self)
226
new.d = copy.copy(self.d)
227
new.label = label if label is not None else self.label
228
return new
229
230
def Scale(self, factor):
231
"""Multiplies the values by a factor.
232
233
factor: what to multiply by
234
235
Returns: new object
236
"""
237
new = self.Copy()
238
new.d.clear()
239
240
for val, prob in self.Items():
241
new.Set(val * factor, prob)
242
return new
243
244
def Log(self, m=None):
245
"""Log transforms the probabilities.
246
247
Removes values with probability 0.
248
249
Normalizes so that the largest logprob is 0.
250
"""
251
if self.log:
252
raise ValueError("Pmf/Hist already under a log transform")
253
self.log = True
254
255
if m is None:
256
m = self.MaxLike()
257
258
for x, p in self.d.items():
259
if p:
260
self.Set(x, math.log(p / m))
261
else:
262
self.Remove(x)
263
264
def Exp(self, m=None):
265
"""Exponentiates the probabilities.
266
267
m: how much to shift the ps before exponentiating
268
269
If m is None, normalizes so that the largest prob is 1.
270
"""
271
if not self.log:
272
raise ValueError("Pmf/Hist not under a log transform")
273
self.log = False
274
275
if m is None:
276
m = self.MaxLike()
277
278
for x, p in self.d.items():
279
self.Set(x, math.exp(p - m))
280
281
def GetDict(self):
282
"""Gets the dictionary."""
283
return self.d
284
285
def SetDict(self, d):
286
"""Sets the dictionary."""
287
self.d = d
288
289
def Values(self):
290
"""Gets an unsorted sequence of values.
291
292
Note: one source of confusion is that the keys of this
293
dictionary are the values of the Hist/Pmf, and the
294
values of the dictionary are frequencies/probabilities.
295
"""
296
return self.d.keys()
297
298
def Items(self):
299
"""Gets an unsorted sequence of (value, freq/prob) pairs."""
300
return self.d.items()
301
302
def SortedItems(self):
303
"""Gets a sorted sequence of (value, freq/prob) pairs.
304
305
        If items are unsortable, the result is unsorted.
306
"""
307
def isnan(x):
308
try:
309
return math.isnan(x)
310
except TypeError:
311
return False
312
313
if any([isnan(x) for x in self.Values()]):
314
msg = 'Keys contain NaN, may not sort correctly.'
315
logging.warning(msg)
316
317
try:
318
return sorted(self.d.items())
319
except TypeError:
320
return self.d.items()
321
322
def Render(self, **options):
323
"""Generates a sequence of points suitable for plotting.
324
325
Note: options are ignored
326
327
Returns:
328
tuple of (sorted value sequence, freq/prob sequence)
329
"""
330
return zip(*self.SortedItems())
331
332
def MakeCdf(self, label=None):
333
"""Makes a Cdf."""
334
label = label if label is not None else self.label
335
return Cdf(self, label=label)
336
337
def Print(self):
338
"""Prints the values and freqs/probs in ascending order."""
339
for val, prob in self.SortedItems():
340
print(val, prob)
341
342
def Set(self, x, y=0):
343
"""Sets the freq/prob associated with the value x.
344
345
Args:
346
x: number value
347
y: number freq or prob
348
"""
349
self.d[x] = y
350
351
def Incr(self, x, term=1):
352
"""Increments the freq/prob associated with the value x.
353
354
Args:
355
x: number value
356
term: how much to increment by
357
"""
358
self.d[x] = self.d.get(x, 0) + term
359
360
def Mult(self, x, factor):
361
"""Scales the freq/prob associated with the value x.
362
363
Args:
364
x: number value
365
factor: how much to multiply by
366
"""
367
self.d[x] = self.d.get(x, 0) * factor
368
369
def Remove(self, x):
370
"""Removes a value.
371
372
Throws an exception if the value is not there.
373
374
Args:
375
x: value to remove
376
"""
377
del self.d[x]
378
379
def Total(self):
380
"""Returns the total of the frequencies/probabilities in the map."""
381
total = sum(self.d.values())
382
return total
383
384
def MaxLike(self):
385
"""Returns the largest frequency/probability in the map."""
386
return max(self.d.values())
387
388
def Largest(self, n=10):
389
"""Returns the largest n values, with frequency/probability.
390
391
n: number of items to return
392
"""
393
return sorted(self.d.items(), reverse=True)[:n]
394
395
def Smallest(self, n=10):
396
"""Returns the smallest n values, with frequency/probability.
397
398
n: number of items to return
399
"""
400
return sorted(self.d.items(), reverse=False)[:n]
401
402
403
class Hist(_DictWrapper):
404
"""Represents a histogram, which is a map from values to frequencies.
405
406
Values can be any hashable type; frequencies are integer counters.
407
"""
408
def Freq(self, x):
409
"""Gets the frequency associated with the value x.
410
411
Args:
412
x: number value
413
414
Returns:
415
int frequency
416
"""
417
return self.d.get(x, 0)
418
419
def Freqs(self, xs):
420
"""Gets frequencies for a sequence of values."""
421
return [self.Freq(x) for x in xs]
422
423
def IsSubset(self, other):
424
"""Checks whether the values in this histogram are a subset of
425
the values in the given histogram."""
426
for val, freq in self.Items():
427
if freq > other.Freq(val):
428
return False
429
return True
430
431
def Subtract(self, other):
432
"""Subtracts the values in the given histogram from this histogram."""
433
for val, freq in other.Items():
434
self.Incr(val, -freq)
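
# Illustrative usage sketch (not part of the original module): Hist counts
# hashable values; the constructor accepts any sequence, which is fed to a
# collections.Counter. The _demo_* name is arbitrary.
def _demo_hist():
    hist = Hist(['a', 'b', 'b', 'c', 'b'])
    assert hist.Freq('b') == 3
    assert hist.Freq('z') == 0           # missing values have frequency 0
    assert Hist(['b']).IsSubset(hist)
    return hist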
435
436
437
class Pmf(_DictWrapper):
438
"""Represents a probability mass function.
439
440
Values can be any hashable type; probabilities are floating-point.
441
Pmfs are not necessarily normalized.
442
"""
443
444
def Prob(self, x, default=0):
445
"""Gets the probability associated with the value x.
446
447
Args:
448
x: number value
449
default: value to return if the key is not there
450
451
Returns:
452
float probability
453
"""
454
return self.d.get(x, default)
455
456
def Probs(self, xs):
457
"""Gets probabilities for a sequence of values."""
458
return [self.Prob(x) for x in xs]
459
460
def Percentile(self, percentage):
461
"""Computes a percentile of a given Pmf.
462
463
Note: this is not super efficient. If you are planning
464
to compute more than a few percentiles, compute the Cdf.
465
466
percentage: float 0-100
467
468
returns: value from the Pmf
469
"""
470
p = percentage / 100
471
total = 0
472
for val, prob in sorted(self.Items()):
473
total += prob
474
if total >= p:
475
return val
476
477
def ProbGreater(self, x):
478
"""Probability that a sample from this Pmf exceeds x.
479
480
x: number
481
482
returns: float probability
483
"""
484
if isinstance(x, _DictWrapper):
485
return PmfProbGreater(self, x)
486
else:
487
t = [prob for (val, prob) in self.d.items() if val > x]
488
return sum(t)
489
490
def ProbLess(self, x):
491
"""Probability that a sample from this Pmf is less than x.
492
493
x: number
494
495
returns: float probability
496
"""
497
if isinstance(x, _DictWrapper):
498
return PmfProbLess(self, x)
499
else:
500
t = [prob for (val, prob) in self.d.items() if val < x]
501
return sum(t)
502
503
def ProbEqual(self, x):
504
"""Probability that a sample from this Pmf is exactly x.
505
506
x: number
507
508
returns: float probability
509
"""
510
if isinstance(x, _DictWrapper):
511
return PmfProbEqual(self, x)
512
else:
513
return self[x]
514
515
# NOTE: I've decided to remove the magic comparators because they
516
# have the side-effect of making Pmf sortable, but in fact they
517
# don't support sorting.
518
519
def Normalize(self, fraction=1):
520
"""Normalizes this PMF so the sum of all probs is fraction.
521
522
Args:
523
fraction: what the total should be after normalization
524
525
Returns: the total probability before normalizing
526
"""
527
if self.log:
528
raise ValueError("Normalize: Pmf is under a log transform")
529
530
total = self.Total()
531
if total == 0:
532
raise ValueError('Normalize: total probability is zero.')
533
534
factor = fraction / total
535
for x in self.d:
536
self.d[x] *= factor
537
538
return total
539
540
def Random(self):
541
"""Chooses a random element from this PMF.
542
543
Note: this is not very efficient. If you plan to call
544
this more than a few times, consider converting to a CDF.
545
546
Returns:
547
float value from the Pmf
548
"""
549
target = random.random()
550
total = 0
551
for x, p in self.d.items():
552
total += p
553
if total >= target:
554
return x
555
556
# we shouldn't get here
557
raise ValueError('Random: Pmf might not be normalized.')
558
559
def Sample(self, n):
560
"""Generates a random sample from this distribution.
561
562
n: int length of the sample
563
returns: NumPy array
564
"""
565
return self.MakeCdf().Sample(n)
566
567
def Mean(self):
568
"""Computes the mean of a PMF.
569
570
Returns:
571
float mean
572
"""
573
return sum(p * x for x, p in self.Items())
574
575
def Median(self):
576
"""Computes the median of a PMF.
577
578
Returns:
579
float median
580
"""
581
return self.MakeCdf().Percentile(50)
582
583
def Var(self, mu=None):
584
"""Computes the variance of a PMF.
585
586
mu: the point around which the variance is computed;
587
if omitted, computes the mean
588
589
returns: float variance
590
"""
591
if mu is None:
592
mu = self.Mean()
593
594
return sum(p * (x-mu)**2 for x, p in self.Items())
595
596
def Expect(self, func):
597
"""Computes the expectation of func(x).
598
599
Returns:
600
expectation
601
"""
602
        return sum(p * func(x) for x, p in self.Items())
603
604
def Std(self, mu=None):
605
"""Computes the standard deviation of a PMF.
606
607
mu: the point around which the variance is computed;
608
if omitted, computes the mean
609
610
returns: float standard deviation
611
"""
612
var = self.Var(mu)
613
return math.sqrt(var)
614
615
def Mode(self):
616
"""Returns the value with the highest probability.
617
618
Returns: float probability
619
"""
620
_, val = max((prob, val) for val, prob in self.Items())
621
return val
622
623
    # The mode of a posterior is the maximum a posteriori probability (MAP)
624
MAP = Mode
625
626
# If the distribution contains likelihoods only, the peak is the
627
# maximum likelihood estimator.
628
MaximumLikelihood = Mode
629
630
def CredibleInterval(self, percentage=90):
631
"""Computes the central credible interval.
632
633
If percentage=90, computes the 90% CI.
634
635
Args:
636
percentage: float between 0 and 100
637
638
Returns:
639
sequence of two floats, low and high
640
"""
641
cdf = self.MakeCdf()
642
return cdf.CredibleInterval(percentage)
643
644
def __add__(self, other):
645
"""Computes the Pmf of the sum of values drawn from self and other.
646
647
other: another Pmf or a scalar
648
649
returns: new Pmf
650
"""
651
try:
652
return self.AddPmf(other)
653
except AttributeError:
654
return self.AddConstant(other)
655
656
__radd__ = __add__
657
658
def AddPmf(self, other):
659
"""Computes the Pmf of the sum of values drawn from self and other.
660
661
other: another Pmf
662
663
returns: new Pmf
664
"""
665
pmf = Pmf()
666
for v1, p1 in self.Items():
667
for v2, p2 in other.Items():
668
pmf[v1 + v2] += p1 * p2
669
return pmf
670
671
def AddConstant(self, other):
672
"""Computes the Pmf of the sum a constant and values from self.
673
674
other: a number
675
676
returns: new Pmf
677
"""
678
if other == 0:
679
return self.Copy()
680
681
pmf = Pmf()
682
for v1, p1 in self.Items():
683
pmf.Set(v1 + other, p1)
684
return pmf
685
686
def __sub__(self, other):
687
"""Computes the Pmf of the diff of values drawn from self and other.
688
689
other: another Pmf
690
691
returns: new Pmf
692
"""
693
try:
694
return self.SubPmf(other)
695
except AttributeError:
696
return self.AddConstant(-other)
697
698
def SubPmf(self, other):
699
"""Computes the Pmf of the diff of values drawn from self and other.
700
701
other: another Pmf
702
703
returns: new Pmf
704
"""
705
pmf = Pmf()
706
for v1, p1 in self.Items():
707
for v2, p2 in other.Items():
708
pmf.Incr(v1 - v2, p1 * p2)
709
return pmf
710
711
def __mul__(self, other):
712
"""Computes the Pmf of the product of values drawn from self and other.
713
714
other: another Pmf
715
716
returns: new Pmf
717
"""
718
try:
719
return self.MulPmf(other)
720
except AttributeError:
721
return self.MulConstant(other)
722
723
def MulPmf(self, other):
724
"""Computes the Pmf of the diff of values drawn from self and other.
725
726
other: another Pmf
727
728
returns: new Pmf
729
"""
730
pmf = Pmf()
731
for v1, p1 in self.Items():
732
for v2, p2 in other.Items():
733
pmf.Incr(v1 * v2, p1 * p2)
734
return pmf
735
736
def MulConstant(self, other):
737
"""Computes the Pmf of the product of a constant and values from self.
738
739
other: a number
740
741
returns: new Pmf
742
"""
743
pmf = Pmf()
744
for v1, p1 in self.Items():
745
pmf.Set(v1 * other, p1)
746
return pmf
747
748
def __div__(self, other):
749
"""Computes the Pmf of the ratio of values drawn from self and other.
750
751
other: another Pmf
752
753
returns: new Pmf
754
"""
755
try:
756
return self.DivPmf(other)
757
except AttributeError:
758
return self.MulConstant(1/other)
759
760
__truediv__ = __div__
761
762
def DivPmf(self, other):
763
"""Computes the Pmf of the ratio of values drawn from self and other.
764
765
other: another Pmf
766
767
returns: new Pmf
768
"""
769
pmf = Pmf()
770
for v1, p1 in self.Items():
771
for v2, p2 in other.Items():
772
pmf.Incr(v1 / v2, p1 * p2)
773
return pmf
774
775
def Max(self, k):
776
"""Computes the CDF of the maximum of k selections from this dist.
777
778
k: int
779
780
returns: new Cdf
781
"""
782
cdf = self.MakeCdf()
783
cdf.ps **= k
784
return cdf
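
# Illustrative usage sketch (not part of the original module): Pmf arithmetic.
# __add__ convolves two Pmfs, so the distribution of the sum of two fair dice
# comes out directly. The _demo_* name is arbitrary.
def _demo_pmf_addition():
    die = Pmf(range(1, 7))               # normalized automatically
    two_dice = die + die                 # convolution via AddPmf
    assert abs(two_dice[7] - 6 / 36) < 1e-12
    assert abs(two_dice.Mean() - 7) < 1e-12
    return two_dice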
785
786
787
class Joint(Pmf):
788
"""Represents a joint distribution.
789
790
The values are sequences (usually tuples)
791
"""
792
793
def Marginal(self, i, label=None):
794
"""Gets the marginal distribution of the indicated variable.
795
796
i: index of the variable we want
797
798
Returns: Pmf
799
"""
800
pmf = Pmf(label=label)
801
for vs, prob in self.Items():
802
pmf.Incr(vs[i], prob)
803
return pmf
804
805
def Conditional(self, i, j, val, label=None):
806
"""Gets the conditional distribution of the indicated variable.
807
808
Distribution of vs[i], conditioned on vs[j] = val.
809
810
i: index of the variable we want
811
j: which variable is conditioned on
812
val: the value the jth variable has to have
813
814
Returns: Pmf
815
"""
816
pmf = Pmf(label=label)
817
for vs, prob in self.Items():
818
if vs[j] != val:
819
continue
820
pmf.Incr(vs[i], prob)
821
822
pmf.Normalize()
823
return pmf
824
825
def MaxLikeInterval(self, percentage=90):
826
"""Returns the maximum-likelihood credible interval.
827
828
If percentage=90, computes a 90% CI containing the values
829
with the highest likelihoods.
830
831
percentage: float between 0 and 100
832
833
Returns: list of values from the suite
834
"""
835
interval = []
836
total = 0
837
838
t = [(prob, val) for val, prob in self.Items()]
839
t.sort(reverse=True)
840
841
for prob, val in t:
842
interval.append(val)
843
total += prob
844
if total >= percentage / 100:
845
break
846
847
return interval
848
849
850
def MakeJoint(pmf1, pmf2):
851
"""Joint distribution of values from pmf1 and pmf2.
852
853
Assumes that the PMFs represent independent random variables.
854
855
Args:
856
pmf1: Pmf object
857
pmf2: Pmf object
858
859
Returns:
860
Joint pmf of value pairs
861
"""
862
joint = Joint()
863
for v1, p1 in pmf1.Items():
864
for v2, p2 in pmf2.Items():
865
joint.Set((v1, v2), p1 * p2)
866
return joint
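
# Illustrative usage sketch (not part of the original module): building a
# Joint from two independent Pmfs with MakeJoint and recovering a marginal.
# The _demo_* name is arbitrary.
def _demo_joint():
    pmf1 = Pmf([1, 2])                   # uniform over {1, 2}
    pmf2 = Pmf([10, 20, 30])             # uniform over {10, 20, 30}
    joint = MakeJoint(pmf1, pmf2)
    marginal = joint.Marginal(0)         # should recover pmf1
    assert abs(marginal[1] - 0.5) < 1e-12
    return joint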
867
868
869
def MakeHistFromList(t, label=None):
870
"""Makes a histogram from an unsorted sequence of values.
871
872
Args:
873
t: sequence of numbers
874
label: string label for this histogram
875
876
Returns:
877
Hist object
878
"""
879
return Hist(t, label=label)
880
881
882
def MakeHistFromDict(d, label=None):
883
"""Makes a histogram from a map from values to frequencies.
884
885
Args:
886
d: dictionary that maps values to frequencies
887
label: string label for this histogram
888
889
Returns:
890
Hist object
891
"""
892
return Hist(d, label)
893
894
895
def MakePmfFromList(t, label=None):
896
"""Makes a PMF from an unsorted sequence of values.
897
898
Args:
899
t: sequence of numbers
900
label: string label for this PMF
901
902
Returns:
903
Pmf object
904
"""
905
return Pmf(t, label=label)
906
907
908
def MakePmfFromDict(d, label=None):
909
"""Makes a PMF from a map from values to probabilities.
910
911
Args:
912
d: dictionary that maps values to probabilities
913
label: string label for this PMF
914
915
Returns:
916
Pmf object
917
"""
918
return Pmf(d, label=label)
919
920
921
def MakePmfFromItems(t, label=None):
922
"""Makes a PMF from a sequence of value-probability pairs
923
924
Args:
925
t: sequence of value-probability pairs
926
label: string label for this PMF
927
928
Returns:
929
Pmf object
930
"""
931
return Pmf(dict(t), label=label)
932
933
934
def MakePmfFromHist(hist, label=None):
935
"""Makes a normalized PMF from a Hist object.
936
937
Args:
938
hist: Hist object
939
label: string label
940
941
Returns:
942
Pmf object
943
"""
944
if label is None:
945
label = hist.label
946
947
return Pmf(hist, label=label)
948
949
950
def MakeMixture(metapmf, label='mix'):
951
"""Make a mixture distribution.
952
953
Args:
954
metapmf: Pmf that maps from Pmfs to probs.
955
label: string label for the new Pmf.
956
957
Returns: Pmf object.
958
"""
959
mix = Pmf(label=label)
960
for pmf, p1 in metapmf.Items():
961
for x, p2 in pmf.Items():
962
mix[x] += p1 * p2
963
return mix
964
965
966
def MakeUniformPmf(low, high, n):
967
"""Make a uniform Pmf.
968
969
low: lowest value (inclusive)
970
    high: highest value (inclusive)
971
n: number of values
972
"""
973
pmf = Pmf()
974
for x in np.linspace(low, high, n):
975
pmf.Set(x, 1)
976
pmf.Normalize()
977
return pmf
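
# Illustrative usage sketch (not part of the original module): MakeMixture
# takes a meta-Pmf whose values are themselves Pmfs; here two uniform Pmfs
# are mixed with weights 3:1. The _demo_* name is arbitrary.
def _demo_mixture():
    metapmf = Pmf()
    metapmf.Set(MakeUniformPmf(0, 1, 2), 0.75)    # values 0 and 1
    metapmf.Set(MakeUniformPmf(0, 3, 4), 0.25)    # values 0, 1, 2, 3
    mix = MakeMixture(metapmf)
    assert abs(mix[3] - 0.25 * 0.25) < 1e-12
    return mix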
978
979
980
class Cdf:
981
"""Represents a cumulative distribution function.
982
983
Attributes:
984
xs: sequence of values
985
ps: sequence of probabilities
986
label: string used as a graph label.
987
"""
988
def __init__(self, obj=None, ps=None, label=None):
989
"""Initializes.
990
991
If ps is provided, obj must be the corresponding list of values.
992
993
obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
994
ps: list of cumulative probabilities
995
label: string label
996
"""
997
self.label = label if label is not None else DEFAULT_LABEL
998
999
if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
1000
if not label:
1001
self.label = label if label is not None else obj.label
1002
1003
if obj is None:
1004
# caller does not provide obj, make an empty Cdf
1005
self.xs = np.asarray([])
1006
self.ps = np.asarray([])
1007
if ps is not None:
1008
logging.warning("Cdf: can't pass ps without also passing xs.")
1009
return
1010
else:
1011
# if the caller provides xs and ps, just store them
1012
if ps is not None:
1013
if isinstance(ps, str):
1014
logging.warning("Cdf: ps can't be a string")
1015
1016
self.xs = np.asarray(obj)
1017
self.ps = np.asarray(ps)
1018
return
1019
1020
# caller has provided just obj, not ps
1021
if isinstance(obj, Cdf):
1022
self.xs = copy.copy(obj.xs)
1023
self.ps = copy.copy(obj.ps)
1024
return
1025
1026
if isinstance(obj, _DictWrapper):
1027
dw = obj
1028
else:
1029
dw = Hist(obj)
1030
1031
if len(dw) == 0:
1032
self.xs = np.asarray([])
1033
self.ps = np.asarray([])
1034
return
1035
1036
xs, freqs = zip(*sorted(dw.Items()))
1037
self.xs = np.asarray(xs)
1038
        self.ps = np.cumsum(freqs, dtype=float)
1039
self.ps /= self.ps[-1]
1040
1041
def __str__(self):
1042
cls = self.__class__.__name__
1043
if self.label == DEFAULT_LABEL:
1044
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1045
else:
1046
return self.label
1047
1048
def __repr__(self):
1049
cls = self.__class__.__name__
1050
if self.label == DEFAULT_LABEL:
1051
return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))
1052
else:
1053
return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps),
1054
repr(self.label))
1055
1056
def __len__(self):
1057
return len(self.xs)
1058
1059
def __getitem__(self, x):
1060
return self.Prob(x)
1061
1062
    def __setitem__(self, x, p):
        raise UnimplementedMethodException()

    def __delitem__(self, x):
        raise UnimplementedMethodException()
1067
1068
def __eq__(self, other):
1069
return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
1070
1071
def Print(self):
1072
"""Prints the values and freqs/probs in ascending order."""
1073
for val, prob in zip(self.xs, self.ps):
1074
print(val, prob)
1075
1076
def Copy(self, label=None):
1077
"""Returns a copy of this Cdf.
1078
1079
label: string label for the new Cdf
1080
"""
1081
if label is None:
1082
label = self.label
1083
return Cdf(list(self.xs), list(self.ps), label=label)
1084
1085
def MakePmf(self, label=None):
1086
"""Makes a Pmf."""
1087
if label is None:
1088
label = self.label
1089
return Pmf(self, label=label)
1090
1091
def Items(self):
1092
"""Returns a sorted sequence of (value, probability) pairs.
1093
1094
Note: in Python3, returns an iterator.
1095
"""
1096
a = self.ps
1097
b = np.roll(a, 1)
1098
b[0] = 0
1099
return zip(self.xs, a-b)
1100
1101
def Shift(self, term):
1102
"""Adds a term to the xs.
1103
1104
term: how much to add
1105
"""
1106
new = self.Copy()
1107
# don't use +=, or else an int array + float yields int array
1108
new.xs = new.xs + term
1109
return new
1110
1111
def Scale(self, factor):
1112
"""Multiplies the xs by a factor.
1113
1114
factor: what to multiply by
1115
"""
1116
new = self.Copy()
1117
# don't use *=, or else an int array * float yields int array
1118
new.xs = new.xs * factor
1119
return new
1120
1121
def Prob(self, x):
1122
"""Returns CDF(x), the probability that corresponds to value x.
1123
1124
Args:
1125
x: number
1126
1127
Returns:
1128
float probability
1129
"""
1130
if x < self.xs[0]:
1131
return 0
1132
index = bisect.bisect(self.xs, x)
1133
p = self.ps[index-1]
1134
return p
1135
1136
def Probs(self, xs):
1137
"""Gets probabilities for a sequence of values.
1138
1139
xs: any sequence that can be converted to NumPy array
1140
1141
returns: NumPy array of cumulative probabilities
1142
"""
1143
xs = np.asarray(xs)
1144
index = np.searchsorted(self.xs, xs, side='right')
1145
ps = self.ps[index-1]
1146
ps[xs < self.xs[0]] = 0
1147
return ps
1148
1149
ProbArray = Probs
1150
1151
def Value(self, p):
1152
"""Returns InverseCDF(p), the value that corresponds to probability p.
1153
1154
Args:
1155
p: number in the range [0, 1]
1156
1157
Returns:
1158
number value
1159
"""
1160
if p < 0 or p > 1:
1161
raise ValueError('Probability p must be in range [0, 1]')
1162
1163
index = bisect.bisect_left(self.ps, p)
1164
return self.xs[index]
1165
1166
def Values(self, ps=None):
1167
"""Returns InverseCDF(p), the value that corresponds to probability p.
1168
1169
If ps is not provided, returns all values.
1170
1171
Args:
1172
ps: NumPy array of numbers in the range [0, 1]
1173
1174
Returns:
1175
NumPy array of values
1176
"""
1177
if ps is None:
1178
return self.xs
1179
1180
ps = np.asarray(ps)
1181
if np.any(ps < 0) or np.any(ps > 1):
1182
raise ValueError('Probability p must be in range [0, 1]')
1183
1184
index = np.searchsorted(self.ps, ps, side='left')
1185
return self.xs[index]
1186
1187
ValueArray = Values
1188
1189
def Percentile(self, p):
1190
"""Returns the value that corresponds to percentile p.
1191
1192
Args:
1193
p: number in the range [0, 100]
1194
1195
Returns:
1196
number value
1197
"""
1198
return self.Value(p / 100)
1199
1200
def Percentiles(self, ps):
1201
"""Returns the value that corresponds to percentiles ps.
1202
1203
Args:
1204
ps: numbers in the range [0, 100]
1205
1206
Returns:
1207
array of values
1208
"""
1209
ps = np.asarray(ps)
1210
return self.Values(ps / 100)
1211
1212
def PercentileRank(self, x):
1213
"""Returns the percentile rank of the value x.
1214
1215
x: potential value in the CDF
1216
1217
returns: percentile rank in the range 0 to 100
1218
"""
1219
return self.Prob(x) * 100
1220
1221
def PercentileRanks(self, xs):
1222
"""Returns the percentile ranks of the values in xs.
1223
1224
xs: potential value in the CDF
1225
1226
returns: array of percentile ranks in the range 0 to 100
1227
"""
1228
        return self.Probs(xs) * 100
1229
1230
def Random(self):
1231
"""Chooses a random value from this distribution."""
1232
return self.Value(random.random())
1233
1234
def Sample(self, n):
1235
"""Generates a random sample from this distribution.
1236
1237
n: int length of the sample
1238
returns: NumPy array
1239
"""
1240
ps = np.random.random(n)
1241
return self.ValueArray(ps)
1242
1243
def Mean(self):
1244
"""Computes the mean of a CDF.
1245
1246
Returns:
1247
float mean
1248
"""
1249
old_p = 0
1250
total = 0
1251
for x, new_p in zip(self.xs, self.ps):
1252
p = new_p - old_p
1253
total += p * x
1254
old_p = new_p
1255
return total
1256
1257
def CredibleInterval(self, percentage=90):
1258
"""Computes the central credible interval.
1259
1260
If percentage=90, computes the 90% CI.
1261
1262
Args:
1263
percentage: float between 0 and 100
1264
1265
Returns:
1266
sequence of two floats, low and high
1267
"""
1268
prob = (1 - percentage / 100) / 2
1269
interval = self.Value(prob), self.Value(1 - prob)
1270
return interval
1271
1272
ConfidenceInterval = CredibleInterval
1273
1274
def _Round(self, multiplier=1000):
1275
"""
1276
An entry is added to the cdf only if the percentile differs
1277
from the previous value in a significant digit, where the number
1278
of significant digits is determined by multiplier. The
1279
default is 1000, which keeps log10(1000) = 3 significant digits.
1280
"""
1281
# TODO(write this method)
1282
raise UnimplementedMethodException()
1283
1284
def Render(self, **options):
1285
"""Generates a sequence of points suitable for plotting.
1286
1287
An empirical CDF is a step function; linear interpolation
1288
can be misleading.
1289
1290
Note: options are ignored
1291
1292
Returns:
1293
tuple of (xs, ps)
1294
"""
1295
def interleave(a, b):
1296
c = np.empty(a.shape[0] + b.shape[0])
1297
c[::2] = a
1298
c[1::2] = b
1299
return c
1300
1301
a = np.array(self.xs)
1302
xs = interleave(a, a)
1303
shift_ps = np.roll(self.ps, 1)
1304
shift_ps[0] = 0
1305
ps = interleave(shift_ps, self.ps)
1306
return xs, ps
1307
1308
def Max(self, k):
1309
"""Computes the CDF of the maximum of k selections from this dist.
1310
1311
k: int
1312
1313
returns: new Cdf
1314
"""
1315
cdf = self.Copy()
1316
cdf.ps **= k
1317
return cdf
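
# Illustrative usage sketch (not part of the original module): a Cdf built
# from raw values supports forward (Prob) and inverse (Value/Percentile)
# lookups. The _demo_* name is arbitrary.
def _demo_cdf():
    cdf = Cdf([1, 2, 2, 3, 5])
    assert abs(cdf.Prob(2) - 0.6) < 1e-12    # 3 of the 5 values are <= 2
    assert cdf.Value(0.5) == 2               # median
    assert cdf.Percentile(100) == 5
    return cdf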
1318
1319
1320
def MakeCdfFromItems(items, label=None):
1321
"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1322
1323
Args:
1324
items: unsorted sequence of (value, frequency) pairs
1325
label: string label for this CDF
1326
1327
Returns:
1328
cdf: list of (value, fraction) pairs
1329
"""
1330
return Cdf(dict(items), label=label)
1331
1332
1333
def MakeCdfFromDict(d, label=None):
1334
"""Makes a CDF from a dictionary that maps values to frequencies.
1335
1336
Args:
1337
d: dictionary that maps values to frequencies.
1338
label: string label for the data.
1339
1340
Returns:
1341
Cdf object
1342
"""
1343
return Cdf(d, label=label)
1344
1345
1346
def MakeCdfFromList(seq, label=None):
1347
"""Creates a CDF from an unsorted sequence.
1348
1349
Args:
1350
seq: unsorted sequence of sortable values
1351
label: string label for the cdf
1352
1353
Returns:
1354
Cdf object
1355
"""
1356
return Cdf(seq, label=label)
1357
1358
1359
def MakeCdfFromHist(hist, label=None):
1360
"""Makes a CDF from a Hist object.
1361
1362
Args:
1363
hist: Pmf.Hist object
1364
label: string label for the data.
1365
1366
Returns:
1367
Cdf object
1368
"""
1369
if label is None:
1370
label = hist.label
1371
1372
return Cdf(hist, label=label)
1373
1374
1375
def MakeCdfFromPmf(pmf, label=None):
1376
"""Makes a CDF from a Pmf object.
1377
1378
Args:
1379
pmf: Pmf.Pmf object
1380
label: string label for the data.
1381
1382
Returns:
1383
Cdf object
1384
"""
1385
if label is None:
1386
label = pmf.label
1387
1388
return Cdf(pmf, label=label)
1389
1390
1391
class UnimplementedMethodException(Exception):
1392
"""Exception if someone calls a method that should be overridden."""
1393
1394
1395
class Suite(Pmf):
1396
"""Represents a suite of hypotheses and their probabilities."""
1397
1398
def Update(self, data):
1399
"""Updates each hypothesis based on the data.
1400
1401
data: any representation of the data
1402
1403
returns: the normalizing constant
1404
"""
1405
for hypo in self.Values():
1406
like = self.Likelihood(data, hypo)
1407
self.Mult(hypo, like)
1408
return self.Normalize()
1409
1410
def LogUpdate(self, data):
1411
"""Updates a suite of hypotheses based on new data.
1412
1413
Modifies the suite directly; if you want to keep the original, make
1414
a copy.
1415
1416
Note: unlike Update, LogUpdate does not normalize.
1417
1418
Args:
1419
data: any representation of the data
1420
"""
1421
for hypo in self.Values():
1422
like = self.LogLikelihood(data, hypo)
1423
self.Incr(hypo, like)
1424
1425
def UpdateSet(self, dataset):
1426
"""Updates each hypothesis based on the dataset.
1427
1428
This is more efficient than calling Update repeatedly because
1429
it waits until the end to Normalize.
1430
1431
Modifies the suite directly; if you want to keep the original, make
1432
a copy.
1433
1434
dataset: a sequence of data
1435
1436
returns: the normalizing constant
1437
"""
1438
for data in dataset:
1439
for hypo in self.Values():
1440
like = self.Likelihood(data, hypo)
1441
self.Mult(hypo, like)
1442
return self.Normalize()
1443
1444
def LogUpdateSet(self, dataset):
1445
"""Updates each hypothesis based on the dataset.
1446
1447
Modifies the suite directly; if you want to keep the original, make
1448
a copy.
1449
1450
dataset: a sequence of data
1451
1452
returns: None
1453
"""
1454
for data in dataset:
1455
self.LogUpdate(data)
1456
1457
def Likelihood(self, data, hypo):
1458
"""Computes the likelihood of the data under the hypothesis.
1459
1460
hypo: some representation of the hypothesis
1461
data: some representation of the data
1462
"""
1463
raise UnimplementedMethodException()
1464
1465
def LogLikelihood(self, data, hypo):
1466
"""Computes the log likelihood of the data under the hypothesis.
1467
1468
hypo: some representation of the hypothesis
1469
data: some representation of the data
1470
"""
1471
raise UnimplementedMethodException()
1472
1473
def Print(self):
1474
"""Prints the hypotheses and their probabilities."""
1475
for hypo, prob in sorted(self.Items()):
1476
print(hypo, prob)
1477
1478
def MakeOdds(self):
1479
"""Transforms from probabilities to odds.
1480
1481
Values with prob=0 are removed.
1482
"""
1483
for hypo, prob in self.Items():
1484
if prob:
1485
self.Set(hypo, Odds(prob))
1486
else:
1487
self.Remove(hypo)
1488
1489
def MakeProbs(self):
1490
"""Transforms from odds to probabilities."""
1491
for hypo, odds in self.Items():
1492
self.Set(hypo, Probability(odds))
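
# Illustrative usage sketch (not part of the original module): the intended
# way to use Suite is to subclass it and define Likelihood. Here each
# hypothesis is an integer percentage for P(heads) of a biased coin. The
# _DemoCoin/_demo_* names are arbitrary.
class _DemoCoin(Suite):

    def Likelihood(self, data, hypo):
        # data is 'H' or 'T'; hypo is a percentage between 0 and 100
        p = hypo / 100
        return p if data == 'H' else 1 - p


def _demo_suite_update():
    suite = _DemoCoin(range(0, 101))     # uniform prior over 0..100
    for outcome in 'HHTH':
        suite.Update(outcome)            # multiply by likelihood, then normalize
    return suite.Mean()                  # posterior mean of P(heads), in percent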
1493
1494
1495
def MakeSuiteFromList(t, label=None):
1496
"""Makes a suite from an unsorted sequence of values.
1497
1498
Args:
1499
t: sequence of numbers
1500
label: string label for this suite
1501
1502
Returns:
1503
Suite object
1504
"""
1505
hist = MakeHistFromList(t, label=label)
1506
d = hist.GetDict()
1507
return MakeSuiteFromDict(d)
1508
1509
1510
def MakeSuiteFromHist(hist, label=None):
1511
"""Makes a normalized suite from a Hist object.
1512
1513
Args:
1514
hist: Hist object
1515
label: string label
1516
1517
Returns:
1518
Suite object
1519
"""
1520
if label is None:
1521
label = hist.label
1522
1523
# make a copy of the dictionary
1524
d = dict(hist.GetDict())
1525
return MakeSuiteFromDict(d, label)
1526
1527
1528
def MakeSuiteFromDict(d, label=None):
1529
"""Makes a suite from a map from values to probabilities.
1530
1531
Args:
1532
d: dictionary that maps values to probabilities
1533
label: string label for this suite
1534
1535
Returns:
1536
Suite object
1537
"""
1538
suite = Suite(label=label)
1539
suite.SetDict(d)
1540
suite.Normalize()
1541
return suite
1542
1543
1544
class Pdf(object):
1545
"""Represents a probability density function (PDF)."""
1546
1547
def Density(self, x):
1548
"""Evaluates this Pdf at x.
1549
1550
Returns: float or NumPy array of probability density
1551
"""
1552
raise UnimplementedMethodException()
1553
1554
def GetLinspace(self):
1555
"""Get a linspace for plotting.
1556
1557
Not all subclasses of Pdf implement this.
1558
1559
Returns: numpy array
1560
"""
1561
raise UnimplementedMethodException()
1562
1563
def MakePmf(self, **options):
1564
"""Makes a discrete version of this Pdf.
1565
1566
options can include
1567
label: string
1568
low: low end of range
1569
high: high end of range
1570
n: number of places to evaluate
1571
1572
Returns: new Pmf
1573
"""
1574
label = options.pop('label', '')
1575
xs, ds = self.Render(**options)
1576
return Pmf(dict(zip(xs, ds)), label=label)
1577
1578
def Render(self, **options):
1579
"""Generates a sequence of points suitable for plotting.
1580
1581
        If options includes low and high, it must also include n;
        in that case the density is evaluated at n locations between
        low and high, including both.

        If options includes xs, the density is evaluated at those locations.
1586
1587
Otherwise, self.GetLinspace is invoked to provide the locations.
1588
1589
Returns:
1590
tuple of (xs, densities)
1591
"""
1592
low, high = options.pop('low', None), options.pop('high', None)
1593
if low is not None and high is not None:
1594
n = options.pop('n', 101)
1595
xs = np.linspace(low, high, n)
1596
else:
1597
xs = options.pop('xs', None)
1598
if xs is None:
1599
xs = self.GetLinspace()
1600
1601
ds = self.Density(xs)
1602
return xs, ds
1603
1604
def Items(self):
1605
"""Generates a sequence of (value, probability) pairs.
1606
"""
1607
return zip(*self.Render())
1608
1609
1610
class NormalPdf(Pdf):
1611
"""Represents the PDF of a Normal distribution."""
1612
1613
def __init__(self, mu=0, sigma=1, label=None):
1614
"""Constructs a Normal Pdf with given mu and sigma.
1615
1616
mu: mean
1617
sigma: standard deviation
1618
label: string
1619
"""
1620
self.mu = mu
1621
self.sigma = sigma
1622
self.label = label if label is not None else '_nolegend_'
1623
1624
def __str__(self):
1625
return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)
1626
1627
def GetLinspace(self):
1628
"""Get a linspace for plotting.
1629
1630
Returns: numpy array
1631
"""
1632
low, high = self.mu-3*self.sigma, self.mu+3*self.sigma
1633
return np.linspace(low, high, 101)
1634
1635
def Density(self, xs):
1636
"""Evaluates this Pdf at xs.
1637
1638
xs: scalar or sequence of floats
1639
1640
returns: float or NumPy array of probability density
1641
"""
1642
return stats.norm.pdf(xs, self.mu, self.sigma)
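
# Illustrative usage sketch (not part of the original module): MakePmf
# discretizes a continuous Pdf by evaluating the density on a grid and
# normalizing. The _demo_* name is arbitrary.
def _demo_normal_pdf():
    pdf = NormalPdf(mu=0, sigma=1)
    pmf = pdf.MakePmf(low=-4, high=4, n=81)
    assert abs(pmf.Total() - 1) < 1e-9
    return pmf.Mean()                    # approximately 0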
1643
1644
1645
class ExponentialPdf(Pdf):
1646
"""Represents the PDF of an exponential distribution."""
1647
1648
def __init__(self, lam=1, label=None):
1649
"""Constructs an exponential Pdf with given parameter.
1650
1651
lam: rate parameter
1652
label: string
1653
"""
1654
self.lam = lam
1655
self.label = label if label is not None else '_nolegend_'
1656
1657
def __str__(self):
1658
return 'ExponentialPdf(%f)' % (self.lam)
1659
1660
def GetLinspace(self):
1661
"""Get a linspace for plotting.
1662
1663
Returns: numpy array
1664
"""
1665
low, high = 0, 5.0/self.lam
1666
return np.linspace(low, high, 101)
1667
1668
def Density(self, xs):
1669
"""Evaluates this Pdf at xs.
1670
1671
xs: scalar or sequence of floats
1672
1673
returns: float or NumPy array of probability density
1674
"""
1675
return stats.expon.pdf(xs, scale=1.0/self.lam)
1676
1677
1678
class EstimatedPdf(Pdf):
1679
"""Represents a PDF estimated by KDE."""
1680
1681
def __init__(self, sample, label=None):
1682
"""Estimates the density function based on a sample.
1683
1684
sample: sequence of data
1685
label: string
1686
"""
1687
self.label = label if label is not None else '_nolegend_'
1688
self.kde = stats.gaussian_kde(sample)
1689
low = min(sample)
1690
high = max(sample)
1691
self.linspace = np.linspace(low, high, 101)
1692
1693
def __str__(self):
1694
return 'EstimatedPdf(label=%s)' % str(self.label)
1695
1696
def GetLinspace(self):
1697
"""Get a linspace for plotting.
1698
1699
Returns: numpy array
1700
"""
1701
return self.linspace
1702
1703
def Density(self, xs):
1704
"""Evaluates this Pdf at xs.
1705
1706
returns: float or NumPy array of probability density
1707
"""
1708
return self.kde.evaluate(xs)
1709
1710
def Sample(self, n):
1711
"""Generates a random sample from the estimated Pdf.
1712
1713
n: size of sample
1714
"""
1715
# NOTE: we have to flatten because resample returns a 2-D
1716
# array for some reason.
1717
return self.kde.resample(n).flatten()
1718
1719
1720
def CredibleInterval(pmf, percentage=90):
1721
"""Computes a credible interval for a given distribution.
1722
1723
If percentage=90, computes the 90% CI.
1724
1725
Args:
1726
pmf: Pmf object representing a posterior distribution
1727
percentage: float between 0 and 100
1728
1729
Returns:
1730
sequence of two floats, low and high
1731
"""
1732
cdf = pmf.MakeCdf()
1733
prob = (1 - percentage / 100) / 2
1734
interval = cdf.Value(prob), cdf.Value(1 - prob)
1735
return interval
1736
1737
1738
def PmfProbLess(pmf1, pmf2):
1739
"""Probability that a value from pmf1 is less than a value from pmf2.
1740
1741
Args:
1742
pmf1: Pmf object
1743
pmf2: Pmf object
1744
1745
Returns:
1746
float probability
1747
"""
1748
total = 0
1749
for v1, p1 in pmf1.Items():
1750
for v2, p2 in pmf2.Items():
1751
if v1 < v2:
1752
total += p1 * p2
1753
return total
1754
1755
1756
def PmfProbGreater(pmf1, pmf2):
1757
"""Probability that a value from pmf1 is less than a value from pmf2.
1758
1759
Args:
1760
pmf1: Pmf object
1761
pmf2: Pmf object
1762
1763
Returns:
1764
float probability
1765
"""
1766
total = 0
1767
for v1, p1 in pmf1.Items():
1768
for v2, p2 in pmf2.Items():
1769
if v1 > v2:
1770
total += p1 * p2
1771
return total
1772
1773
1774
def PmfProbEqual(pmf1, pmf2):
1775
"""Probability that a value from pmf1 equals a value from pmf2.
1776
1777
Args:
1778
pmf1: Pmf object
1779
pmf2: Pmf object
1780
1781
Returns:
1782
float probability
1783
"""
1784
total = 0
1785
for v1, p1 in pmf1.Items():
1786
for v2, p2 in pmf2.Items():
1787
if v1 == v2:
1788
total += p1 * p2
1789
return total
1790
1791
1792
def RandomSum(dists):
1793
"""Chooses a random value from each dist and returns the sum.
1794
1795
dists: sequence of Pmf or Cdf objects
1796
1797
returns: numerical sum
1798
"""
1799
total = sum(dist.Random() for dist in dists)
1800
return total
1801
1802
1803
def SampleSum(dists, n):
1804
"""Draws a sample of sums from a list of distributions.
1805
1806
dists: sequence of Pmf or Cdf objects
1807
n: sample size
1808
1809
returns: new Pmf of sums
1810
"""
1811
pmf = Pmf(RandomSum(dists) for i in range(n))
1812
return pmf
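
# Illustrative usage sketch (not part of the original module): SampleSum draws
# one value from each distribution per trial and accumulates an empirical Pmf
# of the sums. The seed and the _demo_* name are arbitrary.
def _demo_sample_sum():
    RandomSeed(17)                       # for repeatability
    die = Pmf(range(1, 7))
    sums = SampleSum([die, die, die], n=1000)
    return sums.Mean()                   # close to 10.5 for three fair dice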
1813
1814
1815
def EvalNormalPdf(x, mu, sigma):
1816
"""Computes the unnormalized PDF of the normal distribution.
1817
1818
x: value
1819
mu: mean
1820
sigma: standard deviation
1821
1822
returns: float probability density
1823
"""
1824
return stats.norm.pdf(x, mu, sigma)
1825
1826
1827
def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
1828
"""Makes a PMF discrete approx to a Normal distribution.
1829
1830
mu: float mean
1831
sigma: float standard deviation
1832
num_sigmas: how many sigmas to extend in each direction
1833
n: number of values in the Pmf
1834
1835
returns: normalized Pmf
1836
"""
1837
pmf = Pmf()
1838
low = mu - num_sigmas * sigma
1839
high = mu + num_sigmas * sigma
1840
1841
for x in np.linspace(low, high, n):
1842
p = EvalNormalPdf(x, mu, sigma)
1843
pmf.Set(x, p)
1844
pmf.Normalize()
1845
return pmf
1846
1847
1848
def EvalBinomialPmf(k, n, p):
1849
"""Evaluates the binomial PMF.
1850
1851
    Returns the probability of k successes in n trials with probability p.
1852
"""
1853
return stats.binom.pmf(k, n, p)
1854
1855
1856
def MakeBinomialPmf(n, p):
1857
"""Evaluates the binomial PMF.
1858
1859
Returns the distribution of successes in n trials with probability p.
1860
"""
1861
pmf = Pmf()
1862
for k in range(n+1):
1863
pmf[k] = stats.binom.pmf(k, n, p)
1864
return pmf
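
# Illustrative usage sketch (not part of the original module): MakeBinomialPmf
# returns a full Pmf over 0..n successes, consistent with EvalBinomialPmf.
# The _demo_* name is arbitrary.
def _demo_binomial():
    pmf = MakeBinomialPmf(n=10, p=0.5)
    assert abs(pmf.Total() - 1) < 1e-9
    assert abs(pmf[5] - EvalBinomialPmf(5, 10, 0.5)) < 1e-12
    return pmf.MaximumLikelihood()       # 5, the most likely number of successes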
1865
1866
1867
def EvalGammaPdf(x, a):
1868
"""Computes the Gamma PDF.
1869
1870
x: where to evaluate the PDF
1871
a: parameter of the gamma distribution
1872
1873
returns: float probability
1874
"""
1875
return x**(a-1) * np.exp(-x) / gamma(a)
1876
1877
1878
def MakeGammaPmf(xs, a):
1879
"""Makes a PMF discrete approx to a Gamma distribution.
1880
1881
    xs: values where the Pmf is evaluated
    a: shape parameter of the gamma distribution
1883
1884
returns: normalized Pmf
1885
"""
1886
xs = np.asarray(xs)
1887
ps = EvalGammaPdf(xs, a)
1888
pmf = Pmf(dict(zip(xs, ps)))
1889
pmf.Normalize()
1890
return pmf
1891
1892
1893
def EvalGeometricPmf(k, p, loc=0):
1894
"""Evaluates the geometric PMF.
1895
1896
With loc=0: Probability of `k` trials to get one success.
1897
With loc=-1: Probability of `k` trials before first success.
1898
1899
k: number of trials
1900
p: probability of success on each trial
1901
"""
1902
return stats.geom.pmf(k, p, loc=loc)
1903
1904
1905
def MakeGeometricPmf(p, loc=0, high=10):
1906
"""Evaluates the binomial PMF.
1907
1908
With loc=0: PMF of trials to get one success.
1909
With loc=-1: PMF of trials before first success.
1910
1911
p: probability of success
1912
high: upper bound where PMF is truncated
1913
"""
1914
pmf = Pmf()
1915
for k in range(high):
1916
pmf[k] = stats.geom.pmf(k, p, loc=loc)
1917
pmf.Normalize()
1918
return pmf
1919
1920
1921
def EvalHypergeomPmf(k, N, K, n):
1922
"""Evaluates the hypergeometric PMF.
1923
1924
    Returns the probability of k successes in n trials from a population
1925
N with K successes in it.
1926
"""
1927
return stats.hypergeom.pmf(k, N, K, n)
1928
1929
1930
def EvalPoissonPmf(k, lam):
1931
"""Computes the Poisson PMF.
1932
1933
k: number of events
1934
lam: parameter lambda in events per unit time
1935
1936
returns: float probability
1937
"""
1938
return stats.poisson.pmf(k, lam)
1939
1940
1941
def MakePoissonPmf(lam, high, step=1):
1942
"""Makes a PMF discrete approx to a Poisson distribution.
1943
1944
lam: parameter lambda in events per unit time
1945
high: upper bound of the Pmf
1946
1947
returns: normalized Pmf
1948
"""
1949
pmf = Pmf()
1950
for k in range(0, high + 1, step):
1951
p = stats.poisson.pmf(k, lam)
1952
pmf.Set(k, p)
1953
pmf.Normalize()
1954
return pmf
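
# Illustrative usage sketch (not part of the original module): MakePoissonPmf
# truncates the distribution at `high`, so `high` should be large enough that
# the discarded tail is negligible. The _demo_* name is arbitrary.
def _demo_poisson():
    pmf = MakePoissonPmf(lam=3, high=20)
    assert abs(pmf.Mean() - 3) < 1e-3    # mean ~ lam once the tail is tiny
    return pmf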
1955
1956
1957
def EvalExponentialPdf(x, lam):
1958
"""Computes the exponential PDF.
1959
1960
x: value
1961
lam: parameter lambda in events per unit time
1962
1963
returns: float probability density
1964
"""
1965
return lam * math.exp(-lam * x)
1966
1967
1968
def EvalExponentialCdf(x, lam):
1969
"""Evaluates CDF of the exponential distribution with parameter lam."""
1970
return 1 - math.exp(-lam * x)
1971
1972
1973
def MakeExponentialPmf(lam, high, n=200):
1974
"""Makes a PMF discrete approx to an exponential distribution.
1975
1976
lam: parameter lambda in events per unit time
1977
high: upper bound
1978
n: number of values in the Pmf
1979
1980
returns: normalized Pmf
1981
"""
1982
pmf = Pmf()
1983
for x in np.linspace(0, high, n):
1984
p = EvalExponentialPdf(x, lam)
1985
pmf.Set(x, p)
1986
pmf.Normalize()
1987
return pmf
1988
1989
1990
def EvalWeibullPdf(x, lam, k):
1991
"""Computes the Weibull PDF.
1992
1993
x: value
1994
lam: parameter lambda in events per unit time
1995
k: parameter
1996
1997
returns: float probability density
1998
"""
1999
arg = (x / lam)
2000
return k / lam * arg**(k-1) * np.exp(-arg**k)
2001
2002
2003
def EvalWeibullCdf(x, lam, k):
2004
"""Evaluates CDF of the Weibull distribution."""
2005
arg = (x / lam)
2006
return 1 - np.exp(-arg**k)
2007
2008
2009
def MakeWeibullPmf(lam, k, high, n=200):
2010
"""Makes a PMF discrete approx to a Weibull distribution.
2011
2012
lam: parameter lambda in events per unit time
2013
k: parameter
2014
high: upper bound
2015
n: number of values in the Pmf
2016
2017
returns: normalized Pmf
2018
"""
2019
xs = np.linspace(0, high, n)
2020
ps = EvalWeibullPdf(xs, lam, k)
2021
return Pmf(dict(zip(xs, ps)))
2022
2023
2024
def EvalParetoPdf(x, xm, alpha):
2025
"""Computes the Pareto.
2026
2027
xm: minimum value (scale parameter)
2028
alpha: shape parameter
2029
2030
returns: float probability density
2031
"""
2032
return stats.pareto.pdf(x, alpha, scale=xm)
2033
2034
2035
def MakeParetoPmf(xm, alpha, high, num=101):
2036
"""Makes a PMF discrete approx to a Pareto distribution.
2037
2038
xm: minimum value (scale parameter)
2039
alpha: shape parameter
2040
high: upper bound value
2041
num: number of values
2042
2043
returns: normalized Pmf
2044
"""
2045
xs = np.linspace(xm, high, num)
2046
ps = stats.pareto.pdf(xs, alpha, scale=xm)
2047
pmf = Pmf(dict(zip(xs, ps)))
2048
return pmf
2049
2050
def StandardNormalCdf(x):
2051
"""Evaluates the CDF of the standard Normal distribution.
2052
2053
See http://en.wikipedia.org/wiki/Normal_distribution
2054
#Cumulative_distribution_function
2055
2056
Args:
2057
x: float
2058
2059
Returns:
2060
float
2061
"""
2062
return (math.erf(x / ROOT2) + 1) / 2
2063
2064
2065
def EvalNormalCdf(x, mu=0, sigma=1):
2066
"""Evaluates the CDF of the normal distribution.
2067
2068
Args:
2069
x: float
2070
2071
mu: mean parameter
2072
2073
sigma: standard deviation parameter
2074
2075
Returns:
2076
float
2077
"""
2078
return stats.norm.cdf(x, loc=mu, scale=sigma)
2079
2080
2081
def EvalNormalCdfInverse(p, mu=0, sigma=1):
2082
"""Evaluates the inverse CDF of the normal distribution.
2083
2084
See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function
2085
2086
Args:
2087
p: float
2088
2089
mu: mean parameter
2090
2091
sigma: standard deviation parameter
2092
2093
Returns:
2094
float
2095
"""
2096
return stats.norm.ppf(p, loc=mu, scale=sigma)
2097
2098
2099
def EvalLognormalCdf(x, mu=0, sigma=1):
2100
"""Evaluates the CDF of the lognormal distribution.
2101
2102
x: float or sequence
2103
mu: mean parameter
2104
sigma: standard deviation parameter
2105
2106
Returns: float or sequence
2107
"""
2108
return stats.lognorm.cdf(x, loc=mu, scale=sigma)
2109
2110
2111
def RenderExpoCdf(lam, low, high, n=101):
2112
"""Generates sequences of xs and ps for an exponential CDF.
2113
2114
lam: parameter
2115
low: float
2116
high: float
2117
n: number of points to render
2118
2119
returns: numpy arrays (xs, ps)
2120
"""
2121
xs = np.linspace(low, high, n)
2122
ps = 1 - np.exp(-lam * xs)
2123
#ps = stats.expon.cdf(xs, scale=1.0/lam)
2124
return xs, ps
2125
2126
2127
def RenderNormalCdf(mu, sigma, low, high, n=101):
2128
"""Generates sequences of xs and ps for a Normal CDF.
2129
2130
mu: parameter
2131
sigma: parameter
2132
low: float
2133
high: float
2134
n: number of points to render
2135
2136
returns: numpy arrays (xs, ps)
2137
"""
2138
xs = np.linspace(low, high, n)
2139
ps = stats.norm.cdf(xs, mu, sigma)
2140
return xs, ps
2141
2142
2143
def RenderParetoCdf(xmin, alpha, low, high, n=50):
2144
"""Generates sequences of xs and ps for a Pareto CDF.
2145
2146
xmin: parameter
2147
alpha: parameter
2148
low: float
2149
high: float
2150
n: number of points to render
2151
2152
returns: numpy arrays (xs, ps)
2153
"""
2154
if low < xmin:
2155
low = xmin
2156
xs = np.linspace(low, high, n)
2157
ps = 1 - (xs / xmin) ** -alpha
2158
#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
2159
return xs, ps
2160
2161
2162
class Beta:
2163
"""Represents a Beta distribution.
2164
2165
See http://en.wikipedia.org/wiki/Beta_distribution
2166
"""
2167
def __init__(self, alpha=1, beta=1, label=None):
2168
"""Initializes a Beta distribution."""
2169
self.alpha = alpha
2170
self.beta = beta
2171
self.label = label if label is not None else '_nolegend_'
2172
2173
def Update(self, data):
2174
"""Updates a Beta distribution.
2175
2176
data: pair of int (heads, tails)
2177
"""
2178
heads, tails = data
2179
self.alpha += heads
2180
self.beta += tails
2181
2182
def Mean(self):
2183
"""Computes the mean of this distribution."""
2184
return self.alpha / (self.alpha + self.beta)
2185
2186
def MAP(self):
2187
"""Computes the value with maximum a posteori probability."""
2188
a = self.alpha - 1
2189
b = self.beta - 1
2190
return a / (a + b)
2191
2192
def Random(self):
2193
"""Generates a random variate from this distribution."""
2194
return random.betavariate(self.alpha, self.beta)
2195
2196
def Sample(self, n):
2197
"""Generates a random sample from this distribution.
2198
2199
n: int sample size
2200
"""
2201
size = n,
2202
return np.random.beta(self.alpha, self.beta, size)
2203
2204
def EvalPdf(self, x):
2205
"""Evaluates the PDF at x."""
2206
return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
2207
2208
def MakePmf(self, steps=101, label=None):
2209
"""Returns a Pmf of this distribution.
2210
2211
Note: Normally, we just evaluate the PDF at a sequence
2212
of points and treat the probability density as a probability
2213
mass.
2214
2215
But if alpha or beta is less than one, we have to be
2216
more careful because the PDF goes to infinity at x=0
2217
and x=1. In that case we evaluate the CDF and compute
2218
differences.
2219
2220
The result is a little funny, because the values at 0 and 1
2221
are not symmetric. Nevertheless, it is a reasonable discrete
2222
model of the continuous distribution, and behaves well as
2223
the number of values increases.
2224
"""
2225
if label is None and self.label is not None:
2226
label = self.label
2227
2228
if self.alpha < 1 or self.beta < 1:
2229
cdf = self.MakeCdf()
2230
pmf = cdf.MakePmf()
2231
return pmf
2232
2233
xs = [i / (steps - 1.0) for i in range(steps)]
2234
probs = [self.EvalPdf(x) for x in xs]
2235
pmf = Pmf(dict(zip(xs, probs)), label=label)
2236
return pmf
2237
2238
def MakeCdf(self, steps=101):
2239
"""Returns the CDF of this distribution."""
2240
xs = [i / (steps - 1.0) for i in range(steps)]
2241
ps = special.betainc(self.alpha, self.beta, xs)
2242
cdf = Cdf(xs, ps)
2243
return cdf
2244
2245
def Percentile(self, ps):
2246
"""Returns the given percentiles from this distribution.
2247
2248
ps: scalar, array, or list of [0-100]
2249
"""
2250
ps = np.asarray(ps) / 100
2251
xs = special.betaincinv(self.alpha, self.beta, ps)
2252
return xs
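
# Illustrative usage sketch (not part of the original module): conjugate
# updating with the Beta class. Starting from a uniform Beta(1, 1) prior,
# observing 140 heads and 110 tails gives a Beta(141, 111) posterior. The
# _demo_* name is arbitrary.
def _demo_beta_update():
    beta = Beta(1, 1)
    beta.Update((140, 110))
    assert abs(beta.Mean() - 141 / 252) < 1e-12
    return beta.MakeCdf().Percentile(50)     # posterior median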
2253
2254
2255
class Dirichlet(object):
2256
"""Represents a Dirichlet distribution.
2257
2258
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2259
"""
2260
2261
def __init__(self, n, conc=1, label=None):
2262
"""Initializes a Dirichlet distribution.
2263
2264
n: number of dimensions
2265
conc: concentration parameter (smaller yields more concentration)
2266
label: string label
2267
"""
2268
if n < 2:
2269
raise ValueError('A Dirichlet distribution with '
2270
'n<2 makes no sense')
2271
2272
self.n = n
2273
        self.params = np.ones(n, dtype=float) * conc
2274
self.label = label if label is not None else '_nolegend_'
2275
2276
def Update(self, data):
2277
"""Updates a Dirichlet distribution.
2278
2279
data: sequence of observations, in order corresponding to params
2280
"""
2281
m = len(data)
2282
self.params[:m] += data
2283
2284
def Random(self):
2285
"""Generates a random variate from this distribution.
2286
2287
Returns: normalized vector of fractions
2288
"""
2289
p = np.random.gamma(self.params)
2290
return p / p.sum()
2291
2292
def Likelihood(self, data):
2293
"""Computes the likelihood of the data.
2294
2295
Selects a random vector of probabilities from this distribution.
2296
2297
Returns: float probability
2298
"""
2299
m = len(data)
2300
if self.n < m:
2301
return 0
2302
2303
x = data
2304
p = self.Random()
2305
q = p[:m] ** x
2306
return q.prod()
2307
2308
def LogLikelihood(self, data):
2309
"""Computes the log likelihood of the data.
2310
2311
Selects a random vector of probabilities from this distribution.
2312
2313
Returns: float log probability
2314
"""
2315
m = len(data)
2316
if self.n < m:
2317
return float('-inf')
2318
2319
x = self.Random()
2320
y = np.log(x[:m]) * data
2321
return y.sum()
2322
2323
def MarginalBeta(self, i):
2324
"""Computes the marginal distribution of the ith element.
2325
2326
See http://en.wikipedia.org/wiki/Dirichlet_distribution
2327
#Marginal_distributions
2328
2329
i: int
2330
2331
Returns: Beta object
2332
"""
2333
alpha0 = self.params.sum()
2334
alpha = self.params[i]
2335
return Beta(alpha, alpha0 - alpha)
2336
2337
def PredictivePmf(self, xs, label=None):
2338
"""Makes a predictive distribution.
2339
2340
xs: values to go into the Pmf
2341
2342
Returns: Pmf that maps from x to the mean prevalence of x
2343
"""
2344
alpha0 = self.params.sum()
2345
ps = self.params / alpha0
2346
return Pmf(zip(xs, ps), label=label)
2347
2348
2349
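# Illustrative sketch (not part of the original module): a Dirichlet prior
# over three categories updated with observed counts.  It uses Update,
# MarginalBeta, and Random from the class above; Beta.Mean is assumed to be
# defined earlier in this file.
def _demo_dirichlet_usage():
    """Sketch: three-sided die observed to land (3, 2, 1) times per side."""
    dirichlet = Dirichlet(3)
    dirichlet.Update((3, 2, 1))
    marginal = dirichlet.MarginalBeta(0)    # Beta marginal for side 0
    print('mean prevalence of side 0', marginal.Mean())
    p = dirichlet.Random()                  # one random vector of category probabilities
    return p
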
def BinomialCoef(n, k):
    """Computes the binomial coefficient "n choose k".

    n: number of trials
    k: number of successes

    Returns: float
    """
    return special.comb(n, k)


def LogBinomialCoef(n, k):
    """Computes the log of the binomial coefficient.

    http://math.stackexchange.com/questions/64716/
    approximating-the-logarithm-of-the-binomial-coefficient

    n: number of trials
    k: number of successes

    Returns: float
    """
    return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)

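# Illustrative sketch (not part of the original module): LogBinomialCoef uses
# a crude entropy-style approximation, so it only roughly tracks the exact
# value log(BinomialCoef(n, k)); the gap comes from the dropped lower-order
# (Stirling) terms.
def _demo_log_binomial_coef():
    """Sketch: compare the approximation with the exact log-coefficient."""
    n, k = 100, 10
    exact = math.log(BinomialCoef(n, k))
    approx = LogBinomialCoef(n, k)
    print('exact %.2f, approx %.2f' % (exact, approx))
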
def NormalProbability(ys, jitter=0):
    """Generates data for a normal probability plot.

    ys: sequence of values
    jitter: float magnitude of jitter added to the ys

    returns: numpy arrays xs, ys
    """
    n = len(ys)
    xs = np.random.normal(0, 1, n)
    xs.sort()

    if jitter:
        ys = Jitter(ys, jitter)
    else:
        ys = np.array(ys)
    ys.sort()

    return xs, ys


def Jitter(values, jitter=0.5):
    """Jitters the values by adding Gaussian noise with standard deviation jitter.

    values: sequence
    jitter: scalar magnitude of jitter

    returns: new numpy array
    """
    n = len(values)
    return np.random.normal(0, jitter, n) + values


def NormalProbabilityPlot(sample, fit_color='0.8', **options):
    """Makes a normal probability plot with a fitted line.

    sample: sequence of numbers
    fit_color: color string for the fitted line
    options: passed along to Plot
    """
    xs, ys = NormalProbability(sample)
    mean, var = MeanVar(sample)
    std = math.sqrt(var)

    fit = FitLine(xs, mean, std)
    thinkplot.Plot(*fit, color=fit_color, label='model')

    xs, ys = NormalProbability(sample)
    thinkplot.Plot(xs, ys, **options)

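# Illustrative sketch (not part of the original module): a normal probability
# plot compares sorted data against sorted standard-normal random values; if
# the data are roughly normal, the points fall near the fitted line.  It
# assumes thinkplot.Show exists alongside the thinkplot functions used above.
def _demo_normal_probability_plot():
    """Sketch: plot a normal sample against standard-normal values."""
    sample = np.random.normal(10, 2, size=500)
    NormalProbabilityPlot(sample, label='sample')
    thinkplot.Show(xlabel='standard normal values', ylabel='sample values')
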
def Mean(xs):
    """Computes mean.

    xs: sequence of values

    returns: float mean
    """
    return np.mean(xs)


def Var(xs, mu=None, ddof=0):
    """Computes variance.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    xs = np.asarray(xs)

    if mu is None:
        mu = xs.mean()

    ds = xs - mu
    return np.dot(ds, ds) / (len(xs) - ddof)


def Std(xs, mu=None, ddof=0):
    """Computes standard deviation.

    xs: sequence of values
    mu: optional known mean
    ddof: delta degrees of freedom

    returns: float
    """
    var = Var(xs, mu, ddof)
    return math.sqrt(var)


def MeanVar(xs, ddof=0):
    """Computes mean and variance.

    Based on http://stackoverflow.com/questions/19391149/
    numpy-mean-and-variance-from-single-function

    xs: sequence of values
    ddof: delta degrees of freedom

    returns: pair of float, mean and var
    """
    xs = np.asarray(xs)
    mean = xs.mean()
    s2 = Var(xs, mean, ddof)
    return mean, s2

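# Illustrative sketch (not part of the original module): ddof ("delta degrees
# of freedom") switches between the population variance (ddof=0) and the
# unbiased sample variance (ddof=1), matching NumPy's own convention.
def _demo_var_ddof():
    """Sketch: compare ddof=0 and ddof=1 against np.var."""
    xs = [1, 2, 3, 4, 5]
    print(Var(xs), np.var(xs))                  # population variance: 2.0
    print(Var(xs, ddof=1), np.var(xs, ddof=1))  # sample variance: 2.5
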
def Trim(t, p=0.01):
    """Trims the largest and smallest elements of t.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        sequence of values
    """
    n = int(p * len(t))
    t = sorted(t)[n:-n]
    return t


def TrimmedMean(t, p=0.01):
    """Computes the trimmed mean of a sequence of numbers.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        float
    """
    t = Trim(t, p)
    return Mean(t)


def TrimmedMeanVar(t, p=0.01):
    """Computes the trimmed mean and variance of a sequence of numbers.

    Side effect: sorts the list.

    Args:
        t: sequence of numbers
        p: fraction of values to trim off each end

    Returns:
        pair of float, (mean, var)
    """
    t = Trim(t, p)
    mu, var = MeanVar(t)
    return mu, var


def CohenEffectSize(group1, group2):
    """Computes Cohen's d.

    group1: Series or NumPy array
    group2: Series or NumPy array

    returns: float
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / math.sqrt(pooled_var)
    return d

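# Illustrative sketch (not part of the original module): Cohen's d expresses
# the difference between two group means in units of their pooled standard
# deviation, so groups whose means are one standard deviation apart give
# |d| near 1.
def _demo_cohen_effect_size():
    """Sketch: two normal samples whose means differ by one standard deviation."""
    group1 = np.random.normal(0, 1, size=10000)
    group2 = np.random.normal(1, 1, size=10000)
    print('Cohen d, approximately -1:', CohenEffectSize(group1, group2))
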
def Cov(xs, ys, meanx=None, meany=None):
    """Computes Cov(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values
        meanx: optional float mean of xs
        meany: optional float mean of ys

    Returns:
        Cov(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    if meanx is None:
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov


def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    meanx, varx = MeanVar(xs)
    meany, vary = MeanVar(ys)

    corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)

    return corr


def SerialCorr(series, lag=1):
    """Computes the serial correlation of a series.

    series: Series
    lag: integer number of intervals to shift

    returns: float correlation
    """
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = Corr(xs, ys)
    return corr


def SpearmanCorr(xs, ys):
    """Computes Spearman's rank correlation.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        float Spearman's correlation
    """
    xranks = pandas.Series(xs).rank()
    yranks = pandas.Series(ys).rank()
    return Corr(xranks, yranks)


def MapToRanks(t):
    """Returns a list of ranks corresponding to the elements in t.

    Args:
        t: sequence of numbers

    Returns:
        list of integer ranks, starting at 1
    """
    # pair up each value with its index
    pairs = enumerate(t)

    # sort by value
    sorted_pairs = sorted(pairs, key=itemgetter(1))

    # pair up each pair with its rank
    ranked = enumerate(sorted_pairs)

    # sort by index
    resorted = sorted(ranked, key=lambda trip: trip[1][0])

    # extract the ranks
    ranks = [trip[0]+1 for trip in resorted]
    return ranks

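# Illustrative sketch (not part of the original module): Pearson correlation
# (Corr) measures linear association, while Spearman correlation works on
# ranks, so it is less sensitive to outliers and to monotonic nonlinearity.
def _demo_correlation():
    """Sketch: compare Pearson and Spearman correlation on a nonlinear relation."""
    xs = np.random.uniform(1, 10, size=1000)
    ys = xs ** 3 + np.random.normal(0, 1, size=1000)   # monotonic but nonlinear
    print('Pearson ', Corr(xs, ys))
    print('Spearman', SpearmanCorr(xs, ys))
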
def LeastSquares(xs, ys):
    """Computes a linear least squares fit for ys as a function of xs.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    meanx, varx = MeanVar(xs)
    meany = Mean(ys)

    slope = Cov(xs, ys, meanx, meany) / varx
    inter = meany - slope * meanx

    return inter, slope


def FitLine(xs, inter, slope):
    """Fits a line to the given data.

    xs: sequence of x
    inter: float intercept
    slope: float slope

    returns: tuple of numpy arrays (sorted xs, fit ys)
    """
    fit_xs = np.sort(xs)
    fit_ys = inter + slope * fit_xs
    return fit_xs, fit_ys


def Residuals(xs, ys, inter, slope):
    """Computes residuals for a linear fit with parameters inter and slope.

    Args:
        xs: independent variable
        ys: dependent variable
        inter: float intercept
        slope: float slope

    Returns:
        list of residuals
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    res = ys - (inter + slope * xs)
    return res


def CoefDetermination(ys, res):
    """Computes the coefficient of determination (R^2) for given residuals.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    return 1 - Var(res) / Var(ys)

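# Illustrative sketch (not part of the original module): the usual workflow
# with the functions above is fit, then residuals, then R^2.
def _demo_least_squares():
    """Sketch: fit y = 2 + 3x plus noise and report the fit quality."""
    xs = np.linspace(0, 10, 200)
    ys = 2 + 3 * xs + np.random.normal(0, 1, size=len(xs))
    inter, slope = LeastSquares(xs, ys)
    res = Residuals(xs, ys, inter, slope)
    r2 = CoefDetermination(ys, res)
    print('intercept %.2f, slope %.2f, R^2 %.3f' % (inter, slope, r2))
    return FitLine(xs, inter, slope)    # sorted xs and fitted ys, ready to plot
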
def CorrelatedGenerator(rho):
    """Generates standard normal variates with serial correlation.

    rho: target coefficient of correlation

    Returns: iterable
    """
    x = random.gauss(0, 1)
    yield x

    sigma = math.sqrt(1 - rho**2)
    while True:
        x = random.gauss(x * rho, sigma)
        yield x


def CorrelatedNormalGenerator(mu, sigma, rho):
    """Generates normal variates with serial correlation.

    mu: mean of variate
    sigma: standard deviation of variate
    rho: target coefficient of correlation

    Returns: iterable
    """
    for x in CorrelatedGenerator(rho):
        yield x * sigma + mu

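# Illustrative sketch (not part of the original module): each value from
# CorrelatedGenerator mixes the previous value with fresh Gaussian noise, so
# the lag-1 serial correlation of a long run should come out close to rho.
def _demo_correlated_generator(rho=0.8, n=10000):
    """Sketch: estimate the serial correlation of the generated sequence."""
    import itertools
    xs = list(itertools.islice(CorrelatedGenerator(rho), n))
    print('target %.2f, estimate %.2f' % (rho, Corr(xs[:-1], xs[1:])))
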
def RawMoment(xs, k):
    """Computes the kth raw moment of xs.
    """
    return sum(x**k for x in xs) / len(xs)


def CentralMoment(xs, k):
    """Computes the kth central moment of xs.
    """
    mean = RawMoment(xs, 1)
    return sum((x - mean)**k for x in xs) / len(xs)


def StandardizedMoment(xs, k):
    """Computes the kth standardized moment of xs.
    """
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    return CentralMoment(xs, k) / std**k


def Skewness(xs):
    """Computes skewness.
    """
    return StandardizedMoment(xs, 3)


def Median(xs):
    """Computes the median (50th percentile) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: float
    """
    cdf = Cdf(xs)
    return cdf.Value(0.5)


def IQR(xs):
    """Computes the interquartile range (25th and 75th percentiles) of a sequence.

    xs: sequence or anything else that can initialize a Cdf

    returns: pair of floats
    """
    cdf = Cdf(xs)
    return cdf.Value(0.25), cdf.Value(0.75)


def PearsonMedianSkewness(xs):
    """Computes the Pearson median skewness.
    """
    median = Median(xs)
    mean = RawMoment(xs, 1)
    var = CentralMoment(xs, 2)
    std = math.sqrt(var)
    gp = 3 * (mean - median) / std
    return gp

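# Illustrative sketch (not part of the original module): both statistics
# measure asymmetry; for a right-skewed sample like the one below both should
# come out positive, with Skewness (the standardized third moment) more
# sensitive to the extreme tail.
def _demo_skewness():
    """Sketch: compare sample skewness and Pearson median skewness."""
    xs = np.random.exponential(scale=1.0, size=10000)
    print('skewness               ', Skewness(xs))
    print('Pearson median skewness', PearsonMedianSkewness(xs))
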
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # note: by default, subtract 1 from colspecs
        self.colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pair of int
        self.colspecs = self.colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs,
                             names=self.names,
                             **options)
        return df


def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = dict(byte=int, int=int, long=int, float=float, double=float)

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = re.search(r'_column\(([^)]*)\)', line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pandas.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct

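# Illustrative sketch (not part of the original module): ReadStataDct parses
# variable definitions of the form `_column(1)  str12  caseid  %12s  "..."`
# from a Stata dictionary (.dct) file, and the resulting FixedWidthVariables
# object then reads the matching fixed-width data file.  The filenames below
# are placeholders, not files shipped with this repository.
def _demo_read_stata_dct(dct_file='example.dct', dat_file='example.dat'):
    """Sketch: read a Stata dictionary and its fixed-width data file."""
    dct = ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file)
    return df
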
def Resample(xs, n=None):
    """Draws a sample from xs with the same length as xs.

    xs: sequence
    n: sample size (default: len(xs))

    returns: NumPy array
    """
    if n is None:
        n = len(xs)
    return np.random.choice(xs, n, replace=True)


def SampleRows(df, nrows, replace=False):
    """Chooses a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    sample = df.loc[indices]
    return sample


def ResampleRows(df):
    """Resamples rows from a DataFrame.

    df: DataFrame

    returns: DataFrame
    """
    return SampleRows(df, len(df), replace=True)


def ResampleRowsWeighted(df, column='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to given column.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    weights = df[column]
    cdf = Cdf(dict(weights))
    indices = cdf.Sample(len(weights))
    sample = df.loc[indices]
    return sample

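# Illustrative sketch (not part of the original module): bootstrap resampling
# with ResampleRows draws rows with replacement, so repeating it and
# recomputing a statistic gives a rough sampling distribution for that
# statistic.  The column name here is hypothetical.
def _demo_bootstrap_mean(df, column='weight', iters=1000):
    """Sketch: bootstrap a 90% confidence interval for a column mean."""
    means = [ResampleRows(df)[column].mean() for _ in range(iters)]
    cdf = Cdf(means)
    return cdf.Value(0.05), cdf.Value(0.95)
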
def PercentileRow(array, p):
    """Selects the row from a sorted array that maps to percentile p.

    p: float 0--100

    returns: NumPy array (one row)
    """
    rows, cols = array.shape
    index = int(rows * p / 100)
    return array[index,]


def PercentileRows(ys_seq, percents):
    """Given a collection of lines, selects percentiles along vertical axis.

    For example, if ys_seq contains simulation results like ys as a
    function of time, and percents contains (5, 95), the result would
    be a 90% CI for each vertical slice of the simulation results.

    ys_seq: sequence of lines (y values)
    percents: list of percentiles (0-100) to select

    returns: list of NumPy arrays, one for each percentile
    """
    nrows = len(ys_seq)
    ncols = len(ys_seq[0])
    array = np.zeros((nrows, ncols))

    for i, ys in enumerate(ys_seq):
        array[i,] = ys

    array = np.sort(array, axis=0)

    rows = [PercentileRow(array, p) for p in percents]
    return rows


def Smooth(xs, sigma=2, **options):
    """Smooths a NumPy array with a Gaussian filter.

    xs: sequence
    sigma: standard deviation of the filter
    """
    return ndimage.gaussian_filter1d(xs, sigma, **options)

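# Illustrative sketch (not part of the original module): PercentileRows turns
# a stack of simulated curves into pointwise percentile curves, which is how
# a prediction band is usually drawn around simulation results.
def _demo_percentile_rows(n_sims=100, n_points=50):
    """Sketch: 90% pointwise band around noisy copies of a line."""
    xs = np.linspace(0, 1, n_points)
    ys_seq = [xs + np.random.normal(0, 0.1, n_points) for _ in range(n_sims)]
    low, high = PercentileRows(ys_seq, [5, 95])
    return xs, low, high    # e.g. for plotting a band between low and high
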
class HypothesisTest(object):
    """Represents a hypothesis test."""

    def __init__(self, data):
        """Initializes.

        data: data in whatever form is relevant
        """
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)
        self.test_stats = None
        self.test_cdf = None

    def PValue(self, iters=1000):
        """Computes the distribution of the test statistic and p-value.

        iters: number of iterations

        returns: float p-value
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]
        self.test_cdf = Cdf(self.test_stats)

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def MaxTestStat(self):
        """Returns the largest test statistic seen during simulations.
        """
        return max(self.test_stats)

    def PlotCdf(self, label=None):
        """Draws a Cdf with vertical lines at the observed test stat.
        """
        def VertLine(x):
            """Draws a vertical line at x."""
            thinkplot.Plot([x, x], [0, 1], color='0.8')

        VertLine(self.actual)
        thinkplot.Cdf(self.test_cdf, label=label)

    def TestStatistic(self, data):
        """Computes the test statistic.

        data: data in whatever form is relevant
        """
        raise UnimplementedMethodException()

    def MakeModel(self):
        """Build a model of the null hypothesis.
        """
        pass

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        raise UnimplementedMethodException()

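# Illustrative sketch (not part of the original module): HypothesisTest is an
# abstract base; a subclass supplies TestStatistic and RunModel, and PValue
# then estimates how often the null model produces a test statistic at least
# as large as the observed one.
class _CoinTest(HypothesisTest):
    """Sketch: is a coin that lands heads/tails (140, 110) times fair?"""

    def TestStatistic(self, data):
        heads, tails = data
        return abs(heads - tails)

    def RunModel(self):
        heads, tails = self.data
        n = heads + tails
        # simulate n flips of a fair coin under the null hypothesis
        sample = [random.choice('HT') for _ in range(n)]
        heads = sample.count('H')
        return heads, n - heads


def _demo_coin_test():
    """Sketch: p-value for the observed (140, 110) split."""
    ct = _CoinTest((140, 110))
    print('p-value', ct.PValue(iters=1000))
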
def main():
    pass


if __name__ == '__main__':
    main()