CoCalc -- thinkstats2.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / data-science-ipython-notebooks / scipy / thinkstats2.py
¹³²⁹²⁸ views
License: OTHER
1
"""This file contains code for use with "Think Stats" and
2
"Think Bayes", both by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
from __future__ import print_function, division
9

10
"""This file contains class definitions for:
11

12
Hist: represents a histogram (map from values to integer frequencies).
13

14
Pmf: represents a probability mass function (map from values to probs).
15

16
_DictWrapper: private parent class for Hist and Pmf.
17

18
Cdf: represents a discrete cumulative distribution function
19

20
Pdf: represents a continuous probability density function
21

22
"""
23

24
import bisect
25
import copy
26
import logging
27
import math
28
import random
29
import re
30

31
from collections import Counter
32
from operator import itemgetter
33

34
import thinkplot
35

36
import numpy as np
37
import pandas
38

39
import scipy
40
from scipy import stats
41
from scipy import special
42
from scipy import ndimage
43

44
from io import open
45

46
ROOT2 = math.sqrt(2)
47

48
def RandomSeed(x):
49
    """Initialize the random and np.random generators.
50

51
    x: int seed
52
    """
53
    random.seed(x)
54
    np.random.seed(x)
55
    
56

57
def Odds(p):
58
    """Computes odds for a given probability.
59

60
    Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.
61

62
    Note: when p=1, the formula for odds divides by zero, which is
63
    normally undefined.  But I think it is reasonable to define Odds(1)
64
    to be infinity, so that's what this function does.
65

66
    p: float 0-1
67

68
    Returns: float odds
69
    """
70
    if p == 1:
71
        return float('inf')
72
    return p / (1 - p)
73

74

75
def Probability(o):
76
    """Computes the probability corresponding to given odds.
77

78
    Example: o=2 means 2:1 odds in favor, or 2/3 probability
79

80
    o: float odds, strictly positive
81

82
    Returns: float probability
83
    """
84
    return o / (o + 1)
85

86

87
def Probability2(yes, no):
88
    """Computes the probability corresponding to given odds.
89

90
    Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.
91
    
92
    yes, no: int or float odds in favor
93
    """
94
    return yes / (yes + no)
95

96

97
class Interpolator(object):
98
    """Represents a mapping between sorted sequences; performs linear interp.
99

100
    Attributes:
101
        xs: sorted list
102
        ys: sorted list
103
    """
104

105
    def __init__(self, xs, ys):
106
        self.xs = xs
107
        self.ys = ys
108

109
    def Lookup(self, x):
110
        """Looks up x and returns the corresponding value of y."""
111
        return self._Bisect(x, self.xs, self.ys)
112

113
    def Reverse(self, y):
114
        """Looks up y and returns the corresponding value of x."""
115
        return self._Bisect(y, self.ys, self.xs)
116

117
    def _Bisect(self, x, xs, ys):
118
        """Helper function."""
119
        if x <= xs[0]:
120
            return ys[0]
121
        if x >= xs[-1]:
122
            return ys[-1]
123
        i = bisect.bisect(xs, x)
124
        frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
125
        y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
126
        return y
127

128

129
class _DictWrapper(object):
130
    """An object that contains a dictionary."""
131

132
    def __init__(self, obj=None, label=None):
133
        """Initializes the distribution.
134

135
        obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
136
        label: string label
137
        """
138
        self.label = label if label is not None else '_nolegend_'
139
        self.d = {}
140

141
        # flag whether the distribution is under a log transform
142
        self.log = False
143

144
        if obj is None:
145
            return
146

147
        if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
148
            self.label = label if label is not None else obj.label
149

150
        if isinstance(obj, dict):
151
            self.d.update(obj.items())
152
        elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
153
            self.d.update(obj.Items())
154
        elif isinstance(obj, pandas.Series):
155
            self.d.update(obj.value_counts().iteritems())
156
        else:
157
            # finally, treat it like a list
158
            self.d.update(Counter(obj))
159

160
        if len(self) > 0 and isinstance(self, Pmf):
161
            self.Normalize()
162

163
    def __hash__(self):
164
        return id(self)
165

166
    def __str__(self):
167
        cls = self.__class__.__name__
168
        return '%s(%s)' % (cls, str(self.d))
169

170
    __repr__ = __str__
171

172
    def __eq__(self, other):
173
        return self.d == other.d
174

175
    def __len__(self):
176
        return len(self.d)
177

178
    def __iter__(self):
179
        return iter(self.d)
180

181
    def iterkeys(self):
182
        """Returns an iterator over keys."""
183
        return iter(self.d)
184

185
    def __contains__(self, value):
186
        return value in self.d
187

188
    def __getitem__(self, value):
189
        return self.d.get(value, 0)
190

191
    def __setitem__(self, value, prob):
192
        self.d[value] = prob
193

194
    def __delitem__(self, value):
195
        del self.d[value]
196

197
    def Copy(self, label=None):
198
        """Returns a copy.
199

200
        Make a shallow copy of d.  If you want a deep copy of d,
201
        use copy.deepcopy on the whole object.
202

203
        label: string label for the new Hist
204

205
        returns: new _DictWrapper with the same type
206
        """
207
        new = copy.copy(self)
208
        new.d = copy.copy(self.d)
209
        new.label = label if label is not None else self.label
210
        return new
211

212
    def Scale(self, factor):
213
        """Multiplies the values by a factor.
214

215
        factor: what to multiply by
216

217
        Returns: new object
218
        """
219
        new = self.Copy()
220
        new.d.clear()
221

222
        for val, prob in self.Items():
223
            new.Set(val * factor, prob)
224
        return new
225

226
    def Log(self, m=None):
227
        """Log transforms the probabilities.
228
        
229
        Removes values with probability 0.
230

231
        Normalizes so that the largest logprob is 0.
232
        """
233
        if self.log:
234
            raise ValueError("Pmf/Hist already under a log transform")
235
        self.log = True
236

237
        if m is None:
238
            m = self.MaxLike()
239

240
        for x, p in self.d.items():
241
            if p:
242
                self.Set(x, math.log(p / m))
243
            else:
244
                self.Remove(x)
245

246
    def Exp(self, m=None):
247
        """Exponentiates the probabilities.
248

249
        m: how much to shift the ps before exponentiating
250

251
        If m is None, normalizes so that the largest prob is 1.
252
        """
253
        if not self.log:
254
            raise ValueError("Pmf/Hist not under a log transform")
255
        self.log = False
256

257
        if m is None:
258
            m = self.MaxLike()
259

260
        for x, p in self.d.items():
261
            self.Set(x, math.exp(p - m))
262

263
    def GetDict(self):
264
        """Gets the dictionary."""
265
        return self.d
266

267
    def SetDict(self, d):
268
        """Sets the dictionary."""
269
        self.d = d
270

271
    def Values(self):
272
        """Gets an unsorted sequence of values.
273

274
        Note: one source of confusion is that the keys of this
275
        dictionary are the values of the Hist/Pmf, and the
276
        values of the dictionary are frequencies/probabilities.
277
        """
278
        return self.d.keys()
279

280
    def Items(self):
281
        """Gets an unsorted sequence of (value, freq/prob) pairs."""
282
        return self.d.items()
283

284
    def Render(self, **options):
285
        """Generates a sequence of points suitable for plotting.
286

287
        Note: options are ignored
288

289
        Returns:
290
            tuple of (sorted value sequence, freq/prob sequence)
291
        """
292
        if min(self.d.keys()) is np.nan:
293
            logging.warning('Hist: contains NaN, may not render correctly.')
294

295
        return zip(*sorted(self.Items()))
296

297
    def MakeCdf(self, label=None):
298
        """Makes a Cdf."""
299
        label = label if label is not None else self.label
300
        return Cdf(self, label=label)
301

302
    def Print(self):
303
        """Prints the values and freqs/probs in ascending order."""
304
        for val, prob in sorted(self.d.items()):
305
            print(val, prob)
306

307
    def Set(self, x, y=0):
308
        """Sets the freq/prob associated with the value x.
309

310
        Args:
311
            x: number value
312
            y: number freq or prob
313
        """
314
        self.d[x] = y
315

316
    def Incr(self, x, term=1):
317
        """Increments the freq/prob associated with the value x.
318

319
        Args:
320
            x: number value
321
            term: how much to increment by
322
        """
323
        self.d[x] = self.d.get(x, 0) + term
324

325
    def Mult(self, x, factor):
326
        """Scales the freq/prob associated with the value x.
327

328
        Args:
329
            x: number value
330
            factor: how much to multiply by
331
        """
332
        self.d[x] = self.d.get(x, 0) * factor
333

334
    def Remove(self, x):
335
        """Removes a value.
336

337
        Throws an exception if the value is not there.
338

339
        Args:
340
            x: value to remove
341
        """
342
        del self.d[x]
343

344
    def Total(self):
345
        """Returns the total of the frequencies/probabilities in the map."""
346
        total = sum(self.d.values())
347
        return total
348

349
    def MaxLike(self):
350
        """Returns the largest frequency/probability in the map."""
351
        return max(self.d.values())
352

353
    def Largest(self, n=10):
354
        """Returns the largest n values, with frequency/probability.
355

356
        n: number of items to return
357
        """
358
        return sorted(self.d.items(), reverse=True)[:n]
359

360
    def Smallest(self, n=10):
361
        """Returns the smallest n values, with frequency/probability.
362

363
        n: number of items to return
364
        """
365
        return sorted(self.d.items(), reverse=False)[:n]
366

367

368
class Hist(_DictWrapper):
369
    """Represents a histogram, which is a map from values to frequencies.
370

371
    Values can be any hashable type; frequencies are integer counters.
372
    """
373
    def Freq(self, x):
374
        """Gets the frequency associated with the value x.
375

376
        Args:
377
            x: number value
378

379
        Returns:
380
            int frequency
381
        """
382
        return self.d.get(x, 0)
383

384
    def Freqs(self, xs):
385
        """Gets frequencies for a sequence of values."""
386
        return [self.Freq(x) for x in xs]
387

388
    def IsSubset(self, other):
389
        """Checks whether the values in this histogram are a subset of
390
        the values in the given histogram."""
391
        for val, freq in self.Items():
392
            if freq > other.Freq(val):
393
                return False
394
        return True
395

396
    def Subtract(self, other):
397
        """Subtracts the values in the given histogram from this histogram."""
398
        for val, freq in other.Items():
399
            self.Incr(val, -freq)
400

401

402
class Pmf(_DictWrapper):
403
    """Represents a probability mass function.
404
    
405
    Values can be any hashable type; probabilities are floating-point.
406
    Pmfs are not necessarily normalized.
407
    """
408

409
    def Prob(self, x, default=0):
410
        """Gets the probability associated with the value x.
411

412
        Args:
413
            x: number value
414
            default: value to return if the key is not there
415

416
        Returns:
417
            float probability
418
        """
419
        return self.d.get(x, default)
420

421
    def Probs(self, xs):
422
        """Gets probabilities for a sequence of values."""
423
        return [self.Prob(x) for x in xs]
424

425
    def Percentile(self, percentage):
426
        """Computes a percentile of a given Pmf.
427

428
        Note: this is not super efficient.  If you are planning
429
        to compute more than a few percentiles, compute the Cdf.
430

431
        percentage: float 0-100
432

433
        returns: value from the Pmf
434
        """
435
        p = percentage / 100.0
436
        total = 0
437
        for val, prob in sorted(self.Items()):
438
            total += prob
439
            if total >= p:
440
                return val
441

442
    def ProbGreater(self, x):
443
        """Probability that a sample from this Pmf exceeds x.
444

445
        x: number
446

447
        returns: float probability
448
        """
449
        if isinstance(x, _DictWrapper):
450
            return PmfProbGreater(self, x)
451
        else:
452
            t = [prob for (val, prob) in self.d.items() if val > x]
453
            return sum(t)
454

455
    def ProbLess(self, x):
456
        """Probability that a sample from this Pmf is less than x.
457

458
        x: number
459

460
        returns: float probability
461
        """
462
        if isinstance(x, _DictWrapper):
463
            return PmfProbLess(self, x)
464
        else:
465
            t = [prob for (val, prob) in self.d.items() if val < x]
466
            return sum(t)
467

468
    def __lt__(self, obj):
469
        """Less than.
470

471
        obj: number or _DictWrapper
472

473
        returns: float probability
474
        """
475
        return self.ProbLess(obj)
476

477
    def __gt__(self, obj):
478
        """Greater than.
479

480
        obj: number or _DictWrapper
481

482
        returns: float probability
483
        """
484
        return self.ProbGreater(obj)
485

486
    def __ge__(self, obj):
487
        """Greater than or equal.
488

489
        obj: number or _DictWrapper
490

491
        returns: float probability
492
        """
493
        return 1 - (self < obj)
494

495
    def __le__(self, obj):
496
        """Less than or equal.
497

498
        obj: number or _DictWrapper
499

500
        returns: float probability
501
        """
502
        return 1 - (self > obj)
503

504
    def Normalize(self, fraction=1.0):
505
        """Normalizes this PMF so the sum of all probs is fraction.
506

507
        Args:
508
            fraction: what the total should be after normalization
509

510
        Returns: the total probability before normalizing
511
        """
512
        if self.log:
513
            raise ValueError("Normalize: Pmf is under a log transform")
514

515
        total = self.Total()
516
        if total == 0.0:
517
            raise ValueError('Normalize: total probability is zero.')
518
            #logging.warning('Normalize: total probability is zero.')
519
            #return total
520

521
        factor = fraction / total
522
        for x in self.d:
523
            self.d[x] *= factor
524

525
        return total
526

527
    def Random(self):
528
        """Chooses a random element from this PMF.
529

530
        Note: this is not very efficient.  If you plan to call
531
        this more than a few times, consider converting to a CDF.
532

533
        Returns:
534
            float value from the Pmf
535
        """
536
        target = random.random()
537
        total = 0.0
538
        for x, p in self.d.items():
539
            total += p
540
            if total >= target:
541
                return x
542

543
        # we shouldn't get here
544
        raise ValueError('Random: Pmf might not be normalized.')
545

546
    def Mean(self):
547
        """Computes the mean of a PMF.
548

549
        Returns:
550
            float mean
551
        """
552
        mean = 0.0
553
        for x, p in self.d.items():
554
            mean += p * x
555
        return mean
556

557
    def Var(self, mu=None):
558
        """Computes the variance of a PMF.
559

560
        mu: the point around which the variance is computed;
561
                if omitted, computes the mean
562

563
        returns: float variance
564
        """
565
        if mu is None:
566
            mu = self.Mean()
567

568
        var = 0.0
569
        for x, p in self.d.items():
570
            var += p * (x - mu) ** 2
571
        return var
572

573
    def Std(self, mu=None):
574
        """Computes the standard deviation of a PMF.
575

576
        mu: the point around which the variance is computed;
577
                if omitted, computes the mean
578

579
        returns: float standard deviation
580
        """
581
        var = self.Var(mu)
582
        return math.sqrt(var)
583

584
    def MaximumLikelihood(self):
585
        """Returns the value with the highest probability.
586

587
        Returns: float probability
588
        """
589
        _, val = max((prob, val) for val, prob in self.Items())
590
        return val
591

592
    def CredibleInterval(self, percentage=90):
593
        """Computes the central credible interval.
594

595
        If percentage=90, computes the 90% CI.
596

597
        Args:
598
            percentage: float between 0 and 100
599

600
        Returns:
601
            sequence of two floats, low and high
602
        """
603
        cdf = self.MakeCdf()
604
        return cdf.CredibleInterval(percentage)
605

606
    def __add__(self, other):
607
        """Computes the Pmf of the sum of values drawn from self and other.
608

609
        other: another Pmf or a scalar
610

611
        returns: new Pmf
612
        """
613
        try:
614
            return self.AddPmf(other)
615
        except AttributeError:
616
            return self.AddConstant(other)
617

618
    def AddPmf(self, other):
619
        """Computes the Pmf of the sum of values drawn from self and other.
620

621
        other: another Pmf
622

623
        returns: new Pmf
624
        """
625
        pmf = Pmf()
626
        for v1, p1 in self.Items():
627
            for v2, p2 in other.Items():
628
                pmf.Incr(v1 + v2, p1 * p2)
629
        return pmf
630

631
    def AddConstant(self, other):
632
        """Computes the Pmf of the sum a constant and values from self.
633

634
        other: a number
635

636
        returns: new Pmf
637
        """
638
        pmf = Pmf()
639
        for v1, p1 in self.Items():
640
            pmf.Set(v1 + other, p1)
641
        return pmf
642

643
    def __sub__(self, other):
644
        """Computes the Pmf of the diff of values drawn from self and other.
645

646
        other: another Pmf
647

648
        returns: new Pmf
649
        """
650
        try:
651
            return self.SubPmf(other)
652
        except AttributeError:
653
            return self.AddConstant(-other)
654

655
    def SubPmf(self, other):
656
        """Computes the Pmf of the diff of values drawn from self and other.
657

658
        other: another Pmf
659

660
        returns: new Pmf
661
        """
662
        pmf = Pmf()
663
        for v1, p1 in self.Items():
664
            for v2, p2 in other.Items():
665
                pmf.Incr(v1 - v2, p1 * p2)
666
        return pmf
667

668
    def __mul__(self, other):
669
        """Computes the Pmf of the product of values drawn from self and other.
670

671
        other: another Pmf
672

673
        returns: new Pmf
674
        """
675
        try:
676
            return self.MulPmf(other)
677
        except AttributeError:
678
            return self.MulConstant(other)
679

680
    def MulPmf(self, other):
681
        """Computes the Pmf of the diff of values drawn from self and other.
682

683
        other: another Pmf
684

685
        returns: new Pmf
686
        """
687
        pmf = Pmf()
688
        for v1, p1 in self.Items():
689
            for v2, p2 in other.Items():
690
                pmf.Incr(v1 * v2, p1 * p2)
691
        return pmf
692

693
    def MulConstant(self, other):
694
        """Computes the Pmf of the product of a constant and values from self.
695

696
        other: a number
697

698
        returns: new Pmf
699
        """
700
        pmf = Pmf()
701
        for v1, p1 in self.Items():
702
            pmf.Set(v1 * other, p1)
703
        return pmf
704

705
    def __div__(self, other):
706
        """Computes the Pmf of the ratio of values drawn from self and other.
707

708
        other: another Pmf
709

710
        returns: new Pmf
711
        """
712
        try:
713
            return self.DivPmf(other)
714
        except AttributeError:
715
            return self.MulConstant(1/other)
716

717
    __truediv__ = __div__
718

719
    def DivPmf(self, other):
720
        """Computes the Pmf of the ratio of values drawn from self and other.
721

722
        other: another Pmf
723

724
        returns: new Pmf
725
        """
726
        pmf = Pmf()
727
        for v1, p1 in self.Items():
728
            for v2, p2 in other.Items():
729
                pmf.Incr(v1 / v2, p1 * p2)
730
        return pmf
731

732
    def Max(self, k):
733
        """Computes the CDF of the maximum of k selections from this dist.
734

735
        k: int
736

737
        returns: new Cdf
738
        """
739
        cdf = self.MakeCdf()
740
        return cdf.Max(k)
741

742

743
class Joint(Pmf):
744
    """Represents a joint distribution.
745

746
    The values are sequences (usually tuples)
747
    """
748

749
    def Marginal(self, i, label=None):
750
        """Gets the marginal distribution of the indicated variable.
751

752
        i: index of the variable we want
753

754
        Returns: Pmf
755
        """
756
        pmf = Pmf(label=label)
757
        for vs, prob in self.Items():
758
            pmf.Incr(vs[i], prob)
759
        return pmf
760

761
    def Conditional(self, i, j, val, label=None):
762
        """Gets the conditional distribution of the indicated variable.
763

764
        Distribution of vs[i], conditioned on vs[j] = val.
765

766
        i: index of the variable we want
767
        j: which variable is conditioned on
768
        val: the value the jth variable has to have
769

770
        Returns: Pmf
771
        """
772
        pmf = Pmf(label=label)
773
        for vs, prob in self.Items():
774
            if vs[j] != val:
775
                continue
776
            pmf.Incr(vs[i], prob)
777

778
        pmf.Normalize()
779
        return pmf
780

781
    def MaxLikeInterval(self, percentage=90):
782
        """Returns the maximum-likelihood credible interval.
783

784
        If percentage=90, computes a 90% CI containing the values
785
        with the highest likelihoods.
786

787
        percentage: float between 0 and 100
788

789
        Returns: list of values from the suite
790
        """
791
        interval = []
792
        total = 0
793

794
        t = [(prob, val) for val, prob in self.Items()]
795
        t.sort(reverse=True)
796

797
        for prob, val in t:
798
            interval.append(val)
799
            total += prob
800
            if total >= percentage / 100.0:
801
                break
802

803
        return interval
804

805

806
def MakeJoint(pmf1, pmf2):
807
    """Joint distribution of values from pmf1 and pmf2.
808

809
    Assumes that the PMFs represent independent random variables.
810

811
    Args:
812
        pmf1: Pmf object
813
        pmf2: Pmf object
814

815
    Returns:
816
        Joint pmf of value pairs
817
    """
818
    joint = Joint()
819
    for v1, p1 in pmf1.Items():
820
        for v2, p2 in pmf2.Items():
821
            joint.Set((v1, v2), p1 * p2)
822
    return joint
823

824

825
def MakeHistFromList(t, label=None):
826
    """Makes a histogram from an unsorted sequence of values.
827

828
    Args:
829
        t: sequence of numbers
830
        label: string label for this histogram
831

832
    Returns:
833
        Hist object
834
    """
835
    return Hist(t, label=label)
836

837

838
def MakeHistFromDict(d, label=None):
839
    """Makes a histogram from a map from values to frequencies.
840

841
    Args:
842
        d: dictionary that maps values to frequencies
843
        label: string label for this histogram
844

845
    Returns:
846
        Hist object
847
    """
848
    return Hist(d, label)
849

850

851
def MakePmfFromList(t, label=None):
852
    """Makes a PMF from an unsorted sequence of values.
853

854
    Args:
855
        t: sequence of numbers
856
        label: string label for this PMF
857

858
    Returns:
859
        Pmf object
860
    """
861
    return Pmf(t, label=label)
862

863

864
def MakePmfFromDict(d, label=None):
865
    """Makes a PMF from a map from values to probabilities.
866

867
    Args:
868
        d: dictionary that maps values to probabilities
869
        label: string label for this PMF
870

871
    Returns:
872
        Pmf object
873
    """
874
    return Pmf(d, label=label)
875

876

877
def MakePmfFromItems(t, label=None):
878
    """Makes a PMF from a sequence of value-probability pairs
879

880
    Args:
881
        t: sequence of value-probability pairs
882
        label: string label for this PMF
883

884
    Returns:
885
        Pmf object
886
    """
887
    return Pmf(dict(t), label=label)
888

889

890
def MakePmfFromHist(hist, label=None):
891
    """Makes a normalized PMF from a Hist object.
892

893
    Args:
894
        hist: Hist object
895
        label: string label
896

897
    Returns:
898
        Pmf object
899
    """
900
    if label is None:
901
        label = hist.label
902

903
    return Pmf(hist, label=label)
904

905

906
def MakeMixture(metapmf, label='mix'):
907
    """Make a mixture distribution.
908

909
    Args:
910
      metapmf: Pmf that maps from Pmfs to probs.
911
      label: string label for the new Pmf.
912

913
    Returns: Pmf object.
914
    """
915
    mix = Pmf(label=label)
916
    for pmf, p1 in metapmf.Items():
917
        for x, p2 in pmf.Items():
918
            mix.Incr(x, p1 * p2)
919
    return mix
920

921

922
def MakeUniformPmf(low, high, n):
923
    """Make a uniform Pmf.
924

925
    low: lowest value (inclusive)
926
    high: highest value (inclusize)
927
    n: number of values
928
    """
929
    pmf = Pmf()
930
    for x in np.linspace(low, high, n):
931
        pmf.Set(x, 1)
932
    pmf.Normalize()
933
    return pmf
934

935

936
class Cdf(object):
937
    """Represents a cumulative distribution function.
938

939
    Attributes:
940
        xs: sequence of values
941
        ps: sequence of probabilities
942
        label: string used as a graph label.
943
    """
944
    def __init__(self, obj=None, ps=None, label=None):
945
        """Initializes.
946
        
947
        If ps is provided, obj must be the corresponding list of values.
948

949
        obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
950
        ps: list of cumulative probabilities
951
        label: string label
952
        """
953
        self.label = label if label is not None else '_nolegend_'
954

955
        if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
956
            if not label:
957
                self.label = label if label is not None else obj.label
958

959
        if obj is None:
960
            # caller does not provide obj, make an empty Cdf
961
            self.xs = np.asarray([])
962
            self.ps = np.asarray([])
963
            if ps is not None:
964
                logging.warning("Cdf: can't pass ps without also passing xs.")
965
            return
966
        else:
967
            # if the caller provides xs and ps, just store them          
968
            if ps is not None:
969
                if isinstance(ps, str):
970
                    logging.warning("Cdf: ps can't be a string")
971

972
                self.xs = np.asarray(obj)
973
                self.ps = np.asarray(ps)
974
                return
975

976
        # caller has provided just obj, not ps
977
        if isinstance(obj, Cdf):
978
            self.xs = copy.copy(obj.xs)
979
            self.ps = copy.copy(obj.ps)
980
            return
981

982
        if isinstance(obj, _DictWrapper):
983
            dw = obj
984
        else:
985
            dw = Hist(obj)
986

987
        if len(dw) == 0:
988
            self.xs = np.asarray([])
989
            self.ps = np.asarray([])
990
            return
991

992
        xs, freqs = zip(*sorted(dw.Items()))
993
        self.xs = np.asarray(xs)
994
        self.ps = np.cumsum(freqs, dtype=np.float)
995
        self.ps /= self.ps[-1]
996

997
    def __str__(self):
998
        return 'Cdf(%s, %s)' % (str(self.xs), str(self.ps))
999

1000
    __repr__ = __str__
1001

1002
    def __len__(self):
1003
        return len(self.xs)
1004

1005
    def __getitem__(self, x):
1006
        return self.Prob(x)
1007

1008
    def __setitem__(self):
1009
        raise UnimplementedMethodException()
1010

1011
    def __delitem__(self):
1012
        raise UnimplementedMethodException()
1013

1014
    def __eq__(self, other):
1015
        return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
1016

1017
    def Copy(self, label=None):
1018
        """Returns a copy of this Cdf.
1019

1020
        label: string label for the new Cdf
1021
        """
1022
        if label is None:
1023
            label = self.label
1024
        return Cdf(list(self.xs), list(self.ps), label=label)
1025

1026
    def MakePmf(self, label=None):
1027
        """Makes a Pmf."""
1028
        if label is None:
1029
            label = self.label
1030
        return Pmf(self, label=label)
1031

1032
    def Values(self):
1033
        """Returns a sorted list of values.
1034
        """
1035
        return self.xs
1036

1037
    def Items(self):
1038
        """Returns a sorted sequence of (value, probability) pairs.
1039

1040
        Note: in Python3, returns an iterator.
1041
        """
1042
        a = self.ps
1043
        b = np.roll(a, 1)
1044
        b[0] = 0
1045
        return zip(self.xs, a-b)
1046

1047
    def Shift(self, term):
1048
        """Adds a term to the xs.
1049

1050
        term: how much to add
1051
        """
1052
        new = self.Copy()
1053
        # don't use +=, or else an int array + float yields int array
1054
        new.xs = new.xs + term
1055
        return new
1056

1057
    def Scale(self, factor):
1058
        """Multiplies the xs by a factor.
1059

1060
        factor: what to multiply by
1061
        """
1062
        new = self.Copy()
1063
        # don't use *=, or else an int array * float yields int array
1064
        new.xs = new.xs * factor
1065
        return new
1066

1067
    def Prob(self, x):
1068
        """Returns CDF(x), the probability that corresponds to value x.
1069

1070
        Args:
1071
            x: number
1072

1073
        Returns:
1074
            float probability
1075
        """
1076
        if x < self.xs[0]:
1077
            return 0.0
1078
        index = bisect.bisect(self.xs, x)
1079
        p = self.ps[index-1]
1080
        return p
1081

1082
    def Probs(self, xs):
1083
        """Gets probabilities for a sequence of values.
1084

1085
        xs: any sequence that can be converted to NumPy array
1086

1087
        returns: NumPy array of cumulative probabilities
1088
        """
1089
        xs = np.asarray(xs)
1090
        index = np.searchsorted(self.xs, xs, side='right')
1091
        ps = self.ps[index-1]
1092
        ps[xs < self.xs[0]] = 0.0
1093
        return ps
1094

1095
    ProbArray = Probs
1096

1097
    def Value(self, p):
1098
        """Returns InverseCDF(p), the value that corresponds to probability p.
1099

1100
        Args:
1101
            p: number in the range [0, 1]
1102

1103
        Returns:
1104
            number value
1105
        """
1106
        if p < 0 or p > 1:
1107
            raise ValueError('Probability p must be in range [0, 1]')
1108

1109
        index = bisect.bisect_left(self.ps, p)
1110
        return self.xs[index]
1111

1112
    def ValueArray(self, ps):
1113
        """Returns InverseCDF(p), the value that corresponds to probability p.
1114

1115
        Args:
1116
            ps: NumPy array of numbers in the range [0, 1]
1117

1118
        Returns:
1119
            NumPy array of values
1120
        """
1121
        ps = np.asarray(ps)
1122
        if np.any(ps < 0) or np.any(ps > 1):
1123
            raise ValueError('Probability p must be in range [0, 1]')
1124

1125
        index = np.searchsorted(self.ps, ps, side='left')
1126
        return self.xs[index]
1127

1128
    def Percentile(self, p):
1129
        """Returns the value that corresponds to percentile p.
1130

1131
        Args:
1132
            p: number in the range [0, 100]
1133

1134
        Returns:
1135
            number value
1136
        """
1137
        return self.Value(p / 100.0)
1138

1139
    def PercentileRank(self, x):
1140
        """Returns the percentile rank of the value x.
1141

1142
        x: potential value in the CDF
1143

1144
        returns: percentile rank in the range 0 to 100
1145
        """
1146
        return self.Prob(x) * 100.0
1147

1148
    def Random(self):
1149
        """Chooses a random value from this distribution."""
1150
        return self.Value(random.random())
1151

1152
    def Sample(self, n):
1153
        """Generates a random sample from this distribution.
1154
        
1155
        n: int length of the sample
1156
        returns: NumPy array
1157
        """
1158
        ps = np.random.random(n)
1159
        return self.ValueArray(ps)
1160

1161
    def Mean(self):
1162
        """Computes the mean of a CDF.
1163

1164
        Returns:
1165
            float mean
1166
        """
1167
        old_p = 0
1168
        total = 0.0
1169
        for x, new_p in zip(self.xs, self.ps):
1170
            p = new_p - old_p
1171
            total += p * x
1172
            old_p = new_p
1173
        return total
1174

1175
    def CredibleInterval(self, percentage=90):
1176
        """Computes the central credible interval.
1177

1178
        If percentage=90, computes the 90% CI.
1179

1180
        Args:
1181
            percentage: float between 0 and 100
1182

1183
        Returns:
1184
            sequence of two floats, low and high
1185
        """
1186
        prob = (1 - percentage / 100.0) / 2
1187
        interval = self.Value(prob), self.Value(1 - prob)
1188
        return interval
1189

1190
    ConfidenceInterval = CredibleInterval
1191

1192
    def _Round(self, multiplier=1000.0):
1193
        """
1194
        An entry is added to the cdf only if the percentile differs
1195
        from the previous value in a significant digit, where the number
1196
        of significant digits is determined by multiplier.  The
1197
        default is 1000, which keeps log10(1000) = 3 significant digits.
1198
        """
1199
        # TODO(write this method)
1200
        raise UnimplementedMethodException()
1201

1202
    def Render(self, **options):
1203
        """Generates a sequence of points suitable for plotting.
1204

1205
        An empirical CDF is a step function; linear interpolation
1206
        can be misleading.
1207

1208
        Note: options are ignored
1209

1210
        Returns:
1211
            tuple of (xs, ps)
1212
        """
1213
        def interleave(a, b):
1214
            c = np.empty(a.shape[0] + b.shape[0])
1215
            c[::2] = a
1216
            c[1::2] = b
1217
            return c
1218

1219
        a = np.array(self.xs)
1220
        xs = interleave(a, a)
1221
        shift_ps = np.roll(self.ps, 1)
1222
        shift_ps[0] = 0
1223
        ps = interleave(shift_ps, self.ps)
1224
        return xs, ps
1225

1226
    def Max(self, k):
1227
        """Computes the CDF of the maximum of k selections from this dist.
1228

1229
        k: int
1230

1231
        returns: new Cdf
1232
        """
1233
        cdf = self.Copy()
1234
        cdf.ps **= k
1235
        return cdf
1236

1237

1238
def MakeCdfFromItems(items, label=None):
1239
    """Makes a cdf from an unsorted sequence of (value, frequency) pairs.
1240

1241
    Args:
1242
        items: unsorted sequence of (value, frequency) pairs
1243
        label: string label for this CDF
1244

1245
    Returns:
1246
        cdf: list of (value, fraction) pairs
1247
    """
1248
    return Cdf(dict(items), label=label)
1249

1250

1251
def MakeCdfFromDict(d, label=None):
1252
    """Makes a CDF from a dictionary that maps values to frequencies.
1253

1254
    Args:
1255
       d: dictionary that maps values to frequencies.
1256
       label: string label for the data.
1257

1258
    Returns:
1259
        Cdf object
1260
    """
1261
    return Cdf(d, label=label)
1262

1263

1264
def MakeCdfFromList(seq, label=None):
1265
    """Creates a CDF from an unsorted sequence.
1266

1267
    Args:
1268
        seq: unsorted sequence of sortable values
1269
        label: string label for the cdf
1270

1271
    Returns:
1272
       Cdf object
1273
    """
1274
    return Cdf(seq, label=label)
1275

1276

1277
def MakeCdfFromHist(hist, label=None):
1278
    """Makes a CDF from a Hist object.
1279

1280
    Args:
1281
       hist: Pmf.Hist object
1282
       label: string label for the data.
1283

1284
    Returns:
1285
        Cdf object
1286
    """
1287
    if label is None:
1288
        label = hist.label
1289

1290
    return Cdf(hist, label=label)
1291

1292

1293
def MakeCdfFromPmf(pmf, label=None):
1294
    """Makes a CDF from a Pmf object.
1295

1296
    Args:
1297
       pmf: Pmf.Pmf object
1298
       label: string label for the data.
1299

1300
    Returns:
1301
        Cdf object
1302
    """
1303
    if label is None:
1304
        label = pmf.label
1305

1306
    return Cdf(pmf, label=label)
1307

1308

1309
class UnimplementedMethodException(Exception):
1310
    """Exception if someone calls a method that should be overridden."""
1311

1312

1313
class Suite(Pmf):
1314
    """Represents a suite of hypotheses and their probabilities."""
1315

1316
    def Update(self, data):
1317
        """Updates each hypothesis based on the data.
1318

1319
        data: any representation of the data
1320

1321
        returns: the normalizing constant
1322
        """
1323
        for hypo in self.Values():
1324
            like = self.Likelihood(data, hypo)
1325
            self.Mult(hypo, like)
1326
        return self.Normalize()
1327

1328
    def LogUpdate(self, data):
1329
        """Updates a suite of hypotheses based on new data.
1330

1331
        Modifies the suite directly; if you want to keep the original, make
1332
        a copy.
1333

1334
        Note: unlike Update, LogUpdate does not normalize.
1335

1336
        Args:
1337
            data: any representation of the data
1338
        """
1339
        for hypo in self.Values():
1340
            like = self.LogLikelihood(data, hypo)
1341
            self.Incr(hypo, like)
1342

1343
    def UpdateSet(self, dataset):
1344
        """Updates each hypothesis based on the dataset.
1345

1346
        This is more efficient than calling Update repeatedly because
1347
        it waits until the end to Normalize.
1348

1349
        Modifies the suite directly; if you want to keep the original, make
1350
        a copy.
1351

1352
        dataset: a sequence of data
1353

1354
        returns: the normalizing constant
1355
        """
1356
        for data in dataset:
1357
            for hypo in self.Values():
1358
                like = self.Likelihood(data, hypo)
1359
                self.Mult(hypo, like)
1360
        return self.Normalize()
1361

1362
    def LogUpdateSet(self, dataset):
1363
        """Updates each hypothesis based on the dataset.
1364

1365
        Modifies the suite directly; if you want to keep the original, make
1366
        a copy.
1367

1368
        dataset: a sequence of data
1369

1370
        returns: None
1371
        """
1372
        for data in dataset:
1373
            self.LogUpdate(data)
1374

1375
    def Likelihood(self, data, hypo):
1376
        """Computes the likelihood of the data under the hypothesis.
1377

1378
        hypo: some representation of the hypothesis
1379
        data: some representation of the data
1380
        """
1381
        raise UnimplementedMethodException()
1382

1383
    def LogLikelihood(self, data, hypo):
1384
        """Computes the log likelihood of the data under the hypothesis.
1385

1386
        hypo: some representation of the hypothesis
1387
        data: some representation of the data
1388
        """
1389
        raise UnimplementedMethodException()
1390

1391
    def Print(self):
1392
        """Prints the hypotheses and their probabilities."""
1393
        for hypo, prob in sorted(self.Items()):
1394
            print(hypo, prob)
1395

1396
    def MakeOdds(self):
1397
        """Transforms from probabilities to odds.
1398

1399
        Values with prob=0 are removed.
1400
        """
1401
        for hypo, prob in self.Items():
1402
            if prob:
1403
                self.Set(hypo, Odds(prob))
1404
            else:
1405
                self.Remove(hypo)
1406

1407
    def MakeProbs(self):
1408
        """Transforms from odds to probabilities."""
1409
        for hypo, odds in self.Items():
1410
            self.Set(hypo, Probability(odds))
1411

1412

1413
def MakeSuiteFromList(t, label=None):
1414
    """Makes a suite from an unsorted sequence of values.
1415

1416
    Args:
1417
        t: sequence of numbers
1418
        label: string label for this suite
1419

1420
    Returns:
1421
        Suite object
1422
    """
1423
    hist = MakeHistFromList(t, label=label)
1424
    d = hist.GetDict()
1425
    return MakeSuiteFromDict(d)
1426

1427

1428
def MakeSuiteFromHist(hist, label=None):
1429
    """Makes a normalized suite from a Hist object.
1430

1431
    Args:
1432
        hist: Hist object
1433
        label: string label
1434

1435
    Returns:
1436
        Suite object
1437
    """
1438
    if label is None:
1439
        label = hist.label
1440

1441
    # make a copy of the dictionary
1442
    d = dict(hist.GetDict())
1443
    return MakeSuiteFromDict(d, label)
1444

1445

1446
def MakeSuiteFromDict(d, label=None):
1447
    """Makes a suite from a map from values to probabilities.
1448

1449
    Args:
1450
        d: dictionary that maps values to probabilities
1451
        label: string label for this suite
1452

1453
    Returns:
1454
        Suite object
1455
    """
1456
    suite = Suite(label=label)
1457
    suite.SetDict(d)
1458
    suite.Normalize()
1459
    return suite
1460

1461

1462
class Pdf(object):
1463
    """Represents a probability density function (PDF)."""
1464

1465
    def Density(self, x):
1466
        """Evaluates this Pdf at x.
1467

1468
        Returns: float or NumPy array of probability density
1469
        """
1470
        raise UnimplementedMethodException()
1471

1472
    def GetLinspace(self):
1473
        """Get a linspace for plotting.
1474

1475
        Not all subclasses of Pdf implement this.
1476

1477
        Returns: numpy array
1478
        """
1479
        raise UnimplementedMethodException()
1480

1481
    def MakePmf(self, **options):
1482
        """Makes a discrete version of this Pdf.
1483

1484
        options can include
1485
        label: string
1486
        low: low end of range
1487
        high: high end of range
1488
        n: number of places to evaluate
1489

1490
        Returns: new Pmf
1491
        """
1492
        label = options.pop('label', '')
1493
        xs, ds = self.Render(**options)
1494
        return Pmf(dict(zip(xs, ds)), label=label)
1495

1496
    def Render(self, **options):
1497
        """Generates a sequence of points suitable for plotting.
1498

1499
        If options includes low and high, it must also include n;
1500
        in that case the density is evaluated an n locations between
1501
        low and high, including both.
1502

1503
        If options includes xs, the density is evaluate at those location.
1504

1505
        Otherwise, self.GetLinspace is invoked to provide the locations.
1506

1507
        Returns:
1508
            tuple of (xs, densities)
1509
        """
1510
        low, high = options.pop('low', None), options.pop('high', None)
1511
        if low is not None and high is not None:
1512
            n = options.pop('n', 101)
1513
            xs = np.linspace(low, high, n)
1514
        else:
1515
            xs = options.pop('xs', None)
1516
            if xs is None:
1517
                xs = self.GetLinspace()
1518
            
1519
        ds = self.Density(xs)
1520
        return xs, ds
1521

1522
    def Items(self):
1523
        """Generates a sequence of (value, probability) pairs.
1524
        """
1525
        return zip(*self.Render())
1526

1527

1528
class NormalPdf(Pdf):
1529
    """Represents the PDF of a Normal distribution."""
1530

1531
    def __init__(self, mu=0, sigma=1, label=None):
1532
        """Constructs a Normal Pdf with given mu and sigma.
1533

1534
        mu: mean
1535
        sigma: standard deviation
1536
        label: string
1537
        """
1538
        self.mu = mu
1539
        self.sigma = sigma
1540
        self.label = label if label is not None else '_nolegend_'
1541

1542
    def __str__(self):
1543
        return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)
1544

1545
    def GetLinspace(self):
1546
        """Get a linspace for plotting.
1547

1548
        Returns: numpy array
1549
        """
1550
        low, high = self.mu-3*self.sigma, self.mu+3*self.sigma
1551
        return np.linspace(low, high, 101)
1552

1553
    def Density(self, xs):
1554
        """Evaluates this Pdf at xs.
1555

1556
        xs: scalar or sequence of floats
1557

1558
        returns: float or NumPy array of probability density
1559
        """
1560
        return stats.norm.pdf(xs, self.mu, self.sigma)
1561

1562

1563
class ExponentialPdf(Pdf):
1564
    """Represents the PDF of an exponential distribution."""
1565

1566
    def __init__(self, lam=1, label=None):
1567
        """Constructs an exponential Pdf with given parameter.
1568

1569
        lam: rate parameter
1570
        label: string
1571
        """
1572
        self.lam = lam
1573
        self.label = label if label is not None else '_nolegend_'
1574

1575
    def __str__(self):
1576
        return 'ExponentialPdf(%f)' % (self.lam)
1577

1578
    def GetLinspace(self):
1579
        """Get a linspace for plotting.
1580

1581
        Returns: numpy array
1582
        """
1583
        low, high = 0, 5.0/self.lam
1584
        return np.linspace(low, high, 101)
1585

1586
    def Density(self, xs):
1587
        """Evaluates this Pdf at xs.
1588

1589
        xs: scalar or sequence of floats
1590

1591
        returns: float or NumPy array of probability density
1592
        """
1593
        return stats.expon.pdf(xs, scale=1.0/self.lam)
1594

1595

1596
class EstimatedPdf(Pdf):
1597
    """Represents a PDF estimated by KDE."""
1598

1599
    def __init__(self, sample, label=None):
1600
        """Estimates the density function based on a sample.
1601

1602
        sample: sequence of data
1603
        label: string
1604
        """
1605
        self.label = label if label is not None else '_nolegend_'
1606
        self.kde = stats.gaussian_kde(sample)
1607
        low = min(sample)
1608
        high = max(sample)
1609
        self.linspace = np.linspace(low, high, 101)
1610

1611
    def __str__(self):
1612
        return 'EstimatedPdf(label=%s)' % str(self.label)
1613

1614
    def GetLinspace(self):
1615
        """Get a linspace for plotting.
1616

1617
        Returns: numpy array
1618
        """
1619
        return self.linspace
1620

1621
    def Density(self, xs):
1622
        """Evaluates this Pdf at xs.
1623

1624
        returns: float or NumPy array of probability density
1625
        """
1626
        return self.kde.evaluate(xs)
1627

1628

1629
def CredibleInterval(pmf, percentage=90):
1630
    """Computes a credible interval for a given distribution.
1631

1632
    If percentage=90, computes the 90% CI.
1633

1634
    Args:
1635
        pmf: Pmf object representing a posterior distribution
1636
        percentage: float between 0 and 100
1637

1638
    Returns:
1639
        sequence of two floats, low and high
1640
    """
1641
    cdf = pmf.MakeCdf()
1642
    prob = (1 - percentage / 100.0) / 2
1643
    interval = cdf.Value(prob), cdf.Value(1 - prob)
1644
    return interval
1645

1646

1647
def PmfProbLess(pmf1, pmf2):
1648
    """Probability that a value from pmf1 is less than a value from pmf2.
1649

1650
    Args:
1651
        pmf1: Pmf object
1652
        pmf2: Pmf object
1653

1654
    Returns:
1655
        float probability
1656
    """
1657
    total = 0.0
1658
    for v1, p1 in pmf1.Items():
1659
        for v2, p2 in pmf2.Items():
1660
            if v1 < v2:
1661
                total += p1 * p2
1662
    return total
1663

1664

1665
def PmfProbGreater(pmf1, pmf2):
1666
    """Probability that a value from pmf1 is less than a value from pmf2.
1667

1668
    Args:
1669
        pmf1: Pmf object
1670
        pmf2: Pmf object
1671

1672
    Returns:
1673
        float probability
1674
    """
1675
    total = 0.0
1676
    for v1, p1 in pmf1.Items():
1677
        for v2, p2 in pmf2.Items():
1678
            if v1 > v2:
1679
                total += p1 * p2
1680
    return total
1681

1682

1683
def PmfProbEqual(pmf1, pmf2):
1684
    """Probability that a value from pmf1 equals a value from pmf2.
1685

1686
    Args:
1687
        pmf1: Pmf object
1688
        pmf2: Pmf object
1689

1690
    Returns:
1691
        float probability
1692
    """
1693
    total = 0.0
1694
    for v1, p1 in pmf1.Items():
1695
        for v2, p2 in pmf2.Items():
1696
            if v1 == v2:
1697
                total += p1 * p2
1698
    return total
1699

1700

1701
def RandomSum(dists):
1702
    """Chooses a random value from each dist and returns the sum.
1703

1704
    dists: sequence of Pmf or Cdf objects
1705

1706
    returns: numerical sum
1707
    """
1708
    total = sum(dist.Random() for dist in dists)
1709
    return total
1710

1711

1712
def SampleSum(dists, n):
1713
    """Draws a sample of sums from a list of distributions.
1714

1715
    dists: sequence of Pmf or Cdf objects
1716
    n: sample size
1717

1718
    returns: new Pmf of sums
1719
    """
1720
    pmf = Pmf(RandomSum(dists) for i in range(n))
1721
    return pmf
1722

1723

1724
def EvalNormalPdf(x, mu, sigma):
1725
    """Computes the unnormalized PDF of the normal distribution.
1726

1727
    x: value
1728
    mu: mean
1729
    sigma: standard deviation
1730
    
1731
    returns: float probability density
1732
    """
1733
    return stats.norm.pdf(x, mu, sigma)
1734

1735

1736
def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
1737
    """Makes a PMF discrete approx to a Normal distribution.
1738
    
1739
    mu: float mean
1740
    sigma: float standard deviation
1741
    num_sigmas: how many sigmas to extend in each direction
1742
    n: number of values in the Pmf
1743

1744
    returns: normalized Pmf
1745
    """
1746
    pmf = Pmf()
1747
    low = mu - num_sigmas * sigma
1748
    high = mu + num_sigmas * sigma
1749

1750
    for x in np.linspace(low, high, n):
1751
        p = EvalNormalPdf(x, mu, sigma)
1752
        pmf.Set(x, p)
1753
    pmf.Normalize()
1754
    return pmf
1755

1756

1757
def EvalBinomialPmf(k, n, p):
1758
    """Evaluates the binomial PMF.
1759

1760
    Returns the probabily of k successes in n trials with probability p.
1761
    """
1762
    return stats.binom.pmf(k, n, p)
1763
    
1764

1765
def EvalHypergeomPmf(k, N, K, n):
1766
    """Evaluates the hypergeometric PMF.
1767

1768
    Returns the probabily of k successes in n trials from a population
1769
    N with K successes in it.
1770
    """
1771
    return stats.hypergeom.pmf(k, N, K, n)
1772
    
1773

1774
def EvalPoissonPmf(k, lam):
1775
    """Computes the Poisson PMF.
1776

1777
    k: number of events
1778
    lam: parameter lambda in events per unit time
1779

1780
    returns: float probability
1781
    """
1782
    # don't use the scipy function (yet).  for lam=0 it returns NaN;
1783
    # should be 0.0
1784
    # return stats.poisson.pmf(k, lam)
1785
    return lam ** k * math.exp(-lam) / special.gamma(k+1)
1786

1787

1788
def MakePoissonPmf(lam, high, step=1):
1789
    """Makes a PMF discrete approx to a Poisson distribution.
1790

1791
    lam: parameter lambda in events per unit time
1792
    high: upper bound of the Pmf
1793

1794
    returns: normalized Pmf
1795
    """
1796
    pmf = Pmf()
1797
    for k in range(0, high + 1, step):
1798
        p = EvalPoissonPmf(k, lam)
1799
        pmf.Set(k, p)
1800
    pmf.Normalize()
1801
    return pmf
1802

1803

1804
def EvalExponentialPdf(x, lam):
1805
    """Computes the exponential PDF.
1806

1807
    x: value
1808
    lam: parameter lambda in events per unit time
1809

1810
    returns: float probability density
1811
    """
1812
    return lam * math.exp(-lam * x)
1813

1814

1815
def EvalExponentialCdf(x, lam):
1816
    """Evaluates CDF of the exponential distribution with parameter lam."""
1817
    return 1 - math.exp(-lam * x)
1818

1819

1820
def MakeExponentialPmf(lam, high, n=200):
1821
    """Makes a PMF discrete approx to an exponential distribution.
1822

1823
    lam: parameter lambda in events per unit time
1824
    high: upper bound
1825
    n: number of values in the Pmf
1826

1827
    returns: normalized Pmf
1828
    """
1829
    pmf = Pmf()
1830
    for x in np.linspace(0, high, n):
1831
        p = EvalExponentialPdf(x, lam)
1832
        pmf.Set(x, p)
1833
    pmf.Normalize()
1834
    return pmf
1835

1836

1837
def StandardNormalCdf(x):
1838
    """Evaluates the CDF of the standard Normal distribution.
1839
    
1840
    See http://en.wikipedia.org/wiki/Normal_distribution
1841
    #Cumulative_distribution_function
1842

1843
    Args:
1844
        x: float
1845
                
1846
    Returns:
1847
        float
1848
    """
1849
    return (math.erf(x / ROOT2) + 1) / 2
1850

1851

1852
def EvalNormalCdf(x, mu=0, sigma=1):
1853
    """Evaluates the CDF of the normal distribution.
1854
    
1855
    Args:
1856
        x: float
1857

1858
        mu: mean parameter
1859
        
1860
        sigma: standard deviation parameter
1861
                
1862
    Returns:
1863
        float
1864
    """
1865
    return stats.norm.cdf(x, loc=mu, scale=sigma)
1866

1867

1868
def EvalNormalCdfInverse(p, mu=0, sigma=1):
1869
    """Evaluates the inverse CDF of the normal distribution.
1870

1871
    See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function  
1872

1873
    Args:
1874
        p: float
1875

1876
        mu: mean parameter
1877
        
1878
        sigma: standard deviation parameter
1879
                
1880
    Returns:
1881
        float
1882
    """
1883
    return stats.norm.ppf(p, loc=mu, scale=sigma)
1884

1885

1886
def EvalLognormalCdf(x, mu=0, sigma=1):
1887
    """Evaluates the CDF of the lognormal distribution.
1888
    
1889
    x: float or sequence
1890
    mu: mean parameter
1891
    sigma: standard deviation parameter
1892
                
1893
    Returns: float or sequence
1894
    """
1895
    return stats.lognorm.cdf(x, loc=mu, scale=sigma)
1896

1897

1898
def RenderExpoCdf(lam, low, high, n=101):
1899
    """Generates sequences of xs and ps for an exponential CDF.
1900

1901
    lam: parameter
1902
    low: float
1903
    high: float
1904
    n: number of points to render
1905

1906
    returns: numpy arrays (xs, ps)
1907
    """
1908
    xs = np.linspace(low, high, n)
1909
    ps = 1 - np.exp(-lam * xs)
1910
    #ps = stats.expon.cdf(xs, scale=1.0/lam)
1911
    return xs, ps
1912

1913

1914
def RenderNormalCdf(mu, sigma, low, high, n=101):
1915
    """Generates sequences of xs and ps for a Normal CDF.
1916

1917
    mu: parameter
1918
    sigma: parameter
1919
    low: float
1920
    high: float
1921
    n: number of points to render
1922

1923
    returns: numpy arrays (xs, ps)
1924
    """
1925
    xs = np.linspace(low, high, n)
1926
    ps = stats.norm.cdf(xs, mu, sigma)
1927
    return xs, ps
1928

1929

1930
def RenderParetoCdf(xmin, alpha, low, high, n=50):
1931
    """Generates sequences of xs and ps for a Pareto CDF.
1932

1933
    xmin: parameter
1934
    alpha: parameter
1935
    low: float
1936
    high: float
1937
    n: number of points to render
1938

1939
    returns: numpy arrays (xs, ps)
1940
    """
1941
    if low < xmin:
1942
        low = xmin
1943
    xs = np.linspace(low, high, n)
1944
    ps = 1 - (xs / xmin) ** -alpha
1945
    #ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)
1946
    return xs, ps
1947

1948

1949
class Beta(object):
1950
    """Represents a Beta distribution.
1951

1952
    See http://en.wikipedia.org/wiki/Beta_distribution
1953
    """
1954
    def __init__(self, alpha=1, beta=1, label=None):
1955
        """Initializes a Beta distribution."""
1956
        self.alpha = alpha
1957
        self.beta = beta
1958
        self.label = label if label is not None else '_nolegend_'
1959

1960
    def Update(self, data):
1961
        """Updates a Beta distribution.
1962

1963
        data: pair of int (heads, tails)
1964
        """
1965
        heads, tails = data
1966
        self.alpha += heads
1967
        self.beta += tails
1968

1969
    def Mean(self):
1970
        """Computes the mean of this distribution."""
1971
        return self.alpha / (self.alpha + self.beta)
1972

1973
    def Random(self):
1974
        """Generates a random variate from this distribution."""
1975
        return random.betavariate(self.alpha, self.beta)
1976

1977
    def Sample(self, n):
1978
        """Generates a random sample from this distribution.
1979

1980
        n: int sample size
1981
        """
1982
        size = n,
1983
        return np.random.beta(self.alpha, self.beta, size)
1984

1985
    def EvalPdf(self, x):
1986
        """Evaluates the PDF at x."""
1987
        return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)
1988

1989
    def MakePmf(self, steps=101, label=None):
1990
        """Returns a Pmf of this distribution.
1991

1992
        Note: Normally, we just evaluate the PDF at a sequence
1993
        of points and treat the probability density as a probability
1994
        mass.
1995

1996
        But if alpha or beta is less than one, we have to be
1997
        more careful because the PDF goes to infinity at x=0
1998
        and x=1.  In that case we evaluate the CDF and compute
1999
        differences.
2000
        """
2001
        if self.alpha < 1 or self.beta < 1:
2002
            cdf = self.MakeCdf()
2003
            pmf = cdf.MakePmf()
2004
            return pmf
2005

2006
        xs = [i / (steps - 1.0) for i in range(steps)]
2007
        probs = [self.EvalPdf(x) for x in xs]
2008
        pmf = Pmf(dict(zip(xs, probs)), label=label)
2009
        return pmf
2010

2011
    def MakeCdf(self, steps=101):
2012
        """Returns the CDF of this distribution."""
2013
        xs = [i / (steps - 1.0) for i in range(steps)]
2014
        ps = [special.betainc(self.alpha, self.beta, x) for x in xs]
2015
        cdf = Cdf(xs, ps)
2016
        return cdf
2017

2018

2019
class Dirichlet(object):
2020
    """Represents a Dirichlet distribution.
2021

2022
    See http://en.wikipedia.org/wiki/Dirichlet_distribution
2023
    """
2024

2025
    def __init__(self, n, conc=1, label=None):
2026
        """Initializes a Dirichlet distribution.
2027

2028
        n: number of dimensions
2029
        conc: concentration parameter (smaller yields more concentration)
2030
        label: string label
2031
        """
2032
        if n < 2:
2033
            raise ValueError('A Dirichlet distribution with '
2034
                             'n<2 makes no sense')
2035

2036
        self.n = n
2037
        self.params = np.ones(n, dtype=np.float) * conc
2038
        self.label = label if label is not None else '_nolegend_'
2039

2040
    def Update(self, data):
2041
        """Updates a Dirichlet distribution.
2042

2043
        data: sequence of observations, in order corresponding to params
2044
        """
2045
        m = len(data)
2046
        self.params[:m] += data
2047

2048
    def Random(self):
2049
        """Generates a random variate from this distribution.
2050

2051
        Returns: normalized vector of fractions
2052
        """
2053
        p = np.random.gamma(self.params)
2054
        return p / p.sum()
2055

2056
    def Likelihood(self, data):
2057
        """Computes the likelihood of the data.
2058

2059
        Selects a random vector of probabilities from this distribution.
2060

2061
        Returns: float probability
2062
        """
2063
        m = len(data)
2064
        if self.n < m:
2065
            return 0
2066

2067
        x = data
2068
        p = self.Random()
2069
        q = p[:m] ** x
2070
        return q.prod()
2071

2072
    def LogLikelihood(self, data):
2073
        """Computes the log likelihood of the data.
2074

2075
        Selects a random vector of probabilities from this distribution.
2076

2077
        Returns: float log probability
2078
        """
2079
        m = len(data)
2080
        if self.n < m:
2081
            return float('-inf')
2082

2083
        x = self.Random()
2084
        y = np.log(x[:m]) * data
2085
        return y.sum()
2086

2087
    def MarginalBeta(self, i):
2088
        """Computes the marginal distribution of the ith element.
2089

2090
        See http://en.wikipedia.org/wiki/Dirichlet_distribution
2091
        #Marginal_distributions
2092

2093
        i: int
2094

2095
        Returns: Beta object
2096
        """
2097
        alpha0 = self.params.sum()
2098
        alpha = self.params[i]
2099
        return Beta(alpha, alpha0 - alpha)
2100

2101
    def PredictivePmf(self, xs, label=None):
2102
        """Makes a predictive distribution.
2103

2104
        xs: values to go into the Pmf
2105

2106
        Returns: Pmf that maps from x to the mean prevalence of x
2107
        """
2108
        alpha0 = self.params.sum()
2109
        ps = self.params / alpha0
2110
        return Pmf(zip(xs, ps), label=label)
2111

2112

2113
def BinomialCoef(n, k):
2114
    """Compute the binomial coefficient "n choose k".
2115

2116
    n: number of trials
2117
    k: number of successes
2118

2119
    Returns: float
2120
    """
2121
    return scipy.misc.comb(n, k)
2122

2123

2124
def LogBinomialCoef(n, k):
2125
    """Computes the log of the binomial coefficient.
2126

2127
    http://math.stackexchange.com/questions/64716/
2128
    approximating-the-logarithm-of-the-binomial-coefficient
2129

2130
    n: number of trials
2131
    k: number of successes
2132

2133
    Returns: float
2134
    """
2135
    return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)
2136

2137

2138
def NormalProbability(ys, jitter=0.0):
2139
    """Generates data for a normal probability plot.
2140

2141
    ys: sequence of values
2142
    jitter: float magnitude of jitter added to the ys 
2143

2144
    returns: numpy arrays xs, ys
2145
    """
2146
    n = len(ys)
2147
    xs = np.random.normal(0, 1, n)
2148
    xs.sort()
2149
    
2150
    if jitter:
2151
        ys = Jitter(ys, jitter)
2152
    else:
2153
        ys = np.array(ys)
2154
    ys.sort()
2155

2156
    return xs, ys
2157

2158

2159
def Jitter(values, jitter=0.5):
2160
    """Jitters the values by adding a uniform variate in (-jitter, jitter).
2161

2162
    values: sequence
2163
    jitter: scalar magnitude of jitter
2164
    
2165
    returns: new numpy array
2166
    """
2167
    n = len(values)
2168
    return np.random.uniform(-jitter, +jitter, n) + values
2169

2170

2171
def NormalProbabilityPlot(sample, fit_color='0.8', **options):
2172
    """Makes a normal probability plot with a fitted line.
2173

2174
    sample: sequence of numbers
2175
    fit_color: color string for the fitted line
2176
    options: passed along to Plot
2177
    """
2178
    xs, ys = NormalProbability(sample)
2179
    mean, var = MeanVar(sample)
2180
    std = math.sqrt(var)
2181

2182
    fit = FitLine(xs, mean, std)
2183
    thinkplot.Plot(*fit, color=fit_color, label='model')
2184

2185
    xs, ys = NormalProbability(sample)
2186
    thinkplot.Plot(xs, ys, **options)
2187

2188
 
2189
def Mean(xs):
2190
    """Computes mean.
2191

2192
    xs: sequence of values
2193

2194
    returns: float mean
2195
    """
2196
    return np.mean(xs)
2197

2198

2199
def Var(xs, mu=None, ddof=0):
2200
    """Computes variance.
2201

2202
    xs: sequence of values
2203
    mu: option known mean
2204
    ddof: delta degrees of freedom
2205

2206
    returns: float
2207
    """
2208
    xs = np.asarray(xs)
2209

2210
    if mu is None:
2211
        mu = xs.mean()
2212

2213
    ds = xs - mu
2214
    return np.dot(ds, ds) / (len(xs) - ddof)
2215

2216

2217
def Std(xs, mu=None, ddof=0):
2218
    """Computes standard deviation.
2219

2220
    xs: sequence of values
2221
    mu: option known mean
2222
    ddof: delta degrees of freedom
2223

2224
    returns: float
2225
    """
2226
    var = Var(xs, mu, ddof)
2227
    return math.sqrt(var)
2228

2229

2230
def MeanVar(xs, ddof=0):
2231
    """Computes mean and variance.
2232

2233
    Based on http://stackoverflow.com/questions/19391149/
2234
    numpy-mean-and-variance-from-single-function
2235

2236
    xs: sequence of values
2237
    ddof: delta degrees of freedom
2238
    
2239
    returns: pair of float, mean and var
2240
    """
2241
    xs = np.asarray(xs)
2242
    mean = xs.mean()
2243
    s2 = Var(xs, mean, ddof)
2244
    return mean, s2
2245

2246

2247
def Trim(t, p=0.01):
2248
    """Trims the largest and smallest elements of t.
2249

2250
    Args:
2251
        t: sequence of numbers
2252
        p: fraction of values to trim off each end
2253

2254
    Returns:
2255
        sequence of values
2256
    """
2257
    n = int(p * len(t))
2258
    t = sorted(t)[n:-n]
2259
    return t
2260

2261

2262
def TrimmedMean(t, p=0.01):
2263
    """Computes the trimmed mean of a sequence of numbers.
2264

2265
    Args:
2266
        t: sequence of numbers
2267
        p: fraction of values to trim off each end
2268

2269
    Returns:
2270
        float
2271
    """
2272
    t = Trim(t, p)
2273
    return Mean(t)
2274

2275

2276
def TrimmedMeanVar(t, p=0.01):
2277
    """Computes the trimmed mean and variance of a sequence of numbers.
2278

2279
    Side effect: sorts the list.
2280

2281
    Args:
2282
        t: sequence of numbers
2283
        p: fraction of values to trim off each end
2284

2285
    Returns:
2286
        float
2287
    """
2288
    t = Trim(t, p)
2289
    mu, var = MeanVar(t)
2290
    return mu, var
2291

2292

2293
def CohenEffectSize(group1, group2):
2294
    """Compute Cohen's d.
2295

2296
    group1: Series or NumPy array
2297
    group2: Series or NumPy array
2298

2299
    returns: float
2300
    """
2301
    diff = group1.mean() - group2.mean()
2302

2303
    n1, n2 = len(group1), len(group2)
2304
    var1 = group1.var()
2305
    var2 = group2.var()
2306

2307
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
2308
    d = diff / math.sqrt(pooled_var)
2309
    return d
2310

2311

2312
def Cov(xs, ys, meanx=None, meany=None):
2313
    """Computes Cov(X, Y).
2314

2315
    Args:
2316
        xs: sequence of values
2317
        ys: sequence of values
2318
        meanx: optional float mean of xs
2319
        meany: optional float mean of ys
2320

2321
    Returns:
2322
        Cov(X, Y)
2323
    """
2324
    xs = np.asarray(xs)
2325
    ys = np.asarray(ys)
2326

2327
    if meanx is None:
2328
        meanx = np.mean(xs)
2329
    if meany is None:
2330
        meany = np.mean(ys)
2331

2332
    cov = np.dot(xs-meanx, ys-meany) / len(xs)
2333
    return cov
2334

2335

2336
def Corr(xs, ys):
2337
    """Computes Corr(X, Y).
2338

2339
    Args:
2340
        xs: sequence of values
2341
        ys: sequence of values
2342

2343
    Returns:
2344
        Corr(X, Y)
2345
    """
2346
    xs = np.asarray(xs)
2347
    ys = np.asarray(ys)
2348

2349
    meanx, varx = MeanVar(xs)
2350
    meany, vary = MeanVar(ys)
2351

2352
    corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)
2353

2354
    return corr
2355

2356

2357
def SerialCorr(series, lag=1):
2358
    """Computes the serial correlation of a series.
2359

2360
    series: Series
2361
    lag: integer number of intervals to shift
2362

2363
    returns: float correlation
2364
    """
2365
    xs = series[lag:]
2366
    ys = series.shift(lag)[lag:]
2367
    corr = Corr(xs, ys)
2368
    return corr
2369

2370

2371
def SpearmanCorr(xs, ys):
2372
    """Computes Spearman's rank correlation.
2373

2374
    Args:
2375
        xs: sequence of values
2376
        ys: sequence of values
2377

2378
    Returns:
2379
        float Spearman's correlation
2380
    """
2381
    xranks = pandas.Series(xs).rank()
2382
    yranks = pandas.Series(ys).rank()
2383
    return Corr(xranks, yranks)
2384

2385

2386
def MapToRanks(t):
2387
    """Returns a list of ranks corresponding to the elements in t.
2388

2389
    Args:
2390
        t: sequence of numbers
2391
    
2392
    Returns:
2393
        list of integer ranks, starting at 1
2394
    """
2395
    # pair up each value with its index
2396
    pairs = enumerate(t)
2397
    
2398
    # sort by value
2399
    sorted_pairs = sorted(pairs, key=itemgetter(1))
2400

2401
    # pair up each pair with its rank
2402
    ranked = enumerate(sorted_pairs)
2403

2404
    # sort by index
2405
    resorted = sorted(ranked, key=lambda trip: trip[1][0])
2406

2407
    # extract the ranks
2408
    ranks = [trip[0]+1 for trip in resorted]
2409
    return ranks
2410

2411

2412
def LeastSquares(xs, ys):
2413
    """Computes a linear least squares fit for ys as a function of xs.
2414

2415
    Args:
2416
        xs: sequence of values
2417
        ys: sequence of values
2418

2419
    Returns:
2420
        tuple of (intercept, slope)
2421
    """
2422
    meanx, varx = MeanVar(xs)
2423
    meany = Mean(ys)
2424

2425
    slope = Cov(xs, ys, meanx, meany) / varx
2426
    inter = meany - slope * meanx
2427

2428
    return inter, slope
2429

2430

2431
def FitLine(xs, inter, slope):
2432
    """Fits a line to the given data.
2433

2434
    xs: sequence of x
2435

2436
    returns: tuple of numpy arrays (sorted xs, fit ys)
2437
    """
2438
    fit_xs = np.sort(xs)
2439
    fit_ys = inter + slope * fit_xs
2440
    return fit_xs, fit_ys
2441

2442

2443
def Residuals(xs, ys, inter, slope):
2444
    """Computes residuals for a linear fit with parameters inter and slope.
2445

2446
    Args:
2447
        xs: independent variable
2448
        ys: dependent variable
2449
        inter: float intercept
2450
        slope: float slope
2451

2452
    Returns:
2453
        list of residuals
2454
    """
2455
    xs = np.asarray(xs)
2456
    ys = np.asarray(ys)
2457
    res = ys - (inter + slope * xs)
2458
    return res
2459

2460

2461
def CoefDetermination(ys, res):
2462
    """Computes the coefficient of determination (R^2) for given residuals.
2463

2464
    Args:
2465
        ys: dependent variable
2466
        res: residuals
2467
        
2468
    Returns:
2469
        float coefficient of determination
2470
    """
2471
    return 1 - Var(res) / Var(ys)
2472

2473

2474
def CorrelatedGenerator(rho):
2475
    """Generates standard normal variates with serial correlation.
2476

2477
    rho: target coefficient of correlation
2478

2479
    Returns: iterable
2480
    """
2481
    x = random.gauss(0, 1)
2482
    yield x
2483

2484
    sigma = math.sqrt(1 - rho**2)
2485
    while True:
2486
        x = random.gauss(x * rho, sigma)
2487
        yield x
2488

2489

2490
def CorrelatedNormalGenerator(mu, sigma, rho):
2491
    """Generates normal variates with serial correlation.
2492

2493
    mu: mean of variate
2494
    sigma: standard deviation of variate
2495
    rho: target coefficient of correlation
2496

2497
    Returns: iterable
2498
    """
2499
    for x in CorrelatedGenerator(rho):
2500
        yield x * sigma + mu
2501

2502

2503
def RawMoment(xs, k):
2504
    """Computes the kth raw moment of xs.
2505
    """
2506
    return sum(x**k for x in xs) / len(xs)
2507

2508

2509
def CentralMoment(xs, k):
2510
    """Computes the kth central moment of xs.
2511
    """
2512
    mean = RawMoment(xs, 1)
2513
    return sum((x - mean)**k for x in xs) / len(xs)
2514

2515

2516
def StandardizedMoment(xs, k):
2517
    """Computes the kth standardized moment of xs.
2518
    """
2519
    var = CentralMoment(xs, 2)
2520
    std = math.sqrt(var)
2521
    return CentralMoment(xs, k) / std**k
2522

2523

2524
def Skewness(xs):
2525
    """Computes skewness.
2526
    """
2527
    return StandardizedMoment(xs, 3)
2528

2529

2530
def Median(xs):
2531
    """Computes the median (50th percentile) of a sequence.
2532

2533
    xs: sequence or anything else that can initialize a Cdf
2534

2535
    returns: float
2536
    """
2537
    cdf = Cdf(xs)
2538
    return cdf.Value(0.5)
2539

2540

2541
def IQR(xs):
2542
    """Computes the interquartile of a sequence.
2543

2544
    xs: sequence or anything else that can initialize a Cdf
2545

2546
    returns: pair of floats
2547
    """
2548
    cdf = Cdf(xs)
2549
    return cdf.Value(0.25), cdf.Value(0.75)
2550

2551

2552
def PearsonMedianSkewness(xs):
2553
    """Computes the Pearson median skewness.
2554
    """
2555
    median = Median(xs)
2556
    mean = RawMoment(xs, 1)
2557
    var = CentralMoment(xs, 2)
2558
    std = math.sqrt(var)
2559
    gp = 3 * (mean - median) / std
2560
    return gp
2561

2562

2563
class FixedWidthVariables(object):
2564
    """Represents a set of variables in a fixed width file."""
2565

2566
    def __init__(self, variables, index_base=0):
2567
        """Initializes.
2568

2569
        variables: DataFrame
2570
        index_base: are the indices 0 or 1 based?
2571

2572
        Attributes:
2573
        colspecs: list of (start, end) index tuples
2574
        names: list of string variable names
2575
        """
2576
        self.variables = variables
2577

2578
        # note: by default, subtract 1 from colspecs
2579
        self.colspecs = variables[['start', 'end']] - index_base
2580

2581
        # convert colspecs to a list of pair of int
2582
        self.colspecs = self.colspecs.astype(np.int).values.tolist()
2583
        self.names = variables['name']
2584

2585
    def ReadFixedWidth(self, filename, **options):
2586
        """Reads a fixed width ASCII file.
2587

2588
        filename: string filename
2589

2590
        returns: DataFrame
2591
        """
2592
        df = pandas.read_fwf(filename,
2593
                             colspecs=self.colspecs, 
2594
                             names=self.names,
2595
                             **options)
2596
        return df
2597

2598

2599
def ReadStataDct(dct_file, **options):
2600
    """Reads a Stata dictionary file.
2601

2602
    dct_file: string filename
2603
    options: dict of options passed to open()
2604

2605
    returns: FixedWidthVariables object
2606
    """
2607
    type_map = dict(byte=int, int=int, long=int, float=float, double=float)
2608

2609
    var_info = []
2610
    for line in open(dct_file, **options):
2611
        match = re.search( r'_column\(([^)]*)\)', line)
2612
        if match:
2613
            start = int(match.group(1))
2614
            t = line.split()
2615
            vtype, name, fstring = t[1:4]
2616
            name = name.lower()
2617
            if vtype.startswith('str'):
2618
                vtype = str
2619
            else:
2620
                vtype = type_map[vtype]
2621
            long_desc = ' '.join(t[4:]).strip('"')
2622
            var_info.append((start, vtype, name, fstring, long_desc))
2623
            
2624
    columns = ['start', 'type', 'name', 'fstring', 'desc']
2625
    variables = pandas.DataFrame(var_info, columns=columns)
2626

2627
    # fill in the end column by shifting the start column
2628
    variables['end'] = variables.start.shift(-1)
2629
    variables.loc[len(variables)-1, 'end'] = 0
2630

2631
    dct = FixedWidthVariables(variables, index_base=1)
2632
    return dct
2633

2634

2635
def Resample(xs, n=None):
2636
    """Draw a sample from xs with the same length as xs.
2637

2638
    xs: sequence
2639
    n: sample size (default: len(xs))
2640

2641
    returns: NumPy array
2642
    """
2643
    if n is None:
2644
        n = len(xs)
2645
    return np.random.choice(xs, n, replace=True)
2646

2647

2648
def SampleRows(df, nrows, replace=False):
2649
    """Choose a sample of rows from a DataFrame.
2650

2651
    df: DataFrame
2652
    nrows: number of rows
2653
    replace: whether to sample with replacement
2654

2655
    returns: DataDf
2656
    """
2657
    indices = np.random.choice(df.index, nrows, replace=replace)
2658
    sample = df.loc[indices]
2659
    return sample
2660

2661

2662
def ResampleRows(df):
2663
    """Resamples rows from a DataFrame.
2664

2665
    df: DataFrame
2666

2667
    returns: DataFrame
2668
    """
2669
    return SampleRows(df, len(df), replace=True)
2670

2671

2672
def ResampleRowsWeighted(df, column='finalwgt'):
2673
    """Resamples a DataFrame using probabilities proportional to given column.
2674

2675
    df: DataFrame
2676
    column: string column name to use as weights
2677

2678
    returns: DataFrame
2679
    """
2680
    weights = df[column]
2681
    cdf = Cdf(dict(weights))
2682
    indices = cdf.Sample(len(weights))
2683
    sample = df.loc[indices]
2684
    return sample
2685

2686

2687
def PercentileRow(array, p):
2688
    """Selects the row from a sorted array that maps to percentile p.
2689

2690
    p: float 0--100
2691

2692
    returns: NumPy array (one row)
2693
    """
2694
    rows, cols = array.shape
2695
    index = int(rows * p / 100)
2696
    return array[index,]
2697

2698

2699
def PercentileRows(ys_seq, percents):
2700
    """Given a collection of lines, selects percentiles along vertical axis.
2701

2702
    For example, if ys_seq contains simulation results like ys as a
2703
    function of time, and percents contains (5, 95), the result would
2704
    be a 90% CI for each vertical slice of the simulation results.
2705

2706
    ys_seq: sequence of lines (y values)
2707
    percents: list of percentiles (0-100) to select
2708

2709
    returns: list of NumPy arrays, one for each percentile
2710
    """
2711
    nrows = len(ys_seq)
2712
    ncols = len(ys_seq[0])
2713
    array = np.zeros((nrows, ncols))
2714

2715
    for i, ys in enumerate(ys_seq):
2716
        array[i,] = ys
2717

2718
    array = np.sort(array, axis=0)
2719

2720
    rows = [PercentileRow(array, p) for p in percents]
2721
    return rows
2722

2723

2724
def Smooth(xs, sigma=2, **options):
2725
    """Smooths a NumPy array with a Gaussian filter.
2726

2727
    xs: sequence
2728
    sigma: standard deviation of the filter
2729
    """
2730
    return ndimage.filters.gaussian_filter1d(xs, sigma, **options)
2731

2732

2733
class HypothesisTest(object):
2734
    """Represents a hypothesis test."""
2735

2736
    def __init__(self, data):
2737
        """Initializes.
2738

2739
        data: data in whatever form is relevant
2740
        """
2741
        self.data = data
2742
        self.MakeModel()
2743
        self.actual = self.TestStatistic(data)
2744
        self.test_stats = None
2745
        self.test_cdf = None
2746

2747
    def PValue(self, iters=1000):
2748
        """Computes the distribution of the test statistic and p-value.
2749

2750
        iters: number of iterations
2751

2752
        returns: float p-value
2753
        """
2754
        self.test_stats = [self.TestStatistic(self.RunModel()) 
2755
                           for _ in range(iters)]
2756
        self.test_cdf = Cdf(self.test_stats)
2757

2758
        count = sum(1 for x in self.test_stats if x >= self.actual)
2759
        return count / iters
2760

2761
    def MaxTestStat(self):
2762
        """Returns the largest test statistic seen during simulations.
2763
        """
2764
        return max(self.test_stats)
2765

2766
    def PlotCdf(self, label=None):
2767
        """Draws a Cdf with vertical lines at the observed test stat.
2768
        """
2769
        def VertLine(x):
2770
            """Draws a vertical line at x."""
2771
            thinkplot.Plot([x, x], [0, 1], color='0.8')
2772

2773
        VertLine(self.actual)
2774
        thinkplot.Cdf(self.test_cdf, label=label)
2775

2776
    def TestStatistic(self, data):
2777
        """Computes the test statistic.
2778

2779
        data: data in whatever form is relevant        
2780
        """
2781
        raise UnimplementedMethodException()
2782

2783
    def MakeModel(self):
2784
        """Build a model of the null hypothesis.
2785
        """
2786
        pass
2787

2788
    def RunModel(self):
2789
        """Run the model of the null hypothesis.
2790

2791
        returns: simulated data
2792
        """
2793
        raise UnimplementedMethodException()
2794

2795

2796
def main():
2797
    pass
2798
    
2799

2800
if __name__ == '__main__':
2801
    main()
2802

2803
Product

Resources

Company