CoCalc -- variability.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / think-bayes / code / variability.py
¹³²⁹²⁸ views
License: OTHER
1
"""This file contains code used in "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3

4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7

8
import math
9
import numpy
10
import cPickle
11
import numpy
12
import random
13
import scipy
14

15
import brfss
16

17
import thinkplot
18
import thinkbayes
19
import thinkstats
20

21
import matplotlib.pyplot as pyplot
22

23

24
NUM_SIGMAS = 1
25

26
class Height(thinkbayes.Suite, thinkbayes.Joint):
27
    """Hypotheses about parameters of the distribution of height."""
28

29
    def __init__(self, mus, sigmas, name=''):
30
        """Makes a prior distribution for mu and sigma based on a sample.
31

32
        mus: sequence of possible mus
33
        sigmas: sequence of possible sigmas
34
        name: string name for the Suite
35
        """
36
        pairs = [(mu, sigma) 
37
                 for mu in mus
38
                 for sigma in sigmas]
39

40
        thinkbayes.Suite.__init__(self, pairs, name=name)
41

42
    def Likelihood(self, data, hypo):
43
        """Computes the likelihood of the data under the hypothesis.
44

45
        Args:
46
            hypo: tuple of hypothetical mu and sigma
47
            data: float sample
48

49
        Returns:
50
            likelihood of the sample given mu and sigma
51
        """
52
        x = data
53
        mu, sigma = hypo
54
        like = scipy.stats.norm.pdf(x, mu, sigma)
55
        return like
56

57
    def LogLikelihood(self, data, hypo):
58
        """Computes the log likelihood of the data under the hypothesis.
59

60
        Args:
61
            data: a list of values
62
            hypo: tuple of hypothetical mu and sigma
63

64
        Returns:
65
            log likelihood of the sample given mu and sigma (unnormalized)
66
        """
67
        x = data
68
        mu, sigma = hypo
69
        loglike = EvalGaussianLogPdf(x, mu, sigma)
70
        return loglike
71

72
    def LogUpdateSetFast(self, data):
73
        """Updates the suite using a faster implementation.
74

75
        Computes the sum of the log likelihoods directly.
76

77
        Args:
78
            data: sequence of values
79
        """
80
        xs = tuple(data)
81
        n = len(xs)
82

83
        for hypo in self.Values():
84
            mu, sigma = hypo
85
            total = Summation(xs, mu)
86
            loglike = -n * math.log(sigma) - total / 2 / sigma**2
87
            self.Incr(hypo, loglike)
88

89
    def LogUpdateSetMeanVar(self, data):
90
        """Updates the suite using ABC and mean/var.
91

92
        Args:
93
            data: sequence of values
94
        """
95
        xs = data
96
        n = len(xs)
97

98
        m = numpy.mean(xs)
99
        s = numpy.std(xs)
100

101
        self.LogUpdateSetABC(n, m, s)
102

103
    def LogUpdateSetMedianIPR(self, data):
104
        """Updates the suite using ABC and median/iqr.
105

106
        Args:
107
            data: sequence of values
108
        """
109
        xs = data
110
        n = len(xs)
111

112
        # compute summary stats
113
        median, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
114
        print 'median, s', median, s
115

116
        self.LogUpdateSetABC(n, median, s)
117

118
    def LogUpdateSetABC(self, n, m, s):
119
        """Updates the suite using ABC.
120

121
        n: sample size
122
        m: estimated central tendency
123
        s: estimated spread
124
        """
125
        for hypo in sorted(self.Values()):
126
            mu, sigma = hypo
127

128
            # compute log likelihood of m, given hypo
129
            stderr_m = sigma / math.sqrt(n)
130
            loglike = EvalGaussianLogPdf(m, mu, stderr_m)
131

132
            #compute log likelihood of s, given hypo
133
            stderr_s = sigma / math.sqrt(2 * (n-1))
134
            loglike += EvalGaussianLogPdf(s, sigma, stderr_s)
135

136
            self.Incr(hypo, loglike)
137

138

139
def EvalGaussianLogPdf(x, mu, sigma):
140
    """Computes the log PDF of x given mu and sigma.
141

142
    x: float values
143
    mu, sigma: paramemters of Gaussian
144

145
    returns: float log-likelihood
146
    """
147
    return scipy.stats.norm.logpdf(x, mu, sigma)
148

149

150
def FindPriorRanges(xs, num_points, num_stderrs=3.0, median_flag=False):
151
    """Find ranges for mu and sigma with non-negligible likelihood.
152

153
    xs: sample
154
    num_points: number of values in each dimension
155
    num_stderrs: number of standard errors to include on either side
156
    
157
    Returns: sequence of mus, sequence of sigmas    
158
    """
159
    def MakeRange(estimate, stderr):
160
        """Makes a linear range around the estimate.
161

162
        estimate: central value
163
        stderr: standard error of the estimate
164

165
        returns: numpy array of float
166
        """
167
        spread = stderr * num_stderrs
168
        array = numpy.linspace(estimate-spread, estimate+spread, num_points)
169
        return array
170

171
    # estimate mean and stddev of xs
172
    n = len(xs)
173
    if median_flag:
174
        m, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
175
    else:
176
        m = numpy.mean(xs)
177
        s = numpy.std(xs)
178

179
    print 'classical estimators', m, s
180

181
    # compute ranges for m and s
182
    stderr_m = s / math.sqrt(n)
183
    mus = MakeRange(m, stderr_m)
184

185
    stderr_s = s / math.sqrt(2 * (n-1))
186
    sigmas = MakeRange(s, stderr_s)
187

188
    return mus, sigmas
189

190

191
def Summation(xs, mu, cache={}):
192
    """Computes the sum of (x-mu)**2 for x in t.
193

194
    Caches previous results.
195

196
    xs: tuple of values
197
    mu: hypothetical mean
198
    cache: cache of previous results
199
    """
200
    try:
201
        return cache[xs, mu]
202
    except KeyError:
203
        ds = [(x-mu)**2 for x in xs]
204
        total = sum(ds)
205
        cache[xs, mu] = total
206
        return total
207

208

209
def CoefVariation(suite):
210
    """Computes the distribution of CV.
211

212
    suite: Pmf that maps (x, y) to z
213

214
    Returns: Pmf object for CV.
215
    """
216
    pmf = thinkbayes.Pmf()
217
    for (m, s), p in suite.Items():
218
        pmf.Incr(s/m, p)
219
    return pmf
220

221

222
def PlotCdfs(d, labels):
223
    """Plot CDFs for each sequence in a dictionary.
224

225
    Jitters the data and subtracts away the mean.
226

227
    d: map from key to sequence of values
228
    labels: map from key to string label
229
    """
230
    thinkplot.Clf()
231
    for key, xs in d.iteritems():
232
        mu = thinkstats.Mean(xs)
233
        xs = thinkstats.Jitter(xs, 1.3)
234
        xs = [x-mu for x in xs]
235
        cdf = thinkbayes.MakeCdfFromList(xs)
236
        thinkplot.Cdf(cdf, label=labels[key])
237
    thinkplot.Show()
238
                  
239

240
def PlotPosterior(suite, pcolor=False, contour=True):
241
    """Makes a contour plot.
242
    
243
    suite: Suite that maps (mu, sigma) to probability
244
    """
245
    thinkplot.Clf()
246
    thinkplot.Contour(suite.GetDict(), pcolor=pcolor, contour=contour)
247

248
    thinkplot.Save(root='variability_posterior_%s' % suite.name,
249
                title='Posterior joint distribution',
250
                xlabel='Mean height (cm)',
251
                ylabel='Stddev (cm)')
252

253

254
def PlotCoefVariation(suites):
255
    """Plot the posterior distributions for CV.
256

257
    suites: map from label to Pmf of CVs.
258
    """
259
    thinkplot.Clf()
260
    thinkplot.PrePlot(num=2)
261

262
    pmfs = {}
263
    for label, suite in suites.iteritems():
264
        pmf = CoefVariation(suite)
265
        print 'CV posterior mean', pmf.Mean()
266
        cdf = thinkbayes.MakeCdfFromPmf(pmf, label)
267
        thinkplot.Cdf(cdf)
268
    
269
        pmfs[label] = pmf
270

271
    thinkplot.Save(root='variability_cv',
272
                xlabel='Coefficient of variation',
273
                ylabel='Probability')
274

275
    print 'female bigger', thinkbayes.PmfProbGreater(pmfs['female'],
276
                                                     pmfs['male'])
277
    print 'male bigger', thinkbayes.PmfProbGreater(pmfs['male'],
278
                                                   pmfs['female'])
279

280

281
def PlotOutliers(samples):
282
    """Make CDFs showing the distribution of outliers."""
283
    cdfs = []
284
    for label, sample in samples.iteritems():
285
        outliers = [x for x in sample if x < 150]
286

287
        cdf = thinkbayes.MakeCdfFromList(outliers, label)
288
        cdfs.append(cdf)
289

290
    thinkplot.Clf()
291
    thinkplot.Cdfs(cdfs)
292
    thinkplot.Save(root='variability_cdfs',
293
                title='CDF of height',
294
                xlabel='Reported height (cm)',
295
                ylabel='CDF')
296

297

298
def PlotMarginals(suite):
299
    """Plots marginal distributions from a joint distribution.
300

301
    suite: joint distribution of mu and sigma.
302
    """
303
    thinkplot.Clf()
304

305
    pyplot.subplot(1, 2, 1)
306
    pmf_m = suite.Marginal(0)
307
    cdf_m = thinkbayes.MakeCdfFromPmf(pmf_m)
308
    thinkplot.Cdf(cdf_m)
309

310
    pyplot.subplot(1, 2, 2)
311
    pmf_s = suite.Marginal(1)
312
    cdf_s = thinkbayes.MakeCdfFromPmf(pmf_s)
313
    thinkplot.Cdf(cdf_s)
314

315
    thinkplot.Show()
316

317

318
def DumpHeights(data_dir='.', n=10000):
319
    """Read the BRFSS dataset, extract the heights and pickle them."""
320
    resp = brfss.Respondents()
321
    resp.ReadRecords(data_dir, n)
322

323
    d = {1:[], 2:[]}
324
    [d[r.sex].append(r.htm3) for r in resp.records if r.htm3 != 'NA']
325

326
    fp = open('variability_data.pkl', 'wb')
327
    cPickle.dump(d, fp)
328
    fp.close()
329

330

331
def LoadHeights():
332
    """Read the pickled height data.
333

334
    returns: map from sex code to list of heights.
335
    """
336
    fp = open('variability_data.pkl', 'r')
337
    d = cPickle.load(fp)
338
    fp.close()
339
    return d
340

341

342
def UpdateSuite1(suite, xs):
343
    """Computes the posterior distibution of mu and sigma.
344

345
    Computes untransformed likelihoods.
346

347
    suite: Suite that maps from (mu, sigma) to prob
348
    xs: sequence
349
    """
350
    suite.UpdateSet(xs)
351

352

353
def UpdateSuite2(suite, xs):
354
    """Computes the posterior distibution of mu and sigma.
355

356
    Computes log likelihoods.
357

358
    suite: Suite that maps from (mu, sigma) to prob
359
    xs: sequence
360
    """
361
    suite.Log()
362
    suite.LogUpdateSet(xs)
363
    suite.Exp()
364
    suite.Normalize()
365

366

367
def UpdateSuite3(suite, xs):
368
    """Computes the posterior distibution of mu and sigma.
369

370
    Computes log likelihoods efficiently.
371

372
    suite: Suite that maps from (mu, sigma) to prob
373
    t: sequence
374
    """
375
    suite.Log()
376
    suite.LogUpdateSetFast(xs)
377
    suite.Exp()
378
    suite.Normalize()
379

380

381
def UpdateSuite4(suite, xs):
382
    """Computes the posterior distibution of mu and sigma.
383

384
    Computes log likelihoods efficiently.
385

386
    suite: Suite that maps from (mu, sigma) to prob
387
    t: sequence
388
    """
389
    suite.Log()
390
    suite.LogUpdateSetMeanVar(xs)
391
    suite.Exp()
392
    suite.Normalize()
393

394

395
def UpdateSuite5(suite, xs):
396
    """Computes the posterior distibution of mu and sigma.
397

398
    Computes log likelihoods efficiently.
399

400
    suite: Suite that maps from (mu, sigma) to prob
401
    t: sequence
402
    """
403
    suite.Log()
404
    suite.LogUpdateSetMedianIPR(xs)
405
    suite.Exp()
406
    suite.Normalize()
407

408

409
def MedianIPR(xs, p):
410
    """Computes the median and interpercentile range.
411

412
    xs: sequence of values
413
    p: range (0-1), 0.5 yields the interquartile range
414

415
    returns: tuple of float (median, IPR)
416
    """
417
    cdf = thinkbayes.MakeCdfFromList(xs)
418
    median = cdf.Percentile(50)
419

420
    alpha = (1-p) / 2
421
    ipr = cdf.Value(1-alpha) - cdf.Value(alpha)
422
    return median, ipr
423

424

425
def MedianS(xs, num_sigmas):
426
    """Computes the median and an estimate of sigma.
427

428
    Based on an interpercentile range (IPR).
429

430
    factor: number of standard deviations spanned by the IPR
431
    """
432
    half_p = thinkbayes.StandardGaussianCdf(num_sigmas) - 0.5
433
    median, ipr = MedianIPR(xs, half_p * 2)
434
    s = ipr / 2 / num_sigmas
435

436
    return median, s
437

438
def Summarize(xs):
439
    """Prints summary statistics from a sequence of values.
440

441
    xs: sequence of values
442
    """
443
    # print smallest and largest
444
    xs.sort()
445
    print 'smallest', xs[:10]
446
    print 'largest', xs[-10:]
447

448
    # print median and interquartile range
449
    cdf = thinkbayes.MakeCdfFromList(xs)
450
    print cdf.Percentile(25), cdf.Percentile(50), cdf.Percentile(75)
451

452

453
def RunEstimate(update_func, num_points=31, median_flag=False):
454
    """Runs the whole analysis.
455

456
    update_func: which of the update functions to use
457
    num_points: number of points in the Suite (in each dimension)
458
    """
459
    DumpHeights(n=10000000)
460
    d = LoadHeights()
461
    labels = {1:'male', 2:'female'}
462

463
    # PlotCdfs(d, labels)
464

465
    suites = {}
466
    for key, xs in d.iteritems():
467
        name = labels[key]
468
        print name, len(xs)
469
        Summarize(xs)
470

471
        xs = thinkstats.Jitter(xs, 1.3)
472

473
        mus, sigmas = FindPriorRanges(xs, num_points, median_flag=median_flag)
474
        suite = Height(mus, sigmas, name)
475
        suites[name] = suite
476
        update_func(suite, xs)
477
        print 'MLE', suite.MaximumLikelihood()
478

479
        PlotPosterior(suite)
480

481
        pmf_m = suite.Marginal(0)
482
        pmf_s = suite.Marginal(1)
483
        print 'marginal mu', pmf_m.Mean(), pmf_m.Var()
484
        print 'marginal sigma', pmf_s.Mean(), pmf_s.Var()
485

486
        # PlotMarginals(suite)
487

488
    PlotCoefVariation(suites)
489

490

491
def main():
492
    random.seed(17)
493

494
    func = UpdateSuite5
495
    median_flag = (func == UpdateSuite5)
496
    RunEstimate(func, median_flag=median_flag)
497

498

499
if __name__ == '__main__':
500
    main()
501

502

503
""" Results:
504

505
UpdateSuite1 (100):
506
marginal mu 162.816901408 0.55779791443
507
marginal sigma 6.36966103214 0.277026082819
508

509
UpdateSuite2 (100):
510
marginal mu 162.816901408 0.55779791443
511
marginal sigma 6.36966103214 0.277026082819
512

513
UpdateSuite3 (100):
514
marginal mu 162.816901408 0.55779791443
515
marginal sigma 6.36966103214 0.277026082819
516

517
UpdateSuite4 (100):
518
marginal mu 162.816901408 0.547456009605
519
marginal sigma 6.30305516111 0.27544106054
520

521
UpdateSuite3 (1000):
522
marginal mu 163.722137405 0.0660294386397
523
marginal sigma 6.64453251495 0.0329935312671
524

525
UpdateSuite4 (1000):
526
marginal mu 163.722137405 0.0658920503302
527
marginal sigma 6.63692197049 0.0329689887609
528

529
UpdateSuite3 (all):
530
marginal mu 163.223475005 0.000203282582659
531
marginal sigma 7.26918836916 0.000101641131229
532

533
UpdateSuite4 (all):
534
marginal mu 163.223475004 0.000203281499857
535
marginal sigma 7.26916693422 0.000101640932082
536

537
UpdateSuite5 (all):
538
marginal mu 163.1805214 7.9399898468e-07
539
marginal sigma 7.29969524118 3.26257030869e-14
540

541
"""
542

543

544
Product

Resources

Company