📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code used in "Think Bayes",1by Allen B. Downey, available from greenteapress.com23Copyright 2012 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67import math8import numpy9import cPickle10import numpy11import random12import scipy1314import brfss1516import thinkplot17import thinkbayes18import thinkstats1920import matplotlib.pyplot as pyplot212223NUM_SIGMAS = 12425class Height(thinkbayes.Suite, thinkbayes.Joint):26"""Hypotheses about parameters of the distribution of height."""2728def __init__(self, mus, sigmas, name=''):29"""Makes a prior distribution for mu and sigma based on a sample.3031mus: sequence of possible mus32sigmas: sequence of possible sigmas33name: string name for the Suite34"""35pairs = [(mu, sigma)36for mu in mus37for sigma in sigmas]3839thinkbayes.Suite.__init__(self, pairs, name=name)4041def Likelihood(self, data, hypo):42"""Computes the likelihood of the data under the hypothesis.4344Args:45hypo: tuple of hypothetical mu and sigma46data: float sample4748Returns:49likelihood of the sample given mu and sigma50"""51x = data52mu, sigma = hypo53like = scipy.stats.norm.pdf(x, mu, sigma)54return like5556def LogLikelihood(self, data, hypo):57"""Computes the log likelihood of the data under the hypothesis.5859Args:60data: a list of values61hypo: tuple of hypothetical mu and sigma6263Returns:64log likelihood of the sample given mu and sigma (unnormalized)65"""66x = data67mu, sigma = hypo68loglike = EvalGaussianLogPdf(x, mu, sigma)69return loglike7071def LogUpdateSetFast(self, data):72"""Updates the suite using a faster implementation.7374Computes the sum of the log likelihoods directly.7576Args:77data: sequence of values78"""79xs = tuple(data)80n = len(xs)8182for hypo in self.Values():83mu, sigma = hypo84total = Summation(xs, mu)85loglike = -n * math.log(sigma) - total / 2 / sigma**286self.Incr(hypo, loglike)8788def LogUpdateSetMeanVar(self, data):89"""Updates the suite using ABC and mean/var.9091Args:92data: sequence of values93"""94xs = data95n = len(xs)9697m = numpy.mean(xs)98s = numpy.std(xs)99100self.LogUpdateSetABC(n, m, s)101102def LogUpdateSetMedianIPR(self, data):103"""Updates the suite using ABC and median/iqr.104105Args:106data: sequence of values107"""108xs = data109n = len(xs)110111# compute summary stats112median, s = MedianS(xs, num_sigmas=NUM_SIGMAS)113print 'median, s', median, s114115self.LogUpdateSetABC(n, median, s)116117def LogUpdateSetABC(self, n, m, s):118"""Updates the suite using ABC.119120n: sample size121m: estimated central tendency122s: estimated spread123"""124for hypo in sorted(self.Values()):125mu, sigma = hypo126127# compute log likelihood of m, given hypo128stderr_m = sigma / math.sqrt(n)129loglike = EvalGaussianLogPdf(m, mu, stderr_m)130131#compute log likelihood of s, given hypo132stderr_s = sigma / math.sqrt(2 * (n-1))133loglike += EvalGaussianLogPdf(s, sigma, stderr_s)134135self.Incr(hypo, loglike)136137138def EvalGaussianLogPdf(x, mu, sigma):139"""Computes the log PDF of x given mu and sigma.140141x: float values142mu, sigma: paramemters of Gaussian143144returns: float log-likelihood145"""146return scipy.stats.norm.logpdf(x, mu, sigma)147148149def FindPriorRanges(xs, num_points, num_stderrs=3.0, median_flag=False):150"""Find ranges for mu and sigma with non-negligible likelihood.151152xs: sample153num_points: number of values in each dimension154num_stderrs: number of standard errors to include on either side155156Returns: sequence of mus, sequence of sigmas157"""158def MakeRange(estimate, stderr):159"""Makes a linear range around the estimate.160161estimate: central value162stderr: standard error of the estimate163164returns: numpy array of float165"""166spread = stderr * num_stderrs167array = numpy.linspace(estimate-spread, estimate+spread, num_points)168return array169170# estimate mean and stddev of xs171n = len(xs)172if median_flag:173m, s = MedianS(xs, num_sigmas=NUM_SIGMAS)174else:175m = numpy.mean(xs)176s = numpy.std(xs)177178print 'classical estimators', m, s179180# compute ranges for m and s181stderr_m = s / math.sqrt(n)182mus = MakeRange(m, stderr_m)183184stderr_s = s / math.sqrt(2 * (n-1))185sigmas = MakeRange(s, stderr_s)186187return mus, sigmas188189190def Summation(xs, mu, cache={}):191"""Computes the sum of (x-mu)**2 for x in t.192193Caches previous results.194195xs: tuple of values196mu: hypothetical mean197cache: cache of previous results198"""199try:200return cache[xs, mu]201except KeyError:202ds = [(x-mu)**2 for x in xs]203total = sum(ds)204cache[xs, mu] = total205return total206207208def CoefVariation(suite):209"""Computes the distribution of CV.210211suite: Pmf that maps (x, y) to z212213Returns: Pmf object for CV.214"""215pmf = thinkbayes.Pmf()216for (m, s), p in suite.Items():217pmf.Incr(s/m, p)218return pmf219220221def PlotCdfs(d, labels):222"""Plot CDFs for each sequence in a dictionary.223224Jitters the data and subtracts away the mean.225226d: map from key to sequence of values227labels: map from key to string label228"""229thinkplot.Clf()230for key, xs in d.iteritems():231mu = thinkstats.Mean(xs)232xs = thinkstats.Jitter(xs, 1.3)233xs = [x-mu for x in xs]234cdf = thinkbayes.MakeCdfFromList(xs)235thinkplot.Cdf(cdf, label=labels[key])236thinkplot.Show()237238239def PlotPosterior(suite, pcolor=False, contour=True):240"""Makes a contour plot.241242suite: Suite that maps (mu, sigma) to probability243"""244thinkplot.Clf()245thinkplot.Contour(suite.GetDict(), pcolor=pcolor, contour=contour)246247thinkplot.Save(root='variability_posterior_%s' % suite.name,248title='Posterior joint distribution',249xlabel='Mean height (cm)',250ylabel='Stddev (cm)')251252253def PlotCoefVariation(suites):254"""Plot the posterior distributions for CV.255256suites: map from label to Pmf of CVs.257"""258thinkplot.Clf()259thinkplot.PrePlot(num=2)260261pmfs = {}262for label, suite in suites.iteritems():263pmf = CoefVariation(suite)264print 'CV posterior mean', pmf.Mean()265cdf = thinkbayes.MakeCdfFromPmf(pmf, label)266thinkplot.Cdf(cdf)267268pmfs[label] = pmf269270thinkplot.Save(root='variability_cv',271xlabel='Coefficient of variation',272ylabel='Probability')273274print 'female bigger', thinkbayes.PmfProbGreater(pmfs['female'],275pmfs['male'])276print 'male bigger', thinkbayes.PmfProbGreater(pmfs['male'],277pmfs['female'])278279280def PlotOutliers(samples):281"""Make CDFs showing the distribution of outliers."""282cdfs = []283for label, sample in samples.iteritems():284outliers = [x for x in sample if x < 150]285286cdf = thinkbayes.MakeCdfFromList(outliers, label)287cdfs.append(cdf)288289thinkplot.Clf()290thinkplot.Cdfs(cdfs)291thinkplot.Save(root='variability_cdfs',292title='CDF of height',293xlabel='Reported height (cm)',294ylabel='CDF')295296297def PlotMarginals(suite):298"""Plots marginal distributions from a joint distribution.299300suite: joint distribution of mu and sigma.301"""302thinkplot.Clf()303304pyplot.subplot(1, 2, 1)305pmf_m = suite.Marginal(0)306cdf_m = thinkbayes.MakeCdfFromPmf(pmf_m)307thinkplot.Cdf(cdf_m)308309pyplot.subplot(1, 2, 2)310pmf_s = suite.Marginal(1)311cdf_s = thinkbayes.MakeCdfFromPmf(pmf_s)312thinkplot.Cdf(cdf_s)313314thinkplot.Show()315316317def DumpHeights(data_dir='.', n=10000):318"""Read the BRFSS dataset, extract the heights and pickle them."""319resp = brfss.Respondents()320resp.ReadRecords(data_dir, n)321322d = {1:[], 2:[]}323[d[r.sex].append(r.htm3) for r in resp.records if r.htm3 != 'NA']324325fp = open('variability_data.pkl', 'wb')326cPickle.dump(d, fp)327fp.close()328329330def LoadHeights():331"""Read the pickled height data.332333returns: map from sex code to list of heights.334"""335fp = open('variability_data.pkl', 'r')336d = cPickle.load(fp)337fp.close()338return d339340341def UpdateSuite1(suite, xs):342"""Computes the posterior distibution of mu and sigma.343344Computes untransformed likelihoods.345346suite: Suite that maps from (mu, sigma) to prob347xs: sequence348"""349suite.UpdateSet(xs)350351352def UpdateSuite2(suite, xs):353"""Computes the posterior distibution of mu and sigma.354355Computes log likelihoods.356357suite: Suite that maps from (mu, sigma) to prob358xs: sequence359"""360suite.Log()361suite.LogUpdateSet(xs)362suite.Exp()363suite.Normalize()364365366def UpdateSuite3(suite, xs):367"""Computes the posterior distibution of mu and sigma.368369Computes log likelihoods efficiently.370371suite: Suite that maps from (mu, sigma) to prob372t: sequence373"""374suite.Log()375suite.LogUpdateSetFast(xs)376suite.Exp()377suite.Normalize()378379380def UpdateSuite4(suite, xs):381"""Computes the posterior distibution of mu and sigma.382383Computes log likelihoods efficiently.384385suite: Suite that maps from (mu, sigma) to prob386t: sequence387"""388suite.Log()389suite.LogUpdateSetMeanVar(xs)390suite.Exp()391suite.Normalize()392393394def UpdateSuite5(suite, xs):395"""Computes the posterior distibution of mu and sigma.396397Computes log likelihoods efficiently.398399suite: Suite that maps from (mu, sigma) to prob400t: sequence401"""402suite.Log()403suite.LogUpdateSetMedianIPR(xs)404suite.Exp()405suite.Normalize()406407408def MedianIPR(xs, p):409"""Computes the median and interpercentile range.410411xs: sequence of values412p: range (0-1), 0.5 yields the interquartile range413414returns: tuple of float (median, IPR)415"""416cdf = thinkbayes.MakeCdfFromList(xs)417median = cdf.Percentile(50)418419alpha = (1-p) / 2420ipr = cdf.Value(1-alpha) - cdf.Value(alpha)421return median, ipr422423424def MedianS(xs, num_sigmas):425"""Computes the median and an estimate of sigma.426427Based on an interpercentile range (IPR).428429factor: number of standard deviations spanned by the IPR430"""431half_p = thinkbayes.StandardGaussianCdf(num_sigmas) - 0.5432median, ipr = MedianIPR(xs, half_p * 2)433s = ipr / 2 / num_sigmas434435return median, s436437def Summarize(xs):438"""Prints summary statistics from a sequence of values.439440xs: sequence of values441"""442# print smallest and largest443xs.sort()444print 'smallest', xs[:10]445print 'largest', xs[-10:]446447# print median and interquartile range448cdf = thinkbayes.MakeCdfFromList(xs)449print cdf.Percentile(25), cdf.Percentile(50), cdf.Percentile(75)450451452def RunEstimate(update_func, num_points=31, median_flag=False):453"""Runs the whole analysis.454455update_func: which of the update functions to use456num_points: number of points in the Suite (in each dimension)457"""458DumpHeights(n=10000000)459d = LoadHeights()460labels = {1:'male', 2:'female'}461462# PlotCdfs(d, labels)463464suites = {}465for key, xs in d.iteritems():466name = labels[key]467print name, len(xs)468Summarize(xs)469470xs = thinkstats.Jitter(xs, 1.3)471472mus, sigmas = FindPriorRanges(xs, num_points, median_flag=median_flag)473suite = Height(mus, sigmas, name)474suites[name] = suite475update_func(suite, xs)476print 'MLE', suite.MaximumLikelihood()477478PlotPosterior(suite)479480pmf_m = suite.Marginal(0)481pmf_s = suite.Marginal(1)482print 'marginal mu', pmf_m.Mean(), pmf_m.Var()483print 'marginal sigma', pmf_s.Mean(), pmf_s.Var()484485# PlotMarginals(suite)486487PlotCoefVariation(suites)488489490def main():491random.seed(17)492493func = UpdateSuite5494median_flag = (func == UpdateSuite5)495RunEstimate(func, median_flag=median_flag)496497498if __name__ == '__main__':499main()500501502""" Results:503504UpdateSuite1 (100):505marginal mu 162.816901408 0.55779791443506marginal sigma 6.36966103214 0.277026082819507508UpdateSuite2 (100):509marginal mu 162.816901408 0.55779791443510marginal sigma 6.36966103214 0.277026082819511512UpdateSuite3 (100):513marginal mu 162.816901408 0.55779791443514marginal sigma 6.36966103214 0.277026082819515516UpdateSuite4 (100):517marginal mu 162.816901408 0.547456009605518marginal sigma 6.30305516111 0.27544106054519520UpdateSuite3 (1000):521marginal mu 163.722137405 0.0660294386397522marginal sigma 6.64453251495 0.0329935312671523524UpdateSuite4 (1000):525marginal mu 163.722137405 0.0658920503302526marginal sigma 6.63692197049 0.0329689887609527528UpdateSuite3 (all):529marginal mu 163.223475005 0.000203282582659530marginal sigma 7.26918836916 0.000101641131229531532UpdateSuite4 (all):533marginal mu 163.223475004 0.000203281499857534marginal sigma 7.26916693422 0.000101640932082535536UpdateSuite5 (all):537marginal mu 163.1805214 7.9399898468e-07538marginal sigma 7.29969524118 3.26257030869e-14539540"""541542543544