📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code used in "Think Bayes",1by Allen B. Downey, available from greenteapress.com23Copyright 2013 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67import thinkbayes89import thinkplot10import numpy1112import math13import random14import sys1516FORMATS = ['pdf', 'eps', 'png', 'jpg']1718"""19Notation guide:2021z: time between trains22x: time since the last train23y: time until the next train2425zb: distribution of z as seen by a random arrival2627"""2829# longest hypothetical time between trains, in seconds3031UPPER_BOUND = 12003233# observed gaps between trains, in seconds34# collected using code in redline_data.py, run daily 4-6pm35# for 5 days, Monday 6 May 2013 to Friday 10 May 20133637OBSERVED_GAP_TIMES = [38428.0, 705.0, 407.0, 465.0, 433.0, 425.0, 204.0, 506.0, 143.0, 351.0,39450.0, 598.0, 464.0, 749.0, 341.0, 586.0, 754.0, 256.0, 378.0, 435.0,40176.0, 405.0, 360.0, 519.0, 648.0, 374.0, 483.0, 537.0, 578.0, 534.0,41577.0, 619.0, 538.0, 331.0, 186.0, 629.0, 193.0, 360.0, 660.0, 484.0,42512.0, 315.0, 457.0, 404.0, 740.0, 388.0, 357.0, 485.0, 567.0, 160.0,43428.0, 387.0, 901.0, 187.0, 622.0, 616.0, 585.0, 474.0, 442.0, 499.0,44437.0, 620.0, 351.0, 286.0, 373.0, 232.0, 393.0, 745.0, 636.0, 758.0,45]464748def BiasPmf(pmf, name='', invert=False):49"""Returns the Pmf with oversampling proportional to value.5051If pmf is the distribution of true values, the result is the52distribution that would be seen if values are oversampled in53proportion to their values; for example, if you ask students54how big their classes are, large classes are oversampled in55proportion to their size.5657If invert=True, computes in inverse operation; for example,58unbiasing a sample collected from students.5960Args:61pmf: Pmf object.62name: string name for the new Pmf.63invert: boolean6465Returns:66Pmf object67"""68new_pmf = pmf.Copy(name=name)6970for x in pmf.Values():71if invert:72new_pmf.Mult(x, 1.0/x)73else:74new_pmf.Mult(x, x)7576new_pmf.Normalize()77return new_pmf787980def UnbiasPmf(pmf, name=''):81"""Returns the Pmf with oversampling proportional to 1/value.8283Args:84pmf: Pmf object.85name: string name for the new Pmf.8687Returns:88Pmf object89"""90return BiasPmf(pmf, name, invert=True)919293def MakeUniformPmf(low, high):94"""Make a uniform Pmf.9596low: lowest value (inclusive)97high: highest value (inclusive)98"""99pmf = thinkbayes.Pmf()100for x in MakeRange(low=low, high=high):101pmf.Set(x, 1)102pmf.Normalize()103return pmf104105106def MakeRange(low=10, high=None, skip=10):107"""Makes a range representing possible gap times in seconds.108109low: where to start110high: where to end111skip: how many to skip112"""113if high is None:114high = UPPER_BOUND115116return range(low, high+skip, skip)117118119class WaitTimeCalculator(object):120"""Encapsulates the forward inference process.121122Given the actual distribution of gap times (z),123computes the distribution of gaps as seen by124a random passenger (zb), which yields the distribution125of wait times (y) and the distribution of elapsed times (x).126"""127128def __init__(self, pmf, inverse=False):129"""Constructor.130131pmf: Pmf of either z or zb132inverse: boolean, true if pmf is zb, false if pmf is z133"""134if inverse:135self.pmf_zb = pmf136self.pmf_z = UnbiasPmf(pmf, name="z")137else:138self.pmf_z = pmf139self.pmf_zb = BiasPmf(pmf, name="zb")140141# distribution of wait time142self.pmf_y = PmfOfWaitTime(self.pmf_zb)143144# the distribution of elapsed time is the same as the145# distribution of wait time146self.pmf_x = self.pmf_y147148def GenerateSampleWaitTimes(self, n):149"""Generates a random sample of wait times.150151n: sample size152153Returns: sequence of values154"""155cdf_y = thinkbayes.MakeCdfFromPmf(self.pmf_y)156sample = cdf_y.Sample(n)157return sample158159def GenerateSampleGaps(self, n):160"""Generates a random sample of gaps seen by passengers.161162n: sample size163164Returns: sequence of values165"""166cdf_zb = thinkbayes.MakeCdfFromPmf(self.pmf_zb)167sample = cdf_zb.Sample(n)168return sample169170def GenerateSamplePassengers(self, lam, n):171"""Generates a sample wait time and number of arrivals.172173lam: arrival rate in passengers per second174n: number of samples175176Returns: list of (k1, y, k2) tuples177k1: passengers there on arrival178y: wait time179k2: passengers arrived while waiting180"""181zs = self.GenerateSampleGaps(n)182xs, ys = SplitGaps(zs)183184res = []185for x, y in zip(xs, ys):186k1 = numpy.random.poisson(lam * x)187k2 = numpy.random.poisson(lam * y)188res.append((k1, y, k2))189190return res191192def PlotPmfs(self, root='redline0'):193"""Plots the computed Pmfs.194195root: string196"""197pmfs = ScaleDists([self.pmf_z, self.pmf_zb], 1.0/60)198199thinkplot.Clf()200thinkplot.PrePlot(2)201thinkplot.Pmfs(pmfs)202thinkplot.Save(root=root,203xlabel='Time (min)',204ylabel='CDF',205formats=FORMATS)206207208def MakePlot(self, root='redline2'):209"""Plots the computed CDFs.210211root: string212"""213print 'Mean z', self.pmf_z.Mean() / 60214print 'Mean zb', self.pmf_zb.Mean() / 60215print 'Mean y', self.pmf_y.Mean() / 60216217cdf_z = self.pmf_z.MakeCdf()218cdf_zb = self.pmf_zb.MakeCdf()219cdf_y = self.pmf_y.MakeCdf()220221cdfs = ScaleDists([cdf_z, cdf_zb, cdf_y], 1.0/60)222223thinkplot.Clf()224thinkplot.PrePlot(3)225thinkplot.Cdfs(cdfs)226thinkplot.Save(root=root,227xlabel='Time (min)',228ylabel='CDF',229formats=FORMATS)230231232def SplitGaps(zs):233"""Splits zs into xs and ys.234235zs: sequence of gaps236237Returns: tuple of sequences (xs, ys)238"""239xs = [random.uniform(0, z) for z in zs]240ys = [z-x for z, x in zip(zs, xs)]241return xs, ys242243244def PmfOfWaitTime(pmf_zb):245"""Distribution of wait time.246247pmf_zb: dist of gap time as seen by a random observer248249Returns: dist of wait time (also dist of elapsed time)250"""251metapmf = thinkbayes.Pmf()252for gap, prob in pmf_zb.Items():253uniform = MakeUniformPmf(0, gap)254metapmf.Set(uniform, prob)255256pmf_y = thinkbayes.MakeMixture(metapmf, name='y')257return pmf_y258259260def ScaleDists(dists, factor):261"""Scales each of the distributions in a sequence.262263dists: sequence of Pmf or Cdf264factor: float scale factor265"""266return [dist.Scale(factor) for dist in dists]267268269class ElapsedTimeEstimator(object):270"""Uses the number of passengers to estimate time since last train."""271272def __init__(self, wtc, lam, num_passengers):273"""Constructor.274275pmf_x: expected distribution of elapsed time276lam: arrival rate in passengers per second277num_passengers: # passengers seen on the platform278"""279# prior for elapsed time280self.prior_x = Elapsed(wtc.pmf_x, name='prior x')281282# posterior of elapsed time (based on number of passengers)283self.post_x = self.prior_x.Copy(name='posterior x')284self.post_x.Update((lam, num_passengers))285286# predictive distribution of wait time287self.pmf_y = PredictWaitTime(wtc.pmf_zb, self.post_x)288289def MakePlot(self, root='redline3'):290"""Plot the CDFs.291292root: string293"""294# observed gaps295cdf_prior_x = self.prior_x.MakeCdf()296cdf_post_x = self.post_x.MakeCdf()297cdf_y = self.pmf_y.MakeCdf()298299cdfs = ScaleDists([cdf_prior_x, cdf_post_x, cdf_y], 1.0/60)300301thinkplot.Clf()302thinkplot.PrePlot(3)303thinkplot.Cdfs(cdfs)304thinkplot.Save(root=root,305xlabel='Time (min)',306ylabel='CDF',307formats=FORMATS)308309310class ArrivalRate(thinkbayes.Suite):311"""Represents the distribution of arrival rates (lambda)."""312313def Likelihood(self, data, hypo):314"""Computes the likelihood of the data under the hypothesis.315316Evaluates the Poisson PMF for lambda and k.317318hypo: arrival rate in passengers per second319data: tuple of elapsed_time and number of passengers320"""321lam = hypo322x, k = data323like = thinkbayes.EvalPoissonPmf(k, lam * x)324return like325326327class ArrivalRateEstimator(object):328"""Estimates arrival rate based on passengers that arrive while waiting.329"""330331def __init__(self, passenger_data):332"""Constructor333334passenger_data: sequence of (k1, y, k2) pairs335"""336# range for lambda337low, high = 0, 5338n = 51339hypos = numpy.linspace(low, high, n) / 60340341self.prior_lam = ArrivalRate(hypos, name='prior')342self.prior_lam.Remove(0)343344self.post_lam = self.prior_lam.Copy(name='posterior')345346for _k1, y, k2 in passenger_data:347self.post_lam.Update((y, k2))348349print 'Mean posterior lambda', self.post_lam.Mean()350351def MakePlot(self, root='redline1'):352"""Plot the prior and posterior CDF of passengers arrival rate.353354root: string355"""356thinkplot.Clf()357thinkplot.PrePlot(2)358359# convert units to passengers per minute360prior = self.prior_lam.MakeCdf().Scale(60)361post = self.post_lam.MakeCdf().Scale(60)362363thinkplot.Cdfs([prior, post])364365thinkplot.Save(root=root,366xlabel='Arrival rate (passengers / min)',367ylabel='CDF',368formats=FORMATS)369370371class Elapsed(thinkbayes.Suite):372"""Represents the distribution of elapsed time (x)."""373374def Likelihood(self, data, hypo):375"""Computes the likelihood of the data under the hypothesis.376377Evaluates the Poisson PMF for lambda and k.378379hypo: elapsed time since the last train380data: tuple of arrival rate and number of passengers381"""382x = hypo383lam, k = data384like = thinkbayes.EvalPoissonPmf(k, lam * x)385return like386387388def PredictWaitTime(pmf_zb, pmf_x):389"""Computes the distribution of wait times.390391Enumerate all pairs of zb from pmf_zb and x from pmf_x,392and accumulate the distribution of y = z - x.393394pmf_zb: distribution of gaps seen by random observer395pmf_x: distribution of elapsed time396"""397pmf_y = pmf_zb - pmf_x398pmf_y.name = 'pred y'399RemoveNegatives(pmf_y)400return pmf_y401402403def RemoveNegatives(pmf):404"""Removes negative values from a PMF.405406pmf: Pmf407"""408for val in pmf.Values():409if val < 0:410pmf.Remove(val)411pmf.Normalize()412413414class Gaps(thinkbayes.Suite):415"""Represents the distribution of gap times,416as updated by an observed waiting time."""417418def Likelihood(self, data, hypo):419"""The likelihood of the data under the hypothesis.420421If the actual gap time is z, what is the likelihood422of waiting y seconds?423424hypo: actual time between trains425data: observed wait time426"""427z = hypo428y = data429if y > z:430return 0431return 1.0 / z432433434class GapDirichlet(thinkbayes.Dirichlet):435"""Represents the distribution of prevalences for each436gap time."""437438def __init__(self, xs):439"""Constructor.440441xs: sequence of possible gap times442"""443n = len(xs)444thinkbayes.Dirichlet.__init__(self, n)445self.xs = xs446self.mean_zbs = []447448def PmfMeanZb(self):449"""Makes the Pmf of mean zb.450451Values stored in mean_zbs.452"""453return thinkbayes.MakePmfFromList(self.mean_zbs)454455def Preload(self, data):456"""Adds pseudocounts to the parameters.457458data: sequence of pseudocounts459"""460thinkbayes.Dirichlet.Update(self, data)461462def Update(self, data):463"""Computes the likelihood of the data.464465data: wait time observed by random arrival (y)466467Returns: float probability468"""469k, y = data470471print k, y472prior = self.PredictivePmf(self.xs)473gaps = Gaps(prior)474gaps.Update(y)475probs = gaps.Probs(self.xs)476477self.params += numpy.array(probs)478479480class GapDirichlet2(GapDirichlet):481"""Represents the distribution of prevalences for each482gap time."""483484def Update(self, data):485"""Computes the likelihood of the data.486487data: wait time observed by random arrival (y)488489Returns: float probability490"""491k, y = data492493# get the current best guess for pmf_z494pmf_zb = self.PredictivePmf(self.xs)495496# use it to compute prior pmf_x, pmf_y, pmf_z497wtc = WaitTimeCalculator(pmf_zb, inverse=True)498499# use the observed passengers to estimate posterior pmf_x500elapsed = ElapsedTimeEstimator(wtc,501lam=0.0333,502num_passengers=k)503504# use posterior_x and observed y to estimate observed z505obs_zb = elapsed.post_x + Floor(y)506probs = obs_zb.Probs(self.xs)507508mean_zb = obs_zb.Mean()509self.mean_zbs.append(mean_zb)510print k, y, mean_zb511512# use observed z to update beliefs about pmf_z513self.params += numpy.array(probs)514515516class GapTimeEstimator(object):517"""Infers gap times using passenger data."""518519def __init__(self, xs, pcounts, passenger_data):520self.xs = xs521self.pcounts = pcounts522self.passenger_data = passenger_data523524self.wait_times = [y for _k1, y, _k2 in passenger_data]525self.pmf_y = thinkbayes.MakePmfFromList(self.wait_times, name="y")526527dirichlet = GapDirichlet2(self.xs)528dirichlet.params /= 1.0529530dirichlet.Preload(self.pcounts)531dirichlet.params /= 20.0532533self.prior_zb = dirichlet.PredictivePmf(self.xs, name="prior zb")534535for k1, y, _k2 in passenger_data:536dirichlet.Update((k1, y))537538self.pmf_mean_zb = dirichlet.PmfMeanZb()539540self.post_zb = dirichlet.PredictivePmf(self.xs, name="post zb")541self.post_z = UnbiasPmf(self.post_zb, name="post z")542543def PlotPmfs(self):544"""Plot the PMFs."""545print 'Mean y', self.pmf_y.Mean()546print 'Mean z', self.post_z.Mean()547print 'Mean zb', self.post_zb.Mean()548549thinkplot.Pmf(self.pmf_y)550thinkplot.Pmf(self.post_z)551thinkplot.Pmf(self.post_zb)552553def MakePlot(self):554"""Plot the CDFs."""555thinkplot.Cdf(self.pmf_y.MakeCdf())556thinkplot.Cdf(self.prior_zb.MakeCdf())557thinkplot.Cdf(self.post_zb.MakeCdf())558thinkplot.Cdf(self.pmf_mean_zb.MakeCdf())559thinkplot.Show()560561562def Floor(x, factor=10):563"""Rounds down to the nearest multiple of factor.564565When factor=10, all numbers from 10 to 19 get floored to 10.566"""567return int(x/factor) * factor568569570def TestGte():571"""Tests the GapTimeEstimator."""572random.seed(17)573574xs = [60, 120, 240]575576gap_times = [60, 60, 60, 60, 60, 120, 120, 120, 240, 240]577578# distribution of gap time (z)579pdf_z = thinkbayes.EstimatedPdf(gap_times)580pmf_z = pdf_z.MakePmf(xs, name="z")581582wtc = WaitTimeCalculator(pmf_z, inverse=False)583584lam = 0.0333585n = 100586passenger_data = wtc.GenerateSamplePassengers(lam, n)587588pcounts = [0, 0, 0]589590ite = GapTimeEstimator(xs, pcounts, passenger_data)591592thinkplot.Clf()593594# thinkplot.Cdf(wtc.pmf_z.MakeCdf(name="actual z"))595thinkplot.Cdf(wtc.pmf_zb.MakeCdf(name="actual zb"))596ite.MakePlot()597598599class WaitMixtureEstimator(object):600"""Encapsulates the process of estimating wait time with uncertain lam.601"""602603def __init__(self, wtc, are, num_passengers=15):604"""Constructor.605606wtc: WaitTimeCalculator607are: ArrivalTimeEstimator608num_passengers: number of passengers seen on the platform609"""610self.metapmf = thinkbayes.Pmf()611612for lam, prob in sorted(are.post_lam.Items()):613ete = ElapsedTimeEstimator(wtc, lam, num_passengers)614self.metapmf.Set(ete.pmf_y, prob)615616self.mixture = thinkbayes.MakeMixture(self.metapmf)617618lam = are.post_lam.Mean()619ete = ElapsedTimeEstimator(wtc, lam, num_passengers)620self.point = ete.pmf_y621622def MakePlot(self, root='redline4'):623"""Makes a plot showing the mixture."""624thinkplot.Clf()625626# plot the MetaPmf627for pmf, prob in sorted(self.metapmf.Items()):628cdf = pmf.MakeCdf().Scale(1.0/60)629width = 2/math.log(-math.log(prob))630thinkplot.Plot(cdf.xs, cdf.ps,631alpha=0.2, linewidth=width, color='blue',632label='')633634# plot the mixture and the distribution based on a point estimate635thinkplot.PrePlot(2)636#thinkplot.Cdf(self.point.MakeCdf(name='point').Scale(1.0/60))637thinkplot.Cdf(self.mixture.MakeCdf(name='mix').Scale(1.0/60))638639thinkplot.Save(root=root,640xlabel='Wait time (min)',641ylabel='CDF',642formats=FORMATS,643axis=[0,10,0,1])644645646647def GenerateSampleData(gap_times, lam=0.0333, n=10):648"""Generates passenger data based on actual gap times.649650gap_times: sequence of float651lam: arrival rate in passengers per second652n: number of simulated observations653"""654xs = MakeRange(low=10)655pdf_z = thinkbayes.EstimatedPdf(gap_times)656pmf_z = pdf_z.MakePmf(xs, name="z")657658wtc = WaitTimeCalculator(pmf_z, inverse=False)659passenger_data = wtc.GenerateSamplePassengers(lam, n)660return wtc, passenger_data661662663def RandomSeed(x):664"""Initialize the random and numpy.random generators.665666x: int seed667"""668random.seed(x)669numpy.random.seed(x)670671672def RunSimpleProcess(gap_times, lam=0.0333, num_passengers=15, plot=True):673"""Runs the basic analysis and generates figures.674675gap_times: sequence of float676lam: arrival rate in passengers per second677num_passengers: int number of passengers on the platform678plot: boolean, whether to generate plots679680Returns: WaitTimeCalculator, ElapsedTimeEstimator681"""682global UPPER_BOUND683UPPER_BOUND = 1200684685cdf_z = thinkbayes.MakeCdfFromList(gap_times).Scale(1.0/60)686print 'CI z', cdf_z.CredibleInterval(90)687688xs = MakeRange(low=10)689690pdf_z = thinkbayes.EstimatedPdf(gap_times)691pmf_z = pdf_z.MakePmf(xs, name="z")692693wtc = WaitTimeCalculator(pmf_z, inverse=False)694695if plot:696wtc.PlotPmfs()697wtc.MakePlot()698699ete = ElapsedTimeEstimator(wtc, lam, num_passengers)700701if plot:702ete.MakePlot()703704return wtc, ete705706707def RunMixProcess(gap_times, lam=0.0333, num_passengers=15, plot=True):708"""Runs the analysis for unknown lambda.709710gap_times: sequence of float711lam: arrival rate in passengers per second712num_passengers: int number of passengers on the platform713plot: boolean, whether to generate plots714715Returns: WaitMixtureEstimator716"""717global UPPER_BOUND718UPPER_BOUND = 1200719720wtc, _ete = RunSimpleProcess(gap_times, lam, num_passengers)721722RandomSeed(20)723passenger_data = wtc.GenerateSamplePassengers(lam, n=5)724725total_y = 0726total_k2 = 0727for k1, y, k2 in passenger_data:728print k1, y/60, k2729total_y += y/60730total_k2 += k2731print total_k2, total_y732print 'Average arrival rate', total_k2 / total_y733734are = ArrivalRateEstimator(passenger_data)735736if plot:737are.MakePlot()738739wme = WaitMixtureEstimator(wtc, are, num_passengers)740741if plot:742wme.MakePlot()743744return wme745746747def RunLoop(gap_times, nums, lam=0.0333):748"""Runs the basic analysis for a range of num_passengers.749750gap_times: sequence of float751nums: sequence of values for num_passengers752lam: arrival rate in passengers per second753754Returns: WaitMixtureEstimator755"""756global UPPER_BOUND757UPPER_BOUND = 4000758759thinkplot.Clf()760761RandomSeed(18)762763# resample gap_times764n = 220765cdf_z = thinkbayes.MakeCdfFromList(gap_times)766sample_z = cdf_z.Sample(n)767pmf_z = thinkbayes.MakePmfFromList(sample_z)768769# compute the biased pmf and add some long delays770cdf_zp = BiasPmf(pmf_z).MakeCdf()771sample_zb = cdf_zp.Sample(n) + [1800, 2400, 3000]772773# smooth the distribution of zb774pdf_zb = thinkbayes.EstimatedPdf(sample_zb)775xs = MakeRange(low=60)776pmf_zb = pdf_zb.MakePmf(xs)777778# unbias the distribution of zb and make wtc779pmf_z = UnbiasPmf(pmf_zb)780wtc = WaitTimeCalculator(pmf_z)781782probs = []783for num_passengers in nums:784ete = ElapsedTimeEstimator(wtc, lam, num_passengers)785786# compute the posterior prob of waiting more than 15 minutes787cdf_y = ete.pmf_y.MakeCdf()788prob = 1 - cdf_y.Prob(900)789probs.append(prob)790791# thinkplot.Cdf(ete.pmf_y.MakeCdf(name=str(num_passengers)))792793thinkplot.Plot(nums, probs)794thinkplot.Save(root='redline5',795xlabel='Num passengers',796ylabel='P(y > 15 min)',797formats=FORMATS,798)799800801def main(script):802RunLoop(OBSERVED_GAP_TIMES, nums=[0, 5, 10, 15, 20, 25, 30, 35])803RunMixProcess(OBSERVED_GAP_TIMES)804805806if __name__ == '__main__':807main(*sys.argv)808809810