📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code for use with "Think Bayes",1by Allen B. Downey, available from greenteapress.com23Copyright 2012 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67import math8import numpy9import random10import sys1112import correlation13import thinkplot14import matplotlib.pyplot as pyplot15import thinkbayes161718INTERVAL = 245/365.019FORMATS = ['pdf', 'eps']20MINSIZE = 0.221MAXSIZE = 2022BUCKET_FACTOR = 10232425def log2(x, denom=math.log(2)):26"""Computes log base 2."""27return math.log(x) / denom282930def SimpleModel():31"""Runs calculations based on a simple model."""3233# time between discharge and diagnosis, in days34interval = 3291.03536# doubling time in linear measure is doubling time in volume * 337dt = 811.0 * 33839# number of doublings since discharge40doublings = interval / dt4142# how big was the tumor at time of discharge (diameter in cm)43d1 = 15.544d0 = d1 / 2.0 ** doublings4546print 'interval (days)', interval47print 'interval (years)', interval / 36548print 'dt', dt49print 'doublings', doublings50print 'd1', d151print 'd0', d05253# assume an initial linear measure of 0.1 cm54d0 = 0.155d1 = 15.55657# how many doublings would it take to get from d0 to d158doublings = log2(d1 / d0)5960# what linear doubling time does that imply?61dt = interval / doublings6263print 'doublings', doublings64print 'dt', dt6566# compute the volumetric doubling time and RDT67vdt = dt / 368rdt = 365 / vdt6970print 'vdt', vdt71print 'rdt', rdt7273cdf = MakeCdf()74p = cdf.Prob(rdt)75print 'Prob{RDT > 2.4}', 1-p767778def MakeCdf():79"""Uses the data from Zhang et al. to construct a CDF."""80n = 53.081freqs = [0, 2, 31, 42, 48, 51, 52, 53]82ps = [freq/n for freq in freqs]83xs = numpy.arange(-1.5, 6.5, 1.0)8485cdf = thinkbayes.Cdf(xs, ps)86return cdf878889def PlotCdf(cdf):90"""Plots the actual and fitted distributions.9192cdf: CDF object93"""94xs, ps = cdf.xs, cdf.ps95cps = [1-p for p in ps]9697# CCDF on logy scale: shows exponential behavior98thinkplot.Clf()99thinkplot.Plot(xs, cps, 'bo-')100thinkplot.Save(root='kidney1',101formats=FORMATS,102xlabel='RDT',103ylabel='CCDF (log scale)',104yscale='log')105106# CDF, model and data107108thinkplot.Clf()109thinkplot.PrePlot(num=2)110mxs, mys = ModelCdf()111thinkplot.Plot(mxs, mys, label='model', linestyle='dashed')112113thinkplot.Plot(xs, ps, 'gs', label='data')114thinkplot.Save(root='kidney2',115formats=FORMATS,116xlabel='RDT (volume doublings per year)',117ylabel='CDF',118title='Distribution of RDT',119axis=[-2, 7, 0, 1],120loc=4)121122123def QQPlot(cdf, fit):124"""Makes a QQPlot of the values from actual and fitted distributions.125126cdf: actual Cdf of RDT127fit: model128"""129xs = [-1.5, 5.5]130thinkplot.Clf()131thinkplot.Plot(xs, xs, 'b-')132133xs, ps = cdf.xs, cdf.ps134fs = [fit.Value(p) for p in ps]135136thinkplot.Plot(xs, fs, 'gs')137thinkplot.Save(root = 'kidney3',138formats=FORMATS,139xlabel='Actual',140ylabel='Model')141142143def FitCdf(cdf):144"""Fits a line to the log CCDF and returns the slope.145146cdf: Cdf of RDT147"""148xs, ps = cdf.xs, cdf.ps149cps = [1-p for p in ps]150151xs = xs[1:-1]152lcps = [math.log(p) for p in cps[1:-1]]153154_inter, slope = correlation.LeastSquares(xs, lcps)155return -slope156157158def CorrelatedGenerator(cdf, rho):159"""Generates a sequence of values from cdf with correlation.160161Generates a correlated standard Gaussian series, then transforms to162values from cdf163164cdf: distribution to choose from165rho: target coefficient of correlation166"""167def Transform(x):168"""Maps from a Gaussian variate to a variate with the given CDF."""169p = thinkbayes.GaussianCdf(x)170y = cdf.Value(p)171return y172173# for the first value, choose from a Gaussian and transform it174x = random.gauss(0, 1)175yield Transform(x)176177# for subsequent values, choose from the conditional distribution178# based on the previous value179sigma = math.sqrt(1 - rho**2)180while True:181x = random.gauss(x * rho, sigma)182yield Transform(x)183184185def UncorrelatedGenerator(cdf, _rho=None):186"""Generates a sequence of values from cdf with no correlation.187188Ignores rho, which is accepted as a parameter to provide the189same interface as CorrelatedGenerator190191cdf: distribution to choose from192rho: ignored193"""194while True:195x = cdf.Random()196yield x197198199def RdtGenerator(cdf, rho):200"""Returns an iterator with n values from cdf and the given correlation.201202cdf: Cdf object203rho: coefficient of correlation204"""205if rho == 0.0:206return UncorrelatedGenerator(cdf)207else:208return CorrelatedGenerator(cdf, rho)209210211def GenerateRdt(pc, lam1, lam2):212"""Generate an RDT from a mixture of exponential distributions.213214With prob pc, generate a negative value with param lam2;215otherwise generate a positive value with param lam1.216"""217if random.random() < pc:218return -random.expovariate(lam2)219else:220return random.expovariate(lam1)221222223def GenerateSample(n, pc, lam1, lam2):224"""Generates a sample of RDTs.225226n: sample size227pc: probablity of negative growth228lam1: exponential parameter of positive growth229lam2: exponential parameter of negative growth230231Returns: list of random variates232"""233xs = [GenerateRdt(pc, lam1, lam2) for _ in xrange(n)]234return xs235236237def GenerateCdf(n=1000, pc=0.35, lam1=0.79, lam2=5.0):238"""Generates a sample of RDTs and returns its CDF.239240n: sample size241pc: probablity of negative growth242lam1: exponential parameter of positive growth243lam2: exponential parameter of negative growth244245Returns: Cdf of generated sample246"""247xs = GenerateSample(n, pc, lam1, lam2)248cdf = thinkbayes.MakeCdfFromList(xs)249return cdf250251252def ModelCdf(pc=0.35, lam1=0.79, lam2=5.0):253"""254255pc: probablity of negative growth256lam1: exponential parameter of positive growth257lam2: exponential parameter of negative growth258259Returns: list of xs, list of ys260"""261cdf = thinkbayes.EvalExponentialCdf262x1 = numpy.arange(-2, 0, 0.1)263y1 = [pc * (1 - cdf(-x, lam2)) for x in x1]264x2 = numpy.arange(0, 7, 0.1)265y2 = [pc + (1-pc) * cdf(x, lam1) for x in x2]266return list(x1) + list(x2), y1+y2267268269def BucketToCm(y, factor=BUCKET_FACTOR):270"""Computes the linear dimension for a given bucket.271272t: bucket number273factor: multiplicitive factor from one bucket to the next274275Returns: linear dimension in cm276"""277return math.exp(y / factor)278279280def CmToBucket(x, factor=BUCKET_FACTOR):281"""Computes the bucket for a given linear dimension.282283x: linear dimension in cm284factor: multiplicitive factor from one bucket to the next285286Returns: float bucket number287"""288return round(factor * math.log(x))289290291def Diameter(volume, factor=3/math.pi/4, exp=1/3.0):292"""Converts a volume to a diameter.293294d = 2r = 2 * (3/4/pi V)^1/3295"""296return 2 * (factor * volume) ** exp297298299def Volume(diameter, factor=4*math.pi/3):300"""Converts a diameter to a volume.301302V = 4/3 pi (d/2)^3303"""304return factor * (diameter/2.0)**3305306307class Cache(object):308"""Records each observation point for each tumor."""309310def __init__(self):311"""Initializes the cache.312313joint: map from (age, bucket) to frequency314sequences: map from bucket to a list of sequences315initial_rdt: sequence of (V0, rdt) pairs316"""317self.joint = thinkbayes.Joint()318self.sequences = {}319self.initial_rdt = []320321def GetBuckets(self):322"""Returns an iterator for the keys in the cache."""323return self.sequences.iterkeys()324325def GetSequence(self, bucket):326"""Looks up a bucket in the cache."""327return self.sequences[bucket]328329def ConditionalCdf(self, bucket, name=''):330"""Forms the cdf of ages for a given bucket.331332bucket: int bucket number333name: string334"""335pmf = self.joint.Conditional(0, 1, bucket, name=name)336cdf = pmf.MakeCdf()337return cdf338339def ProbOlder(self, cm, age):340"""Computes the probability of exceeding age, given size.341342cm: size in cm343age: age in years344"""345bucket = CmToBucket(cm)346cdf = self.ConditionalCdf(bucket)347p = cdf.Prob(age)348return 1-p349350def GetDistAgeSize(self, size_thresh=MAXSIZE):351"""Gets the joint distribution of age and size.352353Map from (age, log size in cm) to log freq354355Returns: new Pmf object356"""357joint = thinkbayes.Joint()358359for val, freq in self.joint.Items():360age, bucket = val361cm = BucketToCm(bucket)362if cm > size_thresh:363continue364log_cm = math.log10(cm)365joint.Set((age, log_cm), math.log(freq) * 10)366367return joint368369def Add(self, age, seq, rdt):370"""Adds this observation point to the cache.371372age: age of the tumor in years373seq: sequence of volumes374rdt: RDT during this interval375"""376final = seq[-1]377cm = Diameter(final)378bucket = CmToBucket(cm)379self.joint.Incr((age, bucket))380381self.sequences.setdefault(bucket, []).append(seq)382383initial = seq[-2]384self.initial_rdt.append((initial, rdt))385386def Print(self):387"""Prints the size (cm) for each bucket, and the number of sequences."""388for bucket in sorted(self.GetBuckets()):389ss = self.GetSequence(bucket)390diameter = BucketToCm(bucket)391print diameter, len(ss)392393def Correlation(self):394"""Computes the correlation between log volumes and rdts."""395vs, rdts = zip(*self.initial_rdt)396lvs = [math.log(v) for v in vs]397return correlation.Corr(lvs, rdts)398399400class Calculator(object):401"""Encapsulates the state of the computation."""402403def __init__(self):404"""Initializes the cache."""405self.cache = Cache()406407def MakeSequences(self, n, rho, cdf):408"""Returns a list of sequences of volumes.409410n: number of sequences to make411rho: serial correlation412cdf: Cdf of rdts413414Returns: list of n sequences of volumes415"""416sequences = []417for i in range(n):418rdt_seq = RdtGenerator(cdf, rho)419seq = self.MakeSequence(rdt_seq)420sequences.append(seq)421422if i % 100 == 0:423print i424425return sequences426427def MakeSequence(self, rdt_seq, v0=0.01, interval=INTERVAL,428vmax=Volume(MAXSIZE)):429"""Simulate the growth of a tumor.430431rdt_seq: sequence of rdts432v0: initial volume in mL (cm^3)433interval: timestep in years434vmax: volume to stop at435436Returns: sequence of volumes437"""438seq = v0,439age = 0440441for rdt in rdt_seq:442age += interval443final, seq = self.ExtendSequence(age, seq, rdt, interval)444if final > vmax:445break446447return seq448449def ExtendSequence(self, age, seq, rdt, interval):450"""Generates a new random value and adds it to the end of seq.451452Side-effect: adds sub-sequences to the cache.453454age: age of tumor at the end of this interval455seq: sequence of values so far456rdt: reciprocal doubling time in doublings per year457interval: timestep in years458459Returns: final volume, extended sequence460"""461initial = seq[-1]462doublings = rdt * interval463final = initial * 2**doublings464new_seq = seq + (final,)465self.cache.Add(age, new_seq, rdt)466467return final, new_seq468469def PlotBucket(self, bucket, color='blue'):470"""Plots the set of sequences for the given bucket.471472bucket: int bucket number473color: string474"""475sequences = self.cache.GetSequence(bucket)476for seq in sequences:477n = len(seq)478age = n * INTERVAL479ts = numpy.linspace(-age, 0, n)480PlotSequence(ts, seq, color)481482def PlotBuckets(self):483"""Plots the set of sequences that ended in a given bucket."""484# 2.01, 4.95 cm, 9.97 cm485buckets = [7.0, 16.0, 23.0]486buckets = [23.0]487colors = ['blue', 'green', 'red', 'cyan']488489thinkplot.Clf()490for bucket, color in zip(buckets, colors):491self.PlotBucket(bucket, color)492493thinkplot.Save(root='kidney5',494formats=FORMATS,495title='History of simulated tumors',496axis=[-40, 1, MINSIZE, 12],497xlabel='years',498ylabel='diameter (cm, log scale)',499yscale='log')500501def PlotJointDist(self):502"""Makes a pcolor plot of the age-size joint distribution."""503thinkplot.Clf()504505joint = self.cache.GetDistAgeSize()506thinkplot.Contour(joint, contour=False, pcolor=True)507508thinkplot.Save(root='kidney8',509formats=FORMATS,510axis=[0, 41, -0.7, 1.31],511yticks=MakeLogTicks([0.2, 0.5, 1, 2, 5, 10, 20]),512xlabel='ages',513ylabel='diameter (cm, log scale)')514515def PlotConditionalCdfs(self):516"""Plots the cdf of ages for each bucket."""517buckets = [7.0, 16.0, 23.0, 27.0]518# 2.01, 4.95 cm, 9.97 cm, 14.879 cm519names = ['2 cm', '5 cm', '10 cm', '15 cm']520cdfs = []521522for bucket, name in zip(buckets, names):523cdf = self.cache.ConditionalCdf(bucket, name)524cdfs.append(cdf)525526thinkplot.Clf()527thinkplot.PrePlot(num=len(cdfs))528thinkplot.Cdfs(cdfs)529thinkplot.Save(root='kidney6',530title='Distribution of age for several diameters',531formats=FORMATS,532xlabel='tumor age (years)',533ylabel='CDF',534loc=4)535536def PlotCredibleIntervals(self, xscale='linear'):537"""Plots the confidence interval for each bucket."""538xs = []539ts = []540percentiles = [95, 75, 50, 25, 5]541min_size = 0.3542543# loop through the buckets, accumulate544# xs: sequence of sizes in cm545# ts: sequence of percentile tuples546for _, bucket in enumerate(sorted(self.cache.GetBuckets())):547cm = BucketToCm(bucket)548if cm < min_size or cm > 20.0:549continue550xs.append(cm)551cdf = self.cache.ConditionalCdf(bucket)552ps = [cdf.Percentile(p) for p in percentiles]553ts.append(ps)554555# dump the results into a table556fp = open('kidney_table.tex', 'w')557PrintTable(fp, xs, ts)558fp.close()559560# make the figure561linewidths = [1, 2, 3, 2, 1]562alphas = [0.3, 0.5, 1, 0.5, 0.3]563labels = ['95th', '75th', '50th', '25th', '5th']564565# transpose the ts so we have sequences for each percentile rank566thinkplot.Clf()567yys = zip(*ts)568569for ys, linewidth, alpha, label in zip(yys, linewidths, alphas, labels):570options = dict(color='blue', linewidth=linewidth,571alpha=alpha, label=label, markersize=2)572573# plot the data points574thinkplot.Plot(xs, ys, 'bo', **options)575576# plot the fit lines577fxs = [min_size, 20.0]578fys = FitLine(xs, ys, fxs)579580thinkplot.Plot(fxs, fys, **options)581582# put a label at the end of each line583x, y = fxs[-1], fys[-1]584pyplot.text(x*1.05, y, label, color='blue',585horizontalalignment='left',586verticalalignment='center')587588# make the figure589thinkplot.Save(root='kidney7',590formats=FORMATS,591title='Credible interval for age vs diameter',592xlabel='diameter (cm, log scale)',593ylabel='tumor age (years)',594xscale=xscale,595xticks=MakeTicks([0.5, 1, 2, 5, 10, 20]),596axis=[0.25, 35, 0, 45],597legend=False,598)599600601def PlotSequences(sequences):602"""Plots linear measurement vs time.603604sequences: list of sequences of volumes605"""606thinkplot.Clf()607608options = dict(color='gray', linewidth=1, linestyle='dashed')609thinkplot.Plot([0, 40], [10, 10], **options)610611for seq in sequences:612n = len(seq)613age = n * INTERVAL614ts = numpy.linspace(0, age, n)615PlotSequence(ts, seq)616617thinkplot.Save(root='kidney4',618formats=FORMATS,619axis=[0, 40, MINSIZE, 20],620title='Simulations of tumor growth',621xlabel='tumor age (years)',622yticks=MakeTicks([0.2, 0.5, 1, 2, 5, 10, 20]),623ylabel='diameter (cm, log scale)',624yscale='log')625626627def PlotSequence(ts, seq, color='blue'):628"""Plots a time series of linear measurements.629630ts: sequence of times in years631seq: sequence of columes632color: color string633"""634options = dict(color=color, linewidth=1, alpha=0.2)635xs = [Diameter(v) for v in seq]636637thinkplot.Plot(ts, xs, **options)638639640def PrintCI(fp, cm, ps):641"""Writes a line in the LaTeX table.642643fp: file pointer644cm: diameter in cm645ts: tuples of percentiles646"""647fp.write('%0.1f' % round(cm, 1))648for p in reversed(ps):649fp.write(' & %0.1f ' % round(p, 1))650fp.write(r'\\' '\n')651652653def PrintTable(fp, xs, ts):654"""Writes the data in a LaTeX table.655656fp: file pointer657xs: diameters in cm658ts: sequence of tuples of percentiles659"""660fp.write(r'\begin{tabular}{|r||r|r|r|r|r|}' '\n')661fp.write(r'\hline' '\n')662fp.write(r'Diameter & \multicolumn{5}{c|}{Percentiles of age} \\' '\n')663fp.write(r'(cm) & 5th & 25th & 50th & 75th & 95th \\' '\n')664fp.write(r'\hline' '\n')665666for i, (cm, ps) in enumerate(zip(xs, ts)):667#print cm, ps668if i % 3 == 0:669PrintCI(fp, cm, ps)670671fp.write(r'\hline' '\n')672fp.write(r'\end{tabular}' '\n')673674675def FitLine(xs, ys, fxs):676"""Fits a line to the xs and ys, and returns fitted values for fxs.677678Applies a log transform to the xs.679680xs: diameter in cm681ys: age in years682fxs: diameter in cm683"""684lxs = [math.log(x) for x in xs]685inter, slope = correlation.LeastSquares(lxs, ys)686# res = correlation.Residuals(lxs, ys, inter, slope)687# r2 = correlation.CoefDetermination(ys, res)688689lfxs = [math.log(x) for x in fxs]690fys = [inter + slope * x for x in lfxs]691return fys692693694def MakeTicks(xs):695"""Makes a pair of sequences for use as pyplot ticks.696697xs: sequence of floats698699Returns (xs, labels), where labels is a sequence of strings.700"""701labels = [str(x) for x in xs]702return xs, labels703704705def MakeLogTicks(xs):706"""Makes a pair of sequences for use as pyplot ticks.707708xs: sequence of floats709710Returns (xs, labels), where labels is a sequence of strings.711"""712lxs = [math.log10(x) for x in xs]713labels = [str(x) for x in xs]714return lxs, labels715716717def TestCorrelation(cdf):718"""Tests the correlated generator.719720Makes sure that the sequence has the right distribution and correlation.721"""722n = 10000723rho = 0.4724725rdt_seq = CorrelatedGenerator(cdf, rho)726xs = [rdt_seq.next() for _ in range(n)]727728rho2 = correlation.SerialCorr(xs)729print rho, rho2730cdf2 = thinkbayes.MakeCdfFromList(xs)731732thinkplot.Cdfs([cdf, cdf2])733thinkplot.Show()734735736def main(script):737for size in [1, 5, 10]:738bucket = CmToBucket(size)739print 'Size, bucket', size, bucket740741SimpleModel()742743random.seed(17)744745cdf = MakeCdf()746747lam1 = FitCdf(cdf)748fit = GenerateCdf(lam1=lam1)749750# TestCorrelation(fit)751752PlotCdf(cdf)753# QQPlot(cdf, fit)754755calc = Calculator()756rho = 0.0757sequences = calc.MakeSequences(100, rho, fit)758PlotSequences(sequences)759760calc.PlotBuckets()761762_ = calc.MakeSequences(1900, rho, fit)763print 'V0-RDT correlation', calc.cache.Correlation()764765print '15.5 Probability age > 8 year', calc.cache.ProbOlder(15.5, 8)766print '6.0 Probability age > 8 year', calc.cache.ProbOlder(6.0, 8)767768calc.PlotConditionalCdfs()769770calc.PlotCredibleIntervals(xscale='log')771772calc.PlotJointDist()773774775if __name__ == '__main__':776main(*sys.argv)777778779780781