📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code for use with "Think Bayes",1by Allen B. Downey, available from greenteapress.com23Copyright 2012 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67"""This file contains class definitions for:89Hist: represents a histogram (map from values to integer frequencies).1011Pmf: represents a probability mass function (map from values to probs).1213_DictWrapper: private parent class for Hist and Pmf.1415Cdf: represents a discrete cumulative distribution function1617Pdf: represents a continuous probability density function1819"""2021import bisect22import copy23import logging24import math25import numpy26import random2728import scipy.stats29from scipy.special import erf, erfinv, gammaln3031ROOT2 = math.sqrt(2)3233def RandomSeed(x):34"""Initialize the random and numpy.random generators.3536x: int seed37"""38random.seed(x)39numpy.random.seed(x)404142def Odds(p):43"""Computes odds for a given probability.4445Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.4647Note: when p=1, the formula for odds divides by zero, which is48normally undefined. But I think it is reasonable to define Odds(1)49to be infinity, so that's what this function does.5051p: float 0-15253Returns: float odds54"""55if p == 1:56return float('inf')57return p / (1 - p)585960def Probability(o):61"""Computes the probability corresponding to given odds.6263Example: o=2 means 2:1 odds in favor, or 2/3 probability6465o: float odds, strictly positive6667Returns: float probability68"""69return o / (o + 1)707172def Probability2(yes, no):73"""Computes the probability corresponding to given odds.7475Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.7677yes, no: int or float odds in favor78"""79return float(yes) / (yes + no)808182class Interpolator(object):83"""Represents a mapping between sorted sequences; performs linear interp.8485Attributes:86xs: sorted list87ys: sorted list88"""8990def __init__(self, xs, ys):91self.xs = xs92self.ys = ys9394def Lookup(self, x):95"""Looks up x and returns the corresponding value of y."""96return self._Bisect(x, self.xs, self.ys)9798def Reverse(self, y):99"""Looks up y and returns the corresponding value of x."""100return self._Bisect(y, self.ys, self.xs)101102def _Bisect(self, x, xs, ys):103"""Helper function."""104if x <= xs[0]:105return ys[0]106if x >= xs[-1]:107return ys[-1]108i = bisect.bisect(xs, x)109frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])110y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])111return y112113114class _DictWrapper(object):115"""An object that contains a dictionary."""116117def __init__(self, values=None, name=''):118"""Initializes the distribution.119120hypos: sequence of hypotheses121"""122self.name = name123self.d = {}124125# flag whether the distribution is under a log transform126self.log = False127128if values is None:129return130131init_methods = [132self.InitPmf,133self.InitMapping,134self.InitSequence,135self.InitFailure,136]137138for method in init_methods:139try:140method(values)141break142except AttributeError:143continue144145if len(self) > 0:146self.Normalize()147148def InitSequence(self, values):149"""Initializes with a sequence of equally-likely values.150151values: sequence of values152"""153for value in values:154self.Set(value, 1)155156def InitMapping(self, values):157"""Initializes with a map from value to probability.158159values: map from value to probability160"""161for value, prob in values.iteritems():162self.Set(value, prob)163164def InitPmf(self, values):165"""Initializes with a Pmf.166167values: Pmf object168"""169for value, prob in values.Items():170self.Set(value, prob)171172def InitFailure(self, values):173"""Raises an error."""174raise ValueError('None of the initialization methods worked.')175176def __len__(self):177return len(self.d)178179def __iter__(self):180return iter(self.d)181182def iterkeys(self):183return iter(self.d)184185def __contains__(self, value):186return value in self.d187188def Copy(self, name=None):189"""Returns a copy.190191Make a shallow copy of d. If you want a deep copy of d,192use copy.deepcopy on the whole object.193194Args:195name: string name for the new Hist196"""197new = copy.copy(self)198new.d = copy.copy(self.d)199new.name = name if name is not None else self.name200return new201202def Scale(self, factor):203"""Multiplies the values by a factor.204205factor: what to multiply by206207Returns: new object208"""209new = self.Copy()210new.d.clear()211212for val, prob in self.Items():213new.Set(val * factor, prob)214return new215216def Log(self, m=None):217"""Log transforms the probabilities.218219Removes values with probability 0.220221Normalizes so that the largest logprob is 0.222"""223if self.log:224raise ValueError("Pmf/Hist already under a log transform")225self.log = True226227if m is None:228m = self.MaxLike()229230for x, p in self.d.iteritems():231if p:232self.Set(x, math.log(p / m))233else:234self.Remove(x)235236def Exp(self, m=None):237"""Exponentiates the probabilities.238239m: how much to shift the ps before exponentiating240241If m is None, normalizes so that the largest prob is 1.242"""243if not self.log:244raise ValueError("Pmf/Hist not under a log transform")245self.log = False246247if m is None:248m = self.MaxLike()249250for x, p in self.d.iteritems():251self.Set(x, math.exp(p - m))252253def GetDict(self):254"""Gets the dictionary."""255return self.d256257def SetDict(self, d):258"""Sets the dictionary."""259self.d = d260261def Values(self):262"""Gets an unsorted sequence of values.263264Note: one source of confusion is that the keys of this265dictionary are the values of the Hist/Pmf, and the266values of the dictionary are frequencies/probabilities.267"""268return self.d.keys()269270def Items(self):271"""Gets an unsorted sequence of (value, freq/prob) pairs."""272return self.d.items()273274def Render(self):275"""Generates a sequence of points suitable for plotting.276277Returns:278tuple of (sorted value sequence, freq/prob sequence)279"""280return zip(*sorted(self.Items()))281282def Print(self):283"""Prints the values and freqs/probs in ascending order."""284for val, prob in sorted(self.d.iteritems()):285print val, prob286287def Set(self, x, y=0):288"""Sets the freq/prob associated with the value x.289290Args:291x: number value292y: number freq or prob293"""294self.d[x] = y295296def Incr(self, x, term=1):297"""Increments the freq/prob associated with the value x.298299Args:300x: number value301term: how much to increment by302"""303self.d[x] = self.d.get(x, 0) + term304305def Mult(self, x, factor):306"""Scales the freq/prob associated with the value x.307308Args:309x: number value310factor: how much to multiply by311"""312self.d[x] = self.d.get(x, 0) * factor313314def Remove(self, x):315"""Removes a value.316317Throws an exception if the value is not there.318319Args:320x: value to remove321"""322del self.d[x]323324def Total(self):325"""Returns the total of the frequencies/probabilities in the map."""326total = sum(self.d.itervalues())327return total328329def MaxLike(self):330"""Returns the largest frequency/probability in the map."""331return max(self.d.itervalues())332333334class Hist(_DictWrapper):335"""Represents a histogram, which is a map from values to frequencies.336337Values can be any hashable type; frequencies are integer counters.338"""339340def Freq(self, x):341"""Gets the frequency associated with the value x.342343Args:344x: number value345346Returns:347int frequency348"""349return self.d.get(x, 0)350351def Freqs(self, xs):352"""Gets frequencies for a sequence of values."""353return [self.Freq(x) for x in xs]354355def IsSubset(self, other):356"""Checks whether the values in this histogram are a subset of357the values in the given histogram."""358for val, freq in self.Items():359if freq > other.Freq(val):360return False361return True362363def Subtract(self, other):364"""Subtracts the values in the given histogram from this histogram."""365for val, freq in other.Items():366self.Incr(val, -freq)367368369class Pmf(_DictWrapper):370"""Represents a probability mass function.371372Values can be any hashable type; probabilities are floating-point.373Pmfs are not necessarily normalized.374"""375376def Prob(self, x, default=0):377"""Gets the probability associated with the value x.378379Args:380x: number value381default: value to return if the key is not there382383Returns:384float probability385"""386return self.d.get(x, default)387388def Probs(self, xs):389"""Gets probabilities for a sequence of values."""390return [self.Prob(x) for x in xs]391392def MakeCdf(self, name=None):393"""Makes a Cdf."""394return MakeCdfFromPmf(self, name=name)395396def ProbGreater(self, x):397"""Probability that a sample from this Pmf exceeds x.398399x: number400401returns: float probability402"""403t = [prob for (val, prob) in self.d.iteritems() if val > x]404return sum(t)405406def ProbLess(self, x):407"""Probability that a sample from this Pmf is less than x.408409x: number410411returns: float probability412"""413t = [prob for (val, prob) in self.d.iteritems() if val < x]414return sum(t)415416def __lt__(self, obj):417"""Less than.418419obj: number or _DictWrapper420421returns: float probability422"""423if isinstance(obj, _DictWrapper):424return PmfProbLess(self, obj)425else:426return self.ProbLess(obj)427428def __gt__(self, obj):429"""Greater than.430431obj: number or _DictWrapper432433returns: float probability434"""435if isinstance(obj, _DictWrapper):436return PmfProbGreater(self, obj)437else:438return self.ProbGreater(obj)439440def __ge__(self, obj):441"""Greater than or equal.442443obj: number or _DictWrapper444445returns: float probability446"""447return 1 - (self < obj)448449def __le__(self, obj):450"""Less than or equal.451452obj: number or _DictWrapper453454returns: float probability455"""456return 1 - (self > obj)457458def __eq__(self, obj):459"""Equal to.460461obj: number or _DictWrapper462463returns: float probability464"""465if isinstance(obj, _DictWrapper):466return PmfProbEqual(self, obj)467else:468return self.Prob(obj)469470def __ne__(self, obj):471"""Not equal to.472473obj: number or _DictWrapper474475returns: float probability476"""477return 1 - (self == obj)478479def Normalize(self, fraction=1.0):480"""Normalizes this PMF so the sum of all probs is fraction.481482Args:483fraction: what the total should be after normalization484485Returns: the total probability before normalizing486"""487if self.log:488raise ValueError("Pmf is under a log transform")489490total = self.Total()491if total == 0.0:492raise ValueError('total probability is zero.')493logging.warning('Normalize: total probability is zero.')494return total495496factor = float(fraction) / total497for x in self.d:498self.d[x] *= factor499500return total501502def Random(self):503"""Chooses a random element from this PMF.504505Returns:506float value from the Pmf507"""508if len(self.d) == 0:509raise ValueError('Pmf contains no values.')510511target = random.random()512total = 0.0513for x, p in self.d.iteritems():514total += p515if total >= target:516return x517518# we shouldn't get here519assert False520521def Mean(self):522"""Computes the mean of a PMF.523524Returns:525float mean526"""527mu = 0.0528for x, p in self.d.iteritems():529mu += p * x530return mu531532def Var(self, mu=None):533"""Computes the variance of a PMF.534535Args:536mu: the point around which the variance is computed;537if omitted, computes the mean538539Returns:540float variance541"""542if mu is None:543mu = self.Mean()544545var = 0.0546for x, p in self.d.iteritems():547var += p * (x - mu) ** 2548return var549550def MaximumLikelihood(self):551"""Returns the value with the highest probability.552553Returns: float probability554"""555prob, val = max((prob, val) for val, prob in self.Items())556return val557558def CredibleInterval(self, percentage=90):559"""Computes the central credible interval.560561If percentage=90, computes the 90% CI.562563Args:564percentage: float between 0 and 100565566Returns:567sequence of two floats, low and high568"""569cdf = self.MakeCdf()570return cdf.CredibleInterval(percentage)571572def __add__(self, other):573"""Computes the Pmf of the sum of values drawn from self and other.574575other: another Pmf576577returns: new Pmf578"""579try:580return self.AddPmf(other)581except AttributeError:582return self.AddConstant(other)583584def AddPmf(self, other):585"""Computes the Pmf of the sum of values drawn from self and other.586587other: another Pmf588589returns: new Pmf590"""591pmf = Pmf()592for v1, p1 in self.Items():593for v2, p2 in other.Items():594pmf.Incr(v1 + v2, p1 * p2)595return pmf596597def AddConstant(self, other):598"""Computes the Pmf of the sum a constant and values from self.599600other: a number601602returns: new Pmf603"""604pmf = Pmf()605for v1, p1 in self.Items():606pmf.Set(v1 + other, p1)607return pmf608609def __sub__(self, other):610"""Computes the Pmf of the diff of values drawn from self and other.611612other: another Pmf613614returns: new Pmf615"""616pmf = Pmf()617for v1, p1 in self.Items():618for v2, p2 in other.Items():619pmf.Incr(v1 - v2, p1 * p2)620return pmf621622def Max(self, k):623"""Computes the CDF of the maximum of k selections from this dist.624625k: int626627returns: new Cdf628"""629cdf = self.MakeCdf()630cdf.ps = [p ** k for p in cdf.ps]631return cdf632633634class Joint(Pmf):635"""Represents a joint distribution.636637The values are sequences (usually tuples)638"""639640def Marginal(self, i, name=''):641"""Gets the marginal distribution of the indicated variable.642643i: index of the variable we want644645Returns: Pmf646"""647pmf = Pmf(name=name)648for vs, prob in self.Items():649pmf.Incr(vs[i], prob)650return pmf651652def Conditional(self, i, j, val, name=''):653"""Gets the conditional distribution of the indicated variable.654655Distribution of vs[i], conditioned on vs[j] = val.656657i: index of the variable we want658j: which variable is conditioned on659val: the value the jth variable has to have660661Returns: Pmf662"""663pmf = Pmf(name=name)664for vs, prob in self.Items():665if vs[j] != val: continue666pmf.Incr(vs[i], prob)667668pmf.Normalize()669return pmf670671def MaxLikeInterval(self, percentage=90):672"""Returns the maximum-likelihood credible interval.673674If percentage=90, computes a 90% CI containing the values675with the highest likelihoods.676677percentage: float between 0 and 100678679Returns: list of values from the suite680"""681interval = []682total = 0683684t = [(prob, val) for val, prob in self.Items()]685t.sort(reverse=True)686687for prob, val in t:688interval.append(val)689total += prob690if total >= percentage / 100.0:691break692693return interval694695696def MakeJoint(pmf1, pmf2):697"""Joint distribution of values from pmf1 and pmf2.698699Args:700pmf1: Pmf object701pmf2: Pmf object702703Returns:704Joint pmf of value pairs705"""706joint = Joint()707for v1, p1 in pmf1.Items():708for v2, p2 in pmf2.Items():709joint.Set((v1, v2), p1 * p2)710return joint711712713def MakeHistFromList(t, name=''):714"""Makes a histogram from an unsorted sequence of values.715716Args:717t: sequence of numbers718name: string name for this histogram719720Returns:721Hist object722"""723hist = Hist(name=name)724[hist.Incr(x) for x in t]725return hist726727728def MakeHistFromDict(d, name=''):729"""Makes a histogram from a map from values to frequencies.730731Args:732d: dictionary that maps values to frequencies733name: string name for this histogram734735Returns:736Hist object737"""738return Hist(d, name)739740741def MakePmfFromList(t, name=''):742"""Makes a PMF from an unsorted sequence of values.743744Args:745t: sequence of numbers746name: string name for this PMF747748Returns:749Pmf object750"""751hist = MakeHistFromList(t)752d = hist.GetDict()753pmf = Pmf(d, name)754pmf.Normalize()755return pmf756757758def MakePmfFromDict(d, name=''):759"""Makes a PMF from a map from values to probabilities.760761Args:762d: dictionary that maps values to probabilities763name: string name for this PMF764765Returns:766Pmf object767"""768pmf = Pmf(d, name)769pmf.Normalize()770return pmf771772773def MakePmfFromItems(t, name=''):774"""Makes a PMF from a sequence of value-probability pairs775776Args:777t: sequence of value-probability pairs778name: string name for this PMF779780Returns:781Pmf object782"""783pmf = Pmf(dict(t), name)784pmf.Normalize()785return pmf786787788def MakePmfFromHist(hist, name=None):789"""Makes a normalized PMF from a Hist object.790791Args:792hist: Hist object793name: string name794795Returns:796Pmf object797"""798if name is None:799name = hist.name800801# make a copy of the dictionary802d = dict(hist.GetDict())803pmf = Pmf(d, name)804pmf.Normalize()805return pmf806807808def MakePmfFromCdf(cdf, name=None):809"""Makes a normalized Pmf from a Cdf object.810811Args:812cdf: Cdf object813name: string name for the new Pmf814815Returns:816Pmf object817"""818if name is None:819name = cdf.name820821pmf = Pmf(name=name)822823prev = 0.0824for val, prob in cdf.Items():825pmf.Incr(val, prob - prev)826prev = prob827828return pmf829830831def MakeMixture(metapmf, name='mix'):832"""Make a mixture distribution.833834Args:835metapmf: Pmf that maps from Pmfs to probs.836name: string name for the new Pmf.837838Returns: Pmf object.839"""840mix = Pmf(name=name)841for pmf, p1 in metapmf.Items():842for x, p2 in pmf.Items():843mix.Incr(x, p1 * p2)844return mix845846847def MakeUniformPmf(low, high, n):848"""Make a uniform Pmf.849850low: lowest value (inclusive)851high: highest value (inclusize)852n: number of values853"""854pmf = Pmf()855for x in numpy.linspace(low, high, n):856pmf.Set(x, 1)857pmf.Normalize()858return pmf859860861class Cdf(object):862"""Represents a cumulative distribution function.863864Attributes:865xs: sequence of values866ps: sequence of probabilities867name: string used as a graph label.868"""869870def __init__(self, xs=None, ps=None, name=''):871self.xs = [] if xs is None else xs872self.ps = [] if ps is None else ps873self.name = name874875def Copy(self, name=None):876"""Returns a copy of this Cdf.877878Args:879name: string name for the new Cdf880"""881if name is None:882name = self.name883return Cdf(list(self.xs), list(self.ps), name)884885def MakePmf(self, name=None):886"""Makes a Pmf."""887return MakePmfFromCdf(self, name=name)888889def Values(self):890"""Returns a sorted list of values.891"""892return self.xs893894def Items(self):895"""Returns a sorted sequence of (value, probability) pairs.896897Note: in Python3, returns an iterator.898"""899return zip(self.xs, self.ps)900901def Append(self, x, p):902"""Add an (x, p) pair to the end of this CDF.903904Note: this us normally used to build a CDF from scratch, not905to modify existing CDFs. It is up to the caller to make sure906that the result is a legal CDF.907"""908self.xs.append(x)909self.ps.append(p)910911def Shift(self, term):912"""Adds a term to the xs.913914term: how much to add915"""916new = self.Copy()917new.xs = [x + term for x in self.xs]918return new919920def Scale(self, factor):921"""Multiplies the xs by a factor.922923factor: what to multiply by924"""925new = self.Copy()926new.xs = [x * factor for x in self.xs]927return new928929def Prob(self, x):930"""Returns CDF(x), the probability that corresponds to value x.931932Args:933x: number934935Returns:936float probability937"""938if x < self.xs[0]: return 0.0939index = bisect.bisect(self.xs, x)940p = self.ps[index - 1]941return p942943def Value(self, p):944"""Returns InverseCDF(p), the value that corresponds to probability p.945946Args:947p: number in the range [0, 1]948949Returns:950number value951"""952if p < 0 or p > 1:953raise ValueError('Probability p must be in range [0, 1]')954955if p == 0: return self.xs[0]956if p == 1: return self.xs[-1]957index = bisect.bisect(self.ps, p)958if p == self.ps[index - 1]:959return self.xs[index - 1]960else:961return self.xs[index]962963def Percentile(self, p):964"""Returns the value that corresponds to percentile p.965966Args:967p: number in the range [0, 100]968969Returns:970number value971"""972return self.Value(p / 100.0)973974def Random(self):975"""Chooses a random value from this distribution."""976return self.Value(random.random())977978def Sample(self, n):979"""Generates a random sample from this distribution.980981Args:982n: int length of the sample983"""984return [self.Random() for i in range(n)]985986def Mean(self):987"""Computes the mean of a CDF.988989Returns:990float mean991"""992old_p = 0993total = 0.0994for x, new_p in zip(self.xs, self.ps):995p = new_p - old_p996total += p * x997old_p = new_p998return total9991000def CredibleInterval(self, percentage=90):1001"""Computes the central credible interval.10021003If percentage=90, computes the 90% CI.10041005Args:1006percentage: float between 0 and 10010071008Returns:1009sequence of two floats, low and high1010"""1011prob = (1 - percentage / 100.0) / 21012interval = self.Value(prob), self.Value(1 - prob)1013return interval10141015def _Round(self, multiplier=1000.0):1016"""1017An entry is added to the cdf only if the percentile differs1018from the previous value in a significant digit, where the number1019of significant digits is determined by multiplier. The1020default is 1000, which keeps log10(1000) = 3 significant digits.1021"""1022# TODO(write this method)1023raise UnimplementedMethodException()10241025def Render(self):1026"""Generates a sequence of points suitable for plotting.10271028An empirical CDF is a step function; linear interpolation1029can be misleading.10301031Returns:1032tuple of (xs, ps)1033"""1034xs = [self.xs[0]]1035ps = [0.0]1036for i, p in enumerate(self.ps):1037xs.append(self.xs[i])1038ps.append(p)10391040try:1041xs.append(self.xs[i + 1])1042ps.append(p)1043except IndexError:1044pass1045return xs, ps10461047def Max(self, k):1048"""Computes the CDF of the maximum of k selections from this dist.10491050k: int10511052returns: new Cdf1053"""1054cdf = self.Copy()1055cdf.ps = [p ** k for p in cdf.ps]1056return cdf105710581059def MakeCdfFromItems(items, name=''):1060"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.10611062Args:1063items: unsorted sequence of (value, frequency) pairs1064name: string name for this CDF10651066Returns:1067cdf: list of (value, fraction) pairs1068"""1069runsum = 01070xs = []1071cs = []10721073for value, count in sorted(items):1074runsum += count1075xs.append(value)1076cs.append(runsum)10771078total = float(runsum)1079ps = [c / total for c in cs]10801081cdf = Cdf(xs, ps, name)1082return cdf108310841085def MakeCdfFromDict(d, name=''):1086"""Makes a CDF from a dictionary that maps values to frequencies.10871088Args:1089d: dictionary that maps values to frequencies.1090name: string name for the data.10911092Returns:1093Cdf object1094"""1095return MakeCdfFromItems(d.iteritems(), name)109610971098def MakeCdfFromHist(hist, name=''):1099"""Makes a CDF from a Hist object.11001101Args:1102hist: Pmf.Hist object1103name: string name for the data.11041105Returns:1106Cdf object1107"""1108return MakeCdfFromItems(hist.Items(), name)110911101111def MakeCdfFromPmf(pmf, name=None):1112"""Makes a CDF from a Pmf object.11131114Args:1115pmf: Pmf.Pmf object1116name: string name for the data.11171118Returns:1119Cdf object1120"""1121if name == None:1122name = pmf.name1123return MakeCdfFromItems(pmf.Items(), name)112411251126def MakeCdfFromList(seq, name=''):1127"""Creates a CDF from an unsorted sequence.11281129Args:1130seq: unsorted sequence of sortable values1131name: string name for the cdf11321133Returns:1134Cdf object1135"""1136hist = MakeHistFromList(seq)1137return MakeCdfFromHist(hist, name)113811391140class UnimplementedMethodException(Exception):1141"""Exception if someone calls a method that should be overridden."""114211431144class Suite(Pmf):1145"""Represents a suite of hypotheses and their probabilities."""11461147def Update(self, data):1148"""Updates each hypothesis based on the data.11491150data: any representation of the data11511152returns: the normalizing constant1153"""1154for hypo in self.Values():1155like = self.Likelihood(data, hypo)1156self.Mult(hypo, like)1157return self.Normalize()11581159def LogUpdate(self, data):1160"""Updates a suite of hypotheses based on new data.11611162Modifies the suite directly; if you want to keep the original, make1163a copy.11641165Note: unlike Update, LogUpdate does not normalize.11661167Args:1168data: any representation of the data1169"""1170for hypo in self.Values():1171like = self.LogLikelihood(data, hypo)1172self.Incr(hypo, like)11731174def UpdateSet(self, dataset):1175"""Updates each hypothesis based on the dataset.11761177This is more efficient than calling Update repeatedly because1178it waits until the end to Normalize.11791180Modifies the suite directly; if you want to keep the original, make1181a copy.11821183dataset: a sequence of data11841185returns: the normalizing constant1186"""1187for data in dataset:1188for hypo in self.Values():1189like = self.Likelihood(data, hypo)1190self.Mult(hypo, like)1191return self.Normalize()11921193def LogUpdateSet(self, dataset):1194"""Updates each hypothesis based on the dataset.11951196Modifies the suite directly; if you want to keep the original, make1197a copy.11981199dataset: a sequence of data12001201returns: None1202"""1203for data in dataset:1204self.LogUpdate(data)12051206def Likelihood(self, data, hypo):1207"""Computes the likelihood of the data under the hypothesis.12081209hypo: some representation of the hypothesis1210data: some representation of the data1211"""1212raise UnimplementedMethodException()12131214def LogLikelihood(self, data, hypo):1215"""Computes the log likelihood of the data under the hypothesis.12161217hypo: some representation of the hypothesis1218data: some representation of the data1219"""1220raise UnimplementedMethodException()12211222def Print(self):1223"""Prints the hypotheses and their probabilities."""1224for hypo, prob in sorted(self.Items()):1225print hypo, prob12261227def MakeOdds(self):1228"""Transforms from probabilities to odds.12291230Values with prob=0 are removed.1231"""1232for hypo, prob in self.Items():1233if prob:1234self.Set(hypo, Odds(prob))1235else:1236self.Remove(hypo)12371238def MakeProbs(self):1239"""Transforms from odds to probabilities."""1240for hypo, odds in self.Items():1241self.Set(hypo, Probability(odds))124212431244def MakeSuiteFromList(t, name=''):1245"""Makes a suite from an unsorted sequence of values.12461247Args:1248t: sequence of numbers1249name: string name for this suite12501251Returns:1252Suite object1253"""1254hist = MakeHistFromList(t)1255d = hist.GetDict()1256return MakeSuiteFromDict(d)125712581259def MakeSuiteFromHist(hist, name=None):1260"""Makes a normalized suite from a Hist object.12611262Args:1263hist: Hist object1264name: string name12651266Returns:1267Suite object1268"""1269if name is None:1270name = hist.name12711272# make a copy of the dictionary1273d = dict(hist.GetDict())1274return MakeSuiteFromDict(d, name)127512761277def MakeSuiteFromDict(d, name=''):1278"""Makes a suite from a map from values to probabilities.12791280Args:1281d: dictionary that maps values to probabilities1282name: string name for this suite12831284Returns:1285Suite object1286"""1287suite = Suite(name=name)1288suite.SetDict(d)1289suite.Normalize()1290return suite129112921293def MakeSuiteFromCdf(cdf, name=None):1294"""Makes a normalized Suite from a Cdf object.12951296Args:1297cdf: Cdf object1298name: string name for the new Suite12991300Returns:1301Suite object1302"""1303if name is None:1304name = cdf.name13051306suite = Suite(name=name)13071308prev = 0.01309for val, prob in cdf.Items():1310suite.Incr(val, prob - prev)1311prev = prob13121313return suite131413151316class Pdf(object):1317"""Represents a probability density function (PDF)."""13181319def Density(self, x):1320"""Evaluates this Pdf at x.13211322Returns: float probability density1323"""1324raise UnimplementedMethodException()13251326def MakePmf(self, xs, name=''):1327"""Makes a discrete version of this Pdf, evaluated at xs.13281329xs: equally-spaced sequence of values13301331Returns: new Pmf1332"""1333pmf = Pmf(name=name)1334for x in xs:1335pmf.Set(x, self.Density(x))1336pmf.Normalize()1337return pmf133813391340class GaussianPdf(Pdf):1341"""Represents the PDF of a Gaussian distribution."""13421343def __init__(self, mu, sigma):1344"""Constructs a Gaussian Pdf with given mu and sigma.13451346mu: mean1347sigma: standard deviation1348"""1349self.mu = mu1350self.sigma = sigma13511352def Density(self, x):1353"""Evaluates this Pdf at x.13541355Returns: float probability density1356"""1357return EvalGaussianPdf(x, self.mu, self.sigma)135813591360class EstimatedPdf(Pdf):1361"""Represents a PDF estimated by KDE."""13621363def __init__(self, sample):1364"""Estimates the density function based on a sample.13651366sample: sequence of data1367"""1368self.kde = scipy.stats.gaussian_kde(sample)13691370def Density(self, x):1371"""Evaluates this Pdf at x.13721373Returns: float probability density1374"""1375return self.kde.evaluate(x)13761377def MakePmf(self, xs, name=''):1378ps = self.kde.evaluate(xs)1379pmf = MakePmfFromItems(zip(xs, ps), name=name)1380return pmf138113821383def Percentile(pmf, percentage):1384"""Computes a percentile of a given Pmf.13851386percentage: float 0-1001387"""1388p = percentage / 100.01389total = 01390for val, prob in pmf.Items():1391total += prob1392if total >= p:1393return val139413951396def CredibleInterval(pmf, percentage=90):1397"""Computes a credible interval for a given distribution.13981399If percentage=90, computes the 90% CI.14001401Args:1402pmf: Pmf object representing a posterior distribution1403percentage: float between 0 and 10014041405Returns:1406sequence of two floats, low and high1407"""1408cdf = pmf.MakeCdf()1409prob = (1 - percentage / 100.0) / 21410interval = cdf.Value(prob), cdf.Value(1 - prob)1411return interval141214131414def PmfProbLess(pmf1, pmf2):1415"""Probability that a value from pmf1 is less than a value from pmf2.14161417Args:1418pmf1: Pmf object1419pmf2: Pmf object14201421Returns:1422float probability1423"""1424total = 0.01425for v1, p1 in pmf1.Items():1426for v2, p2 in pmf2.Items():1427if v1 < v2:1428total += p1 * p21429return total143014311432def PmfProbGreater(pmf1, pmf2):1433"""Probability that a value from pmf1 is less than a value from pmf2.14341435Args:1436pmf1: Pmf object1437pmf2: Pmf object14381439Returns:1440float probability1441"""1442total = 0.01443for v1, p1 in pmf1.Items():1444for v2, p2 in pmf2.Items():1445if v1 > v2:1446total += p1 * p21447return total144814491450def PmfProbEqual(pmf1, pmf2):1451"""Probability that a value from pmf1 equals a value from pmf2.14521453Args:1454pmf1: Pmf object1455pmf2: Pmf object14561457Returns:1458float probability1459"""1460total = 0.01461for v1, p1 in pmf1.Items():1462for v2, p2 in pmf2.Items():1463if v1 == v2:1464total += p1 * p21465return total146614671468def RandomSum(dists):1469"""Chooses a random value from each dist and returns the sum.14701471dists: sequence of Pmf or Cdf objects14721473returns: numerical sum1474"""1475total = sum(dist.Random() for dist in dists)1476return total147714781479def SampleSum(dists, n):1480"""Draws a sample of sums from a list of distributions.14811482dists: sequence of Pmf or Cdf objects1483n: sample size14841485returns: new Pmf of sums1486"""1487pmf = MakePmfFromList(RandomSum(dists) for i in xrange(n))1488return pmf148914901491def EvalGaussianPdf(x, mu, sigma):1492"""Computes the unnormalized PDF of the normal distribution.14931494x: value1495mu: mean1496sigma: standard deviation14971498returns: float probability density1499"""1500return scipy.stats.norm.pdf(x, mu, sigma)150115021503def MakeGaussianPmf(mu, sigma, num_sigmas, n=201):1504"""Makes a PMF discrete approx to a Gaussian distribution.15051506mu: float mean1507sigma: float standard deviation1508num_sigmas: how many sigmas to extend in each direction1509n: number of values in the Pmf15101511returns: normalized Pmf1512"""1513pmf = Pmf()1514low = mu - num_sigmas * sigma1515high = mu + num_sigmas * sigma15161517for x in numpy.linspace(low, high, n):1518p = EvalGaussianPdf(x, mu, sigma)1519pmf.Set(x, p)1520pmf.Normalize()1521return pmf152215231524def EvalBinomialPmf(k, n, p):1525"""Evaluates the binomial pmf.15261527Returns the probabily of k successes in n trials with probability p.1528"""1529return scipy.stats.binom.pmf(k, n, p)153015311532def EvalPoissonPmf(k, lam):1533"""Computes the Poisson PMF.15341535k: number of events1536lam: parameter lambda in events per unit time15371538returns: float probability1539"""1540return scipy.stats.poisson.pmf(k, lam)154115421543def MakePoissonPmf(lam, high, step=1):1544"""Makes a PMF discrete approx to a Poisson distribution.15451546lam: parameter lambda in events per unit time1547high: upper bound of the Pmf15481549returns: normalized Pmf1550"""1551pmf = Pmf()1552for k in xrange(0, high + 1, step):1553p = EvalPoissonPmf(k, lam)1554pmf.Set(k, p)1555pmf.Normalize()1556return pmf155715581559def EvalExponentialPdf(x, lam):1560"""Computes the exponential PDF.15611562x: value1563lam: parameter lambda in events per unit time15641565returns: float probability density1566"""1567return lam * math.exp(-lam * x)156815691570def EvalExponentialCdf(x, lam):1571"""Evaluates CDF of the exponential distribution with parameter lam."""1572return 1 - math.exp(-lam * x)157315741575def MakeExponentialPmf(lam, high, n=200):1576"""Makes a PMF discrete approx to an exponential distribution.15771578lam: parameter lambda in events per unit time1579high: upper bound1580n: number of values in the Pmf15811582returns: normalized Pmf1583"""1584pmf = Pmf()1585for x in numpy.linspace(0, high, n):1586p = EvalExponentialPdf(x, lam)1587pmf.Set(x, p)1588pmf.Normalize()1589return pmf159015911592def StandardGaussianCdf(x):1593"""Evaluates the CDF of the standard Gaussian distribution.15941595See http://en.wikipedia.org/wiki/Normal_distribution1596#Cumulative_distribution_function15971598Args:1599x: float16001601Returns:1602float1603"""1604return (erf(x / ROOT2) + 1) / 2160516061607def GaussianCdf(x, mu=0, sigma=1):1608"""Evaluates the CDF of the gaussian distribution.16091610Args:1611x: float16121613mu: mean parameter16141615sigma: standard deviation parameter16161617Returns:1618float1619"""1620return StandardGaussianCdf(float(x - mu) / sigma)162116221623def GaussianCdfInverse(p, mu=0, sigma=1):1624"""Evaluates the inverse CDF of the gaussian distribution.16251626See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function16271628Args:1629p: float16301631mu: mean parameter16321633sigma: standard deviation parameter16341635Returns:1636float1637"""1638x = ROOT2 * erfinv(2 * p - 1)1639return mu + x * sigma164016411642class Beta(object):1643"""Represents a Beta distribution.16441645See http://en.wikipedia.org/wiki/Beta_distribution1646"""1647def __init__(self, alpha=1, beta=1, name=''):1648"""Initializes a Beta distribution."""1649self.alpha = alpha1650self.beta = beta1651self.name = name16521653def Update(self, data):1654"""Updates a Beta distribution.16551656data: pair of int (heads, tails)1657"""1658heads, tails = data1659self.alpha += heads1660self.beta += tails16611662def Mean(self):1663"""Computes the mean of this distribution."""1664return float(self.alpha) / (self.alpha + self.beta)16651666def Random(self):1667"""Generates a random variate from this distribution."""1668return random.betavariate(self.alpha, self.beta)16691670def Sample(self, n):1671"""Generates a random sample from this distribution.16721673n: int sample size1674"""1675size = n,1676return numpy.random.beta(self.alpha, self.beta, size)16771678def EvalPdf(self, x):1679"""Evaluates the PDF at x."""1680return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)16811682def MakePmf(self, steps=101, name=''):1683"""Returns a Pmf of this distribution.16841685Note: Normally, we just evaluate the PDF at a sequence1686of points and treat the probability density as a probability1687mass.16881689But if alpha or beta is less than one, we have to be1690more careful because the PDF goes to infinity at x=01691and x=1. In that case we evaluate the CDF and compute1692differences.1693"""1694if self.alpha < 1 or self.beta < 1:1695cdf = self.MakeCdf()1696pmf = cdf.MakePmf()1697return pmf16981699xs = [i / (steps - 1.0) for i in xrange(steps)]1700probs = [self.EvalPdf(x) for x in xs]1701pmf = MakePmfFromDict(dict(zip(xs, probs)), name)1702return pmf17031704def MakeCdf(self, steps=101):1705"""Returns the CDF of this distribution."""1706xs = [i / (steps - 1.0) for i in xrange(steps)]1707ps = [scipy.special.betainc(self.alpha, self.beta, x) for x in xs]1708cdf = Cdf(xs, ps)1709return cdf171017111712class Dirichlet(object):1713"""Represents a Dirichlet distribution.17141715See http://en.wikipedia.org/wiki/Dirichlet_distribution1716"""17171718def __init__(self, n, conc=1, name=''):1719"""Initializes a Dirichlet distribution.17201721n: number of dimensions1722conc: concentration parameter (smaller yields more concentration)1723name: string name1724"""1725if n < 2:1726raise ValueError('A Dirichlet distribution with '1727'n<2 makes no sense')17281729self.n = n1730self.params = numpy.ones(n, dtype=numpy.float) * conc1731self.name = name17321733def Update(self, data):1734"""Updates a Dirichlet distribution.17351736data: sequence of observations, in order corresponding to params1737"""1738m = len(data)1739self.params[:m] += data17401741def Random(self):1742"""Generates a random variate from this distribution.17431744Returns: normalized vector of fractions1745"""1746p = numpy.random.gamma(self.params)1747return p / p.sum()17481749def Likelihood(self, data):1750"""Computes the likelihood of the data.17511752Selects a random vector of probabilities from this distribution.17531754Returns: float probability1755"""1756m = len(data)1757if self.n < m:1758return 017591760x = data1761p = self.Random()1762q = p[:m] ** x1763return q.prod()17641765def LogLikelihood(self, data):1766"""Computes the log likelihood of the data.17671768Selects a random vector of probabilities from this distribution.17691770Returns: float log probability1771"""1772m = len(data)1773if self.n < m:1774return float('-inf')17751776x = self.Random()1777y = numpy.log(x[:m]) * data1778return y.sum()17791780def MarginalBeta(self, i):1781"""Computes the marginal distribution of the ith element.17821783See http://en.wikipedia.org/wiki/Dirichlet_distribution1784#Marginal_distributions17851786i: int17871788Returns: Beta object1789"""1790alpha0 = self.params.sum()1791alpha = self.params[i]1792return Beta(alpha, alpha0 - alpha)17931794def PredictivePmf(self, xs, name=''):1795"""Makes a predictive distribution.17961797xs: values to go into the Pmf17981799Returns: Pmf that maps from x to the mean prevalence of x1800"""1801alpha0 = self.params.sum()1802ps = self.params / alpha01803return MakePmfFromItems(zip(xs, ps), name=name)180418051806def BinomialCoef(n, k):1807"""Compute the binomial coefficient "n choose k".18081809n: number of trials1810k: number of successes18111812Returns: float1813"""1814return scipy.misc.comb(n, k)181518161817def LogBinomialCoef(n, k):1818"""Computes the log of the binomial coefficient.18191820http://math.stackexchange.com/questions/64716/1821approximating-the-logarithm-of-the-binomial-coefficient18221823n: number of trials1824k: number of successes18251826Returns: float1827"""1828return n * log(n) - k * log(k) - (n - k) * log(n - k)18291830183118321833