Repository for a workshop on Bayesian statistics
"""This file contains code for use with "Think Stats" and1"Think Bayes", both by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89"""This file contains class definitions for:1011Hist: represents a histogram (map from values to integer frequencies).1213Pmf: represents a probability mass function (map from values to probs).1415_DictWrapper: private parent class for Hist and Pmf.1617Cdf: represents a discrete cumulative distribution function1819Pdf: represents a continuous probability density function2021"""2223import bisect24import copy25import logging26import math27import random28import re2930from collections import Counter31from operator import itemgetter3233import thinkplot3435import numpy as np36import pandas3738import scipy39from scipy import stats40from scipy import special41from scipy import ndimage4243from scipy.special import gamma4445from io import open4647ROOT2 = math.sqrt(2)4849def RandomSeed(x):50"""Initialize the random and np.random generators.5152x: int seed53"""54random.seed(x)55np.random.seed(x)565758def Odds(p):59"""Computes odds for a given probability.6061Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.6263Note: when p=1, the formula for odds divides by zero, which is64normally undefined. But I think it is reasonable to define Odds(1)65to be infinity, so that's what this function does.6667p: float 0-16869Returns: float odds70"""71if p == 1:72return float('inf')73return p / (1 - p)747576def Probability(o):77"""Computes the probability corresponding to given odds.7879Example: o=2 means 2:1 odds in favor, or 2/3 probability8081o: float odds, strictly positive8283Returns: float probability84"""85return o / (o + 1)868788def Probability2(yes, no):89"""Computes the probability corresponding to given odds.9091Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.9293yes, no: int or float odds in favor94"""95return yes / (yes + no)969798class Interpolator(object):99"""Represents a mapping between sorted sequences; performs linear interp.100101Attributes:102xs: sorted list103ys: sorted list104"""105106def __init__(self, xs, ys):107self.xs = xs108self.ys = ys109110def Lookup(self, x):111"""Looks up x and returns the corresponding value of y."""112return self._Bisect(x, self.xs, self.ys)113114def Reverse(self, y):115"""Looks up y and returns the corresponding value of x."""116return self._Bisect(y, self.ys, self.xs)117118def _Bisect(self, x, xs, ys):119"""Helper function."""120if x <= xs[0]:121return ys[0]122if x >= xs[-1]:123return ys[-1]124i = bisect.bisect(xs, x)125frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])126y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])127return y128129130# When we plot Hist, Pmf and Cdf objects, they don't appear in131# the legend unless we override the default label.132DEFAULT_LABEL = '_nolegend_'133134135class _DictWrapper(object):136"""An object that contains a dictionary."""137138def __init__(self, obj=None, label=None):139"""Initializes the distribution.140141obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs142label: string label143"""144self.label = label if label is not None else DEFAULT_LABEL145self.d = {}146147# flag whether the distribution is under a log transform148self.log = False149150if obj is None:151return152153if isinstance(obj, (_DictWrapper, Cdf, Pdf)):154self.label = label if label is not None else obj.label155156if isinstance(obj, dict):157self.d.update(obj.items())158elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):159self.d.update(obj.Items())160elif isinstance(obj, pandas.Series):161self.d.update(obj.value_counts().iteritems())162else:163# finally, treat it like a list164self.d.update(Counter(obj))165166if len(self) > 0 and isinstance(self, Pmf):167self.Normalize()168169def __hash__(self):170return id(self)171172def __str__(self):173cls = self.__class__.__name__174if self.label == DEFAULT_LABEL:175return '%s(%s)' % (cls, str(self.d))176else:177return self.label178179def __repr__(self):180cls = self.__class__.__name__181if self.label == DEFAULT_LABEL:182return '%s(%s)' % (cls, repr(self.d))183else:184return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label))185186def __eq__(self, other):187try:188return self.d == other.d189except AttributeError:190return False191192def __len__(self):193return len(self.d)194195def __iter__(self):196return iter(self.d)197198def iterkeys(self):199"""Returns an iterator over keys."""200return iter(self.d)201202def __contains__(self, value):203return value in self.d204205def __getitem__(self, value):206return self.d.get(value, 0)207208def __setitem__(self, value, prob):209self.d[value] = prob210211def __delitem__(self, value):212del self.d[value]213214def Copy(self, label=None):215"""Returns a copy.216217Make a shallow copy of d. If you want a deep copy of d,218use copy.deepcopy on the whole object.219220label: string label for the new Hist221222returns: new _DictWrapper with the same type223"""224new = copy.copy(self)225new.d = copy.copy(self.d)226new.label = label if label is not None else self.label227return new228229def Scale(self, factor):230"""Multiplies the values by a factor.231232factor: what to multiply by233234Returns: new object235"""236new = self.Copy()237new.d.clear()238239for val, prob in self.Items():240new.Set(val * factor, prob)241return new242243def Log(self, m=None):244"""Log transforms the probabilities.245246Removes values with probability 0.247248Normalizes so that the largest logprob is 0.249"""250if self.log:251raise ValueError("Pmf/Hist already under a log transform")252self.log = True253254if m is None:255m = self.MaxLike()256257for x, p in self.d.items():258if p:259self.Set(x, math.log(p / m))260else:261self.Remove(x)262263def Exp(self, m=None):264"""Exponentiates the probabilities.265266m: how much to shift the ps before exponentiating267268If m is None, normalizes so that the largest prob is 1.269"""270if not self.log:271raise ValueError("Pmf/Hist not under a log transform")272self.log = False273274if m is None:275m = self.MaxLike()276277for x, p in self.d.items():278self.Set(x, math.exp(p - m))279280def GetDict(self):281"""Gets the dictionary."""282return self.d283284def SetDict(self, d):285"""Sets the dictionary."""286self.d = d287288def Values(self):289"""Gets an unsorted sequence of values.290291Note: one source of confusion is that the keys of this292dictionary are the values of the Hist/Pmf, and the293values of the dictionary are frequencies/probabilities.294"""295return self.d.keys()296297def Items(self):298"""Gets an unsorted sequence of (value, freq/prob) pairs."""299return self.d.items()300301def SortedItems(self):302"""Gets a sorted sequence of (value, freq/prob) pairs.303304It items are unsortable, the result is unsorted.305"""306def isnan(x):307try:308return math.isnan(x)309except TypeError:310return False311312if any([isnan(x) for x in self.Values()]):313msg = 'Keys contain NaN, may not sort correctly.'314logging.warning(msg)315316try:317return sorted(self.d.items())318except TypeError:319return self.d.items()320321def Render(self, **options):322"""Generates a sequence of points suitable for plotting.323324Note: options are ignored325326Returns:327tuple of (sorted value sequence, freq/prob sequence)328"""329return zip(*self.SortedItems())330331def MakeCdf(self, label=None):332"""Makes a Cdf."""333label = label if label is not None else self.label334return Cdf(self, label=label)335336def Print(self):337"""Prints the values and freqs/probs in ascending order."""338for val, prob in self.SortedItems():339print(val, prob)340341def Set(self, x, y=0):342"""Sets the freq/prob associated with the value x.343344Args:345x: number value346y: number freq or prob347"""348self.d[x] = y349350def Incr(self, x, term=1):351"""Increments the freq/prob associated with the value x.352353Args:354x: number value355term: how much to increment by356"""357self.d[x] = self.d.get(x, 0) + term358359def Mult(self, x, factor):360"""Scales the freq/prob associated with the value x.361362Args:363x: number value364factor: how much to multiply by365"""366self.d[x] = self.d.get(x, 0) * factor367368def Remove(self, x):369"""Removes a value.370371Throws an exception if the value is not there.372373Args:374x: value to remove375"""376del self.d[x]377378def Total(self):379"""Returns the total of the frequencies/probabilities in the map."""380total = sum(self.d.values())381return total382383def MaxLike(self):384"""Returns the largest frequency/probability in the map."""385return max(self.d.values())386387def Largest(self, n=10):388"""Returns the largest n values, with frequency/probability.389390n: number of items to return391"""392return sorted(self.d.items(), reverse=True)[:n]393394def Smallest(self, n=10):395"""Returns the smallest n values, with frequency/probability.396397n: number of items to return398"""399return sorted(self.d.items(), reverse=False)[:n]400401402class Hist(_DictWrapper):403"""Represents a histogram, which is a map from values to frequencies.404405Values can be any hashable type; frequencies are integer counters.406"""407def Freq(self, x):408"""Gets the frequency associated with the value x.409410Args:411x: number value412413Returns:414int frequency415"""416return self.d.get(x, 0)417418def Freqs(self, xs):419"""Gets frequencies for a sequence of values."""420return [self.Freq(x) for x in xs]421422def IsSubset(self, other):423"""Checks whether the values in this histogram are a subset of424the values in the given histogram."""425for val, freq in self.Items():426if freq > other.Freq(val):427return False428return True429430def Subtract(self, other):431"""Subtracts the values in the given histogram from this histogram."""432for val, freq in other.Items():433self.Incr(val, -freq)434435436class Pmf(_DictWrapper):437"""Represents a probability mass function.438439Values can be any hashable type; probabilities are floating-point.440Pmfs are not necessarily normalized.441"""442443def Prob(self, x, default=0):444"""Gets the probability associated with the value x.445446Args:447x: number value448default: value to return if the key is not there449450Returns:451float probability452"""453return self.d.get(x, default)454455def Probs(self, xs):456"""Gets probabilities for a sequence of values."""457return [self.Prob(x) for x in xs]458459def Percentile(self, percentage):460"""Computes a percentile of a given Pmf.461462Note: this is not super efficient. If you are planning463to compute more than a few percentiles, compute the Cdf.464465percentage: float 0-100466467returns: value from the Pmf468"""469p = percentage / 100470total = 0471for val, prob in sorted(self.Items()):472total += prob473if total >= p:474return val475476def ProbGreater(self, x):477"""Probability that a sample from this Pmf exceeds x.478479x: number480481returns: float probability482"""483if isinstance(x, _DictWrapper):484return PmfProbGreater(self, x)485else:486t = [prob for (val, prob) in self.d.items() if val > x]487return sum(t)488489def ProbLess(self, x):490"""Probability that a sample from this Pmf is less than x.491492x: number493494returns: float probability495"""496if isinstance(x, _DictWrapper):497return PmfProbLess(self, x)498else:499t = [prob for (val, prob) in self.d.items() if val < x]500return sum(t)501502def ProbEqual(self, x):503"""Probability that a sample from this Pmf is exactly x.504505x: number506507returns: float probability508"""509if isinstance(x, _DictWrapper):510return PmfProbEqual(self, x)511else:512return self[x]513514# NOTE: I've decided to remove the magic comparators because they515# have the side-effect of making Pmf sortable, but in fact they516# don't support sorting.517518def Normalize(self, fraction=1):519"""Normalizes this PMF so the sum of all probs is fraction.520521Args:522fraction: what the total should be after normalization523524Returns: the total probability before normalizing525"""526if self.log:527raise ValueError("Normalize: Pmf is under a log transform")528529total = self.Total()530if total == 0:531raise ValueError('Normalize: total probability is zero.')532533factor = fraction / total534for x in self.d:535self.d[x] *= factor536537return total538539def Random(self):540"""Chooses a random element from this PMF.541542Note: this is not very efficient. If you plan to call543this more than a few times, consider converting to a CDF.544545Returns:546float value from the Pmf547"""548target = random.random()549total = 0550for x, p in self.d.items():551total += p552if total >= target:553return x554555# we shouldn't get here556raise ValueError('Random: Pmf might not be normalized.')557558def Sample(self, n):559"""Generates a random sample from this distribution.560561n: int length of the sample562returns: NumPy array563"""564return self.MakeCdf().Sample(n)565566def Mean(self):567"""Computes the mean of a PMF.568569Returns:570float mean571"""572return sum(p * x for x, p in self.Items())573574def Median(self):575"""Computes the median of a PMF.576577Returns:578float median579"""580return self.MakeCdf().Percentile(50)581582def Var(self, mu=None):583"""Computes the variance of a PMF.584585mu: the point around which the variance is computed;586if omitted, computes the mean587588returns: float variance589"""590if mu is None:591mu = self.Mean()592593return sum(p * (x-mu)**2 for x, p in self.Items())594595def Expect(self, func):596"""Computes the expectation of func(x).597598Returns:599expectation600"""601return np.sum(p * func(x) for x, p in self.Items())602603def Std(self, mu=None):604"""Computes the standard deviation of a PMF.605606mu: the point around which the variance is computed;607if omitted, computes the mean608609returns: float standard deviation610"""611var = self.Var(mu)612return math.sqrt(var)613614def Mode(self):615"""Returns the value with the highest probability.616617Returns: float probability618"""619_, val = max((prob, val) for val, prob in self.Items())620return val621622# The mode of a posterior is the maximum aposteori probability (MAP)623MAP = Mode624625# If the distribution contains likelihoods only, the peak is the626# maximum likelihood estimator.627MaximumLikelihood = Mode628629def CredibleInterval(self, percentage=90):630"""Computes the central credible interval.631632If percentage=90, computes the 90% CI.633634Args:635percentage: float between 0 and 100636637Returns:638sequence of two floats, low and high639"""640cdf = self.MakeCdf()641return cdf.CredibleInterval(percentage)642643def __add__(self, other):644"""Computes the Pmf of the sum of values drawn from self and other.645646other: another Pmf or a scalar647648returns: new Pmf649"""650try:651return self.AddPmf(other)652except AttributeError:653return self.AddConstant(other)654655__radd__ = __add__656657def AddPmf(self, other):658"""Computes the Pmf of the sum of values drawn from self and other.659660other: another Pmf661662returns: new Pmf663"""664pmf = Pmf()665for v1, p1 in self.Items():666for v2, p2 in other.Items():667pmf[v1 + v2] += p1 * p2668return pmf669670def AddConstant(self, other):671"""Computes the Pmf of the sum a constant and values from self.672673other: a number674675returns: new Pmf676"""677if other == 0:678return self.Copy()679680pmf = Pmf()681for v1, p1 in self.Items():682pmf.Set(v1 + other, p1)683return pmf684685def __sub__(self, other):686"""Computes the Pmf of the diff of values drawn from self and other.687688other: another Pmf689690returns: new Pmf691"""692try:693return self.SubPmf(other)694except AttributeError:695return self.AddConstant(-other)696697def SubPmf(self, other):698"""Computes the Pmf of the diff of values drawn from self and other.699700other: another Pmf701702returns: new Pmf703"""704pmf = Pmf()705for v1, p1 in self.Items():706for v2, p2 in other.Items():707pmf.Incr(v1 - v2, p1 * p2)708return pmf709710def __mul__(self, other):711"""Computes the Pmf of the product of values drawn from self and other.712713other: another Pmf714715returns: new Pmf716"""717try:718return self.MulPmf(other)719except AttributeError:720return self.MulConstant(other)721722def MulPmf(self, other):723"""Computes the Pmf of the diff of values drawn from self and other.724725other: another Pmf726727returns: new Pmf728"""729pmf = Pmf()730for v1, p1 in self.Items():731for v2, p2 in other.Items():732pmf.Incr(v1 * v2, p1 * p2)733return pmf734735def MulConstant(self, other):736"""Computes the Pmf of the product of a constant and values from self.737738other: a number739740returns: new Pmf741"""742pmf = Pmf()743for v1, p1 in self.Items():744pmf.Set(v1 * other, p1)745return pmf746747def __div__(self, other):748"""Computes the Pmf of the ratio of values drawn from self and other.749750other: another Pmf751752returns: new Pmf753"""754try:755return self.DivPmf(other)756except AttributeError:757return self.MulConstant(1/other)758759__truediv__ = __div__760761def DivPmf(self, other):762"""Computes the Pmf of the ratio of values drawn from self and other.763764other: another Pmf765766returns: new Pmf767"""768pmf = Pmf()769for v1, p1 in self.Items():770for v2, p2 in other.Items():771pmf.Incr(v1 / v2, p1 * p2)772return pmf773774def Max(self, k):775"""Computes the CDF of the maximum of k selections from this dist.776777k: int778779returns: new Cdf780"""781cdf = self.MakeCdf()782cdf.ps **= k783return cdf784785786class Joint(Pmf):787"""Represents a joint distribution.788789The values are sequences (usually tuples)790"""791792def Marginal(self, i, label=None):793"""Gets the marginal distribution of the indicated variable.794795i: index of the variable we want796797Returns: Pmf798"""799pmf = Pmf(label=label)800for vs, prob in self.Items():801pmf.Incr(vs[i], prob)802return pmf803804def Conditional(self, i, j, val, label=None):805"""Gets the conditional distribution of the indicated variable.806807Distribution of vs[i], conditioned on vs[j] = val.808809i: index of the variable we want810j: which variable is conditioned on811val: the value the jth variable has to have812813Returns: Pmf814"""815pmf = Pmf(label=label)816for vs, prob in self.Items():817if vs[j] != val:818continue819pmf.Incr(vs[i], prob)820821pmf.Normalize()822return pmf823824def MaxLikeInterval(self, percentage=90):825"""Returns the maximum-likelihood credible interval.826827If percentage=90, computes a 90% CI containing the values828with the highest likelihoods.829830percentage: float between 0 and 100831832Returns: list of values from the suite833"""834interval = []835total = 0836837t = [(prob, val) for val, prob in self.Items()]838t.sort(reverse=True)839840for prob, val in t:841interval.append(val)842total += prob843if total >= percentage / 100:844break845846return interval847848849def MakeJoint(pmf1, pmf2):850"""Joint distribution of values from pmf1 and pmf2.851852Assumes that the PMFs represent independent random variables.853854Args:855pmf1: Pmf object856pmf2: Pmf object857858Returns:859Joint pmf of value pairs860"""861joint = Joint()862for v1, p1 in pmf1.Items():863for v2, p2 in pmf2.Items():864joint.Set((v1, v2), p1 * p2)865return joint866867868def MakeHistFromList(t, label=None):869"""Makes a histogram from an unsorted sequence of values.870871Args:872t: sequence of numbers873label: string label for this histogram874875Returns:876Hist object877"""878return Hist(t, label=label)879880881def MakeHistFromDict(d, label=None):882"""Makes a histogram from a map from values to frequencies.883884Args:885d: dictionary that maps values to frequencies886label: string label for this histogram887888Returns:889Hist object890"""891return Hist(d, label)892893894def MakePmfFromList(t, label=None):895"""Makes a PMF from an unsorted sequence of values.896897Args:898t: sequence of numbers899label: string label for this PMF900901Returns:902Pmf object903"""904return Pmf(t, label=label)905906907def MakePmfFromDict(d, label=None):908"""Makes a PMF from a map from values to probabilities.909910Args:911d: dictionary that maps values to probabilities912label: string label for this PMF913914Returns:915Pmf object916"""917return Pmf(d, label=label)918919920def MakePmfFromItems(t, label=None):921"""Makes a PMF from a sequence of value-probability pairs922923Args:924t: sequence of value-probability pairs925label: string label for this PMF926927Returns:928Pmf object929"""930return Pmf(dict(t), label=label)931932933def MakePmfFromHist(hist, label=None):934"""Makes a normalized PMF from a Hist object.935936Args:937hist: Hist object938label: string label939940Returns:941Pmf object942"""943if label is None:944label = hist.label945946return Pmf(hist, label=label)947948949def MakeMixture(metapmf, label='mix'):950"""Make a mixture distribution.951952Args:953metapmf: Pmf that maps from Pmfs to probs.954label: string label for the new Pmf.955956Returns: Pmf object.957"""958mix = Pmf(label=label)959for pmf, p1 in metapmf.Items():960for x, p2 in pmf.Items():961mix[x] += p1 * p2962return mix963964965def MakeUniformPmf(low, high, n):966"""Make a uniform Pmf.967968low: lowest value (inclusive)969high: highest value (inclusize)970n: number of values971"""972pmf = Pmf()973for x in np.linspace(low, high, n):974pmf.Set(x, 1)975pmf.Normalize()976return pmf977978979class Cdf:980"""Represents a cumulative distribution function.981982Attributes:983xs: sequence of values984ps: sequence of probabilities985label: string used as a graph label.986"""987def __init__(self, obj=None, ps=None, label=None):988"""Initializes.989990If ps is provided, obj must be the corresponding list of values.991992obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs993ps: list of cumulative probabilities994label: string label995"""996self.label = label if label is not None else DEFAULT_LABEL997998if isinstance(obj, (_DictWrapper, Cdf, Pdf)):999if not label:1000self.label = label if label is not None else obj.label10011002if obj is None:1003# caller does not provide obj, make an empty Cdf1004self.xs = np.asarray([])1005self.ps = np.asarray([])1006if ps is not None:1007logging.warning("Cdf: can't pass ps without also passing xs.")1008return1009else:1010# if the caller provides xs and ps, just store them1011if ps is not None:1012if isinstance(ps, str):1013logging.warning("Cdf: ps can't be a string")10141015self.xs = np.asarray(obj)1016self.ps = np.asarray(ps)1017return10181019# caller has provided just obj, not ps1020if isinstance(obj, Cdf):1021self.xs = copy.copy(obj.xs)1022self.ps = copy.copy(obj.ps)1023return10241025if isinstance(obj, _DictWrapper):1026dw = obj1027else:1028dw = Hist(obj)10291030if len(dw) == 0:1031self.xs = np.asarray([])1032self.ps = np.asarray([])1033return10341035xs, freqs = zip(*sorted(dw.Items()))1036self.xs = np.asarray(xs)1037self.ps = np.cumsum(freqs, dtype=np.float)1038self.ps /= self.ps[-1]10391040def __str__(self):1041cls = self.__class__.__name__1042if self.label == DEFAULT_LABEL:1043return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))1044else:1045return self.label10461047def __repr__(self):1048cls = self.__class__.__name__1049if self.label == DEFAULT_LABEL:1050return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps))1051else:1052return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps),1053repr(self.label))10541055def __len__(self):1056return len(self.xs)10571058def __getitem__(self, x):1059return self.Prob(x)10601061def __setitem__(self):1062raise UnimplementedMethodException()10631064def __delitem__(self):1065raise UnimplementedMethodException()10661067def __eq__(self, other):1068return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)10691070def Print(self):1071"""Prints the values and freqs/probs in ascending order."""1072for val, prob in zip(self.xs, self.ps):1073print(val, prob)10741075def Copy(self, label=None):1076"""Returns a copy of this Cdf.10771078label: string label for the new Cdf1079"""1080if label is None:1081label = self.label1082return Cdf(list(self.xs), list(self.ps), label=label)10831084def MakePmf(self, label=None):1085"""Makes a Pmf."""1086if label is None:1087label = self.label1088return Pmf(self, label=label)10891090def Items(self):1091"""Returns a sorted sequence of (value, probability) pairs.10921093Note: in Python3, returns an iterator.1094"""1095a = self.ps1096b = np.roll(a, 1)1097b[0] = 01098return zip(self.xs, a-b)10991100def Shift(self, term):1101"""Adds a term to the xs.11021103term: how much to add1104"""1105new = self.Copy()1106# don't use +=, or else an int array + float yields int array1107new.xs = new.xs + term1108return new11091110def Scale(self, factor):1111"""Multiplies the xs by a factor.11121113factor: what to multiply by1114"""1115new = self.Copy()1116# don't use *=, or else an int array * float yields int array1117new.xs = new.xs * factor1118return new11191120def Prob(self, x):1121"""Returns CDF(x), the probability that corresponds to value x.11221123Args:1124x: number11251126Returns:1127float probability1128"""1129if x < self.xs[0]:1130return 01131index = bisect.bisect(self.xs, x)1132p = self.ps[index-1]1133return p11341135def Probs(self, xs):1136"""Gets probabilities for a sequence of values.11371138xs: any sequence that can be converted to NumPy array11391140returns: NumPy array of cumulative probabilities1141"""1142xs = np.asarray(xs)1143index = np.searchsorted(self.xs, xs, side='right')1144ps = self.ps[index-1]1145ps[xs < self.xs[0]] = 01146return ps11471148ProbArray = Probs11491150def Value(self, p):1151"""Returns InverseCDF(p), the value that corresponds to probability p.11521153Args:1154p: number in the range [0, 1]11551156Returns:1157number value1158"""1159if p < 0 or p > 1:1160raise ValueError('Probability p must be in range [0, 1]')11611162index = bisect.bisect_left(self.ps, p)1163return self.xs[index]11641165def Values(self, ps=None):1166"""Returns InverseCDF(p), the value that corresponds to probability p.11671168If ps is not provided, returns all values.11691170Args:1171ps: NumPy array of numbers in the range [0, 1]11721173Returns:1174NumPy array of values1175"""1176if ps is None:1177return self.xs11781179ps = np.asarray(ps)1180if np.any(ps < 0) or np.any(ps > 1):1181raise ValueError('Probability p must be in range [0, 1]')11821183index = np.searchsorted(self.ps, ps, side='left')1184return self.xs[index]11851186ValueArray = Values11871188def Percentile(self, p):1189"""Returns the value that corresponds to percentile p.11901191Args:1192p: number in the range [0, 100]11931194Returns:1195number value1196"""1197return self.Value(p / 100)11981199def Percentiles(self, ps):1200"""Returns the value that corresponds to percentiles ps.12011202Args:1203ps: numbers in the range [0, 100]12041205Returns:1206array of values1207"""1208ps = np.asarray(ps)1209return self.Values(ps / 100)12101211def PercentileRank(self, x):1212"""Returns the percentile rank of the value x.12131214x: potential value in the CDF12151216returns: percentile rank in the range 0 to 1001217"""1218return self.Prob(x) * 10012191220def PercentileRanks(self, xs):1221"""Returns the percentile ranks of the values in xs.12221223xs: potential value in the CDF12241225returns: array of percentile ranks in the range 0 to 1001226"""1227return self.Probs(x) * 10012281229def Random(self):1230"""Chooses a random value from this distribution."""1231return self.Value(random.random())12321233def Sample(self, n):1234"""Generates a random sample from this distribution.12351236n: int length of the sample1237returns: NumPy array1238"""1239ps = np.random.random(n)1240return self.ValueArray(ps)12411242def Mean(self):1243"""Computes the mean of a CDF.12441245Returns:1246float mean1247"""1248old_p = 01249total = 01250for x, new_p in zip(self.xs, self.ps):1251p = new_p - old_p1252total += p * x1253old_p = new_p1254return total12551256def CredibleInterval(self, percentage=90):1257"""Computes the central credible interval.12581259If percentage=90, computes the 90% CI.12601261Args:1262percentage: float between 0 and 10012631264Returns:1265sequence of two floats, low and high1266"""1267prob = (1 - percentage / 100) / 21268interval = self.Value(prob), self.Value(1 - prob)1269return interval12701271ConfidenceInterval = CredibleInterval12721273def _Round(self, multiplier=1000):1274"""1275An entry is added to the cdf only if the percentile differs1276from the previous value in a significant digit, where the number1277of significant digits is determined by multiplier. The1278default is 1000, which keeps log10(1000) = 3 significant digits.1279"""1280# TODO(write this method)1281raise UnimplementedMethodException()12821283def Render(self, **options):1284"""Generates a sequence of points suitable for plotting.12851286An empirical CDF is a step function; linear interpolation1287can be misleading.12881289Note: options are ignored12901291Returns:1292tuple of (xs, ps)1293"""1294def interleave(a, b):1295c = np.empty(a.shape[0] + b.shape[0])1296c[::2] = a1297c[1::2] = b1298return c12991300a = np.array(self.xs)1301xs = interleave(a, a)1302shift_ps = np.roll(self.ps, 1)1303shift_ps[0] = 01304ps = interleave(shift_ps, self.ps)1305return xs, ps13061307def Max(self, k):1308"""Computes the CDF of the maximum of k selections from this dist.13091310k: int13111312returns: new Cdf1313"""1314cdf = self.Copy()1315cdf.ps **= k1316return cdf131713181319def MakeCdfFromItems(items, label=None):1320"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.13211322Args:1323items: unsorted sequence of (value, frequency) pairs1324label: string label for this CDF13251326Returns:1327cdf: list of (value, fraction) pairs1328"""1329return Cdf(dict(items), label=label)133013311332def MakeCdfFromDict(d, label=None):1333"""Makes a CDF from a dictionary that maps values to frequencies.13341335Args:1336d: dictionary that maps values to frequencies.1337label: string label for the data.13381339Returns:1340Cdf object1341"""1342return Cdf(d, label=label)134313441345def MakeCdfFromList(seq, label=None):1346"""Creates a CDF from an unsorted sequence.13471348Args:1349seq: unsorted sequence of sortable values1350label: string label for the cdf13511352Returns:1353Cdf object1354"""1355return Cdf(seq, label=label)135613571358def MakeCdfFromHist(hist, label=None):1359"""Makes a CDF from a Hist object.13601361Args:1362hist: Pmf.Hist object1363label: string label for the data.13641365Returns:1366Cdf object1367"""1368if label is None:1369label = hist.label13701371return Cdf(hist, label=label)137213731374def MakeCdfFromPmf(pmf, label=None):1375"""Makes a CDF from a Pmf object.13761377Args:1378pmf: Pmf.Pmf object1379label: string label for the data.13801381Returns:1382Cdf object1383"""1384if label is None:1385label = pmf.label13861387return Cdf(pmf, label=label)138813891390class UnimplementedMethodException(Exception):1391"""Exception if someone calls a method that should be overridden."""139213931394class Suite(Pmf):1395"""Represents a suite of hypotheses and their probabilities."""13961397def Update(self, data):1398"""Updates each hypothesis based on the data.13991400data: any representation of the data14011402returns: the normalizing constant1403"""1404for hypo in self.Values():1405like = self.Likelihood(data, hypo)1406self.Mult(hypo, like)1407return self.Normalize()14081409def LogUpdate(self, data):1410"""Updates a suite of hypotheses based on new data.14111412Modifies the suite directly; if you want to keep the original, make1413a copy.14141415Note: unlike Update, LogUpdate does not normalize.14161417Args:1418data: any representation of the data1419"""1420for hypo in self.Values():1421like = self.LogLikelihood(data, hypo)1422self.Incr(hypo, like)14231424def UpdateSet(self, dataset):1425"""Updates each hypothesis based on the dataset.14261427This is more efficient than calling Update repeatedly because1428it waits until the end to Normalize.14291430Modifies the suite directly; if you want to keep the original, make1431a copy.14321433dataset: a sequence of data14341435returns: the normalizing constant1436"""1437for data in dataset:1438for hypo in self.Values():1439like = self.Likelihood(data, hypo)1440self.Mult(hypo, like)1441return self.Normalize()14421443def LogUpdateSet(self, dataset):1444"""Updates each hypothesis based on the dataset.14451446Modifies the suite directly; if you want to keep the original, make1447a copy.14481449dataset: a sequence of data14501451returns: None1452"""1453for data in dataset:1454self.LogUpdate(data)14551456def Likelihood(self, data, hypo):1457"""Computes the likelihood of the data under the hypothesis.14581459hypo: some representation of the hypothesis1460data: some representation of the data1461"""1462raise UnimplementedMethodException()14631464def LogLikelihood(self, data, hypo):1465"""Computes the log likelihood of the data under the hypothesis.14661467hypo: some representation of the hypothesis1468data: some representation of the data1469"""1470raise UnimplementedMethodException()14711472def Print(self):1473"""Prints the hypotheses and their probabilities."""1474for hypo, prob in sorted(self.Items()):1475print(hypo, prob)14761477def MakeOdds(self):1478"""Transforms from probabilities to odds.14791480Values with prob=0 are removed.1481"""1482for hypo, prob in self.Items():1483if prob:1484self.Set(hypo, Odds(prob))1485else:1486self.Remove(hypo)14871488def MakeProbs(self):1489"""Transforms from odds to probabilities."""1490for hypo, odds in self.Items():1491self.Set(hypo, Probability(odds))149214931494def MakeSuiteFromList(t, label=None):1495"""Makes a suite from an unsorted sequence of values.14961497Args:1498t: sequence of numbers1499label: string label for this suite15001501Returns:1502Suite object1503"""1504hist = MakeHistFromList(t, label=label)1505d = hist.GetDict()1506return MakeSuiteFromDict(d)150715081509def MakeSuiteFromHist(hist, label=None):1510"""Makes a normalized suite from a Hist object.15111512Args:1513hist: Hist object1514label: string label15151516Returns:1517Suite object1518"""1519if label is None:1520label = hist.label15211522# make a copy of the dictionary1523d = dict(hist.GetDict())1524return MakeSuiteFromDict(d, label)152515261527def MakeSuiteFromDict(d, label=None):1528"""Makes a suite from a map from values to probabilities.15291530Args:1531d: dictionary that maps values to probabilities1532label: string label for this suite15331534Returns:1535Suite object1536"""1537suite = Suite(label=label)1538suite.SetDict(d)1539suite.Normalize()1540return suite154115421543class Pdf(object):1544"""Represents a probability density function (PDF)."""15451546def Density(self, x):1547"""Evaluates this Pdf at x.15481549Returns: float or NumPy array of probability density1550"""1551raise UnimplementedMethodException()15521553def GetLinspace(self):1554"""Get a linspace for plotting.15551556Not all subclasses of Pdf implement this.15571558Returns: numpy array1559"""1560raise UnimplementedMethodException()15611562def MakePmf(self, **options):1563"""Makes a discrete version of this Pdf.15641565options can include1566label: string1567low: low end of range1568high: high end of range1569n: number of places to evaluate15701571Returns: new Pmf1572"""1573label = options.pop('label', '')1574xs, ds = self.Render(**options)1575return Pmf(dict(zip(xs, ds)), label=label)15761577def Render(self, **options):1578"""Generates a sequence of points suitable for plotting.15791580If options includes low and high, it must also include n;1581in that case the density is evaluated an n locations between1582low and high, including both.15831584If options includes xs, the density is evaluate at those location.15851586Otherwise, self.GetLinspace is invoked to provide the locations.15871588Returns:1589tuple of (xs, densities)1590"""1591low, high = options.pop('low', None), options.pop('high', None)1592if low is not None and high is not None:1593n = options.pop('n', 101)1594xs = np.linspace(low, high, n)1595else:1596xs = options.pop('xs', None)1597if xs is None:1598xs = self.GetLinspace()15991600ds = self.Density(xs)1601return xs, ds16021603def Items(self):1604"""Generates a sequence of (value, probability) pairs.1605"""1606return zip(*self.Render())160716081609class NormalPdf(Pdf):1610"""Represents the PDF of a Normal distribution."""16111612def __init__(self, mu=0, sigma=1, label=None):1613"""Constructs a Normal Pdf with given mu and sigma.16141615mu: mean1616sigma: standard deviation1617label: string1618"""1619self.mu = mu1620self.sigma = sigma1621self.label = label if label is not None else '_nolegend_'16221623def __str__(self):1624return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)16251626def GetLinspace(self):1627"""Get a linspace for plotting.16281629Returns: numpy array1630"""1631low, high = self.mu-3*self.sigma, self.mu+3*self.sigma1632return np.linspace(low, high, 101)16331634def Density(self, xs):1635"""Evaluates this Pdf at xs.16361637xs: scalar or sequence of floats16381639returns: float or NumPy array of probability density1640"""1641return stats.norm.pdf(xs, self.mu, self.sigma)164216431644class ExponentialPdf(Pdf):1645"""Represents the PDF of an exponential distribution."""16461647def __init__(self, lam=1, label=None):1648"""Constructs an exponential Pdf with given parameter.16491650lam: rate parameter1651label: string1652"""1653self.lam = lam1654self.label = label if label is not None else '_nolegend_'16551656def __str__(self):1657return 'ExponentialPdf(%f)' % (self.lam)16581659def GetLinspace(self):1660"""Get a linspace for plotting.16611662Returns: numpy array1663"""1664low, high = 0, 5.0/self.lam1665return np.linspace(low, high, 101)16661667def Density(self, xs):1668"""Evaluates this Pdf at xs.16691670xs: scalar or sequence of floats16711672returns: float or NumPy array of probability density1673"""1674return stats.expon.pdf(xs, scale=1.0/self.lam)167516761677class EstimatedPdf(Pdf):1678"""Represents a PDF estimated by KDE."""16791680def __init__(self, sample, label=None):1681"""Estimates the density function based on a sample.16821683sample: sequence of data1684label: string1685"""1686self.label = label if label is not None else '_nolegend_'1687self.kde = stats.gaussian_kde(sample)1688low = min(sample)1689high = max(sample)1690self.linspace = np.linspace(low, high, 101)16911692def __str__(self):1693return 'EstimatedPdf(label=%s)' % str(self.label)16941695def GetLinspace(self):1696"""Get a linspace for plotting.16971698Returns: numpy array1699"""1700return self.linspace17011702def Density(self, xs):1703"""Evaluates this Pdf at xs.17041705returns: float or NumPy array of probability density1706"""1707return self.kde.evaluate(xs)17081709def Sample(self, n):1710"""Generates a random sample from the estimated Pdf.17111712n: size of sample1713"""1714# NOTE: we have to flatten because resample returns a 2-D1715# array for some reason.1716return self.kde.resample(n).flatten()171717181719def CredibleInterval(pmf, percentage=90):1720"""Computes a credible interval for a given distribution.17211722If percentage=90, computes the 90% CI.17231724Args:1725pmf: Pmf object representing a posterior distribution1726percentage: float between 0 and 10017271728Returns:1729sequence of two floats, low and high1730"""1731cdf = pmf.MakeCdf()1732prob = (1 - percentage / 100) / 21733interval = cdf.Value(prob), cdf.Value(1 - prob)1734return interval173517361737def PmfProbLess(pmf1, pmf2):1738"""Probability that a value from pmf1 is less than a value from pmf2.17391740Args:1741pmf1: Pmf object1742pmf2: Pmf object17431744Returns:1745float probability1746"""1747total = 01748for v1, p1 in pmf1.Items():1749for v2, p2 in pmf2.Items():1750if v1 < v2:1751total += p1 * p21752return total175317541755def PmfProbGreater(pmf1, pmf2):1756"""Probability that a value from pmf1 is less than a value from pmf2.17571758Args:1759pmf1: Pmf object1760pmf2: Pmf object17611762Returns:1763float probability1764"""1765total = 01766for v1, p1 in pmf1.Items():1767for v2, p2 in pmf2.Items():1768if v1 > v2:1769total += p1 * p21770return total177117721773def PmfProbEqual(pmf1, pmf2):1774"""Probability that a value from pmf1 equals a value from pmf2.17751776Args:1777pmf1: Pmf object1778pmf2: Pmf object17791780Returns:1781float probability1782"""1783total = 01784for v1, p1 in pmf1.Items():1785for v2, p2 in pmf2.Items():1786if v1 == v2:1787total += p1 * p21788return total178917901791def RandomSum(dists):1792"""Chooses a random value from each dist and returns the sum.17931794dists: sequence of Pmf or Cdf objects17951796returns: numerical sum1797"""1798total = sum(dist.Random() for dist in dists)1799return total180018011802def SampleSum(dists, n):1803"""Draws a sample of sums from a list of distributions.18041805dists: sequence of Pmf or Cdf objects1806n: sample size18071808returns: new Pmf of sums1809"""1810pmf = Pmf(RandomSum(dists) for i in range(n))1811return pmf181218131814def EvalNormalPdf(x, mu, sigma):1815"""Computes the unnormalized PDF of the normal distribution.18161817x: value1818mu: mean1819sigma: standard deviation18201821returns: float probability density1822"""1823return stats.norm.pdf(x, mu, sigma)182418251826def MakeNormalPmf(mu, sigma, num_sigmas, n=201):1827"""Makes a PMF discrete approx to a Normal distribution.18281829mu: float mean1830sigma: float standard deviation1831num_sigmas: how many sigmas to extend in each direction1832n: number of values in the Pmf18331834returns: normalized Pmf1835"""1836pmf = Pmf()1837low = mu - num_sigmas * sigma1838high = mu + num_sigmas * sigma18391840for x in np.linspace(low, high, n):1841p = EvalNormalPdf(x, mu, sigma)1842pmf.Set(x, p)1843pmf.Normalize()1844return pmf184518461847def EvalBinomialPmf(k, n, p):1848"""Evaluates the binomial PMF.18491850Returns the probabily of k successes in n trials with probability p.1851"""1852return stats.binom.pmf(k, n, p)185318541855def MakeBinomialPmf(n, p):1856"""Evaluates the binomial PMF.18571858Returns the distribution of successes in n trials with probability p.1859"""1860pmf = Pmf()1861for k in range(n+1):1862pmf[k] = stats.binom.pmf(k, n, p)1863return pmf186418651866def EvalGammaPdf(x, a):1867"""Computes the Gamma PDF.18681869x: where to evaluate the PDF1870a: parameter of the gamma distribution18711872returns: float probability1873"""1874return x**(a-1) * np.exp(-x) / gamma(a)187518761877def MakeGammaPmf(xs, a):1878"""Makes a PMF discrete approx to a Gamma distribution.18791880lam: parameter lambda in events per unit time1881xs: upper bound of the Pmf18821883returns: normalized Pmf1884"""1885xs = np.asarray(xs)1886ps = EvalGammaPdf(xs, a)1887pmf = Pmf(dict(zip(xs, ps)))1888pmf.Normalize()1889return pmf189018911892def EvalGeometricPmf(k, p, loc=0):1893"""Evaluates the geometric PMF.18941895With loc=0: Probability of `k` trials to get one success.1896With loc=-1: Probability of `k` trials before first success.18971898k: number of trials1899p: probability of success on each trial1900"""1901return stats.geom.pmf(k, p, loc=loc)190219031904def MakeGeometricPmf(p, loc=0, high=10):1905"""Evaluates the binomial PMF.19061907With loc=0: PMF of trials to get one success.1908With loc=-1: PMF of trials before first success.19091910p: probability of success1911high: upper bound where PMF is truncated1912"""1913pmf = Pmf()1914for k in range(high):1915pmf[k] = stats.geom.pmf(k, p, loc=loc)1916pmf.Normalize()1917return pmf191819191920def EvalHypergeomPmf(k, N, K, n):1921"""Evaluates the hypergeometric PMF.19221923Returns the probabily of k successes in n trials from a population1924N with K successes in it.1925"""1926return stats.hypergeom.pmf(k, N, K, n)192719281929def EvalPoissonPmf(k, lam):1930"""Computes the Poisson PMF.19311932k: number of events1933lam: parameter lambda in events per unit time19341935returns: float probability1936"""1937return stats.poisson.pmf(k, lam)193819391940def MakePoissonPmf(lam, high, step=1):1941"""Makes a PMF discrete approx to a Poisson distribution.19421943lam: parameter lambda in events per unit time1944high: upper bound of the Pmf19451946returns: normalized Pmf1947"""1948pmf = Pmf()1949for k in range(0, high + 1, step):1950p = stats.poisson.pmf(k, lam)1951pmf.Set(k, p)1952pmf.Normalize()1953return pmf195419551956def EvalExponentialPdf(x, lam):1957"""Computes the exponential PDF.19581959x: value1960lam: parameter lambda in events per unit time19611962returns: float probability density1963"""1964return lam * math.exp(-lam * x)196519661967def EvalExponentialCdf(x, lam):1968"""Evaluates CDF of the exponential distribution with parameter lam."""1969return 1 - math.exp(-lam * x)197019711972def MakeExponentialPmf(lam, high, n=200):1973"""Makes a PMF discrete approx to an exponential distribution.19741975lam: parameter lambda in events per unit time1976high: upper bound1977n: number of values in the Pmf19781979returns: normalized Pmf1980"""1981pmf = Pmf()1982for x in np.linspace(0, high, n):1983p = EvalExponentialPdf(x, lam)1984pmf.Set(x, p)1985pmf.Normalize()1986return pmf198719881989def EvalWeibullPdf(x, lam, k):1990"""Computes the Weibull PDF.19911992x: value1993lam: parameter lambda in events per unit time1994k: parameter19951996returns: float probability density1997"""1998arg = (x / lam)1999return k / lam * arg**(k-1) * np.exp(-arg**k)200020012002def EvalWeibullCdf(x, lam, k):2003"""Evaluates CDF of the Weibull distribution."""2004arg = (x / lam)2005return 1 - np.exp(-arg**k)200620072008def MakeWeibullPmf(lam, k, high, n=200):2009"""Makes a PMF discrete approx to a Weibull distribution.20102011lam: parameter lambda in events per unit time2012k: parameter2013high: upper bound2014n: number of values in the Pmf20152016returns: normalized Pmf2017"""2018xs = np.linspace(0, high, n)2019ps = EvalWeibullPdf(xs, lam, k)2020return Pmf(dict(zip(xs, ps)))202120222023def EvalParetoPdf(x, xm, alpha):2024"""Computes the Pareto.20252026xm: minimum value (scale parameter)2027alpha: shape parameter20282029returns: float probability density2030"""2031return stats.pareto.pdf(x, alpha, scale=xm)203220332034def MakeParetoPmf(xm, alpha, high, num=101):2035"""Makes a PMF discrete approx to a Pareto distribution.20362037xm: minimum value (scale parameter)2038alpha: shape parameter2039high: upper bound value2040num: number of values20412042returns: normalized Pmf2043"""2044xs = np.linspace(xm, high, num)2045ps = stats.pareto.pdf(xs, alpha, scale=xm)2046pmf = Pmf(dict(zip(xs, ps)))2047return pmf20482049def StandardNormalCdf(x):2050"""Evaluates the CDF of the standard Normal distribution.20512052See http://en.wikipedia.org/wiki/Normal_distribution2053#Cumulative_distribution_function20542055Args:2056x: float20572058Returns:2059float2060"""2061return (math.erf(x / ROOT2) + 1) / 2206220632064def EvalNormalCdf(x, mu=0, sigma=1):2065"""Evaluates the CDF of the normal distribution.20662067Args:2068x: float20692070mu: mean parameter20712072sigma: standard deviation parameter20732074Returns:2075float2076"""2077return stats.norm.cdf(x, loc=mu, scale=sigma)207820792080def EvalNormalCdfInverse(p, mu=0, sigma=1):2081"""Evaluates the inverse CDF of the normal distribution.20822083See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function20842085Args:2086p: float20872088mu: mean parameter20892090sigma: standard deviation parameter20912092Returns:2093float2094"""2095return stats.norm.ppf(p, loc=mu, scale=sigma)209620972098def EvalLognormalCdf(x, mu=0, sigma=1):2099"""Evaluates the CDF of the lognormal distribution.21002101x: float or sequence2102mu: mean parameter2103sigma: standard deviation parameter21042105Returns: float or sequence2106"""2107return stats.lognorm.cdf(x, loc=mu, scale=sigma)210821092110def RenderExpoCdf(lam, low, high, n=101):2111"""Generates sequences of xs and ps for an exponential CDF.21122113lam: parameter2114low: float2115high: float2116n: number of points to render21172118returns: numpy arrays (xs, ps)2119"""2120xs = np.linspace(low, high, n)2121ps = 1 - np.exp(-lam * xs)2122#ps = stats.expon.cdf(xs, scale=1.0/lam)2123return xs, ps212421252126def RenderNormalCdf(mu, sigma, low, high, n=101):2127"""Generates sequences of xs and ps for a Normal CDF.21282129mu: parameter2130sigma: parameter2131low: float2132high: float2133n: number of points to render21342135returns: numpy arrays (xs, ps)2136"""2137xs = np.linspace(low, high, n)2138ps = stats.norm.cdf(xs, mu, sigma)2139return xs, ps214021412142def RenderParetoCdf(xmin, alpha, low, high, n=50):2143"""Generates sequences of xs and ps for a Pareto CDF.21442145xmin: parameter2146alpha: parameter2147low: float2148high: float2149n: number of points to render21502151returns: numpy arrays (xs, ps)2152"""2153if low < xmin:2154low = xmin2155xs = np.linspace(low, high, n)2156ps = 1 - (xs / xmin) ** -alpha2157#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)2158return xs, ps215921602161class Beta:2162"""Represents a Beta distribution.21632164See http://en.wikipedia.org/wiki/Beta_distribution2165"""2166def __init__(self, alpha=1, beta=1, label=None):2167"""Initializes a Beta distribution."""2168self.alpha = alpha2169self.beta = beta2170self.label = label if label is not None else '_nolegend_'21712172def Update(self, data):2173"""Updates a Beta distribution.21742175data: pair of int (heads, tails)2176"""2177heads, tails = data2178self.alpha += heads2179self.beta += tails21802181def Mean(self):2182"""Computes the mean of this distribution."""2183return self.alpha / (self.alpha + self.beta)21842185def MAP(self):2186"""Computes the value with maximum a posteori probability."""2187a = self.alpha - 12188b = self.beta - 12189return a / (a + b)21902191def Random(self):2192"""Generates a random variate from this distribution."""2193return random.betavariate(self.alpha, self.beta)21942195def Sample(self, n):2196"""Generates a random sample from this distribution.21972198n: int sample size2199"""2200size = n,2201return np.random.beta(self.alpha, self.beta, size)22022203def EvalPdf(self, x):2204"""Evaluates the PDF at x."""2205return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)22062207def MakePmf(self, steps=101, label=None):2208"""Returns a Pmf of this distribution.22092210Note: Normally, we just evaluate the PDF at a sequence2211of points and treat the probability density as a probability2212mass.22132214But if alpha or beta is less than one, we have to be2215more careful because the PDF goes to infinity at x=02216and x=1. In that case we evaluate the CDF and compute2217differences.22182219The result is a little funny, because the values at 0 and 12220are not symmetric. Nevertheless, it is a reasonable discrete2221model of the continuous distribution, and behaves well as2222the number of values increases.2223"""2224if label is None and self.label is not None:2225label = self.label22262227if self.alpha < 1 or self.beta < 1:2228cdf = self.MakeCdf()2229pmf = cdf.MakePmf()2230return pmf22312232xs = [i / (steps - 1.0) for i in range(steps)]2233probs = [self.EvalPdf(x) for x in xs]2234pmf = Pmf(dict(zip(xs, probs)), label=label)2235return pmf22362237def MakeCdf(self, steps=101):2238"""Returns the CDF of this distribution."""2239xs = [i / (steps - 1.0) for i in range(steps)]2240ps = special.betainc(self.alpha, self.beta, xs)2241cdf = Cdf(xs, ps)2242return cdf22432244def Percentile(self, ps):2245"""Returns the given percentiles from this distribution.22462247ps: scalar, array, or list of [0-100]2248"""2249ps = np.asarray(ps) / 1002250xs = special.betaincinv(self.alpha, self.beta, ps)2251return xs225222532254class Dirichlet(object):2255"""Represents a Dirichlet distribution.22562257See http://en.wikipedia.org/wiki/Dirichlet_distribution2258"""22592260def __init__(self, n, conc=1, label=None):2261"""Initializes a Dirichlet distribution.22622263n: number of dimensions2264conc: concentration parameter (smaller yields more concentration)2265label: string label2266"""2267if n < 2:2268raise ValueError('A Dirichlet distribution with '2269'n<2 makes no sense')22702271self.n = n2272self.params = np.ones(n, dtype=np.float) * conc2273self.label = label if label is not None else '_nolegend_'22742275def Update(self, data):2276"""Updates a Dirichlet distribution.22772278data: sequence of observations, in order corresponding to params2279"""2280m = len(data)2281self.params[:m] += data22822283def Random(self):2284"""Generates a random variate from this distribution.22852286Returns: normalized vector of fractions2287"""2288p = np.random.gamma(self.params)2289return p / p.sum()22902291def Likelihood(self, data):2292"""Computes the likelihood of the data.22932294Selects a random vector of probabilities from this distribution.22952296Returns: float probability2297"""2298m = len(data)2299if self.n < m:2300return 023012302x = data2303p = self.Random()2304q = p[:m] ** x2305return q.prod()23062307def LogLikelihood(self, data):2308"""Computes the log likelihood of the data.23092310Selects a random vector of probabilities from this distribution.23112312Returns: float log probability2313"""2314m = len(data)2315if self.n < m:2316return float('-inf')23172318x = self.Random()2319y = np.log(x[:m]) * data2320return y.sum()23212322def MarginalBeta(self, i):2323"""Computes the marginal distribution of the ith element.23242325See http://en.wikipedia.org/wiki/Dirichlet_distribution2326#Marginal_distributions23272328i: int23292330Returns: Beta object2331"""2332alpha0 = self.params.sum()2333alpha = self.params[i]2334return Beta(alpha, alpha0 - alpha)23352336def PredictivePmf(self, xs, label=None):2337"""Makes a predictive distribution.23382339xs: values to go into the Pmf23402341Returns: Pmf that maps from x to the mean prevalence of x2342"""2343alpha0 = self.params.sum()2344ps = self.params / alpha02345return Pmf(zip(xs, ps), label=label)234623472348def BinomialCoef(n, k):2349"""Compute the binomial coefficient "n choose k".23502351n: number of trials2352k: number of successes23532354Returns: float2355"""2356return scipy.misc.comb(n, k)235723582359def LogBinomialCoef(n, k):2360"""Computes the log of the binomial coefficient.23612362http://math.stackexchange.com/questions/64716/2363approximating-the-logarithm-of-the-binomial-coefficient23642365n: number of trials2366k: number of successes23672368Returns: float2369"""2370return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)237123722373def NormalProbability(ys, jitter=0):2374"""Generates data for a normal probability plot.23752376ys: sequence of values2377jitter: float magnitude of jitter added to the ys23782379returns: numpy arrays xs, ys2380"""2381n = len(ys)2382xs = np.random.normal(0, 1, n)2383xs.sort()23842385if jitter:2386ys = Jitter(ys, jitter)2387else:2388ys = np.array(ys)2389ys.sort()23902391return xs, ys239223932394def Jitter(values, jitter=0.5):2395"""Jitters the values by adding a uniform variate in (-jitter, jitter).23962397values: sequence2398jitter: scalar magnitude of jitter23992400returns: new numpy array2401"""2402n = len(values)2403return np.random.normal(0, jitter, n) + values240424052406def NormalProbabilityPlot(sample, fit_color='0.8', **options):2407"""Makes a normal probability plot with a fitted line.24082409sample: sequence of numbers2410fit_color: color string for the fitted line2411options: passed along to Plot2412"""2413xs, ys = NormalProbability(sample)2414mean, var = MeanVar(sample)2415std = math.sqrt(var)24162417fit = FitLine(xs, mean, std)2418thinkplot.Plot(*fit, color=fit_color, label='model')24192420xs, ys = NormalProbability(sample)2421thinkplot.Plot(xs, ys, **options)242224232424def Mean(xs):2425"""Computes mean.24262427xs: sequence of values24282429returns: float mean2430"""2431return np.mean(xs)243224332434def Var(xs, mu=None, ddof=0):2435"""Computes variance.24362437xs: sequence of values2438mu: option known mean2439ddof: delta degrees of freedom24402441returns: float2442"""2443xs = np.asarray(xs)24442445if mu is None:2446mu = xs.mean()24472448ds = xs - mu2449return np.dot(ds, ds) / (len(xs) - ddof)245024512452def Std(xs, mu=None, ddof=0):2453"""Computes standard deviation.24542455xs: sequence of values2456mu: option known mean2457ddof: delta degrees of freedom24582459returns: float2460"""2461var = Var(xs, mu, ddof)2462return math.sqrt(var)246324642465def MeanVar(xs, ddof=0):2466"""Computes mean and variance.24672468Based on http://stackoverflow.com/questions/19391149/2469numpy-mean-and-variance-from-single-function24702471xs: sequence of values2472ddof: delta degrees of freedom24732474returns: pair of float, mean and var2475"""2476xs = np.asarray(xs)2477mean = xs.mean()2478s2 = Var(xs, mean, ddof)2479return mean, s2248024812482def Trim(t, p=0.01):2483"""Trims the largest and smallest elements of t.24842485Args:2486t: sequence of numbers2487p: fraction of values to trim off each end24882489Returns:2490sequence of values2491"""2492n = int(p * len(t))2493t = sorted(t)[n:-n]2494return t249524962497def TrimmedMean(t, p=0.01):2498"""Computes the trimmed mean of a sequence of numbers.24992500Args:2501t: sequence of numbers2502p: fraction of values to trim off each end25032504Returns:2505float2506"""2507t = Trim(t, p)2508return Mean(t)250925102511def TrimmedMeanVar(t, p=0.01):2512"""Computes the trimmed mean and variance of a sequence of numbers.25132514Side effect: sorts the list.25152516Args:2517t: sequence of numbers2518p: fraction of values to trim off each end25192520Returns:2521float2522"""2523t = Trim(t, p)2524mu, var = MeanVar(t)2525return mu, var252625272528def CohenEffectSize(group1, group2):2529"""Compute Cohen's d.25302531group1: Series or NumPy array2532group2: Series or NumPy array25332534returns: float2535"""2536diff = group1.mean() - group2.mean()25372538n1, n2 = len(group1), len(group2)2539var1 = group1.var()2540var2 = group2.var()25412542pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)2543d = diff / math.sqrt(pooled_var)2544return d254525462547def Cov(xs, ys, meanx=None, meany=None):2548"""Computes Cov(X, Y).25492550Args:2551xs: sequence of values2552ys: sequence of values2553meanx: optional float mean of xs2554meany: optional float mean of ys25552556Returns:2557Cov(X, Y)2558"""2559xs = np.asarray(xs)2560ys = np.asarray(ys)25612562if meanx is None:2563meanx = np.mean(xs)2564if meany is None:2565meany = np.mean(ys)25662567cov = np.dot(xs-meanx, ys-meany) / len(xs)2568return cov256925702571def Corr(xs, ys):2572"""Computes Corr(X, Y).25732574Args:2575xs: sequence of values2576ys: sequence of values25772578Returns:2579Corr(X, Y)2580"""2581xs = np.asarray(xs)2582ys = np.asarray(ys)25832584meanx, varx = MeanVar(xs)2585meany, vary = MeanVar(ys)25862587corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)25882589return corr259025912592def SerialCorr(series, lag=1):2593"""Computes the serial correlation of a series.25942595series: Series2596lag: integer number of intervals to shift25972598returns: float correlation2599"""2600xs = series[lag:]2601ys = series.shift(lag)[lag:]2602corr = Corr(xs, ys)2603return corr260426052606def SpearmanCorr(xs, ys):2607"""Computes Spearman's rank correlation.26082609Args:2610xs: sequence of values2611ys: sequence of values26122613Returns:2614float Spearman's correlation2615"""2616xranks = pandas.Series(xs).rank()2617yranks = pandas.Series(ys).rank()2618return Corr(xranks, yranks)261926202621def MapToRanks(t):2622"""Returns a list of ranks corresponding to the elements in t.26232624Args:2625t: sequence of numbers26262627Returns:2628list of integer ranks, starting at 12629"""2630# pair up each value with its index2631pairs = enumerate(t)26322633# sort by value2634sorted_pairs = sorted(pairs, key=itemgetter(1))26352636# pair up each pair with its rank2637ranked = enumerate(sorted_pairs)26382639# sort by index2640resorted = sorted(ranked, key=lambda trip: trip[1][0])26412642# extract the ranks2643ranks = [trip[0]+1 for trip in resorted]2644return ranks264526462647def LeastSquares(xs, ys):2648"""Computes a linear least squares fit for ys as a function of xs.26492650Args:2651xs: sequence of values2652ys: sequence of values26532654Returns:2655tuple of (intercept, slope)2656"""2657meanx, varx = MeanVar(xs)2658meany = Mean(ys)26592660slope = Cov(xs, ys, meanx, meany) / varx2661inter = meany - slope * meanx26622663return inter, slope266426652666def FitLine(xs, inter, slope):2667"""Fits a line to the given data.26682669xs: sequence of x26702671returns: tuple of numpy arrays (sorted xs, fit ys)2672"""2673fit_xs = np.sort(xs)2674fit_ys = inter + slope * fit_xs2675return fit_xs, fit_ys267626772678def Residuals(xs, ys, inter, slope):2679"""Computes residuals for a linear fit with parameters inter and slope.26802681Args:2682xs: independent variable2683ys: dependent variable2684inter: float intercept2685slope: float slope26862687Returns:2688list of residuals2689"""2690xs = np.asarray(xs)2691ys = np.asarray(ys)2692res = ys - (inter + slope * xs)2693return res269426952696def CoefDetermination(ys, res):2697"""Computes the coefficient of determination (R^2) for given residuals.26982699Args:2700ys: dependent variable2701res: residuals27022703Returns:2704float coefficient of determination2705"""2706return 1 - Var(res) / Var(ys)270727082709def CorrelatedGenerator(rho):2710"""Generates standard normal variates with serial correlation.27112712rho: target coefficient of correlation27132714Returns: iterable2715"""2716x = random.gauss(0, 1)2717yield x27182719sigma = math.sqrt(1 - rho**2)2720while True:2721x = random.gauss(x * rho, sigma)2722yield x272327242725def CorrelatedNormalGenerator(mu, sigma, rho):2726"""Generates normal variates with serial correlation.27272728mu: mean of variate2729sigma: standard deviation of variate2730rho: target coefficient of correlation27312732Returns: iterable2733"""2734for x in CorrelatedGenerator(rho):2735yield x * sigma + mu273627372738def RawMoment(xs, k):2739"""Computes the kth raw moment of xs.2740"""2741return sum(x**k for x in xs) / len(xs)274227432744def CentralMoment(xs, k):2745"""Computes the kth central moment of xs.2746"""2747mean = RawMoment(xs, 1)2748return sum((x - mean)**k for x in xs) / len(xs)274927502751def StandardizedMoment(xs, k):2752"""Computes the kth standardized moment of xs.2753"""2754var = CentralMoment(xs, 2)2755std = math.sqrt(var)2756return CentralMoment(xs, k) / std**k275727582759def Skewness(xs):2760"""Computes skewness.2761"""2762return StandardizedMoment(xs, 3)276327642765def Median(xs):2766"""Computes the median (50th percentile) of a sequence.27672768xs: sequence or anything else that can initialize a Cdf27692770returns: float2771"""2772cdf = Cdf(xs)2773return cdf.Value(0.5)277427752776def IQR(xs):2777"""Computes the interquartile of a sequence.27782779xs: sequence or anything else that can initialize a Cdf27802781returns: pair of floats2782"""2783cdf = Cdf(xs)2784return cdf.Value(0.25), cdf.Value(0.75)278527862787def PearsonMedianSkewness(xs):2788"""Computes the Pearson median skewness.2789"""2790median = Median(xs)2791mean = RawMoment(xs, 1)2792var = CentralMoment(xs, 2)2793std = math.sqrt(var)2794gp = 3 * (mean - median) / std2795return gp279627972798class FixedWidthVariables(object):2799"""Represents a set of variables in a fixed width file."""28002801def __init__(self, variables, index_base=0):2802"""Initializes.28032804variables: DataFrame2805index_base: are the indices 0 or 1 based?28062807Attributes:2808colspecs: list of (start, end) index tuples2809names: list of string variable names2810"""2811self.variables = variables28122813# note: by default, subtract 1 from colspecs2814self.colspecs = variables[['start', 'end']] - index_base28152816# convert colspecs to a list of pair of int2817self.colspecs = self.colspecs.astype(np.int).values.tolist()2818self.names = variables['name']28192820def ReadFixedWidth(self, filename, **options):2821"""Reads a fixed width ASCII file.28222823filename: string filename28242825returns: DataFrame2826"""2827df = pandas.read_fwf(filename,2828colspecs=self.colspecs,2829names=self.names,2830**options)2831return df283228332834def ReadStataDct(dct_file, **options):2835"""Reads a Stata dictionary file.28362837dct_file: string filename2838options: dict of options passed to open()28392840returns: FixedWidthVariables object2841"""2842type_map = dict(byte=int, int=int, long=int, float=float, double=float)28432844var_info = []2845with open(dct_file, **options) as f:2846for line in f:2847match = re.search( r'_column\(([^)]*)\)', line)2848if not match:2849continue2850start = int(match.group(1))2851t = line.split()2852vtype, name, fstring = t[1:4]2853name = name.lower()2854if vtype.startswith('str'):2855vtype = str2856else:2857vtype = type_map[vtype]2858long_desc = ' '.join(t[4:]).strip('"')2859var_info.append((start, vtype, name, fstring, long_desc))28602861columns = ['start', 'type', 'name', 'fstring', 'desc']2862variables = pandas.DataFrame(var_info, columns=columns)28632864# fill in the end column by shifting the start column2865variables['end'] = variables.start.shift(-1)2866variables.loc[len(variables)-1, 'end'] = 028672868dct = FixedWidthVariables(variables, index_base=1)2869return dct287028712872def Resample(xs, n=None):2873"""Draw a sample from xs with the same length as xs.28742875xs: sequence2876n: sample size (default: len(xs))28772878returns: NumPy array2879"""2880if n is None:2881n = len(xs)2882return np.random.choice(xs, n, replace=True)288328842885def SampleRows(df, nrows, replace=False):2886"""Choose a sample of rows from a DataFrame.28872888df: DataFrame2889nrows: number of rows2890replace: whether to sample with replacement28912892returns: DataDf2893"""2894indices = np.random.choice(df.index, nrows, replace=replace)2895sample = df.loc[indices]2896return sample289728982899def ResampleRows(df):2900"""Resamples rows from a DataFrame.29012902df: DataFrame29032904returns: DataFrame2905"""2906return SampleRows(df, len(df), replace=True)290729082909def ResampleRowsWeighted(df, column='finalwgt'):2910"""Resamples a DataFrame using probabilities proportional to given column.29112912df: DataFrame2913column: string column name to use as weights29142915returns: DataFrame2916"""2917weights = df[column]2918cdf = Cdf(dict(weights))2919indices = cdf.Sample(len(weights))2920sample = df.loc[indices]2921return sample292229232924def PercentileRow(array, p):2925"""Selects the row from a sorted array that maps to percentile p.29262927p: float 0--10029282929returns: NumPy array (one row)2930"""2931rows, cols = array.shape2932index = int(rows * p / 100)2933return array[index,]293429352936def PercentileRows(ys_seq, percents):2937"""Given a collection of lines, selects percentiles along vertical axis.29382939For example, if ys_seq contains simulation results like ys as a2940function of time, and percents contains (5, 95), the result would2941be a 90% CI for each vertical slice of the simulation results.29422943ys_seq: sequence of lines (y values)2944percents: list of percentiles (0-100) to select29452946returns: list of NumPy arrays, one for each percentile2947"""2948nrows = len(ys_seq)2949ncols = len(ys_seq[0])2950array = np.zeros((nrows, ncols))29512952for i, ys in enumerate(ys_seq):2953array[i,] = ys29542955array = np.sort(array, axis=0)29562957rows = [PercentileRow(array, p) for p in percents]2958return rows295929602961def Smooth(xs, sigma=2, **options):2962"""Smooths a NumPy array with a Gaussian filter.29632964xs: sequence2965sigma: standard deviation of the filter2966"""2967return ndimage.filters.gaussian_filter1d(xs, sigma, **options)296829692970class HypothesisTest(object):2971"""Represents a hypothesis test."""29722973def __init__(self, data):2974"""Initializes.29752976data: data in whatever form is relevant2977"""2978self.data = data2979self.MakeModel()2980self.actual = self.TestStatistic(data)2981self.test_stats = None2982self.test_cdf = None29832984def PValue(self, iters=1000):2985"""Computes the distribution of the test statistic and p-value.29862987iters: number of iterations29882989returns: float p-value2990"""2991self.test_stats = [self.TestStatistic(self.RunModel())2992for _ in range(iters)]2993self.test_cdf = Cdf(self.test_stats)29942995count = sum(1 for x in self.test_stats if x >= self.actual)2996return count / iters29972998def MaxTestStat(self):2999"""Returns the largest test statistic seen during simulations.3000"""3001return max(self.test_stats)30023003def PlotCdf(self, label=None):3004"""Draws a Cdf with vertical lines at the observed test stat.3005"""3006def VertLine(x):3007"""Draws a vertical line at x."""3008thinkplot.Plot([x, x], [0, 1], color='0.8')30093010VertLine(self.actual)3011thinkplot.Cdf(self.test_cdf, label=label)30123013def TestStatistic(self, data):3014"""Computes the test statistic.30153016data: data in whatever form is relevant3017"""3018raise UnimplementedMethodException()30193020def MakeModel(self):3021"""Build a model of the null hypothesis.3022"""3023pass30243025def RunModel(self):3026"""Run the model of the null hypothesis.30273028returns: simulated data3029"""3030raise UnimplementedMethodException()303130323033def main():3034pass303530363037if __name__ == '__main__':3038main()303930403041