📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code for use with "Think Stats" and1"Think Bayes", both by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89"""This file contains class definitions for:1011Hist: represents a histogram (map from values to integer frequencies).1213Pmf: represents a probability mass function (map from values to probs).1415_DictWrapper: private parent class for Hist and Pmf.1617Cdf: represents a discrete cumulative distribution function1819Pdf: represents a continuous probability density function2021"""2223import bisect24import copy25import logging26import math27import random28import re2930from collections import Counter31from operator import itemgetter3233import thinkplot3435import numpy as np36import pandas3738import scipy39from scipy import stats40from scipy import special41from scipy import ndimage4243from io import open4445ROOT2 = math.sqrt(2)4647def RandomSeed(x):48"""Initialize the random and np.random generators.4950x: int seed51"""52random.seed(x)53np.random.seed(x)545556def Odds(p):57"""Computes odds for a given probability.5859Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor.6061Note: when p=1, the formula for odds divides by zero, which is62normally undefined. But I think it is reasonable to define Odds(1)63to be infinity, so that's what this function does.6465p: float 0-16667Returns: float odds68"""69if p == 1:70return float('inf')71return p / (1 - p)727374def Probability(o):75"""Computes the probability corresponding to given odds.7677Example: o=2 means 2:1 odds in favor, or 2/3 probability7879o: float odds, strictly positive8081Returns: float probability82"""83return o / (o + 1)848586def Probability2(yes, no):87"""Computes the probability corresponding to given odds.8889Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability.9091yes, no: int or float odds in favor92"""93return yes / (yes + no)949596class Interpolator(object):97"""Represents a mapping between sorted sequences; performs linear interp.9899Attributes:100xs: sorted list101ys: sorted list102"""103104def __init__(self, xs, ys):105self.xs = xs106self.ys = ys107108def Lookup(self, x):109"""Looks up x and returns the corresponding value of y."""110return self._Bisect(x, self.xs, self.ys)111112def Reverse(self, y):113"""Looks up y and returns the corresponding value of x."""114return self._Bisect(y, self.ys, self.xs)115116def _Bisect(self, x, xs, ys):117"""Helper function."""118if x <= xs[0]:119return ys[0]120if x >= xs[-1]:121return ys[-1]122i = bisect.bisect(xs, x)123frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])124y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])125return y126127128class _DictWrapper(object):129"""An object that contains a dictionary."""130131def __init__(self, obj=None, label=None):132"""Initializes the distribution.133134obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs135label: string label136"""137self.label = label if label is not None else '_nolegend_'138self.d = {}139140# flag whether the distribution is under a log transform141self.log = False142143if obj is None:144return145146if isinstance(obj, (_DictWrapper, Cdf, Pdf)):147self.label = label if label is not None else obj.label148149if isinstance(obj, dict):150self.d.update(obj.items())151elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):152self.d.update(obj.Items())153elif isinstance(obj, pandas.Series):154self.d.update(obj.value_counts().iteritems())155else:156# finally, treat it like a list157self.d.update(Counter(obj))158159if len(self) > 0 and isinstance(self, Pmf):160self.Normalize()161162def __hash__(self):163return id(self)164165def __str__(self):166cls = self.__class__.__name__167return '%s(%s)' % (cls, str(self.d))168169__repr__ = __str__170171def __eq__(self, other):172return self.d == other.d173174def __len__(self):175return len(self.d)176177def __iter__(self):178return iter(self.d)179180def iterkeys(self):181"""Returns an iterator over keys."""182return iter(self.d)183184def __contains__(self, value):185return value in self.d186187def __getitem__(self, value):188return self.d.get(value, 0)189190def __setitem__(self, value, prob):191self.d[value] = prob192193def __delitem__(self, value):194del self.d[value]195196def Copy(self, label=None):197"""Returns a copy.198199Make a shallow copy of d. If you want a deep copy of d,200use copy.deepcopy on the whole object.201202label: string label for the new Hist203204returns: new _DictWrapper with the same type205"""206new = copy.copy(self)207new.d = copy.copy(self.d)208new.label = label if label is not None else self.label209return new210211def Scale(self, factor):212"""Multiplies the values by a factor.213214factor: what to multiply by215216Returns: new object217"""218new = self.Copy()219new.d.clear()220221for val, prob in self.Items():222new.Set(val * factor, prob)223return new224225def Log(self, m=None):226"""Log transforms the probabilities.227228Removes values with probability 0.229230Normalizes so that the largest logprob is 0.231"""232if self.log:233raise ValueError("Pmf/Hist already under a log transform")234self.log = True235236if m is None:237m = self.MaxLike()238239for x, p in self.d.items():240if p:241self.Set(x, math.log(p / m))242else:243self.Remove(x)244245def Exp(self, m=None):246"""Exponentiates the probabilities.247248m: how much to shift the ps before exponentiating249250If m is None, normalizes so that the largest prob is 1.251"""252if not self.log:253raise ValueError("Pmf/Hist not under a log transform")254self.log = False255256if m is None:257m = self.MaxLike()258259for x, p in self.d.items():260self.Set(x, math.exp(p - m))261262def GetDict(self):263"""Gets the dictionary."""264return self.d265266def SetDict(self, d):267"""Sets the dictionary."""268self.d = d269270def Values(self):271"""Gets an unsorted sequence of values.272273Note: one source of confusion is that the keys of this274dictionary are the values of the Hist/Pmf, and the275values of the dictionary are frequencies/probabilities.276"""277return self.d.keys()278279def Items(self):280"""Gets an unsorted sequence of (value, freq/prob) pairs."""281return self.d.items()282283def Render(self, **options):284"""Generates a sequence of points suitable for plotting.285286Note: options are ignored287288Returns:289tuple of (sorted value sequence, freq/prob sequence)290"""291if min(self.d.keys()) is np.nan:292logging.warning('Hist: contains NaN, may not render correctly.')293294return zip(*sorted(self.Items()))295296def MakeCdf(self, label=None):297"""Makes a Cdf."""298label = label if label is not None else self.label299return Cdf(self, label=label)300301def Print(self):302"""Prints the values and freqs/probs in ascending order."""303for val, prob in sorted(self.d.items()):304print(val, prob)305306def Set(self, x, y=0):307"""Sets the freq/prob associated with the value x.308309Args:310x: number value311y: number freq or prob312"""313self.d[x] = y314315def Incr(self, x, term=1):316"""Increments the freq/prob associated with the value x.317318Args:319x: number value320term: how much to increment by321"""322self.d[x] = self.d.get(x, 0) + term323324def Mult(self, x, factor):325"""Scales the freq/prob associated with the value x.326327Args:328x: number value329factor: how much to multiply by330"""331self.d[x] = self.d.get(x, 0) * factor332333def Remove(self, x):334"""Removes a value.335336Throws an exception if the value is not there.337338Args:339x: value to remove340"""341del self.d[x]342343def Total(self):344"""Returns the total of the frequencies/probabilities in the map."""345total = sum(self.d.values())346return total347348def MaxLike(self):349"""Returns the largest frequency/probability in the map."""350return max(self.d.values())351352def Largest(self, n=10):353"""Returns the largest n values, with frequency/probability.354355n: number of items to return356"""357return sorted(self.d.items(), reverse=True)[:n]358359def Smallest(self, n=10):360"""Returns the smallest n values, with frequency/probability.361362n: number of items to return363"""364return sorted(self.d.items(), reverse=False)[:n]365366367class Hist(_DictWrapper):368"""Represents a histogram, which is a map from values to frequencies.369370Values can be any hashable type; frequencies are integer counters.371"""372def Freq(self, x):373"""Gets the frequency associated with the value x.374375Args:376x: number value377378Returns:379int frequency380"""381return self.d.get(x, 0)382383def Freqs(self, xs):384"""Gets frequencies for a sequence of values."""385return [self.Freq(x) for x in xs]386387def IsSubset(self, other):388"""Checks whether the values in this histogram are a subset of389the values in the given histogram."""390for val, freq in self.Items():391if freq > other.Freq(val):392return False393return True394395def Subtract(self, other):396"""Subtracts the values in the given histogram from this histogram."""397for val, freq in other.Items():398self.Incr(val, -freq)399400401class Pmf(_DictWrapper):402"""Represents a probability mass function.403404Values can be any hashable type; probabilities are floating-point.405Pmfs are not necessarily normalized.406"""407408def Prob(self, x, default=0):409"""Gets the probability associated with the value x.410411Args:412x: number value413default: value to return if the key is not there414415Returns:416float probability417"""418return self.d.get(x, default)419420def Probs(self, xs):421"""Gets probabilities for a sequence of values."""422return [self.Prob(x) for x in xs]423424def Percentile(self, percentage):425"""Computes a percentile of a given Pmf.426427Note: this is not super efficient. If you are planning428to compute more than a few percentiles, compute the Cdf.429430percentage: float 0-100431432returns: value from the Pmf433"""434p = percentage / 100.0435total = 0436for val, prob in sorted(self.Items()):437total += prob438if total >= p:439return val440441def ProbGreater(self, x):442"""Probability that a sample from this Pmf exceeds x.443444x: number445446returns: float probability447"""448if isinstance(x, _DictWrapper):449return PmfProbGreater(self, x)450else:451t = [prob for (val, prob) in self.d.items() if val > x]452return sum(t)453454def ProbLess(self, x):455"""Probability that a sample from this Pmf is less than x.456457x: number458459returns: float probability460"""461if isinstance(x, _DictWrapper):462return PmfProbLess(self, x)463else:464t = [prob for (val, prob) in self.d.items() if val < x]465return sum(t)466467def __lt__(self, obj):468"""Less than.469470obj: number or _DictWrapper471472returns: float probability473"""474return self.ProbLess(obj)475476def __gt__(self, obj):477"""Greater than.478479obj: number or _DictWrapper480481returns: float probability482"""483return self.ProbGreater(obj)484485def __ge__(self, obj):486"""Greater than or equal.487488obj: number or _DictWrapper489490returns: float probability491"""492return 1 - (self < obj)493494def __le__(self, obj):495"""Less than or equal.496497obj: number or _DictWrapper498499returns: float probability500"""501return 1 - (self > obj)502503def Normalize(self, fraction=1.0):504"""Normalizes this PMF so the sum of all probs is fraction.505506Args:507fraction: what the total should be after normalization508509Returns: the total probability before normalizing510"""511if self.log:512raise ValueError("Normalize: Pmf is under a log transform")513514total = self.Total()515if total == 0.0:516raise ValueError('Normalize: total probability is zero.')517#logging.warning('Normalize: total probability is zero.')518#return total519520factor = fraction / total521for x in self.d:522self.d[x] *= factor523524return total525526def Random(self):527"""Chooses a random element from this PMF.528529Note: this is not very efficient. If you plan to call530this more than a few times, consider converting to a CDF.531532Returns:533float value from the Pmf534"""535target = random.random()536total = 0.0537for x, p in self.d.items():538total += p539if total >= target:540return x541542# we shouldn't get here543raise ValueError('Random: Pmf might not be normalized.')544545def Mean(self):546"""Computes the mean of a PMF.547548Returns:549float mean550"""551mean = 0.0552for x, p in self.d.items():553mean += p * x554return mean555556def Var(self, mu=None):557"""Computes the variance of a PMF.558559mu: the point around which the variance is computed;560if omitted, computes the mean561562returns: float variance563"""564if mu is None:565mu = self.Mean()566567var = 0.0568for x, p in self.d.items():569var += p * (x - mu) ** 2570return var571572def Std(self, mu=None):573"""Computes the standard deviation of a PMF.574575mu: the point around which the variance is computed;576if omitted, computes the mean577578returns: float standard deviation579"""580var = self.Var(mu)581return math.sqrt(var)582583def MaximumLikelihood(self):584"""Returns the value with the highest probability.585586Returns: float probability587"""588_, val = max((prob, val) for val, prob in self.Items())589return val590591def CredibleInterval(self, percentage=90):592"""Computes the central credible interval.593594If percentage=90, computes the 90% CI.595596Args:597percentage: float between 0 and 100598599Returns:600sequence of two floats, low and high601"""602cdf = self.MakeCdf()603return cdf.CredibleInterval(percentage)604605def __add__(self, other):606"""Computes the Pmf of the sum of values drawn from self and other.607608other: another Pmf or a scalar609610returns: new Pmf611"""612try:613return self.AddPmf(other)614except AttributeError:615return self.AddConstant(other)616617def AddPmf(self, other):618"""Computes the Pmf of the sum of values drawn from self and other.619620other: another Pmf621622returns: new Pmf623"""624pmf = Pmf()625for v1, p1 in self.Items():626for v2, p2 in other.Items():627pmf.Incr(v1 + v2, p1 * p2)628return pmf629630def AddConstant(self, other):631"""Computes the Pmf of the sum a constant and values from self.632633other: a number634635returns: new Pmf636"""637pmf = Pmf()638for v1, p1 in self.Items():639pmf.Set(v1 + other, p1)640return pmf641642def __sub__(self, other):643"""Computes the Pmf of the diff of values drawn from self and other.644645other: another Pmf646647returns: new Pmf648"""649try:650return self.SubPmf(other)651except AttributeError:652return self.AddConstant(-other)653654def SubPmf(self, other):655"""Computes the Pmf of the diff of values drawn from self and other.656657other: another Pmf658659returns: new Pmf660"""661pmf = Pmf()662for v1, p1 in self.Items():663for v2, p2 in other.Items():664pmf.Incr(v1 - v2, p1 * p2)665return pmf666667def __mul__(self, other):668"""Computes the Pmf of the product of values drawn from self and other.669670other: another Pmf671672returns: new Pmf673"""674try:675return self.MulPmf(other)676except AttributeError:677return self.MulConstant(other)678679def MulPmf(self, other):680"""Computes the Pmf of the diff of values drawn from self and other.681682other: another Pmf683684returns: new Pmf685"""686pmf = Pmf()687for v1, p1 in self.Items():688for v2, p2 in other.Items():689pmf.Incr(v1 * v2, p1 * p2)690return pmf691692def MulConstant(self, other):693"""Computes the Pmf of the product of a constant and values from self.694695other: a number696697returns: new Pmf698"""699pmf = Pmf()700for v1, p1 in self.Items():701pmf.Set(v1 * other, p1)702return pmf703704def __div__(self, other):705"""Computes the Pmf of the ratio of values drawn from self and other.706707other: another Pmf708709returns: new Pmf710"""711try:712return self.DivPmf(other)713except AttributeError:714return self.MulConstant(1/other)715716__truediv__ = __div__717718def DivPmf(self, other):719"""Computes the Pmf of the ratio of values drawn from self and other.720721other: another Pmf722723returns: new Pmf724"""725pmf = Pmf()726for v1, p1 in self.Items():727for v2, p2 in other.Items():728pmf.Incr(v1 / v2, p1 * p2)729return pmf730731def Max(self, k):732"""Computes the CDF of the maximum of k selections from this dist.733734k: int735736returns: new Cdf737"""738cdf = self.MakeCdf()739return cdf.Max(k)740741742class Joint(Pmf):743"""Represents a joint distribution.744745The values are sequences (usually tuples)746"""747748def Marginal(self, i, label=None):749"""Gets the marginal distribution of the indicated variable.750751i: index of the variable we want752753Returns: Pmf754"""755pmf = Pmf(label=label)756for vs, prob in self.Items():757pmf.Incr(vs[i], prob)758return pmf759760def Conditional(self, i, j, val, label=None):761"""Gets the conditional distribution of the indicated variable.762763Distribution of vs[i], conditioned on vs[j] = val.764765i: index of the variable we want766j: which variable is conditioned on767val: the value the jth variable has to have768769Returns: Pmf770"""771pmf = Pmf(label=label)772for vs, prob in self.Items():773if vs[j] != val:774continue775pmf.Incr(vs[i], prob)776777pmf.Normalize()778return pmf779780def MaxLikeInterval(self, percentage=90):781"""Returns the maximum-likelihood credible interval.782783If percentage=90, computes a 90% CI containing the values784with the highest likelihoods.785786percentage: float between 0 and 100787788Returns: list of values from the suite789"""790interval = []791total = 0792793t = [(prob, val) for val, prob in self.Items()]794t.sort(reverse=True)795796for prob, val in t:797interval.append(val)798total += prob799if total >= percentage / 100.0:800break801802return interval803804805def MakeJoint(pmf1, pmf2):806"""Joint distribution of values from pmf1 and pmf2.807808Assumes that the PMFs represent independent random variables.809810Args:811pmf1: Pmf object812pmf2: Pmf object813814Returns:815Joint pmf of value pairs816"""817joint = Joint()818for v1, p1 in pmf1.Items():819for v2, p2 in pmf2.Items():820joint.Set((v1, v2), p1 * p2)821return joint822823824def MakeHistFromList(t, label=None):825"""Makes a histogram from an unsorted sequence of values.826827Args:828t: sequence of numbers829label: string label for this histogram830831Returns:832Hist object833"""834return Hist(t, label=label)835836837def MakeHistFromDict(d, label=None):838"""Makes a histogram from a map from values to frequencies.839840Args:841d: dictionary that maps values to frequencies842label: string label for this histogram843844Returns:845Hist object846"""847return Hist(d, label)848849850def MakePmfFromList(t, label=None):851"""Makes a PMF from an unsorted sequence of values.852853Args:854t: sequence of numbers855label: string label for this PMF856857Returns:858Pmf object859"""860return Pmf(t, label=label)861862863def MakePmfFromDict(d, label=None):864"""Makes a PMF from a map from values to probabilities.865866Args:867d: dictionary that maps values to probabilities868label: string label for this PMF869870Returns:871Pmf object872"""873return Pmf(d, label=label)874875876def MakePmfFromItems(t, label=None):877"""Makes a PMF from a sequence of value-probability pairs878879Args:880t: sequence of value-probability pairs881label: string label for this PMF882883Returns:884Pmf object885"""886return Pmf(dict(t), label=label)887888889def MakePmfFromHist(hist, label=None):890"""Makes a normalized PMF from a Hist object.891892Args:893hist: Hist object894label: string label895896Returns:897Pmf object898"""899if label is None:900label = hist.label901902return Pmf(hist, label=label)903904905def MakeMixture(metapmf, label='mix'):906"""Make a mixture distribution.907908Args:909metapmf: Pmf that maps from Pmfs to probs.910label: string label for the new Pmf.911912Returns: Pmf object.913"""914mix = Pmf(label=label)915for pmf, p1 in metapmf.Items():916for x, p2 in pmf.Items():917mix.Incr(x, p1 * p2)918return mix919920921def MakeUniformPmf(low, high, n):922"""Make a uniform Pmf.923924low: lowest value (inclusive)925high: highest value (inclusize)926n: number of values927"""928pmf = Pmf()929for x in np.linspace(low, high, n):930pmf.Set(x, 1)931pmf.Normalize()932return pmf933934935class Cdf(object):936"""Represents a cumulative distribution function.937938Attributes:939xs: sequence of values940ps: sequence of probabilities941label: string used as a graph label.942"""943def __init__(self, obj=None, ps=None, label=None):944"""Initializes.945946If ps is provided, obj must be the corresponding list of values.947948obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs949ps: list of cumulative probabilities950label: string label951"""952self.label = label if label is not None else '_nolegend_'953954if isinstance(obj, (_DictWrapper, Cdf, Pdf)):955if not label:956self.label = label if label is not None else obj.label957958if obj is None:959# caller does not provide obj, make an empty Cdf960self.xs = np.asarray([])961self.ps = np.asarray([])962if ps is not None:963logging.warning("Cdf: can't pass ps without also passing xs.")964return965else:966# if the caller provides xs and ps, just store them967if ps is not None:968if isinstance(ps, str):969logging.warning("Cdf: ps can't be a string")970971self.xs = np.asarray(obj)972self.ps = np.asarray(ps)973return974975# caller has provided just obj, not ps976if isinstance(obj, Cdf):977self.xs = copy.copy(obj.xs)978self.ps = copy.copy(obj.ps)979return980981if isinstance(obj, _DictWrapper):982dw = obj983else:984dw = Hist(obj)985986if len(dw) == 0:987self.xs = np.asarray([])988self.ps = np.asarray([])989return990991xs, freqs = zip(*sorted(dw.Items()))992self.xs = np.asarray(xs)993self.ps = np.cumsum(freqs, dtype=np.float)994self.ps /= self.ps[-1]995996def __str__(self):997return 'Cdf(%s, %s)' % (str(self.xs), str(self.ps))998999__repr__ = __str__10001001def __len__(self):1002return len(self.xs)10031004def __getitem__(self, x):1005return self.Prob(x)10061007def __setitem__(self):1008raise UnimplementedMethodException()10091010def __delitem__(self):1011raise UnimplementedMethodException()10121013def __eq__(self, other):1014return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)10151016def Copy(self, label=None):1017"""Returns a copy of this Cdf.10181019label: string label for the new Cdf1020"""1021if label is None:1022label = self.label1023return Cdf(list(self.xs), list(self.ps), label=label)10241025def MakePmf(self, label=None):1026"""Makes a Pmf."""1027if label is None:1028label = self.label1029return Pmf(self, label=label)10301031def Values(self):1032"""Returns a sorted list of values.1033"""1034return self.xs10351036def Items(self):1037"""Returns a sorted sequence of (value, probability) pairs.10381039Note: in Python3, returns an iterator.1040"""1041a = self.ps1042b = np.roll(a, 1)1043b[0] = 01044return zip(self.xs, a-b)10451046def Shift(self, term):1047"""Adds a term to the xs.10481049term: how much to add1050"""1051new = self.Copy()1052# don't use +=, or else an int array + float yields int array1053new.xs = new.xs + term1054return new10551056def Scale(self, factor):1057"""Multiplies the xs by a factor.10581059factor: what to multiply by1060"""1061new = self.Copy()1062# don't use *=, or else an int array * float yields int array1063new.xs = new.xs * factor1064return new10651066def Prob(self, x):1067"""Returns CDF(x), the probability that corresponds to value x.10681069Args:1070x: number10711072Returns:1073float probability1074"""1075if x < self.xs[0]:1076return 0.01077index = bisect.bisect(self.xs, x)1078p = self.ps[index-1]1079return p10801081def Probs(self, xs):1082"""Gets probabilities for a sequence of values.10831084xs: any sequence that can be converted to NumPy array10851086returns: NumPy array of cumulative probabilities1087"""1088xs = np.asarray(xs)1089index = np.searchsorted(self.xs, xs, side='right')1090ps = self.ps[index-1]1091ps[xs < self.xs[0]] = 0.01092return ps10931094ProbArray = Probs10951096def Value(self, p):1097"""Returns InverseCDF(p), the value that corresponds to probability p.10981099Args:1100p: number in the range [0, 1]11011102Returns:1103number value1104"""1105if p < 0 or p > 1:1106raise ValueError('Probability p must be in range [0, 1]')11071108index = bisect.bisect_left(self.ps, p)1109return self.xs[index]11101111def ValueArray(self, ps):1112"""Returns InverseCDF(p), the value that corresponds to probability p.11131114Args:1115ps: NumPy array of numbers in the range [0, 1]11161117Returns:1118NumPy array of values1119"""1120ps = np.asarray(ps)1121if np.any(ps < 0) or np.any(ps > 1):1122raise ValueError('Probability p must be in range [0, 1]')11231124index = np.searchsorted(self.ps, ps, side='left')1125return self.xs[index]11261127def Percentile(self, p):1128"""Returns the value that corresponds to percentile p.11291130Args:1131p: number in the range [0, 100]11321133Returns:1134number value1135"""1136return self.Value(p / 100.0)11371138def PercentileRank(self, x):1139"""Returns the percentile rank of the value x.11401141x: potential value in the CDF11421143returns: percentile rank in the range 0 to 1001144"""1145return self.Prob(x) * 100.011461147def Random(self):1148"""Chooses a random value from this distribution."""1149return self.Value(random.random())11501151def Sample(self, n):1152"""Generates a random sample from this distribution.11531154n: int length of the sample1155returns: NumPy array1156"""1157ps = np.random.random(n)1158return self.ValueArray(ps)11591160def Mean(self):1161"""Computes the mean of a CDF.11621163Returns:1164float mean1165"""1166old_p = 01167total = 0.01168for x, new_p in zip(self.xs, self.ps):1169p = new_p - old_p1170total += p * x1171old_p = new_p1172return total11731174def CredibleInterval(self, percentage=90):1175"""Computes the central credible interval.11761177If percentage=90, computes the 90% CI.11781179Args:1180percentage: float between 0 and 10011811182Returns:1183sequence of two floats, low and high1184"""1185prob = (1 - percentage / 100.0) / 21186interval = self.Value(prob), self.Value(1 - prob)1187return interval11881189ConfidenceInterval = CredibleInterval11901191def _Round(self, multiplier=1000.0):1192"""1193An entry is added to the cdf only if the percentile differs1194from the previous value in a significant digit, where the number1195of significant digits is determined by multiplier. The1196default is 1000, which keeps log10(1000) = 3 significant digits.1197"""1198# TODO(write this method)1199raise UnimplementedMethodException()12001201def Render(self, **options):1202"""Generates a sequence of points suitable for plotting.12031204An empirical CDF is a step function; linear interpolation1205can be misleading.12061207Note: options are ignored12081209Returns:1210tuple of (xs, ps)1211"""1212def interleave(a, b):1213c = np.empty(a.shape[0] + b.shape[0])1214c[::2] = a1215c[1::2] = b1216return c12171218a = np.array(self.xs)1219xs = interleave(a, a)1220shift_ps = np.roll(self.ps, 1)1221shift_ps[0] = 01222ps = interleave(shift_ps, self.ps)1223return xs, ps12241225def Max(self, k):1226"""Computes the CDF of the maximum of k selections from this dist.12271228k: int12291230returns: new Cdf1231"""1232cdf = self.Copy()1233cdf.ps **= k1234return cdf123512361237def MakeCdfFromItems(items, label=None):1238"""Makes a cdf from an unsorted sequence of (value, frequency) pairs.12391240Args:1241items: unsorted sequence of (value, frequency) pairs1242label: string label for this CDF12431244Returns:1245cdf: list of (value, fraction) pairs1246"""1247return Cdf(dict(items), label=label)124812491250def MakeCdfFromDict(d, label=None):1251"""Makes a CDF from a dictionary that maps values to frequencies.12521253Args:1254d: dictionary that maps values to frequencies.1255label: string label for the data.12561257Returns:1258Cdf object1259"""1260return Cdf(d, label=label)126112621263def MakeCdfFromList(seq, label=None):1264"""Creates a CDF from an unsorted sequence.12651266Args:1267seq: unsorted sequence of sortable values1268label: string label for the cdf12691270Returns:1271Cdf object1272"""1273return Cdf(seq, label=label)127412751276def MakeCdfFromHist(hist, label=None):1277"""Makes a CDF from a Hist object.12781279Args:1280hist: Pmf.Hist object1281label: string label for the data.12821283Returns:1284Cdf object1285"""1286if label is None:1287label = hist.label12881289return Cdf(hist, label=label)129012911292def MakeCdfFromPmf(pmf, label=None):1293"""Makes a CDF from a Pmf object.12941295Args:1296pmf: Pmf.Pmf object1297label: string label for the data.12981299Returns:1300Cdf object1301"""1302if label is None:1303label = pmf.label13041305return Cdf(pmf, label=label)130613071308class UnimplementedMethodException(Exception):1309"""Exception if someone calls a method that should be overridden."""131013111312class Suite(Pmf):1313"""Represents a suite of hypotheses and their probabilities."""13141315def Update(self, data):1316"""Updates each hypothesis based on the data.13171318data: any representation of the data13191320returns: the normalizing constant1321"""1322for hypo in self.Values():1323like = self.Likelihood(data, hypo)1324self.Mult(hypo, like)1325return self.Normalize()13261327def LogUpdate(self, data):1328"""Updates a suite of hypotheses based on new data.13291330Modifies the suite directly; if you want to keep the original, make1331a copy.13321333Note: unlike Update, LogUpdate does not normalize.13341335Args:1336data: any representation of the data1337"""1338for hypo in self.Values():1339like = self.LogLikelihood(data, hypo)1340self.Incr(hypo, like)13411342def UpdateSet(self, dataset):1343"""Updates each hypothesis based on the dataset.13441345This is more efficient than calling Update repeatedly because1346it waits until the end to Normalize.13471348Modifies the suite directly; if you want to keep the original, make1349a copy.13501351dataset: a sequence of data13521353returns: the normalizing constant1354"""1355for data in dataset:1356for hypo in self.Values():1357like = self.Likelihood(data, hypo)1358self.Mult(hypo, like)1359return self.Normalize()13601361def LogUpdateSet(self, dataset):1362"""Updates each hypothesis based on the dataset.13631364Modifies the suite directly; if you want to keep the original, make1365a copy.13661367dataset: a sequence of data13681369returns: None1370"""1371for data in dataset:1372self.LogUpdate(data)13731374def Likelihood(self, data, hypo):1375"""Computes the likelihood of the data under the hypothesis.13761377hypo: some representation of the hypothesis1378data: some representation of the data1379"""1380raise UnimplementedMethodException()13811382def LogLikelihood(self, data, hypo):1383"""Computes the log likelihood of the data under the hypothesis.13841385hypo: some representation of the hypothesis1386data: some representation of the data1387"""1388raise UnimplementedMethodException()13891390def Print(self):1391"""Prints the hypotheses and their probabilities."""1392for hypo, prob in sorted(self.Items()):1393print(hypo, prob)13941395def MakeOdds(self):1396"""Transforms from probabilities to odds.13971398Values with prob=0 are removed.1399"""1400for hypo, prob in self.Items():1401if prob:1402self.Set(hypo, Odds(prob))1403else:1404self.Remove(hypo)14051406def MakeProbs(self):1407"""Transforms from odds to probabilities."""1408for hypo, odds in self.Items():1409self.Set(hypo, Probability(odds))141014111412def MakeSuiteFromList(t, label=None):1413"""Makes a suite from an unsorted sequence of values.14141415Args:1416t: sequence of numbers1417label: string label for this suite14181419Returns:1420Suite object1421"""1422hist = MakeHistFromList(t, label=label)1423d = hist.GetDict()1424return MakeSuiteFromDict(d)142514261427def MakeSuiteFromHist(hist, label=None):1428"""Makes a normalized suite from a Hist object.14291430Args:1431hist: Hist object1432label: string label14331434Returns:1435Suite object1436"""1437if label is None:1438label = hist.label14391440# make a copy of the dictionary1441d = dict(hist.GetDict())1442return MakeSuiteFromDict(d, label)144314441445def MakeSuiteFromDict(d, label=None):1446"""Makes a suite from a map from values to probabilities.14471448Args:1449d: dictionary that maps values to probabilities1450label: string label for this suite14511452Returns:1453Suite object1454"""1455suite = Suite(label=label)1456suite.SetDict(d)1457suite.Normalize()1458return suite145914601461class Pdf(object):1462"""Represents a probability density function (PDF)."""14631464def Density(self, x):1465"""Evaluates this Pdf at x.14661467Returns: float or NumPy array of probability density1468"""1469raise UnimplementedMethodException()14701471def GetLinspace(self):1472"""Get a linspace for plotting.14731474Not all subclasses of Pdf implement this.14751476Returns: numpy array1477"""1478raise UnimplementedMethodException()14791480def MakePmf(self, **options):1481"""Makes a discrete version of this Pdf.14821483options can include1484label: string1485low: low end of range1486high: high end of range1487n: number of places to evaluate14881489Returns: new Pmf1490"""1491label = options.pop('label', '')1492xs, ds = self.Render(**options)1493return Pmf(dict(zip(xs, ds)), label=label)14941495def Render(self, **options):1496"""Generates a sequence of points suitable for plotting.14971498If options includes low and high, it must also include n;1499in that case the density is evaluated an n locations between1500low and high, including both.15011502If options includes xs, the density is evaluate at those location.15031504Otherwise, self.GetLinspace is invoked to provide the locations.15051506Returns:1507tuple of (xs, densities)1508"""1509low, high = options.pop('low', None), options.pop('high', None)1510if low is not None and high is not None:1511n = options.pop('n', 101)1512xs = np.linspace(low, high, n)1513else:1514xs = options.pop('xs', None)1515if xs is None:1516xs = self.GetLinspace()15171518ds = self.Density(xs)1519return xs, ds15201521def Items(self):1522"""Generates a sequence of (value, probability) pairs.1523"""1524return zip(*self.Render())152515261527class NormalPdf(Pdf):1528"""Represents the PDF of a Normal distribution."""15291530def __init__(self, mu=0, sigma=1, label=None):1531"""Constructs a Normal Pdf with given mu and sigma.15321533mu: mean1534sigma: standard deviation1535label: string1536"""1537self.mu = mu1538self.sigma = sigma1539self.label = label if label is not None else '_nolegend_'15401541def __str__(self):1542return 'NormalPdf(%f, %f)' % (self.mu, self.sigma)15431544def GetLinspace(self):1545"""Get a linspace for plotting.15461547Returns: numpy array1548"""1549low, high = self.mu-3*self.sigma, self.mu+3*self.sigma1550return np.linspace(low, high, 101)15511552def Density(self, xs):1553"""Evaluates this Pdf at xs.15541555xs: scalar or sequence of floats15561557returns: float or NumPy array of probability density1558"""1559return stats.norm.pdf(xs, self.mu, self.sigma)156015611562class ExponentialPdf(Pdf):1563"""Represents the PDF of an exponential distribution."""15641565def __init__(self, lam=1, label=None):1566"""Constructs an exponential Pdf with given parameter.15671568lam: rate parameter1569label: string1570"""1571self.lam = lam1572self.label = label if label is not None else '_nolegend_'15731574def __str__(self):1575return 'ExponentialPdf(%f)' % (self.lam)15761577def GetLinspace(self):1578"""Get a linspace for plotting.15791580Returns: numpy array1581"""1582low, high = 0, 5.0/self.lam1583return np.linspace(low, high, 101)15841585def Density(self, xs):1586"""Evaluates this Pdf at xs.15871588xs: scalar or sequence of floats15891590returns: float or NumPy array of probability density1591"""1592return stats.expon.pdf(xs, scale=1.0/self.lam)159315941595class EstimatedPdf(Pdf):1596"""Represents a PDF estimated by KDE."""15971598def __init__(self, sample, label=None):1599"""Estimates the density function based on a sample.16001601sample: sequence of data1602label: string1603"""1604self.label = label if label is not None else '_nolegend_'1605self.kde = stats.gaussian_kde(sample)1606low = min(sample)1607high = max(sample)1608self.linspace = np.linspace(low, high, 101)16091610def __str__(self):1611return 'EstimatedPdf(label=%s)' % str(self.label)16121613def GetLinspace(self):1614"""Get a linspace for plotting.16151616Returns: numpy array1617"""1618return self.linspace16191620def Density(self, xs):1621"""Evaluates this Pdf at xs.16221623returns: float or NumPy array of probability density1624"""1625return self.kde.evaluate(xs)162616271628def CredibleInterval(pmf, percentage=90):1629"""Computes a credible interval for a given distribution.16301631If percentage=90, computes the 90% CI.16321633Args:1634pmf: Pmf object representing a posterior distribution1635percentage: float between 0 and 10016361637Returns:1638sequence of two floats, low and high1639"""1640cdf = pmf.MakeCdf()1641prob = (1 - percentage / 100.0) / 21642interval = cdf.Value(prob), cdf.Value(1 - prob)1643return interval164416451646def PmfProbLess(pmf1, pmf2):1647"""Probability that a value from pmf1 is less than a value from pmf2.16481649Args:1650pmf1: Pmf object1651pmf2: Pmf object16521653Returns:1654float probability1655"""1656total = 0.01657for v1, p1 in pmf1.Items():1658for v2, p2 in pmf2.Items():1659if v1 < v2:1660total += p1 * p21661return total166216631664def PmfProbGreater(pmf1, pmf2):1665"""Probability that a value from pmf1 is less than a value from pmf2.16661667Args:1668pmf1: Pmf object1669pmf2: Pmf object16701671Returns:1672float probability1673"""1674total = 0.01675for v1, p1 in pmf1.Items():1676for v2, p2 in pmf2.Items():1677if v1 > v2:1678total += p1 * p21679return total168016811682def PmfProbEqual(pmf1, pmf2):1683"""Probability that a value from pmf1 equals a value from pmf2.16841685Args:1686pmf1: Pmf object1687pmf2: Pmf object16881689Returns:1690float probability1691"""1692total = 0.01693for v1, p1 in pmf1.Items():1694for v2, p2 in pmf2.Items():1695if v1 == v2:1696total += p1 * p21697return total169816991700def RandomSum(dists):1701"""Chooses a random value from each dist and returns the sum.17021703dists: sequence of Pmf or Cdf objects17041705returns: numerical sum1706"""1707total = sum(dist.Random() for dist in dists)1708return total170917101711def SampleSum(dists, n):1712"""Draws a sample of sums from a list of distributions.17131714dists: sequence of Pmf or Cdf objects1715n: sample size17161717returns: new Pmf of sums1718"""1719pmf = Pmf(RandomSum(dists) for i in range(n))1720return pmf172117221723def EvalNormalPdf(x, mu, sigma):1724"""Computes the unnormalized PDF of the normal distribution.17251726x: value1727mu: mean1728sigma: standard deviation17291730returns: float probability density1731"""1732return stats.norm.pdf(x, mu, sigma)173317341735def MakeNormalPmf(mu, sigma, num_sigmas, n=201):1736"""Makes a PMF discrete approx to a Normal distribution.17371738mu: float mean1739sigma: float standard deviation1740num_sigmas: how many sigmas to extend in each direction1741n: number of values in the Pmf17421743returns: normalized Pmf1744"""1745pmf = Pmf()1746low = mu - num_sigmas * sigma1747high = mu + num_sigmas * sigma17481749for x in np.linspace(low, high, n):1750p = EvalNormalPdf(x, mu, sigma)1751pmf.Set(x, p)1752pmf.Normalize()1753return pmf175417551756def EvalBinomialPmf(k, n, p):1757"""Evaluates the binomial PMF.17581759Returns the probabily of k successes in n trials with probability p.1760"""1761return stats.binom.pmf(k, n, p)176217631764def EvalHypergeomPmf(k, N, K, n):1765"""Evaluates the hypergeometric PMF.17661767Returns the probabily of k successes in n trials from a population1768N with K successes in it.1769"""1770return stats.hypergeom.pmf(k, N, K, n)177117721773def EvalPoissonPmf(k, lam):1774"""Computes the Poisson PMF.17751776k: number of events1777lam: parameter lambda in events per unit time17781779returns: float probability1780"""1781# don't use the scipy function (yet). for lam=0 it returns NaN;1782# should be 0.01783# return stats.poisson.pmf(k, lam)1784return lam ** k * math.exp(-lam) / special.gamma(k+1)178517861787def MakePoissonPmf(lam, high, step=1):1788"""Makes a PMF discrete approx to a Poisson distribution.17891790lam: parameter lambda in events per unit time1791high: upper bound of the Pmf17921793returns: normalized Pmf1794"""1795pmf = Pmf()1796for k in range(0, high + 1, step):1797p = EvalPoissonPmf(k, lam)1798pmf.Set(k, p)1799pmf.Normalize()1800return pmf180118021803def EvalExponentialPdf(x, lam):1804"""Computes the exponential PDF.18051806x: value1807lam: parameter lambda in events per unit time18081809returns: float probability density1810"""1811return lam * math.exp(-lam * x)181218131814def EvalExponentialCdf(x, lam):1815"""Evaluates CDF of the exponential distribution with parameter lam."""1816return 1 - math.exp(-lam * x)181718181819def MakeExponentialPmf(lam, high, n=200):1820"""Makes a PMF discrete approx to an exponential distribution.18211822lam: parameter lambda in events per unit time1823high: upper bound1824n: number of values in the Pmf18251826returns: normalized Pmf1827"""1828pmf = Pmf()1829for x in np.linspace(0, high, n):1830p = EvalExponentialPdf(x, lam)1831pmf.Set(x, p)1832pmf.Normalize()1833return pmf183418351836def StandardNormalCdf(x):1837"""Evaluates the CDF of the standard Normal distribution.18381839See http://en.wikipedia.org/wiki/Normal_distribution1840#Cumulative_distribution_function18411842Args:1843x: float18441845Returns:1846float1847"""1848return (math.erf(x / ROOT2) + 1) / 2184918501851def EvalNormalCdf(x, mu=0, sigma=1):1852"""Evaluates the CDF of the normal distribution.18531854Args:1855x: float18561857mu: mean parameter18581859sigma: standard deviation parameter18601861Returns:1862float1863"""1864return stats.norm.cdf(x, loc=mu, scale=sigma)186518661867def EvalNormalCdfInverse(p, mu=0, sigma=1):1868"""Evaluates the inverse CDF of the normal distribution.18691870See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function18711872Args:1873p: float18741875mu: mean parameter18761877sigma: standard deviation parameter18781879Returns:1880float1881"""1882return stats.norm.ppf(p, loc=mu, scale=sigma)188318841885def EvalLognormalCdf(x, mu=0, sigma=1):1886"""Evaluates the CDF of the lognormal distribution.18871888x: float or sequence1889mu: mean parameter1890sigma: standard deviation parameter18911892Returns: float or sequence1893"""1894return stats.lognorm.cdf(x, loc=mu, scale=sigma)189518961897def RenderExpoCdf(lam, low, high, n=101):1898"""Generates sequences of xs and ps for an exponential CDF.18991900lam: parameter1901low: float1902high: float1903n: number of points to render19041905returns: numpy arrays (xs, ps)1906"""1907xs = np.linspace(low, high, n)1908ps = 1 - np.exp(-lam * xs)1909#ps = stats.expon.cdf(xs, scale=1.0/lam)1910return xs, ps191119121913def RenderNormalCdf(mu, sigma, low, high, n=101):1914"""Generates sequences of xs and ps for a Normal CDF.19151916mu: parameter1917sigma: parameter1918low: float1919high: float1920n: number of points to render19211922returns: numpy arrays (xs, ps)1923"""1924xs = np.linspace(low, high, n)1925ps = stats.norm.cdf(xs, mu, sigma)1926return xs, ps192719281929def RenderParetoCdf(xmin, alpha, low, high, n=50):1930"""Generates sequences of xs and ps for a Pareto CDF.19311932xmin: parameter1933alpha: parameter1934low: float1935high: float1936n: number of points to render19371938returns: numpy arrays (xs, ps)1939"""1940if low < xmin:1941low = xmin1942xs = np.linspace(low, high, n)1943ps = 1 - (xs / xmin) ** -alpha1944#ps = stats.pareto.cdf(xs, scale=xmin, b=alpha)1945return xs, ps194619471948class Beta(object):1949"""Represents a Beta distribution.19501951See http://en.wikipedia.org/wiki/Beta_distribution1952"""1953def __init__(self, alpha=1, beta=1, label=None):1954"""Initializes a Beta distribution."""1955self.alpha = alpha1956self.beta = beta1957self.label = label if label is not None else '_nolegend_'19581959def Update(self, data):1960"""Updates a Beta distribution.19611962data: pair of int (heads, tails)1963"""1964heads, tails = data1965self.alpha += heads1966self.beta += tails19671968def Mean(self):1969"""Computes the mean of this distribution."""1970return self.alpha / (self.alpha + self.beta)19711972def Random(self):1973"""Generates a random variate from this distribution."""1974return random.betavariate(self.alpha, self.beta)19751976def Sample(self, n):1977"""Generates a random sample from this distribution.19781979n: int sample size1980"""1981size = n,1982return np.random.beta(self.alpha, self.beta, size)19831984def EvalPdf(self, x):1985"""Evaluates the PDF at x."""1986return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1)19871988def MakePmf(self, steps=101, label=None):1989"""Returns a Pmf of this distribution.19901991Note: Normally, we just evaluate the PDF at a sequence1992of points and treat the probability density as a probability1993mass.19941995But if alpha or beta is less than one, we have to be1996more careful because the PDF goes to infinity at x=01997and x=1. In that case we evaluate the CDF and compute1998differences.1999"""2000if self.alpha < 1 or self.beta < 1:2001cdf = self.MakeCdf()2002pmf = cdf.MakePmf()2003return pmf20042005xs = [i / (steps - 1.0) for i in range(steps)]2006probs = [self.EvalPdf(x) for x in xs]2007pmf = Pmf(dict(zip(xs, probs)), label=label)2008return pmf20092010def MakeCdf(self, steps=101):2011"""Returns the CDF of this distribution."""2012xs = [i / (steps - 1.0) for i in range(steps)]2013ps = [special.betainc(self.alpha, self.beta, x) for x in xs]2014cdf = Cdf(xs, ps)2015return cdf201620172018class Dirichlet(object):2019"""Represents a Dirichlet distribution.20202021See http://en.wikipedia.org/wiki/Dirichlet_distribution2022"""20232024def __init__(self, n, conc=1, label=None):2025"""Initializes a Dirichlet distribution.20262027n: number of dimensions2028conc: concentration parameter (smaller yields more concentration)2029label: string label2030"""2031if n < 2:2032raise ValueError('A Dirichlet distribution with '2033'n<2 makes no sense')20342035self.n = n2036self.params = np.ones(n, dtype=np.float) * conc2037self.label = label if label is not None else '_nolegend_'20382039def Update(self, data):2040"""Updates a Dirichlet distribution.20412042data: sequence of observations, in order corresponding to params2043"""2044m = len(data)2045self.params[:m] += data20462047def Random(self):2048"""Generates a random variate from this distribution.20492050Returns: normalized vector of fractions2051"""2052p = np.random.gamma(self.params)2053return p / p.sum()20542055def Likelihood(self, data):2056"""Computes the likelihood of the data.20572058Selects a random vector of probabilities from this distribution.20592060Returns: float probability2061"""2062m = len(data)2063if self.n < m:2064return 020652066x = data2067p = self.Random()2068q = p[:m] ** x2069return q.prod()20702071def LogLikelihood(self, data):2072"""Computes the log likelihood of the data.20732074Selects a random vector of probabilities from this distribution.20752076Returns: float log probability2077"""2078m = len(data)2079if self.n < m:2080return float('-inf')20812082x = self.Random()2083y = np.log(x[:m]) * data2084return y.sum()20852086def MarginalBeta(self, i):2087"""Computes the marginal distribution of the ith element.20882089See http://en.wikipedia.org/wiki/Dirichlet_distribution2090#Marginal_distributions20912092i: int20932094Returns: Beta object2095"""2096alpha0 = self.params.sum()2097alpha = self.params[i]2098return Beta(alpha, alpha0 - alpha)20992100def PredictivePmf(self, xs, label=None):2101"""Makes a predictive distribution.21022103xs: values to go into the Pmf21042105Returns: Pmf that maps from x to the mean prevalence of x2106"""2107alpha0 = self.params.sum()2108ps = self.params / alpha02109return Pmf(zip(xs, ps), label=label)211021112112def BinomialCoef(n, k):2113"""Compute the binomial coefficient "n choose k".21142115n: number of trials2116k: number of successes21172118Returns: float2119"""2120return scipy.misc.comb(n, k)212121222123def LogBinomialCoef(n, k):2124"""Computes the log of the binomial coefficient.21252126http://math.stackexchange.com/questions/64716/2127approximating-the-logarithm-of-the-binomial-coefficient21282129n: number of trials2130k: number of successes21312132Returns: float2133"""2134return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k)213521362137def NormalProbability(ys, jitter=0.0):2138"""Generates data for a normal probability plot.21392140ys: sequence of values2141jitter: float magnitude of jitter added to the ys21422143returns: numpy arrays xs, ys2144"""2145n = len(ys)2146xs = np.random.normal(0, 1, n)2147xs.sort()21482149if jitter:2150ys = Jitter(ys, jitter)2151else:2152ys = np.array(ys)2153ys.sort()21542155return xs, ys215621572158def Jitter(values, jitter=0.5):2159"""Jitters the values by adding a uniform variate in (-jitter, jitter).21602161values: sequence2162jitter: scalar magnitude of jitter21632164returns: new numpy array2165"""2166n = len(values)2167return np.random.uniform(-jitter, +jitter, n) + values216821692170def NormalProbabilityPlot(sample, fit_color='0.8', **options):2171"""Makes a normal probability plot with a fitted line.21722173sample: sequence of numbers2174fit_color: color string for the fitted line2175options: passed along to Plot2176"""2177xs, ys = NormalProbability(sample)2178mean, var = MeanVar(sample)2179std = math.sqrt(var)21802181fit = FitLine(xs, mean, std)2182thinkplot.Plot(*fit, color=fit_color, label='model')21832184xs, ys = NormalProbability(sample)2185thinkplot.Plot(xs, ys, **options)218621872188def Mean(xs):2189"""Computes mean.21902191xs: sequence of values21922193returns: float mean2194"""2195return np.mean(xs)219621972198def Var(xs, mu=None, ddof=0):2199"""Computes variance.22002201xs: sequence of values2202mu: option known mean2203ddof: delta degrees of freedom22042205returns: float2206"""2207xs = np.asarray(xs)22082209if mu is None:2210mu = xs.mean()22112212ds = xs - mu2213return np.dot(ds, ds) / (len(xs) - ddof)221422152216def Std(xs, mu=None, ddof=0):2217"""Computes standard deviation.22182219xs: sequence of values2220mu: option known mean2221ddof: delta degrees of freedom22222223returns: float2224"""2225var = Var(xs, mu, ddof)2226return math.sqrt(var)222722282229def MeanVar(xs, ddof=0):2230"""Computes mean and variance.22312232Based on http://stackoverflow.com/questions/19391149/2233numpy-mean-and-variance-from-single-function22342235xs: sequence of values2236ddof: delta degrees of freedom22372238returns: pair of float, mean and var2239"""2240xs = np.asarray(xs)2241mean = xs.mean()2242s2 = Var(xs, mean, ddof)2243return mean, s2224422452246def Trim(t, p=0.01):2247"""Trims the largest and smallest elements of t.22482249Args:2250t: sequence of numbers2251p: fraction of values to trim off each end22522253Returns:2254sequence of values2255"""2256n = int(p * len(t))2257t = sorted(t)[n:-n]2258return t225922602261def TrimmedMean(t, p=0.01):2262"""Computes the trimmed mean of a sequence of numbers.22632264Args:2265t: sequence of numbers2266p: fraction of values to trim off each end22672268Returns:2269float2270"""2271t = Trim(t, p)2272return Mean(t)227322742275def TrimmedMeanVar(t, p=0.01):2276"""Computes the trimmed mean and variance of a sequence of numbers.22772278Side effect: sorts the list.22792280Args:2281t: sequence of numbers2282p: fraction of values to trim off each end22832284Returns:2285float2286"""2287t = Trim(t, p)2288mu, var = MeanVar(t)2289return mu, var229022912292def CohenEffectSize(group1, group2):2293"""Compute Cohen's d.22942295group1: Series or NumPy array2296group2: Series or NumPy array22972298returns: float2299"""2300diff = group1.mean() - group2.mean()23012302n1, n2 = len(group1), len(group2)2303var1 = group1.var()2304var2 = group2.var()23052306pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)2307d = diff / math.sqrt(pooled_var)2308return d230923102311def Cov(xs, ys, meanx=None, meany=None):2312"""Computes Cov(X, Y).23132314Args:2315xs: sequence of values2316ys: sequence of values2317meanx: optional float mean of xs2318meany: optional float mean of ys23192320Returns:2321Cov(X, Y)2322"""2323xs = np.asarray(xs)2324ys = np.asarray(ys)23252326if meanx is None:2327meanx = np.mean(xs)2328if meany is None:2329meany = np.mean(ys)23302331cov = np.dot(xs-meanx, ys-meany) / len(xs)2332return cov233323342335def Corr(xs, ys):2336"""Computes Corr(X, Y).23372338Args:2339xs: sequence of values2340ys: sequence of values23412342Returns:2343Corr(X, Y)2344"""2345xs = np.asarray(xs)2346ys = np.asarray(ys)23472348meanx, varx = MeanVar(xs)2349meany, vary = MeanVar(ys)23502351corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary)23522353return corr235423552356def SerialCorr(series, lag=1):2357"""Computes the serial correlation of a series.23582359series: Series2360lag: integer number of intervals to shift23612362returns: float correlation2363"""2364xs = series[lag:]2365ys = series.shift(lag)[lag:]2366corr = Corr(xs, ys)2367return corr236823692370def SpearmanCorr(xs, ys):2371"""Computes Spearman's rank correlation.23722373Args:2374xs: sequence of values2375ys: sequence of values23762377Returns:2378float Spearman's correlation2379"""2380xranks = pandas.Series(xs).rank()2381yranks = pandas.Series(ys).rank()2382return Corr(xranks, yranks)238323842385def MapToRanks(t):2386"""Returns a list of ranks corresponding to the elements in t.23872388Args:2389t: sequence of numbers23902391Returns:2392list of integer ranks, starting at 12393"""2394# pair up each value with its index2395pairs = enumerate(t)23962397# sort by value2398sorted_pairs = sorted(pairs, key=itemgetter(1))23992400# pair up each pair with its rank2401ranked = enumerate(sorted_pairs)24022403# sort by index2404resorted = sorted(ranked, key=lambda trip: trip[1][0])24052406# extract the ranks2407ranks = [trip[0]+1 for trip in resorted]2408return ranks240924102411def LeastSquares(xs, ys):2412"""Computes a linear least squares fit for ys as a function of xs.24132414Args:2415xs: sequence of values2416ys: sequence of values24172418Returns:2419tuple of (intercept, slope)2420"""2421meanx, varx = MeanVar(xs)2422meany = Mean(ys)24232424slope = Cov(xs, ys, meanx, meany) / varx2425inter = meany - slope * meanx24262427return inter, slope242824292430def FitLine(xs, inter, slope):2431"""Fits a line to the given data.24322433xs: sequence of x24342435returns: tuple of numpy arrays (sorted xs, fit ys)2436"""2437fit_xs = np.sort(xs)2438fit_ys = inter + slope * fit_xs2439return fit_xs, fit_ys244024412442def Residuals(xs, ys, inter, slope):2443"""Computes residuals for a linear fit with parameters inter and slope.24442445Args:2446xs: independent variable2447ys: dependent variable2448inter: float intercept2449slope: float slope24502451Returns:2452list of residuals2453"""2454xs = np.asarray(xs)2455ys = np.asarray(ys)2456res = ys - (inter + slope * xs)2457return res245824592460def CoefDetermination(ys, res):2461"""Computes the coefficient of determination (R^2) for given residuals.24622463Args:2464ys: dependent variable2465res: residuals24662467Returns:2468float coefficient of determination2469"""2470return 1 - Var(res) / Var(ys)247124722473def CorrelatedGenerator(rho):2474"""Generates standard normal variates with serial correlation.24752476rho: target coefficient of correlation24772478Returns: iterable2479"""2480x = random.gauss(0, 1)2481yield x24822483sigma = math.sqrt(1 - rho**2)2484while True:2485x = random.gauss(x * rho, sigma)2486yield x248724882489def CorrelatedNormalGenerator(mu, sigma, rho):2490"""Generates normal variates with serial correlation.24912492mu: mean of variate2493sigma: standard deviation of variate2494rho: target coefficient of correlation24952496Returns: iterable2497"""2498for x in CorrelatedGenerator(rho):2499yield x * sigma + mu250025012502def RawMoment(xs, k):2503"""Computes the kth raw moment of xs.2504"""2505return sum(x**k for x in xs) / len(xs)250625072508def CentralMoment(xs, k):2509"""Computes the kth central moment of xs.2510"""2511mean = RawMoment(xs, 1)2512return sum((x - mean)**k for x in xs) / len(xs)251325142515def StandardizedMoment(xs, k):2516"""Computes the kth standardized moment of xs.2517"""2518var = CentralMoment(xs, 2)2519std = math.sqrt(var)2520return CentralMoment(xs, k) / std**k252125222523def Skewness(xs):2524"""Computes skewness.2525"""2526return StandardizedMoment(xs, 3)252725282529def Median(xs):2530"""Computes the median (50th percentile) of a sequence.25312532xs: sequence or anything else that can initialize a Cdf25332534returns: float2535"""2536cdf = Cdf(xs)2537return cdf.Value(0.5)253825392540def IQR(xs):2541"""Computes the interquartile of a sequence.25422543xs: sequence or anything else that can initialize a Cdf25442545returns: pair of floats2546"""2547cdf = Cdf(xs)2548return cdf.Value(0.25), cdf.Value(0.75)254925502551def PearsonMedianSkewness(xs):2552"""Computes the Pearson median skewness.2553"""2554median = Median(xs)2555mean = RawMoment(xs, 1)2556var = CentralMoment(xs, 2)2557std = math.sqrt(var)2558gp = 3 * (mean - median) / std2559return gp256025612562class FixedWidthVariables(object):2563"""Represents a set of variables in a fixed width file."""25642565def __init__(self, variables, index_base=0):2566"""Initializes.25672568variables: DataFrame2569index_base: are the indices 0 or 1 based?25702571Attributes:2572colspecs: list of (start, end) index tuples2573names: list of string variable names2574"""2575self.variables = variables25762577# note: by default, subtract 1 from colspecs2578self.colspecs = variables[['start', 'end']] - index_base25792580# convert colspecs to a list of pair of int2581self.colspecs = self.colspecs.astype(np.int).values.tolist()2582self.names = variables['name']25832584def ReadFixedWidth(self, filename, **options):2585"""Reads a fixed width ASCII file.25862587filename: string filename25882589returns: DataFrame2590"""2591df = pandas.read_fwf(filename,2592colspecs=self.colspecs,2593names=self.names,2594**options)2595return df259625972598def ReadStataDct(dct_file, **options):2599"""Reads a Stata dictionary file.26002601dct_file: string filename2602options: dict of options passed to open()26032604returns: FixedWidthVariables object2605"""2606type_map = dict(byte=int, int=int, long=int, float=float, double=float)26072608var_info = []2609for line in open(dct_file, **options):2610match = re.search( r'_column\(([^)]*)\)', line)2611if match:2612start = int(match.group(1))2613t = line.split()2614vtype, name, fstring = t[1:4]2615name = name.lower()2616if vtype.startswith('str'):2617vtype = str2618else:2619vtype = type_map[vtype]2620long_desc = ' '.join(t[4:]).strip('"')2621var_info.append((start, vtype, name, fstring, long_desc))26222623columns = ['start', 'type', 'name', 'fstring', 'desc']2624variables = pandas.DataFrame(var_info, columns=columns)26252626# fill in the end column by shifting the start column2627variables['end'] = variables.start.shift(-1)2628variables.loc[len(variables)-1, 'end'] = 026292630dct = FixedWidthVariables(variables, index_base=1)2631return dct263226332634def Resample(xs, n=None):2635"""Draw a sample from xs with the same length as xs.26362637xs: sequence2638n: sample size (default: len(xs))26392640returns: NumPy array2641"""2642if n is None:2643n = len(xs)2644return np.random.choice(xs, n, replace=True)264526462647def SampleRows(df, nrows, replace=False):2648"""Choose a sample of rows from a DataFrame.26492650df: DataFrame2651nrows: number of rows2652replace: whether to sample with replacement26532654returns: DataDf2655"""2656indices = np.random.choice(df.index, nrows, replace=replace)2657sample = df.loc[indices]2658return sample265926602661def ResampleRows(df):2662"""Resamples rows from a DataFrame.26632664df: DataFrame26652666returns: DataFrame2667"""2668return SampleRows(df, len(df), replace=True)266926702671def ResampleRowsWeighted(df, column='finalwgt'):2672"""Resamples a DataFrame using probabilities proportional to given column.26732674df: DataFrame2675column: string column name to use as weights26762677returns: DataFrame2678"""2679weights = df[column]2680cdf = Cdf(dict(weights))2681indices = cdf.Sample(len(weights))2682sample = df.loc[indices]2683return sample268426852686def PercentileRow(array, p):2687"""Selects the row from a sorted array that maps to percentile p.26882689p: float 0--10026902691returns: NumPy array (one row)2692"""2693rows, cols = array.shape2694index = int(rows * p / 100)2695return array[index,]269626972698def PercentileRows(ys_seq, percents):2699"""Given a collection of lines, selects percentiles along vertical axis.27002701For example, if ys_seq contains simulation results like ys as a2702function of time, and percents contains (5, 95), the result would2703be a 90% CI for each vertical slice of the simulation results.27042705ys_seq: sequence of lines (y values)2706percents: list of percentiles (0-100) to select27072708returns: list of NumPy arrays, one for each percentile2709"""2710nrows = len(ys_seq)2711ncols = len(ys_seq[0])2712array = np.zeros((nrows, ncols))27132714for i, ys in enumerate(ys_seq):2715array[i,] = ys27162717array = np.sort(array, axis=0)27182719rows = [PercentileRow(array, p) for p in percents]2720return rows272127222723def Smooth(xs, sigma=2, **options):2724"""Smooths a NumPy array with a Gaussian filter.27252726xs: sequence2727sigma: standard deviation of the filter2728"""2729return ndimage.filters.gaussian_filter1d(xs, sigma, **options)273027312732class HypothesisTest(object):2733"""Represents a hypothesis test."""27342735def __init__(self, data):2736"""Initializes.27372738data: data in whatever form is relevant2739"""2740self.data = data2741self.MakeModel()2742self.actual = self.TestStatistic(data)2743self.test_stats = None2744self.test_cdf = None27452746def PValue(self, iters=1000):2747"""Computes the distribution of the test statistic and p-value.27482749iters: number of iterations27502751returns: float p-value2752"""2753self.test_stats = [self.TestStatistic(self.RunModel())2754for _ in range(iters)]2755self.test_cdf = Cdf(self.test_stats)27562757count = sum(1 for x in self.test_stats if x >= self.actual)2758return count / iters27592760def MaxTestStat(self):2761"""Returns the largest test statistic seen during simulations.2762"""2763return max(self.test_stats)27642765def PlotCdf(self, label=None):2766"""Draws a Cdf with vertical lines at the observed test stat.2767"""2768def VertLine(x):2769"""Draws a vertical line at x."""2770thinkplot.Plot([x, x], [0, 1], color='0.8')27712772VertLine(self.actual)2773thinkplot.Cdf(self.test_cdf, label=label)27742775def TestStatistic(self, data):2776"""Computes the test statistic.27772778data: data in whatever form is relevant2779"""2780raise UnimplementedMethodException()27812782def MakeModel(self):2783"""Build a model of the null hypothesis.2784"""2785pass27862787def RunModel(self):2788"""Run the model of the null hypothesis.27892790returns: simulated data2791"""2792raise UnimplementedMethodException()279327942795def main():2796pass279727982799if __name__ == '__main__':2800main()280128022803