📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code used in "Think Bayes",1by Allen B. Downey, available from greenteapress.com23Copyright 2012 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67import csv8import math9import numpy10import sys1112import matplotlib13import matplotlib.pyplot as pyplot1415import thinkbayes16import thinkplot171819def ReadScale(filename='sat_scale.csv', col=2):20"""Reads a CSV file of SAT scales (maps from raw score to standard score).2122Args:23filename: string filename24col: which column to start with (0=Reading, 2=Math, 4=Writing)2526Returns: thinkbayes.Interpolator object27"""28def ParseRange(s):29"""Parse a range of values in the form 123-4563031s: string32"""33t = [int(x) for x in s.split('-')]34return 1.0 * sum(t) / len(t)3536fp = open(filename)37reader = csv.reader(fp)38raws = []39scores = []4041for t in reader:42try:43raw = int(t[col])44raws.append(raw)45score = ParseRange(t[col+1])46scores.append(score)47except ValueError:48pass4950raws.sort()51scores.sort()52return thinkbayes.Interpolator(raws, scores)535455def ReadRanks(filename='sat_ranks.csv'):56"""Reads a CSV file of SAT scores.5758Args:59filename: string filename6061Returns:62list of (score, freq) pairs63"""64fp = open(filename)65reader = csv.reader(fp)66res = []6768for t in reader:69try:70score = int(t[0])71freq = int(t[1])72res.append((score, freq))73except ValueError:74pass7576return res777879def DivideValues(pmf, denom):80"""Divides the values in a Pmf by denom.8182Returns a new Pmf.83"""84new = thinkbayes.Pmf()85denom = float(denom)86for val, prob in pmf.Items():87x = val / denom88new.Set(x, prob)89return new909192class Exam(object):93"""Encapsulates information about an exam.9495Contains the distribution of scaled scores and an96Interpolator that maps between scaled and raw scores.97"""98def __init__(self):99self.scale = ReadScale()100101scores = ReadRanks()102score_pmf = thinkbayes.MakePmfFromDict(dict(scores))103104self.raw = self.ReverseScale(score_pmf)105self.max_score = max(self.raw.Values())106self.prior = DivideValues(self.raw, denom=self.max_score)107108center = -0.05109width = 1.8110self.difficulties = MakeDifficulties(center, width, self.max_score)111112def CompareScores(self, a_score, b_score, constructor):113"""Computes posteriors for two test scores and the likelihood ratio.114115a_score, b_score: scales SAT scores116constructor: function that instantiates an Sat or Sat2 object117"""118a_sat = constructor(self, a_score)119b_sat = constructor(self, b_score)120121a_sat.PlotPosteriors(b_sat)122123if constructor is Sat:124PlotJointDist(a_sat, b_sat)125126top = TopLevel('AB')127top.Update((a_sat, b_sat))128top.Print()129130ratio = top.Prob('A') / top.Prob('B')131132print 'Likelihood ratio', ratio133134posterior = ratio / (ratio + 1)135print 'Posterior', posterior136137if constructor is Sat2:138ComparePosteriorPredictive(a_sat, b_sat)139140def MakeRawScoreDist(self, efficacies):141"""Makes the distribution of raw scores for given difficulty.142143efficacies: Pmf of efficacy144"""145pmfs = thinkbayes.Pmf()146for efficacy, prob in efficacies.Items():147scores = self.PmfCorrect(efficacy)148pmfs.Set(scores, prob)149150mix = thinkbayes.MakeMixture(pmfs)151return mix152153def CalibrateDifficulty(self):154"""Make a plot showing the model distribution of raw scores."""155thinkplot.Clf()156thinkplot.PrePlot(num=2)157158cdf = thinkbayes.MakeCdfFromPmf(self.raw, name='data')159thinkplot.Cdf(cdf)160161efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)162pmf = 
class Exam(object):
    """Encapsulates information about an exam.

    Contains the distribution of scaled scores and an
    Interpolator that maps between scaled and raw scores.
    """
    def __init__(self):
        self.scale = ReadScale()

        scores = ReadRanks()
        score_pmf = thinkbayes.MakePmfFromDict(dict(scores))

        self.raw = self.ReverseScale(score_pmf)
        self.max_score = max(self.raw.Values())
        self.prior = DivideValues(self.raw, denom=self.max_score)

        center = -0.05
        width = 1.8
        self.difficulties = MakeDifficulties(center, width, self.max_score)

    def CompareScores(self, a_score, b_score, constructor):
        """Computes posteriors for two test scores and the likelihood ratio.

        a_score, b_score: scaled SAT scores
        constructor: function that instantiates a Sat or Sat2 object
        """
        a_sat = constructor(self, a_score)
        b_sat = constructor(self, b_score)

        a_sat.PlotPosteriors(b_sat)

        if constructor is Sat:
            PlotJointDist(a_sat, b_sat)

        top = TopLevel('AB')
        top.Update((a_sat, b_sat))
        top.Print()

        ratio = top.Prob('A') / top.Prob('B')

        print 'Likelihood ratio', ratio

        posterior = ratio / (ratio + 1)
        print 'Posterior', posterior

        if constructor is Sat2:
            ComparePosteriorPredictive(a_sat, b_sat)

    def MakeRawScoreDist(self, efficacies):
        """Makes the mixture distribution of raw scores for a distribution of efficacy.

        efficacies: Pmf of efficacy
        """
        pmfs = thinkbayes.Pmf()
        for efficacy, prob in efficacies.Items():
            scores = self.PmfCorrect(efficacy)
            pmfs.Set(scores, prob)

        mix = thinkbayes.MakeMixture(pmfs)
        return mix

    def CalibrateDifficulty(self):
        """Make a plot showing the model distribution of raw scores."""
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)

        cdf = thinkbayes.MakeCdfFromPmf(self.raw, name='data')
        thinkplot.Cdf(cdf)

        efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
        pmf = self.MakeRawScoreDist(efficacies)
        cdf = thinkbayes.MakeCdfFromPmf(pmf, name='model')
        thinkplot.Cdf(cdf)

        thinkplot.Save(root='sat_calibrate',
                       xlabel='raw score',
                       ylabel='CDF',
                       formats=['pdf', 'eps'])

    def PmfCorrect(self, efficacy):
        """Returns the PMF of number of correct responses.

        efficacy: float
        """
        pmf = PmfCorrect(efficacy, self.difficulties)
        return pmf

    def Lookup(self, raw):
        """Looks up a raw score and returns a scaled score."""
        return self.scale.Lookup(raw)

    def Reverse(self, score):
        """Looks up a scaled score and returns a raw score.

        Since we ignore the penalty, negative scores round up to zero.
        """
        raw = self.scale.Reverse(score)
        return raw if raw > 0 else 0

    def ReverseScale(self, pmf):
        """Applies the reverse scale to the values of a PMF.

        Args:
          pmf: Pmf object

        Returns:
          new Pmf
        """
        new = thinkbayes.Pmf()
        for val, prob in pmf.Items():
            raw = self.Reverse(val)
            new.Incr(raw, prob)
        return new


class Sat(thinkbayes.Suite):
    """Represents the distribution of p_correct for a test-taker."""

    def __init__(self, exam, score):
        self.exam = exam
        self.score = score

        # start with the prior distribution
        thinkbayes.Suite.__init__(self, exam.prior)

        # update based on an exam score
        self.Update(score)

    def Likelihood(self, data, hypo):
        """Computes the likelihood of a test score, given p_correct."""
        p_correct = hypo
        score = data

        k = self.exam.Reverse(score)
        n = self.exam.max_score
        like = thinkbayes.EvalBinomialPmf(k, n, p_correct)
        return like

    def PlotPosteriors(self, other):
        """Plots posterior distributions of p_correct.

        self, other: Sat objects.
        """
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)

        cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
        cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)

        thinkplot.Cdfs([cdf1, cdf2])
        thinkplot.Save(xlabel='p_correct',
                       ylabel='CDF',
                       axis=[0.7, 1.0, 0.0, 1.0],
                       root='sat_posteriors_p_corr',
                       formats=['pdf', 'eps'])


class Sat2(thinkbayes.Suite):
    """Represents the distribution of efficacy for a test-taker."""

    def __init__(self, exam, score):
        self.exam = exam
        self.score = score

        # start with the Gaussian prior
        efficacies = thinkbayes.MakeGaussianPmf(0, 1.5, 3)
        thinkbayes.Suite.__init__(self, efficacies)

        # update based on an exam score
        self.Update(score)

    def Likelihood(self, data, hypo):
        """Computes the likelihood of a test score, given efficacy."""
        efficacy = hypo
        score = data
        raw = self.exam.Reverse(score)

        pmf = self.exam.PmfCorrect(efficacy)
        like = pmf.Prob(raw)
        return like

    def MakePredictiveDist(self):
        """Returns the distribution of raw scores expected on a re-test."""
        raw_pmf = self.exam.MakeRawScoreDist(self)
        return raw_pmf

    def PlotPosteriors(self, other):
        """Plots posterior distributions of efficacy.

        self, other: Sat2 objects.
        """
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)

        cdf1 = thinkbayes.MakeCdfFromPmf(self, 'posterior %d' % self.score)
        cdf2 = thinkbayes.MakeCdfFromPmf(other, 'posterior %d' % other.score)

        thinkplot.Cdfs([cdf1, cdf2])
        thinkplot.Save(xlabel='efficacy',
                       ylabel='CDF',
                       axis=[0, 4.6, 0.0, 1.0],
                       root='sat_posteriors_eff',
                       formats=['pdf', 'eps'])
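# Note (added for clarity; not part of the original code): the two suites
# differ only in their hypotheses and likelihoods.  Sat treats p_correct as
# the hypothesis and scores the data with a binomial likelihood,
#
#     P(raw score k | p_correct) is proportional to
#         p_correct**k * (1 - p_correct)**(n - k),
#
# where n is exam.max_score; Sat2 treats efficacy as the hypothesis and gets
# the likelihood of the raw score from PmfCorrect, which models each
# question's difficulty separately.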
def PlotJointDist(pmf1, pmf2, thresh=0.8):
    """Plot the joint distribution of p_correct.

    pmf1, pmf2: posterior distributions
    thresh: lower bound of the range to be plotted
    """
    def Clean(pmf):
        """Removes values below thresh."""
        vals = [val for val in pmf.Values() if val < thresh]
        for val in vals:
            pmf.Remove(val)

    Clean(pmf1)
    Clean(pmf2)
    pmf = thinkbayes.MakeJoint(pmf1, pmf2)

    thinkplot.Figure(figsize=(6, 6))
    thinkplot.Contour(pmf, contour=False, pcolor=True)

    thinkplot.Plot([thresh, 1.0], [thresh, 1.0],
                   color='gray', alpha=0.2, linewidth=4)

    thinkplot.Save(root='sat_joint',
                   xlabel='p_correct Alice',
                   ylabel='p_correct Bob',
                   axis=[thresh, 1.0, thresh, 1.0],
                   formats=['pdf', 'eps'])


def ComparePosteriorPredictive(a_sat, b_sat):
    """Compares the predictive distributions of raw scores.

    a_sat, b_sat: Sat2 objects with posterior distributions of efficacy
    """
    a_pred = a_sat.MakePredictiveDist()
    b_pred = b_sat.MakePredictiveDist()

    #thinkplot.Clf()
    #thinkplot.Pmfs([a_pred, b_pred])
    #thinkplot.Show()

    a_like = thinkbayes.PmfProbGreater(a_pred, b_pred)
    b_like = thinkbayes.PmfProbLess(a_pred, b_pred)
    c_like = thinkbayes.PmfProbEqual(a_pred, b_pred)

    print 'Posterior predictive'
    print 'A', a_like
    print 'B', b_like
    print 'C', c_like


def PlotPriorDist(pmf):
    """Plot the prior distribution of p_correct.

    pmf: prior
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=1)

    cdf1 = thinkbayes.MakeCdfFromPmf(pmf, 'prior')
    thinkplot.Cdf(cdf1)
    thinkplot.Save(root='sat_prior',
                   xlabel='p_correct',
                   ylabel='CDF',
                   formats=['pdf', 'eps'])


class TopLevel(thinkbayes.Suite):
    """Evaluates the top-level hypotheses about Alice and Bob.

    Uses the bottom-level posterior distribution about p_correct
    (or efficacy).
    """

    def Update(self, data):
        a_sat, b_sat = data

        a_like = thinkbayes.PmfProbGreater(a_sat, b_sat)
        b_like = thinkbayes.PmfProbLess(a_sat, b_sat)
        c_like = thinkbayes.PmfProbEqual(a_sat, b_sat)

        # split the probability of a tie evenly between the hypotheses
        a_like += c_like / 2
        b_like += c_like / 2

        self.Mult('A', a_like)
        self.Mult('B', b_like)

        self.Normalize()


def ProbCorrect(efficacy, difficulty, a=1):
    """Returns the probability that a person gets a question right.

    efficacy: personal ability to answer questions
    difficulty: how hard the question is
    a: logistic scale parameter

    Returns: float prob
    """
    return 1 / (1 + math.exp(-a * (efficacy - difficulty)))


def BinaryPmf(p):
    """Makes a Pmf with values 1 and 0.

    p: probability given to 1

    Returns: Pmf object
    """
    pmf = thinkbayes.Pmf()
    pmf.Set(1, p)
    pmf.Set(0, 1-p)
    return pmf


def PmfCorrect(efficacy, difficulties):
    """Computes the distribution of correct responses.

    efficacy: personal ability to answer questions
    difficulties: list of difficulties, one for each question

    Returns: new Pmf object
    """
    pmf0 = thinkbayes.Pmf([0])

    ps = [ProbCorrect(efficacy, difficulty) for difficulty in difficulties]
    pmfs = [BinaryPmf(p) for p in ps]
    dist = sum(pmfs, pmf0)
    return dist


def MakeDifficulties(center, width, n):
    """Makes a list of n difficulties with a given center and width.

    Returns: list of n floats between center-width and center+width
    """
    low, high = center-width, center+width
    return numpy.linspace(low, high, n)


def ProbCorrectTable():
    """Makes a table of p_correct for a range of efficacy and difficulty."""
    efficacies = [3, 1.5, 0, -1.5, -3]
    difficulties = [-1.85, -0.05, 1.75]

    for eff in efficacies:
        print '%0.2f & ' % eff,
        for diff in difficulties:
            p = ProbCorrect(eff, diff)
            print '%0.2f & ' % p,
        print r'\\'
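# Worked example (added for clarity; not part of the original code): with the
# logistic model in ProbCorrect, a test-taker of average efficacy (0) facing a
# question of average difficulty (-0.05) answers correctly with probability
#
#     1 / (1 + math.exp(-(0 - (-0.05)))) = 1 / (1 + math.exp(-0.05)) ~= 0.51,
#
# while the strongest test-taker in ProbCorrectTable (efficacy 3) gets the
# easiest question (difficulty -1.85) right with probability
# 1 / (1 + math.exp(-4.85)) ~= 0.99.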
def main(script):
    ProbCorrectTable()

    exam = Exam()

    PlotPriorDist(exam.prior)
    exam.CalibrateDifficulty()

    exam.CompareScores(780, 740, constructor=Sat)

    exam.CompareScores(780, 740, constructor=Sat2)


if __name__ == '__main__':
    main(*sys.argv)
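# Usage sketch (added; not part of the original script): assuming this file is
# saved as sat.py, run it with Python 2 (it uses print statements) in a
# directory containing thinkbayes.py, thinkplot.py, sat_scale.csv, and
# sat_ranks.csv:
#
#     python sat.py
#
# This prints the p_correct table, the top-level posteriors and likelihood
# ratio for scores 780 vs. 740, and the posterior predictive comparison, and
# writes the figures sat_prior, sat_calibrate, sat_posteriors_p_corr,
# sat_joint, and sat_posteriors_eff in PDF and EPS format.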