Repository for a workshop on Bayesian statistics
"""This file contains code used in "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2014 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function, division89import thinkbayes10import thinkplot1112import numpy1314"""15Bayesian solution to the Lincoln index, described in a blog16article at Probably Overthinking It.1718Last year my occasional correspondent John D. Cook wrote an excellent19blog post about the Lincoln index, which is a way to estimate the20number of errors in a document (or program) by comparing results from21two independent testers. Here's his presentation of the problem:2223"Suppose you have a tester who finds 20 bugs in your program. You24want to estimate how many bugs are really in the program. You know25there are at least 20 bugs, and if you have supreme confidence in your26tester, you may suppose there are around 20 bugs. But maybe your27tester isn't very good. Maybe there are hundreds of bugs. How can you28have any idea how many bugs there are? There's no way to know with one29tester. But if you have two testers, you can get a good idea, even if30you don't know how skilled the testers are."3132Then he presents the Lincoln index, an estimator "described by33Frederick Charles Lincoln in 1930," where Wikpedia's use of34"described" is a hint that the index is another example of Stigler's35law of eponymy.3637"Suppose two testers independently search for bugs. Let k1 be the38number of errors the first tester finds and k2 the number of errors39the second tester finds. Let c be the number of errors both testers40find. The Lincoln Index estimates the total number of errors as k1 k241/ c [I changed his notation to be consistent with mine]."4243So if the first tester finds 20 bugs, the second finds 15, and they44find 3 in common, we estimate that there are about 100 bugs.4546Of course, whenever I see something like this, the idea that pops into47my head is that there must be a (better) Bayesian solution! And there48is.4950"""5152def choose(n, k, d={}):53"""The binomial coefficient "n choose k".5455Args:56n: number of trials57k: number of successes58d: map from (n,k) tuples to cached results5960Returns:61int62"""63if k == 0:64return 165if n == 0:66return 06768try:69return d[n, k]70except KeyError:71res = choose(n-1, k) + choose(n-1, k-1)72d[n, k] = res73return res7475def binom(k, n, p):76"""Computes the rest of the binomial PMF.7778k: number of hits79n: number of attempts80p: probability of a hit81"""82return p**k * (1-p)**(n-k)838485class Lincoln(thinkbayes.Suite, thinkbayes.Joint):86"""Represents hypotheses about the number of errors."""8788def Likelihood(self, data, hypo):89"""Computes the likelihood of the data under the hypothesis.9091hypo: n, p1, p292data: k1, k2, c93"""94n, p1, p2 = hypo95k1, k2, c = data9697part1 = choose(n, k1) * binom(k1, n, p1)98part2 = choose(k1, c) * choose(n-k1, k2-c) * binom(k2, n, p2)99return part1 * part2100101102def main():103data = 20, 15, 3104probs = numpy.linspace(0, 1, 101)105hypos = []106for n in range(32, 350):107for p1 in probs:108for p2 in probs:109hypos.append((n, p1, p2))110111suite = Lincoln(hypos)112suite.Update(data)113114n_marginal = suite.Marginal(0)115116thinkplot.Pmf(n_marginal, label='n')117thinkplot.Save(root='lincoln1',118xlabel='number of bugs',119ylabel='PMF',120formats=['pdf', 'png'])121122print(n_marginal.Mean())123print(n_marginal.MaximumLikelihood())124125126if __name__ == '__main__':127main()128129130