📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code for use with "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67"""89Results: on a log-log scale the tail of the CCDF is a straight line,10which suggests that the Pareto distribution is a good model for this data,11at least for people with taxable income above the median.1213"""1415import csv16import sys1718import myplot19import Pmf20import Cdf212223def ReadIncomeFile(filename='08in11si.csv'):24"""Reads a data file from the IRS and returns the first two columns.2526Skips the header and returns only the first table (non-cumulative).2728Args:29filename: string data file3031Returns:32list of string pairs33"""34reader = csv.reader(open(filename))35for line in reader:36if line[0] == 'All returns':37break3839t = []40for line in reader:41if line[0].startswith('Accumulated'):42break43t.append(line[0:2])4445return t464748def MakeIncomeDist(data):49"""Converts the strings from the IRS file to a Hist, Pmf and Cdf.5051Args:52data: list of (dollar range, number of returns) string pairs5354Returns:55tuple of (Hist, Pmf, Cdf) representing the number of returns in each bin56"""57def clean(s):58"""Converts dollar amounts to integers."""59try:60return int(s.lstrip('$'))61except ValueError:62if s in ['No', 'income']:63return 064if s == 'more':65return -166return None6768def midpoint(low, high):69"""Finds the midpoint of a range."""70if high == -1:71return low * 3 / 272else:73return (low + high) / 27475hist = Pmf.Hist()7677for column, number in data:78# convert the number of returns79number = number.replace(',', '')80number = int(number)8182# convert the income range83column = column.replace(',', '')84t = column.split()85low, high = t[0], t[-1]86low, high = clean(low), clean(high)8788# add to the histogram89x = midpoint(low, high)90hist.Incr(x, number)91print x, number9293pmf = Pmf.MakePmfFromHist(hist)94cdf = Cdf.MakeCdfFromDict(pmf.GetDict())95return hist, pmf, cdf969798def main(script, *args):99data = ReadIncomeFile()100hist, pmf, cdf = MakeIncomeDist(data)101102# plot the CDF on a log-x scale103myplot.Clf()104myplot.Cdf(cdf)105myplot.Save(root='income_logx',106xscale='log',107xlabel='income',108ylabel='CDF')109110# plot the complementary CDF on a log-log scale111myplot.Clf()112myplot.Cdf(cdf, complement=True)113myplot.Save(root='income_loglog',114complement=True,115xscale='log',116yscale='log',117xlabel='income',118ylabel='complementary CDF')119120121if __name__ == "__main__":122main(*sys.argv)123124125