📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This file contains code for use with "Think Stats",1by Allen B. Downey, available from greenteapress.com23Copyright 2010 Allen B. Downey4License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html5"""67from __future__ import print_function89from collections import defaultdict10import numpy as np11import sys1213import thinkstats2141516def ReadFemPreg(dct_file='2002FemPreg.dct',17dat_file='2002FemPreg.dat.gz'):18"""Reads the NSFG pregnancy data.1920dct_file: string file name21dat_file: string file name2223returns: DataFrame24"""25dct = thinkstats2.ReadStataDct(dct_file)26df = dct.ReadFixedWidth(dat_file, compression='gzip')27CleanFemPreg(df)28return df293031def CleanFemPreg(df):32"""Recodes variables from the pregnancy frame.3334df: DataFrame35"""36# mother's age is encoded in centiyears; convert to years37df.agepreg /= 100.03839# birthwgt_lb contains at least one bogus value (51 lbs)40# replace with NaN41df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan4243# replace 'not ascertained', 'refused', 'don't know' with NaN44na_vals = [97, 98, 99]45df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)46df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)47df.hpagelb.replace(na_vals, np.nan, inplace=True)4849df.babysex.replace([7, 9], np.nan, inplace=True)50df.nbrnaliv.replace([9], np.nan, inplace=True)5152# birthweight is stored in two columns, lbs and oz.53# convert to a single column in lb54# NOTE: creating a new column requires dictionary syntax,55# not attribute assignment (like df.totalwgt_lb)56df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.05758# due to a bug in ReadStataDct, the last variable gets clipped;59# so for now set it to NaN60df.cmintvw = np.nan616263def MakePregMap(df):64"""Make a map from caseid to list of preg indices.6566df: DataFrame6768returns: dict that maps from caseid to list of indices into preg df69"""70d = defaultdict(list)71for index, caseid in df.caseid.iteritems():72d[caseid].append(index)73return d747576def main(script):77"""Tests the functions in this module.7879script: string script name80"""81df = ReadFemPreg()82print(df.shape)8384assert len(df) == 135938586assert df.caseid[13592] == 1257187assert df.pregordr.value_counts()[1] == 503388assert df.nbrnaliv.value_counts()[1] == 898189assert df.babysex.value_counts()[1] == 464190assert df.birthwgt_lb.value_counts()[7] == 304991assert df.birthwgt_oz.value_counts()[0] == 103792assert df.prglngth.value_counts()[39] == 474493assert df.outcome.value_counts()[1] == 914894assert df.birthord.value_counts()[1] == 441395assert df.agepreg.value_counts()[22.75] == 10096assert df.totalwgt_lb.value_counts()[7.5] == 3029798weights = df.finalwgt.value_counts()99key = max(weights.keys())100assert df.finalwgt.value_counts()[key] == 6101102print('%s: All tests passed.' % script)103104if __name__ == '__main__':105main(*sys.argv)106107108