📚 The CoCalc Library - books, templates and other resources
License: OTHER
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        variables: the DataFrame passed in
        colspecs: list of [start, end] integer index pairs
        names: the 'name' column of variables
        """
        self.variables = variables

        # subtract index_base so the column positions become 0-based
        colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin int is
        # used because the np.int alias no longer exists in current NumPy
        self.colspecs = colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def read_fixed_width(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename
        options: keyword args passed to pd.read_fwf

        returns: DataFrame
        """
        return pd.read_fwf(filename,
                           colspecs=self.colspecs,
                           names=self.names,
                           **options)


def read_stata_dict(dct_file, **options):
    """Reads a Stata dictionary file.

    Only lines containing `_column(N)` are used; each one contributes a
    row with the start position, type, lowercased name, format string
    and description of one variable.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object

    raises: KeyError if a variable type is not a str type and is not in
            the type map; ValueError if the `_column(...)` contents are
            not an integer or the line has fewer than four fields
    """
    type_map = dict(byte=int, int=int, long=int, float=float,
                    double=float, numeric=float)

    # compiled once, outside the loop over lines
    column_re = re.compile(r'_column\(([^)]*)\)')

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = column_re.search(line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pd.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column: each variable
    # ends where the next one starts
    variables['end'] = variables.start.shift(-1)

    # the last variable has no successor, so its end is set to 0, which
    # becomes -1 once index_base=1 is subtracted below.
    # NOTE(review): presumably meant as "read to the end of the line";
    # confirm how pd.read_fwf treats a negative end position.
    variables.loc[len(variables)-1, 'end'] = 0

    return FixedWidthVariables(variables, index_base=1)


def read_stata(dct_name, dat_name, **options):
    """Reads a fixed width data file described by a Stata dictionary.

    dct_name: string filename of the Stata dictionary file
    dat_name: string filename of the fixed width data file
    options: keyword args passed to pd.read_fwf

    returns: DataFrame
    """
    dct = read_stata_dict(dct_name)
    return dct.read_fixed_width(dat_name, **options)


def sample_rows(df, nrows, replace=False):
    """Choose a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    return df.loc[indices]


def resample_rows(df):
    """Resamples rows from a DataFrame.

    Draws len(df) rows with replacement.

    df: DataFrame

    returns: DataFrame
    """
    return sample_rows(df, len(df), replace=True)


def resample_rows_weighted(df, column='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to given column.

    Draws len(df) rows with replacement.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    # normalize the weights so they can be used as probabilities
    weights = df[column] / df[column].sum()
    indices = np.random.choice(df.index, len(df), replace=True, p=weights)
    return df.loc[indices]


def resample_by_year(df, column='wtssall'):
    """Resample rows within each year.

    Groups df by its 'year' column, resamples each group with
    resample_rows_weighted, and concatenates the results with a new index.

    df: DataFrame
    column: string name of weight variable

    returns: DataFrame
    """
    grouped = df.groupby('year')
    samples = [resample_rows_weighted(group, column)
               for _, group in grouped]
    return pd.concat(samples, ignore_index=True)


def values(df, varname):
    """Values and counts in index order.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    return df[varname].value_counts().sort_index()


def fill_missing(df, varname, badvals=None):
    """Fill missing data with random values.

    Modifies df: values in badvals are treated as missing, and every
    missing value is replaced with a random draw (with replacement) from
    the non-missing values of the same column.

    df: DataFrame
    varname: string column name
    badvals: list of values to be replaced (default [98, 99])

    returns: number of missing values replaced
    """
    if badvals is None:
        badvals = [98, 99]

    # replace badvals with NaN; assign the result back rather than using
    # inplace=True on df[varname], which may act on a temporary copy
    df[varname] = df[varname].replace(badvals, np.nan)

    # get the index of rows missing varname
    null = df[varname].isnull()
    n_missing = int(null.sum())

    # choose a random sample from the non-missing values
    fill = np.random.choice(df[varname].dropna(), n_missing, replace=True)

    # replace missing data with the samples
    df.loc[null, varname] = fill

    # return the number of missing values replaced
    return n_missing


def round_into_bins(df, var, bin_width, high=None, low=0):
    """Rounds values down to the bin they belong in.

    df: DataFrame
    var: string variable name
    bin_width: number, width of the bins
    high: upper end of the bin range (default: the maximum of df[var])
    low: lower edge of the first bin

    returns: array of bin values
    """
    if high is None:
        high = df[var].max()

    bins = np.arange(low, high+bin_width, bin_width)
    indices = np.digitize(df[var], bins)

    # np.digitize returns 0 for values below `low`, so indices-1 is -1
    # for them and they map to the last bin
    return bins[indices-1]


def underride(d, **options):
    """Add key-value pairs to d only if key is not in d.

    d: dictionary (modified in place)
    options: keyword args to add to d

    returns: d
    """
    for key, val in options.items():
        d.setdefault(key, val)

    return d


def decorate(**options):
    """Decorate the current axes.

    Call decorate with keyword arguments like

    decorate(title='Title',
             xlabel='x',
             ylabel='y')

    The keyword arguments can be any of the axis properties
    https://matplotlib.org/api/axes_api.html

    In addition, you can use `legend=False` to suppress the legend.
    And you can use `loc` to indicate the location of the legend
    (the default value is 'best')
    """
    # `loc` and `legend` are consumed here; everything else goes to Axes.set
    loc = options.pop('loc', 'best')
    if options.pop('legend', True):
        legend(loc=loc)

    plt.gca().set(**options)
    plt.tight_layout()


def legend(**options):
    """Draws a legend only if there is at least one labeled item.

    options are passed to Axes.legend() on the current axes;
    `loc` defaults to 'best'
    https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
    """
    underride(options, loc='best')

    ax = plt.gca()
    handles, labels = ax.get_legend_handles_labels()

    # skip the legend entirely when nothing on the axes has a label
    if handles:
        ax.legend(handles, labels, **options)