CoCalc -- utils.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / think-stats-2ed / homeworks / utils.py
201858 views
License: OTHER
1
import pandas as pd
2
import numpy as np
3
import matplotlib.pyplot as plt
4

5
import re
6

7
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        variables: the DataFrame that was passed in
        colspecs: list of [start, end] index pairs
        names: Series of string variable names
        """
        self.variables = variables

        # note: index_base is subtracted from both columns; with the
        # default of 0 nothing changes, with 1 the positions become 0-based
        colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin int is
        # used because the np.int alias no longer exists in current NumPy
        self.colspecs = colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def read_fixed_width(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (anything pd.read_fwf accepts)
        options: keyword args passed along to pd.read_fwf

        returns: DataFrame
        """
        # hand read_fwf a plain list of names rather than a Series
        return pd.read_fwf(filename,
                           colspecs=self.colspecs,
                           names=list(self.names),
                           **options)
41

42

43
def read_stata_dict(dct_file, **options):
    """Parses a Stata dictionary file into a set of fixed width variables.

    Only lines containing a `_column(N)` marker are used; every other
    line is skipped.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    # Stata storage type -> Python type (str* types are handled separately)
    python_types = {
        'byte': int,
        'int': int,
        'long': int,
        'float': float,
        'double': float,
        'numeric': float,
    }

    records = []
    with open(dct_file, **options) as stream:
        for text in stream:
            found = re.search(r'_column\(([^)]*)\)', text)
            if found is None:
                continue

            position = int(found.group(1))
            tokens = text.split()
            type_name, var_name, fmt = tokens[1:4]
            var_name = var_name.lower()
            if type_name.startswith('str'):
                py_type = str
            else:
                py_type = python_types[type_name]
            # whatever follows the format string is the quoted description
            description = ' '.join(tokens[4:]).strip('"')
            records.append((position, py_type, var_name, fmt, description))

    variables = pd.DataFrame(
        records, columns=['start', 'type', 'name', 'fstring', 'desc'])

    # each variable ends where the next one starts; the last one gets 0
    variables['end'] = variables['start'].shift(-1)
    last_row = len(variables) - 1
    variables.loc[last_row, 'end'] = 0

    # positions in the dictionary file are 1-based
    return FixedWidthVariables(variables, index_base=1)
80

81

82
def read_stata(dct_name, dat_name, **options):
    """Reads a fixed width data file described by a Stata dictionary.

    dct_name: string filename of the Stata dictionary file
    dat_name: string filename of the fixed width data file
    options: keyword args passed to read_fixed_width

    returns: DataFrame
    """
    layout = read_stata_dict(dct_name)
    return layout.read_fixed_width(dat_name, **options)
92

93

94
def sample_rows(df, nrows, replace=False):
    """Draws a random sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows to draw
    replace: whether to sample with replacement

    returns: DataFrame
    """
    # pick index labels at random, then select those rows by label
    chosen = np.random.choice(df.index, nrows, replace=replace)
    return df.loc[chosen]
106

107

108
def resample_rows(df):
    """Draws len(df) rows from a DataFrame, with replacement.

    df: DataFrame

    returns: DataFrame
    """
    n = len(df)
    return sample_rows(df, n, replace=True)
116

117

118
def resample_rows_weighted(df, column='finalwgt'):
    """Draws len(df) rows with replacement, weighted by the given column.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    # normalize the weights so they can be used as probabilities
    total = sum(df[column])
    probs = df[column] / total

    chosen = np.random.choice(df.index, len(df), replace=True, p=probs)
    return df.loc[chosen]
131

132

133
def resample_by_year(df, column='wtssall'):
    """Does a weighted resampling of the rows within each year.

    df: DataFrame with a 'year' column
    column: string name of weight variable

    returns: DataFrame
    """
    pieces = []
    for _, year_rows in df.groupby('year'):
        pieces.append(resample_rows_weighted(year_rows, column))

    # stitch the per-year samples together with a fresh 0..n-1 index
    return pd.concat(pieces, ignore_index=True)
146

147

148
def values(df, varname):
    """Counts how often each value appears, sorted by value.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    counts = df[varname].value_counts()
    return counts.sort_index()
157

158

159
def fill_missing(df, varname, badvals=(98, 99)):
    """Fill missing data with random values.

    Values listed in badvals are treated as missing; each one is replaced
    by a value drawn at random (with replacement) from the remaining
    non-missing values of the column.

    df: DataFrame, modified in place
    varname: string column name
    badvals: list of values to be replaced

    returns: number of missing values replaced
    """
    # the default is a tuple so it cannot be mutated between calls;
    # hand replace() a list, as the original default did
    to_replace = list(badvals) if isinstance(badvals, tuple) else badvals

    # replace badvals with NaN; assign the result back to the column,
    # because an in-place replace on df[varname] may act on a temporary
    # copy and leave df unchanged
    df[varname] = df[varname].replace(to_replace, np.nan)

    # get the index of rows missing varname
    null = df[varname].isnull()
    n_missing = int(sum(null))

    # choose a random sample from the non-missing values
    fill = np.random.choice(df[varname].dropna(), n_missing, replace=True)

    # replace missing data with the samples
    df.loc[null, varname] = fill

    # return the number of missing values replaced
    return n_missing
181

182

183
def round_into_bins(df, var, bin_width, high=None, low=0):
    """Rounds values down to the bin they belong in.

    Bins start at `low` and are `bin_width` wide. Values at or above the
    last bin edge are assigned to the last bin, and values below `low`
    are assigned to the first bin.

    df: DataFrame
    var: string variable name
    bin_width: number, width of the bins
    high: number, upper end of the binned range (default: max of df[var])
    low: number, lower edge of the first bin

    returns: array of bin values
    """
    if high is None:
        high = df[var].max()

    bins = np.arange(low, high+bin_width, bin_width)
    indices = np.digitize(df[var], bins) - 1

    # digitize reports 0 for values below `low`, which would become -1
    # here and wrap around to the *last* bin; pin them to the first bin
    indices[indices < 0] = 0
    return bins[indices]
198

199

200
def underride(d, **options):
    """Fills in defaults: adds each option to d unless d already has the key.

    d: dictionary, modified in place
    options: keyword args to add to d

    returns: d
    """
    for key in options:
        d.setdefault(key, options[key])
    return d
210

211

212
def decorate(**options):
    """Decorate the current axes.

    Call decorate with keyword arguments like

    decorate(title='Title',
             xlabel='x',
             ylabel='y')

    The keyword arguments can be any of the axis properties
    https://matplotlib.org/api/axes_api.html

    Two keywords are handled here rather than passed to the axes:
    `legend=False` suppresses the legend, and `loc` sets the location
    of the legend (the default value is 'best').
    """
    # pull out the legend-related keywords before the rest go to set()
    legend_loc = options.pop('loc', 'best')
    show_legend = options.pop('legend', True)
    if show_legend:
        legend(loc=legend_loc)

    axes = plt.gca()
    axes.set(**options)
    plt.tight_layout()
230

231

232
def legend(**options):
    """Draws a legend only if there is at least one labeled item.

    options are passed to the legend() method of the current axes
    (the default value of `loc` is 'best')
    https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
    """
    underride(options, loc='best')

    ax = plt.gca()
    handles, labels = ax.get_legend_handles_labels()
    # with no labeled items there is nothing to show, so skip the call
    # rather than adding an empty legend to the axes
    if handles:
        ax.legend(handles, labels, **options)
243

244
Product

Resources

Company