Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mrdbourke
GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/introduction-to-scikit-learn-video.ipynb
874 views
Kernel: Python 3

Introduction to Scikit-Learn (sklearn)

This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

What we're going to cover:

# Let's listify the contents
# (one topic per entry, in the order the notebook covers them)
what_were_covering = [
    "0. An end-to-end Scikit-Learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithm and use it to make predictions on our data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Putting it all together!",
]
what_were_covering
['0. An end-to-end Scikit-Learn workflow', '1. Getting the data ready', '2. Choose the right estimator/algorithm for our problems', '3. Fit the model/algorithm and use it to make predictions on our data', '4. Evaluating a model', '5. Improve a model', '6. Save and load a trained model', '7. Putting it all together!']
# Standard imports import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline

0. An end-to-end Scikit-Learn workflow

# 1. Get the data ready
import pandas as pd

heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease

# Create X (features matrix): every column except the label
X = heart_disease.drop("target", axis=1)

# Create y (labels)
y = heart_disease["target"]

# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)

# We'll keep the default hyperparameters
clf.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Trailing semicolon suppresses the estimator repr in the notebook output
clf.fit(X_train, y_train);
X_train
# make a prediction
# NOTE: intentionally fails -- predict() expects a 2D array (samples x features),
# so this 1D array raises the ValueError shown in the output below.
y_label = clf.predict(np.array([0, 2, 3, 4]))
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names warnings.warn(
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-10-7cea9660990e> in <module> 1 # make a prediction ----> 2 y_label = clf.predict(np.array([0, 2, 3, 4])) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in predict(self, X) 795 The predicted classes. 796 """ --> 797 proba = self.predict_proba(X) 798 799 if self.n_outputs_ == 1: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X) 837 check_is_fitted(self) 838 # Check data --> 839 X = self._validate_X_predict(X) 840 841 # Assign chunk of trees to jobs ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in _validate_X_predict(self, X) 566 Validate X whenever one tries to predict, apply, predict_proba.""" 567 check_is_fitted(self) --> 568 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) 569 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): 570 raise ValueError("No support for np.int64 index based sparse matrices") ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 555 raise ValueError("Validation should be done on X, y or both.") 556 elif not no_val_X and no_val_y: --> 557 X = check_array(X, **check_params) 558 out = X 559 elif no_val_X and not no_val_y: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 759 # If input is 1D raise error 760 if array.ndim == 1: --> 761 raise ValueError( 762 "Expected 2D array, got 1D array instead:\narray={}.\n" 763 "Reshape your data either using 
array.reshape(-1, 1) if " ValueError: Expected 2D array, got 1D array instead: array=[0. 2. 3. 4.]. Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
# Predict labels for the held-out test features
y_preds = clf.predict(X_test)
y_preds
array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0])
y_test
286 0 29 1 159 1 3 1 182 0 .. 257 0 197 0 49 1 265 0 215 0 Name: target, Length: 61, dtype: int64
# 4. Evaluate the model on the training data and test data
clf.score(X_train, y_train)
1.0
clf.score(X_test, y_test)
0.8688524590163934
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision/recall/F1 for the test-set predictions
print(classification_report(y_test, y_preds))
precision recall f1-score support 0 1.00 0.74 0.85 31 1 0.79 1.00 0.88 30 accuracy 0.87 61 macro avg 0.89 0.87 0.87 61 weighted avg 0.90 0.87 0.87 61
confusion_matrix(y_test, y_preds)
array([[23, 8], [ 0, 30]])
accuracy_score(y_test, y_preds)
0.8688524590163934
# 5. Improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    # Re-create and fit a fresh forest for each setting
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")
Trying model with 10 estimators... Model accuracy on test set: 85.25% Trying model with 20 estimators... Model accuracy on test set: 85.25% Trying model with 30 estimators... Model accuracy on test set: 86.89% Trying model with 40 estimators... Model accuracy on test set: 85.25% Trying model with 50 estimators... Model accuracy on test set: 86.89% Trying model with 60 estimators... Model accuracy on test set: 83.61% Trying model with 70 estimators... Model accuracy on test set: 81.97% Trying model with 80 estimators... Model accuracy on test set: 85.25% Trying model with 90 estimators... Model accuracy on test set: 86.89%
# 6. Save a model and load it
import pickle

# Use context managers so the file handles are closed (and the data flushed)
# as soon as the dump/load finishes.
with open("random_forst_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code from an untrusted file.
with open("random_forst_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

loaded_model.score(X_test, y_test)
0.8688524590163934

1. Getting our data ready to be used with machine learning

Three main things we have to do: 1. Split the data into features and labels (usually X & y) 2. Fill (also called imputing) or disregard missing values 3. Convert non-numerical values to numerical values (also called feature encoding)

heart_disease.head()
# Features: every column except the label
X = heart_disease.drop("target", axis=1)
X.head()

# Labels: the "target" column
y = heart_disease["target"]
y.head()
0 1 1 1 2 1 3 1 4 1 Name: target, dtype: int64
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((212, 13), (91, 13), (212,), (91,))
X.shape[0] * 0.8
242.4
242 + 61
303
len(heart_disease)
303

1.1 Make sure it's all numerical

# Load the car sales data and preview the first rows
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head()
car_sales["Doors"].value_counts()
4 856 5 79 3 65 Name: Doors, dtype: int64
len(car_sales)
1000
car_sales.dtypes
Make object Colour object Odometer (KM) int64 Doors int64 Price int64 dtype: object
# Split into X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
# NOTE: this fit is expected to fail -- X still contains string columns
# (see the "could not convert string to float" error in the output below).
model.fit(X_train, y_train)
model.score(X_test, y_test)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-34-2eeea2d0b490> in <module> 3 4 model = RandomForestRegressor() ----> 5 model.fit(X_train, y_train) 6 model.score(X_test, y_test) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight) 324 if issparse(y): 325 raise ValueError("sparse multilabel-indicator for y is not supported.") --> 326 X, y = self._validate_data( 327 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE 328 ) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 570 y = check_array(y, **check_y_params) 571 else: --> 572 X, y = check_X_y(X, y, **check_params) 573 out = X, y 574 ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 954 raise ValueError("y cannot be None") 955 --> 956 X = check_array( 957 X, 958 accept_sparse=accept_sparse, ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 736 array = array.astype(dtype, casting="unsafe", copy=False) 737 else: --> 738 array = np.asarray(array, order=order, dtype=dtype) 739 except ComplexWarning as complex_warning: 740 raise ValueError( ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order) 83 84 """ ---> 85 return array(a, dtype, copy=False, order=order) 86 87 ValueError: could not convert string to float: 'Toyota'
X.head()
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# "Doors" is numeric but is treated as a category here
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
# One-hot encode the categorical columns; pass the remaining columns through unchanged
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)
transformed_X
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 3.54310e+04], [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00, 1.00000e+00, 1.92714e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 8.47140e+04], ..., [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00, 0.00000e+00, 6.66040e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 2.15883e+05], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 2.48360e+05]])
X.head()
pd.DataFrame(transformed_X)
# Another way to do it with pd.dummies...
# Pass `columns=` explicitly: by default get_dummies only encodes
# object/string/category columns, so the integer "Doors" column would be
# left as a plain number instead of being one-hot encoded.
dummies = pd.get_dummies(
    car_sales[["Make", "Colour", "Doors"]],
    columns=["Make", "Colour", "Doors"],
)
dummies
# Let's refit the model
np.random.seed(42)
# Split the one-hot encoded features this time (not the raw X)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)
RandomForestRegressor()
X.head()
model.score(X_test, y_test)
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names warnings.warn(
0.3235867221569877

1.2 What if there were missing values?

  1. Fill them with some value (also known as imputation).

  2. Remove the samples with missing data altogether.

# Import car sales missing data
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()
car_sales_missing.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Create X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# X still contains missing values at this point
transformed_X = transformer.fit_transform(X)
transformed_X
<1000x16 sparse matrix of type '<class 'numpy.float64'>' with 4000 stored elements in Compressed Sparse Row format>
car_sales_missing
car_sales_missing["Doors"].value_counts()
4.0 811 5.0 75 3.0 64 Name: Doors, dtype: int64

Option 1: Fill missing data with Pandas

# Assign the filled columns back instead of calling fillna(..., inplace=True)
# on a column selection: the chained in-place form is deprecated in recent
# pandas and does not update the DataFrame under Copy-on-Write.

# Fill the "Make" column
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# Fill the "Colour" column
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column with the column mean
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)

# Fill the "Doors" column (4 is by far the most common value)
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

# Check our dataframe again
car_sales_missing.isna().sum()
Make 0 Colour 0 Odometer (KM) 0 Doors 0 Price 50 dtype: int64
# Remove rows with missing Price value
# (Price is the only column with missing values left after the fills above)
car_sales_missing.dropna(inplace=True)
car_sales_missing.isna().sum()
Make 0 Colour 0 Odometer (KM) 0 Doors 0 Price 0 dtype: int64
len(car_sales_missing)
950
# Re-create X & y from the filled data
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Transform the features only (X), not the whole DataFrame: passing
# car_sales_missing would pass the "Price" label through into the feature matrix.
transformed_X = transformer.fit_transform(X)
transformed_X
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 3.54310e+04, 1.53230e+04], [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 1.92714e+05, 1.99430e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 8.47140e+04, 2.83430e+04], ..., [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00, 6.66040e+04, 3.15700e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 2.15883e+05, 4.00100e+03], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00, 2.48360e+05, 1.27320e+04]])

Option 2: Filling missing data and transforming categorical data with Scikit-Learn

Note: This section is different to the video. The video shows filling and transforming the entire dataset (X) and although the techniques are correct, it's best to fill and transform training and test sets separately (as shown in the code below).

The main takeaways:

  • Split your data first (into train/test)

  • Fill/transform the training set and test sets separately

Thank you Robert for pointing this out.

# Reload the raw data so the missing values are back
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()
car_sales_missing.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 Price 0 dtype: int64
# Split into X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Split data into train and test
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Check missing values
X.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 dtype: int64
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features),
])

# Fill train and test values separately:
# fit on the training set only, then apply the fitted imputer to the test set
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

# Check filled X_train
filled_X_train
array([['Honda', 'White', 4.0, 71934.0], ['Toyota', 'Red', 4.0, 162665.0], ['Honda', 'White', 4.0, 42844.0], ..., ['Toyota', 'White', 4.0, 196225.0], ['Honda', 'Blue', 4.0, 133117.0], ['Honda', 'missing', 4.0, 150582.0]], dtype=object)
# Get our transformed data array's back into DataFrame's
# (column order follows the imputer's output: Make, Colour, Doors, Odometer (KM))
car_sales_filled_train = pd.DataFrame(
    filled_X_train,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)
car_sales_filled_test = pd.DataFrame(
    filled_X_test,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)

# Check missing data in training set
car_sales_filled_train.isna().sum()
Make 0 Colour 0 Doors 0 Odometer (KM) 0 dtype: int64
# Check to see the original... still missing values
car_sales_missing.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 Price 0 dtype: int64
# Now let's one hot encode the features with the same code as before
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fill train and test values separately:
# fit the encoder on the training set, then reuse it on the test set
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 7.19340e+04], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.62665e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 4.28440e+04], ..., [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.96225e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.33117e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.50582e+05]])
# Now we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)
0.21229043336119102
# Check length of transformed data (filled and one-hot encoded)
# vs. length of original data
# .shape[0] gives the row count directly, without converting the sparse
# matrices to dense arrays just to count them.
transformed_X_train.shape[0] + transformed_X_test.shape[0], len(car_sales)
(950, 1000)

Note: The transformed data has 50 fewer rows because we dropped the 50 rows with missing values in the Price column.

2. Choosing the right estimator/algorithm for your problem

Some things to note:

  • Sklearn refers to machine learning models and algorithms as estimators.

  • Classification problem - predicting a category (heart disease or not)

    • Sometimes you'll see clf (short for classifier) used as a classification estimator

  • Regression problem - predicting a number (selling price of a car)

If you're working on a machine learning problem and looking to use Sklearn and not sure what model you should use, refer to the sklearn machine learning map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

2.1 Picking a machine learning model for a regression problem

Let's use the California Housing dataset - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

# Get California Housing dataset
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
housing
{'data': array([[ 8.3252 , 41. , 6.98412698, ..., 2.55555556, 37.88 , -122.23 ], [ 8.3014 , 21. , 6.23813708, ..., 2.10984183, 37.86 , -122.22 ], [ 7.2574 , 52. , 8.28813559, ..., 2.80225989, 37.85 , -122.24 ], ..., [ 1.7 , 17. , 5.20554273, ..., 2.3256351 , 39.43 , -121.22 ], [ 1.8672 , 18. , 5.32951289, ..., 2.12320917, 39.43 , -121.32 ], [ 2.3886 , 16. , 5.25471698, ..., 2.61698113, 39.37 , -121.24 ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 20640\n\n :Number of Attributes: 8 numeric, predictive attributes and the target\n\n :Attribute Information:\n - MedInc median income in block group\n - HouseAge median house age in block group\n - AveRooms average number of rooms per household\n - AveBedrms average number of bedrooms per household\n - Population block group population\n - AveOccup average number of household members\n - Latitude block group latitude\n - Longitude block group longitude\n\n :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000).\n\nThis dataset was derived from the 1990 U.S. census, using one row per census\nblock group. A block group is the smallest geographical unit for which the U.S.\nCensus Bureau publishes sample data (a block group typically has a population\nof 600 to 3,000 people).\n\nAn household is a group of people residing within a home. 
Since the average\nnumber of rooms and bedrooms in this dataset are provided per household, these\ncolumns may take surpinsingly large values for block groups with few households\nand many empty houses, such as vacation resorts.\n\nIt can be downloaded/loaded using the\n:func:`sklearn.datasets.fetch_california_housing` function.\n\n.. topic:: References\n\n - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\n Statistics and Probability Letters, 33 (1997) 291-297\n'}
# Put the feature matrix into a DataFrame with named columns
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

# Add the label as a "target" column
housing_df["target"] = housing["target"]
housing_df.head()
housing_df
# Import algorithm/estimator
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]  # median house price in $100,000s

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training set)
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the model (on the test set)
model.score(X_test, y_test)
0.5758549611440125

What if Ridge didn't work or the score didn't fit our needs?

Well, we could always try a different model...

How about we try an ensemble model (an ensemble is a combination of smaller models that tries to make better predictions than any single model)?

Sklearn's ensemble models can be found here: https://scikit-learn.org/stable/modules/ensemble.html

# Import the RandomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create random forest model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Check the score of the model (on the test set)
model.score(X_test, y_test)
0.8066196804802649

2.2 Picking a machine learning model for a classification problem

Let's go to the map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Get the data (be sure to click "raw") - https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/heart-disease.csv
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")
heart_disease.head()
len(heart_disease)
303

Consulting the map, it says to try LinearSVC.

# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/svm/_base.py:1199: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn(
0.8688524590163934
heart_disease["target"].value_counts()
1 165 0 138 Name: target, dtype: int64
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier
clf.score(X_test, y_test)
0.8524590163934426

Tidbit:

1. If you have structured data, use ensemble methods 2. If you have unstructured data, use deep learning or transfer learning
heart_disease

3. Fit the model/algorithm on our data and use it to make predictions

3.1 Fitting the model to the data

Different names for:

  • X = features, features variables, data

  • y = labels, targets, target variables

# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data (training the machine learning model)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier (use the patterns the model has learned)
clf.score(X_test, y_test)
0.8524590163934426
X.head()
y.tail()
298 0 299 0 300 0 301 0 302 0 Name: target, dtype: int64

Random Forest model deep dive

These resources will help you understand what's happening inside the Random Forest models we've been using.

3.2 Make predictions using a machine learning model

2 ways to make predictions:

  1. predict()

  2. predict_proba()

# Use a trained model to make predictions
# NOTE: intentionally fails -- predict() expects a 2D array, so this 1D
# array raises the ValueError shown in the output below.
clf.predict(np.array([1, 7, 8, 3, 4]))  # this doesn't work...
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names warnings.warn(
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-82-5908053f578c> in <module> 1 # Use a trained model to make predictions ----> 2 clf.predict(np.array([1, 7, 8, 3, 4])) # this doesn't work... ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in predict(self, X) 795 The predicted classes. 796 """ --> 797 proba = self.predict_proba(X) 798 799 if self.n_outputs_ == 1: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X) 837 check_is_fitted(self) 838 # Check data --> 839 X = self._validate_X_predict(X) 840 841 # Assign chunk of trees to jobs ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/ensemble/_forest.py in _validate_X_predict(self, X) 566 Validate X whenever one tries to predict, apply, predict_proba.""" 567 check_is_fitted(self) --> 568 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) 569 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): 570 raise ValueError("No support for np.int64 index based sparse matrices") ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 555 raise ValueError("Validation should be done on X, y or both.") 556 elif not no_val_X and no_val_y: --> 557 X = check_array(X, **check_params) 558 out = X 559 elif no_val_X and not no_val_y: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 759 # If input is 1D raise error 760 if array.ndim == 1: --> 761 raise ValueError( 762 "Expected 2D array, got 1D array instead:\narray={}.\n" 763 "Reshape 
your data either using array.reshape(-1, 1) if " ValueError: Expected 2D array, got 1D array instead: array=[1. 7. 8. 3. 4.]. Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
X_test.head()
clf.predict(X_test)
array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])
np.array(y_test)
array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
# Mean of the element-wise matches = fraction of correct predictions
np.mean(y_preds == y_test)
0.8524590163934426
clf.score(X_test, y_test)
0.8524590163934426
from sklearn.metrics import accuracy_score

# Same accuracy as above, via the metric function
accuracy_score(y_test, y_preds)
0.8524590163934426

Make predictions with predict_proba() - use this if someone asks you "what's the probability your model is assigning to each prediction?"

# predict_proba() returns probabilities of a classification label
# (one row per sample, one column per class)
clf.predict_proba(X_test[:5])
array([[0.89, 0.11], [0.49, 0.51], [0.43, 0.57], [0.84, 0.16], [0.18, 0.82]])
# Let's predict() on the same data...
clf.predict(X_test[:5])
array([0, 1, 1, 0, 1])
X_test[:5]
heart_disease["target"].value_counts()
1 165 0 138 Name: target, dtype: int64

predict() can also be used for regression models.

housing_df.head()
from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # Create the data X = housing_df.drop("target", axis=1) y = housing_df["target"] # Split into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Create model instance model = RandomForestRegressor() # Fit the model to the data model.fit(X_train, y_train) # Make predictions y_preds = model.predict(X_test)
y_preds[:10]
array([0.49384 , 0.75494 , 4.9285964, 2.54029 , 2.33176 , 1.6549701, 2.34323 , 1.66182 , 2.47489 , 4.8344779])
np.array(y_test[:10])
array([0.477 , 0.458 , 5.00001, 2.186 , 2.78 , 1.587 , 1.982 , 1.575 , 3.4 , 4.466 ])
# Compare the predictions to the truth from sklearn.metrics import mean_absolute_error mean_absolute_error(y_test, y_preds)
0.3265721842781009
housing_df["target"]
0 4.526 1 3.585 2 3.521 3 3.413 4 3.422 ... 20635 0.781 20636 0.771 20637 0.923 20638 0.847 20639 0.894 Name: target, Length: 20640, dtype: float64

4. Evaluating a machine learning model

Three ways to evaluate Scikit-Learn models/estimators:

  1. Estimator's built-in score() method

  2. The scoring parameter

  3. Problem-specific metric functions

You can read more about these here: https://scikit-learn.org/stable/modules/model_evaluation.html

4.1 Evaluating a model with the score method

from sklearn.ensemble import RandomForestClassifier # Setup random seed np.random.seed(42) # Make the data X = heart_disease.drop("target", axis=1) y = heart_disease["target"] # Split the data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate Random Forest Classifier clf = RandomForestClassifier(n_estimators=1000) # Fit the model to the data (training the machine learning model) clf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=1000)
# The highest value for the .score() method is 1.0, the lowest is 0.0 clf.score(X_train, y_train)
1.0
clf.score(X_test, y_test)
0.8688524590163934

Let's use the score() on our regression problem...

from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # Create the data X = housing_df.drop("target", axis=1) y = housing_df["target"] # Split into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Create model instance model = RandomForestRegressor(n_estimators=100) # Fit the model to the data model.fit(X_train, y_train)
RandomForestRegressor()
# The default score() evaluation metric is r_squared for regression algorithms # Highest = 1.0, lowest = 0.0 model.score(X_test, y_test)
0.8066196804802649

4.2 Evaluating a model using the scoring parameter

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf = RandomForestClassifier(n_estimators=100) clf.fit(X_train, y_train);
clf.score(X_test, y_test)
0.8524590163934426
cross_val_score(clf, X, y, cv=5)
array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])
cross_val_score(clf, X, y, cv=10)
array([0.90322581, 0.80645161, 0.87096774, 0.9 , 0.86666667, 0.8 , 0.73333333, 0.86666667, 0.73333333, 0.8 ])
np.random.seed(42) # Single training and test split score clf_single_score = clf.score(X_test, y_test) # Take the mean of 5-fold cross-validation score clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5)) # Compare the two clf_single_score, clf_cross_val_score
(0.8524590163934426, 0.8248087431693989)
# Default scoring parameter of classifier = mean accuracy # clf.score()
# Scoring parameter set to None by default cross_val_score(clf, X, y, cv=5, scoring=None)
array([0.78688525, 0.86885246, 0.80327869, 0.78333333, 0.76666667])

4.2.1 Classification model evaluation metrics

  1. Accuracy

  2. Area under ROC curve

  3. Confusion matrix

  4. Classification report

Accuracy

heart_disease.head()
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Features (every column except "target") and labels
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)

# 5-fold cross-validation; scoring is left at its default for the estimator.
# NOTE: this assignment rebinds the name `cross_val_score` from the imported
# function to the array of fold scores, so the function has to be imported
# again before it can be called in a later cell.
cross_val_score = cross_val_score(clf, X, y, cv=5)
np.mean(cross_val_score)
0.8248087431693989
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score) *100:.2f}%")
Heart Disease Classifier Cross-Validated Accuracy: 82.48%

Area under the receiver operating characteristic curve (AUC/ROC)

  • Area under curve (AUC)

  • ROC curve

ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).

  • True positive = model predicts 1 when truth is 1

  • False positive = model predicts 1 when truth is 0

  • True negative = model predicts 0 when truth is 0

  • False negative = model predicts 0 when truth is 1

# Create X_test... etc X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
from sklearn.metrics import roc_curve # Fit the classifier clf.fit(X_train, y_train) # Make predictions with probabilities y_probs = clf.predict_proba(X_test) y_probs[:10], len(y_probs)
(array([[0.51, 0.49], [0.17, 0.83], [0.51, 0.49], [0.72, 0.28], [0.43, 0.57], [0.12, 0.88], [0.3 , 0.7 ], [0.97, 0.03], [0.15, 0.85], [0.4 , 0.6 ]]), 61)
y_probs_positive = y_probs[:, 1] y_probs_positive[:10]
array([0.49, 0.83, 0.49, 0.28, 0.57, 0.88, 0.7 , 0.03, 0.85, 0.6 ])
# Caculate fpr, tpr and thresholds fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive) # Check the false positive rates fpr
array([0. , 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.06896552, 0.06896552, 0.06896552, 0.10344828, 0.10344828, 0.13793103, 0.13793103, 0.13793103, 0.20689655, 0.20689655, 0.20689655, 0.27586207, 0.37931034, 0.37931034, 0.48275862, 0.48275862, 0.55172414, 0.55172414, 1. ])
# Helper for drawing ROC curves
import matplotlib.pyplot as plt


def plot_roc_curve(fpr, tpr):
    """
    Draw the ROC curve of a model from its false positive rates (fpr)
    and true positive rates (tpr), then show the figure.
    """
    # The curve itself: fpr on the x-axis, tpr on the y-axis
    plt.plot(fpr, tpr, label="ROC", color="orange")

    # The dashed diagonal "Guessing" baseline (no predictive power) is
    # currently disabled, so only the ROC line is drawn.

    # Title, axis labels and legend
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.ylabel("True positive rate (tpr)")
    plt.xlabel("False positive rate (fpr)")
    plt.legend()
    plt.show()


plot_roc_curve(fpr, tpr)
Image in a Jupyter notebook
from sklearn.metrics import roc_auc_score roc_auc_score(y_test, y_probs_positive)
0.8669181034482759
# Plot perfect ROC curve and AUC score fpr, tpr, thresholds = roc_curve(y_test, y_test) plot_roc_curve(fpr, tpr)
Image in a Jupyter notebook
# Perfect AUC score roc_auc_score(y_test, y_test)
1.0

Confusion matrix

The next way to evaluate a classification model is by using a confusion matrix.

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict. In essence, giving you an idea of where the model is getting confused.

from sklearn.metrics import confusion_matrix y_preds = clf.predict(X_test) confusion_matrix(y_test, y_preds)
array([[23, 6], [ 6, 26]])

Again, this is probably easier visualized.

One way to do it is with pd.crosstab().

pd.crosstab(y_test, y_preds, rownames=["Actual Label"], colnames=["Predicted Label"])

Creating a confusion matrix using Scikit-Learn

Scikit-Learn has multiple different implementations of plotting confusion matrices:

  1. sklearn.metrics.ConfusionMatrixDisplay.from_estimator(estimator, X, y) - this takes a fitted estimator (like our clf model), features (X) and labels (y), it then uses the trained estimator to make predictions on X and compares the predictions to y by displaying a confusion matrix.

  2. sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred) - this takes truth labels and predicted labels and compares them by displaying a confusion matrix.

Note: Both of these methods/classes require Scikit-Learn 1.0+. To check your version of Scikit-Learn run:

import sklearn sklearn.__version__

If you don't have 1.0+, you can upgrade at: https://scikit-learn.org/stable/install.html

from sklearn.metrics import ConfusionMatrixDisplay ConfusionMatrixDisplay.from_estimator(estimator=clf, X=X, y=y);
Image in a Jupyter notebook
# Plot confusion matrix from predictions ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds);
Image in a Jupyter notebook

Classification Report

from sklearn.metrics import classification_report print(classification_report(y_test, y_preds))
precision recall f1-score support 0 0.79 0.79 0.79 29 1 0.81 0.81 0.81 32 accuracy 0.80 61 macro avg 0.80 0.80 0.80 61 weighted avg 0.80 0.80 0.80 61
# Where precision and recall become valuable disease_true = np.zeros(10000) disease_true[0] = 1 # only one positive case disease_preds = np.zeros(10000) # model predicts every case as 0 pd.DataFrame(classification_report(disease_true, disease_preds, output_dict=True, zero_division=0))

To summarize classification metrics:

  • Accuracy is a good measure to start with if all classes are balanced (e.g. same amount of samples which are labelled with 0 or 1).

  • Precision and recall become more important when classes are imbalanced.

  • If false positive predictions are worse than false negatives, aim for higher precision.

  • If false negative predictions are worse than false positives, aim for higher recall.

  • F1-score is a combination of precision and recall.

4.2.2 Regression model evaluation metrics

Model evaluation metrics documentation - https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

The ones we're going to cover are:

  1. R^2 (pronounced r-squared) or coefficient of determination

  2. Mean absolute error (MAE)

  3. Mean squared error (MSE)

R^2

What R-squared does: Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1.

from sklearn.ensemble import RandomForestRegressor np.random.seed(42) X = housing_df.drop("target", axis=1) y = housing_df["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) model = RandomForestRegressor(n_estimators=100) model.fit(X_train, y_train)
RandomForestRegressor()
model.score(X_test, y_test)
0.8066196804802649
housing_df.head()
y_test
20046 0.47700 3024 0.45800 15663 5.00001 20484 2.18600 9814 2.78000 ... 15362 2.63300 16623 2.66800 18086 5.00001 2144 0.72300 3665 1.51500 Name: target, Length: 4128, dtype: float64
y_test.mean()
2.0550030959302323
from sklearn.metrics import r2_score # Fill an array with y_test mean y_test_mean = np.full(len(y_test), y_test.mean())
y_test_mean[:10]
array([2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031, 2.0550031])
r2_score(y_true=y_test, y_pred=y_test_mean)
0.0
r2_score(y_true=y_test, y_pred=y_test)
1.0

Mean absolute error (MAE)

MAE is the average of the absolute differences between predictions and actual values.

It gives you an idea of how wrong your model's predictions are.

# MAE from sklearn.metrics import mean_absolute_error y_preds = model.predict(X_test) mae = mean_absolute_error(y_test, y_preds) mae
0.3265721842781009
df = pd.DataFrame(data={"actual values": y_test, "predicted values": y_preds}) df["differences"] = df["predicted values"] - df["actual values"] df.head(10)
# MAE using formulas and differences np.abs(df["differences"]).mean()
0.3265721842781009

Mean squared error (MSE)

MSE is the mean of the square of the errors between actual and predicted values.

# Mean squared error from sklearn.metrics import mean_squared_error y_preds = model.predict(X_test) mse = mean_squared_error(y_test, y_preds) mse
0.2534073069137548
df["squared_differences"] = np.square(df["differences"]) df.head()
# Calculate MSE by hand squared = np.square(df["differences"]) squared.mean()
0.2534073069137548
# Work on a copy so the original `df` keeps its real squared differences
df_large_error = df.copy()

# Increase "squared_differences" for 1 sample (the first row).
# A single .iloc[row, col] assignment is used on purpose: the chained form
# `df_large_error.iloc[0]["squared_differences"] = 16` assigns into a
# temporary row object and is not guaranteed to modify the DataFrame.
squared_col = df_large_error.columns.get_loc("squared_differences")
df_large_error.iloc[0, squared_col] = 16
df_large_error.head()
# Calculate MSE with large error df_large_error["squared_differences"].mean()
0.25728320720794084
# Artificially increase error in "squared_differences" column for ~100 samples df_large_error.iloc[1:100, 3] = 20 df_large_error
# Calculate MSE with large error(s) df_large_error["squared_differences"].mean()
0.7333540351264799

4.2.3 Finally using the scoring parameter

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] clf = RandomForestClassifier(n_estimators=100)
np.random.seed(42) # Cross-validation accuracy cv_acc = cross_val_score(clf, X, y, cv=5, scoring=None) # if scoring=None, esitmator's default scoring evaulation metric is used (accuracy for classification models) cv_acc
array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])
# Cross-validated accuracy print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")
The cross-validated accuracy is: 82.48%
np.random.seed(42) cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy") cv_acc
array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])
# Cross-validated accuracy print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")
The cross-validated accuracy is: 82.48%
# Precision np.random.seed(42) cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision") cv_precision
array([0.82352941, 0.93548387, 0.84848485, 0.79411765, 0.76315789])
# Cross-validated precision print(f"The cross-validated precision is: {np.mean(cv_precision)}")
The cross-validated precision is: 0.8329547346025924
# Recall np.random.seed(42) cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall") cv_recall
array([0.84848485, 0.87878788, 0.84848485, 0.81818182, 0.87878788])
# Cross-validated recall print(f"The cross-validated recall is: {np.mean(cv_recall)}")
The cross-validated recall is: 0.8545454545454545

Let's see the scoring parameter being used for a regression problem...

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestRegressor np.random.seed(42) X = housing_df.drop("target", axis=1) y = housing_df["target"] model = RandomForestRegressor(n_estimators=100)
np.random.seed(42) cv_r2 = cross_val_score(model, X, y, cv=3, scoring=None) np.mean(cv_r2)
0.6545756342466266
cv_r2
array([0.62159677, 0.72076221, 0.62136792])
# Mean squared error cv_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error") np.mean(cv_mse)
-0.4302688111843372
cv_mse
array([-0.52532899, -0.34772023, -0.37442475, -0.43896032, -0.46490977])
# Mean absolute error cv_mae = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error") np.mean(cv_mae)
-0.46681015659399233
cv_mae
array([-0.54518527, -0.41051585, -0.43822943, -0.46672399, -0.47339624])

4.3 Using different evaluation metrics as Scikit-Learn functions

The 3rd way to evaluate scikit-learn machine learning models/estimators is to use the sklearn.metrics module - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split np.random.seed(42) # Create X & y X = heart_disease.drop("target", axis=1) y = heart_disease["target"] # Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Create model clf = RandomForestClassifier() # Fit model clf.fit(X_train, y_train) # Make predictions y_preds = clf.predict(X_test) # Evaluate model using evaluation functions print("Classifier metrics on the test set") print(f"Accurracy: {accuracy_score(y_test, y_preds)*100:.2f}%") print(f"Precision: {precision_score(y_test, y_preds)}") print(f"Recall: {recall_score(y_test, y_preds)}") print(f"F1: {f1_score(y_test, y_preds)}")
Classifier metrics on the test set Accurracy: 85.25% Precision: 0.8484848484848485 Recall: 0.875 F1: 0.8615384615384615
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split np.random.seed(42) # Create X & y X = housing_df.drop("target", axis=1) y = housing_df["target"] # Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Create model model = RandomForestRegressor() # Fit model model.fit(X_train, y_train) # Make predictions y_preds = model.predict(X_test) # Evaluate model using evaluation functions print("Regression metrics on the test set") print(f"R2 score: {r2_score(y_test, y_preds)}") print(f"MAE: {mean_absolute_error(y_test, y_preds)}") print(f"MSE: {mean_squared_error(y_test, y_preds)}")
Regression metrics on the test set R2 score: 0.8066196804802649 MAE: 0.3265721842781009 MSE: 0.2534073069137548
what_were_covering
['0. An end-to-end Scikit-Learn workflow', '1. Getting the data ready', '2. Choose the right estimator/algorithm for our problems', '3. Fit the model/algorithm and use it to make predictions on our data', '4. Evaluating a model', '5. Improve a model', '6. Save and load a trained model', '7. Putting it all together!']

5. Improving a model

First predictions = baseline predictions. First model = baseline model.

From a data perspective:

  • Could we collect more data? (generally, the more data, the better)

  • Could we improve our data?

From a model perspective:

  • Is there a better model we could use?

  • Could we improve the current model?

Hyperparameters vs. Parameters

  • Parameters = patterns in the data that the model finds on its own

  • Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns

Three ways to adjust hyperparameters:

  1. By hand

  2. Randomly with RandomizedSearchCV

  3. Exhaustively with GridSearchCV

from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=100)
clf.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

5.1 Tuning hyperparameters by hand

Let's make 3 sets, training, validation and test.

clf.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

We're going to try and adjust:

  • max_depth

  • max_features

  • min_samples_leaf

  • min_samples_split

  • n_estimators

def evaluate_preds(y_true, y_preds):
    """
    Compare classification predictions (y_preds) against truth labels (y_true).

    Prints accuracy, precision, recall and F1, and returns a dictionary with
    the same four metrics rounded to 2 decimal places under the keys
    "accuracy", "precision", "recall" and "f1".
    """
    # Unrounded scores, computed once and reused for printing and returning
    scores = {"accuracy": accuracy_score(y_true, y_preds),
              "precision": precision_score(y_true, y_preds),
              "recall": recall_score(y_true, y_preds),
              "f1": f1_score(y_true, y_preds)}

    print(f"Acc: {scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"Recall: {scores['recall']:.2f}")
    print(f"F1 score: {scores['f1']:.2f}")

    # Round only the returned values; the printed ones are formatted above
    return {name: round(value, 2) for name, value in scores.items()}
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into X & y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split the data into train, validation & test sets (70% / 15% / 15%)
train_split = round(0.7 * len(heart_disease_shuffled))  # row where the first 70% ends
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled))  # row where the next 15% ends

# .iloc makes it explicit that the rows are selected by position in the
# shuffled frame, not by their original index labels
X_train, y_train = X.iloc[:train_split], y.iloc[:train_split]
X_valid, y_valid = X.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
# The test labels must be taken from the same rows as the test features
# (everything after valid_split), otherwise X_test and y_test do not line up
X_test, y_test = X.iloc[valid_split:], y.iloc[valid_split:]

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make baseline predictions
y_preds = clf.predict(X_valid)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics
Acc: 82.22% Precision: 0.81 Recall: 0.88 F1 score: 0.85
{'accuracy': 0.82, 'precision': 0.81, 'recall': 0.88, 'f1': 0.85}
np.random.seed(42) # Create a second classifier with different hyperparameters clf_2 = RandomForestClassifier(n_estimators=100) clf_2.fit(X_train, y_train) # Make predictions with different hyperparameters y_preds_2 = clf_2.predict(X_valid) # Evalute the 2nd classsifier clf_2_metrics = evaluate_preds(y_valid, y_preds_2)
Acc: 82.22% Precision: 0.84 Recall: 0.84 F1 score: 0.84

5.2 Hyperparameter tuning with RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200], "max_depth": [None, 5, 10, 20, 30], "max_features": ["auto", "sqrt"], "min_samples_split": [2, 4, 6], "min_samples_leaf": [1, 2, 4]} np.random.seed(42) # Split into X & y X = heart_disease_shuffled.drop("target", axis=1) y = heart_disease_shuffled["target"] # Split into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate RandomForestClassifier clf = RandomForestClassifier(n_jobs=1) # Setup RandomizedSearchCV rs_clf = RandomizedSearchCV(estimator=clf, param_distributions=grid, n_iter=10, # number of models to try cv=5, verbose=2) # Fit the RandomizedSearchCV version of clf rs_clf.fit(X_train, y_train);
Fitting 5 folds for each of 10 candidates, totalling 50 fits [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time= 1.3s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time= 1.2s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time= 1.2s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time= 1.3s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time= 1.2s [CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time= 0.1s [CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time= 0.1s [CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time= 0.1s [CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time= 0.1s [CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time= 0.1s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.2s [CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=20, max_features=auto, 
min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time= 0.0s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.6s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, 
min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.4s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s [CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s [CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time= 1.0s [CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time= 1.0s [CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time= 1.1s [CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time= 1.0s [CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time= 1.0s
rs_clf.best_params_
{'n_estimators': 200, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}
# Make predictions with the best hyperparameters rs_y_preds = rs_clf.predict(X_test) # Evaluate the predictions rs_metrics = evaluate_preds(y_test, rs_y_preds)
Acc: 81.97% Precision: 0.77 Recall: 0.86 F1 score: 0.81

5.3 Hyperparameter tuning with GridSearchCV

grid
{'n_estimators': [10, 100, 200, 500, 1000, 1200], 'max_depth': [None, 5, 10, 20, 30], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 4]}
grid_2 = {'n_estimators': [100, 200, 500], 'max_depth': [None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [6], 'min_samples_leaf': [1, 2]}
from sklearn.model_selection import GridSearchCV, train_test_split

# Seed NumPy's global random number generator before splitting and fitting
np.random.seed(42)

# Separate the feature columns (X) from the label column (y)
X = heart_disease_shuffled.drop(columns="target")
y = heart_disease_shuffled["target"]

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Base estimator that the search will clone and tune
clf = RandomForestClassifier(n_jobs=1)

# Exhaustive search over every combination in grid_2 with 5-fold cross-validation
gs_clf = GridSearchCV(
    estimator=clf,
    param_grid=grid_2,
    cv=5,
    verbose=2,
)

# Run the search (the trailing semicolon hides the notebook's repr output)
gs_clf.fit(X_train, y_train);
Fitting 5 folds for each of 12 candidates, totalling 60 fits [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END 
max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.6s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, 
n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.6s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.6s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, 
min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time= 0.1s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s [CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time= 0.5s
# Show the best hyperparameter combination found by the grid search
gs_clf.best_params_
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 200}
# Predict on the test split with the grid-searched model
gs_y_preds = gs_clf.predict(X_test)

# Compare the predictions with the true labels and keep the resulting metrics
gs_metrics = evaluate_preds(y_test, gs_y_preds)
Acc: 78.69% Precision: 0.74 Recall: 0.82 F1 score: 0.78

Let's compare our different models' metrics.

# One column per experiment, one row per metric
metrics_by_model = {
    "baseline": baseline_metrics,
    "clf_2": clf_2_metrics,
    "random search": rs_metrics,
    "grid search": gs_metrics,
}
compare_metrics = pd.DataFrame(metrics_by_model)

# Grouped bar chart of the metrics (the trailing semicolon hides the Axes repr)
compare_metrics.plot(kind="bar", figsize=(10, 8));
Image in a Jupyter notebook

6. Saving and loading trained machine learning models

Two ways to save and load machine learning models:

  1. With Python's pickle module

  2. With the joblib module

Pickle

import pickle

# Save an existing model to file.
# The `with` block guarantees the file handle is flushed and closed, even if
# pickling fails part-way through (a bare open() call would leave it open).
with open("gs_random_random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)
# Load a saved model.
# The `with` block closes the file handle as soon as the model has been read.
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open("gs_random_random_forest_model_1.pkl", "rb") as model_file:
    loaded_pickle_model = pickle.load(model_file)
# Predict on the test split with the model restored from the pickle file
pickle_y_preds = loaded_pickle_model.predict(X_test)

# Evaluate the restored model's predictions against the true labels
evaluate_preds(y_test, pickle_y_preds)
Acc: 78.69% Precision: 0.74 Recall: 0.82 F1 score: 0.78
{'accuracy': 0.79, 'precision': 0.74, 'recall': 0.82, 'f1': 0.78}

Joblib

from joblib import dump, load

# Write the fitted grid-search object to disk with joblib
dump(gs_clf, filename="gs_random_forest_model_1.joblib")
['gs_random_forest_model_1.joblib']
# Read the model back from the joblib file written above
loaded_joblib_model = load(filename="gs_random_forest_model_1.joblib")
# Predict on the test split with the model restored from the joblib file
joblib_y_preds = loaded_joblib_model.predict(X_test)

# Evaluate the restored model's predictions against the true labels
evaluate_preds(y_test, joblib_y_preds)
Acc: 78.69% Precision: 0.74 Recall: 0.82 F1 score: 0.78
{'accuracy': 0.79, 'precision': 0.74, 'recall': 0.82, 'f1': 0.78}

7. Putting it all together!

# Load the car sales dataset (the version with missing values) and display it
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data
# Check the data type of each column
data.dtypes
Make object Colour object Odometer (KM) float64 Doors float64 Price float64 dtype: object
# Count the missing values in each column
data.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64

Steps we want to do (all in one cell):

  1. Fill missing data

  2. Convert data to numbers

  3. Build a model on the data

# Data preparation tools
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling tools
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global random number generator
import numpy as np
np.random.seed(42)

# Load the data and discard rows that have no label (Price)
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data = data.dropna(subset=["Price"])

# Categorical columns: fill gaps with the string "missing", then one-hot encode
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Doors column: fill gaps with the constant 4
door_feature = ["Doors"]
door_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

# Odometer column: fill gaps with the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

# Route each group of columns through its own transformer
# (fill missing values first, then convert to numbers)
preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_feature),
    ("num", numeric_transformer, numeric_features),
])

# Chain the preprocessing and the regressor into a single estimator
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# Separate features from labels, then hold out 20% of the rows for testing
X = data.drop(columns="Price")
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training split and report the score on the test split
model.fit(X_train, y_train)
model.score(X_test, y_test)
0.22188417408787875

It's also possible to use GridSearchCV or RandomizedSearchCV with our Pipeline.

# Use GridSearchCV with our regression Pipeline
from sklearn.model_selection import GridSearchCV

# Keys address nested parameters as <step>__<sub-step>__<parameter>, using the
# step names given when the Pipeline and ColumnTransformer were built.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    # 1.0 means "consider every feature at each split", which is what "auto"
    # meant for RandomForestRegressor. The string "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    "model__max_features": [1.0],
    "model__min_samples_split": [2, 4],
}

# 5-fold cross-validated search over every combination in pipe_grid
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)
Fitting 5 folds for each of 16 candidates, totalling 80 fits [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.7s [CV] END model__max_depth=None, 
model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.5s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.5s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.5s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, 
preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.2s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.4s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.4s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.4s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.4s [CV] END model__max_depth=None, 
model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.4s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.6s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.5s [CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.4s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, 
preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.2s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.2s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.2s [CV] END model__max_depth=5, model__max_features=auto, 
model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END 
model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.1s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s [CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 1.0s
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('cat', Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['Make', 'Colour']), ('door', Pipeline(steps=[('imputer', SimpleImputer(fill_value=4, strategy='constant'))]), ['Doors']), ('num', Pipeline(steps=[('imputer', SimpleImputer())]), ['Odometer ' '(KM)'])])), ('model', RandomForestRegressor())]), param_grid={'model__max_depth': [None, 5], 'model__max_features': ['auto'], 'model__min_samples_split': [2, 4], 'model__n_estimators': [100, 1000], 'preprocessor__num__imputer__strategy': ['mean', 'median']}, verbose=2)
# Score the grid-searched pipeline on the held-out test split
gs_model.score(X_test, y_test)
0.3339554263158365
# Recap the list of topics this notebook set out to cover
what_were_covering
['0. An end-to-end Scikit-Learn workflow', '1. Getting the data ready', '2. Choose the right estimator/algorithm for our problems', '3. Fit the model/algorithm and use it to make predictions on our data', '4. Evaluating a model', '5. Improve a model', '6. Save and load a trained model', '7. Putting it all together!']