Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mrdbourke
GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/introduction-to-scikit-learn-video-OLD.ipynb
874 views
Kernel: Python 3

Introduction to Scikit-Learn (sklearn)

This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

What we're going to cover:

# Table of contents for this notebook, kept in a list so it can be shown as cell output
what_were_covering = [
    "0. An end-to-end Scikit-Learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithm and use it to make predictions on our data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Putting it all together!",
]
what_were_covering
['0. An end-to-end Scikit-Learn workflow', '1. Getting the data ready', '2. Choose the right estimator/algorithm for our problems', '3. Fit the model/algorithm and use it to make predictions on our data', '4. Evaluating a model', '5. Improve a model', '6. Save and load a trained model', '7. Putting it all together!']
# Standard imports import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline

0. An end-to-end Scikit-Learn workflow

# 1. Get the data ready import pandas as pd heart_disease = pd.read_csv("../data/heart-disease.csv") heart_disease
# Labels: the "target" column on its own
y = heart_disease["target"]

# Features matrix: every column except "target"
X = heart_disease.drop(columns="target")
# 2. Choose the right model and hyperparameters from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=100) # We'll keep the default hyperparameters clf.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# 3. Fit the model to the training data from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train);
X_train
# make a prediction y_label = clf.predict(np.array([0, 2, 3, 4]))
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-10-7cea9660990e> in <module> 1 # make a prediction ----> 2 y_label = clf.predict(np.array([0, 2, 3, 4])) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict(self, X) 610 The predicted classes. 611 """ --> 612 proba = self.predict_proba(X) 613 614 if self.n_outputs_ == 1: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X) 654 check_is_fitted(self) 655 # Check data --> 656 X = self._validate_X_predict(X) 657 658 # Assign chunk of trees to jobs ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in _validate_X_predict(self, X) 410 check_is_fitted(self) 411 --> 412 return self.estimators_[0]._validate_X_predict(X, check_input=True) 413 414 @property ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/tree/_classes.py in _validate_X_predict(self, X, check_input) 378 """Validate X whenever one tries to predict, apply, predict_proba""" 379 if check_input: --> 380 X = check_array(X, dtype=DTYPE, accept_sparse="csr") 381 if issparse(X) and (X.indices.dtype != np.intc or 382 X.indptr.dtype != np.intc): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 554 "Reshape your data either using array.reshape(-1, 1) if " 555 "your data has a single feature or array.reshape(1, -1) " --> 556 "if it contains a single sample.".format(array)) 557 558 # in the future np.flexible dtypes will be handled like object dtypes ValueError: Expected 2D array, got 1D array instead: array=[0. 2. 3. 4.]. 
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
y_preds = clf.predict(X_test) y_preds
array([1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1])
y_test
50 1 83 1 295 0 120 1 40 1 .. 25 1 246 0 205 0 3 1 7 1 Name: target, Length: 61, dtype: int64
# 4. Evaluate the model on the training data and test data clf.score(X_train, y_train)
1.0
clf.score(X_test, y_test)
0.7540983606557377
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score print(classification_report(y_test, y_preds))
precision recall f1-score support 0 0.71 0.79 0.75 28 1 0.80 0.73 0.76 33 accuracy 0.75 61 macro avg 0.75 0.76 0.75 61 weighted avg 0.76 0.75 0.75 61
confusion_matrix(y_test, y_preds)
array([[22, 6], [ 9, 24]])
accuracy_score(y_test, y_preds)
0.7540983606557377
# 5. Improve a model
# Sweep n_estimators over 10, 20, ..., 90 and report the test accuracy of each forest.
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    # Rebinding clf each pass means the name ends up pointing at the last
    # (90-estimator) model once the loop finishes.
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")
Trying model with 10 estimators... Model accuracy on test set: 75.41% Trying model with 20 estimators... Model accuracy on test set: 78.69% Trying model with 30 estimators... Model accuracy on test set: 77.05% Trying model with 40 estimators... Model accuracy on test set: 80.33% Trying model with 50 estimators... Model accuracy on test set: 80.33% Trying model with 60 estimators... Model accuracy on test set: 80.33% Trying model with 70 estimators... Model accuracy on test set: 81.97% Trying model with 80 estimators... Model accuracy on test set: 78.69% Trying model with 90 estimators... Model accuracy on test set: 80.33%
# 6. Save a model and load it
import pickle

# Serialise the fitted classifier to disk. The `with` block closes the file
# handle (flushing the bytes) even if pickling raises, instead of leaving the
# handle open until garbage collection.
with open("random_forst_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)
# Restore the classifier from the pickle file written by the previous cell.
# The `with` block closes the file handle as soon as loading finishes.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code embedded in the file.
with open("random_forst_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Score the restored model on the held-out test set
loaded_model.score(X_test, y_test)
0.8032786885245902

1. Getting our data ready to be used with machine learning

Three main things we have to do: 1. Split the data into features and labels (usually X & y) 2. Fill (also called imputing) or disregard missing values 3. Convert non-numerical values to numerical values (also called feature encoding)

heart_disease.head()
X = heart_disease.drop("target", axis=1) X.head()
y = heart_disease["target"] y.head()
0 1 1 1 2 1 3 1 4 1 Name: target, dtype: int64
# Split the data into training and test sets from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((212, 13), (91, 13), (212,), (91,))
X.shape[0] * 0.8
242.4
242 + 61
303
len(heart_disease)
303

1.1 Make sure it's all numerical

car_sales = pd.read_csv("../data/car-sales-extended.csv") car_sales.head()
car_sales["Doors"].value_counts()
4 856 5 79 3 65 Name: Doors, dtype: int64
len(car_sales)
1000
car_sales.dtypes
Make object Colour object Odometer (KM) int64 Doors int64 Price int64 dtype: object
# Split into X/y X = car_sales.drop("Price", axis=1) y = car_sales["Price"] # Split into training and test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Build machine learning model from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor() model.fit(X_train, y_train) model.score(X_test, y_test)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-34-2eeea2d0b490> in <module> 3 4 model = RandomForestRegressor() ----> 5 model.fit(X_train, y_train) 6 model.score(X_test, y_test) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight) 293 """ 294 # Validate or convert input data --> 295 X = check_array(X, accept_sparse="csc", dtype=DTYPE) 296 y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) 297 if sample_weight is not None: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 529 array = array.astype(dtype, casting="unsafe", copy=False) 530 else: --> 531 array = np.asarray(array, order=order, dtype=dtype) 532 except ComplexWarning: 533 raise ValueError("Complex data not supported\n" ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order) 83 84 """ ---> 85 return array(a, dtype, copy=False, order=order) 86 87 ValueError: could not convert string to float: 'Toyota'
X.head()
# Turn the categories into numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode ("Doors" has an integer dtype but is encoded as a category here)
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# Encode the listed columns; remainder="passthrough" keeps all other columns unchanged
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)
transformed_X
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 3.54310e+04], [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00, 1.00000e+00, 1.92714e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 8.47140e+04], ..., [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00, 0.00000e+00, 6.66040e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 2.15883e+05], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 2.48360e+05]])
X.head()
pd.DataFrame(transformed_X)
# Another way to do it with pd.dummies... dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]]) dummies
# Let's refit the model np.random.seed(42) X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2) model.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False)
X.head()
model.score(X_test, y_test)
0.3235867221569877

1.2 What if there were missing values?

  1. Fill them with some value (also known as imputation).

  2. Remove the samples with missing data altogether.

# Import car sales missing data car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv") car_sales_missing.head()
car_sales_missing.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Create X & y X = car_sales_missing.drop("Price", axis=1) y = car_sales_missing["Price"]
# Let's try and convert our data to numbers # Turn the categories into numbers from sklearn.preprocessing import OneHotEncoder from sklearn.compose import ColumnTransformer categorical_features = ["Make", "Colour", "Doors"] one_hot = OneHotEncoder() transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough") transformed_X = transformer.fit_transform(X) transformed_X
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-46-f532939289ac> in <module> 11 remainder="passthrough") 12 ---> 13 transformed_X = transformer.fit_transform(X) 14 transformed_X ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y) 516 self._validate_remainder(X) 517 --> 518 result = self._fit_transform(X, y, _fit_transform_one) 519 520 if not result: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted) 455 message=self._log_message(name, idx, len(transformers))) 456 for idx, (name, trans, column, weight) in enumerate( --> 457 self._iter(fitted=fitted, replace_strings=True), 1)) 458 except ValueError as e: 459 if "Expected 2D array, got 1D array instead" in str(e): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1002 # remaining jobs. 
1003 self._iterating = False -> 1004 if self.dispatch_one_batch(iterator): 1005 self._iterating = self._original_iterator is not None 1006 ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 833 return False 834 else: --> 835 self._dispatch(tasks) 836 return True 837 ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 752 with self._lock: 753 job_idx = len(self._jobs) --> 754 job = self._backend.apply_async(batch, callback=cb) 755 # A job can complete so quickly than its callback is 756 # called before we get here, causing self._jobs to ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 207 def apply_async(self, func, callback=None): 208 """Schedule a func to be run""" --> 209 result = ImmediateResult(func) 210 if callback: 211 callback(result) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 588 # Don't delay the application, to avoid keeping the input 589 # arguments in memory --> 590 self.results = batch() 591 592 def get(self): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 254 with parallel_backend(self._backend, n_jobs=self._n_jobs): 255 return [func(*args, **kwargs) --> 256 for func, args, kwargs in self.items] 257 258 def __len__(self): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 254 with parallel_backend(self._backend, n_jobs=self._n_jobs): 255 return [func(*args, **kwargs) --> 256 for func, args, kwargs in self.items] 257 258 def __len__(self): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 726 with 
_print_elapsed_time(message_clsname, message): 727 if hasattr(transformer, 'fit_transform'): --> 728 res = transformer.fit_transform(X, y, **fit_params) 729 else: 730 res = transformer.fit(X, y, **fit_params).transform(X) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py in fit_transform(self, X, y) 370 """ 371 self._validate_keywords() --> 372 return super().fit_transform(X, y) 373 374 def transform(self, X): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 569 if y is None: 570 # fit method of arity 1 (unsupervised transformation) --> 571 return self.fit(X, **fit_params).transform(X) 572 else: 573 # fit method of arity 2 (supervised transformation) ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py in fit(self, X, y) 345 """ 346 self._validate_keywords() --> 347 self._fit(X, handle_unknown=self.handle_unknown) 348 self.drop_idx_ = self._compute_drop_idx() 349 return self ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py in _fit(self, X, handle_unknown) 72 73 def _fit(self, X, handle_unknown='error'): ---> 74 X_list, n_samples, n_features = self._check_X(X) 75 76 if self.categories != 'auto': ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py in _check_X(self, X) 59 Xi = self._get_feature(X, feature_idx=i) 60 Xi = check_array(Xi, ensure_2d=False, dtype=None, ---> 61 force_all_finite=needs_validation) 62 X_columns.append(Xi) 63 ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 576 if force_all_finite: 577 _assert_all_finite(array, --> 578 
allow_nan=force_all_finite == 'allow-nan') 579 580 if ensure_min_samples > 0: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype) 63 elif X.dtype == np.dtype('object') and not allow_nan: 64 if _object_dtype_isnan(X).any(): ---> 65 raise ValueError("Input contains NaN") 66 67 ValueError: Input contains NaN
car_sales_missing
car_sales_missing["Doors"].value_counts()
4.0 811 5.0 75 3.0 64 Name: Doors, dtype: int64

Option 1: Fill missing data with Pandas

# Fill every feature column by assigning the filled Series back to the DataFrame.
# `df[col].fillna(..., inplace=True)` is chained assignment: it operates on the
# Series returned by `df[col]`, which emits a FutureWarning on pandas 2.x and
# leaves the DataFrame untouched once Copy-on-Write is active. Re-assigning the
# result works on every pandas version.

# Fill the "Make" column with a placeholder category
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# Fill the "Colour" column with a placeholder category
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column with the column mean
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)

# Fill the "Doors" column with 4 (the most common value in value_counts())
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)
# Check our dataframe again car_sales_missing.isna().sum()
Make 0 Colour 0 Odometer (KM) 0 Doors 0 Price 50 dtype: int64
# Remove rows with missing Price value car_sales_missing.dropna(inplace=True)
car_sales_missing.isna().sum()
Make 0 Colour 0 Odometer (KM) 0 Doors 0 Price 0 dtype: int64
len(car_sales_missing)
950
X = car_sales_missing.drop("Price", axis=1) y = car_sales_missing["Price"]
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Transform the feature matrix X only. car_sales_missing still contains the
# "Price" label, and remainder="passthrough" would carry it through as an extra
# feature column, leaking the target into the features.
transformed_X = transformer.fit_transform(X)
transformed_X
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 3.54310e+04, 1.53230e+04], [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 1.92714e+05, 1.99430e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 8.47140e+04, 2.83430e+04], ..., [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00, 6.66040e+04, 3.15700e+04], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00, 2.15883e+05, 4.00100e+03], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00, 2.48360e+05, 1.27320e+04]])

Option 2: Filling missing data and transforming categorical data with Scikit-Learn

Note: This section is different to the video. The video shows filling and transforming the entire dataset (X) and although the techniques are correct, it's best to fill and transform training and test sets separately (as shown in the code below).

The main takeaways:

  • Split your data first (into train/test)

  • Fill/transform the training set and test sets separately

Thank you Robert for pointing this out.

car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv") car_sales_missing.head()
car_sales_missing.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Drop the rows with no labels car_sales_missing.dropna(subset=["Price"], inplace=True) car_sales_missing.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 Price 0 dtype: int64
# Split into X & y ("Price" is the label, everything else is a feature)
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

# Split data into train and test (80% / 20%) after seeding NumPy's global RNG
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Check missing values X.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 dtype: int64
# Fill missing values with Scikit-Learn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column groups, one per filling strategy
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Categorical columns get the constant 'missing', Doors gets the constant 4,
# and the numerical column gets its mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Bundle the three imputers so each one only sees its own columns
imputer = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_features),
        ("door_imputer", door_imputer, door_feature),
        ("num_imputer", num_imputer, num_features),
    ]
)

# Learn the fill values from the training set, then reuse them on the test set
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

# Check filled X_train
filled_X_train
array([['Honda', 'White', 4.0, 71934.0], ['Toyota', 'Red', 4.0, 162665.0], ['Honda', 'White', 4.0, 42844.0], ..., ['Toyota', 'White', 4.0, 196225.0], ['Honda', 'Blue', 4.0, 133117.0], ['Honda', 'missing', 4.0, 150582.0]], dtype=object)
# Wrap the filled arrays back into DataFrames.
# The column names follow the order of the imputer's transformers
# (categorical, doors, numerical), not the column order of the CSV.
car_sales_filled_train = pd.DataFrame(
    filled_X_train,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)
car_sales_filled_test = pd.DataFrame(
    filled_X_test,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)

# Count the missing values left in the training set
car_sales_filled_train.isna().sum()
Make 0 Colour 0 Doors 0 Odometer (KM) 0 dtype: int64
# Check to see the original... still missing values car_sales_missing.isna().sum()
Make 47 Colour 46 Odometer (KM) 48 Doors 47 Price 0 dtype: int64
# One-hot encode the filled features, using the same recipe as the earlier cells
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fit the encoder on the training set only, then apply it to the test set
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

# Show the transformed and filled X_train as a dense array
transformed_X_train.toarray()
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 7.19340e+04], [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.62665e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 4.28440e+04], ..., [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.96225e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.33117e+05], [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00, 0.00000e+00, 1.50582e+05]])
# With X filled and one-hot encoded, try fitting a model on it
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
model = RandomForestRegressor()

# Train on the transformed training features, score on the transformed test features
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)
0.21229043336119102
# Row count of the transformed data (filled and one-hot encoded, train + test)
# next to the row count of the original car_sales data
transformed_X_train.shape[0] + transformed_X_test.shape[0], len(car_sales)
(950, 1000)

Note: The transformed data has 50 fewer rows than the original because we dropped the 50 rows with missing values in the Price column.

2. Choosing the right estimator/algorithm for our problem

Scikit-Learn uses estimator as another term for machine learning model or algorithm.

  • Classification - predicting whether a sample is one thing or another

  • Regression - predicting a number

Step 1 - Check the Scikit-Learn machine learning map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

2.1 Picking a machine learning model for a regression problem

# Import Boston housing dataset from sklearn.datasets import load_boston boston = load_boston() boston;
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"]) boston_df["target"] = pd.Series(boston["target"]) boston_df.head()
# How many samples? len(boston_df)
506
# Let's try the Ridge Regression model from sklearn.linear_model import Ridge # Setup random seed np.random.seed(42) # Create the data X = boston_df.drop("target", axis=1) y = boston_df["target"] # Split into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate Ridge model model = Ridge() model.fit(X_train, y_train) # Check the score of the Ridge model on test data model.score(X_test, y_test)
0.6662221670168518

How do we improve this score?

What if Ridge wasn't working?

Let's refer back to the map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Let's try the Random Forst Regressor from sklearn.ensemble import RandomForestRegressor # Setup random seed np.random.seed(42) # Create the data X = boston_df.drop("target", axis=1) y = boston_df["target"] # Split the data #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instatiate Random Forest Regressor rf = RandomForestRegressor(n_estimators=100) rf.fit(X_train, y_train) # Evaluate the Random Forest Regressor rf.score(X_test, y_test)
0.8896648705127477
# Check the Ridge model again model.score(X_test, y_test)
0.6662221670168518

2.2 Choosing an estimator for a classification problem

Let's go to the map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Reload the heart disease data for the classification example.
# The path is "../data/heart-disease.csv", the same one the end-to-end workflow
# cell loads successfully; "data/heart-disease.csv" raises FileNotFoundError.
heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease.head()
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-73-44f78f3704d4> in <module> ----> 1 heart_disease = pd.read_csv("data/heart-disease.csv") 2 heart_disease.head() ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision) 674 ) 675 --> 676 return _read(filepath_or_buffer, kwds) 677 678 parser_f.__name__ = name ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds) 446 447 # Create the parser. 
--> 448 parser = TextFileReader(fp_or_buf, **kwds) 449 450 if chunksize or iterator: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds) 878 self.options["has_index_names"] = kwds["has_index_names"] 879 --> 880 self._make_engine(self.engine) 881 882 def close(self): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine) 1112 def _make_engine(self, engine="c"): 1113 if engine == "c": -> 1114 self._engine = CParserWrapper(self.f, **self.options) 1115 else: 1116 if engine == "python": ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds) 1889 kwds["usecols"] = self.usecols 1890 -> 1891 self._reader = parsers.TextReader(src, **kwds) 1892 self.unnamed_cols = self._reader.unnamed_cols 1893 pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__() pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source() FileNotFoundError: [Errno 2] File data/heart-disease.csv does not exist: 'data/heart-disease.csv'
len(heart_disease)
303

Consulting the map, it says to try LinearSVC.

# Bring in the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Seed NumPy's global RNG
np.random.seed(42)

# Labels are the "target" column; features are every other column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the LinearSVC with a raised iteration cap and fit it on the training split
clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

# Mean accuracy of the LinearSVC on the test split
clf.score(X_test, y_test)
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/svm/_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
0.47540983606557374
heart_disease["target"].value_counts()
1 165 0 138 Name: target, dtype: int64
# Bring in the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG
np.random.seed(42)

# Labels are the "target" column; features are every other column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a 100-tree random forest and fit it on the training split
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Mean accuracy of the random forest on the test split
clf.score(X_test, y_test)
0.8524590163934426

Tidbit:

1. If you have structured data, use ensemble methods 2. If you have unstructured data, use deep learning or transfer learning
heart_disease

3. Fit the model/algorithm on our data and use it to make predictions

3.1 Fitting the model to the data

Different names for:

  • X = features, features variables, data

  • y = labels, targets, target variables

# Bring in the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG
np.random.seed(42)

# Labels are the "target" column; features are every other column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a 100-tree random forest
clf = RandomForestClassifier(n_estimators=100)

# Training step: the model learns patterns from the training features and labels
clf.fit(X_train, y_train)

# Evaluation step: apply what the model learned to the unseen test split
clf.score(X_test, y_test)
0.8524590163934426
X.head()
y.tail()
298 0 299 0 300 0 301 0 302 0 Name: target, dtype: int64

Random Forest model deep dive

These resources will help you understand what's happening inside the Random Forest models we've been using.

3.2 Make predictions using a machine learning model

2 ways to make predictions:

  1. predict()

  2. predict_proba()

# Use a trained model to make predictions clf.predict(np.array([1, 7, 8, 3, 4])) # this doesn't work...
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-82-5908053f578c> in <module> 1 # Use a trained model to make predictions ----> 2 clf.predict(np.array([1, 7, 8, 3, 4])) # this doesn't work... ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict(self, X) 610 The predicted classes. 611 """ --> 612 proba = self.predict_proba(X) 613 614 if self.n_outputs_ == 1: ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X) 654 check_is_fitted(self) 655 # Check data --> 656 X = self._validate_X_predict(X) 657 658 # Assign chunk of trees to jobs ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in _validate_X_predict(self, X) 410 check_is_fitted(self) 411 --> 412 return self.estimators_[0]._validate_X_predict(X, check_input=True) 413 414 @property ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/tree/_classes.py in _validate_X_predict(self, X, check_input) 378 """Validate X whenever one tries to predict, apply, predict_proba""" 379 if check_input: --> 380 X = check_array(X, dtype=DTYPE, accept_sparse="csr") 381 if issparse(X) and (X.indices.dtype != np.intc or 382 X.indptr.dtype != np.intc): ~/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 554 "Reshape your data either using array.reshape(-1, 1) if " 555 "your data has a single feature or array.reshape(1, -1) " --> 556 "if it contains a single sample.".format(array)) 557 558 # in the future np.flexible dtypes will be handled like object dtypes ValueError: Expected 2D array, got 1D array instead: array=[1. 7. 8. 
3. 4.]. Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
X_test.head()
clf.predict(X_test)
array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])
np.array(y_test)
array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])
# Accuracy by hand: the fraction of test samples whose predicted label equals the true label
y_preds = clf.predict(X_test)
(y_preds == y_test).mean()
0.8524590163934426
clf.score(X_test, y_test)
0.8524590163934426
from sklearn.metrics import accuracy_score accuracy_score(y_test, y_preds)
0.8524590163934426

Make predictions with predict_proba() - use this if someone asks you "what's the probability your model is assigning to each prediction?"

# predict_proba() returns probabilities of a classification label clf.predict_proba(X_test[:5])
array([[0.89, 0.11], [0.49, 0.51], [0.43, 0.57], [0.84, 0.16], [0.18, 0.82]])
# Let's predict() on the same data... clf.predict(X_test[:5])
array([0, 1, 1, 0, 1])
X_test[:5]
heart_disease["target"].value_counts()
1 165 0 138 Name: target, dtype: int64

predict() can also be used for regression models.

boston_df.head()
from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # Create the data X = boston_df.drop("target", axis=1) y = boston_df["target"] # Split into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate and fit model model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train) # Make predictions y_preds = model.predict(X_test)
y_preds[:10]
array([23.002, 30.826, 16.734, 23.467, 16.853, 21.725, 19.232, 15.239, 21.067, 20.738])
np.array(y_test[:10])
array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])
# Compare the predictions to the truth from sklearn.metrics import mean_absolute_error mean_absolute_error(y_test, y_preds)
2.1226372549019623

4. Evaluating a machine learning model

Three ways to evaluate Scikit-Learn models/estimators:

  1. Estimator score method

  2. The scoring parameter

  3. Problem-specific metric functions.

4.1 Evaluating a model with the score method

from sklearn.ensemble import RandomForestClassifier np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf = RandomForestClassifier() clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False)
clf.score(X_train, y_train)
1.0
clf.score(X_test, y_test)
0.8524590163934426

Let's do the same but for regression...

from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # Create the data X = boston_df.drop("target", axis=1) y = boston_df["target"] # Split into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate and fit model model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
model.score(X_test, y_test)
0.873969014117403

4.2 Evaluating a model using the scoring parameter

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf = RandomForestClassifier(n_estimators=100) clf.fit(X_train, y_train);
clf.score(X_test, y_test)
0.8524590163934426
cross_val_score(clf, X, y, cv=5)
array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])
cross_val_score(clf, X, y, cv=10)
array([0.90322581, 0.80645161, 0.87096774, 0.9 , 0.86666667, 0.8 , 0.73333333, 0.86666667, 0.73333333, 0.8 ])
np.random.seed(42) # Single training and test split score clf_single_score = clf.score(X_test, y_test) # Take the mean of 5-fold cross-validation score clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5)) # Compare the two clf_single_score, clf_cross_val_score
(0.8524590163934426, 0.8248087431693989)
# Default scoring parameter of classifier = mean accuracy clf.score()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-108-cca012993b3a> in <module> 1 # Default scoring parameter of classifier = mean accuracy ----> 2 clf.score() TypeError: score() missing 2 required positional arguments: 'X' and 'y'
# Scoring parameter set to None by default cross_val_score(clf, X, y, cv=5, scoring=None)

4.2.1 Classification model evaluation metrics

  1. Accuracy

  2. Area under ROC curve

  3. Confusion matrix

  4. Classification report

Accuracy

heart_disease.head()
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)

# Keep the per-fold accuracies under their own name. Assigning the result back
# to `cross_val_score` would replace the imported function with an array, so
# any later call to cross_val_score() would fail until it was imported again.
cv_scores = cross_val_score(clf, X, y, cv=5)

# Mean accuracy across the 5 folds
np.mean(cv_scores)

print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cv_scores) *100:.2f}%")

Area under the receiver operating characteristic curve (AUC/ROC)

  • Area under curve (AUC)

  • ROC curve

ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).

  • True positive = model predicts 1 when truth is 1

  • False positive = model predicts 1 when truth is 0

  • True negative = model predicts 0 when truth is 0

  • False negative = model predicts 0 when truth is 1

# Create X_test... etc X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
from sklearn.metrics import roc_curve # Fit the classifier clf.fit(X_train, y_train) # Make predictions with probabilities y_probs = clf.predict_proba(X_test) y_probs[:10], len(y_probs)
(array([[0.51, 0.49], [0.17, 0.83], [0.51, 0.49], [0.72, 0.28], [0.43, 0.57], [0.12, 0.88], [0.3 , 0.7 ], [0.97, 0.03], [0.15, 0.85], [0.4 , 0.6 ]]), 61)
y_probs_positive = y_probs[:, 1] y_probs_positive[:10]
array([0.49, 0.83, 0.49, 0.28, 0.57, 0.88, 0.7 , 0.03, 0.85, 0.6 ])
# Caculate fpr, tpr and thresholds fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive) # Check the false positive rates fpr
array([0. , 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.03448276, 0.06896552, 0.06896552, 0.06896552, 0.10344828, 0.10344828, 0.13793103, 0.13793103, 0.13793103, 0.20689655, 0.20689655, 0.20689655, 0.27586207, 0.37931034, 0.37931034, 0.48275862, 0.48275862, 0.55172414, 0.55172414, 1. ])
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt


def plot_roc_curve(fpr, tpr, show_baseline=False):
    """
    Plots a ROC curve given the false positive rate (fpr)
    and true positive rate (tpr) of a model.

    Parameters:
        fpr: false positive rates, plotted on the x-axis.
        tpr: true positive rates, plotted on the y-axis.
        show_baseline: when True, also draws a dashed diagonal from (0, 0)
            to (1, 1) labelled "Guessing". Defaults to False, which draws
            only the ROC curve.
    """
    # Plot roc curve
    plt.plot(fpr, tpr, color="orange", label="ROC")

    if show_baseline:
        # Plot line with no predictive power (baseline)
        plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()


plot_roc_curve(fpr, tpr)
Image in a Jupyter notebook
from sklearn.metrics import roc_auc_score roc_auc_score(y_test, y_probs_positive)
0.8669181034482759
# Plot perfect ROC curve and AUC score fpr, tpr, thresholds = roc_curve(y_test, y_test) plot_roc_curve(fpr, tpr)
Image in a Jupyter notebook
# Perfect AUC score roc_auc_score(y_test, y_test)
1.0

Confusion Matrix

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.

In essence, giving you an idea of where the model is getting confused.

from sklearn.metrics import confusion_matrix y_preds = clf.predict(X_test) confusion_matrix(y_test, y_preds)
array([[23, 6], [ 6, 26]])
# Visualize confusion matrix with pd.crosstab() pd.crosstab(y_test, y_preds, rownames=["Actual Labels"], colnames=["Predicted Labels"])
# Every test sample is counted in exactly one cell of the confusion matrix,
# so the cells add up to the number of test samples. Summing the matrix itself
# (rather than typing the four counts by hand) keeps this in step with the
# current predictions.
confusion_matrix(y_test, y_preds).sum()
61
len(X_test)
61
# # How to install a conda package into the current environment from a Jupyter Notebook
# import sys
# !conda install --yes --prefix {sys.prefix} seaborn
# Make our confusion matrix more visual with Seaborn's heatmap() import seaborn as sns # Set the font scale sns.set(font_scale=1.5) # Create a confusion matrix conf_mat = confusion_matrix(y_test, y_preds) # Plot it using Seaborn sns.heatmap(conf_mat);
Image in a Jupyter notebook

Note: In the original notebook, the function below had the "True label" as the x-axis label and the "Predicted label" as the y-axis label. But due to the way confusion_matrix() outputs values, these should be swapped around. The code below has been corrected.

def plot_conf_mat(conf_mat):
    """
    Draws conf_mat as an annotated Seaborn heatmap() on a 3x3 inch figure,
    with predicted labels on the x-axis and true labels on the y-axis.
    """
    _, axes = plt.subplots(figsize=(3, 3))

    # Annotate each box with its count and leave the colour bar off
    sns.heatmap(conf_mat, annot=True, cbar=False, ax=axes)

    axes.set_xlabel("Predicted label")
    axes.set_ylabel("True label")

    # Widen the y-limits by half a cell at each end to fix the broken
    # annotations (this happened in Matplotlib 3.1.1)
    lower, upper = axes.get_ylim()
    axes.set_ylim(lower + 0.5, upper - 0.5)


plot_conf_mat(conf_mat)
Image in a Jupyter notebook
from sklearn.metrics import plot_confusion_matrix

# Plot on the held-out test split only: clf was fitted on X_train, so passing
# the full X, y would mix training rows into the matrix.
plot_confusion_matrix(clf, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7ff630bc2d10>
Image in a Jupyter notebook

Classification Report

from sklearn.metrics import classification_report print(classification_report(y_test, y_preds))
precision recall f1-score support 0 0.79 0.79 0.79 29 1 0.81 0.81 0.81 32 accuracy 0.80 61 macro avg 0.80 0.80 0.80 61 weighted avg 0.80 0.80 0.80 61
# Where precision and recall become valuable disease_true = np.zeros(10000) disease_true[0] = 1 # only one positive case disease_preds = np.zeros(10000) # model predicts every case as 0 pd.DataFrame(classification_report(disease_true, disease_preds, output_dict=True))
/Users/daniel/Desktop/ml-course/zero-to-mastery-ml/env/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))

To summarize classification metrics:

  • Accuracy is a good measure to start with if all classes are balanced (e.g. the same number of samples labelled 0 and 1).

  • Precision and recall become more important when classes are imbalanced.

  • If false positive predictions are worse than false negatives, aim for higher precision.

  • If false negative predictions are worse than false positives, aim for higher recall.

  • F1-score is a combination of precision and recall.

4.2.2 Regression model evaluation metrics

Model evaluation metrics documentation - https://scikit-learn.org/stable/modules/model_evaluation.html

  1. R^2 (pronounced r-squared) or coefficient of determination.

  2. Mean absolute error (MAE)

  3. Mean squared error (MSE)

R^2

What R-squared does: Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1.

from sklearn.ensemble import RandomForestRegressor np.random.seed(42) X = boston_df.drop("target", axis=1) y = boston_df["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) model = RandomForestRegressor(n_estimators=100) model.fit(X_train, y_train);
model.score(X_test, y_test)
0.873969014117403
from sklearn.metrics import r2_score # Fill an array with y_test mean y_test_mean = np.full(len(y_test), y_test.mean())
y_test.mean()
21.488235294117644
# Model only predicting the mean gets an R^2 score of 0 r2_score(y_test, y_test_mean)
0.0
# Model predicting perfectly the correct values gets an R^2 score of 1 r2_score(y_test, y_test)
1.0

Mean absolute error (MAE)

MAE is the average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your model's predictions are.

# Mean absolute error from sklearn.metrics import mean_absolute_error y_preds = model.predict(X_test) mae = mean_absolute_error(y_test, y_preds) mae
2.1226372549019623
df = pd.DataFrame(data={"actual values": y_test, "predicted values": y_preds}) df["differences"] = df["predicted values"] - df["actual values"] df

Mean squared error (MSE)

# Mean squared error from sklearn.metrics import mean_squared_error y_preds = model.predict(X_test) mse = mean_squared_error(y_test, y_preds) mse
9.242328990196082
# Calculate MSE by hand squared = np.square(df["differences"]) squared.mean()
9.242328990196082

4.2.3 Finally using the scoring parameter

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] clf = RandomForestClassifier(n_estimators=100)
np.random.seed(42) cv_acc = cross_val_score(clf, X, y, cv=5, scoring=None) cv_acc
array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])
# Cross-validated accuracy print(f'The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%')
The cross-validated accuracy is: 82.48%
np.random.seed(42) cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy") print(f'The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%')
The cross-validated accuracy is: 82.48%
# Precision cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision") np.mean(cv_precision)
0.8085601538512754
# Recall cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall") np.mean(cv_recall)
0.8424242424242424
cv_f1 = cross_val_score(clf, X, y, cv=5, scoring="f1") np.mean(cv_f1)
0.841476533416832

How about our regression model?

from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestRegressor np.random.seed(42) X = boston_df.drop("target", axis=1) y = boston_df["target"] model = RandomForestRegressor(n_estimators=100)
np.random.seed(42) cv_r2 = cross_val_score(model, X, y, cv=5, scoring=None) np.mean(cv_r2)
0.622375083951403
np.random.seed(42) cv_r2 = cross_val_score(model, X, y, cv=5, scoring="r2") cv_r2
array([0.76861165, 0.85851765, 0.74941131, 0.47891315, 0.25642166])
# Mean absolute error cv_mae = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error") cv_mae
array([-2.12751961, -2.53956436, -3.42026733, -3.82432673, -3.06893069])
# Mean squared error cv_mse = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error") np.mean(cv_mse)
-21.02253826604542

4.3 Using different evaluation metrics as Scikit-Learn functions

Classification evaluation functions

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split np.random.seed(42) X = heart_disease.drop("target", axis=1) y = heart_disease["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf = RandomForestClassifier(n_estimators=100) clf.fit(X_train, y_train) # Make some predictions y_preds = clf.predict(X_test) # Evaluate the classifier print("Classifier metrics on the test set") print(f"Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%") print(f"Precision: {precision_score(y_test, y_preds)}") print(f"Recall: {recall_score(y_test, y_preds)}") print(f"F1: {f1_score(y_test, y_preds)}")
Classifier metrics on the test set Accuracy: 85.25% Precision: 0.8484848484848485 Recall: 0.875 F1: 0.8615384615384615

Regression evaluation functions

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split np.random.seed(42) X = boston_df.drop("target", axis=1) y = boston_df["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) model = RandomForestRegressor(n_estimators=100) model.fit(X_train, y_train) # Make predictions using our regression model y_preds = model.predict(X_test) # Evaluate the regression model print("Regression model metrics on the test set") print(f"R^2: {r2_score(y_test, y_preds)}") print(f"MAE: {mean_absolute_error(y_test, y_preds)}") print(f"MSE: {mean_squared_error(y_test, y_preds)}")
Regression model metrics on the test set R^2: 0.8739690141174031 MAE: 2.1226372549019623 MSE: 9.242328990196082

5. Improving a model

First predictions = baseline predictions. First model = baseline model.

From a data perspective:

  • Could we collect more data? (generally, the more data, the better)

  • Could we improve our data?

From a model perspective:

  • Is there a better model we could use?

  • Could we improve the current model?

Hyperparameters vs. Parameters

  • Parameters = the model finds these patterns in data

  • Hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns

Three ways to adjust hyperparameters:

  1. By hand

  2. Randomly with RandomizedSearchCV

  3. Exhaustively with GridSearchCV

from sklearn.ensemble import RandomForestClassifier

# The keyword is `n_estimators` (plural). `n_estimator` is not an argument of
# RandomForestClassifier, so passing it raises a TypeError.
clf = RandomForestClassifier(n_estimators=100)
clf.get_params()

5.1 Tuning hyperparameters by hand

Let's make 3 sets, training, validation and test.

clf.get_params()

We're going to try and adjust:

  • max_depth

  • max_features

  • min_samples_leaf

  • min_samples_split

  • n_estimators

def evaluate_preds(y_true, y_preds):
    """
    Compares y_true labels against y_preds labels for a classification
    problem.

    Prints accuracy (as a percentage), precision, recall and F1 to two
    decimal places, then returns the same four scores rounded to two
    decimal places in a dictionary keyed by "accuracy", "precision",
    "recall" and "f1".
    """
    acc = accuracy_score(y_true, y_preds)
    prec = precision_score(y_true, y_preds)
    rec = recall_score(y_true, y_preds)
    f1_val = f1_score(y_true, y_preds)

    # Report the scores
    report_lines = (f"Acc: {acc * 100:.2f}%",
                    f"Precision: {prec:.2f}",
                    f"Recall: {rec:.2f}",
                    f"F1 score: {f1_val:.2f}")
    for line in report_lines:
        print(line)

    # Pair each metric name with its score and round for the returned summary
    metric_names = ("accuracy", "precision", "recall", "f1")
    metric_values = (acc, prec, rec, f1_val)
    return {name: round(value, 2) for name, value in zip(metric_names, metric_values)}
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into X & y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split the data into train, validation & test sets
train_split = round(0.7 * len(heart_disease_shuffled)) # end of the first 70% of data (train)
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled)) # end of the next 15% of data (validation)
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
# The test labels must use the same slice as the test features
# (everything after valid_split), otherwise rows and labels don't line up
X_test, y_test = X[valid_split:], y[valid_split:]

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make baseline predictions
y_preds = clf.predict(X_valid)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics
np.random.seed(42) # Create a second classifier with different hyperparameters clf_2 = RandomForestClassifier(n_estimators=100) clf_2.fit(X_train, y_train) # Make predictions with different hyperparameters y_preds_2 = clf_2.predict(X_valid) # Evalute the 2nd classsifier clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

5.2 Hyperparameter tuning with RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200], "max_depth": [None, 5, 10, 20, 30], "max_features": ["auto", "sqrt"], "min_samples_split": [2, 4, 6], "min_samples_leaf": [1, 2, 4]} np.random.seed(42) # Split into X & y X = heart_disease_shuffled.drop("target", axis=1) y = heart_disease_shuffled["target"] # Split into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Instantiate RandomForestClassifier clf = RandomForestClassifier(n_jobs=1) # Setup RandomizedSearchCV rs_clf = RandomizedSearchCV(estimator=clf, param_distributions=grid, n_iter=10, # number of models to try cv=5, verbose=2) # Fit the RandomizedSearchCV version of clf rs_clf.fit(X_train, y_train);
rs_clf.best_params_
# Make predictions with the best hyperparameters rs_y_preds = rs_clf.predict(X_test) # Evaluate the predictions rs_metrics = evaluate_preds(y_test, rs_y_preds)

5.3 Hyperparameter tuning with GridSearchCV

grid
grid_2 = {'n_estimators': [100, 200, 500], 'max_depth': [None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [6], 'min_samples_leaf': [1, 2]}
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Split into X & y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup GridSearchCV
# (kept active rather than commented out: the cells below read gs_clf, so
# running the notebook from top to bottom needs it to be defined here)
gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid_2,
                      cv=5,
                      verbose=2)

# Fit the GridSearchCV version of clf
# (tries every combination in grid_2 with 5-fold cross-validation)
gs_clf.fit(X_train, y_train);
gs_clf.best_params_
gs_y_preds = gs_clf.predict(X_test) # evaluate the predictions gs_metrics = evaluate_preds(y_test, gs_y_preds)

Let's compare our different models metrics.

compare_metrics = pd.DataFrame({"baseline": baseline_metrics, "clf_2": clf_2_metrics, "random search": rs_metrics, "grid search": gs_metrics}) compare_metrics.plot.bar(figsize=(10, 8));

6. Saving and loading trained machine learning models

Two ways to save and load machine learning models:

  1. With Python's pickle module

  2. With the joblib module

Pickle

import pickle

# Save an existing model to file
# (the with-block closes the file handle once the model has been written)
with open("gs_random_random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)
# Load a saved model
# NOTE: only unpickle files you trust, since loading a pickle can execute
# arbitrary code. The with-block closes the file handle after reading.
with open("gs_random_random_forest_model_1.pkl", "rb") as model_file:
    loaded_pickle_model = pickle.load(model_file)
# Make some predictions pickle_y_preds = loaded_pickle_model.predict(X_test) evaluate_preds(y_test, pickle_y_preds)

Joblib

from joblib import dump, load # Save model to file dump(gs_clf, filename="gs_random_forest_model_1.joblib")
# Import a saved joblib model loaded_joblib_model = load(filename="gs_random_forest_model_1.joblib")
# Make and evaluate joblib predictions joblib_y_preds = loaded_joblib_model.predict(X_test) evaluate_preds(y_test, joblib_y_preds)

7. Putting it all together!

# Read the car sales data (with missing values). The path uses the same
# "../data/" prefix as the heart-disease.csv import at the top of this notebook.
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data
data.dtypes
data.isna().sum()

Steps we want to do (all in one cell):

  1. Fill missing data

  2. Convert data to numbers

  3. Build a model on the data

# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Import data and drop rows with missing labels
# (path uses the same "../data/" prefix as the heart-disease.csv import at the
# top of this notebook)
data = pd.read_csv("../data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# Define different features and transformer pipeline
# Categorical columns: fill gaps with the string "missing", then one-hot encode
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Doors column: fill gaps with the constant 4
door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

# Numeric column: fill gaps with the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)
    ])

# Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestRegressor())])

# Split data
X = data.drop("Price", axis=1)
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

It's also possible to use GridSearchCV or RandomizedSearchCV with our Pipeline.

# Use GridSearchCV with our regression Pipeline from sklearn.model_selection import GridSearchCV pipe_grid = { "preprocessor__num__imputer__strategy": ["mean", "median"], "model__n_estimators": [100, 1000], "model__max_depth": [None, 5], "model__max_features": ["auto"], "model__min_samples_split": [2, 4] } gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2) gs_model.fit(X_train, y_train)
gs_model.score(X_test, y_test)
what_were_covering