Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mrdbourke
GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/car-sales-data-manufacture.ipynb
874 views
Kernel: Python 3

Creating fake data for car_sales (to make it a bit bigger)

This notebook will manufacture data for the car_sales dataframe to make it usable to explain different techniques for missing data and converting things to numbers.

import pandas as pd import numpy as np car_sales = pd.read_csv('../data/car-sales.csv')
car_sales
car_sales.Make.unique()
array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)
car_sales.Make.value_counts()
Toyota 4 Honda 3 Nissan 2 BMW 1 Name: Make, dtype: int64

Create fake "Make" data

# Create fake "Make" data
# 398 Toyotas so the four makes add up to exactly 1,000 rows
# (100 BMW + 198 Nissan + 398 Toyota + 304 Honda), the same length as the
# other manufactured columns.
toyota = ["Toyota"] * 398
len(toyota), toyota[:10]
(393, ['Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota'])
honda = ["Honda" for i in range(0, 304)] len(honda), honda[:10]
(304, ['Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda'])
nissan = ["Nissan" for i in range(0, 198)] len(nissan), nissan[:10]
(198, ['Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan'])
bmw = ["BMW" for i in range(0, 100)] len(bmw), bmw[:10]
(100, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])
makes = bmw+nissan+toyota+honda len(makes)
1000

Create fake "Colour" data

car_sales.Colour.unique()
array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)
car_sales.Colour.value_counts()
White 4 Blue 3 Green 1 Black 1 Red 1 Name: Colour, dtype: int64
white = ["White" for i in range(0, 407)] len(white), white[:3]
(407, ['White', 'White', 'White'])
blue = ["Blue" for i in range(0, 321)] len(blue), blue[:3]
(321, ['Blue', 'Blue', 'Blue'])
green = ["Green" for i in range(0, 79)] len(green), green[:3]
(79, ['Green', 'Green', 'Green'])
black = ["Black" for i in range(0, 99)] len(black), black[:3]
(99, ['Black', 'Black', 'Black'])
red = ["Red" for i in range(0, 94)] len(red), red[:3]
(94, ['Red', 'Red', 'Red'])
colours = white+blue+green+black+red len(colours)
1000
import random colours_shuffled = random.sample(colours, len(colours)) len(colours_shuffled), colours_shuffled[:10]
(1000, ['White', 'White', 'Blue', 'Blue', 'Blue', 'White', 'Blue', 'Blue', 'Red', 'White'])

Create fake Odometer (KM) data

car_sales
odometer = [random.randint(9789, 250000) for i in range(0, 1000)] len(odometer), odometer[:10]
(1000, [195419, 69066, 209466, 79301, 134103, 143651, 245427, 244095, 176660, 194189])

Create fake "Doors" data

# Build the door counts by list repetition: 79 five-door, 65 three-door and
# 856 four-door cars (1,000 in total).
five_doors = [5] * 79
three_doors = [3] * 65
four_doors = [4] * 856
doors = five_doors + three_doors + four_doors

# Sampling the full length without replacement yields a shuffled copy and
# leaves `doors` itself untouched.
doors_shuffled = random.sample(doors, len(doors))
doors_shuffled
[4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 4, 3, 5, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 4, 4, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3, 5, 4, 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 5, 5, 3, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 4, 4, 3, 5, 4, 4, 4, 4, 4, 4, 
4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 3, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 3, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 5, 5, 3, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4]

Create fake "Price" data

makes_series = pd.Series(makes) makes_series.value_counts()
Toyota 398 Honda 304 Nissan 198 BMW 100 dtype: int64
car_sales
car_sales[car_sales["Make"] == "Toyota"]
car_sales[car_sales["Make"] == "Honda"]
car_sales[car_sales["Make"] == "Nissan"]
prices = [random.randint(5000, 30000) for i in range(0, 1000)] len(prices), prices[:30]
(1000, [27185, 8815, 23614, 19783, 29208, 17251, 10138, 27640, 7332, 9946, 29670, 12779, 26735, 21481, 9313, 13094, 17684, 21389, 5239, 16733, 19670, 10542, 11122, 21311, 29545, 20601, 22714, 28876, 14063, 5491])

Create base dataframe with manufactured data

# Assemble the manufactured columns into one dataframe.
# Doors uses the shuffled list: the unshuffled `doors` is grouped by door
# count and `makes` is grouped by make, so pairing them row-by-row would tie
# every 5-door and 3-door car to the makes at the start of the list.
fake_sales = pd.DataFrame(
    {
        "Make": makes,
        "Colour": colours_shuffled,
        "Odometer (KM)": odometer,
        "Doors": doors_shuffled,
        "Price": prices,
    }
)
fake_sales
fake_sales.head()

Adjust the price column

For the price column:

  • Generate random numbers between certain values

  • If the Odometer reading is between 100K and 150K, multiply price by 0.75

  • If the Odometer reading is above 150K and up to 200K, multiply price by 0.6

  • If the Odometer reading is above 200K, multiply price by 0.5

  • If the Make column is BMW, multiply price by 1.5 and add a random amount between 3,000 and 10,000

  • If the Make column is Toyota, multiply price by 1.2

  • If the Make is Nissan, multiply price by 1.1

  • If the Make is Honda, add $1000 to price

fake_sales["Price"].describe()
count 1000.000000 mean 17369.943000 std 7260.398755 min 5005.000000 25% 11039.500000 50% 17427.500000 75% 23353.500000 max 29990.000000 Name: Price, dtype: float64
def price_od(price, odometer):
    """
    Changes price according to Odometer values.

    Parameters
    ----------
    price : number
        Price before the odometer discount.
    odometer : number
        Odometer reading, in KM.

    Returns
    -------
    number
        round(price * 0.75) for readings from 100,000 up to 150,000,
        round(price * 0.6) for readings above 150,000 up to 200,000,
        round(price * 0.5) for readings above 200,000,
        and the unchanged price otherwise.
    """
    # Test the highest band first so neighbouring bands share a boundary.
    # Integer readings land in the same band as before, and fractional
    # readings such as 150000.5 no longer slip between bands undiscounted.
    if odometer > 200000:
        return round(price * 0.5)
    if odometer > 150000:
        return round(price * 0.6)
    if odometer >= 100000:
        return round(price * 0.75)
    return price


# Apply the odometer discount row by row.
fake_sales["Price"] = fake_sales.apply(
    lambda row: price_od(row["Price"], row["Odometer (KM)"]), axis=1
)
fake_sales["Price"].describe()
count 1000.000000 mean 13151.713000 std 6722.177036 min 2509.000000 25% 7854.750000 50% 12016.000000 75% 17082.250000 max 29990.000000 Name: Price, dtype: float64
def price_make(price, make):
    """
    Adjust a price according to the car's make.

    BMW prices are multiplied by 1.5 and a random amount between 3000 and
    10000 is added; Toyota prices are multiplied by 1.2; Nissan prices by
    1.1; Honda prices have 1000 added. Any other make keeps its price.
    Every adjusted price is rounded.
    """
    if make == "BMW":
        # The random premium is drawn only for BMW rows.
        return round((price * 1.5) + random.randint(3000, 10000))
    if make == "Toyota":
        return round(price * 1.2)
    if make == "Nissan":
        return round(price * 1.1)
    if make == "Honda":
        return round(price + 1000)
    return price


# Apply the make adjustment row by row.
fake_sales["Price"] = fake_sales.apply(
    lambda row: price_make(row["Price"], row["Make"]), axis=1
)
fake_sales["Price"].describe()
count 1000.000000 mean 16045.665000 std 8630.794219 min 2796.000000 25% 9481.500000 50% 14264.000000 75% 20738.750000 max 52458.000000 Name: Price, dtype: float64
# Shuffle every row (frac=1 samples the whole frame), then renumber the index
# from zero and discard the old one.
fake_sales = fake_sales.sample(frac=1).reset_index(drop=True)
fake_sales.head(10)

NEXT:

  • Drop some values at random (to manufacture missing data)

  • Build a random forest model to predict (this will involve changing categories to numerical data)

# Export the data
# index=False keeps the row index out of the file, so reading the CSV back
# does not produce an extra "Unnamed: 0" column.
fake_sales.to_csv("../data/car-sales-extended.csv", index=False)

Make missing data in car_sales_extended

sales_ext = pd.read_csv("../data/car-sales-extended.csv")
len(sales_ext)
1000
sales_ext

What we want to do

  • Remove some row values or replace them at random

    • E.g. replace strings with empty strings ("")

    • And numbers with NaN or something similar...

  • Want to keep the number of samples the same, order the same, just put some holes in it

One way to do it would be to generate 50 random integers for each column and then drop/replace the values at those indices.

# Replicate the df
# .copy() gives an independent dataframe; plain assignment would only create
# a second name for sales_ext, so blanking values would also change it.
sales_ext_dropped = sales_ext.copy()
# Make column
# Seed first so the same 50 row positions are drawn on every run.
np.random.seed(10)
make_idx = np.random.randint(low=0, high=1000, size=50)
make_idx
array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881, 8, 73, 256, 490, 40, 502, 420, 371, 528, 356, 239, 395, 54, 344, 363, 122, 574, 545, 200, 868, 974, 689, 691, 54, 77, 453, 13, 755, 409, 382, 653, 860, 342, 798, 670, 89, 652])
# Blank out the Make value on every sampled row in a single assignment.
sales_ext_dropped.loc[make_idx, "Make"] = ""
sales_ext_dropped["Make"][266]
'Honda'
# Draw 50 row positions per column, re-seeding before each draw so every
# column gets its own reproducible set of positions.
np.random.seed(42)
colour_idx = np.random.randint(0, 1000, 50)  # Colour column
np.random.seed(1)
odom_idx = np.random.randint(0, 1000, 50)  # Odometer (KM) column
np.random.seed(2)
door_idx = np.random.randint(0, 1000, 50)  # Doors column
np.random.seed(3)
price_idx = np.random.randint(0, 1000, 50)  # Price column

# Colour is blanked with an empty string; the other three columns get None.
holes = (
    ("Colour", colour_idx, ""),
    ("Odometer (KM)", odom_idx, None),
    ("Doors", door_idx, None),
    ("Price", price_idx, None),
)
for column, positions, filler in holes:
    for value in positions:
        sales_ext_dropped.loc[value, column] = filler
sales_ext_dropped.head(50)
# Check how many of our values are missing/NaN sales_ext_dropped.isna().sum()
Make 0 Colour 0 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Export dataframe with random missing values sales_ext_dropped.to_csv("../data/car-sales-extended-missing-data.csv", index=False)