GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/car-sales-data-manufacture.ipynb
⁸⁷⁴ views

Kernel: Python 3

Creating fake data for car_sales (to make it a bit bigger)

This notebook will manufacture data for the car_sales dataframe to make it usable to explain different techniques for missing data and converting things to numbers.

In [1]:

import pandas as pd
import numpy as np

car_sales = pd.read_csv('../data/car-sales.csv')

In [2]:

car_sales

Out[2]:

In [3]:

car_sales.Make.unique()

Out[3]:

array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)

In [4]:

car_sales.Make.value_counts()

Out[4]:

Toyota    4
Honda     3
Nissan    2
BMW       1
Name: Make, dtype: int64

Create fake "Make" data

In [12]:

# Create fake "Make" data

toyota = ["Toyota" for i in range(0, 393)]
len(toyota), toyota[:10]

Out[12]:

(393,
 ['Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota',
  'Toyota'])

In [13]:

honda = ["Honda" for i in range(0, 304)]
len(honda), honda[:10]

Out[13]:

(304,
 ['Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda',
  'Honda'])

In [14]:

nissan = ["Nissan" for i in range(0, 198)]
len(nissan), nissan[:10]

Out[14]:

(198,
 ['Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan',
  'Nissan'])

In [15]:

bmw = ["BMW" for i in range(0, 100)]
len(bmw), bmw[:10]

Out[15]:

(100, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])

In [17]:

makes = bmw+nissan+toyota+honda
len(makes)

Out[17]:

1000

Create fake "Colour" data

In [20]:

car_sales.Colour.unique()

Out[20]:

array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)

In [21]:

car_sales.Colour.value_counts()

Out[21]:

White    4
Blue     3
Green    1
Black    1
Red      1
Name: Colour, dtype: int64

In [29]:

white = ["White" for i in range(0, 407)]
len(white), white[:3]

Out[29]:

(407, ['White', 'White', 'White'])

In [30]:

blue = ["Blue" for i in range(0, 321)]
len(blue), blue[:3]

Out[30]:

(321, ['Blue', 'Blue', 'Blue'])

In [31]:

green = ["Green" for i in range(0, 79)]
len(green), green[:3]

Out[31]:

(79, ['Green', 'Green', 'Green'])

In [32]:

black = ["Black" for i in range(0, 99)]
len(black), black[:3]

Out[32]:

(99, ['Black', 'Black', 'Black'])

In [35]:

red = ["Red" for i in range(0, 94)]
len(red), red[:3]

Out[35]:

(94, ['Red', 'Red', 'Red'])

In [36]:

colours = white+blue+green+black+red
len(colours)

Out[36]:

1000

In [62]:

import random
colours_shuffled = random.sample(colours, len(colours))
len(colours_shuffled), colours_shuffled[:10]

Out[62]:

(1000,
 ['White',
  'White',
  'Blue',
  'Blue',
  'Blue',
  'White',
  'Blue',
  'Blue',
  'Red',
  'White'])

Create fake Odometer (KM) data

In [63]:

car_sales

Out[63]:

In [64]:

odometer = [random.randint(9789, 250000) for i in range(0, 1000)]
len(odometer), odometer[:10]

Out[64]:

Create fake "Doors" data

In [65]:

five_doors = [5 for i in range(0, 79)]
three_doors = [3 for i in range(0, 65)]
four_doors = [4 for i in range(0, 856)]
doors = five_doors + three_doors + four_doors
doors_shuffled = random.sample(doors, len(doors))

In [66]:

doors_shuffled

Out[66]:

[4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 3,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 5,
 5,
 3,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 5,
 4,
 4,
 3,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4]

Create fake "Price" data

In [68]:

makes_series = pd.Series(makes)
makes_series.value_counts()

Out[68]:

Toyota    398
Honda     304
Nissan    198
BMW       100
dtype: int64

In [69]:

car_sales

Out[69]:

In [71]:

car_sales[car_sales["Make"] == "Toyota"]

Out[71]:

In [75]:

car_sales[car_sales["Make"] == "Honda"]

Out[75]:

In [76]:

car_sales[car_sales["Make"] == "Nissan"]

Out[76]:

In [119]:

prices = [random.randint(5000, 30000) for i in range(0, 1000)]
len(prices), prices[:30]

Out[119]:

Create base dataframe with manufactured data

In [136]:

fake_sales = pd.DataFrame(columns = ["Make", "Colour", "Odometer (KM)", "Doors", "Price"])
fake_sales

Out[136]:

In [137]:

fake_sales["Make"] = makes
fake_sales["Colour"] = colours_shuffled
fake_sales["Odometer (KM)"] = odometer
fake_sales["Doors"] = doors
fake_sales["Price"] = prices

In [138]:

fake_sales.head()

Out[138]:

Adjust the price column

For the price column:

Generate random numbers between the certain values
If the Odometer reading is above 100K, multiply price by 0.75
If the Odometer reading is above 150K, multiply price by 0.6
If the Odometer reading is above 200K, multiply price by 0.5
If the Make column is BMW, multiply price by 1.5 + 2500
If the Make column is Toyota, multuply price by 1.2
If the Make is Nissan, multiply price by 1.1
If the Make is Honda, add $1000 to price

In [139]:

fake_sales["Price"].describe()

Out[139]:

count     1000.000000
mean     17369.943000
std       7260.398755
min       5005.000000
25%      11039.500000
50%      17427.500000
75%      23353.500000
max      29990.000000
Name: Price, dtype: float64

In [140]:

def price_od(price, odometer):
    """
    Changes price according to Odometer values.
    """
    if 100000 <= odometer <= 150000:
        return round(price * 0.75)
    elif 150001 <= odometer <= 200000:
        return round(price * 0.6)
    elif 200001 <= odometer:
        return round(price * 0.5)
    else:
        return price

fake_sales["Price"] = fake_sales.apply(lambda x: price_od(x["Price"], 
                                                          x["Odometer (KM)"]), 
                                                          axis=1)

fake_sales["Price"].describe()

Out[140]:

count     1000.000000
mean     13151.713000
std       6722.177036
min       2509.000000
25%       7854.750000
50%      12016.000000
75%      17082.250000
max      29990.000000
Name: Price, dtype: float64

In [141]:

def price_make(price, make):
    """
    Manipulates the price base on the cars make.
    """
    if make == "BMW":
        return round((price * 1.5) + random.randint(3000, 10000))
    elif make == "Toyota":
        return round(price * 1.2)
    elif make == "Nissan":
        return round(price * 1.1)
    elif make == "Honda":
        return round(price + 1000)
    else:
        return price

fake_sales["Price"] = fake_sales.apply(lambda x: price_make(x["Price"], 
                                                            x["Make"]), 
                                                            axis=1)

fake_sales["Price"].describe()

Out[141]:

count     1000.000000
mean     16045.665000
std       8630.794219
min       2796.000000
25%       9481.500000
50%      14264.000000
75%      20738.750000
max      52458.000000
Name: Price, dtype: float64

In [142]:

fake_sales = fake_sales.sample(frac=1)

In [143]:

fake_sales.reset_index(drop=True, inplace=True)
fake_sales.head(10)

Out[143]:

Drop some values at random (to manufacture missing data)
Build a random forest model to predict (this will involve changing categories to numerical data)

In [146]:

# Export the data
fake_sales.to_csv("../data/car-sales-extended.csv")

Make missing data in car_sales_extended

In [61]:

sales_ext = pd.read_csv("../data/car-sales-extended.csv")

In [62]:

len(sales_ext)

Out[62]:

1000

In [63]:

sales_ext

Out[63]:

What we want to do

Remove some rows values or replace them at random
- E.g. replace strings with empty strings ("")
- And numbers with NaN or something similar...
Want to keep the number of samples the same, order the same, just put some holes in it

One way to do it would be to generate 50 random integers for each column and then drop/replace the indicies.

In [64]:

# Replicate the df
sales_ext_dropped = sales_ext

In [65]:

# Make column
np.random.seed(10)
make_idx = np.random.randint(0, 1000, 50)

In [77]:

make_idx

Out[77]:

array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881,
         8,  73, 256, 490,  40, 502, 420, 371, 528, 356, 239, 395,  54,
       344, 363, 122, 574, 545, 200, 868, 974, 689, 691,  54,  77, 453,
        13, 755, 409, 382, 653, 860, 342, 798, 670,  89, 652])

In [66]:

for value in make_idx:
    sales_ext_dropped.loc[value, "Make"] = ""

In [67]:

sales_ext_dropped["Make"][266]

Out[67]:

'Honda'

In [68]:

# Colour column
np.random.seed(42)
colour_idx = np.random.randint(0, 1000, 50)
for value in colour_idx:
    sales_ext_dropped.loc[value, "Colour"] = ""

In [69]:

# Odometer (KM) column
np.random.seed(1)
odom_idx = np.random.randint(0, 1000, 50)
for value in odom_idx:
    sales_ext_dropped.loc[value, "Odometer (KM)"] = None

In [70]:

# Doors column
np.random.seed(2)
door_idx = np.random.randint(0, 1000, 50)
for value in door_idx:
    sales_ext_dropped.loc[value, "Doors"] = None

In [71]:

# Price column
np.random.seed(3)
price_idx = np.random.randint(0, 1000, 50)
for value in price_idx:
    sales_ext_dropped.loc[value, "Price"] = None

In [72]:

sales_ext_dropped.head(50)

Out[72]:

In [73]:

# Check how many of our values are missing/NaN
sales_ext_dropped.isna().sum()

Out[73]:

Make              0
Colour            0
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [74]:

# Export dataframe with random missing values
sales_ext_dropped.to_csv("../data/car-sales-extended-missing-data.csv", index=False)

In [ ]: