Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mrdbourke
GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/introduction-to-pandas-video.ipynb
874 views
Kernel: Python 3
import pandas as pd
# pandas has 2 main datatypes: Series and DataFrame.
# A Series is built here from a plain Python list of car makes.
series = pd.Series(["BMW", "Toyota", "Honda"])
series
0 BMW 1 Toyota 2 Honda dtype: object
# series = 1-dimensional
# Another Series, this time holding colour names.
colours = pd.Series(["Red", "Blue", "White"])
colours
0 Red 1 Blue 2 White dtype: object
# DataFrame = 2-dimensional
# Each dictionary key becomes a column name and each Series supplies that
# column's values.
car_data = pd.DataFrame({"Car make": series, "Colour": colours})
car_data
# Import data
# Read the CSV file into a DataFrame.
car_sales = pd.read_csv("car-sales.csv")
car_sales
# Exporting a dataframe
# index=False stops the row index from being written out as an extra column.
car_sales.to_csv("exported-car-sales.csv", index=False)
# Read the exported file back in so its contents can be inspected.
exported_car_sales = pd.read_csv("exported-car-sales.csv")
exported_car_sales

Describe data

# Attribute (accessed without parentheses)
car_sales.dtypes

# Function (called with parentheses)
# car_sales.to_csv()
Make object Colour object Odometer (KM) int64 Doors int64 Price object dtype: object
car_sales.columns
Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')
# Keep the column labels in a variable so they can be reused.
car_columns = car_sales.columns
car_columns
Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')
car_sales.index
RangeIndex(start=0, stop=10, step=1)
car_sales
car_sales.describe()
car_sales.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 5 columns): Make 10 non-null object Colour 10 non-null object Odometer (KM) 10 non-null int64 Doors 10 non-null int64 Price 10 non-null object dtypes: int64(2), object(3) memory usage: 528.0+ bytes
# Average of each numeric column. numeric_only=True skips the string columns
# (Make, Colour, Price), which pandas 2.0+ refuses to average.
car_sales.mean(numeric_only=True)
Odometer (KM) 78601.4 Doors 4.0 dtype: float64
# mean() works the same way on a standalone Series of numbers.
car_prices = pd.Series([3000, 1500, 111250])
car_prices.mean()
38583.333333333336
car_sales.sum()
Make ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo... Colour WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite Odometer (KM) 786014 Doors 40 Price $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00... dtype: object
car_sales["Doors"].sum()
40
len(car_sales)
10
car_sales

Viewing and selecting data

car_sales.head()
car_sales
car_sales.head(7)
car_sales.tail(3)
# .loc & .iloc
# The custom index is non-sequential and repeats the label 3, so selecting
# by label and selecting by position give different results.
animals = pd.Series(
    ["cat", "dog", "bird", "panda", "snake"],
    index=[0, 3, 9, 8, 3],
)
animals
0 cat 3 dog 9 bird 8 panda 3 snake dtype: object
animals.loc[3]
3 dog 3 snake dtype: object
animals.loc[9]
'bird'
car_sales.loc[3]
Make BMW Colour Black Odometer (KM) 11179 Doors 5 Price $22,000.00 Name: 3, dtype: object
animals
0 cat 3 dog 9 bird 8 panda 3 snake dtype: object
# .iloc refers to position
animals.iloc[3]
'panda'
# .loc refers to index
car_sales.loc[3]
Make BMW Colour Black Odometer (KM) 11179 Doors 5 Price $22,000.00 Name: 3, dtype: object
car_sales
animals
0 cat 3 dog 9 bird 8 panda 3 snake dtype: object
animals.iloc[:3]
0 cat 3 dog 9 bird dtype: object
car_sales.loc[:3]
car_sales.head(4)
car_sales["Make"]
0 Toyota 1 Honda 2 Toyota 3 BMW 4 Nissan 5 Toyota 6 Honda 7 Honda 8 Toyota 9 Nissan Name: Make, dtype: object
car_sales["Colour"]
0 White 1 Red 2 Blue 3 Black 4 White 5 Green 6 Blue 7 Blue 8 White 9 White Name: Colour, dtype: object
car_sales["Make"]
0 Toyota 1 Honda 2 Toyota 3 BMW 4 Nissan 5 Toyota 6 Honda 7 Honda 8 Toyota 9 Nissan Name: Make, dtype: object
car_sales["Odometer (KM)"]
0 150043 1 87899 2 32549 3 11179 4 213095 5 99213 6 45698 7 54738 8 60000 9 31600 Name: Odometer (KM), dtype: int64
car_sales.Odometer (KM)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-53-cffc0afb7077> in <module> ----> 1 car_sales.Odometer (KM) ~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name) 5177 if self._info_axis._can_hold_identifiers_and_holds_name(name): 5178 return self[name] -> 5179 return object.__getattribute__(self, name) 5180 5181 def __setattr__(self, name, value): AttributeError: 'DataFrame' object has no attribute 'Odometer'
car_sales[car_sales["Make"] == "Toyota"]
car_sales[car_sales["Odometer (KM)"] > 100000]
car_sales
pd.crosstab(car_sales["Make"], car_sales["Doors"])
# Groupby
# Average the numeric columns within each make. numeric_only=True is needed
# because Colour and Price are still strings at this point, and pandas 2.0+
# raises instead of silently dropping them.
car_sales.groupby(["Make"]).mean(numeric_only=True)
%matplotlib inline import matplotlib.pyplot as plt
car_sales["Odometer (KM)"].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x123781ad0>
Image in a Jupyter notebook
car_sales["Odometer (KM)"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1234c6f10>
Image in a Jupyter notebook
car_sales["Price"].dtype
dtype('O')
car_sales["Price"].plot()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-63-83bf2f29b5a6> in <module> ----> 1 car_sales["Price"].plot() ~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_core.py in __call__(self, *args, **kwargs) 792 data.columns = label_name 793 --> 794 return plot_backend.plot(data, kind=kind, **kwargs) 795 796 def line(self, x=None, y=None, **kwargs): ~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/__init__.py in plot(data, kind, **kwargs) 60 kwargs["ax"] = getattr(ax, "left_ax", ax) 61 plot_obj = PLOT_CLASSES[kind](data, **kwargs) ---> 62 plot_obj.generate() 63 plot_obj.draw() 64 return plot_obj.result ~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/core.py in generate(self) 277 def generate(self): 278 self._args_adjust() --> 279 self._compute_plot_data() 280 self._setup_subplots() 281 self._make_plot() ~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/core.py in _compute_plot_data(self) 412 # no non-numeric frames or series allowed 413 if is_empty: --> 414 raise TypeError("no numeric data to plot") 415 416 # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to TypeError: no numeric data to plot
car_sales
dataframe['amount'] = dataframe['amount'].str.replace('[\$\,\.]', '').astype(int)
# Strip the "$" and "," formatting so "Price" can be treated as a number.
# The decimal point is kept and parsed (float, then int) so "$4,000.00" becomes
# 4000 rather than 400000. regex=True is explicit because newer pandas treats
# the pattern as a literal string by default.
car_sales["Price"] = (
    car_sales["Price"]
    .str.replace(r"[\$,]", "", regex=True)
    .astype(float)
    .astype(int)
)
car_sales
car_sales["Price"].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1238437d0>
Image in a Jupyter notebook

Manipulating Data

car_sales["Make"].str.lower()
0 toyota 1 honda 2 toyota 3 bmw 4 nissan 5 toyota 6 honda 7 honda 8 toyota 9 nissan Name: Make, dtype: object
car_sales
car_sales["Make"] = car_sales["Make"].str.lower()
car_sales
car_sales
# Load the version of the dataset that contains missing values.
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing
car_sales_missing["Odometer"].mean()
92302.66666666667
car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean())
0 150043.000000 1 87899.000000 2 92302.666667 3 11179.000000 4 213095.000000 5 92302.666667 6 92302.666667 7 92302.666667 8 60000.000000 9 31600.000000 Name: Odometer, dtype: float64
car_sales_missing
# Fill missing odometer readings with the column mean. The result is assigned
# back to the column instead of using inplace=True on the selection, which
# pandas warns about and which has no effect under Copy-on-Write.
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(
    car_sales_missing["Odometer"].mean()
)
car_sales_missing
car_sales_missing.dropna()
car_sales_missing.dropna(inplace=True)
car_sales_missing
# Reload the original file to get the rows with missing values back.
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing
# dropna() returns a new DataFrame without the rows that contain missing
# values; car_sales_missing itself is left unchanged.
car_sales_missing_dropped = car_sales_missing.dropna()
car_sales_missing_dropped
# index=False matches the earlier export and avoids writing the row index as
# an unnamed extra column.
car_sales_missing_dropped.to_csv("car-sales-missing-dropped.csv", index=False)
car_sales
# Column from series
seats_column = pd.Series([5, 5, 5, 5, 5])

# New column called seats
# The Series has only five entries, so rows whose index label is not in it
# are left as missing values.
car_sales["Seats"] = seats_column
car_sales
# Fill the missing seat counts with 5. The result is assigned back to the
# column rather than using inplace=True on the selection, which pandas warns
# about and which has no effect under Copy-on-Write.
car_sales["Seats"] = car_sales["Seats"].fillna(5)
car_sales
# Column from Python list
# Unlike a Series, a list must have exactly one entry per row.
fuel_economy = [7.5, 9.2, 5.0, 9.6, 8.7, 4.7, 7.6, 8.7, 3.0, 4.5]
car_sales["Fuel per 100KM"] = fuel_economy
car_sales
car_sales["Total fuel used (L)"] = car_sales["Odometer (KM)"]/100 * car_sales["Fuel per 100KM"]
car_sales
# Create a column from a single value
# The scalar is repeated for every row.
car_sales["Number of wheels"] = 4
car_sales
# A boolean scalar is repeated for every row in the same way.
car_sales["Passed road saftey"] = True
car_sales.dtypes
Make object Colour object Odometer (KM) int64 Doors int64 Price int64 Seats float64 Fuel per 100KM float64 Total fuel used float64 Total fuel used (L) float64 Number of wheels int64 Passed road saftey bool dtype: object
car_sales
# Remove the leftover "Total fuel used" column (axis=1 means columns).
# errors="ignore" keeps this cell from raising KeyError when the notebook is
# run from the top and that column was never created.
car_sales = car_sales.drop("Total fuel used", axis=1, errors="ignore")
car_sales
car_sales_shuffled = car_sales.sample(frac=1)
car_sales_shuffled
# Only select 20% of data
# frac is a fraction of the rows, so 20% is 0.2.
car_sales_shuffled.sample(frac=0.2)
car_sales_shuffled
car_sales_shuffled.reset_index(drop=True, inplace=True)
car_sales_shuffled
car_sales
# Apply a function to every value in the column: divide each reading by 1.6.
# NOTE(review): this looks like a km -> miles conversion, but the column is
# still named "Odometer (KM)" afterwards - confirm that is intended.
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].apply(lambda x: x / 1.6)
car_sales
# Try it, run your code
# Search for it
# Try again
# Ask