CoCalc -- introduction-to-pandas-video.ipynb

GitHub Repository: mrdbourke/zero-to-mastery-ml
Path: blob/master/section-2-data-science-and-ml-tools/introduction-to-pandas-video.ipynb
⁸⁷⁴ views

Kernel: Python 3

In [1]:

import pandas as pd

In [2]:

# pandas has 2 main datatypes: Series and DataFrame.
# Start with a Series holding three car makes.
series = pd.Series(data=["BMW", "Toyota", "Honda"])

In [3]:

series

Out[3]:

     BMW
  Toyota
   Honda
dtype: object

In [4]:

# series = 1-dimensional

In [5]:

colours = pd.Series(["Red", "Blue", "White"])
colours

Out[5]:

    Red
   Blue
  White
dtype: object

In [6]:

# DataFrame = 2-dimensional
car_data = pd.DataFrame({"Car make": series, "Colour": colours})
car_data

Out[6]:

In [7]:

# Import data
car_sales = pd.read_csv("car-sales.csv")

In [8]:

car_sales

Out[8]:

In [11]:

# Exporting a dataframe
car_sales.to_csv("exported-car-sales.csv", index=False)

In [12]:

exported_car_sales = pd.read_csv("exported-car-sales.csv")
exported_car_sales

Out[12]:

Describe data

In [13]:

# Attribute
car_sales.dtypes

# Function
#car_sales.to_csv()

Out[13]:

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [14]:

car_sales.columns

Out[14]:

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [16]:

car_columns = car_sales.columns
car_columns

Out[16]:

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [18]:

car_sales.index

Out[18]:

RangeIndex(start=0, stop=10, step=1)

In [19]:

car_sales

Out[19]:

In [20]:

car_sales.describe()

Out[20]:

In [21]:

car_sales.info()

Out[21]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
Make             10 non-null object
Colour           10 non-null object
Odometer (KM)    10 non-null int64
Doors            10 non-null int64
Price            10 non-null object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes

In [22]:

# Column-wise mean of the numeric columns only.
# Make, Colour and Price are object (string) columns; pandas >= 2.0 no longer
# drops such columns silently and raises TypeError instead, so ask for the
# numeric columns explicitly.
car_sales.mean(numeric_only=True)

Out[22]:

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [23]:

# Mean of a small, purely numeric Series
car_prices = pd.Series(data=[3000, 1500, 111250])
car_prices.mean()

Out[23]:

38583.333333333336

In [24]:

car_sales.sum()

Out[24]:

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

In [25]:

car_sales["Doors"].sum()

Out[25]:

40

In [26]:

len(car_sales)

Out[26]:

10

In [27]:

car_sales

Out[27]:

Viewing and selecting data

In [28]:

car_sales.head()

Out[28]:

In [29]:

car_sales

Out[29]:

In [30]:

car_sales.head(7)

Out[30]:

In [32]:

car_sales.tail(3)

Out[32]:

In [35]:

# .loc & .iloc
# Example Series with a custom index: the labels are not in order and the
# label 3 is used twice, which makes label-based and position-based lookups differ.
animals = pd.Series(
    data=["cat", "dog", "bird", "panda", "snake"],
    index=[0, 3, 9, 8, 3],
)

In [36]:

animals

Out[36]:

    cat
    dog
   bird
  panda
  snake
dtype: object

In [37]:

animals.loc[3]

Out[37]:

3      dog
3    snake
dtype: object

In [38]:

animals.loc[9]

Out[38]:

'bird'

In [40]:

car_sales.loc[3]

Out[40]:

Make                    BMW
Colour                Black
Odometer (KM)         11179
Doors                     5
Price            $22,000.00
Name: 3, dtype: object

In [42]:

animals

Out[42]:

    cat
    dog
   bird
  panda
  snake
dtype: object

In [41]:

# .iloc refers to position
animals.iloc[3]

Out[41]:

'panda'

In [43]:

# .loc refers to index
car_sales.loc[3]

Out[43]:

Make                    BMW
Colour                Black
Odometer (KM)         11179
Doors                     5
Price            $22,000.00
Name: 3, dtype: object

In [44]:

car_sales

Out[44]:

In [46]:

animals

Out[46]:

    cat
    dog
   bird
  panda
  snake
dtype: object

In [45]:

animals.iloc[:3]

Out[45]:

   cat
   dog
  bird
dtype: object

In [47]:

car_sales.loc[:3]

Out[47]:

In [48]:

car_sales.head(4)

Out[48]:

In [49]:

car_sales["Make"]

Out[49]:

  Toyota
   Honda
  Toyota
     BMW
  Nissan
  Toyota
   Honda
   Honda
  Toyota
  Nissan
Name: Make, dtype: object

In [50]:

car_sales["Colour"]

Out[50]:

  White
    Red
   Blue
  Black
  White
  Green
   Blue
   Blue
  White
  White
Name: Colour, dtype: object

In [51]:

car_sales["Make"]

Out[51]:

  Toyota
   Honda
  Toyota
     BMW
  Nissan
  Toyota
   Honda
   Honda
  Toyota
  Nissan
Name: Make, dtype: object

In [54]:

car_sales["Odometer (KM)"]

Out[54]:

  150043
   87899
   32549
   11179
  213095
   99213
   45698
   54738
   60000
   31600
Name: Odometer (KM), dtype: int64

In [53]:

# Intentional failure: dot (attribute) access only works for column names that
# are valid Python identifiers. "Odometer (KM)" is not one, so pandas looks for
# an attribute called `Odometer` and raises AttributeError. Use bracket access,
# car_sales["Odometer (KM)"], for column names like this.
car_sales.Odometer (KM)

Out[53]:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-53-cffc0afb7077> in <module>
----> 1 car_sales.Odometer (KM)

~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'Odometer'

In [55]:

car_sales[car_sales["Make"] == "Toyota"]

Out[55]:

In [56]:

car_sales[car_sales["Odometer (KM)"] > 100000]

Out[56]:

In [58]:

car_sales

Out[58]:

In [57]:

pd.crosstab(car_sales["Make"], car_sales["Doors"])

Out[57]:

In [59]:

# Groupby
# Average of every numeric column per car make. Colour and Price are still
# string (object) columns at this point; pandas >= 2.0 raises TypeError when
# asked to average them, so restrict the aggregation to numeric columns.
car_sales.groupby(["Make"]).mean(numeric_only=True)

Out[59]:

In [61]:

%matplotlib inline
import matplotlib.pyplot as plt

In [60]:

car_sales["Odometer (KM)"].plot()

Out[60]:

<matplotlib.axes._subplots.AxesSubplot at 0x123781ad0>

In [62]:

car_sales["Odometer (KM)"].hist()

Out[62]:

<matplotlib.axes._subplots.AxesSubplot at 0x1234c6f10>

In [64]:

car_sales["Price"].dtype

Out[64]:

dtype('O')

In [63]:

# Intentional failure: "Price" is still stored as strings (object dtype), so
# there is no numeric data to draw and .plot() raises TypeError.
car_sales["Price"].plot()

Out[63]:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-63-83bf2f29b5a6> in <module>
----> 1 car_sales["Price"].plot()

~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_core.py in __call__(self, *args, **kwargs)
    792                     data.columns = label_name
    793 
--> 794         return plot_backend.plot(data, kind=kind, **kwargs)
    795 
    796     def line(self, x=None, y=None, **kwargs):
~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/__init__.py in plot(data, kind, **kwargs)
     60             kwargs["ax"] = getattr(ax, "left_ax", ax)
     61     plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 62     plot_obj.generate()
     63     plot_obj.draw()
     64     return plot_obj.result
~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/core.py in generate(self)
    277     def generate(self):
    278         self._args_adjust()
--> 279         self._compute_plot_data()
    280         self._setup_subplots()
    281         self._make_plot()
~/Desktop/ml-course/sample-project/env/lib/python3.7/site-packages/pandas/plotting/_matplotlib/core.py in _compute_plot_data(self)
    412         # no non-numeric frames or series allowed
    413         if is_empty:
--> 414             raise TypeError("no numeric data to plot")
    415 
    416         # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to
TypeError: no numeric data to plot

In [65]:

car_sales

Out[65]:

In [ ]:

# Reference snippet only -- kept as a comment because `dataframe` is not
# defined anywhere in this notebook, so running it raises NameError and stops
# a Restart & Run All. The next cell applies the same idea to car_sales["Price"].
#
# dataframe['amount'] = dataframe['amount'].str.replace('[\$\,\.]', '').astype(int)

In [66]:

# Convert "Price" from strings such as "$4,000.00" into whole-dollar integers.
# - Raw string + regex=True: since pandas 2.0, str.replace treats the pattern as
#   a literal string unless regex=True is passed, so the character class would
#   otherwise match nothing and the int conversion would fail.
# - Only "$" and "," are removed. Removing "." as well would merge the cents
#   into the number and make every price 100x too large ("$4,000.00" -> 400000).
# - Parse as float first so the ".00" part is accepted, then cast to int.
car_sales["Price"] = (
    car_sales["Price"]
    .str.replace(r"[\$,]", "", regex=True)
    .astype(float)
    .astype(int)
)

In [67]:

car_sales

Out[67]:

In [68]:

car_sales["Price"].plot()

Out[68]:

<matplotlib.axes._subplots.AxesSubplot at 0x1238437d0>

Manipulating Data

In [69]:

car_sales["Make"].str.lower()

Out[69]:

  toyota
   honda
  toyota
     bmw
  nissan
  toyota
   honda
   honda
  toyota
  nissan
Name: Make, dtype: object

In [70]:

car_sales

Out[70]:

In [71]:

car_sales["Make"] = car_sales["Make"].str.lower()

In [72]:

car_sales

Out[72]:

In [73]:

car_sales

Out[73]:

In [74]:

car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing

Out[74]:

In [75]:

car_sales_missing["Odometer"].mean()

Out[75]:

92302.66666666667

In [76]:

car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean())

Out[76]:

  150043.000000
   87899.000000
   92302.666667
   11179.000000
  213095.000000
   92302.666667
   92302.666667
   92302.666667
   60000.000000
   31600.000000
Name: Odometer, dtype: float64

In [78]:

car_sales_missing

Out[78]:

In [79]:

# Fill the missing odometer readings with the column mean and assign the
# result back to the DataFrame. Calling fillna(..., inplace=True) on a selected
# column is chained assignment: it mutates an intermediate Series, emits a
# FutureWarning in pandas 2.x and leaves the DataFrame unchanged when
# Copy-on-Write is enabled.
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(
    car_sales_missing["Odometer"].mean()
)

In [80]:

car_sales_missing

Out[80]:

In [81]:

car_sales_missing.dropna()

Out[81]:

In [83]:

car_sales_missing.dropna(inplace=True)

In [84]:

car_sales_missing

Out[84]:

In [89]:

car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing

Out[89]:

In [90]:

car_sales_missing_dropped = car_sales_missing.dropna()
car_sales_missing_dropped

Out[90]:

In [91]:

# Export the rows that survived dropna(). index=False matches the earlier
# export of car_sales and keeps the row labels from being written as an extra
# unnamed column that would reappear when the file is read back.
car_sales_missing_dropped.to_csv("car-sales-missing-dropped.csv", index=False)

In [92]:

car_sales

Out[92]:

In [93]:

# Column from a Series: build one that is shorter than the DataFrame
# (5 values for 10 rows)
seats_column = pd.Series(data=[5] * 5)

# Add it as a new "Seats" column; values are matched on index labels, so the
# rows with no matching label are left as missing values
car_sales["Seats"] = seats_column
car_sales

Out[93]:

In [94]:

# Fill the missing "Seats" values with 5 and assign the result back.
# fillna(..., inplace=True) on a selected column is chained assignment: it
# warns in pandas 2.x and does not update the DataFrame under Copy-on-Write.
car_sales["Seats"] = car_sales["Seats"].fillna(5)

In [96]:

car_sales

Out[96]:

In [99]:

# Column from Python list
fuel_economy = [7.5, 9.2, 5.0, 9.6, 8.7, 4.7, 7.6, 8.7, 3.0, 4.5]
car_sales["Fuel per 100KM"] = fuel_economy
car_sales

Out[99]:

In [102]:

car_sales["Total fuel used (L)"] = car_sales["Odometer (KM)"]/100 * car_sales["Fuel per 100KM"]

In [103]:

car_sales

Out[103]:

In [104]:

# Create a column from a single value
car_sales["Number of wheels"] = 4
car_sales

Out[104]:

In [106]:

car_sales["Passed road saftey"] = True
car_sales.dtypes

Out[106]:

Make                    object
Colour                  object
Odometer (KM)            int64
Doors                    int64
Price                    int64
Seats                  float64
Fuel per 100KM         float64
Total fuel used        float64
Total fuel used (L)    float64
Number of wheels         int64
Passed road saftey        bool
dtype: object

In [107]:

car_sales

Out[107]:

In [110]:

car_sales = car_sales.drop("Total fuel used", axis=1)

In [111]:

car_sales

Out[111]:

In [125]:

car_sales_shuffled = car_sales.sample(frac=1)

In [126]:

car_sales_shuffled

Out[126]:

In [127]:

# Only select 20% of data
# frac is a fraction of the rows (0.2 == 20%). The previous value, 0.01, is 1%,
# which rounds to zero rows for this 10-row table.
car_sales_shuffled.sample(frac=0.2)

Out[127]:

In [128]:

car_sales_shuffled

Out[128]:

In [129]:

car_sales_shuffled.reset_index(drop=True, inplace=True)

In [130]:

car_sales_shuffled

Out[130]:

In [131]:

car_sales

Out[131]:

In [132]:

# Demonstrates .apply() with a lambda: every odometer reading is divided by 1.6
# (presumably a kilometres -> miles conversion -- confirm) and written back
# over the same column.
# NOTE(review): the column is still labelled "(KM)" after the division, and
# because the column is overwritten, running this cell again divides the
# values a second time.
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].apply(lambda x: x / 1.6)
car_sales

Out[132]:

In [ ]:

# Try it, run your code
# Search for it
# Try again
# Ask

Describe data

Viewing and selecting data

Manipulating Data

Product

Resources

Company