"""
2
Title: Timeseries forecasting for weather prediction
3
Authors: [Prabhanshu Attri](https://prabhanshu.com/github), [Yashika Sharma](https://github.com/yashika51), [Kristi Takach](https://github.com/ktakattack), [Falak Shah](https://github.com/falaktheoptimist)
4
Date created: 2020/06/23
5
Last modified: 2023/11/22
6
Description: This notebook demonstrates how to do timeseries forecasting using a LSTM model.
7
Accelerator: GPU
8
"""

"""
## Setup
"""

import pandas as pd
import matplotlib.pyplot as plt
import keras
"""
19
## Climate Data Time-Series
20
21
We will be using Jena Climate dataset recorded by the
22
[Max Planck Institute for Biogeochemistry](https://www.bgc-jena.mpg.de/wetter/).
23
The dataset consists of 14 features such as temperature, pressure, humidity etc, recorded once per
24
10 minutes.
25
26
**Location**: Weather Station, Max Planck Institute for Biogeochemistry
27
in Jena, Germany
28
29
**Time-frame Considered**: Jan 10, 2009 - December 31, 2016
30
31
32
The table below shows the column names, their value formats, and their description.
33
34
Index| Features |Format |Description
35
-----|---------------|-------------------|-----------------------
36
1 |Date Time |01.01.2009 00:10:00|Date-time reference
37
2 |p (mbar) |996.52 |The pascal SI derived unit of pressure used to quantify internal pressure. Meteorological reports typically state atmospheric pressure in millibars.
38
3 |T (degC) |-8.02 |Temperature in Celsius
39
4 |Tpot (K) |265.4 |Temperature in Kelvin
40
5 |Tdew (degC) |-8.9 |Temperature in Celsius relative to humidity. Dew Point is a measure of the absolute amount of water in the air, the DP is the temperature at which the air cannot hold all the moisture in it and water condenses.
41
6 |rh (%) |93.3 |Relative Humidity is a measure of how saturated the air is with water vapor, the %RH determines the amount of water contained within collection objects.
42
7 |VPmax (mbar) |3.33 |Saturation vapor pressure
43
8 |VPact (mbar) |3.11 |Vapor pressure
44
9 |VPdef (mbar) |0.22 |Vapor pressure deficit
45
10 |sh (g/kg) |1.94 |Specific humidity
46
11 |H2OC (mmol/mol)|3.12 |Water vapor concentration
47
12 |rho (g/m ** 3) |1307.75 |Airtight
48
13 |wv (m/s) |1.03 |Wind speed
49
14 |max. wv (m/s) |1.75 |Maximum wind speed
50
15 |wd (deg) |152.3 |Wind direction in degrees
51
"""

from zipfile import ZipFile

# Download and extract the dataset.
uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
zip_path = keras.utils.get_file(origin=uri, fname="jena_climate_2009_2016.csv.zip")
zip_file = ZipFile(zip_path)
zip_file.extractall()
csv_path = "jena_climate_2009_2016.csv"

df = pd.read_csv(csv_path)
"""
64
## Raw Data Visualization
65
66
To give us a sense of the data we are working with, each feature has been plotted below.
67
This shows the distinct pattern of each feature over the time period from 2009 to 2016.
68
It also shows where anomalies are present, which will be addressed during normalization.
69
"""
70
71

titles = [
    "Pressure",
    "Temperature",
    "Potential Temperature",
    "Temperature (dew point)",
    "Relative Humidity",
    "Saturation vapor pressure",
    "Vapor pressure",
    "Vapor pressure deficit",
    "Specific humidity",
    "Water vapor concentration",
    "Air Density",
    "Wind speed",
    "Maximum wind speed",
    "Wind direction in degrees",
]

feature_keys = [
    "p (mbar)",
    "T (degC)",
    "Tpot (K)",
    "Tdew (degC)",
    "rh (%)",
    "VPmax (mbar)",
    "VPact (mbar)",
    "VPdef (mbar)",
    "sh (g/kg)",
    "H2OC (mmol/mol)",
    "rho (g/m**3)",
    "wv (m/s)",
    "max. wv (m/s)",
    "wd (deg)",
]

colors = [
    "blue",
    "orange",
    "green",
    "red",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
]

date_time_key = "Date Time"


def show_raw_visualization(data):
    time_data = data[date_time_key]
    fig, axes = plt.subplots(
        nrows=7, ncols=2, figsize=(15, 20), dpi=80, facecolor="w", edgecolor="k"
    )
    for i in range(len(feature_keys)):
        key = feature_keys[i]
        # Cycle through the color list if there are more features than colors.
        c = colors[i % (len(colors))]
        t_data = data[key]
        t_data.index = time_data
        ax = t_data.plot(
            ax=axes[i // 2, i % 2],
            color=c,
            title="{} - {}".format(titles[i], key),
            rot=25,
        )
        ax.legend([titles[i]])
    plt.tight_layout()


show_raw_visualization(df)
"""
146
## Data Preprocessing
147
148
Here we are picking ~300,000 data points for training. Observation is recorded every
149
10 mins, that means 6 times per hour. We will resample one point per hour since no
150
drastic change is expected within 60 minutes. We do this via the `sampling_rate`
151
argument in `timeseries_dataset_from_array` utility.
152
153
We are tracking data from past 720 timestamps (720/6=120 hours). This data will be
154
used to predict the temperature after 72 timestamps (72/6=12 hours).
155
156
Since every feature has values with
157
varying ranges, we do normalization to confine feature values to a range of `[0, 1]` before
158
training a neural network.
159
We do this by subtracting the mean and dividing by the standard deviation of each feature.
160
161
71.5 % of the data will be used to train the model, i.e. 300,693 rows. `split_fraction` can
162
be changed to alter this percentage.
163
164
The model is shown data for first 5 days i.e. 720 observations, that are sampled every
165
hour. The temperature after 72 (12 hours * 6 observation per hour) observation will be
166
used as a label.
167
"""

split_fraction = 0.715
train_split = int(split_fraction * int(df.shape[0]))
step = 6  # keep one observation per hour (data is recorded every 10 minutes)

past = 720  # length of the input window, in raw (10-minute) timestamps
future = 72  # how far ahead the label lies, in raw timestamps
learning_rate = 0.001
batch_size = 256
epochs = 10


def normalize(data, train_split):
    # Standardize each feature using statistics from the training split only,
    # to avoid leaking information from the validation data.
    data_mean = data[:train_split].mean(axis=0)
    data_std = data[:train_split].std(axis=0)
    return (data - data_mean) / data_std
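
"""
As a quick illustration: `normalize` standardizes each column to zero mean and unit
variance over the training rows. The toy array below is hypothetical, purely to
show the behavior.
"""

import numpy as np

toy = np.array([[1.0], [2.0], [3.0], [4.0]])
print(normalize(toy, train_split=4))  # approximately [-1.34, -0.45, 0.45, 1.34]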
"""
187
We can see from the correlation heatmap, few parameters like Relative Humidity and
188
Specific Humidity are redundant. Hence we will be using select features, not all.
189
"""

print(
    "The selected parameters are:",
    ", ".join([titles[i] for i in [0, 1, 5, 7, 8, 10, 11]]),
)
selected_features = [feature_keys[i] for i in [0, 1, 5, 7, 8, 10, 11]]
features = df[selected_features]
features.index = df[date_time_key]
features.head()

features = normalize(features.values, train_split)
features = pd.DataFrame(features)
features.head()

train_data = features.loc[0 : train_split - 1]
val_data = features.loc[train_split:]
"""
208
# Training dataset
209
210
The training dataset labels starts from the 792nd observation (720 + 72).
211
"""

start = past + future
end = start + train_split

x_train = train_data[[i for i in range(7)]].values
y_train = features.iloc[start:end][[1]]

sequence_length = int(past / step)  # 120 hourly steps per input sequence
"""
222
The `timeseries_dataset_from_array` function takes in a sequence of data-points gathered at
223
equal intervals, along with time series parameters such as length of the
224
sequences/windows, spacing between two sequence/windows, etc., to produce batches of
225
sub-timeseries inputs and targets sampled from the main timeseries.
226
"""

dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    x_train,
    y_train,
    sequence_length=sequence_length,
    sampling_rate=step,
    batch_size=batch_size,
)
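
"""
To see concretely what `sequence_length` and `sampling_rate` do, here is a toy
example on a small hypothetical series (the integers 0..19): each window takes
`sequence_length` points spaced `sampling_rate` apart, and consecutive windows
start one step apart by default.
"""

import numpy as np

toy_series = np.arange(20)
toy_ds = keras.preprocessing.timeseries_dataset_from_array(
    toy_series, targets=None, sequence_length=3, sampling_rate=2, batch_size=4
)
for toy_batch in toy_ds.take(1):
    print(toy_batch.numpy())  # first window is [0, 2, 4], then [1, 3, 5], ...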

"""
## Validation dataset

The validation dataset must not contain the last 792 rows, as we won't have label
data for those records; hence 792 must be subtracted from the end of the data.

The validation label dataset must start 792 rows after `train_split`, hence we
must add `past + future` (792) to `label_start`.
"""

x_end = len(val_data) - past - future

label_start = train_split + past + future
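
"""
An optional sanity check on the index arithmetic above; the expected values in the
comments assume the full 420,551-row Jena dataset, so adjust them if your copy
differs.
"""

print("train_split:", train_split)  # 300693 with the default split_fraction
print("first training label index:", start)  # 792
print("first validation label index:", label_start)  # train_split + 792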

x_val = val_data.iloc[:x_end][[i for i in range(7)]].values
y_val = features.iloc[label_start:][[1]]

dataset_val = keras.preprocessing.timeseries_dataset_from_array(
    x_val,
    y_val,
    sequence_length=sequence_length,
    sampling_rate=step,
    batch_size=batch_size,
)


for batch in dataset_train.take(1):
    inputs, targets = batch

print("Input shape:", inputs.numpy().shape)
print("Target shape:", targets.numpy().shape)
"""
269
## Training
270
"""
271
272
inputs = keras.layers.Input(shape=(inputs.shape[1], inputs.shape[2]))
273
lstm_out = keras.layers.LSTM(32)(inputs)
274
outputs = keras.layers.Dense(1)(lstm_out)
275
276
model = keras.Model(inputs=inputs, outputs=outputs)
277
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")
278
model.summary()
279
280
"""
281
We'll use the `ModelCheckpoint` callback to regularly save checkpoints, and
282
the `EarlyStopping` callback to interrupt training when the validation loss
283
is not longer improving.
284
"""
285
286

path_checkpoint = "model_checkpoint.weights.h5"
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5)

modelckpt_callback = keras.callbacks.ModelCheckpoint(
    monitor="val_loss",
    filepath=path_checkpoint,
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

history = model.fit(
    dataset_train,
    epochs=epochs,
    validation_data=dataset_val,
    callbacks=[es_callback, modelckpt_callback],
)
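
"""
Because `save_best_only=True`, the checkpoint file holds the weights from the
epoch with the lowest validation loss. If you want to predict with those weights
rather than the final-epoch weights, you can optionally restore them:
"""

model.load_weights(path_checkpoint)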

"""
We can visualize the loss with the function below. After a certain point, the loss
stops decreasing.
"""

def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = range(len(loss))
    plt.figure()
    plt.plot(epochs, loss, "b", label="Training loss")
    plt.plot(epochs, val_loss, "r", label="Validation loss")
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


visualize_loss(history, "Training and Validation Loss")

"""
## Prediction

The trained model above is now able to make predictions for 5 sets of values from
the validation set.
"""

def show_plot(plot_data, delta, title):
    labels = ["History", "True Future", "Model Prediction"]
    marker = [".-", "rx", "go"]
    time_steps = list(range(-(plot_data[0].shape[0]), 0))
    future = delta if delta else 0

    plt.title(title)
    for i, val in enumerate(plot_data):
        if i:
            # The true future and the model prediction are single points at `future`.
            plt.plot(future, val, marker[i], markersize=10, label=labels[i])
        else:
            # The history is a full sequence plotted on negative time steps.
            plt.plot(time_steps, val.flatten(), marker[i], label=labels[i])
    plt.legend()
    plt.xlim([time_steps[0], (future + 5) * 2])
    plt.xlabel("Time-Step")
    plt.show()


for x, y in dataset_val.take(5):
    show_plot(
        [x[0][:, 1].numpy(), y[0].numpy(), model.predict(x)[0]],
        12,
        "Single Step Prediction",
    )
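
"""
Note that the plotted values are in normalized units. To report a prediction in
degrees Celsius, you can invert the normalization using the training-set statistics
of the temperature column. A minimal sketch, recomputing the statistics here since
`normalize` above does not return them:
"""

# "T (degC)" is the second selected feature, i.e. column index 1 in x and y.
raw_temp = df["T (degC)"].values
temp_mean = raw_temp[:train_split].mean()
temp_std = raw_temp[:train_split].std()

for x, y in dataset_val.take(1):
    pred_degc = model.predict(x)[0][0] * temp_std + temp_mean
    true_degc = y[0].numpy()[0] * temp_std + temp_mean
    print(f"Predicted: {pred_degc:.2f} degC | Actual: {true_degc:.2f} degC")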