"""
2
Title: Timeseries forecasting for weather prediction
3
Authors: [Prabhanshu Attri](https://prabhanshu.com/github), [Yashika Sharma](https://github.com/yashika51), [Kristi Takach](https://github.com/ktakattack), [Falak Shah](https://github.com/falaktheoptimist)
4
Date created: 2020/06/23
5
Last modified: 2023/11/22
6
Description: This notebook demonstrates how to do timeseries forecasting using a LSTM model.
7
Accelerator: GPU
8
"""

"""
## Setup
"""

import pandas as pd
import matplotlib.pyplot as plt
import keras
"""
19
## Climate Data Time-Series
20
21
We will be using Jena Climate dataset recorded by the
22
[Max Planck Institute for Biogeochemistry](https://www.bgc-jena.mpg.de/wetter/).
23
The dataset consists of 14 features such as temperature, pressure, humidity etc, recorded once per
24
10 minutes.
25
26
**Location**: Weather Station, Max Planck Institute for Biogeochemistry
27
in Jena, Germany
28
29
**Time-frame Considered**: Jan 10, 2009 - December 31, 2016
30
31
32
The table below shows the column names, their value formats, and their description.
33
34
Index| Features |Format |Description
35
-----|---------------|-------------------|-----------------------
36
1 |Date Time |01.01.2009 00:10:00|Date-time reference
37
2 |p (mbar) |996.52 |The pascal SI derived unit of pressure used to quantify internal pressure. Meteorological reports typically state atmospheric pressure in millibars.
38
3 |T (degC) |-8.02 |Temperature in Celsius
39
4 |Tpot (K) |265.4 |Temperature in Kelvin
40
5 |Tdew (degC) |-8.9 |Temperature in Celsius relative to humidity. Dew Point is a measure of the absolute amount of water in the air, the DP is the temperature at which the air cannot hold all the moisture in it and water condenses.
41
6 |rh (%) |93.3 |Relative Humidity is a measure of how saturated the air is with water vapor, the %RH determines the amount of water contained within collection objects.
42
7 |VPmax (mbar) |3.33 |Saturation vapor pressure
43
8 |VPact (mbar) |3.11 |Vapor pressure
44
9 |VPdef (mbar) |0.22 |Vapor pressure deficit
45
10 |sh (g/kg) |1.94 |Specific humidity
46
11 |H2OC (mmol/mol)|3.12 |Water vapor concentration
47
12 |rho (g/m ** 3) |1307.75 |Airtight
48
13 |wv (m/s) |1.03 |Wind speed
49
14 |max. wv (m/s) |1.75 |Maximum wind speed
50
15 |wd (deg) |152.3 |Wind direction in degrees
51
"""

from zipfile import ZipFile

# Download and extract the dataset.
uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
zip_path = keras.utils.get_file(origin=uri, fname="jena_climate_2009_2016.csv.zip")
zip_file = ZipFile(zip_path)
zip_file.extractall()
csv_path = "jena_climate_2009_2016.csv"

df = pd.read_csv(csv_path)
"""
64
## Raw Data Visualization
65
66
To give us a sense of the data we are working with, each feature has been plotted below.
67
This shows the distinct pattern of each feature over the time period from 2009 to 2016.
68
It also shows where anomalies are present, which will be addressed during normalization.
69
"""
70
71

titles = [
    "Pressure",
    "Temperature",
    "Potential Temperature",
    "Temperature (dew point)",
    "Relative Humidity",
    "Saturation vapor pressure",
    "Vapor pressure",
    "Vapor pressure deficit",
    "Specific humidity",
    "Water vapor concentration",
    "Air Density",
    "Wind speed",
    "Maximum wind speed",
    "Wind direction in degrees",
]

feature_keys = [
    "p (mbar)",
    "T (degC)",
    "Tpot (K)",
    "Tdew (degC)",
    "rh (%)",
    "VPmax (mbar)",
    "VPact (mbar)",
    "VPdef (mbar)",
    "sh (g/kg)",
    "H2OC (mmol/mol)",
    "rho (g/m**3)",
    "wv (m/s)",
    "max. wv (m/s)",
    "wd (deg)",
]

colors = [
    "blue",
    "orange",
    "green",
    "red",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
]

date_time_key = "Date Time"


def show_raw_visualization(data):
    time_data = data[date_time_key]
    fig, axes = plt.subplots(
        nrows=7, ncols=2, figsize=(15, 20), dpi=80, facecolor="w", edgecolor="k"
    )
    for i in range(len(feature_keys)):
        key = feature_keys[i]
        # Cycle through the color list if there are more features than colors.
        c = colors[i % (len(colors))]
        t_data = data[key]
        t_data.index = time_data
        ax = t_data.plot(
            ax=axes[i // 2, i % 2],
            color=c,
            title="{} - {}".format(titles[i], key),
            rot=25,
        )
        ax.legend([titles[i]])
    plt.tight_layout()


show_raw_visualization(df)
"""
146
## Data Preprocessing
147
148
Here we are picking ~300,000 data points for training. Observation is recorded every
149
10 mins, that means 6 times per hour. We will resample one point per hour since no
150
drastic change is expected within 60 minutes. We do this via the `sampling_rate`
151
argument in `timeseries_dataset_from_array` utility.
152
153
We are tracking data from past 720 timestamps (720/6=120 hours). This data will be
154
used to predict the temperature after 72 timestamps (72/6=12 hours).
155
156
Since every feature has values with
157
varying ranges, we do normalization to confine feature values to a range of `[0, 1]` before
158
training a neural network.
159
We do this by subtracting the mean and dividing by the standard deviation of each feature.
160
161
71.5 % of the data will be used to train the model, i.e. 300,693 rows. `split_fraction` can
162
be changed to alter this percentage.
163
164
The model is shown data for first 5 days i.e. 720 observations, that are sampled every
165
hour. The temperature after 72 (12 hours * 6 observation per hour) observation will be
166
used as a label.
167
"""

split_fraction = 0.715
train_split = int(split_fraction * int(df.shape[0]))
step = 6  # keep one observation per hour (data is recorded every 10 minutes)

past = 720  # length of the input window, in raw (10-minute) timestamps
future = 72  # how far ahead the label lies, in raw timestamps
learning_rate = 0.001
batch_size = 256
epochs = 10


def normalize(data, train_split):
    # Standardize each feature using statistics from the training split only,
    # to avoid leaking information from the validation data.
    data_mean = data[:train_split].mean(axis=0)
    data_std = data[:train_split].std(axis=0)
    return (data - data_mean) / data_std
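
"""
As a quick illustration: `normalize` standardizes each column to zero mean and unit
variance over the training rows. The toy array below is hypothetical, purely to
show the behavior.
"""

import numpy as np

toy = np.array([[1.0], [2.0], [3.0], [4.0]])
print(normalize(toy, train_split=4))  # approximately [-1.34, -0.45, 0.45, 1.34]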
"""
187
We can see from the correlation heatmap, few parameters like Relative Humidity and
188
Specific Humidity are redundant. Hence we will be using select features, not all.
189
"""

print(
    "The selected parameters are:",
    ", ".join([titles[i] for i in [0, 1, 5, 7, 8, 10, 11]]),
)
selected_features = [feature_keys[i] for i in [0, 1, 5, 7, 8, 10, 11]]
features = df[selected_features]
features.index = df[date_time_key]
features.head()

features = normalize(features.values, train_split)
features = pd.DataFrame(features)
features.head()

train_data = features.loc[0 : train_split - 1]
val_data = features.loc[train_split:]
"""
208
# Training dataset
209
210
The training dataset labels starts from the 792nd observation (720 + 72).
211
"""

start = past + future
end = start + train_split

x_train = train_data[[i for i in range(7)]].values
y_train = features.iloc[start:end][[1]]

sequence_length = int(past / step)  # 120 hourly steps per input sequence
"""
222
The `timeseries_dataset_from_array` function takes in a sequence of data-points gathered at
223
equal intervals, along with time series parameters such as length of the
224
sequences/windows, spacing between two sequence/windows, etc., to produce batches of
225
sub-timeseries inputs and targets sampled from the main timeseries.
226
"""

dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    x_train,
    y_train,
    sequence_length=sequence_length,
    sampling_rate=step,
    batch_size=batch_size,
)
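
"""
To see concretely what `sequence_length` and `sampling_rate` do, here is a toy
example on a small hypothetical series (the integers 0..19): each window takes
`sequence_length` points spaced `sampling_rate` apart, and consecutive windows
start one step apart by default.
"""

import numpy as np

toy_series = np.arange(20)
toy_ds = keras.preprocessing.timeseries_dataset_from_array(
    toy_series, targets=None, sequence_length=3, sampling_rate=2, batch_size=4
)
for toy_batch in toy_ds.take(1):
    print(toy_batch.numpy())  # first window is [0, 2, 4], then [1, 3, 5], ...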

"""
## Validation dataset

The validation dataset must not contain the last 792 rows, as we won't have label
data for those records; hence 792 must be subtracted from the end of the data.

The validation label dataset must start 792 rows after `train_split`, hence we
must add `past + future` (792) to `label_start`.
"""

x_end = len(val_data) - past - future

label_start = train_split + past + future
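
"""
An optional sanity check on the index arithmetic above; the expected values in the
comments assume the full 420,551-row Jena dataset, so adjust them if your copy
differs.
"""

print("train_split:", train_split)  # 300693 with the default split_fraction
print("first training label index:", start)  # 792
print("first validation label index:", label_start)  # train_split + 792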

x_val = val_data.iloc[:x_end][[i for i in range(7)]].values
y_val = features.iloc[label_start:][[1]]

dataset_val = keras.preprocessing.timeseries_dataset_from_array(
    x_val,
    y_val,
    sequence_length=sequence_length,
    sampling_rate=step,
    batch_size=batch_size,
)


for batch in dataset_train.take(1):
    inputs, targets = batch

print("Input shape:", inputs.numpy().shape)
print("Target shape:", targets.numpy().shape)
"""
269
## Training
270
"""
271
272
inputs = keras.layers.Input(shape=(inputs.shape[1], inputs.shape[2]))
273
lstm_out = keras.layers.LSTM(32)(inputs)
274
outputs = keras.layers.Dense(1)(lstm_out)
275
276
model = keras.Model(inputs=inputs, outputs=outputs)
277
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")
278
model.summary()
279
280
"""
281
We'll use the `ModelCheckpoint` callback to regularly save checkpoints, and
282
the `EarlyStopping` callback to interrupt training when the validation loss
283
is not longer improving.
284
"""
285
286

path_checkpoint = "model_checkpoint.weights.h5"
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5)

modelckpt_callback = keras.callbacks.ModelCheckpoint(
    monitor="val_loss",
    filepath=path_checkpoint,
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

history = model.fit(
    dataset_train,
    epochs=epochs,
    validation_data=dataset_val,
    callbacks=[es_callback, modelckpt_callback],
)
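
"""
Because `save_best_only=True`, the checkpoint file holds the weights from the
epoch with the lowest validation loss. If you want to predict with those weights
rather than the final-epoch weights, you can optionally restore them:
"""

model.load_weights(path_checkpoint)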

"""
We can visualize the loss with the function below. After a certain point, the loss
stops decreasing.
"""

def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = range(len(loss))
    plt.figure()
    plt.plot(epochs, loss, "b", label="Training loss")
    plt.plot(epochs, val_loss, "r", label="Validation loss")
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


visualize_loss(history, "Training and Validation Loss")

"""
## Prediction

The trained model above is now able to make predictions for 5 sets of values from
the validation set.
"""

def show_plot(plot_data, delta, title):
    labels = ["History", "True Future", "Model Prediction"]
    marker = [".-", "rx", "go"]
    time_steps = list(range(-(plot_data[0].shape[0]), 0))
    future = delta if delta else 0

    plt.title(title)
    for i, val in enumerate(plot_data):
        if i:
            # The true future and the model prediction are single points at `future`.
            plt.plot(future, val, marker[i], markersize=10, label=labels[i])
        else:
            # The history is a full sequence plotted on negative time steps.
            plt.plot(time_steps, val.flatten(), marker[i], label=labels[i])
    plt.legend()
    plt.xlim([time_steps[0], (future + 5) * 2])
    plt.xlabel("Time-Step")
    plt.show()


for x, y in dataset_val.take(5):
    show_plot(
        [x[0][:, 1].numpy(), y[0].numpy(), model.predict(x)[0]],
        12,
        "Single Step Prediction",
    )
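
"""
Note that the plotted values are in normalized units. To report a prediction in
degrees Celsius, you can invert the normalization using the training-set statistics
of the temperature column. A minimal sketch, recomputing the statistics here since
`normalize` above does not return them:
"""

# "T (degC)" is the second selected feature, i.e. column index 1 in x and y.
raw_temp = df["T (degC)"].values
temp_mean = raw_temp[:train_split].mean()
temp_std = raw_temp[:train_split].std()

for x, y in dataset_val.take(1):
    pred_degc = model.predict(x)[0][0] * temp_std + temp_mean
    true_degc = y[0].numpy()[0] * temp_std + temp_mean
    print(f"Predicted: {pred_degc:.2f} degC | Actual: {true_degc:.2f} degC")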