Skip to content

Baseline Arima

Arima

A class that implements an ARIMA model as a baseline for forecasting cases in a city.

Attributes

df : pd.DataFrame A pandas dataframe with the columns y and a datetime index

Methods

train(): Train the model. predict_in_sample(): In-sample predictions of the model. predict_out_of_sample(): Out-of-sample predictions of the model. forecast(): Forecast observations after the end of the training data.

Source code in mosqlient/forecast/baseline.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
class Arima:
    """
    ARIMA baseline used to forecast cases in a city.

    The target series is Box-Cox transformed before the model is fitted and
    every value returned by the prediction methods is mapped back to the
    original scale.

    Attributes
    ----------
    df : pd.DataFrame
        Private copy of the input data: a single float column named y and a
        datetime index.
    auto_arima_kwargs : dict
        Keyword arguments forwarded to `auto_arima` by `train()`.
    df_train : pd.DataFrame
        Set by `train()`. Training window of `df` with y already Box-Cox
        transformed.
    boxcox
        Set by `train()`. Transformer fitted on the training window.
    model
        Set by `train()`. Model fitted on the transformed training window.

    Methods
    -------
    train():
        Train the model.
    predict_in_sample():
        Predictions of the model in sample.
    predict_out_of_sample():
        Predictions of the model out of sample.
    forecast():
        Forecast observations after the end of the training window.
    """

    def __init__(self, df: pd.DataFrame, **auto_arima_kwargs):
        """
        Validate the input data and store the model configuration.

        Parameters
        ----------
        df : pd.DataFrame
            A pandas dataframe with a single column named y and a datetime
            index. The dataframe passed in is not modified.
        auto_arima_kwargs : dict
            Extra keyword arguments for `auto_arima`; they override the
            defaults defined below.

        Raises
        ------
        InvalidDataFrameError
            If the index is not of datetime type, if there is not exactly one
            column, or if that column is not named y.
        """
        if not pd.api.types.is_datetime64_any_dtype(df.index):
            raise InvalidDataFrameError(
                "The DataFrame's index is not of datetime type."
            )

        if df.shape[1] != 1:
            raise InvalidDataFrameError(
                "The DataFrame must have one single column."
            )

        if df.columns[0] != "y":
            raise InvalidDataFrameError("The column must be named `y`.")

        # Cast on a private copy: the float conversion must not leak into the
        # DataFrame owned by the caller.
        df = df.copy()
        df["y"] = df["y"].astype(float)

        self.df = df

        default_auto_arima_kwargs = {
            "seasonal": False,
            "trace": True,
            "maxiter": 100,
            "error_action": "ignore",
            "information_criterion": "aic",
            "suppress_warnings": True,
            "stepwise": True,
        }

        # User-supplied options win over the defaults.
        default_auto_arima_kwargs.update(auto_arima_kwargs)

        self.auto_arima_kwargs = default_auto_arima_kwargs

    def train(self, train_ini_date: str, train_end_date: str):
        """
        Train the ARIMA model on the rows of `df` between two dates.

        Parameters
        ----------
        train_ini_date: str
            Initial date for model training (inclusive).
        train_end_date: str
            End date for model training (inclusive).

        Returns
        -------
        The fitted model, also stored in `self.model`.
        """

        df_train = self.df.copy()

        df_train = df_train.loc[
            (df_train.index >= pd.to_datetime(train_ini_date))
            & (df_train.index <= pd.to_datetime(train_end_date))
        ]

        # `self.df_train` and `df_train` are the same object, so the attribute
        # ends up holding the Box-Cox transformed values assigned below.
        self.df_train = df_train

        boxcox = ppc.BoxCoxEndogTransformer().fit(df_train.y)

        self.boxcox = boxcox

        df_train.loc[:, "y"] = boxcox.transform(df_train.y)[0]

        model = auto_arima(df_train.y, **self.auto_arima_kwargs)

        model.fit(df_train.y)

        self.model = model

        return model

    def predict_in_sample(self, plot: bool = True) -> pd.DataFrame:
        """
        Return the model predictions for the training window.

        Parameters
        ----------
        plot: bool
            If true the in-sample predictions are also plotted.

        Returns
        -------
        pd.DataFrame
            The prediction columns built by `get_prediction_dataframe` plus
            a `data` column with the observed values in the original scale.
        """

        df_train = self.df_train.copy()

        df_in_sample = get_prediction_dataframe(
            self.model, df_train.index, self.boxcox
        )

        df_in_sample = df_in_sample.merge(
            df_train, left_on="date", right_index=True
        )

        df_in_sample = df_in_sample.rename(columns={"y": "data"})

        # df_train holds transformed values; bring them back to the data scale.
        df_in_sample["data"] = self.boxcox.inverse_transform(
            df_in_sample["data"]
        )[0]

        if plot:
            plot_predictions(df_in_sample, title="In sample predictions")

        return df_in_sample

    def predict_out_of_sample(
        self, horizon: int, end_date: str, plot=True
    ) -> pd.DataFrame:
        """
        Return the model predictions after the training window.

        The predictions are produced in windows of {horizon} observations.
        After each window is forecast, the model is updated with the observed
        data of that window and only then is the next window forecast.
        The rolling updates are applied to a copy of the trained model, so
        `self.model` is left as `train()` produced it.

        Parameters
        ----------
        horizon: int
            The number of observations forecasted by the model in each window.
        end_date: str
            Last week of the out of sample evaluation. The first week is
            after the last training observation.
        plot: bool
            If true the out-of-sample predictions are also plotted.

        Returns
        -------
        pd.DataFrame
            Predictions up to end_date for the dates present in `df`, with
            the observed values in the `data` column.
        """
        import copy  # local import: only this method needs it

        df = self.df.copy()

        df.loc[:, "y"] = self.boxcox.transform(df.y)[0]

        # Work on a copy so that `forecast()` and repeated calls of this
        # method keep starting from the end of the training window.
        model = copy.deepcopy(self.model)

        date = get_next_n_weeks(
            self.df_train.index[-1].strftime("%Y-%m-%d"), horizon
        )

        windows = [
            get_prediction_dataframe(model, date, self.boxcox, horizon=horizon)
        ]

        while pd.Timestamp(date[-1]) < pd.to_datetime(end_date):

            # Observations of the window that has just been forecast.
            observed = df.loc[date[0] : date[-1]]

            if observed.empty:
                # No data left to update the model with; later windows could
                # not be compared with observations anyway.
                break

            # Update BEFORE moving on, so the next forecast only uses data
            # that precedes the dates it is labelled with.
            model.update(observed)

            date = get_next_n_weeks(date[-1].strftime("%Y-%m-%d"), horizon)

            windows.append(
                get_prediction_dataframe(
                    model, date, self.boxcox, horizon=horizon
                )
            )

        df_preds = pd.concat(windows)

        df_preds.date = pd.to_datetime(df_preds.date)

        df_preds = df_preds.merge(df, left_on="date", right_index=True)

        df_preds = df_preds.loc[df_preds.date <= end_date]

        df_preds = df_preds.dropna()

        df_preds.loc[:, "y"] = self.boxcox.inverse_transform(df_preds["y"])[0]

        df_preds = df_preds.rename(columns={"y": "data"})

        if plot:

            plot_predictions(df_preds, title="Out of sample predictions")

        return df_preds

    def forecast(
        self, horizon: int, plot: bool, last_obs: int
    ) -> pd.DataFrame:
        """
        Return the forecast of the model.

        `train()` must be called before this method. The forecast covers the
        {horizon} observations that follow the last observation used to train
        the model.

        Parameters
        ----------
        horizon: int
            The number of observations forecasted by the model
        plot: bool
            If true a figure with the forecasted values is drawn.
        last_obs: int
            The number of last observations plotted in the figure

        Returns
        -------
        pd.DataFrame
            The forecast, as built by `get_prediction_dataframe`.
        """

        df_train = self.df_train.copy()

        df_train = df_train.rename(columns={"y": "data"})

        # The training frame stores transformed values; plot the real scale.
        df_train["data"] = self.boxcox.inverse_transform(df_train["data"])[0]

        model = self.model

        date = get_next_n_weeks(
            df_train.index[-1].strftime("%Y-%m-%d"), horizon
        )

        df_preds = get_prediction_dataframe(
            model, date, self.boxcox, horizon=horizon
        )

        if plot:

            plot_forecast(df_preds, df_train, last_obs)

        return df_preds

__init__(df, **auto_arima_kwargs)

Constructs all the necessary attributes for the Arima object.

Parameters
df : pd.DataFrame
A pandas dataframe with the column y and a datetime index
auto_arima_kwargs : dict
All parameters that can be passed to pmdarima.arima.auto_arima.
Source code in mosqlient/forecast/baseline.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def __init__(self, df: pd.DataFrame, **auto_arima_kwargs):
    """
    Validate the input data and store the model configuration.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas dataframe with a single column named y and a datetime
        index. The dataframe passed in is not modified.
    auto_arima_kwargs : dict
        Extra keyword arguments for `auto_arima`; they override the
        defaults defined below.

    Raises
    ------
    InvalidDataFrameError
        If the index is not of datetime type, if there is not exactly one
        column, or if that column is not named y.
    """
    if not pd.api.types.is_datetime64_any_dtype(df.index):
        raise InvalidDataFrameError(
            "The DataFrame's index is not of datetime type."
        )

    if df.shape[1] != 1:
        raise InvalidDataFrameError(
            "The DataFrame must have one single column."
        )

    if df.columns[0] != "y":
        raise InvalidDataFrameError("The column must be named `y`.")

    # Cast on a private copy: the float conversion must not leak into the
    # DataFrame owned by the caller.
    df = df.copy()
    df["y"] = df["y"].astype(float)

    self.df = df

    default_auto_arima_kwargs = {
        "seasonal": False,
        "trace": True,
        "maxiter": 100,
        "error_action": "ignore",
        "information_criterion": "aic",
        "suppress_warnings": True,
        "stepwise": True,
    }

    # User-supplied options win over the defaults.
    default_auto_arima_kwargs.update(auto_arima_kwargs)

    self.auto_arima_kwargs = default_auto_arima_kwargs

forecast(horizon, plot, last_obs)

Returns the forecast of the model. The train() method must be called before this method. The forecast() method forecasts {horizon} observations ahead of the last observation used to train the model in the train() method.

Parameters
horizon: int
    The number of observations forecasted by the model
plot: bool
    If true return a figure with the forecasted values.
last_obs: int
    The number of last observations plotted in the figure
Source code in mosqlient/forecast/baseline.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
def forecast(
    self, horizon: int, plot: bool, last_obs: int
) -> pd.DataFrame:
    """
    Forecast the observations that follow the training window.

    `train()` must have been called first. The forecast starts right after
    the last observation used for training and spans {horizon} observations.

    Parameters
    ----------
    horizon: int
        The number of observations forecasted by the model
    plot: bool
        If true a figure with the forecasted values is drawn.
    last_obs: int
        The number of last training observations shown in the figure

    Returns
    -------
    pd.DataFrame
        The forecast, as built by `get_prediction_dataframe`.
    """
    # Training data renamed for plotting and mapped back from the Box-Cox
    # scale to the original one.
    history = self.df_train.copy().rename(columns={"y": "data"})
    history["data"] = self.boxcox.inverse_transform(history["data"])[0]

    last_training_day = history.index[-1].strftime("%Y-%m-%d")
    future_dates = get_next_n_weeks(last_training_day, horizon)

    predictions = get_prediction_dataframe(
        self.model, future_dates, self.boxcox, horizon=horizon
    )

    if plot:
        plot_forecast(predictions, history, last_obs)

    return predictions

predict_in_sample(plot=True)

Returns the model performance in the sample.

Parameters
plot: bool
    If true the plot of the model in the sample is returned
Source code in mosqlient/forecast/baseline.py
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def predict_in_sample(self, plot: bool = True) -> pd.DataFrame:
    """
    Return the model predictions for the training window.

    Parameters
    ----------
    plot: bool
        If true the in-sample predictions are also plotted.

    Returns
    -------
    pd.DataFrame
        The prediction columns built by `get_prediction_dataframe` plus a
        `data` column with the observed values in the original scale.
    """
    training = self.df_train.copy()

    fitted = get_prediction_dataframe(self.model, training.index, self.boxcox)

    # Attach the observations to the predictions by date.
    merged = fitted.merge(training, left_on="date", right_index=True)
    merged = merged.rename(columns={"y": "data"})

    # The training frame holds transformed values; undo the transformation.
    observed = self.boxcox.inverse_transform(merged["data"])[0]
    merged["data"] = observed

    if plot:
        plot_predictions(merged, title="In sample predictions")

    return merged

predict_out_of_sample(horizon, end_date, plot=True)

Returns the model performance out of the sample. The predictions are returned by windows of {horizon} observations. After each window the model is updated with the data of the last observations forecasted.

Parameters
horizon: int
    The number of observations forecasted by the model
end_date: str
    Last week of the out of sample evaluation. The first week is after the last training observation.
plot: bool
    If true the plot of the model out of the sample is returned
Source code in mosqlient/forecast/baseline.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
def predict_out_of_sample(
    self, horizon: int, end_date: str, plot=True
) -> pd.DataFrame:
    """
    Return the model predictions after the training window.

    The predictions are produced in windows of {horizon} observations.
    After each window is forecast, the model is updated with the observed
    data of that window and only then is the next window forecast.
    The rolling updates are applied to a copy of the trained model, so
    `self.model` is left as `train()` produced it.

    Parameters
    ----------
    horizon: int
        The number of observations forecasted by the model in each window.
    end_date: str
        Last week of the out of sample evaluation. The first week is after
        the last training observation.
    plot: bool
        If true the out-of-sample predictions are also plotted.

    Returns
    -------
    pd.DataFrame
        Predictions up to end_date for the dates present in `df`, with the
        observed values in the `data` column.
    """
    import copy  # local import: only this method needs it

    df = self.df.copy()

    df.loc[:, "y"] = self.boxcox.transform(df.y)[0]

    # Work on a copy so that `forecast()` and repeated calls of this method
    # keep starting from the end of the training window.
    model = copy.deepcopy(self.model)

    date = get_next_n_weeks(
        self.df_train.index[-1].strftime("%Y-%m-%d"), horizon
    )

    windows = [
        get_prediction_dataframe(model, date, self.boxcox, horizon=horizon)
    ]

    while pd.Timestamp(date[-1]) < pd.to_datetime(end_date):

        # Observations of the window that has just been forecast.
        observed = df.loc[date[0] : date[-1]]

        if observed.empty:
            # No data left to update the model with; later windows could not
            # be compared with observations anyway.
            break

        # Update BEFORE moving on, so the next forecast only uses data that
        # precedes the dates it is labelled with.
        model.update(observed)

        date = get_next_n_weeks(date[-1].strftime("%Y-%m-%d"), horizon)

        windows.append(
            get_prediction_dataframe(model, date, self.boxcox, horizon=horizon)
        )

    df_preds = pd.concat(windows)

    df_preds.date = pd.to_datetime(df_preds.date)

    df_preds = df_preds.merge(df, left_on="date", right_index=True)

    df_preds = df_preds.loc[df_preds.date <= end_date]

    df_preds = df_preds.dropna()

    df_preds.loc[:, "y"] = self.boxcox.inverse_transform(df_preds["y"])[0]

    df_preds = df_preds.rename(columns={"y": "data"})

    if plot:

        plot_predictions(df_preds, title="Out of sample predictions")

    return df_preds

train(train_ini_date, train_end_date)

Train the ARIMA model

Parameters
train_ini_date: str
    Initial date for model training
train_end_date: str
    End date for model training
Source code in mosqlient/forecast/baseline.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
def train(self, train_ini_date: str, train_end_date: str):
    """
    Fit the ARIMA model on the rows of `df` between two dates.

    Parameters
    ----------
    train_ini_date: str
        Initial date for model training (inclusive).
    train_end_date: str
        End date for model training (inclusive).

    Returns
    -------
    The fitted model, also stored in `self.model`.
    """
    first_day = pd.to_datetime(train_ini_date)
    last_day = pd.to_datetime(train_end_date)

    window = self.df.copy()
    window = window.loc[(window.index >= first_day) & (window.index <= last_day)]

    # The attribute shares the object with `window`, so it ends up holding
    # the transformed values assigned a few lines below.
    self.df_train = window

    transformer = ppc.BoxCoxEndogTransformer().fit(window.y)
    self.boxcox = transformer

    window.loc[:, "y"] = transformer.transform(window.y)[0]

    fitted = auto_arima(window.y, **self.auto_arima_kwargs)
    fitted.fit(window.y)

    self.model = fitted
    return fitted

InvalidDataFrameError

Bases: Exception

Custom exception for invalid DataFrame.

Source code in mosqlient/forecast/baseline.py
193
194
195
196
class InvalidDataFrameError(Exception):
    """Exception signalling that a DataFrame is invalid."""

get_next_n_weeks(ini_date, next_days)

Return a list with the dates of the {next_days} weeks following ini_date. This function was designed to generate the dates of the forecast models. Parameters


ini_date : str Initial date (format YYYY-MM-DD). next_days : int Number of weeks to be included in the list after the date in ini_date. Returns


list A list with the dates computed.

Source code in mosqlient/forecast/baseline.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def get_next_n_weeks(ini_date: str, next_days: int) -> list:
    """
    Build the list of weekly dates that follow ini_date.
    This function was designed to generate the dates of the forecast
    models.
    Parameters
    ----------
    ini_date : str
        Initial date, formatted as YYYY-MM-DD. It is not part of the result.
    next_days : int
        Number of dates to generate. Despite the name, consecutive dates
        are one week (7 days) apart.
    Returns
    -------
    list
        `datetime.date` objects 7, 14, ... days after ini_date.
    """
    start = datetime.strptime(ini_date, "%Y-%m-%d")

    return [
        (start + timedelta(days=7 * int(week))).date()
        for week in np.arange(1, next_days + 1)
    ]

get_prediction_dataframe(model, date, boxcox, horizon=None, alphas=[0.05, 0.1, 0.2, 0.5])

Function to organize the predictions of the ARIMA model in a pandas DataFrame.

Parameters

model: Fitted ARIMA model used to compute the predictions. date: Dates assigned to the predictions. boxcox: Fitted Box-Cox transformer used to bring the predictions back to the original scale. horizon: int The number of weeks forecasted by the model; if None, the in-sample predictions are returned. alphas: list Significance levels of the prediction intervals.

Source code in mosqlient/forecast/baseline.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def get_prediction_dataframe(
    model, date, boxcox, horizon=None, alphas=[0.05, 0.1, 0.2, 0.5]
) -> pd.DataFrame:
    """
    Function to organize the predictions of the ARIMA model in a pandas DataFrame.

    Parameters
    ----------
    model
        Fitted model. It must provide `predict_in_sample(return_conf_int,
        alpha)` and `predict(n_periods, return_conf_int, alpha)`, both
        returning the pair (predictions, intervals).
    date
        Dates stored in the `date` column, one per prediction.
    boxcox
        Transformer whose `inverse_transform(values)[0]` maps values back to
        the original scale.
    horizon: int, optional
        The number of observations forecasted by the model. If None, the
        in-sample predictions are returned instead.
    alphas: list
        Significance levels of the intervals. Each alpha adds the columns
        `lower_XX` and `upper_XX`, with XX = int((1 - alpha) * 100).
        The default list is only read, never modified.

    Returns
    -------
    pd.DataFrame
        The interval columns of every alpha followed by `pred` and `date`,
        all numeric columns in the original (inverse-transformed) scale.
    """

    dfs = []
    for alpha in alphas:
        if horizon is None:
            preds_ = model.predict_in_sample(return_conf_int=True, alpha=alpha)
        else:
            preds_ = model.predict(
                n_periods=horizon, return_conf_int=True, alpha=alpha
            )
        level = int((1 - alpha) * 100)
        dfs.append(
            pd.DataFrame(
                preds_[1], columns=[f"lower_{level}", f"upper_{level}"]
            )
        )

    df_preds = pd.concat(dfs, axis=1)

    # The point predictions may come as a pandas Series or as a plain array.
    # np.asarray handles both and, like `.values`, drops any index so the
    # assignment is positional.
    df_preds["pred"] = np.asarray(preds_[0])

    # A leading prediction that is exactly zero is discarded together with
    # its date.
    if len(df_preds) > 0 and df_preds["pred"].values[0] == 0:
        # .copy() so the column assignments below act on an independent
        # frame rather than on a slice.
        df_preds = df_preds.iloc[1:].copy()
        date = date[1:]

    for col in df_preds.columns:
        df_preds[col] = boxcox.inverse_transform(df_preds[col])[0]

    df_preds["date"] = date

    return df_preds

plot_forecast(df_for, df_train, last_obs, alphas=[0.05, 0.1, 0.2, 0.5])

Function to plot the forecast of the model.

Parameters

df_for: pd.DataFrame Dataframe with the forecast results, with the columns 'date' and 'pred' plus one 'lower_XX'/'upper_XX' pair per alpha. df_train: pd.DataFrame Dataframe with the column 'data' and a datetime index. last_obs: int Number of previous observations of the data included.

Source code in mosqlient/forecast/baseline.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def plot_forecast(
    df_for: pd.DataFrame,
    df_train: pd.DataFrame,
    last_obs: int,
    alphas=[0.05, 0.1, 0.2, 0.5],
) -> None:
    """
    Function to plot the forecast of the model.

    Parameters
    ----------
    df_for: pd.DataFrame
        Dataframe with the forecast results, with the columns 'date' and
        'pred' plus one 'lower_XX' / 'upper_XX' pair per alpha, where
        XX = int((1 - alpha) * 100).
    df_train: pd.DataFrame
        Dataframe with the column 'data' and a datetime index.
    last_obs: int
        Number of previous observations of the data included.
    alphas: list
        Significance levels whose intervals are shaded. The default list is
        only read, never modified.
    """

    df_train = df_train.tail(last_obs)

    fig, ax = plt.subplots(1, figsize=(6, 4))

    ax.plot(df_train.index, df_train.data, color="black", label="Data")

    ax.plot(df_for.date, df_for.pred, color="tab:red", label="Forecast")

    for alpha in alphas:

        level = int((1 - alpha) * 100)

        ax.fill_between(
            df_for.date,
            df_for[f"lower_{level}"],
            df_for[f"upper_{level}"],
            color="tab:red",
            alpha=0.1,
        )

    # Dashed segment joining the last observation to the first forecast.
    # The first forecast is taken by position: the index of df_for does not
    # necessarily start at 0, so a label lookup with [0] can fail.
    if len(df_train) > 0 and len(df_for) > 0:
        ax.plot(
            [df_train.index[-1], df_for.date.iloc[0]],
            [df_train["data"].values[-1], df_for.pred.values[0]],
            ls="--",
            color="black",
        )

    ax.legend()

    ax.grid()

    ax.set_title("Forecast ARIMA")

    ax.set_xlabel("Date")

    ax.set_ylabel("New cases")

    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b-%d\n%Y"))

plot_predictions(df_preds, title='', alphas=[0.05, 0.1, 0.2, 0.5])

Function to plot the predictions of the model.

Parameters

df_preds: pd.DataFrame Dataframe with the columns: ['date', 'data', 'pred', 'lower', 'upper']. title: str Title of the plot.

Source code in mosqlient/forecast/baseline.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def plot_predictions(
    df_preds: pd.DataFrame, title: str = "", alphas=[0.05, 0.1, 0.2, 0.5]
) -> None:
    """
    Draw the observed data, the model predictions and their intervals.

    Parameters
    ----------
    df_preds: pd.DataFrame
        Dataframe with the columns 'date', 'data' and 'pred' plus one
        'lower_XX' / 'upper_XX' pair per alpha, where
        XX = int((1 - alpha) * 100).
    title: str
        Title of the plot.
    alphas: list
        Significance levels whose intervals are shaded.
    """
    _, axis = plt.subplots(1, figsize=(6, 4))

    dates = df_preds.date

    axis.plot(dates, df_preds.data, color="black", label="Data")
    axis.plot(dates, df_preds.pred, color="tab:orange", label="ARIMA")

    # One translucent band per interval; overlapping bands look darker.
    for alpha in alphas:
        level = int((1 - alpha) * 100)
        lower = df_preds[f"lower_{level}"]
        upper = df_preds[f"upper_{level}"]
        axis.fill_between(dates, lower, upper, color="tab:orange", alpha=0.1)

    axis.legend()
    axis.grid()
    axis.set_title(title)

    axis.xaxis.set_major_formatter(mdates.DateFormatter("%b\n%y"))

    axis.set_xlabel("Date")
    axis.set_ylabel("New cases")