Ensemble predictions

📥 Download Notebook

This notebook provides examples of how the mosqlient package can be used to apply the proposed ensemble methodologies.

In [1]:

Copied!

import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv (by default from a `.env` file).
load_dotenv()

# API key handed to the mosqlient calls in this notebook; None when API_KEY is unset.
api_key = os.environ.get("API_KEY")
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv (by default from a `.env` file).
load_dotenv()

# API key handed to the mosqlient calls in this notebook; None when API_KEY is unset.
api_key = os.environ.get("API_KEY")

In [2]:

Copied!





import numpy as np
import pandas as pd
import mosqlient as mosq
from epiweeks import Week
import matplotlib.pyplot as plt 
from mosqlient.forecast.viz import plot_preds
from mosqlient.forecast import EnsembleDistPool, ensemble_vincentization

import numpy as np
import pandas as pd
import mosqlient as mosq
from epiweeks import Week
import matplotlib.pyplot as plt 
from mosqlient.forecast.viz import plot_preds
from mosqlient.forecast import EnsembleDistPool, ensemble_vincentization

Auxiliary function that retrieves a prediction DataFrame for a given model, forecast start date, and adm_1 identifier. This function was designed to work with IMDC forecasts.

In [3]:

Copied!





def get_prediction_df(
    api_key,
    model_name,
    adm_1,
    start_date,
    dropna_cols=True,
):
    """Return one prediction of a model as a DataFrame.

    All predictions registered under ``model_name`` are requested through
    ``mosq.get_predictions_by_model_name`` and then filtered locally: a
    prediction is kept when its ``adm_1`` attribute equals ``adm_1`` and its
    ``start`` attribute equals ``start_date``. When several predictions
    match, the first one (in the order returned by the API) is used.

    Parameters
    ----------
    api_key
        Credential forwarded to ``mosq.get_predictions_by_model_name``.
    model_name
        Name of the model whose predictions are listed.
    adm_1
        Value compared (``==``) against each prediction's ``adm_1``.
    start_date
        Value compared (``==``) against each prediction's ``start``; it must
        therefore be of a type that compares equal to that attribute.
    dropna_cols : bool, default True
        When true, every column containing at least one missing value is
        removed from the returned frame.

    Returns
    -------
    The object returned by the selected prediction's ``to_dataframe()``,
    optionally with NaN-containing columns dropped.

    Raises
    ------
    ValueError
        If no prediction matches ``adm_1`` and ``start_date``.
    """
    candidates = mosq.get_predictions_by_model_name(
        api_key=api_key,
        model_name=model_name
    )

    # Scan every candidate and collect the ones for the requested
    # location and forecast start.
    matches = []
    for candidate in candidates:
        if candidate.adm_1 == adm_1 and candidate.start == start_date:
            matches.append(candidate)

    if not matches:
        # The message is in Portuguese: "No prediction found for ...".
        raise ValueError(
            f"Nenhuma previsão encontrada para "
            f"model_name='{model_name}', adm_1={adm_1}, "
            f"start_date={start_date}"
        )

    frame = matches[0].to_dataframe()

    # axis=1 drops columns (not rows) that contain any missing value.
    return frame.dropna(axis=1) if dropna_cols else frame
def get_prediction_df(
    api_key,
    model_name,
    adm_1,
    start_date,
    dropna_cols=True,
):
    """Return one prediction of a model as a DataFrame.

    All predictions registered under ``model_name`` are requested through
    ``mosq.get_predictions_by_model_name`` and then filtered locally: a
    prediction is kept when its ``adm_1`` attribute equals ``adm_1`` and its
    ``start`` attribute equals ``start_date``. When several predictions
    match, the first one (in the order returned by the API) is used.

    Parameters
    ----------
    api_key
        Credential forwarded to ``mosq.get_predictions_by_model_name``.
    model_name
        Name of the model whose predictions are listed.
    adm_1
        Value compared (``==``) against each prediction's ``adm_1``.
    start_date
        Value compared (``==``) against each prediction's ``start``; it must
        therefore be of a type that compares equal to that attribute.
    dropna_cols : bool, default True
        When true, every column containing at least one missing value is
        removed from the returned frame.

    Returns
    -------
    The object returned by the selected prediction's ``to_dataframe()``,
    optionally with NaN-containing columns dropped.

    Raises
    ------
    ValueError
        If no prediction matches ``adm_1`` and ``start_date``.
    """
    candidates = mosq.get_predictions_by_model_name(
        api_key=api_key,
        model_name=model_name
    )

    # Scan every candidate and collect the ones for the requested
    # location and forecast start.
    matches = []
    for candidate in candidates:
        if candidate.adm_1 == adm_1 and candidate.start == start_date:
            matches.append(candidate)

    if not matches:
        # The message is in Portuguese: "No prediction found for ...".
        raise ValueError(
            f"Nenhuma previsão encontrada para "
            f"model_name='{model_name}', adm_1={adm_1}, "
            f"start_date={start_date}"
        )

    frame = matches[0].to_dataframe()

    # axis=1 drops columns (not rows) that contain any missing value.
    return frame.dropna(axis=1) if dropna_cols else frame

Get the predictions from three different models and concatenate them into a single DataFrame:

In [4]:

Copied!





# Names of the models whose forecasts are combined.
model_name_1 = 'sprint2025'
model_name_2 = 'ghr-imdc-2025'
model_name_3 = 'dengue-oracle'

# Location: `adm_1` filters the predictions, `state` is the matching
# state abbreviation.
adm_1 = 33
state = 'RJ'

# Start date of epidemiological week 41 of 2022.
start_date = Week(2022, 41).startdate()

# One request per model, issued in the order the names are listed.
df1, df2, df3 = (
    get_prediction_df(api_key, name, adm_1, start_date)
    for name in (model_name_1, model_name_2, model_name_3)
)

# Numeric identifiers used to tell the models apart after concatenation.
df1['model_id'] = 1
df2['model_id'] = 2
df3['model_id'] = 3

stacked = pd.concat([df1, df2, df3], ignore_index=True)

# Truncate at the earliest of the three models' last forecast dates.
last_common_date = min(frame.date.max() for frame in (df1, df2, df3))
df_end = stacked.loc[stacked.date <= last_common_date]

df_end.head()
# Names of the models whose forecasts are combined.
model_name_1 = 'sprint2025'
model_name_2 = 'ghr-imdc-2025'
model_name_3 = 'dengue-oracle'

# Location: `adm_1` filters the predictions, `state` is the matching
# state abbreviation.
adm_1 = 33
state = 'RJ'

# Start date of epidemiological week 41 of 2022.
start_date = Week(2022, 41).startdate()

# One request per model, issued in the order the names are listed.
df1, df2, df3 = (
    get_prediction_df(api_key, name, adm_1, start_date)
    for name in (model_name_1, model_name_2, model_name_3)
)

# Numeric identifiers used to tell the models apart after concatenation.
df1['model_id'] = 1
df2['model_id'] = 2
df3['model_id'] = 3

stacked = pd.concat([df1, df2, df3], ignore_index=True)

# Truncate at the earliest of the three models' last forecast dates.
last_common_date = min(frame.date.max() for frame in (df1, df2, df3))
df_end = stacked.loc[stacked.date <= last_common_date]

df_end.head()

100%|██████████| 1/1 [00:00<00:00,  1.36requests/s]

Out[4]:

	date	lower_95	lower_90	lower_80	lower_50	pred	upper_50	upper_80	upper_90	upper_95	model_id
0	2022-10-09	6.389023	8.961261	13.852453	20.256981	28.466873	38.008925	66.325197	74.152424	80.334717	1
1	2022-10-16	6.272326	7.555628	12.537025	19.355414	27.877567	36.126831	55.419285	64.354642	78.477453	1
2	2022-10-23	7.779501	11.393761	15.768014	20.381115	28.971523	42.513184	61.251280	68.193910	84.940634	1
3	2022-10-30	4.893708	8.822120	15.315052	19.684608	26.731152	34.782760	61.070226	68.887554	72.060513	1
4	2022-11-06	6.986582	8.685065	14.000801	20.180920	27.931368	36.116859	57.462316	65.915041	69.303849	1

Load the data for the same period:

In [5]:

Copied!

# Observed dengue cases for `state`, from the forecast start up to the last
# date kept in `df_end`. The raw table is reduced to two columns, renamed,
# and summed per date (presumably several rows share a date, e.g. one per
# municipality -- TODO confirm).
df_obs = (
    mosq.get_infodengue(
        api_key=api_key,
        disease='dengue',
        uf=state,
        start_date=start_date,
        end_date=df_end.date.max(),
    )
    .loc[:, ['data_iniSE', 'casprov']]
    .rename(columns={'data_iniSE': 'date', 'casprov': 'casos'})
    .groupby(['date'])[['casos']]
    .sum()
    .reset_index()
    # Parse the dates so they are datetime values rather than raw strings/objects.
    .assign(date=lambda obs: pd.to_datetime(obs['date']))
)

df_obs.head()
# Observed dengue cases for `state`, from the forecast start up to the last
# date kept in `df_end`. The raw table is reduced to two columns, renamed,
# and summed per date (presumably several rows share a date, e.g. one per
# municipality -- TODO confirm).
df_obs = (
    mosq.get_infodengue(
        api_key=api_key,
        disease='dengue',
        uf=state,
        start_date=start_date,
        end_date=df_end.date.max(),
    )
    .loc[:, ['data_iniSE', 'casprov']]
    .rename(columns={'data_iniSE': 'date', 'casprov': 'casos'})
    .groupby(['date'])[['casos']]
    .sum()
    .reset_index()
    # Parse the dates so they are datetime values rather than raw strings/objects.
    .assign(date=lambda obs: pd.to_datetime(obs['date']))
)

df_obs.head()

100%|██████████| 15/15 [00:06<00:00,  2.27requests/s]

Out[5]:

	date	casos
0	2022-10-09	106
1	2022-10-16	106
2	2022-10-23	104
3	2022-10-30	87
4	2022-11-06	92

Untrained ensembles:¶

Compute the following ensemble forecasts:

Equally weighted linear mixture of log-normal distributions;
Equally weighted logarithmic mixture of log-normal distributions;
Vincentization, obtained by taking the median of the corresponding quantiles across all models.

The first two methods require parameterizing each forecast as a log-normal distribution. As a result, the quantiles of the fitted distribution may not exactly match the original forecast quantiles. More details about the parametric approximation are available here: https://api.mosqlimate.org/docs/vis/dashboard/details/

In [6]:

Copied!





# Model identifiers, in the order the weights below refer to them.
model_order = [1, 2, 3]

# Logarithmic pool of log-normal approximations of each forecast.
ens_log = EnsembleDistPool(
    df=df_end,
    order_models=list(model_order),
    mixture="log",
    dist="log_normal",
)

# Linear pool of the same log-normal approximations.
ens_lin = EnsembleDistPool(
    df=df_end,
    order_models=list(model_order),
    mixture="linear",
    dist="log_normal",
)

# Equal weight (1/3) for each of the three models.
equal_w = np.full(len(model_order), 1 / len(model_order))

df_ens_equal_log = ens_log.apply_ensemble(weights=equal_w)

df_ens_equal_lin = ens_lin.apply_ensemble(weights=equal_w)

# Quantile-based ensemble computed directly from the stacked forecasts.
df_v = ensemble_vincentization(df_end)
# Model identifiers, in the order the weights below refer to them.
model_order = [1, 2, 3]

# Logarithmic pool of log-normal approximations of each forecast.
ens_log = EnsembleDistPool(
    df=df_end,
    order_models=list(model_order),
    mixture="log",
    dist="log_normal",
)

# Linear pool of the same log-normal approximations.
ens_lin = EnsembleDistPool(
    df=df_end,
    order_models=list(model_order),
    mixture="linear",
    dist="log_normal",
)

# Equal weight (1/3) for each of the three models.
equal_w = np.full(len(model_order), 1 / len(model_order))

df_ens_equal_log = ens_log.apply_ensemble(weights=equal_w)

df_ens_equal_lin = ens_lin.apply_ensemble(weights=equal_w)

# Quantile-based ensemble computed directly from the stacked forecasts.
df_v = ensemble_vincentization(df_end)

Concatenate the predictions:

In [7]:

Copied!





# Display label -> forecast frame, in plotting order.
forecasts_by_label = {
    'Model 1': df1,
    'Model 2': df2,
    'Model 3': df3,
    'Ens equal lin': df_ens_equal_lin,
    'Ens equal log': df_ens_equal_log,
    'Vincentization': df_v,
}
names = list(forecasts_by_label)
dfs = list(forecasts_by_label.values())

# `assign` returns a copy, so the source frames keep their own `model_id`.
df_preds = pd.concat(
    [frame.assign(model_id=label) for label, frame in forecasts_by_label.items()],
    ignore_index=True,
)

df_preds.head()
# Display label -> forecast frame, in plotting order.
forecasts_by_label = {
    'Model 1': df1,
    'Model 2': df2,
    'Model 3': df3,
    'Ens equal lin': df_ens_equal_lin,
    'Ens equal log': df_ens_equal_log,
    'Vincentization': df_v,
}
names = list(forecasts_by_label)
dfs = list(forecasts_by_label.values())

# `assign` returns a copy, so the source frames keep their own `model_id`.
df_preds = pd.concat(
    [frame.assign(model_id=label) for label, frame in forecasts_by_label.items()],
    ignore_index=True,
)

df_preds.head()

Out[7]:

	date	lower_95	lower_90	lower_80	lower_50	pred	upper_50	upper_80	upper_90	upper_95	model_id
0	2022-10-09	6.389023	8.961261	13.852453	20.256981	28.466873	38.008925	66.325197	74.152424	80.334717	Model 1
1	2022-10-16	6.272326	7.555628	12.537025	19.355414	27.877567	36.126831	55.419285	64.354642	78.477453	Model 1
2	2022-10-23	7.779501	11.393761	15.768014	20.381115	28.971523	42.513184	61.251280	68.193910	84.940634	Model 1
3	2022-10-30	4.893708	8.822120	15.315052	19.684608	26.731152	34.782760	61.070226	68.887554	72.060513	Model 1
4	2022-11-06	6.986582	8.685065	14.000801	20.180920	27.931368	36.116859	57.462316	65.915041	69.303849	Model 1

Plot the predictions:

In [8]:

Copied!





# Column mapping and styling passed to `plot_preds`; `ax=None` passes no
# existing axes (presumably plot_preds then creates its own -- TODO confirm).
plot_options = dict(
    conf_level=0.90,
    data_col="casos",
    model_col="model_id",
    date_col="date",
    pred_col="pred",
    color_palette="Set2",
    linestyle="-",
    alpha=0.1,
    figsize=(10, 5),
    title='Compare ensemble techniques',
    ax=None,
)

# Observed cases together with the individual and untrained-ensemble forecasts.
plot_preds(data=df_obs, df_preds=df_preds, **plot_options)

plt.show()
# Column mapping and styling passed to `plot_preds`; `ax=None` passes no
# existing axes (presumably plot_preds then creates its own -- TODO confirm).
plot_options = dict(
    conf_level=0.90,
    data_col="casos",
    model_col="model_id",
    date_col="date",
    pred_col="pred",
    color_palette="Set2",
    linestyle="-",
    alpha=0.1,
    figsize=(10, 5),
    title='Compare ensemble techniques',
    ax=None,
)

# Observed cases together with the individual and untrained-ensemble forecasts.
plot_preds(data=df_obs, df_preds=df_preds, **plot_options)

plt.show()

No description has been provided for this image

Trained ensembles:¶

Compute the following ensemble forecasts:

Optimal weights for a linear mixture of log-normal distributions, obtained by minimizing the log score;
Optimal weights for a linear mixture of log-normal distributions, obtained by minimizing the CRPS;
Optimal weights for a logarithmic mixture of log-normal distributions, obtained by minimizing the log score;
Optimal weights for a logarithmic mixture of log-normal distributions, obtained by minimizing the CRPS.

In [9]:

Copied!





# Fit the weights of both pools against the observed cases, first with CRPS
# and then with the log score. The generators run lazily during unpacking,
# so the calls happen in the order: log/CRPS, lin/CRPS, log/LS, lin/LS.
weights_log_crps, weights_lin_crps = (
    pool.compute_weights(df_obs=df_obs, metric='crps')
    for pool in (ens_log, ens_lin)
)
weights_log_ls, weights_lin_ls = (
    pool.compute_weights(df_obs=df_obs, metric='log_score')
    for pool in (ens_log, ens_lin)
)

# Apply each fitted weight vector (the 'weights' entry of the result) to
# the pool it was fitted on.
df_ens_opt_log_crps, df_ens_opt_log_ls = (
    ens_log.apply_ensemble(weights=fit['weights'])
    for fit in (weights_log_crps, weights_log_ls)
)

df_ens_opt_lin_crps, df_ens_opt_lin_ls = (
    ens_lin.apply_ensemble(weights=fit['weights'])
    for fit in (weights_lin_crps, weights_lin_ls)
)
# Fit the weights of both pools against the observed cases, first with CRPS
# and then with the log score. The generators run lazily during unpacking,
# so the calls happen in the order: log/CRPS, lin/CRPS, log/LS, lin/LS.
weights_log_crps, weights_lin_crps = (
    pool.compute_weights(df_obs=df_obs, metric='crps')
    for pool in (ens_log, ens_lin)
)
weights_log_ls, weights_lin_ls = (
    pool.compute_weights(df_obs=df_obs, metric='log_score')
    for pool in (ens_log, ens_lin)
)

# Apply each fitted weight vector (the 'weights' entry of the result) to
# the pool it was fitted on.
df_ens_opt_log_crps, df_ens_opt_log_ls = (
    ens_log.apply_ensemble(weights=fit['weights'])
    for fit in (weights_log_crps, weights_log_ls)
)

df_ens_opt_lin_crps, df_ens_opt_lin_ls = (
    ens_lin.apply_ensemble(weights=fit['weights'])
    for fit in (weights_lin_crps, weights_lin_ls)
)

In [10]:

Copied!





# Display label -> trained-ensemble forecast, in plotting order.
trained_by_label = {
    'Ens log - CRPS': df_ens_opt_log_crps,
    'Ens log - LS': df_ens_opt_log_ls,
    'Ens lin - CRPS': df_ens_opt_lin_crps,
    'Ens lin - LS': df_ens_opt_lin_ls,
}
names = list(trained_by_label)
dfs = list(trained_by_label.values())

# Label every frame with its ensemble name and stack them into one table.
df_preds_trained = pd.concat(
    [frame.assign(model_id=label) for label, frame in trained_by_label.items()],
    ignore_index=True,
)

df_preds_trained.head()
# Display label -> trained-ensemble forecast, in plotting order.
trained_by_label = {
    'Ens log - CRPS': df_ens_opt_log_crps,
    'Ens log - LS': df_ens_opt_log_ls,
    'Ens lin - CRPS': df_ens_opt_lin_crps,
    'Ens lin - LS': df_ens_opt_lin_ls,
}
names = list(trained_by_label)
dfs = list(trained_by_label.values())

# Label every frame with its ensemble name and stack them into one table.
df_preds_trained = pd.concat(
    [frame.assign(model_id=label) for label, frame in trained_by_label.items()],
    ignore_index=True,
)

df_preds_trained.head()

Out[10]:

	lower_95	lower_90	lower_80	lower_50	pred	upper_50	upper_80	upper_90	upper_95	date	model_id
0	5.686298	7.740720	11.046359	20.011419	38.725756	74.941420	135.762765	193.739624	263.736472	2022-10-09	Ens log - CRPS
1	6.884354	9.441314	13.588758	24.971063	49.095991	96.528383	177.383123	255.305183	350.129627	2022-10-16	Ens log - CRPS
2	6.268130	8.553065	12.239099	22.273909	43.323899	84.267211	153.357710	219.448831	299.444992	2022-10-23	Ens log - CRPS
3	7.543412	9.899772	13.543714	22.865249	40.914407	73.211039	123.598938	169.093661	221.913999	2022-10-30	Ens log - CRPS
4	7.797462	10.125508	13.684631	22.637129	39.599413	69.271749	114.589385	154.867630	201.105640	2022-11-06	Ens log - CRPS

Plot the predictions:

In [11]:

Copied!





# Column mapping and styling passed to `plot_preds`; `ax=None` passes no
# existing axes (presumably plot_preds then creates its own -- TODO confirm).
trained_plot_options = dict(
    conf_level=0.90,
    data_col="casos",
    model_col="model_id",
    date_col="date",
    pred_col="pred",
    color_palette="Set2",
    linestyle="-",
    alpha=0.1,
    figsize=(10, 5),
    title='Compare trained ensemble techniques',
    ax=None,
)

# Observed cases together with the four trained-ensemble forecasts.
plot_preds(data=df_obs, df_preds=df_preds_trained, **trained_plot_options)

plt.show()
# Column mapping and styling passed to `plot_preds`; `ax=None` passes no
# existing axes (presumably plot_preds then creates its own -- TODO confirm).
trained_plot_options = dict(
    conf_level=0.90,
    data_col="casos",
    model_col="model_id",
    date_col="date",
    pred_col="pred",
    color_palette="Set2",
    linestyle="-",
    alpha=0.1,
    figsize=(10, 5),
    title='Compare trained ensemble techniques',
    ax=None,
)

# Observed cases together with the four trained-ensemble forecasts.
plot_preds(data=df_obs, df_preds=df_preds_trained, **trained_plot_options)

plt.show()