Ensemble

`EnsembleDistPool` ¶

A class to compute the weights and apply the ensemble of multiple models.

Attributes¶

df : pd.DataFrame Processed DataFrame containing model predictions. dist : str The distribution type used for modeling. Currently, it only accepts 'log_normal'. order_models : list List of models in a specific order for weight computation.

Methods¶

compute_weights(df_obs: pd.DataFrame, metric: str = 'crps') Computes the weights for the ensemble based on observed data and a specified metric.

apply_ensemble(weights: np.array = None, p: np.array = ...) Computes the final ensemble distribution, evaluated at the probabilities in p, using either precomputed or provided weights.

Source code in mosqlient/forecast/ensemble.py

class EnsembleDistPool:
    """
    Compute ensemble weights and apply the pooled ensemble of multiple models.

    Attributes
    ------------
    df : pd.DataFrame
        Processed DataFrame containing model predictions, sorted by
        `model_id` (in the order given by `order_models`) and `date`.
    dist : str
        The distribution type used for modeling. Currently, it only accepts 'log_normal'.
    mixture : str
        How the predictions are combined: 'linear' or 'log'.
    order_models : list
        List of models in a specific order for weight computation.
    weights : dict
        Result of the last `compute_weights` call. Only set after that call.

    Methods
    ---------
    compute_weights(df_obs: pd.DataFrame, metric: str = 'crps', bounds: tuple = (-100, 100))
        Computes the weights for the ensemble based on observed data and a specified metric.

    apply_ensemble(weights: np.array = None, p: np.array = ...)
        Computes the final ensemble distribution using either precomputed or provided weights.

    """

    def __init__(
        self,
        df: pd.DataFrame,
        order_models: list,
        mixture: str = "log",
        dist: str = "log_normal",
    ):
        """
        Process the input DataFrame and store the ensemble configuration.

        Parameters
        ----------
        df : pd.DataFrame
            Either a five-column DataFrame with the columns listed in
            `cols_preds_before_update`, or a wider DataFrame containing the
            columns listed in `cols_preds_complete`. The input is copied and
            never modified in place.
        order_models : list
            List defining the order of models for weight computation.
        mixture: str
            Determine how the predictions are combined. Choose `linear` for a weighted
            linear mixture or `log` for logarithmic pooling.
        dist : str, optional
            The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'.

        Raises
        ------
        ValueError
            If the input DataFrame does not contain the required columns.
        """
        present = set(df.columns)
        five_columns = len(df.columns) == 5

        if five_columns:
            missing = set(cols_preds_before_update) - present
            if missing:
                raise ValueError(
                    "Missing required keys in the df:" f"{missing}"
                )
        else:
            # Report the columns that are really missing from the complete
            # layout (the old message diffed against the wrong column list).
            missing = set(cols_preds_complete) - present
            if missing:
                raise ValueError(
                    "Missing required keys in the pred:" f"{missing}"
                )

        # Always work on a copy so the caller's frame is never mutated by the
        # `model_id` re-typing below, whatever `dist` is.
        df = df.copy()

        if dist == "log_normal":
            if five_columns:
                # The fitting loss is fixed to 'median' for this input layout.
                df = get_df_pars(
                    df, conf_level=0.9, dist=dist, fn_loss="median"
                )
            else:
                df = get_df_pars_ls(df)

        # organize the dataframe:
        df["model_id"] = pd.Categorical(
            df["model_id"], categories=order_models, ordered=True
        )
        df = df.sort_values(by=["model_id", "date"])

        self.df = df
        self.dist = dist
        self.mixture = mixture
        self.order_models = order_models

    def compute_weights(
        self,
        df_obs: pd.DataFrame,
        metric: str = "crps",
        bounds: tuple = (-100, 100),
    ) -> dict:
        """
        Computes the optimal weights for the ensemble based on observed data and a specified metric.

        The result is also stored on the instance as `self.weights`.

        Parameters
        ------------
        df_obs : pd.DataFrame
            DataFrame containing observed values with columns `date` and `casos`.
        metric : str, optional
            Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'.
        bounds: tuple
            Tuple where the first element represents the minimum value and the second
            represents the maximum value for the bounds.

        Returns
        -------
        dict
            Dictionary containing the computed weights for each model and the loss value.

        Raises
        ------
        ValueError
            If `self.mixture` is neither 'linear' nor 'log'.
        """
        if self.mixture == "linear":
            optimizer = find_opt_weights_linear
        elif self.mixture == "log":
            optimizer = find_opt_weights_log
        else:
            raise ValueError(
                f"Unknown mixture {self.mixture!r}; expected 'linear' or 'log'."
            )

        # `assign` returns a new frame, so the date conversion never writes
        # into a slice of `self.df`.
        preds = self.df[["date", "mu", "sigma", "model_id"]].assign(
            date=lambda frame: pd.to_datetime(frame["date"])
        )

        weights = optimizer(
            obs=df_obs,
            preds=preds,
            order_models=self.order_models,
            dist=self.dist,
            metric=metric,
            bounds=bounds,
        )

        self.weights = weights

        return weights

    def apply_ensemble(
        self,
        weights: Union[None, NDArray[np.float64]] = None,
        p: NDArray[np.float64] = np.array(
            [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]
        ),
    ) -> pd.DataFrame:
        """
        Computes the final ensemble distribution using either precomputed or user-provided weights.

        Parameters
        ----------
        weights : np.array
            Array containing weights for each model. If None, uses the
            `"weights"` entry stored by `compute_weights`.

        p: np.array
            Probabilities at which the ensemble quantiles are evaluated.

        Returns
        -------
        pd.DataFrame
            One row per unique date in `self.df`, with the columns returned by
            `get_ci_columns(p)` plus a datetime `date` column.

        Raises
        ------
        ValueError
            If no weights are available, or if `self.mixture` is neither
            'linear' nor 'log'.
        """

        if weights is None:
            try:
                weights = self.weights["weights"]
            except (AttributeError, KeyError, TypeError) as err:
                raise ValueError(
                    "Weights must be computed first using `compute_weights`, or provided explicitly."
                ) from err

        weights = cast(NDArray[np.float64], weights)

        if self.mixture not in ("log", "linear"):
            raise ValueError(
                f"Unknown mixture {self.mixture!r}; expected 'linear' or 'log'."
            )

        columns = get_ci_columns(p)

        preds = self.df

        # Collect the per-date rows and concatenate once at the end instead of
        # growing a DataFrame inside the loop.
        per_date_frames = []

        for d in preds.date.unique():
            preds_ = preds.loc[preds.date == d]

            if self.mixture == "log":
                quantiles = get_quantiles_log(
                    self.dist,
                    weights=weights,
                    ms=preds_.mu,
                    vs=preds_.sigma**2,
                    p=p,
                )
            else:
                quantiles = get_quantiles_linear(
                    self.dist, weights=weights, preds=preds_, p=p
                )

            df_ = pd.DataFrame([quantiles], columns=columns)
            df_["date"] = d
            per_date_frames.append(df_)

        if per_date_frames:
            df_for = pd.concat(per_date_frames, axis=0).reset_index(drop=True)
        else:
            # No predictions at all: return an empty frame with the expected
            # columns instead of failing on the missing `date` column.
            df_for = pd.DataFrame(columns=[*columns, "date"])

        df_for["date"] = pd.to_datetime(df_for["date"])

        return df_for

`init(df, order_models, mixture='log', dist='log_normal')` ¶

Initializes the Ensemble class by processing the input DataFrame and defining key attributes.

Parameters¶

df : pd.DataFrame DataFrame containing columns date, pred, lower, upper, and model_id. order_models : list List defining the order of models for weight computation. mixture : str, optional Determines how the predictions are combined. Choose linear for a weighted linear mixture or log for logarithmic pooling. Default is 'log'. dist : str, optional The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'. Note: fn_loss is not a constructor argument; when a five-column DataFrame is given, the distribution parameters are fitted with the 'median' loss.

Raises¶

ValueError If the input DataFrame does not contain the required columns.

Source code in mosqlient/forecast/ensemble.py

def __init__(
    self,
    df: pd.DataFrame,
    order_models: list,
    mixture: str = "log",
    dist: str = "log_normal",
):
    """
    Process the input DataFrame and store the ensemble configuration.

    Parameters
    ----------
    df : pd.DataFrame
        Either a five-column DataFrame with the columns listed in
        `cols_preds_before_update`, or a wider DataFrame containing the
        columns listed in `cols_preds_complete`. The input is copied and
        never modified in place.
    order_models : list
        List defining the order of models for weight computation.
    mixture: str
        Determine how the predictions are combined. Choose `linear` for a weighted
        linear mixture or `log` for logarithmic pooling.
    dist : str, optional
        The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'.

    Raises
    ------
    ValueError
        If the input DataFrame does not contain the required columns.
    """
    present = set(df.columns)
    five_columns = len(df.columns) == 5

    if five_columns:
        missing = set(cols_preds_before_update) - present
        if missing:
            raise ValueError("Missing required keys in the df:" f"{missing}")
    else:
        # Report the columns that are really missing from the complete layout
        # (the old message diffed against the wrong column list).
        missing = set(cols_preds_complete) - present
        if missing:
            raise ValueError("Missing required keys in the pred:" f"{missing}")

    # Always work on a copy so the caller's frame is never mutated by the
    # `model_id` re-typing below, whatever `dist` is.
    df = df.copy()

    if dist == "log_normal":
        if five_columns:
            # The fitting loss is fixed to 'median' for this input layout.
            df = get_df_pars(df, conf_level=0.9, dist=dist, fn_loss="median")
        else:
            df = get_df_pars_ls(df)

    # organize the dataframe:
    df["model_id"] = pd.Categorical(
        df["model_id"], categories=order_models, ordered=True
    )
    df = df.sort_values(by=["model_id", "date"])

    self.df = df
    self.dist = dist
    self.mixture = mixture
    self.order_models = order_models

`apply_ensemble(weights=None, p=np.array([0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]))` ¶

Computes the final ensemble distribution using either precomputed or user-provided weights.

Parameters¶

weights : np.array Array containing weights for each model. If None, uses precomputed weights.

p : np.array Probabilities (between 0 and 1) at which the ensemble quantiles are returned.

Returns¶

pd.DataFrame DataFrame containing the ensemble predictions with quantiles (pred, lower, upper).

Source code in mosqlient/forecast/ensemble.py

def apply_ensemble(
    self,
    weights: Union[None, NDArray[np.float64]] = None,
    p: NDArray[np.float64] = np.array(
        [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]
    ),
) -> pd.DataFrame:
    """
    Computes the final ensemble distribution using either precomputed or user-provided weights.

    Parameters
    ----------
    weights : np.array
        Array containing weights for each model. If None, uses the
        `"weights"` entry stored by `compute_weights`.

    p: np.array
        Probabilities at which the ensemble quantiles are evaluated.

    Returns
    -------
    pd.DataFrame
        One row per unique date in `self.df`, with the columns returned by
        `get_ci_columns(p)` plus a datetime `date` column.

    Raises
    ------
    ValueError
        If no weights are available, or if `self.mixture` is neither
        'linear' nor 'log'.
    """

    if weights is None:
        try:
            weights = self.weights["weights"]
        except (AttributeError, KeyError, TypeError) as err:
            raise ValueError(
                "Weights must be computed first using `compute_weights`, or provided explicitly."
            ) from err

    weights = cast(NDArray[np.float64], weights)

    if self.mixture not in ("log", "linear"):
        raise ValueError(
            f"Unknown mixture {self.mixture!r}; expected 'linear' or 'log'."
        )

    columns = get_ci_columns(p)

    preds = self.df

    # Collect the per-date rows and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    per_date_frames = []

    for d in preds.date.unique():
        preds_ = preds.loc[preds.date == d]

        if self.mixture == "log":
            quantiles = get_quantiles_log(
                self.dist,
                weights=weights,
                ms=preds_.mu,
                vs=preds_.sigma**2,
                p=p,
            )
        else:
            quantiles = get_quantiles_linear(
                self.dist, weights=weights, preds=preds_, p=p
            )

        df_ = pd.DataFrame([quantiles], columns=columns)
        df_["date"] = d
        per_date_frames.append(df_)

    if per_date_frames:
        df_for = pd.concat(per_date_frames, axis=0).reset_index(drop=True)
    else:
        # No predictions at all: return an empty frame with the expected
        # columns instead of failing on the missing `date` column.
        df_for = pd.DataFrame(columns=[*columns, "date"])

    df_for["date"] = pd.to_datetime(df_for["date"])

    return df_for

`compute_weights(df_obs, metric='crps', bounds=(-100, 100))` ¶

Computes the optimal weights for the ensemble based on observed data and a specified metric.

Parameters¶

df_obs : pd.DataFrame DataFrame containing observed values with columns date and casos. metric : str, optional Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'. bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Returns¶

dict Dictionary containing the computed weights for each model and the loss value.

Source code in mosqlient/forecast/ensemble.py

def compute_weights(
    self,
    df_obs: pd.DataFrame,
    metric: str = "crps",
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Computes the optimal weights for the ensemble based on observed data and a specified metric.

    The result is also stored on the instance as `self.weights`.

    Parameters
    ------------
    df_obs : pd.DataFrame
        DataFrame containing observed values with columns `date` and `casos`.
    metric : str, optional
        Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'.
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Returns
    -------
    dict
        Dictionary containing the computed weights for each model and the loss value.

    Raises
    ------
    ValueError
        If `self.mixture` is neither 'linear' nor 'log'.
    """
    if self.mixture == "linear":
        optimizer = find_opt_weights_linear
    elif self.mixture == "log":
        optimizer = find_opt_weights_log
    else:
        raise ValueError(
            f"Unknown mixture {self.mixture!r}; expected 'linear' or 'log'."
        )

    # `assign` returns a new frame, so the date conversion never writes into
    # a slice of `self.df`.
    preds = self.df[["date", "mu", "sigma", "model_id"]].assign(
        date=lambda frame: pd.to_datetime(frame["date"])
    )

    weights = optimizer(
        obs=df_obs,
        preds=preds,
        order_models=self.order_models,
        dist=self.dist,
        metric=metric,
        bounds=bounds,
    )

    self.weights = weights

    return weights

`alpha_01(alpha_inv)` ¶

Function that maps from R^n to the open simplex.

Parameters¶

alpha_inv: array of float

Returns¶

array Vector on the (n+1) open simplex.

Source code in mosqlient/forecast/ensemble.py

def alpha_01(alpha_inv: NDArray[np.float64]) -> NDArray[np.float64]:
    """
    Map an unconstrained vector of length n to n + 1 weights that sum to one.

    Each of the first n weights takes a fraction of the mass not yet assigned
    to earlier weights; the last weight receives whatever is left.

    Parameters
    -----------
    alpha_inv: array of float
        Unconstrained values, one per weight except the last.

    Returns
    --------
    array
        Vector with ``len(alpha_inv) + 1`` entries summing to one.
    """
    n_weights = len(alpha_inv) + 1
    simplex = np.zeros(n_weights)

    for pos in range(n_weights - 1):
        # The log offset depends on how many weights still follow this one.
        slots_after = n_weights - (pos + 1)
        fraction = invlogit(alpha_inv[pos] + np.log(1 / slots_after))
        unassigned = 1 - np.sum(simplex[:pos])
        simplex[pos] = unassigned * fraction

    # The last weight absorbs the remaining mass.
    simplex[n_weights - 1] = 1 - np.sum(simplex[:-1])
    return simplex

`compute_ppf(mu, sigma, weights, p=np.array([0.5, 0.05, 0.95]))` ¶

Compute the Percent-Point Function (PPF), which is the inverse of the CDF, for a mixture of lognormal distributions.

The function takes the parameters of a lognormal mixture (mean, standard deviation, and weights) and returns the mixture quantiles at the probabilities given in p; by default these are the 50th, 5th, and 95th percentiles, in that order.

Parameters¶

mu: np.array
    Mean values (in log-space) for the lognormal components of the mixture.
sigma: np.array
    Standard deviation values (in log-space) for the lognormal components of the mixture.
weights: np.array
    Weights of each component in the lognormal mixture. These should sum to 1.

Returns¶

np.array
The x-values corresponding to the probabilities in p (by default the 50th, 5th, and 95th percentiles, in that order).

Source code in mosqlient/forecast/ensemble.py

def compute_ppf(
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
) -> NDArray[np.float64]:
    """
    Numerically invert the CDF of a lognormal mixture.

    The mixture density is evaluated on a fixed grid of 10**5 points between
    1e-6 and 10**5, rescaled to unit mass, integrated into a CDF and then
    inverted by interpolation.

    Parameters
    ------------
        mu: np.array
            Mean values (in log-space) for the lognormal components of the mixture.
        sigma: np.array
            Standard deviation values (in log-space) for the lognormal components of the mixture.
        weights: np.array
            Weights of each component in the lognormal mixture. These should sum to 1.
        p: np.array
            Probabilities at which the inverse CDF is evaluated. The default
            is the 50th, 5th and 95th percentiles, in that order.

    Returns
    ---------
        np.array
        The x-values corresponding to the probabilities in `p`, in the same order.
    """
    grid = np.linspace(1e-6, 10**5, 10**5).astype(np.float64)
    density = dlnorm_mix(grid, mu, sigma, weights, log=False)

    # Cell widths: forward differences, with the last width repeated so the
    # array lines up with `grid`.
    steps = np.diff(grid)
    widths = np.append(steps, steps[-1])

    # Rescale the density by its rectangle-sum mass on the grid.
    total_mass = np.sum(density * widths)
    unit_density = density / total_mass

    # Trapezoidal cumulative integral gives the CDF on the grid.
    cdf_on_grid = cumulative_trapezoid(unit_density, grid, initial=0)

    # Swap the axes of the tabulated CDF to obtain the PPF; probabilities
    # outside the tabulated range are linearly extrapolated.
    inverse_cdf = interp1d(
        cdf_on_grid, grid, bounds_error=False, fill_value="extrapolate"
    )

    return inverse_cdf(p)

`crps_lognormal_mix(obs, mu, sigma, weights)` ¶

Compute the score of a mix of lognormal distributions.

Parameters¶

obs: np.array or float
    Values where the mixture score is evaluated.
mu: np.array
    Mu parameter (in log-space) for the lognormal components.
sigma: np.array
    Standard deviations (in log-space) for the lognormal components.
weights: array-like
    Mixture weights (must sum to 1).

Returns¶

float
The score evaluated.

Source code in mosqlient/forecast/ensemble.py

def crps_lognormal_mix(
    obs: Union[float, NDArray[np.float64]],
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
) -> float:
    """
    Weighted sum of the per-component lognormal CRPS values.

    Parameters
    ------------
        obs: np.array or float
            Values where the mixture score is evaluated.
        mu: np.array
            Mu parameter (in log-space) for the lognormal components.
        sigma: np.array
            Standard deviations (in log-space) for the lognormal components.
        weights: array-like
            Mixture weights (must sum to 1).

    Returns
    ------------
        float
        Dot product of `weights` with the component scores.

    Raises
    ------------
        ValueError
        If `mu` and `sigma` differ in length.
    """
    n_components = len(mu)

    if n_components != len(sigma):
        raise ValueError("mu and sigma should be the same length")

    # One score per component, each evaluated at the same observation(s).
    component_scores = [
        crps_lognormal(obs, mu[idx], sigma[idx])
        for idx in np.arange(n_components)
    ]

    return np.dot(np.array(weights), np.array(component_scores))

`dlnorm_mix(obs, mu, sigma, weights, log=False)` ¶

Compute the PDF or log-PDF of a mixture of lognormal distributions at the given observation values.

Parameters¶

obs: np.array or float
    Values where the mixture density is evaluated. Can be a single value or an array.
mu: np.array
    Mu parameter (in log-space) for the lognormal components.
sigma: np.array
    Standard deviations (in log-space) for the lognormal components.
weights: array-like
    Mixture weights (must sum to 1).
log: bool
    Whether to return the log-density.

Returns¶

array: The mixture density or log-density evaluated at obs.

Source code in mosqlient/forecast/ensemble.py

def dlnorm_mix(
    obs: NDArray[np.float64],
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
    log=False,
) -> float:
    """
    Evaluate the density (or log-density) of a lognormal mixture.

    Parameters
    ------------
        obs: np.array or float
            Values where the mixture density is evaluated. Can be a single value or an array.
        mu: np.array
            Mu parameter (in log-space) for the lognormal components.
        sigma: np.array
            Standard deviations (in log-space) for the lognormal components.
        weights: array-like
            Mixture weights (must sum to 1).
        log: bool
            Whether to return the log-density.

    Returns
    ------------
        A plain float when a single value results, otherwise an array with
        one entry per element of `obs`.

    Raises
    ------------
        ValueError
        If `mu`, `sigma` and `weights` do not all have the same length.
    """
    points = np.atleast_1d(obs)
    log_weights = np.log(weights)
    n_components = len(mu)

    if not (len(sigma) == n_components and len(weights) == n_components):
        raise ValueError("mu, sigma, and weights must have the same length")

    # One row of log-densities per component, transposed so that components
    # run along the last axis and line up with `log_weights`.
    per_component = [
        lognorm.logpdf(points, s=sigma[k], scale=np.exp(mu[k]))
        for k in range(n_components)
    ]
    log_dens = np.array(per_component).T

    # logsumexp keeps the weighted sum stable for very small densities.
    log_mixture = logsumexp(log_weights + log_dens, axis=1)
    result = log_mixture if log else np.exp(log_mixture)

    if result.size > 1:
        return result
    return result.item()

`ensemble_vincentization(df_preds, models=None, index_cols=['date'], model_col='model_id')` ¶

Construct a median ensemble forecast using Vincentization.

Vincentization combines forecasts by taking the median of corresponding quantiles across a set of models. For each forecast horizon (defined by index_cols), the ensemble quantiles are obtained as the median of the same quantile from all included models.

The function expects the following quantile columns to be present:

lower_95
lower_90
lower_80
lower_50
pred (point forecast / median)
upper_50
upper_80
upper_90
upper_95

After aggregation, the resulting quantiles are checked for monotonicity. An exception is raised if any forecast has crossing quantiles.

Parameters¶

df_preds : pandas.DataFrame DataFrame containing forecasts from multiple models. Each row corresponds to a model forecast and must include the quantile columns listed above, along with the columns specified by index_cols and model_col.

models : list-like, optional, default=None

Subset of model identifiers to include in the ensemble. If None, all available models are used.

index_cols : list of str, optional, default=["date"]

Columns defining a unique forecast target (e.g., forecast date, location, horizon). Forecasts are aggregated separately for each unique combination of these columns.

model_col : str, optional, default="model_id"

Name of the column containing model identifiers.

Returns¶

pandas.DataFrame Ensemble forecast DataFrame containing index_cols and the aggregated quantile columns. Each quantile is the median across models for the corresponding forecast target.

Raises¶

Exception If the resulting ensemble contains non-monotonic quantiles (i.e., quantile crossing).

Source code in mosqlient/forecast/ensemble.py

def ensemble_vincentization(
    df_preds, models=None, index_cols=["date"], model_col="model_id"
):
    """
    Construct a median ensemble forecast using Vincentization.

    Vincentization combines forecasts by taking the median of corresponding
    quantiles across a set of models. For each forecast target (defined by
    ``index_cols``), the ensemble quantiles are obtained as the median of the
    same quantile from all included models.

    The function expects the following quantile columns to be present:

    - ``lower_95``
    - ``lower_90``
    - ``lower_80``
    - ``lower_50``
    - ``pred`` (point forecast / median)
    - ``upper_50``
    - ``upper_80``
    - ``upper_90``
    - ``upper_95``

    After aggregation, the resulting quantiles are checked for monotonicity.
    An exception is raised if any forecast has crossing quantiles.

    Parameters
    ----------
    df_preds : pandas.DataFrame
        DataFrame containing forecasts from multiple models. Each row
        corresponds to a model forecast and must include the quantile columns
        listed above, along with the columns specified by ``index_cols`` and
        ``model_col``.

    models : list-like, optional, default=None
        Subset of model identifiers to include in the ensemble. If ``None``,
        all available models are used.

    index_cols : list of str, optional, default=["date"]
        Columns defining a unique forecast target (e.g., forecast date,
        location, horizon). Forecasts are aggregated separately for each
        unique combination of these columns. The argument is only read,
        never modified.

    model_col : str, optional, default="model_id"
        Name of the column containing model identifiers.

    Returns
    -------
    pandas.DataFrame
        Ensemble forecast DataFrame containing ``index_cols`` and the
        aggregated quantile columns. Each quantile is the median across models
        for the corresponding forecast target.

    Raises
    ------
    ValueError
        If the resulting ensemble contains non-monotonic quantiles
        (i.e., quantile crossing). ``ValueError`` is a subclass of
        ``Exception``, so handlers written for ``Exception`` still apply.

    """
    # Listed from the lowest to the highest quantile; the monotonicity check
    # below relies on this order.
    quantile_cols = [
        "lower_95",
        "lower_90",
        "lower_80",
        "lower_50",
        "pred",
        "upper_50",
        "upper_80",
        "upper_90",
        "upper_95",
    ]

    if models is not None:
        df_preds = df_preds.loc[df_preds[model_col].isin(models)]

    # One Series per quantile: pivot to (target x model) and take the median
    # across models.
    medians = [
        df_preds.pivot(index=index_cols, columns=model_col, values=col)
        .median(axis=1)
        .rename(col)
        for col in quantile_cols
    ]

    df_median_ens = pd.concat(medians, axis=1).reset_index()

    # Each quantile must not exceed the next one. A NaN on either side makes
    # the comparison False, so it is flagged as a violation.
    quantile_order = pd.Series(True, index=df_median_ens.index)
    for lower_q, upper_q in zip(quantile_cols, quantile_cols[1:]):
        quantile_order &= df_median_ens[lower_q] <= df_median_ens[upper_q]

    # `not` rather than `~`: bitwise-not of a plain Python bool is always
    # truthy, which would raise on every call.
    if not quantile_order.all():
        raise ValueError(
            "The ensemble includes quantile values that violate monotonicity."
        )

    return df_median_ens

`find_opt_weights_linear(obs, preds, order_models, dist, metric, bounds=(-100, 100))` ¶

Find the weights of a linear mix distributions that minimizes the metric selected.

Parameters¶

obs: pd.Dataframe Dataframe with the columns: date and casos preds: pd.Dataframe Dataframe with the columns: date, mu, sigma, model_id order_models : list List defining the order of models for weight computation. dist : str, optional The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'. metric : str, optional Metric used for optimization. Options: crps, log_score. bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Return¶

dict A dictionary containing: - weights: The optimized weights for the models. - loss: The minimized loss value based on the selected metric.

Source code in mosqlient/forecast/ensemble.py

def find_opt_weights_linear(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    dist: str,
    metric: str,
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Find the weights of a linear mix distributions that minimizes the metric selected.

    Parameters
    -----------
    obs: pd.Dataframe
        Dataframe with the columns: `date` and `casos`
    preds: pd.Dataframe
        Dataframe with the columns: `date`, `mu`, `sigma`, `model_id`
    order_models : list
        List defining the order of models for weight computation.
    dist : str
        The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'.
    metric : str
        Metric used for optimization. Options: `crps`, `log_score`.
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Return
    -------
    dict
        A dictionary containing:
        - `weights`: The optimized weights for the models.
        - `loss`: The minimized loss value based on the selected metric.

    Raises
    ------
    ValueError
        If `dist` is not 'log_normal'.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # distribution has no implementation.
    if dist != "log_normal":
        raise ValueError(
            f"Unsupported dist {dist!r}; only 'log_normal' is implemented."
        )

    return find_opt_weights_linear_mix_log(
        obs, preds, order_models, metric=metric, bounds=bounds
    )

`find_opt_weights_linear_mix_log(obs, preds, order_models, metric, bounds)` ¶

Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

Parameters¶

obs: pd.Dataframe Dataframe with the columns: date and casos preds: pd.Dataframe Dataframe with the columns: date, mu, sigma, model_id order_models: list Order of the different models in the model_id column metric: str ['crps', 'log_score'] Metric used to optimize the weights bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Return¶

dict A dictionary containing: - weights: The optimized weights for the models. - loss: The minimized loss value based on the selected metric.

Source code in mosqlient/forecast/ensemble.py

def find_opt_weights_linear_mix_log(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    metric: str,
    bounds: tuple,
) -> dict:
    """
    Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

    The starting point of the optimization is drawn at random, so repeated
    calls may return slightly different results.

    Parameters
    -----------
    obs: pd.Dataframe
        Dataframe with the columns: `date` and `casos`
    preds: pd.Dataframe
        Dataframe with the columns: `date`, `mu`, `sigma`, `model_id`
    order_models: list
        Order of the different models in the model_id column
    metric: str ['crps', 'log_score']
        Metric used to optimize the weights
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Return
    -------
    dict
        A dictionary containing:
        - `weights`: The optimized weights for the models.
        - `loss`: The minimized loss value based on the selected metric.

    Raises
    ------
    ValueError
        If `metric` is neither 'crps' nor 'log_score'.
    """
    # An unknown metric would leave the loss constant at zero and return
    # meaningless weights, so reject it up front.
    if metric not in ("crps", "log_score"):
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'crps' or 'log_score'."
        )

    K = len(order_models)

    # The per-date slices do not depend on the weights, so extract them once
    # instead of on every loss evaluation.
    per_date = []
    for date in obs.date:
        preds_ = preds.loc[preds.date == date]
        per_date.append(
            (
                obs.loc[obs.date == date].casos,
                preds_["mu"].to_numpy(),
                preds_["sigma"].to_numpy(),
            )
        )

    def loss(eta):
        """
        Computes the loss function based on the selected metric.

        Parameters
        ----------
        eta : array-like
            Parameterization of the weights, transformed via `alpha_01`.

        Returns
        -------
        float
            The computed loss value.
        """
        ws = alpha_01(eta)
        # Floor the weights so the log of a weight is always finite.
        ws = np.where(ws < 1e-6, 1e-6, ws)

        score = 0
        for observed, mu, sigma in per_date:
            if metric == "log_score":
                # Negative log-density: lower is better.
                score = score - dlnorm_mix(
                    observed, mu, sigma, weights=ws, log=True
                )
            else:
                score = score + crps_lognormal_mix(
                    observed, mu, sigma, weights=ws
                )

        return score

    initial_guess = np.random.normal(size=K - 1)
    bounds_ = [bounds] * (K - 1)
    opt_result = minimize(
        loss, initial_guess, method="Nelder-mead", bounds=bounds_
    )

    # The returned weights are the un-floored transform of the optimum.
    optimal_weights = alpha_01(opt_result.x)

    return {"weights": optimal_weights, "loss": opt_result.fun}

`find_opt_weights_log(obs, preds, order_models, dist='log_normal', metric='crps', bounds=(-100, 100))` ¶

Function that generate the weights of the ensemble minimizing the metric selected.

Parameters¶

obs: pd.dataframe Dataframe with columns date and casos;

preds: pd.dataframe Dataframe with columns date, mu, sigma, and model_id

order_models: list Order of the different models in the model_id column

dist: str ['log_normal'] Distribution used to represent the forecast. Currently, it only accepts 'log_normal'.

metric: str ['crps', 'log_score'] Metric used to optimize the weights

bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Returns¶

dict The dict contains the keys: - weights: the weights that minimize the loss - loss: the value of the loss function at those weights

Source code in mosqlient/forecast/ensemble.py

def find_opt_weights_log(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    dist: str = "log_normal",
    metric: str = "crps",
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Find the weights of a logarithmic-pooling ensemble by minimizing the
    selected metric summed over every date in `obs`.

    Parameters
    -----------------
    obs: pd.DataFrame
        Dataframe with columns date and casos;

    preds: pd.DataFrame
        Dataframe with columns date, mu, sigma, and model_id

    order_models: list
        Order of the different models in the model_id column. Only its length
        is used here, as the expected number of predictions per date.

    dist: str ['log_normal']
        Distribution used to represent the forecast. Currently, it only accepts 'log_normal'.

    metric: str ['crps', 'log_score']
        Metric used to optimize the weights

    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds. The same bounds are applied
        to each of the K - 1 optimized parameters.

    Returns
    --------
    dict
    The dict contains the keys:
    - weights: the optimized weights found by minimizing the loss
    - loss: loss function value

    Raises
    -------
    ValueError
        If the number of predictions found for a date in `obs` differs from
        the number of models in `order_models`.
    """

    K = len(order_models)

    # The per-date inputs do not depend on the weights being optimized, so they
    # are extracted and validated once here rather than on every evaluation of
    # the objective function by the optimizer.
    per_date = []
    for date in obs.date:
        preds_date = preds.loc[preds.date == date]
        ms = preds_date["mu"].to_numpy()
        vs = preds_date["sigma"].to_numpy() ** 2

        if len(ms) != K:
            raise ValueError(
                f"Expected {K} model predictions for date {date}, "
                f"but found {len(ms)}."
            )

        per_date.append((obs.loc[obs.date == date].casos, ms, vs))

    def loss(eta):
        # Map the K - 1 free parameters to the K pooling weights.
        ws = alpha_01(eta)

        score = 0
        for obs_date, ms, vs in per_date:
            mu, sd = pool_par_gauss(alpha=ws, m=ms, v=vs)

            score = score + get_score(
                obs=obs_date,
                mu=mu,
                sd=sd,
                dist=dist,
                metric=metric,
            )

        return score

    # NOTE(review): the starting point is random and unseeded, so repeated
    # calls may converge to slightly different weights.
    initial_guess = np.random.normal(size=K - 1)
    bounds_ = [bounds] * (K - 1)
    opt_result = minimize(
        loss, initial_guess, method="Nelder-Mead", bounds=bounds_
    )

    optimal_weights = alpha_01(opt_result.x)

    return {"weights": optimal_weights, "loss": opt_result.fun}

`get_ci_columns(p)` ¶

Function that, given the percentile values, returns the corresponding column names.

Parameters¶

p: NDArray[np.float64] percentile values

Returns¶

List of columns name

Source code in mosqlient/forecast/ensemble.py

def get_ci_columns(p):
    """
    Map percentile values to the column names of the matching interval bounds.

    A value below 0.5 becomes ``lower_<ci>``, exactly 0.5 becomes ``pred`` and
    a value above 0.5 becomes ``upper_<ci>``, where ``<ci>`` is the width (in
    percent) of the central interval bounded by that percentile; e.g. 0.05
    gives ``lower_90`` and 0.95 gives ``upper_90``.

    Parameters
    -----------
    p: NDArray[np.float64]
    percentile values

    Returns
    --------
    List of columns name
    """

    columns = []

    for value in p:
        if value < 0.5:
            # Round away floating-point noise before truncating; otherwise
            # e.g. (1 - 2 * 0.45) * 100 == 9.999999999999998 yields "lower_9".
            width = int(round((1 - 2 * value) * 100, 6))
            columns.append(f"lower_{width}")
        elif value == 0.5:
            columns.append("pred")
        else:
            # Same safeguard: 2 * 0.575 * 100 == 114.99999999999999.
            width = int(round(2 * value * 100, 6)) - 100
            columns.append(f"upper_{width}")

    return columns

`get_epiweek(date)` ¶

Function to capture the epidemiological year and week from the date

Source code in mosqlient/forecast/ensemble.py

def get_epiweek(date):
    """
    Return the epidemiological year and week of `date` as a `(year, week)` tuple.
    """
    week = Week.fromdate(date)
    year, week_number = week.year, week.week
    return (year, week_number)

`get_quantiles_linear(dist, weights, preds, p=np.array([0.5, 0.05, 0.95]))` ¶

Function to get the quantiles of the linear mixture.

Parameters¶

dist : str, optional The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'. weights: np.array The weights assigned to each prediction. preds: pd.DataFrame The Dataframe with the predictions. p: np.array Returned percentile values

Returns¶

quantiles: np.array The quantiles obtained according to p.

Source code in mosqlient/forecast/ensemble.py

def get_quantiles_linear(
    dist: str,
    weights: NDArray[np.float64],
    preds: pd.DataFrame,
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
):
    """
    Function to get the quantiles of the linear mixture.

    Parameters
    ------------
    dist : str
        The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'.
    weights: np.array
        The weights assigned to each prediction.
    preds: pd.DataFrame
        The Dataframe with the predictions. The columns `mu` and `sigma` are read.
    p: np.array
        Returned percentile values

    Returns
    --------
    quantiles: np.array
        The quantiles obtained according to p.

    Raises
    -------
    ValueError
        If `dist` is not a supported distribution.
    """
    # Fail explicitly on an unsupported distribution instead of reaching the
    # return statement with `quantiles` never assigned.
    if dist != "log_normal":
        raise ValueError(
            f"Invalid distribution '{dist}'. Currently, only 'log_normal' is supported."
        )

    # Weights smaller than 1e-6 are replaced by 1e-6 before computing the
    # mixture quantiles.
    weights = np.asarray(weights, dtype=np.float64)
    weights = np.where(weights < 1e-6, 1e-6, weights)

    quantiles = compute_ppf(
        mu=np.asarray(preds["mu"].values, dtype=np.float64),
        sigma=np.asarray(preds["sigma"].values, dtype=np.float64),
        weights=weights,
        p=p,
    )

    return quantiles

`get_quantiles_log(dist, weights, ms, vs, p=np.array([0.5, 0.05, 0.95]))` ¶

Function to get the quantiles of a logarithmic pooling.

Parameters¶

dist : str, optional The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'. weights: np.array The weights assigned to each prediction. ms: np.array The mu parameter of each prediction. vs: np.array The variance parameter of each prediction. p: np.array Returned percentile values

Returns¶

quantiles: np.array The quantiles obtained according to p.

Source code in mosqlient/forecast/ensemble.py

def get_quantiles_log(
    dist: str,
    weights: NDArray[np.float64],
    ms: NDArray[np.float64],
    vs: NDArray[np.float64],
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
):
    """
    Function to get the quantiles of a logarithmic pooling.

    Parameters
    ------------
    dist : str
        The distribution type used for parameterizing predictions. Currently, it only accepts 'log_normal'.
    weights: np.array
        The weights assigned to each prediction.
    ms: np.array
        The mu parameter of each prediction.
    vs: np.array
        The variance parameter of each prediction.
    p: np.array
        Returned percentile values

    Returns
    --------
    quantiles: np.array
        The quantiles obtained according to p.

    Raises
    -------
    ValueError
        If `dist` is not a supported distribution.
    """
    # Fail explicitly on an unsupported distribution instead of reaching the
    # return statement with `quantiles` never assigned.
    if dist != "log_normal":
        raise ValueError(
            f"Invalid distribution '{dist}'. Currently, only 'log_normal' is supported."
        )

    # Pooled (mu, sd) parameters of the combined distribution.
    mu, sd = pool_par_gauss(alpha=weights, m=ms, v=vs)

    quantiles = lognorm.ppf(p, s=sd, scale=np.exp(mu))

    return quantiles

`get_score(obs, mu, sd, dist='log_normal', metric='crps')` ¶

Function to compute the score given a distribution and a predefined metric.

Parameters¶

obs: float The real observation.

mu: float The mu parameter of the distribution.

sd: float The sd parameter associated with the distribution.

dist: str ['log_normal'] Distribution type. Currently, it only accepts 'log_normal'.

metric: str ['crps', 'log_score'] Scoring metric, either 'crps' or 'log_score'.

Returns¶

float The computed score based on the given metric and distribution.

Source code in mosqlient/forecast/ensemble.py

def get_score(
    obs: float,
    mu: float,
    sd: float,
    dist: str = "log_normal",
    metric: str = "crps",
) -> float:
    """
    Score a forecast distribution against an observation.

    Parameters
    -----------
    obs: float
        The real observation.

    mu: float
        The mu parameter of the distribution

    sd: float
        The sd parameter associated with the distribution

    dist: str ['log_normal']
        Distribution type. Currently, it only accepts 'log_normal'.

    metric: str ['crps', 'log_score']
        Scoring metric, either 'crps' or 'log_score'.

    Returns
    --------
    float
        The computed score based on the given metric and distribution. For
        'log_score' this is the negative log-density at `obs`.

    Raises
    -------
    ValueError
        If the combination of `dist` and `metric` is not supported.
    """

    if dist == "log_normal":
        if metric == "crps":
            return crps_lognormal(obs, mu, sd)

        if metric == "log_score":
            # Negated log-density of the observation under the forecast.
            log_density = lognorm.logpdf(obs, s=sd, scale=np.exp(mu))
            return -log_density

    raise ValueError(f"Invalid distribution '{dist}' and metric '{metric}'")

`pool_par_gauss(alpha, m, v)` ¶

Function to get the output distribution from a logarithmic pool of lognormal distributions

Parameters¶

alpha : array of float Weights assigned to each distribution in the pool. m : array of float mu parameter v : array of float variance parameter Returns

tuple A tuple containing two elements. The first one is the mu and the second one the sd parameter of the distribution.

Notes¶

The logarithmic pooling method is based on the work of Carvalho, L. M., Villela, D. A., Coelho, F. C., & Bastos, L. S. (2023). Bayesian inference for the weights in logarithmic pooling. Bayesian Analysis, 18(1), 223-251.

Source code in mosqlient/forecast/ensemble.py

def pool_par_gauss(
    alpha: NDArray[np.float64], m: NDArray[np.float64], v: NDArray[np.float64]
) -> tuple:
    """
    Compute the parameters of the distribution resulting from a logarithmic
    pool of lognormal distributions.

    Parameters
    ----------
    alpha : array of float
        Weights assigned to each distribution in the pool.
    m : array of float
        mu parameter
    v : array of float
        variance parameter

    Returns
    -------
    tuple
        A tuple containing two elements. The first one is the mu and the second one the sd parameter of the distribution.

    Raises
    ------
    ValueError
        If `alpha`, `m` and `v` do not all have the same length.

    Notes
    ------
    The logarithmic pooling method is based on the work of Carvalho, L. M., Villela, D. A., Coelho, F. C., & Bastos, L. S. (2023).
    Bayesian inference for the weights in logarithmic pooling. Bayesian Analysis, 18(1), 223-251.
    """
    if len(alpha) != len(m) or len(m) != len(v):
        raise ValueError(
            "The arrays 'alpha', 'm', and 'v' must have the same length."
        )

    # Each component contributes its weight divided by its variance.
    weighted_precision = alpha / v
    pooled_var = 1 / np.sum(weighted_precision)
    pooled_mu = np.sum(weighted_precision * m) * pooled_var
    pooled_sd = np.sqrt(pooled_var)
    return pooled_mu, pooled_sd

Ensemble

EnsembleDistPool ¶

Attributes¶

Methods¶

__init__(df, order_models, mixture='log', dist='log_normal') ¶

Parameters¶

Raises¶

apply_ensemble(weights=None, p=np.array([0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975])) ¶

Parameters¶

Returns¶

compute_weights(df_obs, metric='crps', bounds=(-100, 100)) ¶

Parameters¶

Returns¶

alpha_01(alpha_inv) ¶

Parameters¶

Returns¶

compute_ppf(mu, sigma, weights, p=np.array([0.5, 0.05, 0.95])) ¶

Parameters¶

Returns¶

crps_lognormal_mix(obs, mu, sigma, weights) ¶

Parameters¶

Returns¶

dlnorm_mix(obs, mu, sigma, weights, log=False) ¶

Parameters¶

Returns¶

ensemble_vincentization(df_preds, models=None, index_cols=['date'], model_col='model_id') ¶

Parameters¶

Returns¶

Raises¶

find_opt_weights_linear(obs, preds, order_models, dist, metric, bounds=(-100, 100)) ¶

Parameters¶

Return¶

find_opt_weights_linear_mix_log(obs, preds, order_models, metric, bounds) ¶

Parameters¶

Return¶

find_opt_weights_log(obs, preds, order_models, dist='log_normal', metric='crps', bounds=(-100, 100)) ¶

Parameters¶

Returns¶

get_ci_columns(p) ¶

Parameters¶

Returns¶

get_epiweek(date) ¶

get_quantiles_linear(dist, weights, preds, p=np.array([0.5, 0.05, 0.95])) ¶

Parameters¶

Returns¶

get_quantiles_log(dist, weights, ms, vs, p=np.array([0.5, 0.05, 0.95])) ¶

Parameters¶

Returns¶

get_score(obs, mu, sd, dist='log_normal', metric='crps') ¶

Parameters¶

Returns¶

pool_par_gauss(alpha, m, v) ¶

Parameters¶

Notes¶

`EnsembleDistPool` ¶

`init(df, order_models, mixture='log', dist='log_normal')` ¶

`apply_ensemble(weights=None, p=np.array([0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]))` ¶

`compute_weights(df_obs, metric='crps', bounds=(-100, 100))` ¶

`alpha_01(alpha_inv)` ¶

`compute_ppf(mu, sigma, weights, p=np.array([0.5, 0.05, 0.95]))` ¶

`crps_lognormal_mix(obs, mu, sigma, weights)` ¶

`dlnorm_mix(obs, mu, sigma, weights, log=False)` ¶

`ensemble_vincentization(df_preds, models=None, index_cols=['date'], model_col='model_id')` ¶

`find_opt_weights_linear(obs, preds, order_models, dist, metric, bounds=(-100, 100))` ¶

`find_opt_weights_linear_mix_log(obs, preds, order_models, metric, bounds)` ¶

`find_opt_weights_log(obs, preds, order_models, dist='log_normal', metric='crps', bounds=(-100, 100))` ¶

`get_ci_columns(p)` ¶

`get_epiweek(date)` ¶

`get_quantiles_linear(dist, weights, preds, p=np.array([0.5, 0.05, 0.95]))` ¶

`get_quantiles_log(dist, weights, ms, vs, p=np.array([0.5, 0.05, 0.95]))` ¶

`get_score(obs, mu, sd, dist='log_normal', metric='crps')` ¶

`pool_par_gauss(alpha, m, v)` ¶