Skip to content

Ensemble

Ensemble

A class to compute the weights and apply the ensemble of multiple models.

Attributes

df : pd.DataFrame Processed DataFrame containing model predictions. dist : str The distribution type used for modeling ('log_normal' or 'normal'). order_models : list List of models in a specific order for weight computation.

Methods

compute_weights(df_obs: pd.DataFrame, metric: str = 'crps') Computes the weights for the ensemble based on observed data and a specified metric.

apply_ensemble(weights: dict = None) Computes the final ensemble distribution using either precomputed or provided weights.

Source code in mosqlient/forecast/ensemble.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
class Ensemble:
    """
    A class to compute the weights and apply the ensemble of multiple models.

    Attributes
    ------------
    df : pd.DataFrame
        Processed DataFrame containing model predictions.
    dist : str
        The distribution type used for modeling ('log_normal' or 'normal').
    order_models : list
        List of models in a specific order for weight computation.

    Methods
    ---------
    compute_weights(df_obs: pd.DataFrame, metric: str = 'crps')
        Computes the weights for the ensemble based on observed data and a specified metric.

    apply_ensemble(weights: dict = None)
        Computes the final ensemble distribution using either precomputed or provided weights.

    """

    def __init__(
        self,
        df: pd.DataFrame,
        order_models: list,
        mixture: str = "log",
        dist: str = "log_normal",
        fn_loss: str = "median",
        conf_level: float = 0.9,
    ):
        """
        Initializes the Ensemble class by processing the input DataFrame and defining key attributes.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing columns `date`, `pred`, `lower`, `upper`, and `model_id`.
        order_models : list
            List defining the order of models for weight computation.
        mixture: str
            Determine how the predictions are combined. Choose `linear` for a weighted
            linear mixture or `log` for logarithmic pooling.
        dist : str, optional
            The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'.
        fn_loss : str, optional
            Loss function used for estimation ('median' or 'lower'). Default is 'median'.
        conf_level : float, optional, default=0.9
            Confidence level used for computing the confidence intervals.

        Raises
        ------
        ValueError
            If the input DataFrame does not contain the required columns.
        """

        try:
            df = df[
                [
                    "date",
                    "pred",
                    f"lower_{int(100 * conf_level)}",
                    f"upper_{int(100 * conf_level)}",
                    "model_id",
                ]
            ]

        except:
            raise ValueError(
                f"The input dataframe must contain the columns: 'date', 'pred', 'lower_{int(100 * conf_level)}', 'upper_{int(100 * conf_level)}', 'model_id'"
            )

        df = get_df_pars(
            df.copy(), conf_level=conf_level, dist=dist, fn_loss=fn_loss
        )

        # organize the dataframe:
        df["model_id"] = pd.Categorical(
            df["model_id"], categories=order_models, ordered=True
        )
        df = df.sort_values(by=["model_id", "date"])

        self.df = df
        self.dist = dist
        self.mixture = mixture
        self.order_models = order_models

    def compute_weights(
        self,
        df_obs: pd.DataFrame,
        metric: str = "crps",
        bounds: tuple = (-100, 100),
    ) -> dict:
        """
        Computes the optimal weights for the ensemble based on observed data and a specified metric.

        Parameters
        ------------
        df_obs : pd.DataFrame
            DataFrame containing observed values with columns `date` and `casos`.
        metric : str, optional
            Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'.
        bounds: tuple
            Tuple where the first element represents the minimum value and the second
            represents the maximum value for the bounds.

        Returns
        -------
        dict
            Dictionary containing the computed weights for each model and the loss value.
        """

        preds = self.df[["date", "mu", "sigma", "model_id"]]

        if self.mixture == "linear":
            weights = find_opt_weights_linear(
                df_obs,
                preds,
                self.order_models,
                dist=self.dist,
                metric=metric,
                bounds=bounds,
            )

        if self.mixture == "log":
            weights = find_opt_weights_log(
                obs=df_obs,
                preds=preds,
                order_models=self.order_models,
                dist=self.dist,
                metric=metric,
                bounds=bounds,
            )

        self.weights = weights

        return weights

    def apply_ensemble(
        self,
        weights: Union[None, NDArray[np.float64]] = None,
        p: NDArray[np.float64] = np.array(
            [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]
        ),
    ) -> pd.DataFrame:
        """
        Computes the final ensemble distribution using either precomputed or user-provided weights.

        Parameters
        ----------
        weights : np.array
            Array containing weights for each model. If None, uses precomputed weights.

        p: np.array
            Returned percentile values

        Returns
        -------
        pd.DataFrame
            DataFrame containing the ensemble predictions with quantiles (`pred`, `lower`, `upper`).
        """

        if weights is None:
            try:
                weights = self.weights["weights"]
            except:
                raise ValueError(
                    "Weights must be computed first using `compute_weights`, or provided explicitly."
                )

        weights = cast(NDArray[np.float64], weights)

        columns = get_ci_columns(p)

        preds = self.df

        df_for = pd.DataFrame()

        for d in preds.date.unique():
            preds_ = preds.loc[preds.date == d]

            if self.mixture == "log":
                quantiles = get_quantiles_log(
                    self.dist,
                    weights=weights,
                    ms=preds_.mu,
                    vs=preds_.sigma**2,
                    p=p,
                )

            if self.mixture == "linear":
                quantiles = get_quantiles_linear(
                    self.dist, weights=weights, preds=preds_, p=p
                )

            df_ = pd.DataFrame([quantiles], columns=columns)

            df_["date"] = d

            df_for = pd.concat([df_for, df_], axis=0).reset_index(drop=True)

        df_for.date = pd.to_datetime(df_for.date)

        return df_for

__init__(df, order_models, mixture='log', dist='log_normal', fn_loss='median', conf_level=0.9)

Initializes the Ensemble class by processing the input DataFrame and defining key attributes.

Parameters

df : pd.DataFrame DataFrame containing columns date, pred, lower, upper, and model_id. order_models : list List defining the order of models for weight computation. mixture: str Determine how the predictions are combined. Choose linear for a weighted linear mixture or log for logarithmic pooling. dist : str, optional The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'. fn_loss : str, optional Loss function used for estimation ('median' or 'lower'). Default is 'median'. conf_level : float, optional, default=0.9 Confidence level used for computing the confidence intervals.

Raises

ValueError If the input DataFrame does not contain the required columns.

Source code in mosqlient/forecast/ensemble.py
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
def __init__(
    self,
    df: pd.DataFrame,
    order_models: list,
    mixture: str = "log",
    dist: str = "log_normal",
    fn_loss: str = "median",
    conf_level: float = 0.9,
):
    """
    Initializes the Ensemble class by processing the input DataFrame and defining key attributes.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing columns `date`, `pred`, `lower`, `upper`, and `model_id`.
    order_models : list
        List defining the order of models for weight computation.
    mixture: str
        Determine how the predictions are combined. Choose `linear` for a weighted
        linear mixture or `log` for logarithmic pooling.
    dist : str, optional
        The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'.
    fn_loss : str, optional
        Loss function used for estimation ('median' or 'lower'). Default is 'median'.
    conf_level : float, optional, default=0.9
        Confidence level used for computing the confidence intervals.

    Raises
    ------
    ValueError
        If the input DataFrame does not contain the required columns.
    """

    try:
        df = df[
            [
                "date",
                "pred",
                f"lower_{int(100 * conf_level)}",
                f"upper_{int(100 * conf_level)}",
                "model_id",
            ]
        ]

    except:
        raise ValueError(
            f"The input dataframe must contain the columns: 'date', 'pred', 'lower_{int(100 * conf_level)}', 'upper_{int(100 * conf_level)}', 'model_id'"
        )

    df = get_df_pars(
        df.copy(), conf_level=conf_level, dist=dist, fn_loss=fn_loss
    )

    # organize the dataframe:
    df["model_id"] = pd.Categorical(
        df["model_id"], categories=order_models, ordered=True
    )
    df = df.sort_values(by=["model_id", "date"])

    self.df = df
    self.dist = dist
    self.mixture = mixture
    self.order_models = order_models

apply_ensemble(weights=None, p=np.array([0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]))

Computes the final ensemble distribution using either precomputed or user-provided weights.

Parameters

weights : np.array Array containing weights for each model. If None, uses precomputed weights.

np.array

Returned percentile values

Returns

pd.DataFrame DataFrame containing the ensemble predictions with quantiles (pred, lower, upper).

Source code in mosqlient/forecast/ensemble.py
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
def apply_ensemble(
    self,
    weights: Union[None, NDArray[np.float64]] = None,
    p: NDArray[np.float64] = np.array(
        [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975]
    ),
) -> pd.DataFrame:
    """
    Computes the final ensemble distribution using either precomputed or user-provided weights.

    Parameters
    ----------
    weights : np.array
        Array containing weights for each model. If None, uses precomputed weights.

    p: np.array
        Returned percentile values

    Returns
    -------
    pd.DataFrame
        DataFrame containing the ensemble predictions with quantiles (`pred`, `lower`, `upper`).
    """

    if weights is None:
        try:
            weights = self.weights["weights"]
        except:
            raise ValueError(
                "Weights must be computed first using `compute_weights`, or provided explicitly."
            )

    weights = cast(NDArray[np.float64], weights)

    columns = get_ci_columns(p)

    preds = self.df

    df_for = pd.DataFrame()

    for d in preds.date.unique():
        preds_ = preds.loc[preds.date == d]

        if self.mixture == "log":
            quantiles = get_quantiles_log(
                self.dist,
                weights=weights,
                ms=preds_.mu,
                vs=preds_.sigma**2,
                p=p,
            )

        if self.mixture == "linear":
            quantiles = get_quantiles_linear(
                self.dist, weights=weights, preds=preds_, p=p
            )

        df_ = pd.DataFrame([quantiles], columns=columns)

        df_["date"] = d

        df_for = pd.concat([df_for, df_], axis=0).reset_index(drop=True)

    df_for.date = pd.to_datetime(df_for.date)

    return df_for

compute_weights(df_obs, metric='crps', bounds=(-100, 100))

Computes the optimal weights for the ensemble based on observed data and a specified metric.

Parameters

df_obs : pd.DataFrame DataFrame containing observed values with columns date and casos. metric : str, optional Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'. bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Returns

dict Dictionary containing the computed weights for each model and the loss value.

Source code in mosqlient/forecast/ensemble.py
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
def compute_weights(
    self,
    df_obs: pd.DataFrame,
    metric: str = "crps",
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Computes the optimal weights for the ensemble based on observed data and a specified metric.

    Parameters
    ------------
    df_obs : pd.DataFrame
        DataFrame containing observed values with columns `date` and `casos`.
    metric : str, optional
        Scoring metric used for optimization. Options: ['crps', 'log_score']. Default is 'crps'.
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Returns
    -------
    dict
        Dictionary containing the computed weights for each model and the loss value.
    """

    preds = self.df[["date", "mu", "sigma", "model_id"]]

    if self.mixture == "linear":
        weights = find_opt_weights_linear(
            df_obs,
            preds,
            self.order_models,
            dist=self.dist,
            metric=metric,
            bounds=bounds,
        )

    if self.mixture == "log":
        weights = find_opt_weights_log(
            obs=df_obs,
            preds=preds,
            order_models=self.order_models,
            dist=self.dist,
            metric=metric,
            bounds=bounds,
        )

    self.weights = weights

    return weights

alpha_01(alpha_inv)

Function that maps from R^n to the open simplex.

Parameters

alpha_inv: array of float

Returns

array Vector on the (n+1) open simplex.

Source code in mosqlient/forecast/ensemble.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def alpha_01(alpha_inv: NDArray[np.float64]) -> NDArray[np.float64]:
    """
    Function that maps from R^n to the open simplex.

    Parameters
    -----------
    alpha_inv: array of float

    Returns
    --------
    array
        Vector on the (n+1) open simplex.
    """
    K = len(alpha_inv) + 1
    z = np.full(K - 1, np.nan)  # Equivalent to rep(NA, K-1)
    alphas = np.zeros(K)  # Equivalent to rep(0, K)

    for k in range(K - 1):
        z[k] = invlogit(alpha_inv[k] + np.log(1 / (K - (k + 1))))
        alphas[k] = (1 - np.sum(alphas[:k])) * z[k]

    alphas[K - 1] = 1 - np.sum(alphas[:-1])
    return alphas

compute_ppf(mu, sigma, weights, p=np.array([0.5, 0.05, 0.95]))

Compute the Percent-Point Function (PPF), which is the inverse of the CDF, for a mixture of lognormal distributions.

The function takes the parameters of a lognormal mixture (mean, standard deviation, and weights) and returns the mixture values for the 5th, 50th, and 95th percentiles.

Parameters

mu: np.array
    Mean values (in log-space) for the lognormal components of the mixture.
sigma: np.array
    Standard deviation values (in log-space) for the lognormal components of the mixture.
weights: np.array
    Weights of each component in the lognormal mixture. These should sum to 1.

Returns

np.array
The x-values corresponding to the 5th, 50th, and 95th percentiles.
Source code in mosqlient/forecast/ensemble.py
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
def compute_ppf(
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
) -> NDArray[np.float64]:
    """
    Compute the Percent-Point Function (PPF), which is the inverse of the CDF,
    for a mixture of lognormal distributions.

    The function takes the parameters of a lognormal mixture (mean, standard deviation, and weights)
    and returns the mixture values for the 5th, 50th, and 95th percentiles.

    Parameters
    ------------
        mu: np.array
            Mean values (in log-space) for the lognormal components of the mixture.
        sigma: np.array
            Standard deviation values (in log-space) for the lognormal components of the mixture.
        weights: np.array
            Weights of each component in the lognormal mixture. These should sum to 1.

    Returns
    ---------
        np.array
        The x-values corresponding to the 5th, 50th, and 95th percentiles.
    """
    x = np.linspace(1e-6, 10**5, 10**5).astype(np.float64)

    pdf_values = dlnorm_mix(x, mu, sigma, weights, log=False)

    # Normalize the PDF using the trapezoidal rule
    dx = np.diff(x)  # Compute spacing between consecutive x-values
    dx = np.append(dx, dx[-1])  # Ensure length matches the x array
    area = np.sum(pdf_values * dx)  # Approximate the area under the PDF
    pdf_values_normalized = (
        pdf_values / area
    )  # Normalize the PDF to ensure total area is 1

    cdf_values = cumulative_trapezoid(pdf_values_normalized, x, initial=0)

    # Invert the CDF to obtain the PPF
    ppf_function = interp1d(
        cdf_values, x, bounds_error=False, fill_value="extrapolate"
    )

    x_for_p = ppf_function(
        p
    )  # Get x-values corresponding to the probabilities

    return x_for_p

crps_lognormal_mix(obs, mu, sigma, weights)

Compute the score of a mix of lognormal distributions.

Parameters

obs: np.array or float
    Values where the mixture score is evaluated.
mu: np.array
    Mu parameter (in log-space) for the lognormal components.
sigma: np.array
    Standard deviations (in log-space) for the lognormal components.
weight: array-like
    Mixture weights (must sum to 1).

Returns

float
The score evaluated.
Source code in mosqlient/forecast/ensemble.py
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
def crps_lognormal_mix(
    obs: Union[float, NDArray[np.float64]],
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
) -> float:
    """
    Compute the score of a mix of lognormal distributions.

    Parameters
    ------------
        obs: np.array or float
            Values where the mixture score is evaluated.
        mu: np.array
            Mu parameter (in log-space) for the lognormal components.
        sigma: np.array
            Standard deviations (in log-space) for the lognormal components.
        weight: array-like
            Mixture weights (must sum to 1).

    Returns
    ------------
        float
        The score evaluated.
    """
    K = len(mu)

    if len(sigma) != K:
        raise ValueError("mu and sigma should be the same length")

    crpsdens = list(np.zeros(K))

    for i in np.arange(K):
        crpsdens[i] = crps_lognormal(obs, mu[i], sigma[i])

    return np.dot(np.array(weights), np.array(crpsdens))  # , crpsdens

dlnorm_mix(obs, mu, sigma, weights, log=False)

Compute the PDF or log-PDF of a mixture of lognormal distributions for omega values.

Parameters

obs: np.array or float
    Values where the mixture density is evaluated. Can be a single value or an array.
mu: np.array
    Mu parameter (in log-space) for the lognormal components.
sigma: np.array
    Standard deviations (in log-space) for the lognormal components.
weight: array-like
    Mixture weights (must sum to 1).
log: bool
    Whether to return the log-density.

Returns

array: The mixture density or log-density evaluated at obs.
Source code in mosqlient/forecast/ensemble.py
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
def dlnorm_mix(
    obs: NDArray[np.float64],
    mu: NDArray[np.float64],
    sigma: NDArray[np.float64],
    weights: NDArray[np.float64],
    log=False,
) -> float:
    """
    Compute the PDF or log-PDF of a mixture of lognormal distributions for omega values.

    Parameters
    ------------
        obs: np.array or float
            Values where the mixture density is evaluated. Can be a single value or an array.
        mu: np.array
            Mu parameter (in log-space) for the lognormal components.
        sigma: np.array
            Standard deviations (in log-space) for the lognormal components.
        weight: array-like
            Mixture weights (must sum to 1).
        log: bool
            Whether to return the log-density.

    Returns
    ------------
        array: The mixture density or log-density evaluated at obs.
    """
    obs = np.atleast_1d(obs)  # Ensure `obs` is an array
    lw = np.log(weights)  # Log of weights
    K = len(mu)  # Number of components

    if len(sigma) != K or len(weights) != K:
        raise ValueError("mu, sigma, and weights must have the same length")

    # Compute log-PDFs for each component in a vectorized manner
    ldens = np.array(
        [
            lognorm.logpdf(obs, s=sigma[i], scale=np.exp(mu[i]))
            for i in range(K)
        ]
    ).T  # Transpose to align with obs dimensions

    # Combine using logsumexp for numerical stability
    if log:
        ans = logsumexp(lw + ldens, axis=1)
    else:
        ans = np.exp(logsumexp(lw + ldens, axis=1))

    return (
        ans if ans.size > 1 else ans.item()
    )  # Return scalar if input was scalar

find_opt_weights_linear(obs, preds, order_models, dist, metric, bounds=(-100, 100))

Find the weights of a linear mix distributions that minimizes the metric selected.

Parameters

obs: pd.Dataframe Dataframe with the columns: date and casos preds: pd.Dataframe Dataframe with the columns: date, mu, sigma, model_id order_models : list List defining the order of models for weight computation. dist : str, optional The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'. metric : str, optional Metric used for optimization. Options: crps, log_score. bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Return

dict A dictionary containing: - weights: The optimized weights for the models. - loss: The minimized loss value based on the selected metric.

Source code in mosqlient/forecast/ensemble.py
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
def find_opt_weights_linear(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    dist: str,
    metric: str,
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Find the weights of a linear mix distributions that minimizes the metric selected.

    Parameters
    -----------
    obs: pd.Dataframe
        Dataframe with the columns: `date` and `casos`
    preds: pd.Dataframe
        Dataframe with the columns: `date`, `mu`, `sigma`, `model_id`
    order_models : list
        List defining the order of models for weight computation.
    dist : str, optional
        The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'.
    metric : str, optional
        Metric used for optimization. Options: `crps`, `log_score`.
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Return
    -------
    dict
        A dictionary containing:
        - `weights`: The optimized weights for the models.
        - `loss`: The minimized loss value based on the selected metric.
    """

    if dist == "log_normal":
        weights = find_opt_weights_linear_mix_log(
            obs, preds, order_models, metric=metric, bounds=bounds
        )

    if dist == "normal":
        weights = find_opt_weights_linear_mix_norm(
            obs, preds, order_models, metric=metric, bounds=bounds
        )

    return weights

find_opt_weights_linear_mix_log(obs, preds, order_models, metric, bounds)

Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

Parameters

obs: pd.Dataframe Dataframe with the columns: date and casos preds: pd.Dataframe Dataframe with the columns: date, mu, sigma, model_id order_models: list Order of the different models in the model_id column metric: str ['crps', 'log_score'] Metric used to optimize the weights bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Return

dict A dictionary containing: - weights: The optimized weights for the models. - loss: The minimized loss value based on the selected metric.

Source code in mosqlient/forecast/ensemble.py
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
def find_opt_weights_linear_mix_log(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    metric: str,
    bounds: tuple,
) -> dict:
    """
    Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

    Parameters
    -----------
    obs: pd.Dataframe
        Dataframe with the columns: `date` and `casos`
    preds: pd.Dataframe
        Dataframe with the columns: `date`, `mu`, `sigma`, `model_id`
    order_models: list
        Order of the different models in the model_id column
    metric: str ['crps', 'log_score']
        Metric used to optimize the weights
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Return
    -------
    dict
        A dictionary containing:
        - `weights`: The optimized weights for the models.
        - `loss`: The minimized loss value based on the selected metric.
    """
    K = len(order_models)

    def loss(eta):
        """
        Computes the loss function based on the selected metric.

        Parameters
        ----------
        eta : array-like
            Parameterization of the weights, transformed via `alpha_01`.

        Returns
        -------
        float
            The computed loss value.
        """
        ws = alpha_01(eta)
        ws = np.where(ws < 1e-6, 1e-6, ws)

        score = 0
        for date in obs.date:
            preds_ = preds.loc[preds.date == date]
            preds_ = preds_.drop(["date"], axis=1).reset_index(drop=True)

            if metric == "log_score":
                score = score - dlnorm_mix(
                    obs.loc[obs.date == date].casos,
                    preds_["mu"].to_numpy(),
                    preds_["sigma"].to_numpy(),
                    weights=ws,
                    log=True,
                )

            if metric == "crps":
                score = score + crps_lognormal_mix(
                    obs.loc[obs.date == date].casos,
                    preds_["mu"].to_numpy(),
                    preds_["sigma"].to_numpy(),
                    weights=ws,
                )

        return score

    initial_guess = np.random.normal(size=K - 1)
    bounds_ = [bounds] * (K - 1)
    opt_result = minimize(
        loss, initial_guess, method="Nelder-mead", bounds=bounds_
    )

    optimal_weights = alpha_01(opt_result.x)

    return {"weights": optimal_weights, "loss": opt_result.fun}

find_opt_weights_linear_mix_norm(obs, preds, order_models, metric, bounds)

Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

Parameters

obs: pd.Dataframe Dataframe with the columns: date and casos preds: pd.Dataframe Dataframe with the columns: date, mu, sigma, model_id order_models: list Order of the different models in the model_id column metric: str ['crps', 'log_score'] Metric used to optimize the weights bounds: tuple Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Return

dict A dictionary containing: - weights: The optimized weights for the models. - loss: The minimized loss value based on the selected metric.

Source code in mosqlient/forecast/ensemble.py
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
def find_opt_weights_linear_mix_norm(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    metric: str,
    bounds: tuple,
) -> dict:
    """
    Find the weights of a lognormal linear mix distributions that minimizes the metric selected.

    Parameters
    -----------
    obs: pd.Dataframe
        Dataframe with the columns: `date` and `casos`
    preds: pd.Dataframe
        Dataframe with the columns: `date`, `mu`, `sigma`, `model_id`
    order_models: list
        Order of the different models in the model_id column
    metric: str ['crps', 'log_score']
        Metric used to optimize the weights
    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Return
    -------
    dict
        A dictionary containing:
        - `weights`: The optimized weights for the models.
        - `loss`: The minimized loss value based on the selected metric.
    """
    K = len(order_models)

    def loss(eta):
        """
        Computes the loss function based on the selected metric.

        Parameters
        ----------
        eta : array-like
            Parameterization of the weights, transformed via `alpha_01`.

        Returns
        -------
        float
            The computed loss value.
        """
        ws = alpha_01(eta)
        ws = np.where(ws < 1e-6, 1e-6, ws)

        score = 0
        for date in obs.date:
            preds_ = preds.loc[preds.date == date]
            preds_ = preds_.drop(["date"], axis=1).reset_index(drop=True)

            ms = preds_["mu"]
            vs = preds_.sigma**2

            if not len(ms) == len(vs) == K:
                raise ValueError("n_models and vs are not the same size!")

            mu, sd = linear_mix(weights=ws, ms=ms, vs=vs)

            score = score + get_score(
                obs=obs.loc[obs.date == date].casos,
                mu=mu,
                sd=sd,
                dist="norm",
                metric=metric,
            )

        return score

    initial_guess = np.random.normal(size=K - 1)
    bounds_ = [bounds] * (K - 1)
    opt_result = minimize(
        loss, initial_guess, method="Nelder-mead", bounds=bounds_
    )

    optimal_weights = alpha_01(opt_result.x)

    return {"weights": optimal_weights, "loss": opt_result.fun}

find_opt_weights_log(obs, preds, order_models, dist='log_normal', metric='crps', bounds=(-100, 100))

Function that generate the weights of the ensemble minimizing the metric selected.

Parameters

obs: pd.dataframe Dataframe with columns date and casos;

pd.dataframe

Dataframe with columns date, mu, sigma, and model_id

list

Order of the different models in the model_id column

str ['log_normal', 'normal']

Distribution used to represent the forecast

str ['crps', 'log_score']

Metric used to optimize the weights

tuple

Tuple where the first element represents the minimum value and the second represents the maximum value for the bounds.

Returns

dict The dict contains the keys: - weights: the optmize weights by the loss - loss: loss function value

Source code in mosqlient/forecast/ensemble.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def find_opt_weights_log(
    obs: pd.DataFrame,
    preds: pd.DataFrame,
    order_models: list,
    dist: str = "log_normal",
    metric: str = "crps",
    bounds: tuple = (-100, 100),
) -> dict:
    """
    Function that generate the weights of the ensemble minimizing the metric selected.

    Parameters
    -----------------
    obs: pd.dataframe
        Dataframe with columns date and casos;

    preds: pd.dataframe
        Dataframe with columns date, mu, sigma, and model_id

    order_models: list
        Order of the different models in the model_id column

    dist: str ['log_normal', 'normal']
        Distribution used to represent the forecast

    metric: str ['crps', 'log_score']
        Metric used to optimize the weights

    bounds: tuple
        Tuple where the first element represents the minimum value and the second
        represents the maximum value for the bounds.

    Returns
    --------
    dict
    The dict contains the keys:
    - weights: the optmize weights by the loss
    - loss: loss function value
    """

    K = len(order_models)

    def loss(eta):

        ws = alpha_01(eta)

        score = 0
        for date in obs.date:
            preds_ = preds.loc[preds.date == date]
            preds_ = preds_.drop(["date"], axis=1).reset_index(drop=True)
            ms = preds_["mu"]
            vs = preds_.sigma**2

            if not len(ms) == len(vs) == K:
                raise ValueError("n_models and vs are not the same size!")

            mu, sd = pool_par_gauss(alpha=ws, m=ms, v=vs)

            score = score + get_score(
                obs=obs.loc[obs.date == date].casos,
                mu=mu,
                sd=sd,
                dist=dist,
                metric=metric,
            )

        return score

    initial_guess = np.random.normal(size=K - 1)
    bounds_ = [bounds] * (K - 1)
    opt_result = minimize(
        loss, initial_guess, method="Nelder-mead", bounds=bounds_
    )

    optimal_weights = alpha_01(opt_result.x)

    return {"weights": optimal_weights, "loss": opt_result.fun}

get_ci_columns(p)

Function that given the confidence interval return the columns names

Parameters

p: NDArray[np.float64] percentile values

Returns

List of columns name

Source code in mosqlient/forecast/ensemble.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def get_ci_columns(p):
    """
    Function that given the confidence interval return the columns names

    Parameters
    -----------
    p: NDArray[np.float64]
    percentile values

    Returns
    --------
    List of columns name
    """

    columns = []

    for value in p:
        if value < 0.5:
            columns.append(f"lower_{int((1 - 2 * value) * 100)}")
        elif value == 0.5:
            columns.append("pred")
        else:
            columns.append(f"upper_{int(2 * value * 100) - 100}")

    return columns

get_epiweek(date)

Function to capture the epidemiological year and week from the date

Source code in mosqlient/forecast/ensemble.py
275
276
277
278
279
280
def get_epiweek(date):
    """
    Function to capture the epidemiological year and week from the date
    """
    epiweek = Week.fromdate(date)
    return (epiweek.year, epiweek.week)

get_quantiles_linear(dist, weights, preds, p=np.array([0.5, 0.05, 0.95]))

Function to get the quantiles of the linear mixture.

Parameters

dist : str, optional The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'. weights: np.array The weights assigned to each prediction. preds: pd.DataFrame The Dataframe with the predictions. p: np.array Returned percentile values

Returns

quantiles: np.array The quantiles obtained according to p.

Source code in mosqlient/forecast/ensemble.py
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
def get_quantiles_linear(
    dist: str,
    weights: NDArray[np.float64],
    preds: pd.DataFrame,
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
):
    """
    Function to get the quantiles of the linear mixture.

    Parameters
    ------------
    dist : str, optional
        The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'.
    weights: np.array
        The weights assigned to each prediction.
    preds: pd.DataFrame
        The Dataframe with the predictions.
    p: np.array
        Returned percentile values

    Returns
    --------
    quantiles: np.array
        The quantiles obtained according to p.
    """

    weights = np.where(weights < 1e-6, 1e-6, weights)

    if dist == "normal":
        pool = linear_mix(weights=weights, ms=preds.mu, vs=preds.sigma**2)

        quantiles = norm.ppf(p, loc=pool[0], scale=pool[1])

    if dist == "log_normal":
        quantiles = compute_ppf(
            mu=preds["mu"].values,
            sigma=preds["sigma"].values,
            weights=weights,
            p=p,
        )

    return quantiles

get_quantiles_log(dist, weights, ms, vs, p=np.array([0.5, 0.05, 0.95]))

Function to get the quantiles of a logarithmic pooling.

Parameters

dist : str, optional The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'. weights: np.array The weights assigned to each prediction. ms: np.array The mu parameter of each prediction. vs: np.array The variance parameter of each prediction. p: np.array Returned percentile values

Returns

quantiles: np.array The quantiles obtained according to p.

Source code in mosqlient/forecast/ensemble.py
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
def get_quantiles_log(
    dist: str,
    weights: NDArray[np.float64],
    ms: NDArray[np.float64],
    vs: NDArray[np.float64],
    p: NDArray[np.float64] = np.array([0.5, 0.05, 0.95]),
):
    """
    Function to get the quantiles of a logarithmic pooling.

    Parameters
    ------------
    dist : str, optional
        The distribution type used for parameterizing predictions ('log_normal' or 'normal'). Default is 'log_normal'.
    weights: np.array
        The weights assigned to each prediction.
    ms: np.array
        The mu parameter of each prediction.
    vs: np.array
        The variance parameter of each prediction.
    p: np.array
        Returned percentile values

    Returns
    --------
    quantiles: np.array
        The quantiles obtained according to p.
    """
    pool = pool_par_gauss(alpha=weights, m=ms, v=vs)

    if dist == "log_normal":
        quantiles = lognorm.ppf(p, s=pool[1], scale=np.exp(pool[0]))

    elif dist == "normal":
        quantiles = norm.ppf(p, loc=pool[0], scale=pool[1])

    return quantiles

get_score(obs, mu, sd, dist='log_normal', metric='crps')

Function to compute the score given a distribution and a predefined metric.

Parameters

obs: float The real observation.

mu:float The mu parameter of the distribution

float

The sd parameter associated with the distribution

str ['normal', 'log_normal']

Distribution type, either 'normal' or 'log_normal'.

str ['crps', 'log_score']

Scoring metric, either 'crps' or 'log_score'.

Returns

float The computed score based on the given metric and distribution.

Source code in mosqlient/forecast/ensemble.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def get_score(
    obs: float,
    mu: float,
    sd: float,
    dist: str = "log_normal",
    metric: str = "crps",
) -> float:
    """
    Function to compute the score given a distribution
    and a predefined metric.

    Parameters
    -----------
    obs: float
        The real observation.

    mu:float
        The mu parameter of the distribution

    sd: float
        The sd parameter associated with the distribution

    dist: str ['normal', 'log_normal']
        Distribution type, either 'normal' or 'log_normal'.

    metric: str ['crps', 'log_score']
        Scoring metric, either 'crps' or 'log_score'.

    Returns
    --------
    float
        The computed score based on the given metric and distribution.
    """

    if metric == "log_score":
        if dist == "log_normal":
            return -lognorm.logpdf(obs, s=sd, scale=np.exp(mu))

        elif dist == "normal":
            return -np.log(norm.pdf(obs, loc=mu, scale=sd))

    if metric == "crps":
        if dist == "log_normal":
            return crps_lognormal(obs, mu, sd)

        elif dist == "normal":
            return crps_normal(obs, mu, sd)

    raise ValueError(f"Invalid distribution '{dist}' and metric '{metric}'")

linear_mix(weights, ms, vs)

Computes the mean (mu) and standard deviation (sd) of a linear mixture of normal distributions weighted by weights.

Parameters

weights : np.array Array of weights for the linear mixture. Should sum to 1. ms : np.array Array of mean values of the normal distributions. vs : np.array Array of variance values of the normal distributions.

Returns

tuple (mu, sd) mu : float Mean of the resulting normal distribution. sd : float Standard deviation of the resulting normal distribution.

Source code in mosqlient/forecast/ensemble.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def linear_mix(
    weights: NDArray[np.float64],
    ms: NDArray[np.float64],
    vs: NDArray[np.float64],
) -> tuple:
    """
    Computes the mean (mu) and standard deviation (sd) of a linear mixture of normal distributions
    weighted by `weights`.

    Parameters
    ----------
    weights : np.array
        Array of weights for the linear mixture. Should sum to 1.
    ms : np.array
        Array of mean values of the normal distributions.
    vs : np.array
        Array of variance values of the normal distributions.

    Returns
    -------
    tuple (mu, sd)
        mu : float
            Mean of the resulting normal distribution.
        sd : float
            Standard deviation of the resulting normal distribution.
    """

    mu = np.dot(weights, ms)

    sd = np.sqrt(np.dot(weights**2, vs))

    return mu, sd

pool_par_gauss(alpha, m, v)

Function to get the output distribution from a logarithmic pool of lognormal (or normal) distrutions

Parameters

alpha : array of float Weigths assigned to each distribution in the pool. m : array of float mu parameter v : array of float variance parameter Returns


tuple A tuple containing two elements. The first one is the mu and the second one the sd parameter of the distribution.

Notes

The logarithmic pooling method is based on the work of Carvalho, L. M., Villela, D. A., Coelho, F. C., & Bastos, L. S. (2023). Bayesian inference for the weights in logarithmic pooling. Bayesian Analysis, 18(1), 223-251.

Source code in mosqlient/forecast/ensemble.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def pool_par_gauss(
    alpha: NDArray[np.float64], m: NDArray[np.float64], v: NDArray[np.float64]
) -> tuple:
    """
    Function to get the output distribution from a logarithmic pool of lognormal (or normal) distrutions

    Parameters
    ----------
    alpha : array of float
        Weigths assigned to each distribution in the pool.
    m : array of float
        mu parameter
    v : array of float
        variance parameter
    Returns
    -------
    tuple
        A tuple containing two elements. The first one is the mu and the second one the sd parameter of the distribution.

    Notes
    ------
    The logarithmic pooling method is based on the work of Carvalho, L. M., Villela, D. A., Coelho, F. C., & Bastos, L. S. (2023).
    Bayesian inference for the weights in logarithmic pooling. Bayesian Analysis, 18(1), 223-251.
    """
    if not (len(alpha) == len(m) == len(v)):
        raise ValueError(
            "The arrays 'alpha', 'm', and 'v' must have the same length."
        )

    ws = alpha / v
    vstar = 1 / np.sum(ws)
    mstar = np.sum(ws * m) * vstar
    return mstar, np.sqrt(vstar)

validate_df_preds(df_preds, conf_level=0.9)

Validade if the predictions dataframe contains the necessary columns

Parameters

df_preds: pd.DataFrame

float, optional, default=0.90

Confidence level used to define the lower and upper bounds.

Returns

Returns an error if df_preds is missing the required columns.

Source code in mosqlient/forecast/ensemble.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def validate_df_preds(df_preds: pd.DataFrame, conf_level=0.9):
    """
    Validade if the predictions dataframe contains the necessary columns

    Parameters
    ------------

    df_preds: pd.DataFrame

    conf_level : float, optional, default=0.90
        Confidence level used to define the lower and upper bounds.

    Returns
    ---------
    Returns an error if df_preds is missing the required columns.
    """
    expected_cols = {
        "date",
        f"lower_{int(100 * conf_level)}",
        "pred",
        f"upper_{int(100 * conf_level)}",
        "model_id",
    }

    if not expected_cols.issubset(df_preds.columns):
        raise ValueError(
            f"df_preds must contain the following columns: {expected_cols}. "
            f"Missing: {expected_cols - set(df_preds.columns)}"
        )