# Source code for smx.predicates.bagging

"""
PredicateBagger: bootstrap/subsample predicates across multiple bags for
robust metric estimation.
"""

from typing import Dict, Optional, Union

import numpy as np
import pandas as pd



# [docs]
class PredicateBagger:
    """Perform predicate bagging with granular control over sampling strategy.

    Bagging creates repeated random subsets of samples and/or predicates,
    evaluating each predicate on the subset.  This yields a distribution of
    predicate coverage that is used downstream to compute robust association
    metrics (see :mod:`smx.predicates.metrics`).

    Parameters
    ----------
    random_seed : int or None
        Seed for the bagger's private random generator (required, no
        default).  Each call to :meth:`run` re-seeds that generator, so
        repeated calls with the same inputs produce the same bags.
    n_bags : int, default 10
        Number of bags (iterations) to create.
    n_predicates_per_bag : int, default 20
        Number of predicates to draw per bag (ignored when
        ``predicate_bagging=False``).  Capped at the number of available
        predicates.
    n_samples_fraction : float, default 0.8
        Fraction of samples to draw per bag (ignored when
        ``sample_bagging=False``).  At least one sample is always drawn.
    replace : bool, default False
        Whether to sample with replacement (bootstrap).  Ignored when
        ``sample_bagging=False``.
    sample_bagging : bool, default True
        If ``False``, all samples are used in every bag.
    predicate_bagging : bool, default False
        If ``False``, all predicates are used in every bag.
    min_samples_fraction : float, default 0.2
        Minimum number of satisfying samples a predicate needs to be kept in
        a bag, expressed as a fraction of the full dataset size (at least one
        sample).  Only enforced when ``sample_bagging=True``; otherwise a
        predicate is kept as soon as one sample satisfies it.
    """

    def __init__(
        self,
        random_seed: Optional[int],
        n_bags: int = 10,
        n_predicates_per_bag: int = 20,
        n_samples_fraction: float = 0.8,
        replace: bool = False,
        sample_bagging: bool = True,
        predicate_bagging: bool = False,
        min_samples_fraction: float = 0.2,
    ) -> None:
        self.n_bags = n_bags
        self.n_predicates_per_bag = n_predicates_per_bag
        self.n_samples_fraction = n_samples_fraction
        self.replace = replace
        self.random_seed = random_seed
        self.sample_bagging = sample_bagging
        self.predicate_bagging = predicate_bagging
        self.min_samples_fraction = min_samples_fraction

    @staticmethod
    def _index_predicates(predicates_df: pd.DataFrame, rules: list) -> dict:
        """Map each rule to the ``(zone, threshold, operator)`` of its first row."""
        specs: dict = {}
        rows = zip(
            rules,
            predicates_df["zone"].tolist(),
            predicates_df["thresholds"].tolist(),
            predicates_df["operator"].tolist(),
        )
        for rule, zone, threshold, operator in rows:
            # A missing rule can never be matched by equality, so it is left
            # out of the lookup and skipped later.
            if pd.isna(rule):
                continue
            # setdefault keeps the first occurrence when a rule is duplicated.
            specs.setdefault(rule, (zone, threshold, operator))
        return specs

    def run(
        self,
        zone_scores_df: pd.DataFrame,
        y_predicted_numeric: Union[pd.Series, np.ndarray],
        predicates_df: pd.DataFrame,
    ) -> Dict[str, Dict[str, pd.DataFrame]]:
        """Create bags by sampling samples and/or predicates.

        Rows are addressed by position throughout, so the index labels of
        *zone_scores_df* and *y_predicted_numeric* are irrelevant; only their
        row order has to agree.

        Parameters
        ----------
        zone_scores_df : pd.DataFrame
            Aggregated zone scores (samples × zones).
        y_predicted_numeric : pd.Series or np.ndarray
            Continuous model predictions aligned row-by-row with
            *zone_scores_df*.
        predicates_df : pd.DataFrame
            Predicate catalogue with columns ``rule``, ``zone``,
            ``thresholds``, ``operator``.  Only the ``"<="`` and ``">"``
            operators are evaluated; predicates with any other operator are
            skipped.

        Returns
        -------
        dict
            ``{'Bag_1': {rule: DataFrame(['Zone_Sum', 'Predicted_Y',
            'Sample_Index']), ...}, 'Bag_2': ...}``.  ``Sample_Index`` holds
            positional row numbers into *zone_scores_df*.  Bags in which every
            predicate was discarded are omitted.

        Raises
        ------
        ValueError
            If *y_predicted_numeric* and *zone_scores_df* differ in length.
        KeyError
            If a selected predicate refers to a zone that is not a column of
            *zone_scores_df*.
        """
        y_values = np.asarray(y_predicted_numeric)
        n_total = len(zone_scores_df)
        if len(y_values) != n_total:
            raise ValueError(
                "y_predicted_numeric must have one value per row of "
                f"zone_scores_df (got {len(y_values)} values for {n_total} rows)"
            )

        # A private legacy generator yields the same draws as seeding the
        # global numpy RNG, without clobbering random state owned by callers.
        rng = np.random.RandomState(self.random_seed)

        n_samples_per_bag = max(1, int(n_total * self.n_samples_fraction))
        if self.sample_bagging:
            min_satisfied = max(1, int(n_total * self.min_samples_fraction))
        else:
            # Without sample bagging a predicate only has to match something.
            min_satisfied = 1

        all_rules = predicates_df["rule"].tolist()
        rule_specs = self._index_predicates(predicates_df, all_rules)
        zone_columns: dict = {}  # zone name -> full column as ndarray
        bags: Dict[str, Dict[str, pd.DataFrame]] = {}

        for bag_num in range(1, self.n_bags + 1):
            # ── Sample selection ──────────────────────────────────────────
            if self.sample_bagging and n_total > 0:
                bag_indices = rng.choice(
                    n_total,
                    size=n_samples_per_bag,
                    replace=self.replace,
                )
            else:
                bag_indices = np.arange(n_total)

            # ── Predicate selection ───────────────────────────────────────
            if self.predicate_bagging and all_rules:
                # Draw positions rather than values so rule names keep their
                # original Python type.
                drawn = rng.choice(
                    len(all_rules),
                    size=min(self.n_predicates_per_bag, len(all_rules)),
                    replace=False,
                )
                selected_rules = [all_rules[i] for i in drawn]
            else:
                selected_rules = all_rules

            # ── Build bag ────────────────────────────────────────────────
            bag_predicates: Dict[str, pd.DataFrame] = {}
            n_discarded = 0

            for rule in selected_rules:
                spec = rule_specs.get(rule)
                if spec is None:
                    continue
                zone, raw_threshold, operator = spec
                threshold = float(raw_threshold)

                zone_values = zone_columns.get(zone)
                if zone_values is None:
                    zone_values = zone_scores_df[zone].to_numpy()
                    zone_columns[zone] = zone_values

                sampled_scores = zone_values[bag_indices]
                if operator == "<=":
                    mask = sampled_scores <= threshold
                elif operator == ">":
                    mask = sampled_scores > threshold
                else:
                    # Unsupported operator: skipped, not counted as discarded.
                    continue

                satisfied = bag_indices[mask]
                if len(satisfied) < min_satisfied:
                    n_discarded += 1
                    continue

                bag_predicates[rule] = pd.DataFrame({
                    "Zone_Sum": sampled_scores[mask],
                    "Predicted_Y": y_values[satisfied],
                    "Sample_Index": satisfied,
                })

            if bag_predicates:
                bags[f"Bag_{bag_num}"] = bag_predicates
                samp_str = "yes" if self.sample_bagging else "no"
                pred_str = (
                    f"yes ({self.n_predicates_per_bag})"
                    if self.predicate_bagging
                    else "no (all)"
                )
                print(
                    f"Bag_{bag_num} | samples: {samp_str} | predicates: {pred_str} | "
                    f"valid: {len(bag_predicates)} | discarded: {n_discarded}"
                )
            else:
                print(f"Bag_{bag_num}: EMPTY (all predicates discarded)")

        return bags