"""
ZoneAggregator: reduce each spectral zone (DataFrame) to a single score per sample.
Supports simple column-wise aggregations (sum, mean, …) and PCA-based
aggregation (PC1 score). A fit/transform interface ensures that the same
PCA model fitted on calibration data can be applied consistently to
prediction data.
"""
from typing import Dict, Literal, Optional, Union
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
def _signed_extreme(frame):
    """Return, per row, the entry with the largest absolute value (sign kept).

    Rows that contain no non-missing entry yield ``NaN``.
    """

    def _pick(row):
        # Guard first: idxmax has nothing to select on an all-missing row.
        if not row.notna().any():
            return np.nan
        return row.loc[row.abs().idxmax()]

    return frame.apply(_pick, axis=1)


# Registry of row-wise reducers: each callable maps a DataFrame to one value
# per row (``axis=1``). The keys double as the accepted non-PCA method names.
_SIMPLE_AGGREGATORS = {
    "sum": lambda frame: frame.sum(axis=1),
    "mean": lambda frame: frame.mean(axis=1),
    "median": lambda frame: frame.median(axis=1),
    "max": lambda frame: frame.max(axis=1),
    "min": lambda frame: frame.min(axis=1),
    "std": lambda frame: frame.std(axis=1),
    "var": lambda frame: frame.var(axis=1),
    "extreme": _signed_extreme,
}
class ZoneAggregator:
    r"""Aggregate spectral zones to a single score per sample.

    Parameters
    ----------
    method : str, default ``'pca'``
        Aggregation strategy.

        * ``'pca'``: fit a single-component PCA per zone and use PC1 scores.
          Preserves directional information and maximises explained variance.
        * ``'sum'``, ``'mean'``, ``'median'``, ``'max'``, ``'min'``,
          ``'std'``, ``'var'``, ``'extreme'``: simple column-wise aggregations.

    Attributes
    ----------
    pca_info\_ : dict or None
        ``{zone_name: {'pca_model', 'loadings', 'mean', 'variance_explained', 'columns'}}``
        Only populated when ``method='pca'`` and after :meth:`fit`.
    is_fitted\_ : bool
        ``True`` after :meth:`fit` has been called.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategy names.
    """

    def __init__(
        self,
        method: str = "pca",
    ) -> None:
        # The accepted names are 'pca' plus every key of the simple registry.
        valid = {"pca"} | set(_SIMPLE_AGGREGATORS)
        if method not in valid:
            raise ValueError(
                f"Unknown method '{method}'. Valid options: {sorted(valid)}"
            )
        self.method = method
        self.pca_info_: Optional[Dict] = None
        self.is_fitted_: bool = False

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def fit(self, spectral_zones_dict: Dict[str, pd.DataFrame]) -> "ZoneAggregator":
        """Fit the aggregator on calibration zone data.

        For ``method='pca'`` this trains a 1-component PCA per zone and stores
        the models so the same projections can be applied to new data. For
        simple aggregation methods there is nothing to learn; the instance is
        only marked as fitted.

        Parameters
        ----------
        spectral_zones_dict : dict[str, pd.DataFrame]
            Calibration spectral zones as returned by
            :func:`smx.zones.extraction.extract_spectral_zones`. Each frame
            is passed to PCA as-is, i.e. rows are treated as samples and
            columns as features.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a zone cannot be converted to a float matrix or PCA rejects it
            (for example because it is empty or contains NaN). The message
            names the offending zone and chains the underlying error.
        """
        if self.method == "pca":
            # Collect into a local dict and publish it in a single assignment
            # so that a failure part-way through never leaves a half-populated
            # ``pca_info_`` behind.
            fitted: Dict[str, Dict] = {}
            for zone_name, zone_df in spectral_zones_dict.items():
                try:
                    X_zone = zone_df.to_numpy(dtype=float)
                    pca = PCA(n_components=1).fit(X_zone)
                except ValueError as err:
                    # Same exception type as before, but now it tells the
                    # caller which zone was responsible.
                    raise ValueError(
                        f"Cannot fit PCA for zone '{zone_name}': {err}"
                    ) from err
                fitted[zone_name] = {
                    "pca_model": pca,
                    "loadings": pca.components_[0],
                    "mean": pca.mean_,
                    "variance_explained": float(pca.explained_variance_ratio_[0]),
                    # Remember the training column order alongside the model.
                    "columns": zone_df.columns.tolist(),
                }
            self.pca_info_ = fitted
        else:
            # Nothing is learned for simple aggregations; drop any PCA state
            # that might linger if ``method`` was changed after an earlier fit.
            self.pca_info_ = None
        self.is_fitted_ = True
        return self

    # ------------------------------------------------------------------
    # Informational helpers
    # ------------------------------------------------------------------

    def get_variance_explained(self) -> Optional[Dict[str, float]]:
        """Return per-zone explained variance (PCA method only).

        Returns
        -------
        dict[str, float] or None
            Mapping of zone name to the explained-variance ratio of its first
            principal component. ``None`` for non-PCA methods or when no PCA
            information has been stored yet.
        """
        if self.method != "pca" or self.pca_info_ is None:
            return None
        return {
            zone: info["variance_explained"]
            for zone, info in self.pca_info_.items()
        }