# Source code for smx.graph.interpretation
"""
Threshold mapping and predicate interpretation utilities.
Functions in this module translate LRC results from the preprocessed (score)
space back to the natural (unpreprocessed) spectral space, and reconstruct
multivariate threshold spectra for visualisation.
"""
from typing import Dict, Optional
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Private helpers (not part of public API)
# ---------------------------------------------------------------------------
def _extract_zone_from_predicate(predicate_rule: str) -> str:
"""Return the spectral zone name embedded in a predicate rule string."""
if "<=" in predicate_rule:
return predicate_rule.split("<=")[0].strip()
if ">" in predicate_rule:
return predicate_rule.split(">")[0].strip()
raise ValueError(f"Unrecognised operator in predicate: '{predicate_rule}'")
# ---------------------------------------------------------------------------
# Public functions
# ---------------------------------------------------------------------------
# [docs]
def map_thresholds_to_natural(
    lrc_df: pd.DataFrame,
    zone_sums_preprocessed: pd.DataFrame,
    zone_sums_natural: pd.DataFrame,
) -> pd.DataFrame:
    """Map predicate thresholds from the preprocessed space to natural space.

    For each predicate in *lrc_df*, this finds the calibration sample whose
    zone score in the *preprocessed* space is closest to the predicate's
    threshold, and retrieves that sample's value in the *natural*
    (unpreprocessed) space as the best approximation.

    Rows that cannot be mapped receive ``None`` in all four added columns.
    A row is unmappable when its zone or threshold is missing (``None`` or
    NaN), when the zone is not a column of both score frames, or when the
    zone's preprocessed scores are empty or entirely NaN.

    Parameters
    ----------
    lrc_df : pd.DataFrame
        LRC results. The columns ``'Zone'``, ``'Threshold'`` and
        ``'Operator'`` are read; the frame itself is not modified.
    zone_sums_preprocessed : pd.DataFrame
        Zone aggregation scores computed on *preprocessed* calibration data
        (same zones as *lrc_df*).
    zone_sums_natural : pd.DataFrame
        Zone aggregation scores computed on *original* (unprocessed) data.
        It is looked up with the index labels of *zone_sums_preprocessed*.

    Returns
    -------
    pd.DataFrame
        Copy of *lrc_df* with additional columns:

        * ``'Threshold_Natural'`` — threshold value in the natural space
        * ``'Reference_Sample_Index'`` — index of the nearest calibration
          sample
        * ``'Approximation_Error'`` — distance (preprocessed space) to the
          nearest sample
        * ``'Node_Natural'`` — predicate rule string using the natural
          threshold, or ``None`` when the operator or the natural value is
          missing

    Raises
    ------
    KeyError
        If a required column is absent from *lrc_df*, or if the nearest
        sample's index label does not exist in *zone_sums_natural*.
    """
    result_df = lrc_df.copy()

    unmapped = (None, None, None, None)
    mapped = []

    # Column-wise iteration avoids building one Series per row (iterrows).
    predicates = zip(
        result_df["Zone"], result_df["Threshold"], result_df["Operator"]
    )
    for zone_name, threshold_val, operator in predicates:
        # pandas stores missing entries as NaN, so an `is None` test alone
        # would let them through; pd.isna covers both None and NaN.
        if (
            pd.isna(zone_name)
            or pd.isna(threshold_val)
            or zone_name not in zone_sums_preprocessed.columns
            or zone_name not in zone_sums_natural.columns
        ):
            mapped.append(unmapped)
            continue

        threshold = float(threshold_val)
        distances = (zone_sums_preprocessed[zone_name] - threshold).abs()

        # With no usable calibration score there is no nearest sample.
        if not distances.notna().any():
            mapped.append(unmapped)
            continue

        # idxmin ignores NaN distances and returns an index *label*.
        closest_idx = distances.idxmin()
        natural_value = zone_sums_natural.loc[closest_idx, zone_name]
        error = distances.loc[closest_idx]

        if pd.isna(operator) or pd.isna(natural_value):
            node_natural = None
        else:
            node_natural = f"{zone_name} {operator} {natural_value:.6f}"

        mapped.append((natural_value, closest_idx, error, node_natural))

    new_columns = (
        "Threshold_Natural",
        "Reference_Sample_Index",
        "Approximation_Error",
        "Node_Natural",
    )
    for position, column in enumerate(new_columns):
        result_df[column] = [entry[position] for entry in mapped]
    return result_df
# [docs]
def reconstruct_threshold_to_spectrum(
    threshold_value: float,
    zone_name: str,
    pca_info_dict: Dict,
) -> pd.Series:
    """Expand a scalar score-space threshold into a threshold spectrum.

    The zone's stored PCA information is used to project the threshold back
    into the original spectral variable space:

    .. math::

        \\tau = \\bar{x} + q \\cdot \\mathbf{w}

    where :math:`\\bar{x}` is the zone mean, :math:`\\mathbf{w}` the PC1
    loadings vector, and :math:`q` the threshold score value.

    Parameters
    ----------
    threshold_value : float
        Threshold in PC1 score space.
    zone_name : str
        Name of the spectral zone; used as the key into *pca_info_dict*.
    pca_info_dict : dict
        PCA info dictionary as stored in
        :attr:`smx.zones.aggregation.ZoneAggregator.pca_info_`. The entry
        for *zone_name* must provide the keys ``'mean'``, ``'loadings'`` and
        ``'columns'``.

    Returns
    -------
    pd.Series
        Threshold spectrum indexed by the zone's ``'columns'`` entry and
        named ``threshold_<value>`` with the value shown to four decimals.

    Raises
    ------
    KeyError
        If *zone_name* is not in *pca_info_dict* or a required key is
        missing from its entry.
    """
    zone_info = pca_info_dict[zone_name]

    # tau = mean + q * w, evaluated element-wise by the stored objects.
    zone_mean = zone_info["mean"]
    scaled_loadings = threshold_value * zone_info["loadings"]
    reconstructed = zone_mean + scaled_loadings

    variable_names = zone_info["columns"]
    series_name = f"threshold_{threshold_value:.4f}"
    return pd.Series(reconstructed, index=variable_names, name=series_name)