# Source code for smx.zones.extraction
"""Extract spectral zones from a DataFrame based on numeric column boundaries."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
[docs]
def extract_spectral_zones(
    Xcal: pd.DataFrame,
    cuts: List[Union[Tuple, dict]],
) -> Dict[str, pd.DataFrame]:
    """
    Extract spectral zones from a DataFrame based on specified cuts.

    Parameters
    ----------
    Xcal : pd.DataFrame
        DataFrame with spectral data. Column labels are converted to numbers
        (wavelengths / energies); columns whose label cannot be converted are
        never selected by any cut.
    cuts : list of tuples/lists or dicts
        Each item defines a spectral zone to extract.

        * ``(start, end)`` — zone boundaries; name defaults to ``"start-end"``
        * ``(name, start, end)`` — named zone
        * ``(name, start, end, group)`` — named zone assigned to a group
        * ``{'name': str, 'start': float, 'end': float}`` — dict form
        * ``{'name': str, 'start': float, 'end': float, 'group': str}`` — dict
          form with grouping

        Both boundaries are inclusive. If ``start > end`` the two values are
        swapped. When multiple cuts share the same ``group`` value their
        column subsets are merged into a single zone keyed by the group name.
        Cuts *without* a group (or with ``group=None``) are extracted
        individually under their own name.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary where keys are zone names (or group names) and values are
        DataFrames with the extracted spectral data (same row index as
        *Xcal*). Ungrouped zones come first, in the order of *cuts*; grouped
        zones follow, in order of first appearance of each group.

    Raises
    ------
    ValueError
        If a cut is neither a dict nor a tuple/list, if a tuple/list cut does
        not have 2, 3 or 4 elements, or if ``start`` / ``end`` cannot be
        converted to ``float``.

    Notes
    -----
    * Columns of a grouped zone keep the column order of *Xcal*, and a column
      matched by several cuts of the same group appears only once (e.g. when
      two cuts of a group share a boundary value).
    * Zone names are plain dict keys: a later ungrouped cut with the same
      name replaces an earlier one, and a group whose name equals an
      ungrouped zone name replaces that zone.

    Examples
    --------
    >>> zones = extract_spectral_zones(X, [('Ca ka', 3.6, 3.7), ('Fe ka', 6.3, 6.5)])
    >>> zones['Ca ka'].shape
    (n_samples, n_cols_in_Ca_ka_zone)

    Group background regions into a single zone:

    >>> cuts = [
    ...     ('background 1', 1.0, 101.0, 'background'),
    ...     ('Feature 1', 101.0, 193.3),
    ...     ('background 2', 193.3, 255.4, 'background'),
    ...     ('Feature 2', 255.4, 341.6),
    ... ]
    >>> zones = extract_spectral_zones(X, cuts)
    >>> 'background' in zones  # True — merged from background 1 & 2
    True
    >>> 'background 1' in zones  # False — individual cuts absorbed into group
    False
    """
    # Work on a plain float ndarray so the comparisons below behave the same
    # whatever dtype pandas picks for the converted labels. Non-numeric
    # labels become NaN.
    col_nums = pd.to_numeric(
        Xcal.columns.astype(str), errors="coerce"
    ).to_numpy(dtype=float, na_value=np.nan)

    zones: Dict[str, pd.DataFrame] = {}
    # One boolean column mask per group; dict insertion order keeps groups in
    # order of first appearance.
    group_masks: Dict[str, np.ndarray] = {}

    for cut in cuts:
        name, start, end, group = _parse_cut(cut)

        try:
            lower = float(start)
            upper = float(end)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError(
                "start and end must be numeric values (int/float or convertible strings)."
            ) from err

        # Accept boundaries given in either order.
        if lower > upper:
            lower, upper = upper, lower

        # Inclusive on both ends; NaN labels are excluded explicitly.
        mask = (~np.isnan(col_nums)) & (col_nums >= lower) & (col_nums <= upper)

        if group is None:
            zones[name] = Xcal.loc[:, mask]
        elif group in group_masks:
            # Union of masks: no duplicated columns when cuts overlap.
            group_masks[group] |= mask
        else:
            group_masks[group] = mask

    # Selecting once per group from Xcal keeps the original column order.
    for group_name, group_mask in group_masks.items():
        zones[group_name] = Xcal.loc[:, group_mask]

    return zones


def _parse_cut(cut):
    """Split one cut specification into ``(name, start, end, group)``."""
    group: Optional[str] = None

    if isinstance(cut, dict):
        start = cut.get("start")
        end = cut.get("end")
        name = cut.get("name", f"{start}-{end}")
        group = cut.get("group", None)
    elif isinstance(cut, (list, tuple)):
        if len(cut) == 2:
            start, end = cut
            name = f"{start}-{end}"
        elif len(cut) == 3:
            name, start, end = cut
        elif len(cut) == 4:
            name, start, end, group = cut
        else:
            raise ValueError("Cuts in tuple/list format must have 2, 3, or 4 elements.")
    else:
        raise ValueError("Each cut must be a dict or a tuple/list.")

    return name, start, end, group