# Source code for smx.zones.extraction

"""Extract spectral zones from a DataFrame based on numeric column boundaries."""

from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd


def _parse_cut(cut: Union[Tuple, list, dict]) -> Tuple[str, object, object, Optional[str]]:
    """Normalise one cut specification to ``(name, start, end, group)``.

    ``start`` and ``end`` are returned unconverted; the caller is responsible
    for coercing them to floats. ``group`` is ``None`` when the cut does not
    declare one.

    Raises
    ------
    ValueError
        If *cut* is neither a dict nor a tuple/list, or if a tuple/list cut
        does not have 2, 3, or 4 elements.
    """
    if isinstance(cut, dict):
        start = cut.get("start")
        end = cut.get("end")
        name = cut.get("name", f"{start}-{end}")
        return name, start, end, cut.get("group", None)

    if isinstance(cut, (list, tuple)):
        if len(cut) == 2:
            start, end = cut
            return f"{start}-{end}", start, end, None
        if len(cut) == 3:
            name, start, end = cut
            return name, start, end, None
        if len(cut) == 4:
            name, start, end, group = cut
            return name, start, end, group
        raise ValueError("Cuts in tuple/list format must have 2, 3, or 4 elements.")

    raise ValueError("Each cut must be a dict or a tuple/list.")


def extract_spectral_zones(
    Xcal: pd.DataFrame,
    cuts: List[Union[Tuple, dict]],
) -> Dict[str, pd.DataFrame]:
    """
    Extract spectral zones from a DataFrame based on specified cuts.

    Parameters
    ----------
    Xcal : pd.DataFrame
        DataFrame with spectral data. Columns must be numeric (or convertible
        to numeric) values representing wavelengths / energies. Columns whose
        label cannot be converted to a number never match any zone.
    cuts : list of tuples/lists or dicts
        Each item defines a spectral zone to extract.

        * ``(start, end)`` — zone boundaries; name defaults to ``"start-end"``
        * ``(name, start, end)`` — named zone
        * ``(name, start, end, group)`` — named zone assigned to a group
        * ``{'name': str, 'start': float, 'end': float}`` — dict form
        * ``{'name': str, 'start': float, 'end': float, 'group': str}`` —
          dict form with grouping

        Boundaries are inclusive on both ends; if ``start > end`` the two are
        swapped. When multiple cuts share the same ``group`` value their
        column subsets are merged into a single zone keyed by the group name.
        Each column appears at most once in a merged zone (even when the
        grouped cuts overlap or share a boundary) and columns keep the order
        they have in *Xcal*. Cuts *without* a group are extracted individually
        under their own name.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary where keys are zone names (or group names) and values are
        DataFrames with the extracted spectral data (same row index as
        *Xcal*). Ungrouped zones come first, in the order given, followed by
        the groups in order of first appearance. A later entry with the same
        key replaces an earlier one.

    Raises
    ------
    ValueError
        If a cut is not a dict/tuple/list, if a tuple/list cut does not have
        2, 3, or 4 elements, or if ``start`` / ``end`` cannot be converted to
        ``float``.

    Examples
    --------
    >>> zones = extract_spectral_zones(X, [('Ca ka', 3.6, 3.7), ('Fe ka', 6.3, 6.5)])
    >>> zones['Ca ka'].shape
    (n_samples, n_cols_in_Ca_ka_zone)

    Group background regions into a single zone:

    >>> cuts = [
    ...     ('background 1', 1.0, 101.0, 'background'),
    ...     ('Feature 1', 101.0, 193.3),
    ...     ('background 2', 193.3, 255.4, 'background'),
    ...     ('Feature 2', 255.4, 341.6),
    ... ]
    >>> zones = extract_spectral_zones(X, cuts)
    >>> 'background' in zones      # True — merged from background 1 & 2
    True
    >>> 'background 1' in zones    # False — individual cuts absorbed into group
    False
    """
    # Numeric view of the column labels; non-convertible labels become NaN.
    col_nums = np.asarray(
        pd.to_numeric(Xcal.columns.astype(str), errors="coerce"),
        dtype=float,
    )
    # Loop-invariant: which columns carry a usable numeric label.
    is_numeric = ~np.isnan(col_nums)

    zones: Dict[str, pd.DataFrame] = {}
    # One combined boolean column mask per group, in order of first appearance.
    # Masks (rather than concatenated frames) guarantee that overlapping or
    # boundary-sharing cuts never duplicate a column and that the merged zone
    # keeps the column order of Xcal.
    group_masks: Dict[str, np.ndarray] = {}

    for cut in cuts:
        name, start, end, group = _parse_cut(cut)

        try:
            lower = float(start)
            upper = float(end)
        except Exception as exc:
            # Broad catch is deliberate: any coercion failure (None, bad
            # string, overflow, ...) is reported uniformly as ValueError.
            raise ValueError(
                "start and end must be numeric values (int/float or convertible strings)."
            ) from exc

        if lower > upper:
            lower, upper = upper, lower

        mask = is_numeric & (col_nums >= lower) & (col_nums <= upper)

        if group is None:
            zones[name] = Xcal.loc[:, mask]
        elif group in group_masks:
            group_masks[group] = group_masks[group] | mask
        else:
            group_masks[group] = mask

    # Materialise each group once from its combined mask.
    for group_name, group_mask in group_masks.items():
        zones[group_name] = Xcal.loc[:, group_mask]

    return zones