Source code for smx.datasets.synthetic

import numpy as np
import pandas as pd


def gaussian_peak_model(x, center, amplitude, width):
    """
    Generate a one-dimensional Gaussian peak.

    Implements the equation::

        g(x) = A * exp(-(x - c)² / (2σ²))

    Parameters
    ----------
    x : array_like
        Spectral axis (wavelengths, energy, channels). Lists, tuples,
        scalars and arrays of any numeric dtype are accepted; the input is
        converted to a float array before the peak is evaluated.
    center : float
        Central position of the peak (same units as x).
    amplitude : float
        Maximum height of the peak (intensity at the center).
    width : float
        Standard deviation (σ) of the peak — controls spread/width.
        Only ``width ** 2`` is used, so the sign of ``width`` does not
        affect the result.

    Returns
    -------
    ndarray
        Array with the Gaussian peak evaluated at each point of x.

    Notes
    -----
    - For XRF: use a small width (~5–15) to simulate narrow lines.
    - For Vis-NIR: use a larger width (~20–50) for broad absorption bands.
    """
    # Coerce to float up front: plain Python sequences do not support
    # ``x - center``, and unsigned-integer axes would wrap around in the
    # subtraction instead of producing negative offsets.
    x = np.asarray(x, dtype=float)
    return amplitude * np.exp(-((x - center) ** 2) / (2 * width ** 2))
def _resolve_peak_parameters(
    peak,
    default_amplitude_mean,
    default_amplitude_std,
    default_width_mean,
    default_width_std,
):
    """
    Resolve one peak definition into its five numeric parameters.

    A non-dict ``peak`` is taken to be the peak centre itself and is paired
    with the supplied defaults. A dict ``peak`` must contain ``'center'``;
    any of ``'amplitude_mean'``, ``'amplitude_std'``, ``'width_mean'`` and
    ``'width_std'`` that it omits fall back to the matching default.

    Returns
    -------
    tuple
        ``(center, amplitude_mean, amplitude_std, width_mean, width_std)``.

    Raises
    ------
    KeyError
        If ``peak`` is a dict without a ``'center'`` key.
    """
    # Scalar form: the value is the centre, everything else is defaulted.
    if not isinstance(peak, dict):
        return (
            peak,
            default_amplitude_mean,
            default_amplitude_std,
            default_width_mean,
            default_width_std,
        )

    if "center" not in peak:
        raise KeyError("Each peak dictionary must include the 'center' key.")

    position = peak["center"]
    fallbacks = (
        ("amplitude_mean", default_amplitude_mean),
        ("amplitude_std", default_amplitude_std),
        ("width_mean", default_width_mean),
        ("width_std", default_width_std),
    )
    resolved = tuple(peak.get(key, fallback) for key, fallback in fallbacks)
    return (position, *resolved)


def _generate_single_spectrum(
    x,
    peaks,
    amplitude_mean=1.0,
    amplitude_std=0.1,
    width_mean=15.0,
    width_std=2.0,
    noise_std=0.02,
):
    """
    Build one synthetic spectrum: Gaussian baseline noise plus randomised peaks.

    Internal helper for generate_synthetic_spectral_data.

    Parameters
    ----------
    x : ndarray
        Spectral axis.
    peaks : list of float or list of dict
        Peak definitions. Each item can be:

        - float: peak centre position.
        - dict: custom peak configuration with keys ``center`` (required),
          ``amplitude_mean``, ``amplitude_std``, ``width_mean``,
          ``width_std`` (all optional). Keys missing from a peak dict fall
          back to the class-level values passed to this function.
    amplitude_mean, amplitude_std : float
        Mean and standard deviation of peak amplitude.
    width_mean, width_std : float
        Mean and standard deviation of peak width.
    noise_std : float
        Standard deviation of the Gaussian baseline noise.

    Returns
    -------
    ndarray
        Synthetic spectrum (baseline noise + peaks).
    """
    class_defaults = (amplitude_mean, amplitude_std, width_mean, width_std)

    # The noise floor is drawn first, then (amplitude, width) per peak in
    # list order; this fixes how the global NumPy random stream is consumed.
    signal = np.random.normal(0, noise_std, len(x))

    for peak_def in peaks:
        position, amp_mu, amp_sigma, width_mu, width_sigma = (
            _resolve_peak_parameters(peak_def, *class_defaults)
        )
        height = np.random.normal(amp_mu, amp_sigma)
        sigma = np.random.normal(width_mu, width_sigma)
        signal += gaussian_peak_model(x, position, height, sigma)

    return signal
def generate_synthetic_spectral_data(
    classes_config,
    n_points=500,
    x_min=0,
    x_max=1000,
    seed=None,
):
    """
    Generate a synthetic spectral dataset for multiple classes.

    Returns a DataFrame where:

    - First column: ``'Class'`` (values defined by the user: 'A', 'B', 'C', …).
    - Remaining columns: spectral variables (intensity values).
    - Rows: individual samples.

    Parameters
    ----------
    classes_config : list of dict
        List of dicts, each defining one class. Supported keys:

        - ``'name'`` (str): class label (e.g. ``'A'``, ``'B'``, ``'Soil'``).
        - ``'n_samples'`` (int): number of samples to generate.
        - ``'peaks'`` (list): peak definitions on the spectral axis.
          Supported formats::

              [250, 550, 700]

          or::

              [
                  {'center': 250, 'amplitude_mean': 0.9, 'width_mean': 10},
                  {'center': 550, 'amplitude_mean': 1.3, 'width_mean': 18},
                  {'center': 700, 'amplitude_mean': 0.7, 'width_mean': 25},
              ]

          The second form allows per-peak amplitude/width customisation.
          Optional per-peak keys: ``amplitude_mean``, ``amplitude_std``,
          ``width_mean``, ``width_std``. Missing keys fall back to the
          class-level defaults below.
        - ``'amplitude_mean'`` (float, optional, default ``1.0``): mean peak
          amplitude.
        - ``'amplitude_std'`` (float, optional, default ``0.1``): std dev of
          amplitude.
        - ``'width_mean'`` (float, optional, default ``15.0``): mean peak
          width (σ).
        - ``'width_std'`` (float, optional, default ``2.0``): std dev of peak
          width.
        - ``'noise_std'`` (float, optional, default ``0.02``): std dev of
          baseline noise.

        Example::

            [
                {
                    'name': 'A',
                    'n_samples': 50,
                    'peaks': [
                        {'center': 250, 'amplitude_mean': 0.9, 'width_mean': 12},
                        {'center': 550, 'amplitude_mean': 1.4, 'width_mean': 20},
                        {'center': 700, 'amplitude_mean': 0.8, 'width_mean': 16},
                        {'center': 850, 'amplitude_mean': 1.1, 'width_mean': 24},
                    ],
                    'amplitude_mean': 1.0,
                    'amplitude_std': 0.1,
                    'width_mean': 15.0,
                    'width_std': 2.0,
                },
                {
                    'name': 'B',
                    'n_samples': 50,
                    'peaks': [250, 700, 850],
                    'amplitude_mean': 1.2,
                    'width_mean': 20.0,
                },
            ]

        An empty list (or classes that all have ``n_samples == 0``) yields
        an empty DataFrame that still carries the full set of columns.
    n_points : int, default ``500``
        Number of points on the spectral axis (resolution).
    x_min, x_max : float, default ``0``, ``1000``
        Limits of the spectral axis (e.g. 400–1000 nm for Vis-NIR,
        0–40 keV for XRF).
    seed : int, optional
        Random seed for reproducibility. When given, NumPy's global random
        state is reseeded via ``np.random.seed``.

    Returns
    -------
    df : pandas.DataFrame
        Synthetic spectral dataset.

        - Column 0: ``'Class'`` (str — class name from *classes_config*).
        - Columns 1 … n_points: spectral intensities named after x-axis values.
        - Shape: ``(total_samples, n_points + 1)``.

    Raises
    ------
    KeyError
        If a class dict lacks ``'name'``, ``'n_samples'`` or ``'peaks'``.
    """
    if seed is not None:
        np.random.seed(seed)

    x_axis = np.linspace(x_min, x_max, n_points)

    spectra_list = []
    labels_list = []

    for config in classes_config:
        class_name = config["name"]
        n_samples = config["n_samples"]
        peaks = config["peaks"]
        amplitude_mean = config.get("amplitude_mean", 1.0)
        amplitude_std = config.get("amplitude_std", 0.1)
        width_mean = config.get("width_mean", 15.0)
        width_std = config.get("width_std", 2.0)
        noise_std = config.get("noise_std", 0.02)

        for _ in range(n_samples):
            spectrum = _generate_single_spectrum(
                x_axis,
                peaks,
                amplitude_mean,
                amplitude_std,
                width_mean,
                width_std,
                noise_std,
            )
            spectra_list.append(spectrum)
            labels_list.append(class_name)

    if spectra_list:
        spectra_array = np.array(spectra_list)
    else:
        # With no samples, np.array([]) would be 1-D with shape (0,) and the
        # DataFrame constructor would reject it against the column labels;
        # give the empty result its proper 2-D shape explicitly.
        spectra_array = np.empty((0, len(x_axis)))

    column_names = x_axis.astype(str).tolist()

    df = pd.DataFrame(spectra_array, columns=column_names)
    df.insert(0, "Class", labels_list)
    return df