Source code for smx.datasets.synthetic

import numpy as np
import pandas as pd



[docs]
def gaussian_peak_model(x, center, amplitude, width):
    """
    Generate a one-dimensional Gaussian peak.

    Implements the equation:
    g(x) = A * exp(-(x - c)² / (2σ²))

    Parameters
    ----------
    x : array_like
        Spectral axis (wavelengths, energy, channels). Lists, tuples and
        scalars are accepted; the input is converted with ``np.asarray``.
    center : float
        Central position of the peak (same units as x).
    amplitude : float
        Maximum height of the peak (intensity at the center).
    width : float
        Standard deviation (σ) of the peak — controls spread/width.
        Only ``width ** 2`` enters the formula, so the sign is irrelevant;
        a width of exactly zero divides by zero.

    Returns
    -------
    ndarray
        Array with the Gaussian peak evaluated at each point of x.

    Notes
    -----
    - For XRF: use a small width (~5–15) to simulate narrow lines.
    - For Vis-NIR: use a larger width (~20–50) for broad absorption bands.
    """
    # Plain Python sequences do not support ``sequence - scalar``; coerce
    # first so every array_like input is handled as documented.
    x = np.asarray(x)
    return amplitude * np.exp(-(x - center) ** 2 / (2 * width ** 2))



def _resolve_peak_parameters(
    peak,
    default_amplitude_mean,
    default_amplitude_std,
    default_width_mean,
    default_width_std,
):
    """
    Normalise one peak definition into a 5-tuple of parameters.

    ``peak`` is either a bare centre value or a dict. A dict must carry
    ``'center'`` and may override any of ``'amplitude_mean'``,
    ``'amplitude_std'``, ``'width_mean'`` and ``'width_std'``; keys that are
    absent take the corresponding ``default_*`` argument.

    Returns
    -------
    tuple
        ``(center, amplitude_mean, amplitude_std, width_mean, width_std)``.

    Raises
    ------
    KeyError
        If ``peak`` is a dict without a ``'center'`` key.
    """
    # Non-dict definitions are the centre itself; every other value is the
    # supplied default.
    if not isinstance(peak, dict):
        return (
            peak,
            default_amplitude_mean,
            default_amplitude_std,
            default_width_mean,
            default_width_std,
        )

    if "center" not in peak:
        raise KeyError("Each peak dictionary must include the 'center' key.")

    # Insertion order of this table fixes the order of the returned values.
    fallbacks = {
        "amplitude_mean": default_amplitude_mean,
        "amplitude_std": default_amplitude_std,
        "width_mean": default_width_mean,
        "width_std": default_width_std,
    }
    resolved = tuple(peak.get(key, value) for key, value in fallbacks.items())
    return (peak["center"],) + resolved


def _generate_single_spectrum(
    x,
    peaks,
    amplitude_mean=1.0,
    amplitude_std=0.1,
    width_mean=15.0,
    width_std=2.0,
    noise_std=0.02,
):
    """
    Build one synthetic spectrum: a noisy baseline plus randomised peaks.

    Internal helper for generate_synthetic_spectral_data. All random draws
    come from NumPy's global random state (``np.random.normal``).

    Parameters
    ----------
    x : ndarray
        Spectral axis.
    peaks : list of float or list of dict
        Peak definitions. Each entry is either

        - a float giving the peak centre, or
        - a dict with ``center`` (required) and any of ``amplitude_mean``,
          ``amplitude_std``, ``width_mean``, ``width_std`` (optional).

        Keys missing from a peak dict fall back to the arguments below.
    amplitude_mean, amplitude_std : float
        Mean and standard deviation used to draw each peak amplitude.
    width_mean, width_std : float
        Mean and standard deviation used to draw each peak width.
    noise_std : float
        Standard deviation of the zero-mean Gaussian baseline noise.

    Returns
    -------
    ndarray
        Synthetic spectrum (baseline noise + peaks), one value per point of x.
    """
    class_defaults = (amplitude_mean, amplitude_std, width_mean, width_std)

    # The baseline is drawn first, before any per-peak draws.
    signal = np.random.normal(0, noise_std, len(x))

    for peak_spec in peaks:
        center, amp_mu, amp_sigma, width_mu, width_sigma = _resolve_peak_parameters(
            peak_spec, *class_defaults
        )
        # Amplitude is drawn before width for every peak.
        drawn_amplitude = np.random.normal(amp_mu, amp_sigma)
        drawn_width = np.random.normal(width_mu, width_sigma)
        signal += gaussian_peak_model(x, center, drawn_amplitude, drawn_width)

    return signal



[docs]
def generate_synthetic_spectral_data(
    classes_config,
    n_points=500,
    x_min=0,
    x_max=1000,
    seed=None,
):
    """
    Generate a synthetic spectral dataset for multiple classes.

    Returns a DataFrame where:
    - First column: ``'Class'`` (values defined by the user: 'A', 'B', 'C', …).
    - Remaining columns: spectral variables (intensity values).
    - Rows: individual samples.

    Parameters
    ----------
    classes_config : list of dict
        List of dicts, each defining one class. Supported keys:

        - ``'name'`` (str): class label (e.g. ``'A'``, ``'B'``, ``'Soil'``).
        - ``'n_samples'`` (int): number of samples to generate.
        - ``'peaks'`` (list): peak definitions on the spectral axis.

          Supported formats::

              [250, 550, 700]

          or::

              [
                  {'center': 250, 'amplitude_mean': 0.9, 'width_mean': 10},
                  {'center': 550, 'amplitude_mean': 1.3, 'width_mean': 18},
                  {'center': 700, 'amplitude_mean': 0.7, 'width_mean': 25},
              ]

          The second form allows per-peak amplitude/width customisation.
          Optional per-peak keys: ``amplitude_mean``, ``amplitude_std``,
          ``width_mean``, ``width_std``. Missing keys fall back to the
          class-level defaults below.
        - ``'amplitude_mean'`` (float, optional, default ``1.0``): mean peak amplitude.
        - ``'amplitude_std'`` (float, optional, default ``0.1``): std dev of amplitude.
        - ``'width_mean'`` (float, optional, default ``15.0``): mean peak width (σ).
        - ``'width_std'`` (float, optional, default ``2.0``): std dev of peak width.
        - ``'noise_std'`` (float, optional, default ``0.02``): std dev of baseline noise.

        Example::

            [
                {
                    'name': 'A',
                    'n_samples': 50,
                    'peaks': [
                        {'center': 250, 'amplitude_mean': 0.9, 'width_mean': 12},
                        {'center': 550, 'amplitude_mean': 1.4, 'width_mean': 20},
                        {'center': 700, 'amplitude_mean': 0.8, 'width_mean': 16},
                        {'center': 850, 'amplitude_mean': 1.1, 'width_mean': 24},
                    ],
                    'amplitude_mean': 1.0,
                    'amplitude_std': 0.1,
                    'width_mean': 15.0,
                    'width_std': 2.0,
                },
                {
                    'name': 'B',
                    'n_samples': 50,
                    'peaks': [250, 700, 850],
                    'amplitude_mean': 1.2,
                    'width_mean': 20.0,
                },
            ]

        An empty list, or classes whose ``'n_samples'`` are all zero, yields
        an empty DataFrame that still carries every column.

    n_points : int, default ``500``
        Number of points on the spectral axis (resolution).
    x_min, x_max : float, default ``0``, ``1000``
        Limits of the spectral axis (e.g. 400–1000 nm for Vis-NIR,
        0–40 keV for XRF).
    seed : int, optional
        Random seed for reproducibility. When given, it is applied to
        NumPy's global random state via ``np.random.seed``.

    Returns
    -------
    df : pandas.DataFrame
        Synthetic spectral dataset.

        - Column 0: ``'Class'`` (class name from *classes_config*).
        - Columns 1 … n_points: spectral intensities, named with the string
          form of the x-axis values.
        - Shape: ``(total_samples, n_points + 1)``.

    Raises
    ------
    KeyError
        If a class dict lacks ``'name'``, ``'n_samples'`` or ``'peaks'``.
    """
    if seed is not None:
        np.random.seed(seed)

    x_axis = np.linspace(x_min, x_max, n_points)

    spectra_list = []
    labels_list = []

    for config in classes_config:
        class_name = config["name"]
        n_samples = config["n_samples"]
        peaks = config["peaks"]
        amplitude_mean = config.get("amplitude_mean", 1.0)
        amplitude_std = config.get("amplitude_std", 0.1)
        width_mean = config.get("width_mean", 15.0)
        width_std = config.get("width_std", 2.0)
        noise_std = config.get("noise_std", 0.02)

        for _ in range(n_samples):
            spectrum = _generate_single_spectrum(
                x_axis,
                peaks,
                amplitude_mean,
                amplitude_std,
                width_mean,
                width_std,
                noise_std,
            )
            spectra_list.append(spectrum)
            labels_list.append(class_name)

    if spectra_list:
        spectra_array = np.array(spectra_list)
    else:
        # np.array([]) is 1-D and cannot be matched against the spectral
        # columns; build an explicit zero-row 2-D array instead.
        spectra_array = np.empty((0, len(x_axis)), dtype=float)

    column_names = x_axis.astype(str).tolist()
    df = pd.DataFrame(spectra_array, columns=column_names)
    df.insert(0, "Class", labels_list)

    return df