Source code for ewoksxas.tasks.read_scans

from collections import defaultdict
from dataclasses import dataclass

import h5py
import numpy as np
from ewokscore import Task
from ewokscore.model import BaseInputModel, BaseOutputModel
from Orange.data import Table
from silx.io import h5py_utils

from ewoksxas.converters.orange import Converter, VarType
from ewoksxas.converters.resample import interpolate_spectrum
from ewoksxas.io.hdf5 import read_data_at_path



[docs]
@dataclass(slots=True)
class CounterInfo:
    """A counter to read: its output name and HDF5 path within each scan.

    One output row is produced per (scan, counter) pair for which the counter
    dataset could be read.
    """

    # Label stored in the "Counter" meta of every row read from this counter.
    name: str
    # Location of the dataset relative to the scan: read as "<scan name>/<path>".
    path: str




[docs]
@dataclass(slots=True)
class MetadataInfo:
    """A metadata field to read: output name, HDF5 path, and variable type.

    ``type`` is a token ("auto" | "categorical" | "text" | "numeric") controlling
    how the values are materialized as an Orange variable; "auto" lets the
    converter infer the type.
    """

    # Name of the meta column added to the output table for this field.
    name: str
    # Location of the value relative to the scan: read as "<scan name>/<path>"
    # (or as the bare path when the scan name is empty).
    path: str
    # Type token; resolved with ``VarType.from_label`` when the column is built.
    type: str = "auto"




[docs]
class Inputs(BaseInputModel):
    """Inputs of the ``ReadScans`` task."""

    # Table listing the scans to read. It must have no feature columns and is
    # expected to carry "Filename" and "Scan Name" metas (one scan per row).
    Data: Table
    # Path of the x-axis dataset inside each scan ("<scan name>/<x>").
    x: str
    # Counters to read for every scan; at least one is required.
    counters: list[CounterInfo]
    # Extra per-scan values to attach to the output rows as metas.
    metadata: list[MetadataInfo]
    # Optional regular grid given as [x_min, x_max, n_points], expanded with
    # np.linspace. When unset or empty, the x values of the first readable scan
    # are used as the common grid.
    x_interp_grid: list[float] | None = None




[docs]
class Outputs(BaseOutputModel):
    """Outputs of the ``ReadScans`` task."""

    # One row per (scan, counter) that could be read: the counter values on the
    # common x grid as features, plus "Filename", "Scan Name", "Counter" and
    # the requested custom metadata as metas.
    Data: Table




[docs]
class ReadScans(Task, input_model=Inputs, output_model=Outputs):  # type: ignore
    """Task to read scan data and metadata from files."""


[docs]
    def run(self):
        """Read every requested scan and publish the result as one Orange Table.

        A ValueError is raised when no scan provided an x axis or when no
        counter data could be read at all.
        """
        x_grid, rows, row_metas = self._read_rows(self._collect_scans())

        nothing_read = not rows or x_grid is None
        if nothing_read:
            raise ValueError("No valid data found in the specified paths.")

        self.outputs.Data = self._build_table(x_grid, rows, row_metas)


    def _collect_scans(self) -> list[tuple[str, str]]:
        """Check the task inputs and list the (filename, scan name) pairs to read.

        A ValueError is raised when the input table has feature columns, when
        no counter was requested, or when the table yields no scan at all.
        """
        table = self.inputs.Data

        n_features = len(table.domain.attributes)
        if n_features > 0:
            raise ValueError("Input data table should not contain features.")

        if not self.inputs.counters:
            raise ValueError("At least one counter must be specified.")

        found = self._scans_from_table(table)
        if found:
            return found
        raise ValueError("No scans to process.")

    @staticmethod
    def _scans_from_table(data: Table) -> list[tuple[str, str]]:
        """List the (filename, scan name) pairs stored in the metas of ``data``.

        An empty list is returned when either the "Filename" or the
        "Scan Name" meta is absent from the table.
        """
        converter = Converter.from_table(data)

        available = converter.get_meta_names()
        for required in ("Filename", "Scan Name"):
            if required not in available:
                return []

        files = converter.get_meta_values("Filename")
        names = converter.get_meta_values("Scan Name")

        pairs: list[tuple[str, str]] = []
        # strict=True: both meta columns must have the same number of entries.
        for file_value, name_value in zip(files, names, strict=True):
            pairs.append((str(file_value), str(name_value)))
        return pairs

    def _read_rows(
        self, scans: list[tuple[str, str]]
    ) -> tuple[np.ndarray | None, list[np.ndarray], list[dict]]:
        """Read one row per (scan, counter) pair.

        Returns the common x grid (None when no scan had a readable x axis),
        the list of counter arrays, and the matching list of meta dicts. Scans
        without an x dataset and counters without a dataset are skipped.
        """
        # Bucket the scan names per file so that every file is opened only once.
        scans_per_file: dict[str, list[str]] = defaultdict(list)
        for path, name in scans:
            scans_per_file[path].append(name)

        grid: np.ndarray | None = None
        rows: list[np.ndarray] = []
        row_metas: list[dict] = []

        for path, names in scans_per_file.items():
            with h5py_utils.File(path, "r") as h5:
                for name in names:
                    x_values = self._read_x(h5, name)
                    if x_values is None:
                        continue

                    # The first readable scan fixes the grid for all rows.
                    if grid is None:
                        grid = self._common_x(x_values)

                    shared = self._read_scan_metas(h5, name, path)

                    for counter in self.inputs.counters:
                        values = self._read_counter(h5, name, counter, x_values, grid)
                        if values is not None:
                            rows.append(values)
                            row_metas.append(shared | {"Counter": counter.name})

        return grid, rows, row_metas

    def _read_x(self, h5: h5py.File, scan_name: str) -> np.ndarray | None:
        """Return the x-axis values of a scan as float64, or None if not found.

        A ValueError is raised when the x path resolves to a list, which is
        reported as the path pointing to a group rather than a dataset.
        """
        raw = read_data_at_path(h5, f"{scan_name}/{self.inputs.x}")
        if isinstance(raw, list):
            raise ValueError(f"x '{self.inputs.x}' points to a group, not a dataset")
        return None if raw is None else np.asarray(raw, dtype=np.float64)

    def _common_x(self, x_data: np.ndarray) -> np.ndarray:
        """The shared x grid: the custom interpolation grid or the first scan's."""
        if self.inputs.x_interp_grid:
            x_min, x_max, x_npoints = self.inputs.x_interp_grid
            return np.linspace(x_min, x_max, int(x_npoints))
        return x_data

    def _read_scan_metas(self, h5: h5py.File, scan_name: str, file_path: str) -> dict:
        """Collect the metas shared by all rows of one scan.

        The result always holds "Filename", holds "Scan Name" when the scan
        name is non-empty, and holds one entry per requested metadata field
        that could be read. A ValueError is raised when a metadata path
        resolves to a list (reported as a group instead of a dataset).
        """
        collected: dict = {"Filename": file_path}
        if scan_name:
            collected["Scan Name"] = scan_name

        for spec in self.inputs.metadata:
            # Without a scan name the metadata path is used as given.
            location = f"{scan_name}/{spec.path}" if scan_name else spec.path
            found = read_data_at_path(h5, location)
            if isinstance(found, list):
                raise ValueError(
                    f"Metadata path '{spec.path}' points to a group, not a dataset"
                )
            if found is not None:
                collected[spec.name] = found

        return collected

    def _read_counter(
        self,
        h5: h5py.File,
        scan_name: str,
        counter: CounterInfo,
        x_data: np.ndarray,
        common_x: np.ndarray,
    ) -> np.ndarray | None:
        """Return one counter of a scan on the common x grid, or None if absent.

        The values are converted to float64. They are passed through
        ``interpolate_spectrum`` only when the x axis of the scan differs from
        the common grid. A ValueError is raised when the counter path resolves
        to a list (reported as a group instead of a dataset).
        """
        raw = read_data_at_path(h5, f"{scan_name}/{counter.path}")
        if raw is None:
            return None
        if isinstance(raw, list):
            raise ValueError(
                f"Counter '{counter.path}' points to a group, not a dataset"
            )

        values = np.asarray(raw, dtype=np.float64)
        if np.array_equal(x_data, common_x):
            # Already sampled on the common grid: nothing to resample.
            return values
        return interpolate_spectrum(common_x, x_data, values)

    def _build_table(
        self, common_x: np.ndarray, counter_data: list[np.ndarray], metas: list[dict]
    ) -> Table:
        """Turn the collected rows into the output Orange Table.

        The counter arrays become the features (over ``common_x``); the
        standard metas are added first, followed by one meta per requested
        metadata field.
        """
        builder = Converter()
        stacked = np.array(counter_data)
        builder.add_features(common_x, stacked)

        self._add_standard_metas(builder, metas)
        for field_spec in self.inputs.metadata:
            self._add_custom_meta(builder, field_spec, metas)

        table = builder.to_table()
        return table

    @staticmethod
    def _add_standard_metas(converter: Converter, metas: list[dict]) -> None:
        """Add the metas every output row carries: Filename, Scan Name, Counter.

        Filename and Scan Name are identifiers (strings); Counter is categorical
        (discrete) so it can group spectra directly in Orange widgets such as
        "average". No visible ID column is added: a unique per-scan row identity
        is derived downstream by the converter as a content hash of
        (Filename, Scan Name) (see ``Converter.hash_id``).

        Rows without a "Scan Name" entry get an empty string for that meta.
        """

        def as_column(items: list) -> np.ndarray:
            # Object dtype keeps the entries as Python objects.
            return np.array(items, dtype=object)

        converter.add_meta("Filename", as_column([row["Filename"] for row in metas]))
        converter.add_meta(
            "Scan Name", as_column([row.get("Scan Name", "") for row in metas])
        )
        converter.add_meta("Counter", as_column([row["Counter"] for row in metas]))

    @staticmethod
    def _add_custom_meta(
        converter: Converter, spec: MetadataInfo, metas: list[dict]
    ) -> None:
        """Add one custom meta column, honoring the type token of ``spec``.

        Rows of ``metas`` that lack ``spec.name`` are padded with a missing
        value: "" for text/categorical columns and np.nan for numeric ones.
        For "auto" columns the padding follows the values that are present:
        "" when any of them is a string, np.nan otherwise. Padding string
        values with np.nan would make ``np.array`` coerce every gap into the
        literal text "nan", which is indistinguishable from real data.

        A ValueError is raised when any value read for this metadata is not a
        scalar (e.g. an array).
        """
        var_type = VarType.from_label(spec.type)
        string_backed = var_type in (VarType.TEXT, VarType.CATEGORICAL)

        # Only values actually read need checking; the padding is always scalar.
        present = [meta[spec.name] for meta in metas if spec.name in meta]
        if not all(np.isscalar(value) for value in present):
            raise ValueError(
                f"Metadata '{spec.name}' contains non-scalar values "
                "(e.g. arrays). Metadata must be single values "
                "(numbers or strings)."
            )

        # String-backed columns pad with "" so the array stays string-typed;
        # numeric columns pad with np.nan (Orange's missing sentinel). "auto"
        # columns use "" only when string values are present, so the gaps are
        # not stringified to "nan" by numpy's dtype inference.
        auto_with_strings = var_type is VarType.AUTO and any(
            isinstance(value, str) for value in present
        )
        missing = "" if string_backed or auto_with_strings else np.nan
        values = [meta.get(spec.name, missing) for meta in metas]

        if var_type is VarType.AUTO:
            converter.add_meta(spec.name, np.array(values))
        elif string_backed:
            converter.add_meta(
                spec.name, np.array(values, dtype=object), var_type=var_type
            )
        else:
            converter.add_meta(
                spec.name, np.array(values, dtype=np.float64), var_type=var_type
            )