Source code for dataset_hub._core.provider.provider

from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from typing import Any, Dict, Generic, Type

from dataset_hub._core.data_bundle import UserDataT


@dataclass
class ProviderConfig:
    """
    Base class for all provider configuration models.

    Concrete provider configurations must inherit from this class.
    These dataclasses define the structure, defaults, and type hints
    for a provider's configuration, and are used by the Provider class
    to validate and normalize incoming config dictionaries.

    The base class itself declares no fields; subclasses add their own.
    """


class Provider(ABC, Generic[UserDataT]):
    """
    Abstract base class for all data providers.

    A provider loads a dataset from some source (URL, file, built-in dataset, etc.)
    according to a configuration model defined in `ConfigClass`.

    The provider lifecycle consists of:
        1. Normalization — convert a raw dict into a normalized config dict
           using the dataclass `ConfigClass`.
        2. Optional transformation — post-process or enrich the normalized config.
        3. Data loading — implemented in `load()`.

    Attributes:
        config (Dict[str, Any]):
            The normalized and optionally transformed configuration dictionary.
        ConfigClass (Type[ProviderConfig]):
            A dataclass defining the structure of the provider's configuration.
            Must be overridden by subclasses.
    """

    # Annotated only, deliberately left unassigned: every concrete provider
    # has to supply its own configuration dataclass.
    ConfigClass: Type[ProviderConfig]

    def __init__(self, config: Dict[str, Any]) -> None:
        """
        Initialize the provider with the given raw configuration dictionary.

        The initialization pipeline consists of two stages:
            - `_normalize_config()`: Build the provider's `ConfigClass`
              dataclass from the raw config and convert it back to a dict.
            - `_transform_config()`: Optionally apply additional processing to the
              normalized configuration.

        The final result of both steps is stored in `self.config`.

        Args:
            config (Dict[str, Any]):
                The raw configuration dictionary supplied by the dataset definition.

        Raises:
            TypeError:
                If the subclass does not define `ConfigClass` (or sets it to None).
            ValueError:
                If the configuration cannot be used to construct `ConfigClass`.
        """
        # `ConfigClass` is only annotated on the base class, so a subclass that
        # forgets to define it has no such attribute at all. Look it up with a
        # default so the documented TypeError is raised rather than an
        # AttributeError.
        if getattr(self, "ConfigClass", None) is None:
            raise TypeError(
                f"{self.__class__.__name__} must define ConfigClass = SomeDataclass"
            )
        normalized = self._normalize_config(config)
        transformed = self._transform_config(normalized)
        self.config = transformed

    @abstractmethod
    def load(self) -> UserDataT:
        """
        Load and return the dataset according to the provider's configuration.

        This method must be implemented by all concrete providers.

        Returns:
            UserDataT: The loaded dataset object. Typically a pd.DataFrame for
            single-table datasets, but can be any data type (e.g., dict, list,
            graph, array).
        """
        ...

    def _normalize_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize the raw configuration using the provider's ConfigClass.

        This method constructs an instance of `ConfigClass` by passing the raw
        dictionary as keyword arguments. The dataclass constructor rejects
        unknown keys and missing required fields and fills in declared default
        values. Note that dataclasses do not check field types at runtime, so
        type hints are not enforced here unless `ConfigClass` validates them
        itself (e.g. in `__post_init__`). The resulting dataclass instance is
        then converted back into a plain dictionary with `asdict()`.

        Subclasses normally do not override this method.

        Args:
            config (Dict[str, Any]):
                The raw configuration dictionary.

        Returns:
            Dict[str, Any]:
                A normalized configuration dictionary.

        Raises:
            ValueError:
                If constructing `ConfigClass` from the config raises a
                TypeError (e.g. the config does not match the dataclass
                signature).
        """
        # Keep the try body to the constructor call only, so that only
        # signature mismatches are reported as an invalid config.
        try:
            inst = self.ConfigClass(**config)
        except TypeError as e:
            raise ValueError(
                f"Invalid config for {self.__class__.__name__}: {e}"
            ) from e

        return asdict(inst)

    def _transform_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Apply optional post-processing to the normalized configuration.

        This hook allows subclasses to perform additional adjustments to the
        configuration after it has been normalized through the dataclass.

        By default, this method returns the configuration unchanged.

        Args:
            config (Dict[str, Any]):
                The already normalized configuration dictionary.

        Returns:
            Dict[str, Any]:
                The transformed configuration dictionary.
        """
        return config