# Source code for dataset_hub._core.provider.provider

from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from typing import Any, Dict, Generic, Type

from dataset_hub._core.data_bundle import UserDataT


@dataclass
class ProviderConfig:
    """
    Root dataclass from which every provider configuration model derives.

    The base declares no fields of its own. A concrete provider configuration
    must subclass it and declare the fields, defaults, and type hints that
    make up that provider's configuration. The Provider class uses these
    dataclasses to validate and normalize incoming config dictionaries.
    """
class Provider(ABC, Generic[UserDataT]):
    """
    Abstract base class for all data providers.

    A provider loads a dataset from some source (URL, file, built-in dataset,
    etc.) according to a configuration model defined in `ConfigClass`.

    The provider lifecycle consists of:

    1. Normalization — convert a raw dict into a normalized config dict by
       round-tripping it through the dataclass `ConfigClass`.
    2. Optional transformation — post-process or enrich the normalized config.
    3. Data loading — implemented in `load()`.

    Attributes:
        config (Dict[str, Any]):
            The normalized and optionally transformed configuration
            dictionary.
        ConfigClass (Type[ProviderConfig]):
            A dataclass defining the structure of the provider's
            configuration. Must be overridden by subclasses.
    """

    # Annotation only: the base class deliberately assigns no value, so the
    # attribute does not exist until a subclass defines it.
    ConfigClass: Type[ProviderConfig]

    def __init__(self, config: Dict[str, Any]) -> None:
        """
        Initialize the provider with the given raw configuration dictionary.

        The initialization pipeline consists of two stages:

        - `_normalize_config()`: Build the provider's `ConfigClass` dataclass
          from the raw config and convert it back to a dictionary.
        - `_transform_config()`: Optionally apply additional processing to
          the normalized configuration.

        The final result of both steps is stored in `self.config`.

        Args:
            config (Dict[str, Any]):
                The raw configuration dictionary supplied by the dataset
                definition.

        Raises:
            TypeError: If the subclass does not define `ConfigClass` (or
                sets it to None).
            ValueError: If the configuration cannot be used to construct
                `ConfigClass`.
        """
        # `ConfigClass` is annotated but never assigned on the base class, so
        # a subclass that forgets to define it has no such attribute at all.
        # A plain `self.ConfigClass` lookup would raise AttributeError there;
        # use getattr so the documented TypeError is raised instead.
        if getattr(self, "ConfigClass", None) is None:
            raise TypeError(
                f"{self.__class__.__name__} must define ConfigClass = SomeDataclass"
            )

        normalized = self._normalize_config(config)
        transformed = self._transform_config(normalized)
        self.config = transformed

    @abstractmethod
    def load(self) -> UserDataT:
        """
        Load and return the dataset according to the provider's configuration.

        This method must be implemented by all concrete providers.

        Returns:
            UserDataT: The loaded dataset object. Typically a pd.DataFrame
            for single-table datasets, but can be any data type (e.g., dict,
            list, graph, array).
        """
        ...

    def _normalize_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize the raw configuration using the provider's ConfigClass.

        This method constructs an instance of `ConfigClass` from the raw
        dictionary, which rejects unknown keys, requires all fields without
        defaults, and fills in the default values defined in the dataclass.
        The resulting instance is then converted back into a plain dictionary
        with `dataclasses.asdict`.

        Note that a plain dataclass does not check values against their
        type annotations; any type validation has to come from `ConfigClass`
        itself (e.g. in a `__post_init__`).

        Subclasses normally do not override this method.

        Args:
            config (Dict[str, Any]):
                The raw configuration dictionary.

        Returns:
            Dict[str, Any]: A normalized configuration dictionary.

        Raises:
            ValueError: If the config does not match the dataclass signature.
        """
        try:
            inst = self.ConfigClass(**config)
        except TypeError as e:
            # The dataclass constructor reports unexpected or missing
            # arguments as TypeError; re-raise as ValueError because the
            # problem is the supplied config values, and keep the cause.
            raise ValueError(
                f"Invalid config for {self.__class__.__name__}: {e}"
            ) from e

        return asdict(inst)

    def _transform_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Apply optional post-processing to the normalized configuration.

        This hook allows subclasses to perform additional adjustments to the
        configuration after it has been normalized. By default, this method
        returns the configuration unchanged.

        Args:
            config (Dict[str, Any]):
                The already normalized configuration dictionary.

        Returns:
            Dict[str, Any]: The transformed configuration dictionary.
        """
        return config