# Source code for dataset_hub._core.config_manager

from pathlib import Path
from typing import Any, Dict

import yaml


class ConfigManager:
    """Factory to load and build dataset configurations.

    Responsibilities:
        - Find config file by dataset_name and task_type
        - Load YAML into dict
        - Transform dataset_parts schema into provider-based schema
    """

    @staticmethod
    def load_config(dataset_name: str, task_type: str) -> Dict[str, Any]:
        """Load and return the dataset configuration as a dictionary.

        Converts from the dataset_parts schema to the provider-based schema:

        Input:  {"dataset_parts": [{"name": "...", "source": {...}, ...}]}
        Output: {"provider": {"type": "...", "params": {...}}}

        Args:
            dataset_name (str): Name of the dataset (file without extension).
            task_type (str): Type of task (e.g., "classification").

        Returns:
            dict: Loaded configuration with provider-based schema.

        Raises:
            FileNotFoundError: If the YAML file does not exist.
            ValueError: If the YAML content or structure is invalid.
        """
        config_path = ConfigManager.build_config_path(dataset_name, task_type)
        raw_config = ConfigManager.load_raw_config(config_path)
        return ConfigManager._transform_to_provider_schema(raw_config)

    @staticmethod
    def build_config_path(dataset_name: str, task_type: str) -> Path:
        """Build the file path to the dataset's YAML configuration.

        Layout: <dataset_hub>/<task_type>/_configs/<dataset_name>.yaml,
        resolved relative to this module's location (two levels up from
        _core gives the dataset_hub package root).

        Args:
            dataset_name (str): Name of the dataset.
            task_type (str): Type of task.

        Returns:
            Path: Full path to the YAML config file.
        """
        current_file_path = Path(__file__).resolve()
        dataset_hub_path = current_file_path.parent.parent
        return dataset_hub_path / task_type / "_configs" / f"{dataset_name}.yaml"

    @staticmethod
    def load_raw_config(config_path: Path) -> Dict[str, Any]:
        """Load the raw YAML configuration from the given path.

        Args:
            config_path (Path): Path to the YAML configuration file.

        Returns:
            dict: Configuration loaded from YAML.

        Raises:
            FileNotFoundError: If the YAML file does not exist.
            ValueError: If the YAML file is empty or not a mapping.
        """
        if not config_path.exists():
            # Only the last three path components are reported, which keeps
            # the message short; presumably this also avoids leaking the
            # absolute install path.
            raise FileNotFoundError(
                f"Dataset config not found: {config_path.parts[-3:]}"
            )
        # Explicit encoding: config files are UTF-8 regardless of the
        # platform's locale default.
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        # safe_load returns None for an empty file and may yield scalars or
        # lists for malformed configs; the declared contract is a mapping,
        # so fail loudly here instead of propagating a non-dict downstream.
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Dataset config must be a YAML mapping, "
                f"got {type(loaded).__name__}: {config_path.parts[-3:]}"
            )
        dataset_config: Dict[str, Any] = loaded
        return dataset_config

    @staticmethod
    def _transform_to_provider_schema(raw_config: Dict[str, Any]) -> Dict[str, Any]:
        """Transform dataset_parts schema into provider-based schema.

        Input schema:
            dataset_parts:
              - name: "iris"
                pack_type: "table"
                as_type: "pd.DataFrame"
                source:
                  type: "url"
                  url: "https://..."
                  format: "csv"
                read_kwargs:
                  sep: ","

        Output schema:
            provider:
              type: "dataframe"
              params:
                source:
                  type: "url"
                  url: "https://..."
                  format: "csv"
                read_kwargs:
                  sep: ","

        Args:
            raw_config (Dict[str, Any]): Raw configuration with dataset_parts.

        Returns:
            Dict[str, Any]: Configuration with provider-based schema.

        Raises:
            ValueError: If the configuration structure is invalid.
        """
        if "dataset_parts" not in raw_config:
            raise ValueError("Configuration must contain 'dataset_parts' key")

        dataset_parts = raw_config["dataset_parts"]
        if not isinstance(dataset_parts, list) or len(dataset_parts) == 0:
            raise ValueError("'dataset_parts' must be a non-empty list")

        # Take the first dataset part (for now, support single-part datasets)
        part = dataset_parts[0]
        # Guard before the `in` checks below: `"source" in part` would
        # "succeed" on a plain string element instead of raising.
        if not isinstance(part, dict):
            raise ValueError("Each dataset part must be a mapping")

        if "source" not in part:
            raise ValueError("Dataset part must contain 'source' key")

        source = part["source"]
        if "type" not in source:
            raise ValueError("Source must contain 'type' key")

        # Build provider params from source and optional read_kwargs.
        params: Dict[str, Any] = {"source": source}
        if "read_kwargs" in part:
            params["read_kwargs"] = part["read_kwargs"]

        # Return provider-based schema
        return {
            "provider": {
                "type": "dataframe",
                "params": params,
            }
        }