# Source code for dataset_hub._core.config_manager

from pathlib import Path
from typing import Any, Dict

import yaml


class ConfigManager:
    """Factory to load and build dataset configurations.

    Responsibilities:
        - Find config file by dataset_name and task_type
        - Load YAML into dict
        - Transform dataset_parts schema into provider-based schema
    """

    @staticmethod
    def load_config(dataset_name: str, task_type: str) -> Dict[str, Any]:
        """Load and return the dataset configuration as a dictionary.

        Converts from the dataset_parts schema to the provider-based schema:

        Input:  {"dataset_parts": [{"name": "...", "source": {...}, ...}]}
        Output: {"provider": {"type": "...", "params": {...}}}

        Args:
            dataset_name (str): Name of the dataset (file without extension).
            task_type (str): Type of task (e.g., "classification").

        Returns:
            dict: Loaded configuration with provider-based schema.

        Raises:
            FileNotFoundError: If the YAML file does not exist.
            ValueError: If the YAML content or structure is invalid.
        """
        config_path = ConfigManager.build_config_path(dataset_name, task_type)
        raw_config = ConfigManager.load_raw_config(config_path)
        return ConfigManager._transform_to_provider_schema(raw_config)

    @staticmethod
    def build_config_path(dataset_name: str, task_type: str) -> Path:
        """Build the file path to the dataset's YAML configuration.

        Layout: <dataset_hub>/<task_type>/_configs/<dataset_name>.yaml,
        resolved relative to this module's location (two levels up from
        _core gives the dataset_hub package root).

        Args:
            dataset_name (str): Name of the dataset.
            task_type (str): Type of task.

        Returns:
            Path: Full path to the YAML config file.
        """
        current_file_path = Path(__file__).resolve()
        dataset_hub_path = current_file_path.parent.parent
        return dataset_hub_path / task_type / "_configs" / f"{dataset_name}.yaml"

    @staticmethod
    def load_raw_config(config_path: Path) -> Dict[str, Any]:
        """Load the raw YAML configuration from the given path.

        Args:
            config_path (Path): Path to the YAML configuration file.

        Returns:
            dict: Configuration loaded from YAML.

        Raises:
            FileNotFoundError: If the YAML file does not exist.
            ValueError: If the YAML file is empty or not a mapping.
        """
        if not config_path.exists():
            # Only the last three path components are reported, which keeps
            # the message short; presumably this also avoids leaking the
            # absolute install path.
            raise FileNotFoundError(
                f"Dataset config not found: {config_path.parts[-3:]}"
            )
        # Explicit encoding: config files are UTF-8 regardless of the
        # platform's locale default.
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        # safe_load returns None for an empty file and may yield scalars or
        # lists for malformed configs; the declared contract is a mapping,
        # so fail loudly here instead of propagating a non-dict downstream.
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Dataset config must be a YAML mapping, "
                f"got {type(loaded).__name__}: {config_path.parts[-3:]}"
            )
        dataset_config: Dict[str, Any] = loaded
        return dataset_config

    @staticmethod
    def _transform_to_provider_schema(raw_config: Dict[str, Any]) -> Dict[str, Any]:
        """Transform dataset_parts schema into provider-based schema.

        Input schema:
            dataset_parts:
              - name: "iris"
                pack_type: "table"
                as_type: "pd.DataFrame"
                source:
                  type: "url"
                  url: "https://..."
                  format: "csv"
                read_kwargs:
                  sep: ","

        Output schema:
            provider:
              type: "dataframe"
              params:
                source:
                  type: "url"
                  url: "https://..."
                  format: "csv"
                read_kwargs:
                  sep: ","

        Args:
            raw_config (Dict[str, Any]): Raw configuration with dataset_parts.

        Returns:
            Dict[str, Any]: Configuration with provider-based schema.

        Raises:
            ValueError: If the configuration structure is invalid.
        """
        if "dataset_parts" not in raw_config:
            raise ValueError("Configuration must contain 'dataset_parts' key")

        dataset_parts = raw_config["dataset_parts"]
        if not isinstance(dataset_parts, list) or len(dataset_parts) == 0:
            raise ValueError("'dataset_parts' must be a non-empty list")

        # Take the first dataset part (for now, support single-part datasets)
        part = dataset_parts[0]
        # Guard before the `in` checks below: `"source" in part` would
        # "succeed" on a plain string element instead of raising.
        if not isinstance(part, dict):
            raise ValueError("Each dataset part must be a mapping")

        if "source" not in part:
            raise ValueError("Dataset part must contain 'source' key")

        source = part["source"]
        if "type" not in source:
            raise ValueError("Source must contain 'type' key")

        # Build provider params from source and optional read_kwargs.
        params: Dict[str, Any] = {"source": source}
        if "read_kwargs" in part:
            params["read_kwargs"] = part["read_kwargs"]

        # Return provider-based schema
        return {
            "provider": {
                "type": "dataframe",
                "params": params,
            }
        }