Source code for dataset_hub._core.get_data

from typing import Any, Optional

from dataset_hub._core.config_manager import ConfigManager
from dataset_hub._core.data_bundle import DataBundle
from dataset_hub._core.provider import ProviderFactory
from dataset_hub._core.utils.logger import log_dataset_doc_doc_link


@log_dataset_doc_doc_link()
def get_data(
    dataset_name: str, task_type: str, verbose: Optional[bool] = None
) -> DataBundle[Any]:
    """
    Load a dataset by name and task type and wrap it in a ``DataBundle``.

    Core backend function used by all ``.get_<dataset_name>()`` functions to
    load datasets.

    This function:

    1. Loads the dataset configuration with ``ConfigManager.load_config``
       (see :ref:`ConfigFactory`).
    2. Builds the provider with ``ProviderFactory.build_provider`` from the
       ``"provider"`` entry of that configuration (see :ref:`ProviderFactory`).
    3. Loads the dataset by calling the provider's ``load()`` method
       (see :ref:`providers`).
    4. Returns the loaded object under the ``"data"`` key of a ``DataBundle``.

    Logging of the dataset documentation link is not done in this body.
    NOTE(review): it is presumably handled by the
    ``log_dataset_doc_doc_link`` decorator, once per session, when verbose is
    enabled via the argument or :ref:`settings` -- confirm against the
    decorator's implementation.

    Args:
        dataset_name (str): The name of the dataset (corresponding to the
            YAML config file).
        task_type (str): The type of task (e.g., "classification",
            "regression").
        verbose (bool, optional): Whether to print dataset information and
            the documentation link. Defaults to ``None``; it is not read
            inside this function, and is presumably consumed by the
            decorator, which falls back to the global library setting when
            it is ``None`` -- confirm against the decorator.

    Returns:
        DataBundle: A wrapper whose ``"data"`` entry holds whatever the
        provider's ``load()`` returned.

    Example::

        dataset = get_data("titanic", "classification")
        df = dataset["data"]  # e.g. a pd.DataFrame, depending on the provider

    Raises:
        FileNotFoundError: Presumably raised by ``ConfigManager.load_config``
            when the dataset configuration YAML file is not found -- the
            check is not visible in this function.
        ValueError: Presumably raised by ``ProviderFactory.build_provider``
            when the provider type is unknown or misconfigured -- the check
            is not visible in this function.
    """
    config = ConfigManager.load_config(dataset_name, task_type)
    provider = ProviderFactory.build_provider(config["provider"])
    data = provider.load()
    return DataBundle({"data": data})