Source code for dataset_hub._core.provider.dataframe_provider

from dataclasses import dataclass, field
from typing import Any, Callable, Dict, Optional

import pandas as pd

from .provider import (
    Provider,
    ProviderConfig,
)


@dataclass
class SourceConfig:
    """
    Describe where a dataset lives and how it is encoded.

    Attributes:
        type (str): Kind of source, e.g. 'url' or 'file'.
        url (str): Location of the dataset: a URL or a file path.
        format (str): File format of the dataset, e.g. 'csv' or 'parquet'.
    """

    type: str
    url: str
    format: str


@dataclass
class DataFrameProviderConfig(ProviderConfig):
    """
    Configuration schema consumed by DataFrameProvider.

    Attributes:
        source (Dict[str, Any]): Description of the data source, carrying
            the 'type', 'url' and 'format' entries (the same fields that
            SourceConfig declares).
        read_kwargs (Dict[str, Any]): Extra keyword arguments passed
            through unchanged to the pandas reader selected for the
            format. Defaults to a fresh empty dict per instance.
    """

    # NOTE(review): earlier documentation listed the accepted type as
    # `Dict[str, Any] | SourceConfig`, but the annotation only names the
    # dict form and DataFrameProvider.load reads the value with
    # `source.get(...)` — confirm whether SourceConfig instances are
    # converted to dicts somewhere upstream.
    source: Dict[str, Any]
    read_kwargs: Dict[str, Any] = field(default_factory=dict)
class DataFrameProvider(Provider[pd.DataFrame]):
    """
    Provider that loads a dataset from a source and returns it as a pandas
    DataFrame.

    `load` validates the ``source`` section of the configuration and then
    delegates to `read_dataframe`, which picks a pandas reader from
    `_READER_REGISTRY` based on the declared format. Only the source type
    ``'url'`` is accepted by `load` at the moment.

    Supported formats are the keys of `_READER_REGISTRY`.

    NOTE(review): earlier documentation described the output as
    ``{"data": pandas.DataFrame}``; `load` in this class returns the bare
    DataFrame — confirm whether the base ``Provider`` wraps the result.
    """

    ConfigClass = DataFrameProviderConfig

    # Registry of formats and corresponding pandas reader functions
    _READER_REGISTRY: Dict[str, Callable[..., pd.DataFrame]] = {
        "csv": pd.read_csv,
        "parquet": pd.read_parquet,
        "excel": pd.read_excel,
        "json": pd.read_json,
    }

    def load(self) -> pd.DataFrame:
        """
        Fetch and load the dataset specified in the configuration.

        Reads ``self.config["source"]`` and the optional
        ``self.config["read_kwargs"]``.

        NOTE(review): the configuration is accessed with mapping syntax
        (``[...]`` / ``.get``) — presumably the base class exposes it as a
        dict; confirm against ``Provider``.

        Returns:
            pd.DataFrame: The loaded pandas DataFrame.

        Raises:
            ValueError: If the source type is not 'url', if the 'url' or
                'format' entry is missing or empty, or if the format is
                not supported.

        Errors raised by the pandas reader itself are not caught here and
        propagate to the caller unchanged.
        """
        source = self.config["source"]

        source_type = source.get("type")
        if source_type != "url":
            raise ValueError(f"Source type '{source_type}' is not supported yet")

        url = source.get("url")
        if not url:
            raise ValueError("Source must contain 'url' key")

        format_ = source.get("format")
        if not format_:
            raise ValueError("Source must contain 'format' key")

        return self.read_dataframe(
            url,
            format_,
            self.config.get("read_kwargs", {}),
        )

    def read_dataframe(
        self,
        path_or_url: str,
        format: str,
        read_kwargs: Optional[Dict[str, Any]] = None,
    ) -> pd.DataFrame:
        """
        Read a DataFrame using the pandas reader registered for a format.

        Args:
            path_or_url (str): Local file path or URL to the data; passed
                as the first positional argument to the reader.
            format (str): Data format ('csv', 'parquet', 'excel', 'json').
                Matched case-insensitively.
            read_kwargs (dict, optional): Additional keyword arguments for
                the pandas reader. ``None`` (the default) means no extra
                arguments.

        Returns:
            pd.DataFrame: Loaded DataFrame.

        Raises:
            ValueError: If the specified format is not supported.
        """
        if read_kwargs is None:
            read_kwargs = {}

        # Normalise before the lookup so 'CSV' and 'csv' hit the same entry;
        # the lowercased value is also what the error message reports.
        format = format.lower()

        reader = self._READER_REGISTRY.get(format)
        if reader is None:
            raise ValueError(
                f"Format '{format}' is not supported. "
                f"Supported formats: {list(self._READER_REGISTRY)}"
            )

        return reader(path_or_url, **read_kwargs)