# Source code for dataset_hub.classification.datasets

from typing import Optional

import pandas as pd

from dataset_hub._core.data_bundle import DataBundle
from dataset_hub._core.get_data import get_data as _get_data

# Task category forwarded to ``_get_data`` by the dataset getters in this module.
task_type = "classification"


def get_titanic(verbose: Optional[bool] = None) -> pd.DataFrame:
    """
    Load the Titanic passenger dataset (binary classification).

    Each row describes one passenger aboard the Titanic: demographic and
    ticket-related features plus the survival outcome.

    Original dataset: `Kaggle Titanic <https://www.kaggle.com/c/titanic/data>`_

    Columns:
        - ``pclass`` (int): passenger class (1 = 1st, 2 = 2nd, 3 = 3rd)
        - ``name`` (str): full name of the passenger
        - ``sex`` (str): passenger gender
        - ``age`` (float): passenger age in years, may contain missing values
        - ``fare`` (float): ticket fare, may contain missing values
        - ``sibsp`` (int): number of siblings/spouses aboard
        - ``parch`` (int): number of parents/children aboard
        - ``survived`` 🚩 (int): **target variable**, 1 if survived, 0 otherwise

    Args:
        verbose (bool, optional):
            When True, a link to the dataset documentation (such as this
            page) is printed in the log output after loading.
            Defaults to None, which defers to the global :ref:`settings`.

    Returns:
        pandas.DataFrame:
            The Titanic dataset with every feature column and the target.

    Quick Start:
        .. code-block:: python

            from dataset_hub.classification import get_titanic

            df = get_titanic()
    """
    bundle: DataBundle[pd.DataFrame] = _get_data(
        dataset_name="titanic",
        task_type=task_type,
        verbose=verbose,
    )
    # Only the "data" entry of the loaded bundle is handed back to the caller.
    frame = bundle["data"]
    return frame
def get_iris(verbose: Optional[bool] = None) -> pd.DataFrame:
    """
    Load the Iris flower dataset (multiclass classification).

    Each row holds the measurements of one iris flower; the flowers come
    from three different species.

    Original dataset: `UCI Iris <https://archive.ics.uci.edu/ml/datasets/iris>`_

    Columns:
        - ``sepal_length`` (float): length of the sepal in cm
        - ``sepal_width`` (float): width of the sepal in cm
        - ``petal_length`` (float): length of the petal in cm
        - ``petal_width`` (float): width of the petal in cm
        - ``species`` 🚩 (str): **target variable**, species name
          (setosa, versicolor, virginica)

    Args:
        verbose (bool, optional):
            When True, a link to the dataset documentation (such as this
            page) is printed in the log output after loading.
            Defaults to None, which defers to the global :ref:`settings`.

    Returns:
        pandas.DataFrame:
            The Iris dataset with every feature column and the target.

    Quick Start:
        .. code-block:: python

            from dataset_hub.classification import get_iris

            df = get_iris()
    """
    bundle: DataBundle[pd.DataFrame] = _get_data(
        dataset_name="iris",
        task_type=task_type,
        verbose=verbose,
    )
    # Only the "data" entry of the loaded bundle is handed back to the caller.
    frame = bundle["data"]
    return frame