Source code for pipelinex.extras.datasets.pandas_profiling.pandas_profiling

from pathlib import Path
from typing import Any, Dict

import pandas as pd

from ..core import AbstractVersionedDataSet, DataSetError, Version


[docs]class PandasProfilingDataSet(AbstractVersionedDataSet):
    """``PandasProfilingDataSet`` is an ``AbstractVersionedDataSet`` to generate
    pandas profiling report.
    See https://github.com/pandas-profiling/pandas-profiling for details.
    """

    DEFAULT_SAVE_ARGS = dict()  # type: Dict[str, Any]

[docs]    def __init__(
        self,
        filepath: str,
        save_args: Dict[str, Any] = None,
        sample_args: Dict[str, Any] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``PandasProfilingDataSet`` pointing to a concrete
        filepath.

        Args:
            filepath: path to a local yaml file.
            save_args: Arguments passed on to ``df.profile_report`` such as title.
                See https://pandas-profiling.github.io/pandas-profiling/docs/ for details.
                See https://github.com/pandas-profiling/pandas-profiling/blob/master/pandas_profiling/config_default.yaml for default values.
            sample_args: Arguments passed on to ``df.sample``.
                See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html for details.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.

        """
        super().__init__(
            filepath=Path(filepath), version=version, exists_function=self._exists
        )
        self._load_args = {}
        self._save_args = save_args
        self._sample_args = sample_args

    def _describe(self) -> Dict[str, Any]:
        return dict(
            filepath=self._filepath,
            save_args=self._save_args,
            sampling_args=self._sample_args,
            version=self._version,
        )

    def _load(self) -> Any:
        """loading is not supported."""
        return None

    def _save(self, data: pd.DataFrame) -> None:
        save_path = Path(self._get_save_path())
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if self._sample_args is not None:
            data = data.sample(**self._sample_args)
        profile = data.profile_report(**self._save_args)
        profile.to_file(output_file=save_path)

        load_path = Path(self._get_load_path())
        self._check_paths_consistency(load_path.absolute(), save_path.absolute())

    def _exists(self) -> bool:
        try:
            path = self._get_load_path()
        except DataSetError:
            return False
        return Path(path).is_file()