Source code for pipelinex.extras.datasets.pandas_profiling.pandas_profiling

from pathlib import Path
from typing import Any, Dict

import pandas as pd

from ..core import AbstractVersionedDataSet, DataSetError, Version


class PandasProfilingDataSet(AbstractVersionedDataSet):
    """``PandasProfilingDataSet`` is an ``AbstractVersionedDataSet`` to generate
    pandas profiling report.

    This data set is save-only: ``_save`` renders a profiling report from a
    ``pandas.DataFrame`` and writes it to ``filepath``; ``_load`` always
    returns ``None``.

    See https://github.com/pandas-profiling/pandas-profiling for details.
    """

    # Baseline keyword arguments for ``df.profile_report``; the ``save_args``
    # given to ``__init__`` are layered on top of these.
    DEFAULT_SAVE_ARGS = dict()  # type: Dict[str, Any]

    def __init__(
        self,
        filepath: str,
        save_args: Dict[str, Any] = None,
        sample_args: Dict[str, Any] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``PandasProfilingDataSet`` pointing to a
        concrete filepath.

        Args:
            filepath: path to the report file to be written.
            save_args: Arguments passed on to ``df.profile_report`` such as
                title. ``None`` (the default) means "use
                ``DEFAULT_SAVE_ARGS`` only".
                See https://pandas-profiling.github.io/pandas-profiling/docs/
                for details.
                See https://github.com/pandas-profiling/pandas-profiling/blob/master/pandas_profiling/config_default.yaml
                for default values.
            sample_args: Arguments passed on to ``df.sample``. If ``None``
                (the default), the data is profiled without sampling.
                See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
                for details.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
        """
        super().__init__(
            filepath=Path(filepath), version=version, exists_function=self._exists
        )
        self._load_args = {}  # type: Dict[str, Any]

        # Always hold a real dict: ``_save`` unpacks it with ``**``, which
        # raises ``TypeError`` on ``None``. Building a fresh dict also keeps
        # the class-level defaults and the caller's dict from being aliased.
        self._save_args = dict(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)

        # ``None`` is meaningful here (it disables sampling), so keep it as is.
        self._sample_args = sample_args

    def _describe(self) -> Dict[str, Any]:
        """Return the attributes that describe this data set instance."""
        return dict(
            filepath=self._filepath,
            save_args=self._save_args,
            sampling_args=self._sample_args,
            version=self._version,
        )

    def _load(self) -> Any:
        """loading is not supported."""
        return None

    def _save(self, data: pd.DataFrame) -> None:
        """Generate a profiling report for ``data`` and write it to the save path.

        Args:
            data: the data frame to profile. It is sampled first when
                ``sample_args`` was provided at construction time.
        """
        save_path = Path(self._get_save_path())
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if self._sample_args is not None:
            data = data.sample(**self._sample_args)

        profile = data.profile_report(**self._save_args)
        profile.to_file(output_file=save_path)

        # Resolved only after the file has been written, then compared with
        # the path that was just saved to.
        load_path = Path(self._get_load_path())
        self._check_paths_consistency(load_path.absolute(), save_path.absolute())

    def _exists(self) -> bool:
        """Return ``True`` if the load path resolves to an existing file."""
        try:
            path = self._get_load_path()
        except DataSetError:
            # The load path could not be resolved; report "does not exist"
            # instead of propagating the error.
            return False
        return Path(path).is_file()