Source code for pipelinex.extras.datasets.pandas.fixed_width_csv_dataset

from importlib.util import find_spec

import pandas as pd

if find_spec("kedro"):
    from kedro.extras.datasets.pandas.csv_dataset import CSVDataSet
else:
    from .csv_local import CSVLocalDataSet as CSVDataSet


class FixedWidthCSVDataSet(CSVDataSet):
    """``FixedWidthCSVDataSet`` is a ``CSVDataSet`` that can pad every column
    to a fixed width before saving, so the resulting CSV file lines up in
    plain-text viewers.

    Loading, and all filesystem handling, is inherited unchanged from
    ``CSVDataSet``; only ``_save`` is overridden here.
    """

    def __init__(
        self,
        *args,
        enable_fixed_width: bool = True,
        num_decimal_places: int = 9,
        **kwargs
    ) -> None:
        """Creates a ``FixedWidthCSVDataSet`` pointing to a concrete CSV file
        on a specific filesystem.

        All positional and keyword arguments other than ``enable_fixed_width``
        and ``num_decimal_places`` are forwarded untouched to ``CSVDataSet``.

        Args:
            filepath: Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.
                If prefix is not provided, `file` protocol (local filesystem) will be used.
                The prefix should be any protocol supported by ``fsspec``.
                Note: `http(s)` doesn't support versioning.
            load_args: Pandas options for loading CSV files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
                All defaults are preserved.
            save_args: Pandas options for saving CSV files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
                All defaults are preserved, but "index", which is set to False.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
            credentials: Credentials required to get access to the underlying filesystem.
                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
            fs_args: Extra arguments to pass into underlying filesystem class constructor
                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
                to pass to the filesystem's `open` method through nested keys
                `open_args_load` and `open_args_save`.
                Here you can find all available arguments for `open`:
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
                All defaults are preserved, except `mode`, which is set to `r` when loading
                and to `w` when saving.
            enable_fixed_width: Save to a CSV file with each column width fixed among
                all the rows and the header to improve readability for humans.
            num_decimal_places: Number of decimal places for float values to save.
        """
        self.enable_fixed_width = enable_fixed_width
        self.num_decimal_places = num_decimal_places
        super().__init__(*args, **kwargs)

    def _save(self, data: pd.DataFrame) -> None:
        """Save ``data`` through ``CSVDataSet._save``, padded to fixed-width
        columns first when ``enable_fixed_width`` is set.

        Args:
            data: DataFrame to write. It is never modified by this method.
        """
        if self.enable_fixed_width:
            # fix_width() rewrites cells and column labels in place, turning
            # every column into padded strings.  Format a copy so the frame
            # the caller still holds keeps its original dtypes and labels.
            data = data.copy()
            fix_width(data, num_decimal_places=self.num_decimal_places)
        return super()._save(data)


def fix_width(df, num_decimal_places=9):
    """Pad every cell and column label of ``df`` to a fixed per-column width.

    The frame is modified **in place**: afterwards every column holds strings
    of identical length and every column label is a string of that same
    length (each surrounded by one extra space on both sides).

    * Float columns (float16/32/64) are rendered with ``num_decimal_places``
      decimals and right-aligned, so decimal points line up.
    * Integer columns (int8/16/32/64) are right-aligned.
    * Everything else is converted with ``str`` and left-aligned.

    Args:
        df: ``pandas.DataFrame`` to format in place. Column labels are
            expected to be unique; non-string labels are converted with
            ``str``.
        num_decimal_places: Number of digits after the decimal point for
            float columns.

    Returns:
        None. The result is the mutated ``df``.
    """

    def _max_len(values):
        # Longest string length in ``values``.  ``.max()`` of an empty Series
        # is NaN, which would corrupt the format specs built below, so a
        # column without rows counts as width 0.
        lengths = values.astype(str).str.len()
        return int(lengths.max()) if len(lengths) else 0

    for col in df.select_dtypes(["float64", "float32", "float16"]):
        # Width of the widest integer part (sign included) ...
        int_width = _max_len(df[col].apply("{:.0f}".format))
        # ... plus the decimal point and the fractional digits.  The width in
        # a format spec is the TOTAL field width, so it must cover all three
        # for the values to actually be right-aligned.
        if num_decimal_places > 0:
            total_width = int_width + 1 + num_decimal_places
        else:
            total_width = int_width
        spec = "{:" + str(total_width) + "." + str(num_decimal_places) + "f}"
        df[col] = df[col].apply(spec.format)

    for col in df.select_dtypes(["int64", "int32", "int16", "int8"]):
        int_width = _max_len(df[col].apply("{}".format))
        df[col] = df[col].apply(("{:" + str(int_width) + "d}").format)

    # Final pass over every column (including the numeric ones formatted
    # above): left-pad-free, space-filled cells as wide as the widest cell or
    # the header, whichever is longer.  This also covers object columns, so no
    # separate pre-padding pass is needed for them.
    for col in list(df.columns):
        label = str(col)
        df[col] = df[col].astype(str)
        width = max(_max_len(df[col]), len(label))
        df[col] = df[col].apply(lambda text: " " + text.ljust(width) + " ")
        df.rename(columns={col: " " + label.ljust(width) + " "}, inplace=True)