Source code for pipelinex.mlflow_on_kedro.hooks.mlflow.mlflow_time_logger

import json
import re
import tempfile
import time
from importlib.util import find_spec
from logging import getLogger
from pathlib import Path
from pprint import pformat
from typing import Any, Callable, Dict  # NOQA

from kedro.pipeline.node import Node  # NOQA

from .mlflow_utils import hook_impl, mlflow_log_artifacts, mlflow_log_metrics

log = getLogger(__name__)


def _get_task_name(node: Node) -> str:
    """Build a short task name from a node's function name and outputs.

    The result is the bare function name (at most 20 characters), then
    " -- ", then the node outputs joined with " - ". Every character other
    than ASCII letters, digits, "_", "-", ".", " " and "/" is replaced by a
    space, and the whole name is cut to 50 characters.

    Args:
        node: Node providing ``_func_name`` and ``outputs``.

    Returns:
        The sanitized task name, at most 50 characters long.
    """
    # Remove angle brackets (such as those in "<lambda>").
    stripped = node._func_name.replace("<", "").replace(">", "")
    # Keep only what precedes the first " ", then "(", then "=".
    for separator in (" ", "(", "="):
        stripped = stripped.partition(separator)[0]
    # Keep the last dotted component only.
    bare_name = stripped.rpartition(".")[-1]

    joined_outputs = " - ".join(node.outputs)
    label = " -- ".join((bare_name[:20], joined_outputs))
    sanitized = re.sub(r"[^A-Za-z0-9\_\-\.\ \/]", " ", label)
    return sanitized[:50]


def dump_dict(filepath: str, d: dict):
    """Serialize ``d`` as JSON into ``filepath``, overwriting any existing file.

    Args:
        filepath: Path of the JSON file to write.
        d: JSON-serializable dictionary to store.
    """
    # Pin the encoding so the file does not depend on the locale's default.
    with open(filepath, "w", encoding="utf-8") as outfile:
        json.dump(d, outfile)
def load_dict(filepath: str):
    """Read the JSON file at ``filepath`` and return the decoded object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.
    """
    # Decode as UTF-8 explicitly instead of relying on the locale's default.
    with open(filepath, "r", encoding="utf-8") as infile:
        return json.load(infile)
class MLflowTimeLoggerHook:
    """
    Logs duration time to run each node (task) to MLflow.
    Optionally, the execution logs can be visualized as a Gantt chart by
    `plotly.figure_factory.create_gantt`
    (https://plotly.github.io/plotly.py-docs/generated/plotly.figure_factory.create_gantt.html)
    if `plotly` is installed.
    """

    def __init__(
        self,
        gantt_filepath: str = None,
        gantt_params: Dict[str, Any] = None,
        metric_name_prefix: str = "_time_to_run ",
        task_name_func: Callable[[Node], str] = _get_task_name,
        time_log_filepath: str = None,
        enable_plotly: bool = True,
        enable_mlflow: bool = True,
    ):
        """
        Args:
            gantt_filepath: File path to save the generated gantt chart.
                Defaults to ``_gantt.html`` in the system temp directory.
            gantt_params: Args fed to:
                https://plotly.github.io/plotly.py-docs/generated/plotly.figure_factory.create_gantt.html
                ``None`` (the default) means no extra arguments.
            metric_name_prefix: Prefix for the metric names.
                The metric names are `metric_name_prefix` concatenated with the
                string returned by `task_name_func`.
            task_name_func: Callable to return the task name using
                ``kedro.pipeline.node.Node`` object.
            time_log_filepath: File path to save the time log in JSON format.
                Defaults to ``_time_log.json`` in the system temp directory.
            enable_plotly: Enable visualization of logged time as a gantt chart.
            enable_mlflow: Enable logging to MLflow.
        """
        # Each flag is truthy only if the package can be found AND the caller
        # enabled the feature.
        self.enable_mlflow = find_spec("mlflow") and enable_mlflow
        self.enable_plotly = find_spec("plotly") and enable_plotly
        self.gantt_filepath = gantt_filepath
        # Copy so that instances never share (or mutate) one default dict.
        self.gantt_params = dict(gantt_params or {})
        self.metric_name_prefix = metric_name_prefix
        self.task_name_func = task_name_func
        self.time_log_filepath = time_log_filepath or str(
            Path(tempfile.gettempdir()) / "_time_log.json"
        )

        # Start from an empty time log file on every instantiation.
        Path(self.time_log_filepath).parent.mkdir(parents=True, exist_ok=True)
        dump_dict(
            self.time_log_filepath, {"time_begin": {}, "time_end": {}, "time": {}}
        )

        self._time_begin_dict = {}
        self._time_end_dict = {}
        self._time_dict = {}

    def update_time_dict(self, key: str, d: dict):
        """Merge ``d`` into section ``key`` of the JSON time log file.

        The file is read, updated and rewritten in full. The section is
        created if the file does not contain it yet.

        Args:
            key: Section name; this class uses "time_begin", "time_end"
                and "time".
            d: Mapping of task name to value to merge into the section.
        """
        dumping_dict = load_dict(self.time_log_filepath)
        dumping_dict.setdefault(key, {}).update(d)
        dump_dict(self.time_log_filepath, dumping_dict)

    def load_time_dict(self, key: str):
        """Return section ``key`` of the JSON time log file, or ``None`` if absent."""
        return load_dict(self.time_log_filepath).get(key)

    @hook_impl
    def before_node_run(self, node, catalog, inputs):
        """Record the begin time of ``node`` in memory and in the time log file."""
        task_name = self.task_name_func(node)
        time_begin_dict = {task_name: time.time()}
        self._time_begin_dict.update(time_begin_dict)
        self.update_time_dict("time_begin", time_begin_dict)

    @hook_impl
    def after_node_run(self, node, catalog, inputs, outputs):
        """Record the end time of ``node`` and log its duration as a metric.

        The duration is the end time minus the begin time stored by
        ``before_node_run``. If no begin time can be found, a warning is
        logged and no duration is recorded.
        """
        task_name = self.task_name_func(node)
        time_end = time.time()
        time_end_dict = {task_name: time_end}
        self._time_end_dict.update(time_end_dict)
        self.update_time_dict("time_end", time_end_dict)

        # Prefer the in-memory begin time; fall back to the persisted log so a
        # missing in-memory entry does not turn into a TypeError.
        time_begin = self._time_begin_dict.get(task_name)
        if time_begin is None:
            time_begin = (self.load_time_dict("time_begin") or {}).get(task_name)
        if time_begin is None:
            log.warning(
                "Begin time is not found for task: {}. "
                "Skipping logging the time duration.".format(task_name)
            )
            return

        time_dict = {task_name: time_end - time_begin}
        log.info("Time duration: {}".format(time_dict))
        self._time_dict.update(time_dict)
        self.update_time_dict("time", time_dict)

        metric_time_dict = {
            (self.metric_name_prefix + k): v for (k, v) in time_dict.items()
        }
        mlflow_log_metrics(metric_time_dict, enable_mlflow=self.enable_mlflow)

    @hook_impl
    def after_pipeline_run(self, run_params, pipeline, catalog):
        """Log all durations and, if plotly is enabled, save a Gantt chart.

        Empty in-memory dicts are replaced by the contents of the time log
        file. The chart is written as HTML to ``gantt_filepath`` (or the
        default temp file) and passed to ``mlflow_log_artifacts``.
        """
        # `or {}` keeps the attributes dicts even if a section is missing.
        self._time_begin_dict = (
            self._time_begin_dict or self.load_time_dict("time_begin") or {}
        )
        self._time_end_dict = (
            self._time_end_dict or self.load_time_dict("time_end") or {}
        )
        self._time_dict = self._time_dict or self.load_time_dict("time") or {}

        log.info("Time duration: \n{}".format(pformat(self._time_dict)))

        if not self.enable_plotly:
            return

        if not (self._time_begin_dict and self._time_end_dict):
            log.warning(
                "Time log dicts are not found. Skipping generating the Gantt Chart."
            )
            return

        # Reverse the recorded order of the tasks for the chart rows, and only
        # chart tasks that have both a begin and an end time.
        tasks_reversed = [
            t
            for t in list(self._time_begin_dict.keys())[::-1]
            if self._time_end_dict.get(t) is not None
        ]
        if not tasks_reversed:
            log.warning(
                "No task has both begin and end time. "
                "Skipping generating the Gantt Chart."
            )
            return

        # Imported lazily: plotly is optional and only needed on this path.
        from plotly.figure_factory import create_gantt

        # Recorded times are in seconds (time.time()); scaled by 1000 here.
        chart_rows = [
            dict(
                Task=t,
                Start=self._time_begin_dict[t] * 1000,
                Finish=self._time_end_dict[t] * 1000,
            )
            for t in tasks_reversed
        ]

        fig = create_gantt(chart_rows, **self.gantt_params)
        fp = self.gantt_filepath or str(Path(tempfile.gettempdir()) / "_gantt.html")
        Path(fp).parent.mkdir(parents=True, exist_ok=True)
        fig.write_html(fp)

        mlflow_log_artifacts(fp, enable_mlflow=self.enable_mlflow)