# Source code for eurybia.core.smartplotter

"""Smart plotter module"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from eurybia import SmartDrift

import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
from plotly import graph_objs as go
from plotly.subplots import make_subplots

from eurybia.report.common import VarType, compute_col_types
from eurybia.style.style_utils import colors_loading, define_style, select_palette


class SmartPlotter:
    """The SmartPlotter class includes all the methods used to display graphics.

    Each SmartPlotter method is easy to use from a SmartDrift object,
    just use the following syntax.

    Attributes:
    ----------
    smartdrift: object
        SmartDrift object
    style_dict: dict
        Dict containing the dicts of colors used in the different plots

    Example:
    -------
    >>> SD = SmartDrift()
    >>> SD.compile()
    >>> SD.plot.my_plot_method(param=value)

    """

    def __init__(self, smartdrift: SmartDrift):
        """Build a SmartPlotter bound to a SmartDrift object.

        The initial style is derived from the first palette returned by
        ``colors_loading()``.

        Args:
            smartdrift (SmartDrift): a SmartDrift instance

        """
        # Iterating a dict yields its keys, so the first item is the first palette name.
        available_palettes = list(colors_loading())
        default_palette = available_palettes[0]
        palette_colors = select_palette(colors_loading(), default_palette)
        self._style_dict = define_style(palette_colors)
        self._smartdrift = smartdrift

    @property
    def style_dict(self) -> dict:
        """Getter"""
        return self._style_dict

    @style_dict.setter
    def style_dict(self, val: dict) -> None:
        """Setter"""
        if not isinstance(val, dict):
            raise ValueError("style_dict must be a dictionary.")
        self._style_dict = val

    @property
    def smartdrift(self) -> SmartDrift:
        """Getter"""
        return self._smartdrift

    @smartdrift.setter
    def smartdrift(self, val: SmartDrift) -> None:
        """Setter"""
        if not isinstance(val, SmartDrift):
            raise ValueError("style_dict must be of type SmartDrift.")
        self._smartdrift = val

[docs]    def generate_fig_univariate(
        self,
        col: str,
        hue: str | None = None,
        df_all: pd.DataFrame | None = None,
        dict_color_palette: dict | None = None,
    ) -> plt.Figure:
        """Returns a plotly figure containing the distribution of any kind of feature
        (continuous, categorical).

        If the feature is categorical and contains too many categories, the smallest
        categories are grouped into a new 'Other' category so that the graph remains
        readable.

        The input dataframe should contain the column of interest and a column that is used
        to distinguish two types of values (ex. 'train' and 'test')

        Parameters
        ----------
        df_all : pd.DataFrame
            The input dataframe that contains the column of interest
        col : str
            The column of interest
        hue : str
            The column used to distinguish the values (ex. 'train' and 'test')
        type: str
            The type of the series ('continous' or 'categorical')

        Returns
        -------
        plotly.graph_objs._figure.Figure

        """
        if hue is None:
            hue = self.smartdrift.datadrift_target
        if df_all is None:
            df_all = self.smartdrift.df_concat
            df_all[hue] = df_all[hue].astype("object")
            df_all.loc[df_all[hue] == 0, hue] = self.smartdrift.current_dataset_name
            df_all.loc[df_all[hue] == 1, hue] = self.smartdrift.baseline_dataset_name
        if dict_color_palette is None:
            dict_color_palette = self.style_dict
        col_types = compute_col_types(df_all=df_all)
        if col_types[col] == VarType.TYPE_NUM:
            fig = self.generate_fig_univariate_continuous(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
        elif col_types[col] == VarType.TYPE_CAT:
            fig = self.generate_fig_univariate_categorical(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
        else:
            raise NotImplementedError("Series dtype not supported")
        return fig

    def generate_fig_univariate_continuous(
        self,
        df_all: pd.DataFrame,
        col: str,
        hue: str,
        dict_color_palette: dict,  # FIXME: unused
        template: str | None = None,
        title: str | None = None,
        xaxis_title: dict | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Returns a plotly figure containing the distribution of a continuous feature.

        Missing values of ``col`` are treated as 0 for the density estimation. The
        input dataframe and the shared ``style_dict`` are left untouched.

        Parameters
        ----------
        df_all : pd.DataFrame
            The input dataframe that contains the column of interest
        col : str
            The column of interest
        hue : str
            The column used to distinguish the values (ex. 'train' and 'test')
        dict_color_palette : dict
            Currently unused; colors are read from ``style_dict``.
        template: str, optional
            Template (background style) for the plot
        title: str, optional
            Plot title
        xaxis_title: dict, optional
            X axis title
        yaxis_title: dict, optional
            y axis title
        xaxis: str, optional
            X axis options (spike line, margin, range ...)
        height: str, optional
            Height of the plot
        width: str, optional
            Width of the plot
        hovermode: str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.graph_objs._figure.Figure

        """
        # Fill missing values on a local series so the caller's dataframe is not modified.
        filled_values = df_all[col].fillna(0)
        hue_values = df_all[hue].unique()
        datasets = [filled_values[df_all[hue] == val].values.tolist() for val in hue_values]
        group_labels = [str(val) for val in hue_values]
        colors = list(self.style_dict["univariate_cont_bar"].values())
        # NOTE(review): only the labels are reordered here, not the datasets;
        # kept as in the original implementation - confirm this is intended.
        if group_labels[0] == "Current dataset":
            group_labels = ["Baseline dataset", "Current dataset"]

        fig = ff.create_distplot(
            datasets,
            group_labels=group_labels,
            colors=colors,
            show_hist=False,
            show_curve=True,
            show_rug=False,
        )
        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            # Copy before setting the text so the column name does not leak
            # into the shared style dictionary.
            xaxis_title = copy.deepcopy(self.style_dict["dict_xaxis_continuous"])
            xaxis_title["text"] = col
        if yaxis_title is None:
            yaxis_title = self.style_dict["dict_yaxis_continuous"]
        if xaxis is None:
            xaxis = self.style_dict["dict_xaxis"]
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]

        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            yaxis_title=yaxis_title,
            xaxis=xaxis,
            height=height,
            width=width,
            hovermode=hovermode,
        )
        fig.update_traces(hovertemplate="%{y:.2f}", showlegend=True)
        fig.update_xaxes(showgrid=False)
        fig.update_yaxes(showgrid=False)

        return fig

    def generate_fig_univariate_categorical(
        self,
        df_all: pd.DataFrame,
        col: str,
        hue: str,
        dict_color_palette: dict,  # FIXME: unused
        nb_cat_max: int = 15,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: dict | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
        legend: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Returns a plotly figure containing the distribution of a categorical feature.

        If the feature is categorical and contains too many categories, the smallest
        categories are grouped into a new 'Other' category so that the graph remains
        readable.

        One horizontal bar trace is drawn for each of the first two values of ``hue``;
        when ``hue`` holds a single value only one trace is drawn.

        Parameters
        ----------
        df_all : pd.DataFrame
            The input dataframe that contains the column of interest
        col : str
            The column of interest
        hue : str
            The column used to distinguish the values (ex. 'train' and 'test')
        dict_color_palette : dict
            Currently unused; colors are read from ``style_dict``.
        nb_cat_max : int
            The number max of categories to be displayed. If the number of categories
            is greater than nb_cat_max then groups smallest categories into a new
            'Other' category
        template: str, optional
            Template (background style) for the plot
        title: str, optional
            Plot title
        xaxis_title: dict, optional
            X axis title
        yaxis_title: dict, optional
            y axis title
        xaxis: str, optional
            Currently unused.
        height: str, optional
            Height of the plot
        width: str, optional
            Width of the plot
        hovermode: str, optional
            Type of labels displaying on mouse hovering
        legend: str, optional
            Axis legends

        Returns
        -------
        plotly.graph_objs._figure.Figure

        """
        # Count rows per (category, hue) and convert to a percentage within each hue value.
        df_cat = df_all.groupby([col, hue]).agg({col: "count"}).rename(columns={col: "count"}).reset_index()
        df_cat["Percent"] = df_cat["count"] * 100 / df_cat.groupby(hue)["count"].transform("sum")

        if pd.api.types.is_numeric_dtype(df_cat[col].dtype):
            df_cat = df_cat.sort_values(col, ascending=True)
            df_cat[col] = df_cat[col].astype(str)

        if df_cat[col].nunique() > nb_cat_max:
            df_cat = self._merge_small_categories(df_cat=df_cat, col=col, hue=hue, nb_cat_max=nb_cat_max)

        # Order the categories by the absolute gap between the percentages of the
        # hue values, smallest gap first.
        df_to_sort = df_cat.copy().reset_index(drop=True)
        df_to_sort["Sorted_indicator"] = df_to_sort.sort_values([col]).groupby([col])["Percent"].diff()
        df_to_sort["Sorted_indicator"] = np.abs(df_to_sort["Sorted_indicator"])
        df_sorted = df_to_sort.dropna()[[col, "Sorted_indicator"]]

        df_cat = (
            df_cat.merge(df_sorted, how="left", on=col)
            .sort_values("Sorted_indicator", ascending=True)
            .drop("Sorted_indicator", axis=1)
        )

        df_cat["Percent_displayed"] = df_cat["Percent"].apply(lambda row: str(round(row, 2)) + " %")

        modalities = df_cat[hue].unique().tolist()
        bar_colors = list(self.style_dict["univariate_cat_bar"].values())

        def _bar_figure(modality, color):
            """Build the horizontal bar figure of a single hue value."""
            bar = px.bar(
                df_cat[df_cat[hue] == modality],
                x="Percent",
                y=col,
                orientation="h",
                barmode="group",
                color=hue,
                text="Percent_displayed",
            )
            bar.update_traces(marker_color=color, showlegend=True)
            return bar

        fig = _bar_figure(modalities[0], bar_colors[1])
        # A second trace only exists when the hue column holds a second value.
        if len(modalities) > 1:
            fig.add_trace(_bar_figure(modalities[1], bar_colors[0]).data[0])

        fig.update_xaxes(showgrid=False, showticklabels=True)
        fig.update_yaxes(showgrid=False, showticklabels=True, automargin=True)
        fig.update_traces(showlegend=True, textposition="outside", cliponaxis=False)

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            xaxis_title = self.style_dict["dict_xaxis_title"]
        if yaxis_title is None:
            # Copy before setting the text so the column name does not leak
            # into the shared style dictionary.
            yaxis_title = copy.deepcopy(self.style_dict["dict_yaxis_title"])
            yaxis_title["text"] = col
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]
        if legend is None:
            legend = self.style_dict["dict_legend"]

        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            height=height,
            width=width,
            yaxis_title=yaxis_title,
            hovermode=hovermode,
            legend=legend,
            # Leave room on the right for the percentage labels drawn outside the bars.
            xaxis_range=[0, df_cat["Percent"].max() + 10],
        )

        return fig

    def _merge_small_categories(self, df_cat: pd.DataFrame, col: str, hue: str, nb_cat_max: int) -> pd.DataFrame:
        """Merges categories of column 'col' of df_cat into 'Other' category so that
        the number of categories is less than nb_cat_max.
        """
        df_cat_sum_hue = df_cat.groupby([col]).agg({"count": "sum"}).reset_index()
        list_cat_to_merge = df_cat_sum_hue.sort_values("count", ascending=False)[col].to_list()[nb_cat_max - 1 :]
        df_cat_other = (
            df_cat.loc[df_cat[col].isin(list_cat_to_merge)].groupby(hue, as_index=False)[["count", "Percent"]].sum()
        )
        df_cat_other[col] = "Other"
        return pd.concat([df_cat.loc[~df_cat[col].isin(list_cat_to_merge)], df_cat_other])

[docs]    def scatter_feature_importance(
        self, feature_importance: pd.DataFrame | None = None, datadrift_stat_test: pd.DataFrame | None = None
    ) -> plotly.graph_objs._figure.Figure:
        """Displays scatter of feature importance between drift
        model and production one extracted from a datasets created
        during the compile step.

        Parameters
        ----------
        feature_importance : pd.DataFrame, optional
            DataFrame containing feature importance for each features from production and drift model.
        datadrift_stat_test: pd.DataFrame, optional
            DataFrame containing the result of datadrift univariate tests

        Returns
        -------
        plotly.express.scatter

        """
        dict_t = copy.deepcopy(self.style_dict["dict_title"])
        dict_xaxis = copy.deepcopy(self.style_dict["dict_xaxis_title"])
        dict_yaxis = copy.deepcopy(self.style_dict["dict_yaxis_title"])
        title = "<b>Datadrift Vs Feature Importance</b>"
        dict_t["text"] = title
        dict_xaxis["text"] = "Datadrift Importance"
        dict_yaxis["text"] = "Feature Importance - Deployed Model"

        if feature_importance is None:
            if self.smartdrift.feature_importance is None:
                raise RuntimeError("SmartDrift().feature_importance should not be None at this point.")
            feature_importance = self.smartdrift.feature_importance.set_index("feature")
        if datadrift_stat_test is None:
            if self.smartdrift.datadrift_stat_test is None:
                raise RuntimeError("SmartDrift().datadrift_stat_test should not be None at this point.")
            datadrift_stat_test = self.smartdrift.datadrift_stat_test

        data = datadrift_stat_test.join(feature_importance)
        data["features"] = data.index
        # symbols
        stat_test_list = list(data["testname"].unique())
        symbol_list = [0, 13]
        symbol_dict = dict(zip(stat_test_list, symbol_list, strict=True))

        hv_text = [
            f"<b>Feature: {feat}</b><br />Deployed Model Importance: {depimp * 100:.1f}%<br />"
            + f"Datadrift test: {t} - pvalue: {pv:.5f}<br />"
            + f"Datadrift model Importance: {ddrimp * 100:.1f}"
            for feat, depimp, t, pv, ddrimp in zip(
                *map(data.get, ["features", "deployed_model", "testname", "pvalue", "datadrift_classifier"]),
                strict=True,
            )
        ]

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=data["datadrift_classifier"],
                y=data["deployed_model"],
                marker_symbol=datadrift_stat_test["testname"].apply(lambda x: symbol_dict[x]),
                mode="markers",
                showlegend=False,
                hovertext=hv_text,
                hovertemplate="%{hovertext}<extra></extra>",
            )
        )

        fig.update_traces(marker={"size": 15, "opacity": 0.8, "line": {"width": 0.8, "color": "white"}})

        fig.data[0].marker.color = data["pvalue"]
        fig.data[0].marker.coloraxis = "coloraxis"
        fig.layout.coloraxis.colorscale = self.style_dict["featimportance_colorscale"]
        fig.layout.coloraxis.colorbar = {"title": {"text": "Univariate<br />DataDrift Test<br />Pvalue"}}

        height = self.style_dict["height"]
        width = self.style_dict["width"]
        hovermode = self.style_dict["hovermode"]
        template = self.style_dict["template"]

        fig.update_layout(
            template=template,
            title=dict_t,
            xaxis_title=dict_xaxis,
            yaxis_title=dict_yaxis,
            height=height,
            width=width,
            hovermode=hovermode,
        )

        return fig

[docs]    def generate_historical_datadrift_metric(
        self,
        datadrift_historical: pd.DataFrame = None,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: str | None = None,
        yaxis_title: str | None = None,  # FIXME: unused
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Displays line plot of the evolution of the datadrift metrics :
        AUC of Datadrift classifier and if deployed_model fill, Jensen Shannon divergence of distribution of prediction

        Parameters
        ----------
        datadrift_historical : pd.DataFrame
            DataFrame with date, datadrif classifer auc and jensen shannon prediction divergence if deployed_model fill
        template: str, optional
            Template (background style) for the plot
        title: str, optional
            Plot title
        xaxis_title: str, optional
            X axis title
        yaxis_title: str, optional
            y axis title
        xaxis: str, optional
            X axis options (spike line, margin, range ...)
        height: str, optional
            Height of the plot
        width: str, optional
            Width of the plot
        hovermode: str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.express.line

        """
        if datadrift_historical is None:
            datadrift_historical = self.smartdrift.historical_auc
        if datadrift_historical is not None:
            if self.smartdrift.deployed_model is not None:
                datadrift_historical = datadrift_historical[["date", "auc", "JS_predict"]]
                datadrift_historical = (
                    datadrift_historical.groupby(["date"])[["auc", "JS_predict"]].mean().reset_index()
                )
                datadrift_historical.sort_values(by="date", inplace=True)
            else:
                datadrift_historical = datadrift_historical[["date", "auc"]]
                datadrift_historical = datadrift_historical.groupby("date")["auc"].mean().reset_index()
                datadrift_historical.sort_values(by="date", inplace=True)

            datadrift_historical["auc_displayed"] = datadrift_historical["auc"].round(2)

            if self.smartdrift.deployed_model is not None:
                fig = make_subplots(specs=[[{"secondary_y": True}]])
                fig.add_trace(
                    go.Scatter(
                        x=datadrift_historical["date"], y=datadrift_historical["auc"], name="Datadrift classifier AUC"
                    ),
                    secondary_y=False,
                )

                fig.add_trace(
                    go.Scatter(
                        x=datadrift_historical["date"],
                        y=datadrift_historical["JS_predict"],
                        name="Jensen_Shannon Prediction Divergence",
                    ),
                    secondary_y=True,
                )

                fig.update_layout(title_text="Evolution of data drift")
                fig.update_yaxes(title_text="<b>Datadrift classifier AUC</b>  ", secondary_y=False)
                fig.update_yaxes(title_text="<b>Jensen_Shannon Prediction Divergence</b> ", secondary_y=True)
                fig.update_yaxes(range=[0.5, 1], secondary_y=False)
                fig.update_yaxes(range=[0, 0.3], secondary_y=True)
            else:
                fig = px.line(
                    datadrift_historical,
                    x="date",
                    y="auc",
                    title="AUC's Evolution of Datadrift classifier",
                    text="auc_displayed",
                )
                fig.update_yaxes(title_text="<b>Datadrift classifier AUC</b>")
                fig.update_yaxes(range=[0.5, 1])

            fig.update_traces(textposition="bottom right")

            if template is None:
                template = self.style_dict["template"]
            if title is None:
                title = self.style_dict["dict_title"]
            if xaxis_title is None:
                xaxis_title = self.style_dict["dict_xaxis_title"]
            if height is None:
                height = self.style_dict["height"]
            if width is None:
                width = self.style_dict["width"]
            if hovermode is None:
                hovermode = self.style_dict["hovermode"]

            fig.update_xaxes(showgrid=False)
            fig.update_layout(
                template=template,
                title=title,
                xaxis_title=xaxis_title,
                height=height,
                width=width,
                hovermode=hovermode,
            )
            return fig

[docs]    def generate_modeldrift_data(
        self,
        data_modeldrift: pd.DataFrame = None,
        metric: str = "performance",
        reference_columns: list | None = None,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: str | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Displays line plot of the evolution of the Lift computed for deployed model with several criterias.

        Parameters
        ----------
        data_modeldrift : pd.DataFrame
            DataFrame containing the aggregated informations to display modeldrift.
        metric : str
            Column name of the metric computed
        reference_columns : list
            list of reference columns used to display the metric according to different criteria
        title: str, optional
            Plot title
        xaxis_title: str, optional
            X axis title
        yaxis_title: dict, optional
            y axis title
        xaxis: str, optional
            X axis options (spike line, margin, range ...)
        height: str, optional
            Height of the plot
        width: str, optional
            Width of the plot
        hovermode: str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.express.line

        """
        if data_modeldrift is None:
            data_modeldrift = self.smartdrift.data_modeldrift
            if data_modeldrift is None:
                raise ValueError(
                    """You should run the add_data_modeldrift method before displaying model drift performances.
                For more information see the documentation"""
                )
        data_modeldrift[metric] = data_modeldrift[metric].apply(
            lambda row: round(row, len([char for char in str(row).split(".")[1] if char == "0"]) + 3)
        )
        if reference_columns is None:
            reference_columns = list()

        fig = px.line(
            data_modeldrift,
            x="Date",
            y=metric,
            hover_data=reference_columns,
            title="Performance's Evolution on deployed model",
            text=metric,
        )

        fig.update_traces(textposition="top right")

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            xaxis_title = self.style_dict["dict_xaxis_title"]
        if yaxis_title is None:
            yaxis_title = self.style_dict["dict_yaxis_title"]
            yaxis_title["text"] = metric
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]

        fig.update_xaxes(showgrid=False)
        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            height=height,
            width=width,
            yaxis_title=yaxis_title,
            hovermode=hovermode,
        )

        fig.data[0].line.color = self.style_dict["datadrift_historical"]
        fig.data[-1].marker.color = self.style_dict["datadrift_historical"]

        return fig

    def define_style_attributes(self, colors_dict):
        """Let the Eurybia user change the colors of the plots.

        The style dictionary is rebuilt from ``colors_dict`` and the
        ``pred_colorscale`` attribute, when present, is removed.

        Parameters
        ----------
        colors_dict: dict
            Dict of the colors used in the different plots

        """
        new_style = define_style(colors_dict)
        self.style_dict = new_style

        if hasattr(self, "pred_colorscale"):
            del self.pred_colorscale

    def generate_indicator(
        self,
        fig_value: float,
        min_gauge: float = 0.5,
        max_gauge: float = 1,
        height: float = 300,
        width: float = 500,
        title: str = "Metric",
        image: bool = False,
    ) -> plotly.graph_objs._figure.Figure | bytes:
        """Generate an indicator gauge with a colorbar.

        The gauge background is split into equal-width steps, one per color of a
        green -> yellow -> orange -> red palette, covering ``[min_gauge, max_gauge]``.

        Args:
            fig_value (float): Value to display on the gauge.
            min_gauge (float, optional): Minimum value of the gauge range. Defaults to 0.5.
            max_gauge (float, optional): Maximum value of the gauge range. Defaults to 1.
            height (float, optional): Height of the plot. Defaults to 300.
            width (float, optional): Width of the plot. Defaults to 500.
            title (str, optional): Title of the plot. Defaults to "Metric".
            image (bool, optional): If True, returns an image instead of a Plotly Figure. Defaults to False.

        Returns:
            plotly.graph_objs._figure.Figure | bytes: The generated figure or image.
        """
        palette = sns.blend_palette(["green", "yellow", "orange", "red"], 100).as_hex()
        # linspace yields exactly len(palette) + 1 edges ending on max_gauge, whereas a
        # float-step arange may produce one extra threshold because of rounding.
        edges = np.linspace(min_gauge, max_gauge, len(palette) + 1)
        steps = [
            {"range": [float(lower), float(upper)], "color": shade}
            for lower, upper, shade in zip(edges[:-1], edges[1:], palette, strict=True)
        ]

        fig = go.Figure(
            go.Indicator(
                mode="gauge+number",
                value=round(fig_value, 2),
                domain={"x": [0, 1], "y": [0, 1]},
                title={"text": title, "align": "center", "font": {"size": 20}},
                gauge={
                    "axis": {"range": [min_gauge, max_gauge], "ticktext": ["No Drift", "High Drift"], "tickwidth": 1},
                    "bar": {"color": "black"},
                    "borderwidth": 0,
                    "steps": steps,
                },
            )
        )
        fig.update_layout(
            height=height,
            width=width,
        )

        return fig.to_image() if image else fig