Source code for eurybia.core.smartplotter

"""Smart plotter module"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from eurybia import SmartDrift

import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
from plotly import graph_objs as go
from plotly.subplots import make_subplots

from eurybia.report.common import VarType, compute_col_types
from eurybia.style.style_utils import colors_loading, define_style, select_palette


class SmartPlotter:
    """Collection of the plotting methods exposed by a SmartDrift object.

    Attributes
    ----------
    smartdrift : SmartDrift
        SmartDrift object the plots read their data from.
    style_dict : dict
        Dict of the style settings (colors, titles, sizes...) used by the different plots.

    Example
    -------
    >>> SD = SmartDrift()
    >>> SD.compile()
    >>> SD.plot.my_plot_method(param=value)
    """

    def __init__(self, smartdrift: SmartDrift):
        """Initialize a SmartPlotter with the first available color palette.

        Parameters
        ----------
        smartdrift : SmartDrift
            A SmartDrift instance.
        """
        # Load the palettes once and reuse them for both the name lookup and the selection.
        palettes = colors_loading()
        palette_name = list(palettes.keys())[0]
        self._style_dict = define_style(select_palette(palettes, palette_name))
        self._smartdrift = smartdrift

    @property
    def style_dict(self) -> dict:
        """Style settings used by the plots."""
        return self._style_dict

    @style_dict.setter
    def style_dict(self, val: dict) -> None:
        """Replace the style settings; raises ValueError if ``val`` is not a dict."""
        if not isinstance(val, dict):
            raise ValueError("style_dict must be a dictionary.")
        self._style_dict = val

    @property
    def smartdrift(self) -> SmartDrift:
        """SmartDrift object the plots are built from."""
        return self._smartdrift

    @smartdrift.setter
    def smartdrift(self, val: SmartDrift) -> None:
        """Replace the SmartDrift object; raises ValueError if ``val`` is not a SmartDrift."""
        # SmartDrift is only imported under TYPE_CHECKING at module level, so the
        # class has to be imported at call time for the isinstance check to work.
        from eurybia import SmartDrift

        if not isinstance(val, SmartDrift):
            raise ValueError("smartdrift must be of type SmartDrift.")
        self._smartdrift = val

    def generate_fig_univariate(
        self,
        col: str,
        hue: str | None = None,
        df_all: pd.DataFrame | None = None,
        dict_color_palette: dict | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Returns a plotly figure containing the distribution of any kind of feature
        (continuous, categorical).

        If the feature is categorical and contains too many categories, the smallest
        categories are grouped into a new 'Other' category so that the graph remains readable.

        The input dataframe should contain the column of interest and a column that is used to
        distinguish two types of values (ex. 'train' and 'test').

        Parameters
        ----------
        col : str
            The column of interest
        hue : str, optional
            The column used to distinguish the values (ex. 'train' and 'test').
            Defaults to ``self.smartdrift.datadrift_target``.
        df_all : pd.DataFrame, optional
            The input dataframe that contains the column of interest. When omitted, a copy of
            ``self.smartdrift.df_concat`` is used, with the values 0 and 1 of ``hue`` replaced
            by the current and baseline dataset names of the SmartDrift object.
        dict_color_palette : dict, optional
            Forwarded to the specialised methods. Defaults to ``self.style_dict``.

        Returns
        -------
        plotly.graph_objs._figure.Figure

        Raises
        ------
        NotImplementedError
            If the column type is neither numerical nor categorical.
        """
        if hue is None:
            hue = self.smartdrift.datadrift_target
        if df_all is None:
            # Copy first: relabelling in place would turn the target column stored on the
            # SmartDrift object into strings.
            df_all = self.smartdrift.df_concat.copy()
            df_all[hue] = df_all[hue].astype("object")
            df_all.loc[df_all[hue] == 0, hue] = self.smartdrift.current_dataset_name
            df_all.loc[df_all[hue] == 1, hue] = self.smartdrift.baseline_dataset_name
        if dict_color_palette is None:
            dict_color_palette = self.style_dict

        col_types = compute_col_types(df_all=df_all)
        if col_types[col] == VarType.TYPE_NUM:
            fig = self.generate_fig_univariate_continuous(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
        elif col_types[col] == VarType.TYPE_CAT:
            fig = self.generate_fig_univariate_categorical(df_all, col, hue=hue, dict_color_palette=dict_color_palette)
        else:
            raise NotImplementedError("Series dtype not supported")
        return fig

    def generate_fig_univariate_continuous(
        self,
        df_all: pd.DataFrame,
        col: str,
        hue: str,
        dict_color_palette: dict,  # FIXME: unused
        template: str | None = None,
        title: str | None = None,
        xaxis_title: dict | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Returns a plotly figure containing the distribution of a continuous feature.

        Missing values of ``col`` are counted as 0. The input dataframe is not modified.

        Parameters
        ----------
        df_all : pd.DataFrame
            The input dataframe that contains the column of interest
        col : str
            The column of interest
        hue : str
            The column used to distinguish the values (ex. 'train' and 'test')
        dict_color_palette : dict
            Currently unused; colors are read from ``self.style_dict``.
        template : str, optional
            Template (background style) for the plot
        title : str, optional
            Plot title
        xaxis_title : dict, optional
            X axis title. The default is taken from ``self.style_dict`` with ``col`` as text.
        yaxis_title : dict, optional
            y axis title
        xaxis : str, optional
            X axis options (spike line, margin, range ...)
        height : str, optional
            Height of the plot
        width : str, optional
            Width of the plot
        hovermode : str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.graph_objs._figure.Figure
        """
        # Fill missing values on a local series so the caller's dataframe stays untouched.
        filled = df_all[col].fillna(0)
        hue_values = df_all[hue].unique()
        datasets = [filled[df_all[hue] == val].values.tolist() for val in hue_values]
        group_labels = [str(val) for val in hue_values]
        colors = list(self.style_dict["univariate_cont_bar"].values())

        # NOTE(review): this swaps the labels without swapping the datasets - confirm it is
        # the intended way to order the legend.
        if group_labels[0] == "Current dataset":
            group_labels = ["Baseline dataset", "Current dataset"]

        fig = ff.create_distplot(
            datasets,
            group_labels=group_labels,
            colors=colors,
            show_hist=False,
            show_curve=True,
            show_rug=False,
        )

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            # Deep copy so that setting the text does not leak into the shared style dict.
            xaxis_title = copy.deepcopy(self.style_dict["dict_xaxis_continuous"])
            xaxis_title["text"] = col
        if yaxis_title is None:
            yaxis_title = self.style_dict["dict_yaxis_continuous"]
        if xaxis is None:
            xaxis = self.style_dict["dict_xaxis"]
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]

        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            yaxis_title=yaxis_title,
            xaxis=xaxis,
            height=height,
            width=width,
            hovermode=hovermode,
        )
        fig.update_traces(hovertemplate="%{y:.2f}", showlegend=True)
        fig.update_xaxes(showgrid=False)
        fig.update_yaxes(showgrid=False)
        return fig

    def generate_fig_univariate_categorical(
        self,
        df_all: pd.DataFrame,
        col: str,
        hue: str,
        dict_color_palette: dict,  # FIXME: unused
        nb_cat_max: int = 15,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: dict | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
        legend: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Returns a plotly figure containing the distribution of a categorical feature.

        If the feature is categorical and contains too many categories, the smallest
        categories are grouped into a new 'Other' category so that the graph remains readable.
        Bars are drawn for the first two modalities of ``hue``; a single modality is accepted.

        Parameters
        ----------
        df_all : pd.DataFrame
            The input dataframe that contains the column of interest
        col : str
            The column of interest
        hue : str
            The column used to distinguish the values (ex. 'train' and 'test')
        dict_color_palette : dict
            Currently unused; colors are read from ``self.style_dict``.
        nb_cat_max : int
            The number max of categories to be displayed. If the number of categories
            is greater than nb_cat_max then groups smallest categories into a new
            'Other' category
        template : str, optional
            Template (background style) for the plot
        title : str, optional
            Plot title
        xaxis_title : dict, optional
            X axis title
        yaxis_title : dict, optional
            y axis title. The default is taken from ``self.style_dict`` with ``col`` as text.
        xaxis : str, optional
            Currently unused.
        height : str, optional
            Height of the plot
        width : str, optional
            Width of the plot
        hovermode : str, optional
            Type of labels displaying on mouse hovering
        legend : str, optional
            Axis legends

        Returns
        -------
        plotly.graph_objs._figure.Figure
        """
        # Count of rows per (category, hue) pair and its share within each hue value.
        df_cat = df_all.groupby([col, hue]).agg({col: "count"}).rename(columns={col: "count"}).reset_index()
        df_cat["Percent"] = df_cat["count"] * 100 / df_cat.groupby(hue)["count"].transform("sum")

        if pd.api.types.is_numeric_dtype(df_cat[col].dtype):
            df_cat = df_cat.sort_values(col, ascending=True)
            df_cat[col] = df_cat[col].astype(str)

        nb_cat = df_cat.groupby([col]).agg({"count": "sum"}).reset_index()[col].nunique()
        if nb_cat > nb_cat_max:
            df_cat = self._merge_small_categories(df_cat=df_cat, col=col, hue=hue, nb_cat_max=nb_cat_max)

        # Order the categories by the absolute gap between the percentages of the hue values.
        df_to_sort = df_cat.copy().reset_index(drop=True)
        df_to_sort["Sorted_indicator"] = df_to_sort.sort_values([col]).groupby([col])["Percent"].diff()
        df_to_sort["Sorted_indicator"] = np.abs(df_to_sort["Sorted_indicator"])
        df_sorted = df_to_sort.dropna()[[col, "Sorted_indicator"]]
        df_cat = (
            df_cat.merge(df_sorted, how="left", on=col)
            .sort_values("Sorted_indicator", ascending=True)
            .drop("Sorted_indicator", axis=1)
        )
        df_cat["Percent_displayed"] = df_cat["Percent"].apply(lambda row: str(round(row, 2)) + " %")

        modalities = df_cat[hue].unique().tolist()
        bar_colors = list(self.style_dict["univariate_cat_bar"].values())

        def bars_for(modality, bar_color):
            """Horizontal bar figure restricted to one modality of ``hue``."""
            bars = px.bar(
                df_cat[df_cat[hue] == modality],
                x="Percent",
                y=col,
                orientation="h",
                barmode="group",
                color=hue,
                text="Percent_displayed",
            )
            bars.update_traces(marker_color=bar_color, showlegend=True)
            return bars

        fig = bars_for(modalities[0], bar_colors[1])
        # The second trace only exists when the hue column holds at least two modalities.
        if len(modalities) > 1:
            fig.add_trace(bars_for(modalities[1], bar_colors[0]).data[0])

        fig.update_xaxes(showgrid=False, showticklabels=True)
        fig.update_yaxes(showgrid=False, showticklabels=True, automargin=True)
        fig.update_traces(showlegend=True, textposition="outside", cliponaxis=False)

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            xaxis_title = self.style_dict["dict_xaxis_title"]
        if yaxis_title is None:
            # Deep copy so that setting the text does not leak into the shared style dict.
            yaxis_title = copy.deepcopy(self.style_dict["dict_yaxis_title"])
            yaxis_title["text"] = col
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]
        if legend is None:
            legend = self.style_dict["dict_legend"]

        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            height=height,
            width=width,
            yaxis_title=yaxis_title,
            hovermode=hovermode,
            legend=legend,
            xaxis_range=[0, max(df_cat["Percent"]) + 10],
        )
        return fig

    def _merge_small_categories(self, df_cat: pd.DataFrame, col: str, hue: str, nb_cat_max: int) -> pd.DataFrame:
        """Merges categories of column 'col' of df_cat into 'Other' category so that
        the number of categories is less than nb_cat_max.

        Parameters
        ----------
        df_cat : pd.DataFrame
            Dataframe with the columns ``col``, ``hue``, 'count' and 'Percent'.
        col : str
            Column holding the categories.
        hue : str
            Column used to distinguish the values (ex. 'train' and 'test').
        nb_cat_max : int
            Maximum number of categories after the merge, 'Other' included.

        Returns
        -------
        pd.DataFrame
            The rows of the ``nb_cat_max - 1`` largest categories followed by one 'Other'
            row per hue value, carrying the summed 'count' and 'Percent' of the merged rows.
        """
        df_cat_sum_hue = df_cat.groupby([col]).agg({"count": "sum"}).reset_index()
        # Everything after the (nb_cat_max - 1) largest categories goes into 'Other'.
        list_cat_to_merge = df_cat_sum_hue.sort_values("count", ascending=False)[col].to_list()[nb_cat_max - 1 :]
        is_merged = df_cat[col].isin(list_cat_to_merge)
        df_cat_other = df_cat.loc[is_merged].groupby(hue, as_index=False)[["count", "Percent"]].sum()
        df_cat_other[col] = "Other"
        return pd.concat([df_cat.loc[~is_merged], df_cat_other])

    def scatter_feature_importance(
        self, feature_importance: pd.DataFrame | None = None, datadrift_stat_test: pd.DataFrame | None = None
    ) -> plotly.graph_objs._figure.Figure:
        """Displays scatter of feature importance between drift model and production one
        extracted from a datasets created during the compile step.

        Parameters
        ----------
        feature_importance : pd.DataFrame, optional
            DataFrame containing feature importance for each features from production and
            drift model. Defaults to ``self.smartdrift.feature_importance`` indexed by 'feature'.
        datadrift_stat_test : pd.DataFrame, optional
            DataFrame containing the result of datadrift univariate tests.
            Defaults to ``self.smartdrift.datadrift_stat_test``.

        Returns
        -------
        plotly.graph_objs._figure.Figure

        Raises
        ------
        RuntimeError
            If an argument is omitted and the matching SmartDrift attribute is None.
        """
        dict_t = copy.deepcopy(self.style_dict["dict_title"])
        dict_xaxis = copy.deepcopy(self.style_dict["dict_xaxis_title"])
        dict_yaxis = copy.deepcopy(self.style_dict["dict_yaxis_title"])
        title = "<b>Datadrift Vs Feature Importance</b>"
        dict_t["text"] = title
        dict_xaxis["text"] = "Datadrift Importance"
        dict_yaxis["text"] = "Feature Importance - Deployed Model"

        if feature_importance is None:
            if self.smartdrift.feature_importance is None:
                raise RuntimeError("SmartDrift().feature_importance should not be None at this point.")
            feature_importance = self.smartdrift.feature_importance.set_index("feature")
        if datadrift_stat_test is None:
            if self.smartdrift.datadrift_stat_test is None:
                raise RuntimeError("SmartDrift().datadrift_stat_test should not be None at this point.")
            datadrift_stat_test = self.smartdrift.datadrift_stat_test

        data = datadrift_stat_test.join(feature_importance)
        data["features"] = data.index

        # One marker symbol per statistical test. Symbols are reused cyclically so that any
        # number of distinct tests (one, two or more) gets a symbol.
        symbol_list = [0, 13]
        symbol_dict = {
            testname: symbol_list[position % len(symbol_list)]
            for position, testname in enumerate(data["testname"].unique())
        }

        hv_text = [
            f"<b>Feature: {feat}</b><br />Deployed Model Importance: {depimp * 100:.1f}%<br />"
            + f"Datadrift test: {t} - pvalue: {pv:.5f}<br />"
            + f"Datadrift model Importance: {ddrimp * 100:.1f}"
            for feat, depimp, t, pv, ddrimp in zip(
                *map(data.get, ["features", "deployed_model", "testname", "pvalue", "datadrift_classifier"]),
                strict=True,
            )
        ]

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=data["datadrift_classifier"],
                y=data["deployed_model"],
                # Read the test names from the same frame as x and y so they stay aligned.
                marker_symbol=data["testname"].map(symbol_dict),
                mode="markers",
                showlegend=False,
                hovertext=hv_text,
                hovertemplate="%{hovertext}<extra></extra>",
            )
        )
        fig.update_traces(marker={"size": 15, "opacity": 0.8, "line": {"width": 0.8, "color": "white"}})

        # Color the markers by p-value through a shared color axis.
        fig.data[0].marker.color = data["pvalue"]
        fig.data[0].marker.coloraxis = "coloraxis"
        fig.layout.coloraxis.colorscale = self.style_dict["featimportance_colorscale"]
        fig.layout.coloraxis.colorbar = {"title": {"text": "Univariate<br />DataDrift Test<br />Pvalue"}}

        fig.update_layout(
            template=self.style_dict["template"],
            title=dict_t,
            xaxis_title=dict_xaxis,
            yaxis_title=dict_yaxis,
            height=self.style_dict["height"],
            width=self.style_dict["width"],
            hovermode=self.style_dict["hovermode"],
        )
        return fig

    def generate_historical_datadrift_metric(
        self,
        datadrift_historical: pd.DataFrame | None = None,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: str | None = None,
        yaxis_title: str | None = None,  # FIXME: unused
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Displays line plot of the evolution of the datadrift metrics :
        AUC of Datadrift classifier and if deployed_model fill, Jensen Shannon divergence of
        distribution of prediction.

        Metrics are averaged per date before being plotted.

        Parameters
        ----------
        datadrift_historical : pd.DataFrame, optional
            DataFrame with the columns 'date', 'auc' and, if deployed_model fill, 'JS_predict'.
            Defaults to ``self.smartdrift.historical_auc``.
        template : str, optional
            Template (background style) for the plot
        title : str, optional
            Plot title
        xaxis_title : str, optional
            X axis title
        yaxis_title : str, optional
            Currently unused.
        xaxis : str, optional
            Currently unused.
        height : str, optional
            Height of the plot
        width : str, optional
            Width of the plot
        hovermode : str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.graph_objs._figure.Figure

        Raises
        ------
        ValueError
            If no historical data is given and ``self.smartdrift.historical_auc`` is None.
        """
        if datadrift_historical is None:
            datadrift_historical = self.smartdrift.historical_auc
        if datadrift_historical is None:
            # Fail with an explicit message instead of a "NoneType is not subscriptable" error.
            raise ValueError(
                "No historical datadrift data available: pass datadrift_historical "
                "or make sure SmartDrift().historical_auc is set."
            )

        has_deployed_model = self.smartdrift.deployed_model is not None
        metrics = ["auc", "JS_predict"] if has_deployed_model else ["auc"]
        datadrift_historical = datadrift_historical.groupby("date")[metrics].mean().reset_index()
        datadrift_historical = datadrift_historical.sort_values(by="date")
        datadrift_historical["auc_displayed"] = datadrift_historical["auc"].round(2)

        if has_deployed_model:
            # Two metrics with different scales: AUC on the left axis, divergence on the right.
            fig = make_subplots(specs=[[{"secondary_y": True}]])
            fig.add_trace(
                go.Scatter(
                    x=datadrift_historical["date"], y=datadrift_historical["auc"], name="Datadrift classifier AUC"
                ),
                secondary_y=False,
            )
            fig.add_trace(
                go.Scatter(
                    x=datadrift_historical["date"],
                    y=datadrift_historical["JS_predict"],
                    name="Jensen_Shannon Prediction Divergence",
                ),
                secondary_y=True,
            )
            fig.update_layout(title_text="Evolution of data drift")
            fig.update_yaxes(title_text="<b>Datadrift classifier AUC</b> ", secondary_y=False)
            fig.update_yaxes(title_text="<b>Jensen_Shannon Prediction Divergence</b> ", secondary_y=True)
            fig.update_yaxes(range=[0.5, 1], secondary_y=False)
            fig.update_yaxes(range=[0, 0.3], secondary_y=True)
        else:
            fig = px.line(
                datadrift_historical,
                x="date",
                y="auc",
                title="AUC's Evolution of Datadrift classifier",
                text="auc_displayed",
            )
            fig.update_yaxes(title_text="<b>Datadrift classifier AUC</b>")
            fig.update_yaxes(range=[0.5, 1])
            fig.update_traces(textposition="bottom right")

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            xaxis_title = self.style_dict["dict_xaxis_title"]
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]

        fig.update_xaxes(showgrid=False)
        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            height=height,
            width=width,
            hovermode=hovermode,
        )
        return fig

    @staticmethod
    def _round_metric(value: float) -> float:
        """Round ``value`` to 3 decimals plus one per '0' digit after its decimal point.

        Values whose text form has no decimal point (ex. integers) are rounded to 3 decimals.
        """
        _, _, decimals = str(value).partition(".")
        return round(value, decimals.count("0") + 3)

    def generate_modeldrift_data(
        self,
        data_modeldrift: pd.DataFrame | None = None,
        metric: str = "performance",
        reference_columns: list | None = None,
        template: str | None = None,
        title: str | None = None,
        xaxis_title: str | None = None,
        yaxis_title: dict | None = None,
        xaxis: str | None = None,  # FIXME: unused
        height: str | None = None,
        width: str | None = None,
        hovermode: str | None = None,
    ) -> plotly.graph_objs._figure.Figure:
        """Displays line plot of the evolution of the Lift computed for deployed model
        with several criterias.

        The metric is rounded for display on a copy; the input dataframe is not modified.

        Parameters
        ----------
        data_modeldrift : pd.DataFrame, optional
            DataFrame containing the aggregated informations to display modeldrift, with a
            'Date' column. Defaults to ``self.smartdrift.data_modeldrift``.
        metric : str
            Column name of the metric computed
        reference_columns : list, optional
            list of reference columns used to display the metric according to different criteria
        template : str, optional
            Template (background style) for the plot
        title : str, optional
            Plot title
        xaxis_title : str, optional
            X axis title
        yaxis_title : dict, optional
            y axis title. The default is taken from ``self.style_dict`` with ``metric`` as text.
        xaxis : str, optional
            Currently unused.
        height : str, optional
            Height of the plot
        width : str, optional
            Width of the plot
        hovermode : str, optional
            Type of labels displaying on mouse hovering

        Returns
        -------
        plotly.graph_objs._figure.Figure

        Raises
        ------
        ValueError
            If no data is given and ``self.smartdrift.data_modeldrift`` is None.
        """
        if data_modeldrift is None:
            data_modeldrift = self.smartdrift.data_modeldrift
        if data_modeldrift is None:
            raise ValueError(
                "You should run the add_data_modeldrift method before displaying model drift performances. \n"
                "For more information see the documentation"
            )

        # Round on a copy so the display precision is not written back to the source data.
        data_modeldrift = data_modeldrift.copy()
        data_modeldrift[metric] = data_modeldrift[metric].apply(self._round_metric)

        if reference_columns is None:
            reference_columns = []

        fig = px.line(
            data_modeldrift,
            x="Date",
            y=metric,
            hover_data=reference_columns,
            title="Performance's Evolution on deployed model",
            text=metric,
        )
        fig.update_traces(textposition="top right")

        if template is None:
            template = self.style_dict["template"]
        if title is None:
            title = self.style_dict["dict_title"]
        if xaxis_title is None:
            xaxis_title = self.style_dict["dict_xaxis_title"]
        if yaxis_title is None:
            # Deep copy so that setting the text does not leak into the shared style dict.
            yaxis_title = copy.deepcopy(self.style_dict["dict_yaxis_title"])
            yaxis_title["text"] = metric
        if height is None:
            height = self.style_dict["height"]
        if width is None:
            width = self.style_dict["width"]
        if hovermode is None:
            hovermode = self.style_dict["hovermode"]

        fig.update_xaxes(showgrid=False)
        fig.update_layout(
            template=template,
            title=title,
            xaxis_title=xaxis_title,
            height=height,
            width=width,
            yaxis_title=yaxis_title,
            hovermode=hovermode,
        )
        fig.data[0].line.color = self.style_dict["datadrift_historical"]
        fig.data[-1].marker.color = self.style_dict["datadrift_historical"]
        return fig

    def define_style_attributes(self, colors_dict: dict) -> None:
        """define_style_attributes allows Eurybia user to change the color of plot

        Parameters
        ----------
        colors_dict : dict
            Dict of the colors used in the different plots
        """
        self.style_dict = define_style(colors_dict)

        if hasattr(self, "pred_colorscale"):
            delattr(self, "pred_colorscale")

    @staticmethod
    def _gauge_steps(palette: list, min_gauge: float, max_gauge: float) -> list[dict]:
        """Split ``[min_gauge, max_gauge]`` into one equal-width gauge step per palette color."""
        # linspace returns exactly len(palette) + 1 edges and ends exactly on max_gauge.
        edges = np.linspace(min_gauge, max_gauge, len(palette) + 1).tolist()
        return [
            {"range": [lower, upper], "color": shade}
            for lower, upper, shade in zip(edges[:-1], edges[1:], palette)
        ]

    def generate_indicator(
        self,
        fig_value: float,
        min_gauge: float = 0.5,
        max_gauge: float = 1,
        height: float = 300,
        width: float = 500,
        title: str = "Metric",
        image: bool = False,
    ) -> plotly.graph_objs._figure.Figure | bytes:
        """Generate an indicator gauge with a colorbar.

        The gauge background is a 100-step green / yellow / orange / red gradient spanning
        ``[min_gauge, max_gauge]``.

        Parameters
        ----------
        fig_value : float
            Value to display on the gauge (rounded to 2 decimals).
        min_gauge : float, optional
            Minimum value of the gauge range. Defaults to 0.5.
        max_gauge : float, optional
            Maximum value of the gauge range. Defaults to 1.
        height : float, optional
            Height of the plot. Defaults to 300.
        width : float, optional
            Width of the plot. Defaults to 500.
        title : str, optional
            Title of the plot. Defaults to "Metric".
        image : bool, optional
            If True, returns the result of ``fig.to_image()`` instead of the figure.
            Defaults to False.

        Returns
        -------
        plotly.graph_objs._figure.Figure | bytes
            The generated figure or image.
        """
        palette = sns.blend_palette(["green", "yellow", "orange", "red"], 100).as_hex()
        steps = self._gauge_steps(palette, min_gauge, max_gauge)

        fig = go.Figure(
            go.Indicator(
                mode="gauge+number",
                value=round(fig_value, 2),
                domain={"x": [0, 1], "y": [0, 1]},
                title={"text": title, "align": "center", "font": {"size": 20}},
                gauge={
                    # NOTE(review): ticktext is given without tickvals - confirm that the
                    # "No Drift" / "High Drift" labels are actually rendered.
                    "axis": {"range": [min_gauge, max_gauge], "ticktext": ["No Drift", "High Drift"], "tickwidth": 1},
                    "bar": {"color": "black"},
                    "borderwidth": 0,
                    "steps": steps,
                },
            )
        )
        fig.update_layout(
            height=height,
            width=width,
        )
        return fig.to_image() if image else fig