Source code for eurybia.report.generation

"""
Report generation helper module.
"""
from datetime import datetime
from typing import Optional

import datapane as dp
import pandas as pd
from shapash.explainer.smart_explainer import SmartExplainer

from eurybia import SmartDrift
from eurybia.report.project_report import DriftReport


def _get_index(dr: DriftReport, project_info_file: str, config_report: Optional[dict]) -> dp.Page:
    """
    This function generates and returns a Datapane page containing the Eurybia report index

    Parameters
    ----------
    dr : DriftReport
        DriftReport object
    project_info_file : str
        Path to the file used to display some information about the project in the report.
    config_report : dict, optional
        Report configuration options.
    Returns
    ----------
    datapane.Page
    """

    eurybia_logo = """
    <html>
        <img style="max-width: 150px; height: auto;" src="https://eurybia.readthedocs.io/en/latest/_images/eurybia-fond-clair.png?raw=true"/>
    </html>
    """

    # main block
    index_block = []

    # Title and logo
    index_block += [dp.Group(dp.HTML(eurybia_logo), dp.Text(f"# {dr.title_story}"), columns=2)]

    if (
        config_report is not None
        and "title_description" in config_report.keys()
        and config_report["title_description"] != ""
    ):
        raw_title = config_report["title_description"]
        index_block += [dp.Text(f"## {raw_title}")]
    index_str = "## Eurybia Report contents  \n"

    # Tabs index
    if project_info_file is not None:
        index_str += "- Project information: report context and information  \n"
    index_str += "- Consistency Analysis: highlighting differences between the two datasets  \n"
    index_str += "- Data drift: In-depth data drift analysis \n"

    if dr.smartdrift.data_modeldrift is not None:
        index_str += "- Model drift: In-depth model drift analysis"

    index_block += [dp.Text(index_str)]

    # AUC
    auc_block = dr.smartdrift.plot.generate_indicator(
        fig_value=dr.smartdrift.auc, height=280, width=500, title="Datadrift classifier AUC"
    )

    # Jensen-Shannon
    if dr.smartdrift.deployed_model is not None:
        JS_block = dr.smartdrift.plot.generate_indicator(
            fig_value=dr.smartdrift.js_divergence,
            height=280,
            width=500,
            title="Jensen Shannon Datadrift",
            min_gauge=0,
            max_gauge=0.2,
        )
        index_block += [dp.Group(auc_block, JS_block, columns=3)]
    else:
        index_block += [dp.Group(auc_block, columns=2)]

    page_index = dp.Page(title="Index", blocks=index_block)
    return page_index


def _dict_to_text_blocks(text_dict, level=1):
    """
    This function recursively explores the dict and returns a Datapane Group containing other groups and text blocks fed with the dict
    Parameters
    ----------
    text_dict: dict
        This dict must contain string as keys, and dicts or strings as values
    level: int = 1
        Recursion level, starting at 1 to allow for direct string manipulation
    Returns
    ----------
    datapane.Group
        Group of blocks
    """
    blocks = []
    text = ""
    for k, v in text_dict.items():
        if isinstance(v, (str, int, float)) or v is None:
            if k.lower() == "date" and v.lower() == "auto":
                v = str(datetime.now())[:-7]
            text += f"**{k}** : {v}  \n"
        elif isinstance(v, dict):
            if text != "":
                blocks.append(dp.Text(text))
                text = ""
            blocks.append(
                dp.Group(dp.Text("#" * min(level, 6) + " " + str(k)), _dict_to_text_blocks(v, level + 1), columns=1)
            )
    if text != "":
        blocks.append(dp.Text(text))
    return dp.Group(blocks=blocks, columns=1)


def _get_project_info(dr: DriftReport) -> dp.Page:
    """
    This function generates and returns a Datapane page from a dict containing dicts and strings

    Parameters
    ----------
    dr : DriftReport
        DriftReport object

    Returns
    ----------
    datapane.Page
    """
    if dr.metadata is None:
        return None
    page_info = dp.Page(
        title="Project information",
        blocks=[_dict_to_text_blocks(dr.metadata)],
    )
    return page_info


def _get_consistency_analysis(dr: DriftReport) -> dp.Page:
    """
    This function generates and returns a Datapane page containing the Eurybia consistency analysis

    Parameters
    ----------
    dr : DriftReport
        DriftReport object

    Returns
    ----------
    datapane.Page
    """
    # Title
    blocks = [dp.Text("# Consistency Analysis")]

    # Manually ignored coluumns
    ignore_cols = pd.DataFrame({"ignore_cols": dr.smartdrift.ignore_cols}).rename(
        columns={"ignore_cols": "Ignored columns"}
    )
    blocks += [
        dp.Text("## Ignored columns in the report (manually excluded)"),
    ]
    if len(ignore_cols) > 0:
        blocks += [dp.Table(data=ignore_cols)]
    else:
        blocks += [dp.Text("- Ignored columns : None.")]

    # Column mismatches
    blocks += [
        dp.Text("## Consistency checks: column match between the 2 datasets."),
        dp.Text(
            """
            The columns identified in this section have been automatically removed from this analysis.
            Their presence would always be sufficient for the datadrift classifier to perfectly discriminate the two datasets (maximal data drift, AUC=1).
            """
        ),
    ]
    for k, v in dr.smartdrift.pb_cols.items():
        if len(v) > 0:
            blocks += [dp.Table(data=pd.DataFrame(v).transpose())]
        else:
            blocks += [dp.Text(f"- No {k.lower()} have been detected.")]

    blocks += [
        dp.Text("### Unique values identified:"),
        dp.Text(
            """
            This section displays categorical features in which unique values differ.
            This analysis has been performed on unstratified samples of the baseline and current datasets.
            Missing or added unique values can be caused by this sampling.
            Columns identified in this section have been kept for the analysis.
            """
        ),
    ]
    if len(dr.smartdrift.err_mods) > 0:
        blocks += [
            dp.Table(
                data=pd.DataFrame(dr.smartdrift.err_mods)
                .rename(columns={"err_mods": "Modalities present in one dataset and absent in the other :"})
                .transpose()
            )
        ]
    else:
        blocks += [dp.Text("- No modalities have been detected as present in one dataset and absent in the other.")]

    page_consistency = dp.Page(title="Consistency Analysis", blocks=blocks)
    return page_consistency


def _get_datadrift(dr: DriftReport) -> dp.Page:
    """
    This function generates and returns a Datapane page containing the Eurybia data drift analysis

    Parameters
    ----------
    dr : DriftReport
        DriftReport object

    Returns
    ----------
    datapane.Page
    """
    # Loop for save in list plots of display analysis
    plot_dataset_analysis = []
    table_dataset_analysis = []
    fig_list, labels, table_list = dr.display_dataset_analysis(global_analysis=False)["univariate"]
    for i in range(len(labels)):
        plot_dataset_analysis.append(dp.Plot(fig_list[i], label=labels[i]))
        table_dataset_analysis.append(dp.Table(table_list[i], label=labels[i]))

    # Loop for save in list plots of display analysis
    plot_datadrift_contribution = []
    fig_list, labels = dr.display_model_contribution()
    for i in range(len(labels)):
        plot_datadrift_contribution.append(dp.Plot(fig_list[i], label=labels[i]))
    blocks = [
        dp.Text("# Data drift"),
        dp.Text(
            """The data drift detection methodology is based on the ability of a model classifier to identify whether
        a sample belongs to one or another dataset.
        For this purpose a target (0) is assigned to the baseline dataset and a second target (1) to the current dataset.
        A classification model (catboost) is trained to predict this target.
        As such, the data drift classifier performance is directly related to the difference between two datasets.
        A marked difference will lead to an easy classification (final AUC close to 1).
        Oppositely, highly similars datasets will lead to poor data drift classifier performance (final AUC close to 0.5)."""
        ),
        dp.Text("## Detecting data drift"),
        dp.Text("### Datadrift classifier model perfomances"),
        dp.Text(
            """The closer your AUC is from 0.5 the less your data drifted.
             The closer your AUC is from 1 the more your data drifted"""
        ),
        dp.Plot(
            dr.smartdrift.plot.generate_indicator(
                fig_value=dr.smartdrift.auc, height=300, width=500, title="Datadrift classifier AUC"
            )
        ),
        dp.Text("## Importance of features in data drift"),
        dp.Text("### Global feature importance plot"),
        dp.Text(
            """Bar chart representing the feature importance of each feature for the datadrift classifier.
             This parameter is a direct measure of the importance of a feature to perform the classification."""
        ),
        dp.Plot(dr.explainer.plot.features_importance()),
    ]
    if dr.smartdrift.deployed_model is not None:
        blocks += [
            dp.Text("### Feature importance overview"),
            dp.Text(
                """Scatter plot depicting, for each feature, the feature importance of the deployed model as a function of the datadrift classifier
                 feature importance. This graph thus highlight the real importance of a data drift for the deployed model classification.
                Interpretation based on graphical feature location:
            - Top left : Feature highly important for the deployed model and with low data drift
            - Bottom left : Feature with moderated importance for the deployed model and with low data drift
            - Bottom right : Feature with moderated importance for the deployed model but with high data drift.
            This feature might require your attention.
            - Top right : Feature highly important for the deployed model and high drift. This feature requires your attention.
            """
            ),
            dp.Plot(dr.smartdrift.plot.scatter_feature_importance()),
        ]
    blocks += [
        dp.Text("## Dataset analysis"),
        dp.Text(
            """This section provides numerical and graphical analysis of the 2 datasets distributions,
            making easier the study of the most important variable for drift detection."""
        ),
        dp.Text("### Global analysis"),
        dp.Table(dr._display_dataset_analysis_global()),
        dp.Text("### Univariate analysis"),
        dp.Text(
            """Bar chart showing the unique values distribution of a feature.
        Using the drop-down menu, it is possible to select the feature of interest.
        Features are sorted according to their respective importance in the datadrift classifier.
        For categorical features, the possible values are sorted by descending difference between the two datasets."""
        ),
        dp.Select(blocks=plot_dataset_analysis, type=dp.SelectType.DROPDOWN),
        dp.Select(blocks=table_dataset_analysis, type=dp.SelectType.DROPDOWN),
    ]
    if dr.smartdrift.deployed_model is not None:
        blocks += [
            dp.Text("### Distribution of predicted values"),
            dp.Text(
                "Histogram density showing the distributions of the production model outputs on both baseline and current datasets."
            ),
            dp.Plot(
                dr.smartdrift.plot.generate_fig_univariate(df_all=dr.smartdrift.df_predict, col="Score", hue="dataset")
            ),
            dp.Text(
                """Jensen Shannon Divergence (JSD). The JSD measures the effect of a data drift on the deployed model performance.
            A value close to 0 indicates similar data distributions, while a value close to 1 tend to indicate distinct data distributions
            with a negative effect on the deployed model performance."""
            ),
            dr.smartdrift.plot.generate_indicator(
                fig_value=dr.smartdrift.js_divergence,
                height=280,
                width=500,
                title="Jensen Shannon Datadrift",
                min_gauge=0,
                max_gauge=0.2,
            ),
        ]
    blocks += [
        dp.Text("## Feature contribution on data drift's detection"),
        dp.Text(
            """This graph represents the contribution of a variable to the data drift detection.
        This representation constitutes a support to understand the drift when the analysis of the dataset is unclear.
        In the drop-down menu, features are sorted by importance in the data drift detection."""
        ),
        dp.Select(blocks=plot_datadrift_contribution, type=dp.SelectType.DROPDOWN),
    ]
    if dr.smartdrift.historical_auc is not None:
        blocks += [
            dp.Text("## Historical Data drift"),
            dp.Text(
                "Line chart showing the metrics evolution of the datadrift classifier over the given period of time."
            ),
            dp.Plot(dr.smartdrift.plot.generate_historical_datadrift_metric()),
        ]
    page_datadrift = dp.Page(title="Data drift", blocks=blocks)
    return page_datadrift


def _get_modeldrift(dr: DriftReport) -> dp.Page:
    """
    This function generates and returns a Datapane page containing the Eurybia model drift analysis

    Parameters
    ----------
    dr : DriftReport
        DriftReport object

    Returns
    ----------
    datapane.Page
    """
    # Loop for save in list plots of display model drift
    if dr.smartdrift.data_modeldrift is not None:
        plot_modeldrift = []
        fig_list, labels = dr.display_data_modeldrift()
        if labels == []:
            plot_modeldrift = dp.Plot(fig_list[0])
            modeldrift_plot = plot_modeldrift
        else:
            for i in range(len(labels)):
                plot_modeldrift.append(dp.Plot(fig_list[i], label=labels[i]))
            modeldrift_plot = dp.Select(blocks=plot_modeldrift, label="reference_columns", type=dp.SelectType.DROPDOWN)
    else:
        modeldrift_plot = dp.Text("## Smartdrift.data_modeldrift is None")
    blocks = [
        dp.Text("# Model drift"),
        dp.Text(
            """This section provides support to monitor the production model's performance over time.
    This requires the performance history as input."""
        ),
        dp.Text("## Performance evolution of the deployed model"),
        dp.Text("Line chart of deployed model performances as a function of time"),
        modeldrift_plot,
    ]
    page_modeldrift = dp.Page(title="Model drift", blocks=blocks)
    return page_modeldrift


[docs]def execute_report( smartdrift: SmartDrift, explainer: SmartExplainer, project_info_file: str, output_file: str, config_report: Optional[dict] = None, ): """ Creates the report Parameters ---------- smartdrift : eurybia.core.smartdrift.SmartDrift object Compiled SmartDrift class explainer : shapash.explainer.smart_explainer.SmartExplainer object Compiled shapash explainer. project_info_file : str Path to the file used to display some information about the project in the report. config_report : dict, optional Report configuration options. output_file : str Path to the HTML file to write """ if config_report is None: config_report = {} dr = DriftReport( smartdrift=smartdrift, explainer=explainer, # rename to match kwarg project_info_file=project_info_file, config_report=config_report, ) pages = [] pages.append(_get_index(dr, project_info_file, config_report)) if project_info_file is not None: pages.append(_get_project_info(dr)) pages.append(_get_consistency_analysis(dr)) pages.append(_get_datadrift(dr)) if dr.smartdrift.data_modeldrift is not None: pages.append(_get_modeldrift(dr)) report = dp.View(blocks=pages) dp.save_report(report, path=output_file, open=False, name="report.html")