
Commit 6d99d7e

[ENH] MCM plot function tidy. (#3099)
* mcm tidy * more fixes * more fixes * docs and holm * Update note on pvalue parameters usage Clarified note on pvalue parameters for configuration. * Automatic `pre-commit` fixes --------- Co-authored-by: MatthewMiddlehurst <25731235+MatthewMiddlehurst@users.noreply.github.com>
1 parent 63dda95 commit 6d99d7e

3 files changed: +59 -104 lines changed

aeon/visualisation/results/_mcm.py

Lines changed: 38 additions & 98 deletions
@@ -1,6 +1,6 @@
 """Function to create the Multi-Comparison Matrix (MCM) results visualisation."""
 
-__maintainer__ = ["TonyBagnall"]
+__maintainer__ = ["hadifawaz1999", "TonyBagnall"]
 
 __all__ = ["create_multi_comparison_matrix"]
 
@@ -15,23 +15,20 @@
 
 
 def create_multi_comparison_matrix(
-    df_results,
+    results,
     save_path="./mcm",
     formats=None,
-    used_statistic="Accuracy",
+    statistic_name="Score",
     plot_1v1_comparisons=False,
     higher_stat_better=True,
     include_pvalue=True,
     pvalue_test="wilcoxon",
     pvalue_test_params=None,
-    pvalue_correction=None,
+    pvalue_correction="Holm",
     pvalue_threshold=0.05,
-    use_mean="mean-difference",
     order_stats="average-statistic",
     order_stats_increasing=False,
-    dataset_column=None,
     precision=4,
-    load_analysis=False,
     row_comparates=None,
     col_comparates=None,
     excluded_row_comparates=None,
@@ -43,20 +40,25 @@ def create_multi_comparison_matrix(
     colorbar_value=None,
     win_tie_loss_labels=None,
     include_legend=True,
-    show_symetry=True,
+    show_symmetry=True,
 ):
     """Generate the Multi-Comparison Matrix (MCM) [1]_.
 
     MCM summarises a set of results for multiple estimators evaluated on multiple
     datasets. The MCM is a heatmap that shows absolute performance and tests for
     significant difference. It is configurable in many ways.
 
+    Note: this implementation uses different pvalue parameters from the original
+    by default. To use the original parameters, set ``pvalue_test_params`` to
+    ``{"zero_method": "pratt", "alternative": "two-sided"}`` and
+    ``pvalue_correction`` to ``None``.
+
     Parameters
     ----------
-    df_results: str or pd.DataFrame
-        A csv file containing results in `n_problems,n_estimators` format. The first
-        row should contain the names of the estimators and the first column can
-        contain the names of the problems if `dataset_column` is true.
+    results: pd.DataFrame
+        A dataframe of scores. Columns are the names of the estimators and rows are
+        the different problems. The estimator names present in the columns will be
+        used as the comparate names in the MCM.
     save_path: str, default = './mcm'
         The output directory for the results. If you want to save the results with a
         different filename, you must include the filename in the path.
@@ -65,33 +67,28 @@ def create_multi_comparison_matrix(
         File formats to save in the save_path.
         - If None, no files are saved.
         - Valid formats are 'pdf', 'png', 'json', 'csv', 'tex'.
-    used_statistic: str, default = 'Score'
-        Name of the metric being assesses (e.g. accuracy, error, mse).
-    save_as_json: bool, default = True
-        Whether or not to save the python analysis dict into a json file format.
+    statistic_name: str, default = 'Score'
+        Name of the metric being assessed (e.g. accuracy, error, mse).
+        By default it is just generically labelled as 'Score'.
     plot_1v1_comparisons: bool, default = False
-        Whether or not to plot the 1v1 scatter results.
+        Whether to plot the 1v1 scatter results.
     higher_stat_better: bool, default = True
         Whether a higher value of the statistic counts as a win for a comparison.
     include_pvalue: bool, default = True
-        Condition whether or not include a pvalue stats.
+        Whether to include pvalue stats.
     pvalue_test: str, default = 'wilcoxon'
         The statistical test to produce the pvalue stats. Currently only wilcoxon is
         supported.
     pvalue_test_params: dict, default = None
         The parameters for the pvalue_test used. If pvalue_test is set
         to 'wilcoxon', check the scipy.stats.wilcoxon parameters;
         if 'wilcoxon' is set and this parameter is None, then the default setup
-        is {"zero_method": "pratt", "alternative": "greater"}.
+        is {"zero_method": "wilcox", "alternative": "greater"}.
     pvalue_correction: str, default = 'Holm'
         Correction to use for the pvalue significance test, None or "Holm".
     pvalue_threshold: float, default = 0.05
         Threshold for considering a comparison significant. If pvalue <
         pvalue_threshold, the comparison is significant.
-    use_mean: str, default = 'mean-difference'
-        The mean used to compare two estimators. The only option available
-        is 'mean-difference' which is the difference between arithmetic mean
-        over all datasets.
     order_stats: str, default = 'average-statistic'
         The way to order the statistics; the default setup orders by the average
         statistic over all datasets.
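
Note: the Wilcoxon defaults above are the crux of the behaviour change. A rough
sketch of the difference, assuming the scipy.stats.wilcoxon backend named in the
docstring and synthetic scores (this is illustrative, not code from the commit):

    import numpy as np
    from scipy.stats import wilcoxon

    rng = np.random.default_rng(0)
    x = rng.random(10)  # scores of comparate r on ten datasets
    y = rng.random(10)  # scores of comparate c on the same datasets

    # New default when pvalue_test_params is None: one-sided, zeros discarded.
    p_new = wilcoxon(x, y, zero_method="wilcox", alternative="greater").pvalue

    # Original MCM parameters: two-sided, zeros included in the ranking.
    p_orig = wilcoxon(x, y, zero_method="pratt", alternative="two-sided").pvalue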
@@ -108,17 +105,13 @@ def create_multi_comparison_matrix(
     order_stats_increasing: bool, default = False
         If True, the order_stats will be ordered in increasing order, otherwise they
         are ordered in decreasing order.
-    dataset_column: str, default = 'dataset_name'
-        The name of the datasets column in the csv file.
     precision: int, default = 4
         The number of digits after the decimal point.
-    load_analysis: bool, default = False
-        If True attempts to load the analysis json file.
     row_comparates: list of str, default = None
-        A list of included row comparates, if None, all of the comparates in the study
+        A list of included row comparates, if None, all the comparates in the study
         are placed in the rows.
     col_comparates: list of str, default = None
-        A list of included col comparates, if None, all of the comparates in the
+        A list of included col comparates, if None, all the comparates in the
         study are placed in the cols.
     excluded_row_comparates: list of str, default = None
         A list of excluded row comparates. If None, all comparates are included.
@@ -145,9 +138,9 @@ def create_multi_comparison_matrix(
         The tuple must contain exactly three strings, representing win, tie, and
         loss outcomes for the row comparate (r) against the column comparate (c).
     include_legend: bool, default = True
-        Whether or not to show the legend on the MCM.
-    show_symetry: bool, default = True
-        Whether or not to show the symmetrical part of the heatmap.
+        Whether to show the legend on the MCM.
+    show_symmetry: bool, default = True
+        Whether to show the symmetrical part of the heatmap.
 
     Returns
     -------
@@ -158,7 +151,7 @@ def create_multi_comparison_matrix(
     -------
     >>> from aeon.visualisation import create_multi_comparison_matrix  # doctest: +SKIP
     >>> create_multi_comparison_matrix(
-    ...     df_results="results.csv",
+    ...     results="results.csv",
     ...     save_path="reports/mymcm",
     ...     formats=("png", "json")
     ... )  # doctest: +SKIP
@@ -173,12 +166,6 @@ def create_multi_comparison_matrix(
        Evaluations That Is Stable Under Manipulation Of The Comparate Set
        arXiv preprint arXiv:2305.11921, 2023.
     """
-    if isinstance(df_results, str):
-        try:
-            df_results = pd.read_csv(df_results)
-        except Exception as e:
-            raise ValueError(f"No dataframe or valid path is given: Exception {e}")
-
     formats = _normalize_formats(formats)
 
     if win_tie_loss_labels is None:
@@ -192,23 +179,20 @@ def create_multi_comparison_matrix(
     win_label, tie_label, loss_label = win_tie_loss_labels
 
     analysis = _get_analysis(
-        df_results,
+        results,
         save_path=save_path,
         formats=formats,
-        used_statistic=used_statistic,
+        used_statistic=statistic_name,
         plot_1v1_comparisons=plot_1v1_comparisons,
         higher_stat_better=higher_stat_better,
         include_pvalue=include_pvalue,
         pvalue_test=pvalue_test,
         pvalue_test_params=pvalue_test_params,
         pvalue_correction=pvalue_correction,
         pvalue_threshhold=pvalue_threshold,
-        use_mean=use_mean,
         order_stats=order_stats,
         order_stats_increasing=order_stats_increasing,
-        dataset_column=dataset_column,
         precision=precision,
-        load_analysis=load_analysis,
     )
 
     # start drawing heatmap
@@ -228,15 +212,15 @@ def create_multi_comparison_matrix(
         colorbar_value=colorbar_value,
         win_tie_loss_labels=win_tie_loss_labels,
         include_legend=include_legend,
-        show_symetry=show_symetry,
+        show_symmetry=show_symmetry,
     )
     return temp
 
 
 def _get_analysis(
     df_results,
     save_path="./",
-    formats=("json"),
+    formats="json",
     used_statistic="Score",
     plot_1v1_comparisons=False,
     higher_stat_better=True,
@@ -245,12 +229,9 @@ def _get_analysis(
     pvalue_test_params=None,
     pvalue_correction=None,
     pvalue_threshhold=0.05,
-    use_mean="mean-difference",
     order_stats="average-statistic",
     order_stats_increasing=False,
-    dataset_column=None,
     precision=4,
-    load_analysis=False,
 ):
     _check_soft_dependencies("matplotlib")
     import matplotlib as mpl
@@ -344,19 +325,7 @@ def _plot_1v1(
             plt.clf()
             plt.close()
 
-    save_file = f"{save_path}_analysis.json"
-
-    if load_analysis and os.path.exists(save_file):
-        with open(save_file) as json_file:
-            analysis = json.load(json_file)
-
-        analysis.setdefault("order_stats_increasing", order_stats_increasing)
-
-        return analysis
-
     analysis = {
-        "dataset-column": dataset_column,
-        "use-mean": use_mean,
         "order-stats": order_stats,
         "order_stats_increasing": order_stats_increasing,
         "used-statistics": used_statistic,
@@ -397,7 +366,6 @@ def _plot_1v1(
                 pvalue_test=pvalue_test,
                 pvalue_test_params=pvalue_test_params,
                 pvalue_threshhold=pvalue_threshhold,
-                use_mean=use_mean,
             )
 
             analysis[pairwise_key] = pairwise_content
@@ -431,6 +399,7 @@ def _plot_1v1(
 
     _re_order_comparates(df_results=df_results, analysis=analysis)
 
+    save_file = f"{save_path}_analysis.json"
     if "json" in formats:
         with open(save_file, "w") as fjson:
             json.dump(analysis, fjson, cls=_NpEncoder)
@@ -454,7 +423,7 @@ def _draw(
     colorbar_value=None,
     win_tie_loss_labels=None,
     higher_stat_better=True,
-    show_symetry=True,
+    show_symmetry=True,
     include_legend=True,
 ):
     _check_soft_dependencies("matplotlib")
@@ -676,7 +645,7 @@ def _draw(
 
         latex_row = []
 
-        if can_be_symmetrical and (not show_symetry):
+        if can_be_symmetrical and (not show_symmetry):
             start_j = i
 
         for j in range(start_j, n_cols):
@@ -918,31 +887,11 @@ def _decode_results_data_frame(df, analysis):
 
     """
    df_columns = list(df.columns)  # extract columns from data frame
-
-    # check if dataset column name is correct
-
-    if analysis["dataset-column"] is not None:
-        if analysis["dataset-column"] not in df_columns:
-            raise KeyError("The column " + analysis["dataset-column"] + " is missing.")
-
-    # get number of examples (datasets)
-    # n_datasets = len(np.unique(np.asarray(df[analysis['dataset-column']])))
-    n_datasets = len(df.index)
-
-    analysis["n-datasets"] = n_datasets  # add number of examples to dictionary
-
-    if analysis["dataset-column"] is not None:
-        analysis["dataset-names"] = list(
-            df[analysis["dataset-column"]]
-        )  # add example names to dict
-        df_columns.remove(
-            analysis["dataset-column"]
-        )  # drop the dataset column name from columns list
-        # and keep comparate names
-
    comparate_names = df_columns.copy()
    n_comparates = len(comparate_names)
 
+    # add number of examples to dictionary
+    analysis["n-datasets"] = len(df.index)
    # add the information about comparates to dict
    analysis["comparate-names"] = comparate_names
    analysis["n-comparates"] = n_comparates
@@ -956,7 +905,6 @@ def _get_pairwise_content(
    pvalue_test="wilcoxon",
    pvalue_test_params=None,
    pvalue_threshhold=0.05,
-    use_mean="mean-difference",
 ):
    content = {}
 
@@ -992,9 +940,8 @@ def _get_pairwise_content(
 
        else:
            raise ValueError(f"{pvalue_test} test is not supported yet")
-    if use_mean == "mean-difference":
-        content["mean"] = np.mean(x) - np.mean(y)
 
+    content["mean"] = np.mean(x) - np.mean(y)
    return content
 
 
@@ -1070,13 +1017,7 @@ def _re_order_comparates(df_results, analysis):
            stats.append(analysis["average-statistic"][analysis["comparate-names"][i]])
 
    elif analysis["order-stats"] == "average-rank":
-        if analysis["dataset-column"] is not None:
-            np_results = np.asarray(
-                df_results.drop([analysis["dataset-column"]], axis=1)
-            )
-        else:
-            np_results = np.asarray(df_results)
-
+        np_results = np.asarray(df_results)
        df = pd.DataFrame(columns=["comparate-name", "values"])
 
        for i, comparate_name in enumerate(analysis["comparate-names"]):
@@ -1169,7 +1110,7 @@ def _get_cell_legend(
    tie_label="r=c",
    loss_label="r<c",
 ):
-    cell_legend = _capitalize_label(analysis["use-mean"])
+    cell_legend = _capitalize_label("mean-difference")
    longest_string = len(cell_legend)
 
    win_tie_loss_string = f"{win_label} / {tie_label} / {loss_label}"
@@ -1190,7 +1131,6 @@ def _capitalize_label(s):
 def _capitalize_label(s):
    if len(s.split("-")) == 1:
        return s.capitalize()
-
    else:
        return "-".join(ss.capitalize() for ss in s.split("-"))
 
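
Since create_multi_comparison_matrix no longer reads csv files itself, callers now
load their results into a dataframe first. A minimal usage sketch of the revised
signature (the file name and metric name here are hypothetical):

    import pandas as pd

    from aeon.visualisation import create_multi_comparison_matrix

    # Columns are estimator names, rows are problems.
    results = pd.read_csv("results.csv")

    fig = create_multi_comparison_matrix(
        results,                    # renamed from df_results
        save_path="reports/mymcm",
        formats=("png", "json"),
        statistic_name="Accuracy",  # renamed from used_statistic
    )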

aeon/visualisation/results/tests/test_mcm.py

Lines changed: 19 additions & 4 deletions
@@ -23,7 +23,25 @@ def test_mcm():
        np.random.rand(10, 3),  # 10 rows, 3 columns of random numbers
        columns=["Classifier1", "Classifier2", "Classifier3"],
    )
-    fig = create_multi_comparison_matrix(df, formats=())
+    fig = create_multi_comparison_matrix(df)
+    assert isinstance(fig, plt.Figure)
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("matplotlib", severity="none"),
+    reason="skip test if required soft dependency not available",
+)
+def test_mcm_original_pvalue():
+    """Test the multi-comparison-matrix visualisation."""
+    import matplotlib.pyplot as plt
+
+    df = pd.DataFrame(
+        np.random.rand(10, 3),  # 10 rows, 3 columns of random numbers
+        columns=["Classifier1", "Classifier2", "Classifier3"],
+    )
+    fig = create_multi_comparison_matrix(
+        df, pvalue_test_params={"zero_method": "pratt", "alternative": "two-sided"}
+    )
    assert isinstance(fig, plt.Figure)
 
 
@@ -48,6 +66,3 @@ def test_mcm_file_save():
        pvalue_correction="Holm",
    )
    assert isinstance(fig, plt.Figure)
-
-
-    # assert os.path.isfile(os.path.join(tmp, "test1.pdf"))
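
The new test exercises the docstring note; fully restoring the original behaviour
also means disabling the Holm correction, which now defaults on. A sketch along the
same lines as the tests:

    import numpy as np
    import pandas as pd

    from aeon.visualisation import create_multi_comparison_matrix

    df = pd.DataFrame(
        np.random.rand(10, 3),
        columns=["Classifier1", "Classifier2", "Classifier3"],
    )
    # Original MCM pvalue configuration: two-sided pratt Wilcoxon, no correction.
    fig = create_multi_comparison_matrix(
        df,
        pvalue_test_params={"zero_method": "pratt", "alternative": "two-sided"},
        pvalue_correction=None,  # the new default is "Holm"
    )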
