From 056280cb352d65f959d87cd8b79ff8cc19aedbeb Mon Sep 17 00:00:00 2001
From: Chris Bowdon <chris.bowdon@polecat.com>
Date: Wed, 29 Jan 2020 20:44:45 +0000
Subject: [PATCH] Implement parallel versions of precision and recall functions

---
 README.md           |  6 ++++
 bcubed/parallel.py  | 73 +++++++++++++++++++++++++++++++++++++++++++++
 test/test_bcubed.py | 50 +++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)
 create mode 100644 bcubed/parallel.py
 create mode 100644 test/test_bcubed.py

diff --git a/README.md b/README.md
index 85def88..66ac76b 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,12 @@ fscore = bcubed.fscore(precision, recall, beta=0.5)  # weights precision higher
 
 A complete example can be found in the included `example.py` file, where the examples of the source publication are used.
 
+Parallelised versions of the precision and recall functions are provided in `bcubed.parallel`. These can speed up the calculations for large datasets, but the parallelism carries an overhead, so they are unhelpful for smaller datasets. As a rule of thumb, they are useful when there are more than 5k items, depending on the system.
+
+## Running tests
+
+Tests can be run by installing `pytest` from pip and running `pytest test`.
+
 ## License
 
 This software is under the **Apache License 2.0**.
diff --git a/bcubed/parallel.py b/bcubed/parallel.py
new file mode 100644
index 0000000..2161dde
--- /dev/null
+++ b/bcubed/parallel.py
@@ -0,0 +1,73 @@
+# Simple extended BCubed implementation in Python for clustering evaluation
+# Copyright 2020 Hugo Hromic, Chris Bowdon
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Extended BCubed algorithm taken from:
+# Amigo, Enrique, et al. "A comparison of extrinsic clustering evaluation metrics
+# based on formal constraints." Information retrieval 12.4 (2009): 461-486.
+
+"""Generate extended BCubed evaluation for clustering using multiple processes.
+
+Parallelized versions of functions in bcubed.extended."""
+import numpy
+from multiprocessing import Pool, cpu_count
+from itertools import repeat
+from .extended import mult_precision, mult_recall
+
+def _p(el1, cdict, ldict):  # mean mult_precision over items sharing a cluster with el1
+    peers = [el2 for el2 in cdict if cdict[el1] & cdict[el2]]
+    return numpy.mean([mult_precision(el1, el2, cdict, ldict) for el2 in peers])
+
+def _r(el1, cdict, ldict):  # mean mult_recall over items sharing a label with el1
+    peers = [el2 for el2 in cdict if ldict[el1] & ldict[el2]]
+    return numpy.mean([mult_recall(el1, el2, cdict, ldict) for el2 in peers])
+
+def parallel(function, cdict, ldict, n_processes=None):
+    """Map function(item, cdict, ldict) over the items of cdict in a process pool."""
+    if n_processes is None:
+        n_processes = max(1, cpu_count() - 2)  # keep two cores free, minimum 1
+    with Pool(n_processes) as pool:
+        return pool.starmap(function, zip(cdict, repeat(cdict), repeat(ldict)))
+
+def precision(cdict, ldict, n_processes=None):
+    """Compute the overall extended BCubed precision using multiple processes.
+
+    Parameters
+    ==========
+    cdict: dict(item: set(cluster-ids))
+        The cluster assignments to be evaluated
+    ldict: dict(item: set(cluster-ids))
+        The ground truth clustering
+    n_processes: optional integer
+        Number of processes to use (defaults to number of CPU cores - 2, at least 1)
+
+    Returns the mean of the per-item precisions over all items of cdict.
+    """
+    return numpy.mean(parallel(_p, cdict, ldict, n_processes))
+
+def recall(cdict, ldict, n_processes=None):
+    """Compute the overall extended BCubed recall using multiple processes.
+
+    Parameters
+    ==========
+    cdict: dict(item: set(cluster-ids))
+        The cluster assignments to be evaluated
+    ldict: dict(item: set(cluster-ids))
+        The ground truth clustering
+    n_processes: optional integer
+        Number of processes to use (defaults to number of CPU cores - 2, at least 1)
+
+    Returns the mean of the per-item recalls over all items of cdict.
+    """
+    return numpy.mean(parallel(_r, cdict, ldict, n_processes))
diff --git a/test/test_bcubed.py b/test/test_bcubed.py
new file mode 100644
index 0000000..50db358
--- /dev/null
+++ b/test/test_bcubed.py
@@ -0,0 +1,50 @@
+import bcubed
+import bcubed.parallel
+
+
+def test_precision():
+    """Serial precision on two four-item clusterings with known scores."""
+    clusters = {0: {0}, 1: {1}, 2: {0}, 3: {1}}
+
+    # Same grouping as the clustering, only the ids are swapped.
+    assert bcubed.precision(clusters, {0: {1}, 1: {0}, 2: {1}, 3: {0}}) == 1
+    # Items 0 and 2 are clustered together but carry different labels.
+    assert bcubed.precision(clusters, {0: {0}, 1: {1}, 2: {2}, 3: {1}}) == 0.75
+
+
+def test_recall():
+    """Serial recall on two four-item clusterings with known scores."""
+    cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}}
+
+    # 2/3 is not exactly representable in binary, so compare with a tolerance.
+    assert abs(bcubed.recall(cdict, {0: {0}, 1: {1}, 2: {1}, 3: {1}}) - 2/3) < 1e-9
+    # Every ground-truth group lies inside a single cluster, so recall is exactly 1.
+    assert bcubed.recall(cdict, {0: {0}, 1: {1}, 2: {2}, 3: {1}}) == 1
+
+
+def test_parallel_precision():
+    """Parallel precision on the small known cases plus a 5000-item case."""
+    clusters = {0: {0}, 1: {1}, 2: {0}, 3: {1}}
+    # Same grouping as the clustering, only the ids are swapped.
+    assert bcubed.parallel.precision(clusters, {0: {1}, 1: {0}, 2: {1}, 3: {0}}) == 1
+    # Items 0 and 2 are clustered together but carry different labels.
+    assert bcubed.parallel.precision(clusters, {0: {0}, 1: {1}, 2: {2}, 3: {1}}) == 0.75
+
+    # 5000 singleton items: just enough to make the parallelism worth it.
+    cdict = {i: {i} for i in range(5000)}
+    ldict = {i: {i} for i in range(5000)}
+    assert bcubed.parallel.precision(cdict, ldict) == 1
+
+
+def test_parallel_recall():
+    """Parallel recall on the small known cases plus a 5000-item case."""
+    clusters = {0: {0}, 1: {1}, 2: {0}, 3: {1}}
+    # 2/3 is not exactly representable in binary, so compare with a tolerance.
+    assert abs(bcubed.parallel.recall(clusters, {0: {0}, 1: {1}, 2: {1}, 3: {1}}) - 2/3) < 1e-9
+    # Every ground-truth group lies inside a single cluster, so recall is exactly 1.
+    assert bcubed.parallel.recall(clusters, {0: {0}, 1: {1}, 2: {2}, 3: {1}}) == 1
+
+    # 5000 singleton items: just enough to make the parallelism worth it.
+    cdict = {i: {i} for i in range(5000)}
+    ldict = {i: {i} for i in range(5000)}
+    assert bcubed.parallel.recall(cdict, ldict) == 1