From 056280cb352d65f959d87cd8b79ff8cc19aedbeb Mon Sep 17 00:00:00 2001 From: Chris Bowdon Date: Wed, 29 Jan 2020 20:44:45 +0000 Subject: [PATCH] Implement parallel versions of precision and recall functions --- README.md | 6 ++++ bcubed/parallel.py | 73 +++++++++++++++++++++++++++++++++++++++++++++ test/test_bcubed.py | 50 +++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 bcubed/parallel.py create mode 100644 test/test_bcubed.py diff --git a/README.md b/README.md index 85def88..66ac76b 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,12 @@ fscore = bcubed.fscore(precision, recall, beta=0.5) # weights precision higher A complete example can be found in the included `example.py` file, where the examples of the source publication are used. +Parallelised versions of the precision and recall functions are provided in `bcubed.parallel`. These can speed up the calculations for large datasets, but the parallelism carries an overhead, so they are unhelpful for smaller datasets. As a rule of thumb, they are useful when there are more than about 5,000 items, depending on the system. + +## Running tests + +Tests can be run by installing `pytest` from pip and running `pytest test`. + ## License This software is under the **Apache License 2.0**. diff --git a/bcubed/parallel.py b/bcubed/parallel.py new file mode 100644 index 0000000..2161dde --- /dev/null +++ b/bcubed/parallel.py @@ -0,0 +1,73 @@ +# Simple extended BCubed implementation in Python for clustering evaluation +# Copyright 2020 Hugo Hromic, Chris Bowdon +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Extended BCubed algorithm taken from: +# Amigo, Enrique, et al. "A comparison of extrinsic clustering evaluation metrics +# based on formal constraints." Information retrieval 12.4 (2009): 461-486. + +"""Generate extended BCubed evaluation for clustering.""" + +"""Parallelized versions of functions in bcubed.extended.""" +import numpy +from multiprocessing import Pool, cpu_count +from itertools import repeat +from .extended import mult_precision, mult_recall + +def _p(el1, cdict, ldict): + return numpy.mean([mult_precision(el1, el2, cdict, ldict) + for el2 in cdict if cdict[el1] & cdict[el2]]) + +def _r(el1, cdict, ldict): + return numpy.mean([mult_recall(el1, el2, cdict, ldict) + for el2 in cdict if ldict[el1] & ldict[el2]]) + +def parallel(function, cdict, ldict, n_processes=None): + if n_processes is None: + n_processes = max(1, cpu_count() - 2) + + with Pool(n_processes) as pool: + return pool.starmap(function, zip(cdict.keys(), repeat(cdict), repeat(ldict))) + +def precision(cdict, ldict, n_processes=None): + """Computes overall extended BCubed precision for the C and L dicts + using multiple processes for parallelism. 
+ + Parameters + ========== + cdict: dict(item: set(cluster-ids)) + The cluster assignments to be evaluated + ldict: dict(item: set(cluster-ids)) + The ground truth clustering + n_processes: optional integer + Number of processes to use (defaults to number of CPU cores - 2, minimum 1) + """ + p_per_el = parallel(_p, cdict, ldict, n_processes) + return numpy.mean(p_per_el) + +def recall(cdict, ldict, n_processes=None): + """Computes overall extended BCubed recall for the C and L dicts + using multiple processes for parallelism. + + Parameters + ========== + cdict: dict(item: set(cluster-ids)) + The cluster assignments to be evaluated + ldict: dict(item: set(cluster-ids)) + The ground truth clustering + n_processes: optional integer + Number of processes to use (defaults to number of CPU cores - 2, minimum 1) + """ + r_per_el = parallel(_r, cdict, ldict, n_processes) + return numpy.mean(r_per_el) diff --git a/test/test_bcubed.py b/test/test_bcubed.py new file mode 100644 index 0000000..50db358 --- /dev/null +++ b/test/test_bcubed.py @@ -0,0 +1,50 @@ +import bcubed +import bcubed.parallel + + +def test_precision(): + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {1}, 1: {0}, 2: {1}, 3: {0}} + assert bcubed.precision(cdict, ldict) == 1 + + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {2}, 3: {1}} + assert bcubed.precision(cdict, ldict) == 0.75 + + +def test_recall(): + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {1}, 3: {1}} + assert bcubed.recall(cdict, ldict) == 2/3 + + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {2}, 3: {1}} + assert bcubed.recall(cdict, ldict) == 1 + + +def test_parallel_precision(): + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {1}, 1: {0}, 2: {1}, 3: {0}} + assert bcubed.parallel.precision(cdict, ldict) == 1 + + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {2}, 3: {1}} + assert bcubed.parallel.precision(cdict, ldict) == 0.75 + + cdict = {i: {i} 
for i in range(5000)} # just enough to make it worth it + ldict = {i: {i} for i in range(5000)} + assert bcubed.parallel.precision(cdict, ldict) == 1 + + +def test_parallel_recall(): + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {1}, 3: {1}} + assert bcubed.parallel.recall(cdict, ldict) == 2/3 + + cdict = {0: {0}, 1: {1}, 2: {0}, 3: {1}} + ldict = {0: {0}, 1: {1}, 2: {2}, 3: {1}} + assert bcubed.parallel.recall(cdict, ldict) == 1 + + cdict = {i: {i} for i in range(5000)} # just enough to make it worth it + ldict = {i: {i} for i in range(5000)} + assert bcubed.parallel.recall(cdict, ldict) == 1