Skip to content

Commit 45d07e4

Browse files
authored
Merge pull request #142 from man-group/sharding_fix
#95 - Shard GridFS chunks if the Mongo instance supports it
2 parents 660f645 + 5a957c4 commit 45d07e4

File tree

4 files changed

+34
-5
lines changed

4 files changed

+34
-5
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
0.6.1 (2023-09-13)
2+
------------------
3+
* Feature: GridFS document storage in Mongo-backed instances is now sharded if the mongo server supports it.
4+
15
0.6.0 (2023-09-01)
26
------------------
37
* Feature: Reports are now grouped by their containing folder on the main UI.

notebooker/serialization/mongo.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,22 @@ def load_files_from_gridfs(result_data_store: gridfs.GridFS, result: Dict, do_re
8686

8787

8888
class MongoResultSerializer(ABC):
89+
instance = None
90+
91+
def __new__(cls, *args, **kwargs):
92+
if not isinstance(cls.instance, cls):
93+
cls.instance = object.__new__(cls)
94+
return cls.instance
95+
8996
# This class is the interface between Mongo and the rest of the application
9097
def __init__(self, database_name="notebooker", mongo_host="localhost", result_collection_name="NOTEBOOK_OUTPUT"):
9198
self.database_name = database_name
9299
self.mongo_host = mongo_host
93100
self.result_collection_name = result_collection_name
94-
mongo_connection = self.get_mongo_database()
95-
self.library = mongo_connection[result_collection_name]
96-
self.result_data_store = gridfs.GridFS(mongo_connection, "notebook_data")
101+
102+
mongo_database = self.get_mongo_database()
103+
self.library = mongo_database[result_collection_name]
104+
self.result_data_store = gridfs.GridFS(mongo_database, "notebook_data")
97105

98106
def __init_subclass__(cls, cli_options: click.Command = None, **kwargs):
99107
if cli_options is None:
@@ -104,6 +112,16 @@ def __init_subclass__(cls, cli_options: click.Command = None, **kwargs):
104112
cls.cli_options = cli_options
105113
super().__init_subclass__(**kwargs)
106114

115+
def enable_sharding(self):
116+
conn = self.get_mongo_connection()
117+
try:
118+
conn.admin.command("enableSharding", self.database_name)
119+
conn.admin.command({"shardCollection": f"{self.database_name}.notebook_data.chunks",
120+
"key": {"files_id": 1, "n": 1}})
121+
logger.info(f"Successfully sharded GridFS collection for {self.database_name}")
122+
except pymongo.errors.OperationFailure:
123+
logger.error(f"Could not shard {self.database_name}. Continuing.")
124+
107125
def serializer_args_to_cmdline_args(self) -> List[str]:
108126
args = []
109127
for cli_arg in self.cli_options.params:

notebooker/web/app.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ def main(web_config: WebappConfig):
138138
GLOBAL_CONFIG = web_config
139139
flask_app = create_app(web_config)
140140
flask_app = setup_app(flask_app, web_config)
141+
serializer = get_serializer_from_cls(web_config.SERIALIZER_CLS, **web_config.SERIALIZER_CONFIG)
142+
try:
143+
serializer.enable_sharding()
144+
except AttributeError:
145+
pass
141146
start_app(web_config)
142147
logger.info("Notebooker is now running at http://0.0.0.0:%d", web_config.PORT)
143148
http_server = WSGIServer(("0.0.0.0", web_config.PORT), flask_app)

tests/unit/serialization/test_mongoose.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,17 @@ def test_mongo_filter_status():
2121
@patch("notebooker.serialization.mongo.gridfs")
2222
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_database")
2323
@patch("notebooker.serialization.mongo.MongoResultSerializer._get_all_job_ids")
24-
def test_get_latest_job_id_for_name_and_params(_get_all_job_ids, conn, gridfs):
24+
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_connection")
25+
def test_get_latest_job_id_for_name_and_params(conn, _get_all_job_ids, db, gridfs):
2526
serializer = MongoResultSerializer()
2627
serializer.get_latest_job_id_for_name_and_params("report_name", None)
2728
_get_all_job_ids.assert_called_once_with("report_name", None, as_of=None, limit=1)
2829

2930

3031
@patch("notebooker.serialization.mongo.gridfs")
3132
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_database")
32-
def test__get_all_job_ids(conn, gridfs):
33+
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_connection")
34+
def test__get_all_job_ids(conn, db, gridfs):
3335
serializer = MongoResultSerializer()
3436
serializer._get_all_job_ids("report_name", None, limit=1)
3537
serializer.library.aggregate.assert_called_once_with(

0 commit comments

Comments
 (0)