Skip to content

Commit 19b692f

Browse files
authored
Improved outputs handling when reading/writing from/to disk (#125)
* Fix missing parent argument for OutputsManager. * Process outputs in notebooks on disk. * Minor changes. * Fixing bugs and added saving phase method. * Better placeholder handling. * Addressing review feedback. * Fixing placeholder logic.
1 parent 8bdfa7f commit 19b692f

File tree

6 files changed

+193
-29
lines changed

6 files changed

+193
-29
lines changed

jupyter_server_documents/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def get_fileid_manager():
6868
self.settings["yroom_manager"] = self.yroom_manager
6969

7070
# Initialize OutputsManager
71-
self.outputs_manager = self.outputs_manager_class(config=self.config)
71+
self.outputs_manager = self.outputs_manager_class(parent=self)
7272
self.settings["outputs_manager"] = self.outputs_manager
7373

7474
# Serve Jupyter Collaboration API on

jupyter_server_documents/outputs/manager.py

Lines changed: 161 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import os
33
from pathlib import Path, PurePath
44
import shutil
5+
import uuid
56

67
from pycrdt import Map
8+
import nbformat
79

810
from traitlets.config import LoggingConfigurable
911
from traitlets import Dict, Instance, Int, default
@@ -112,18 +114,18 @@ def get_stream(self, file_id, cell_id):
112114
output = f.read()
113115
return output
114116

115-
def write(self, file_id, cell_id, output, display_id=None):
117+
def write(self, file_id, cell_id, output, display_id=None, asdict: bool = False) -> Map | dict:
116118
"""Write a new output for file_id and cell_id.
117119
118120
Returns a placeholder output (pycrdt.Map) or None if no placeholder
119121
output should be written to the ydoc.
120122
"""
121-
placeholder = self.write_output(file_id, cell_id, output, display_id)
123+
placeholder = self.write_output(file_id, cell_id, output, display_id, asdict=asdict)
122124
if output["output_type"] == "stream" and self.stream_limit is not None:
123-
placeholder = self.write_stream(file_id, cell_id, output, placeholder)
125+
placeholder = self.write_stream(file_id, cell_id, output, placeholder, asdict=asdict)
124126
return placeholder
125127

126-
def write_output(self, file_id, cell_id, output, display_id=None):
128+
def write_output(self, file_id, cell_id, output, display_id=None, asdict: bool = False) -> Map | dict:
127129
self._ensure_path(file_id, cell_id)
128130
index = self._compute_output_index(cell_id, display_id)
129131
path = self._build_path(file_id, cell_id, index)
@@ -132,9 +134,12 @@ def write_output(self, file_id, cell_id, output, display_id=None):
132134
f.write(data)
133135
url = create_output_url(file_id, cell_id, index)
134136
self.log.info(f"Wrote output: {url}")
135-
return create_placeholder_output(output["output_type"], url)
137+
placeholder = create_placeholder_dict(output["output_type"], url)
138+
if not asdict:
139+
placeholder = Map(placeholder)
140+
return placeholder
136141

137-
def write_stream(self, file_id, cell_id, output, placeholder) -> Map:
142+
def write_stream(self, file_id, cell_id, output, placeholder, asdict : bool = False) -> Map | dict:
138143
# How many stream outputs have been written for this cell previously
139144
count = self._stream_count.get(cell_id, 0)
140145

@@ -156,7 +161,9 @@ def write_stream(self, file_id, cell_id, output, placeholder) -> Map:
156161
placeholder = placeholder
157162
elif count == self.stream_limit:
158163
# Return a link to the full stream output
159-
placeholder = create_placeholder_output("display_data", url, full=True)
164+
placeholder = create_placeholder_dict("display_data", url, full=True)
165+
if not asdict:
166+
placeholder = Map(placeholder)
160167
elif count > self.stream_limit:
161168
# Return None to indicate that no placeholder should be written to the ydoc
162169
placeholder = None
@@ -180,10 +187,152 @@ def clear(self, file_id, cell_id=None):
180187
except FileNotFoundError:
181188
pass
182189

190+
def process_loaded_notebook(self, file_id: str, file_data: dict) -> dict:
191+
"""Process a loaded notebook and handle outputs through the outputs manager.
183192
184-
def create_output_url(file_id: str, cell_id: str, output_index: int = None) -> str:
193+
This method processes a notebook that has been loaded from disk.
194+
If the notebook metadata has placeholder_outputs set to True,
195+
outputs are loaded from disk and set as the cell outputs.
196+
197+
Args:
198+
file_id (str): The file identifier
199+
file_data (dict): The file data containing the notebook content
200+
from calling ContentsManager.get()
201+
202+
Returns:
203+
dict: The modified file data with processed outputs
204+
"""
205+
self.log.info(f"Processing loaded notebook: {file_id}")
206+
207+
# Notebook content is a tree of nbformat.NotebookNode objects,
208+
# which are a subclass of dict.
209+
nb = file_data['content']
210+
211+
# Check if the notebook metadata has placeholder_outputs set to True
212+
if nb.get('metadata', {}).get('placeholder_outputs') is True:
213+
nb = self._process_loaded_placeholders(file_id=file_id, nb=nb)
214+
else:
215+
nb = self._process_loaded_no_placeholders(file_id=file_id, nb=nb)
216+
217+
file_data['content'] = nb
218+
return file_data
219+
220+
def _process_loaded_placeholders(self, file_id: str, nb: dict) -> dict:
221+
"""Process a notebook with placeholder_outputs metadata set to True.
222+
223+
This method processes notebooks that have been saved with placeholder outputs.
224+
It attempts to load actual outputs from disk and creates placeholder outputs
225+
for each code cell. If no outputs exist on disk for a cell, the cell's
226+
outputs are set to an empty list.
227+
228+
Args:
229+
file_id (str): The file identifier
230+
nb (dict): The notebook dictionary
231+
232+
Returns:
233+
dict: The notebook with placeholder outputs loaded from disk
234+
"""
235+
for cell in nb.get('cells', []):
236+
if cell.get('cell_type') == 'code':
237+
cell_id = cell.get('id', str(uuid.uuid4()))
238+
try:
239+
# Try to get outputs from disk
240+
output_strings = self.get_outputs(file_id=file_id, cell_id=cell_id)
241+
outputs = []
242+
for output_string in output_strings:
243+
output_dict = json.loads(output_string)
244+
placeholder = create_placeholder_dict(
245+
output_dict["output_type"],
246+
url=create_output_url(file_id, cell_id)
247+
)
248+
outputs.append(placeholder)
249+
cell['outputs'] = outputs
250+
except FileNotFoundError:
251+
# No outputs on disk for this cell, set empty outputs
252+
cell['outputs'] = []
253+
return nb
254+
255+
def _process_loaded_no_placeholders(self, file_id: str, nb: dict) -> dict:
256+
"""Process a notebook that doesn't have placeholder_outputs metadata.
257+
258+
This method processes notebooks with actual output data in the cells.
259+
It saves existing outputs to disk and replaces them with placeholder
260+
outputs that reference the saved files. Outputs that already have
261+
a URL in their metadata are left as-is.
262+
263+
Args:
264+
file_id (str): The file identifier
265+
nb (dict): The notebook dictionary
266+
267+
Returns:
268+
dict: The notebook with outputs saved to disk and replaced with placeholders
185269
"""
186-
Create the URL for an output or stream.
270+
for cell in nb.get('cells', []):
271+
if cell.get('cell_type') != 'code' or 'outputs' not in cell:
272+
continue
273+
274+
cell_id = cell.get('id', str(uuid.uuid4()))
275+
processed_outputs = []
276+
for output in cell.get('outputs', []):
277+
display_id = output.get('metadata', {}).get('display_id')
278+
url = output.get('metadata', {}).get('url')
279+
if url is None:
280+
# Save output to disk and replace with placeholder
281+
try:
282+
placeholder = self.write(
283+
file_id,
284+
cell_id,
285+
output,
286+
display_id,
287+
asdict=True,
288+
)
289+
except Exception as e:
290+
self.log.error(f"Error writing output: {e}")
291+
# If we can't write the output to disk, keep the original
292+
placeholder = output
293+
else:
294+
# In this case, there is a placeholder already so keep it
295+
placeholder = output
296+
297+
if placeholder is not None:
298+
# A placeholder of None means to not add to the YDoc
299+
processed_outputs.append(nbformat.from_dict(placeholder))
300+
301+
# Replace the outputs with processed ones
302+
cell['outputs'] = processed_outputs
303+
return nb
304+
305+
def process_saving_notebook(self, nb: dict) -> dict:
306+
"""Process a notebook before saving to disk.
307+
308+
This method is called when the yroom_file_api saves notebooks.
309+
It sets the placeholder_outputs key to True in the notebook metadata
310+
and clears the outputs array for each cell.
311+
312+
Args:
313+
nb (dict): The notebook dict
314+
315+
Returns:
316+
dict: The modified file data with placeholder_outputs set to True
317+
and empty outputs arrays
318+
"""
319+
# Ensure metadata exists
320+
if 'metadata' not in nb:
321+
nb['metadata'] = {}
322+
323+
# Set placeholder_outputs to True
324+
nb['metadata']['placeholder_outputs'] = True
325+
326+
# Clear outputs for all code cells, as they are saved to disk
327+
for cell in nb.get('cells', []):
328+
if cell.get('cell_type') == 'code':
329+
cell['outputs'] = []
330+
331+
return nb
332+
333+
334+
def create_output_url(file_id: str, cell_id: str, output_index: int = None) -> str:
335+
"""Create the URL for an output or stream.
187336
188337
Parameters:
189338
- file_id (str): The ID of the file.
@@ -198,9 +347,9 @@ def create_output_url(file_id: str, cell_id: str, output_index: int = None) -> s
198347
else:
199348
return f"/api/outputs/{file_id}/{cell_id}/{output_index}.output"
200349

201-
def create_placeholder_dict(output_type: str, url: str, full: bool = False):
202-
"""
203-
Build a placeholder output dict for the given output_type and url.
350+
def create_placeholder_dict(output_type: str, url: str, full: bool = False) -> dict:
351+
"""Build a placeholder output dict for the given output_type and url.
352+
204353
If full is True and output_type is "display_data", returns a display_data output
205354
with an HTML link to the full stream output.
206355
@@ -234,18 +383,3 @@ def create_placeholder_dict(output_type: str, url: str, full: bool = False):
234383
else:
235384
raise ValueError(f"Unknown output_type: {output_type}")
236385

237-
def create_placeholder_output(output_type: str, url: str, full: bool = False):
238-
"""
239-
Creates a placeholder output Map for the given output_type and url.
240-
If full is True and output_type is "display_data", creates a display_data output with an HTML link.
241-
242-
Parameters:
243-
- output_type (str): The type of the output.
244-
- url (str): The URL associated with the output.
245-
- full (bool): Whether to create a full output placeholder with a link.
246-
247-
Returns:
248-
- Map: The placeholder output `ycrdt.Map`.
249-
"""
250-
output_dict = create_placeholder_dict(output_type, url, full=full)
251-
return Map(output_dict)

jupyter_server_documents/outputs/output_processor.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,12 @@ async def output_task(self, msg_type, cell_id, content):
142142
# Convert from the message spec to the nbformat output structure
143143
if self.use_outputs_service:
144144
output = self.transform_output(msg_type, content, ydoc=False)
145-
output = self.outputs_manager.write(file_id, cell_id, output, display_id)
145+
output = self.outputs_manager.write(
146+
file_id=file_id,
147+
cell_id=cell_id,
148+
output=output,
149+
display_id=display_id
150+
)
146151
else:
147152
output = self.transform_output(msg_type, content, ydoc=True)
148153

jupyter_server_documents/rooms/yroom.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from jupyter_server_fileid.manager import BaseFileIdManager
2323
from jupyter_server.services.contents.manager import ContentsManager
2424
from pycrdt import TransactionEvent
25+
from ..outputs.manager import OutputsManager
2526

2627
class YRoom(LoggingConfigurable):
2728
"""
@@ -267,6 +268,11 @@ def event_logger(self) -> EventLogger:
267268
return self.parent.event_logger
268269

269270

271+
@property
272+
def outputs_manager(self) -> OutputsManager:
273+
return self.parent.outputs_manager
274+
275+
270276
def _init_ydoc(self) -> pycrdt.Doc:
271277
"""
272278
Initializes a YDoc, automatically binding its `_on_ydoc_update()`

jupyter_server_documents/rooms/yroom_file_api.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from .yroom import YRoom
1515
from jupyter_server_fileid.manager import BaseFileIdManager
1616
from jupyter_server.services.contents.manager import ContentsManager
17+
from ..outputs.manager import OutputsManager
1718

1819
class YRoomFileAPI(LoggingConfigurable):
1920
"""
@@ -143,6 +144,10 @@ def contents_manager(self) -> ContentsManager:
143144
"""
144145
return self.parent.contents_manager
145146

147+
@property
148+
def outputs_manager(self) -> OutputsManager:
149+
return self.parent.outputs_manager
150+
146151
@property
147152
def content_loaded(self) -> bool:
148153
"""
@@ -195,6 +200,10 @@ async def _load_content(self, jupyter_ydoc: YBaseDoc) -> None:
195200
format=self.file_format
196201
))
197202

203+
if self.file_type == "notebook":
204+
self.log.info(f"Processing outputs for loaded notebook: '{self.room_id}'.")
205+
file_data = self.outputs_manager.process_loaded_notebook(file_id=self.file_id, file_data=file_data)
206+
198207
# Set JupyterYDoc content and set `dirty = False` to hide the "unsaved
199208
# changes" icon in the UI
200209
jupyter_ydoc.source = file_data['content']
@@ -367,6 +376,9 @@ async def save(self, jupyter_ydoc: YBaseDoc):
367376
# being awaited.
368377
self._save_scheduled = False
369378

379+
if self.file_type == "notebook":
380+
content = self.outputs_manager.process_saving_notebook(content)
381+
370382
# Save the YDoc via the ContentsManager
371383
async with self._content_lock:
372384
file_data = await ensure_async(self.contents_manager.save(

jupyter_server_documents/rooms/yroom_manager.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from traitlets.config import LoggingConfigurable
88
from jupyter_server_fileid.manager import BaseFileIdManager
99

10+
from ..outputs.manager import OutputsManager
11+
1012
if TYPE_CHECKING:
1113
import logging
1214
from jupyter_server.extension.application import ExtensionApp
@@ -97,6 +99,11 @@ def event_logger(self) -> EventLogger:
9799
return self.parent.serverapp.event_logger
98100

99101

102+
@property
103+
def outputs_manager(self) -> OutputsManager:
104+
return self.parent.outputs_manager
105+
106+
100107
def get_room(self, room_id: str) -> YRoom | None:
101108
"""
102109
Returns the `YRoom` instance for a given room ID. If the instance does

0 commit comments

Comments
 (0)