From dedde452d291b840b7bdecd9d26512d6c556590a Mon Sep 17 00:00:00 2001 From: Mike Carlo Date: Fri, 26 Sep 2025 14:34:35 -0500 Subject: [PATCH] Add Lakehouse Table Maintenance example notebook Co-authored-by: James Bartlett <37491308+JamesDBartlett3@users.noreply.github.com> --- notebooks/Lakehouse Table Maintenance.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 notebooks/Lakehouse Table Maintenance.ipynb diff --git a/notebooks/Lakehouse Table Maintenance.ipynb b/notebooks/Lakehouse Table Maintenance.ipynb new file mode 100644 index 00000000..bb501819 --- /dev/null +++ b/notebooks/Lakehouse Table Maintenance.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["### Install the latest .whl package\n","\n","Check [here](https://pypi.org/project/semantic-link-labs/) to see the latest version."],"metadata":{"nteract":{"transient":{"deleting":false}}},"id":"5c27dfd1-4fe0-4a97-92e6-ddf78889aa93"},{"cell_type":"code","source":["%pip install semantic-link-labs"],"outputs":[],"execution_count":null,"metadata":{"jupyter":{"outputs_hidden":true,"source_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"d5cae9db-cef9-48a8-a351-9c5fcc99645c"},{"cell_type":"markdown","source":["### Import the library and necessary packages"],"metadata":{},"id":"b195eae8"},{"cell_type":"code","source":["import sempy_labs as labs\n","import sempy_labs.lakehouse as lake\n","import sempy_labs.admin as admin"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"1344e286"},{"cell_type":"markdown","source":["# List Items in a Workspace\n","\n","Use the `admin.list_items` function to filter the returned items down to lakehouses only.\n","[Docs on admin.list_items](link-URL)"],"metadata":{"nteract":{"transient":{"deleting":false}}},"id":"5a3fe6e8-b8aa-4447-812b-7931831e07fe"},{"cell_type":"code","source":["# List the lakehouses in a workspace\n","\n","lakehouse_list = admin.list_items(workspace='', type='Lakehouse')\n","lakehouse_list"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"78afeb4d-d78f-4467-920f-19b77b3d6336"},{"cell_type":"code","source":["# Alternatively, save the lakehouse name as a string for use in the cells below\n","lakehouse_name = ''\n","\n","lakehouse_name"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"fe3a7f3d-4c9f-43ff-a012-e5ccee7f0284"},{"cell_type":"code","source":["# List tables from the selected lakehouse\n","\n","lakehouse_tables = lake.get_lakehouse_tables(lakehouse=lakehouse_name)\n","lakehouse_tables"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"914f97bb-178c-4f47-9832-1ecc9da789ec"},{"cell_type":"markdown","source":["# Optimize\n","\n","**Optimize** refers to a set of table maintenance operations that improve query performance on Delta tables, primarily by compacting many small Parquet files into fewer, larger ones. 
These optimizations help reduce query execution time, lower small-file overhead, and enhance overall scalability in distributed data processing.\n","\n","Both Vacuum and Optimize work in Python notebooks and PySpark notebooks.\n","\n","[Documentation Link for Optimize function](https://semantic-link-labs.readthedocs.io/en/latest/sempy_labs.lakehouse.html#sempy_labs.lakehouse.optimize_lakehouse_tables)\n"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"8f9b59bd-af0e-4758-899e-121637f17d8f"},{"cell_type":"code","source":["# Run Optimize on a single table\n","\n","# Enter the table name as a string\n","lake.optimize_lakehouse_tables(lakehouse=lakehouse_name, tables='')"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"56d1af0f-6e3e-4608-a2f7-914fec4e5225"},{"cell_type":"code","source":["# Run Optimize on a list of tables\n","\n","table_names = ['','']\n","\n","lake.optimize_lakehouse_tables(lakehouse=lakehouse_name, tables=table_names)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"a9ae7d32-cc52-4130-a5e4-82e1f43ce9cb"},{"cell_type":"code","source":["# Run Optimize on the entire lakehouse\n","\n","lake.optimize_lakehouse_tables(lakehouse=lakehouse_name)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"ed168590-8c90-4bfc-a2e4-21fbfb97954f"},{"cell_type":"markdown","source":["# Vacuum Tables\n","\n","**Vacuum** is a maintenance operation used primarily with Delta Lake tables to clean up obsolete data files. It removes files that are no longer referenced by the Delta transaction log, helping to free up storage and improve performance. 
By default, it retains recently removed files for a set retention period (7 days) so that time travel and data recovery features continue to work.\n","\n","Both Vacuum and Optimize work in Python notebooks and PySpark notebooks.\n","\n","[Documentation Link for Vacuum function](https://semantic-link-labs.readthedocs.io/en/latest/sempy_labs.lakehouse.html#sempy_labs.lakehouse.vacuum_lakehouse_tables)"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"c5d9ff06-3d1c-4def-b2f9-f217cd849d66"},{"cell_type":"code","source":["# Run Vacuum on a single table\n","\n","lake.vacuum_lakehouse_tables(lakehouse=lakehouse_name, tables='')"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"9d470387-b2b4-40c0-810c-91da1eddecc5"},{"cell_type":"code","source":["# Run Vacuum on a list of tables\n","\n","lake.vacuum_lakehouse_tables(lakehouse=lakehouse_name, tables=table_names)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"2454627e-05bc-4cdd-9088-5c7e709c105a"},{"cell_type":"code","source":["# Run Vacuum on the entire lakehouse\n","\n","lake.vacuum_lakehouse_tables(lakehouse=lakehouse_name)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"c9b6876d-531f-4622-88bf-46b7778793ed"},{"cell_type":"markdown","source":["# Run Maintenance\n","\n","Documentation for [run_table_maintenance on Read the Docs for Semantic Link Labs](https://semantic-link-labs.readthedocs.io/en/stable/sempy_labs.lakehouse.html#sempy_labs.lakehouse.run_table_maintenance)."],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"97453646-d430-4c27-b7a7-2c6926b2b251"},{"cell_type":"code","source":["# Run maintenance operations on a specific table in the lakehouse\n","\n","# This function also accepts an optional v_order parameter (e.g., v_order=False)\n","lake.run_table_maintenance(lakehouse=lakehouse_name, table_name='', optimize=True, vacuum=True)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"1c5aa688-b746-4bf6-abf2-9fa87e8471c2"},{"cell_type":"markdown","source":["Run maintenance operations on a specific table in the lakehouse using the additional options:\n","- optimize\n","- vacuum\n","- retention_period\n","- v_order"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"0c4aa065-0477-4fc4-9c48-26b7196ec84a"},{"cell_type":"code","source":["# retention_period uses the d:hh:mm:ss format (here, 14 days)\n","lake.run_table_maintenance(lakehouse=lakehouse_name, table_name='', optimize=True, vacuum=True, retention_period='14:00:00:00', v_order=True)"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"8948609a-f0ba-4587-a7b3-ff1f1324f5ca"},{"cell_type":"code","source":["# Check for table v-ordering\n","\n","lake.is_v_ordered(lakehouse=lakehouse_name, 
table_name='')"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"jupyter_python"}},"id":"10faef00-b31b-4117-9c53-f9a326af88c1"}],"metadata":{"kernel_info":{"name":"jupyter","jupyter_kernel_name":"python3.11"},"kernelspec":{"name":"jupyter","display_name":"Jupyter"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"jupyter_python","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"widgets":{},"synapse_widget":{"state":{},"version":"0.1"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file