diff --git a/examples/pzmm_generate_complete_model_card.ipynb b/examples/pzmm_generate_complete_model_card.ipynb index f49efd56..60124580 100644 --- a/examples/pzmm_generate_complete_model_card.ipynb +++ b/examples/pzmm_generate_complete_model_card.ipynb @@ -1716,12 +1716,14 @@ ], "source": [ "# Step 13: Generate requirements files\n", - "requirements_json = pzmm.JSONFiles.create_requirements_json(output_path)\n", + "requirements_json = pzmm.JSONFiles.create_requirements_json(output_path, create_requirements_txt=False)\n", "\n", "import json\n", "print(json.dumps(requirements_json, sort_keys=True, indent=4))\n", "\n", "for requirement in requirements_json:\n", + " # Example: Replace sklearn with scikit-learn in requirements\n", + " # (This is redundant in newer versions but shows how to modify package names)\n", " if 'sklearn' in requirement['step']:\n", " requirement['command'] = requirement[\"command\"].replace('sklearn', 'scikit-learn')\n", " requirement['step'] = requirement['step'].replace('sklearn', 'scikit-learn')\n", diff --git a/examples/pzmm_generate_requirements_json.ipynb b/examples/pzmm_generate_requirements_json.ipynb index 604ae800..9a4314cf 100644 --- a/examples/pzmm_generate_requirements_json.ipynb +++ b/examples/pzmm_generate_requirements_json.ipynb @@ -14,16 +14,18 @@ "id": "e9b8cb7c-1974-4af5-8992-d51f90fcfe5b", "metadata": {}, "source": [ - "# Automatic Generation of the requirements.json File\n", + "# Automatic Generation of the requirements.json or requirements.txt File\n", "In order to validate Python models within a container publishing destination, the Python packages which contain the modules that are used in the Python score code file and its score resource files must be installed in the run-time container. 
You can install the packages when you publish a Python model or decision that contains a Python model to a container publishing destination by adding a `requirements.json` file that includes the package install statements to your model.\n", "\n", "This notebook provides an example execution and assessment of the create_requirements_json() function added in python-sasctl v1.8.0. The aim of this function is help to create the instructions (aka the `requirements.json` file) for a lightweight Python container in SAS Model Manager. Lightweight here meaning that the container will only install the packages found in the model's pickle files and python scripts.\n", "\n", + "Additionally, the create_requirements_json() function provides an optional parameter `create_requirements_txt` which when set to `True` will generate a requirements.txt file alongside the requirements.json file. By default this option is set to `False`. The requirements.txt file is needed when deploying Python models to SAS Event Stream Processing, which requires this format for package installation in their environment. While SAS Model Manager continues to use the requirements.json format, adding the requirements.txt file ensures compatibility across both platforms. \n", + "\n", "### **User Warnings**\n", "The methods utilized in this function can determine package dependencies and versions from provided scripts and pickle files, but there are some stipulations that need to be considered:\n", "\n", "1. If run outside of the development environment that the model was created in, the create_requirements_json() function **CANNOT** determine the required package _versions_ accurately. \n", - "2. Not all Python packages have matching import and install names and as such some of the packages added to the requirements.json file may be incorrectly named (i.e. `import sklearn` vs `pip install scikit-learn`).\n", + "2. 
Not all Python packages have matching import and install names and as such some of the packages added to the requirements.json file may be incorrectly named (i.e. `import sklearn` vs `pip install scikit-learn`). Some of the major packages with differing import and install names are automatically converted. \n", "\n", "As such, it is recommended that the user check over the requirements.json file for package name and version accuracy before deploying to a run-time container in SAS Model Manager." ] @@ -63,7 +65,7 @@ "outputs": [], "source": [ "model_dir = Path.cwd() / \"data/hmeqModels/DecisionTreeClassifier\"\n", - "requirements_json = pzmm.JSONFiles.create_requirements_json(model_dir)" + "requirements_json = pzmm.JSONFiles.create_requirements_json(model_dir, create_requirements_txt=False)" ] }, { @@ -145,6 +147,8 @@ ], "source": [ "for requirement in requirements_json:\n", + " # Example: Replace sklearn with scikit-learn in requirements\n", + " # (This is redundant in newer versions but shows how to modify package names)\n", " if 'sklearn' in requirement['step']:\n", " requirement['command'] = requirement[\"command\"].replace('sklearn', 'scikit-learn')\n", " requirement['step'] = requirement['step'].replace('sklearn', 'scikit-learn')\n", diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 1c0c560d..8eb98bf9 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -1614,6 +1614,7 @@ def create_requirements_json( cls, model_path: Union[str, Path, None] = Path.cwd(), output_path: Union[str, Path, None] = None, + create_requirements_txt: bool = False, ) -> Union[dict, None]: """ Searches the model directory for Python scripts and pickle files and @@ -1636,7 +1637,11 @@ def create_requirements_json( environment. When provided with an output_path argument, this function outputs a JSON file - named "requirements.json". Otherwise, a list of dicts is returned. + named "requirements.json". 
If create_requirements_txt is True, it will also + create a requirements.txt file. Otherwise, a list of dicts is returned. + + Note: requirements.txt file is only created when both output_path and + create_requirements_txt are specified. Parameters ---------- model_path : str or pathlib.Path, optional The path to a Python project, by default the current working directory. output_path : str or pathlib.Path, optional The path for the output requirements.json file. The default value is None. + create_requirements_txt : bool, optional + Whether to also create a requirements.txt file in addition to the + requirements.json file. This is useful for SAS Event Stream Processing + environments. The default value is False. Returns ------- @@ -1662,11 +1671,57 @@ package_list = list(set(list(_flatten(package_list)))) package_list = cls.remove_standard_library_packages(package_list) package_and_version = cls.get_local_package_version(package_list) + # Identify packages with missing versions missing_package_versions = [ item[0] for item in package_and_version if not item[1] ] + IMPORT_TO_INSTALL_MAPPING = { + # Data Science & ML Core + "sklearn": "scikit-learn", + "skimage": "scikit-image", + "cv2": "opencv-python", + "PIL": "Pillow", + # Data Formats & Parsing + "yaml": "PyYAML", + "bs4": "beautifulsoup4", + "docx": "python-docx", + "pptx": "python-pptx", + # Date & Time Utilities + "dateutil": "python-dateutil", + # Database Connectors + "MySQLdb": "mysqlclient", + "psycopg2": "psycopg2-binary", + # System & Platform + "win32api": "pywin32", + "win32com": "pywin32", + # Scientific Libraries + "Bio": "biopython", + } + + # Map import names to their corresponding package installation names + package_and_version = [ + (IMPORT_TO_INSTALL_MAPPING.get(name, name), version) + for name, version in package_and_version + ] + + if create_requirements_txt: + requirements_txt = "" + if missing_package_versions: + requirements_txt += "# Warning- The 
existence and/or versions for the following packages could not be determined:\n" + requirements_txt += "# " + ", ".join(missing_package_versions) + "\n" + + for package, version in package_and_version: + if version: + requirements_txt += f"{package}=={version}\n" + + if output_path: + with open( # skipcq: PTC-W6004 + Path(output_path) / "requirements.txt", "w" + ) as file: + file.write(requirements_txt) + # Create a list of dicts related to each package or warning json_dicts = [] if missing_package_versions: @@ -1800,16 +1855,16 @@ def find_imports(file_path: Union[str, Path]) -> List[str]: file_text = file.read() # Parse the file to get the abstract syntax tree representation tree = ast.parse(file_text) - modules = [] + modules = set() # Walk through each node in the ast to find import calls for node in ast.walk(tree): # Determine parent module for `from * import *` calls if isinstance(node, ast.ImportFrom): - modules.append(node.module) + modules.add(node.module.split(".")[0]) elif isinstance(node, ast.Import): for name in node.names: - modules.append(name.name) + modules.add(name.name.split(".")[0]) modules = list(set(modules)) try: diff --git a/tests/unit/test_write_json_files.py b/tests/unit/test_write_json_files.py index b0a3c6a0..3321fc30 100644 --- a/tests/unit/test_write_json_files.py +++ b/tests/unit/test_write_json_files.py @@ -699,8 +699,9 @@ def test_create_requirements_json(change_dir): dtc = dtc.fit(x_train, y_train) with open(tmp_dir / "DecisionTreeClassifier.pickle", "wb") as pkl_file: pickle.dump(dtc, pkl_file) - jf.create_requirements_json(tmp_dir, Path(tmp_dir)) + jf.create_requirements_json(tmp_dir, Path(tmp_dir), True) assert (Path(tmp_dir) / "requirements.json").exists() + assert (Path(tmp_dir) / "requirements.txt").exists() json_dict = jf.create_requirements_json(tmp_dir) expected = [ @@ -709,13 +710,20 @@ def test_create_requirements_json(change_dir): "command": f"pip install numpy=={np.__version__}", }, { - "step": "install sklearn", - 
"command": f"pip install sklearn=={sk.__version__}", + "step": "install scikit-learn", + "command": f"pip install scikit-learn=={sk.__version__}", }, ] unittest.TestCase.maxDiff = None unittest.TestCase().assertCountEqual(json_dict, expected) + # Verify requirements.txt content + with open(Path(tmp_dir) / "requirements.txt", "r") as file: + requirements_content = [line.strip() for line in file.readlines()] + + assert f"numpy=={np.__version__}" in requirements_content + assert f"scikit-learn=={sk.__version__}" in requirements_content + class TestAssessBiasHelpers(unittest.TestCase): md_1 = pd.DataFrame({"Value": [0], "Base": ["A"], "Compare": ["C"]})