diff --git a/examples/document_functions/README.md b/examples/document_functions/README.md
new file mode 100644
index 0000000..ffa11ce
--- /dev/null
+++ b/examples/document_functions/README.md
@@ -0,0 +1,84 @@
+# Automated Function Documentation Generator
+
+This example demonstrates how to use Codegen to automatically generate comprehensive docstrings for functions by analyzing their dependencies and usage patterns within a codebase.
+
+## Overview
+
+The script uses Codegen's symbol analysis capabilities to:
+1. Identify functions without docstrings
+2. Analyze their dependencies and usages up to N degrees deep
+3. Generate contextually aware docstrings using AI
+
+## Key Features
+
+### Recursive Context Collection
+The script recursively collects both dependencies and usages to provide comprehensive context for docstring generation:
+
+```python
+def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[Symbol]]:
+    """Recursively collect dependencies and usages up to the specified degree."""
+    dependencies = set()
+    usages = set()
+
+    if degree > 0:
+        for dep in symbol.dependencies:
+            if isinstance(dep, Import):
+                dep = hop_through_imports(dep)
+            if isinstance(dep, Symbol):
+                dependencies.add(dep)
+                # Recursively collect nested context
+                dep_deps, dep_usages = get_extended_context(dep, degree - 1)
+                dependencies.update(dep_deps)
+                usages.update(dep_usages)
+        # ... usages are collected the same way; see run.py for the full implementation
+```
+
+### Import Resolution
+The script resolves import chains to find the actual symbol definitions:
+
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    """Finds the root symbol for an import"""
+    if isinstance(imp.imported_symbol, Import):
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol
+```
+
+## Usage
+
+1. Run the script on a target repository:
+```python
+codebase = Codebase.from_repo("your/repo", commit="commit_hash")
+run(codebase)
+```
+
+2. The script will:
+   - Process each function in the codebase
+   - Skip functions that already have docstrings
+   - Generate contextually aware docstrings for undocumented functions
+   - Commit changes incrementally for safe early termination
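+
+To sanity-check the context collection before documenting a whole repository, you can inspect what would be gathered for a single function. This is a minimal sketch, assuming `run.py` is importable from the current directory and that `"your/repo"` is replaced with a real repository; it only uses helpers already defined in this example:
+
+```python
+from codegen import Codebase
+
+from run import get_extended_context  # assumes run.py is on the import path
+
+codebase = Codebase.from_repo("your/repo", commit="commit_hash")
+
+# Pick the first function that is missing a docstring
+fn = next((f for f in codebase.functions if not f.docstring), None)
+if fn is not None:
+    deps, usages = get_extended_context(fn, degree=2)
+    print(f"{fn.name}: {len(deps)} dependencies, {len(usages)} usages collected as context")
+```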
+
+## Example Output
+
+The script provides detailed progress information:
+```
+[1/150] Skipping my_function - already has docstring
+[2/150] Generating docstring for process_data at src/utils.py
+  ✓ Generated docstring
+[3/150] Generating docstring for validate_input at src/validation.py
+  ✗ Failed to generate docstring
+```
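+
+## Adjusting Context Depth
+
+How much context is gathered for each function is controlled by the `N_DEGREE` constant inside `run()` in `run.py`. The snippet below is a sketch of the trade-off rather than a measured result: a larger value walks further along the dependency/usage graph, giving the AI more context at the cost of more traversal work and a longer prompt.
+
+```python
+# Inside run() in run.py
+N_DEGREE = 2  # default: also include dependencies/usages of direct dependencies/usages
+# N_DEGREE = 1 would collect only direct dependencies and usages
+```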
+
+## Features
+
+- **Intelligent Context Collection**: Analyzes both dependencies and usages to understand function purpose
+- **Import Resolution**: Follows import chains to find actual symbol definitions
+- **Incremental Commits**: Saves progress after each function for safe interruption
+- **Progress Tracking**: Detailed logging of processing status
+- **Existing Docstring Preservation**: Skips functions that are already documented
+
+## Use Cases
+
+- Documenting legacy codebases
+- Maintaining documentation standards in large projects
+- Onboarding new team members with better code documentation
+- Preparing codebases for public release
\ No newline at end of file
diff --git a/examples/document_functions/run.py b/examples/document_functions/run.py
new file mode 100644
index 0000000..3cc9912
--- /dev/null
+++ b/examples/document_functions/run.py
@@ -0,0 +1,119 @@
+import codegen
+from codegen import Codebase
+from codegen.sdk.core.external_module import ExternalModule
+from codegen.sdk.core.import_resolution import Import
+from codegen.sdk.core.symbol import Symbol
+
+
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    """Finds the root symbol for an import"""
+    if isinstance(imp.imported_symbol, Import):
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol
+
+
+def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[Symbol]]:
+    """Recursively collect dependencies and usages up to the specified degree.
+
+    Args:
+        symbol: The symbol to collect context for
+        degree: How many levels deep to collect dependencies and usages
+
+    Returns:
+        A tuple of (dependencies, usages) where each is a set of related Symbol objects
+    """
+    dependencies = set()
+    usages = set()
+
+    if degree > 0:
+        # Collect direct dependencies
+        for dep in symbol.dependencies:
+            # Hop through imports to find the root symbol
+            if isinstance(dep, Import):
+                dep = hop_through_imports(dep)
+
+            if isinstance(dep, Symbol) and dep not in dependencies:
+                dependencies.add(dep)
+                dep_deps, dep_usages = get_extended_context(dep, degree - 1)
+                dependencies.update(dep_deps)
+                usages.update(dep_usages)
+
+        # Collect usages of the current symbol
+        for usage in symbol.usages:
+            usage_symbol = usage.usage_symbol
+            # Hop through imports for usage symbols too
+            if isinstance(usage_symbol, Import):
+                usage_symbol = hop_through_imports(usage_symbol)
+
+            if isinstance(usage_symbol, Symbol) and usage_symbol not in usages:
+                usages.add(usage_symbol)
+                usage_deps, usage_usages = get_extended_context(usage_symbol, degree - 1)
+                dependencies.update(usage_deps)
+                usages.update(usage_usages)
+
+    return dependencies, usages
+
+
+@codegen.function("document-functions")
+def run(codebase: Codebase):
+    # Define the maximum degree of dependencies and usages to consider for context
+    N_DEGREE = 2
+
+    # Filter out test and tutorial functions first
+    functions = [
+        f
+        for f in codebase.functions
+        if not any(pattern in f.name.lower() for pattern in ["test", "tutorial"])
+        and not any(pattern in f.filepath.lower() for pattern in ["test", "tutorial"])
+    ]
+
+    # Track progress for user feedback
+    total_functions = len(functions)
+    processed = 0
+
+    print(f"Found {total_functions} functions to process (excluding tests and tutorials)")
+
+    for function in functions:
+        processed += 1
+
+        # Skip if the function already has a docstring
+        if function.docstring:
+            print(f"[{processed}/{total_functions}] Skipping {function.name} - already has docstring")
+            continue
+
+        print(f"[{processed}/{total_functions}] Generating docstring for {function.name} at {function.filepath}")
+
+        # Collect context using N-degree dependencies and usages
+        dependencies, usages = get_extended_context(function, N_DEGREE)
+
+        # Generate a docstring using the AI with the collected context
+        docstring = codebase.ai(
+            """
+            Generate a docstring for this function using the provided context.
+            The context includes:
+            - dependencies: other symbols this function depends on
+            - usages: other symbols that use this function
+            """,
+            target=function,
+            # `codebase.ai` is smart about stringifying symbols
+            context={"dependencies": list(dependencies), "usages": list(usages)},
+        )
+
+        # Set the generated docstring on the function
+        if docstring:
+            function.set_docstring(docstring)
+            print("  ✓ Generated docstring")
+        else:
+            print("  ✗ Failed to generate docstring")
+
+        # Commit after each function so work is saved incrementally
+        # This allows for:
+        # 1. Safe early termination - progress won't be lost
+        # 2. Immediate feedback - can check results while running
+        # 3. Smaller atomic changes - easier to review/revert if needed
+        codebase.commit()
+
+    print(f"\nCompleted processing {total_functions} functions")
+
+
+if __name__ == "__main__":
+    print("Parsing codebase...")
+    codebase = Codebase.from_repo("fastapi/fastapi", commit="887270ff8a54bb58c406b0651678a27589793d2f")
+
+    print("Running function...")
+    run(codebase)