
Commit 3e11cae

Commit message: updates
1 parent: d9e7c7d

File tree: 4 files changed, +566 -107 lines changed


llms.py

Lines changed: 177 additions & 97 deletions
@@ -1,111 +1,134 @@
 import ast
 import pathlib
 import re
+from typing import Optional, Sequence, cast

+import astroid  # type: ignore
 from gitignore_parser import parse_gitignore  # type: ignore


-def compress_python_code(content):
+def compress_python_code(content: str) -> str:
     """Compress Python code while preserving docstrings."""
     try:
         parsed_ast = ast.parse(content)
     except SyntaxError:
-        # If the code cannot be parsed, return it unmodified
         return content

     class RemoveCommentsAndDocstrings(ast.NodeTransformer):
-        def visit_FunctionDef(self, node):
-            self.generic_visit(node)
-            return node
-
-        def visit_ClassDef(self, node):
-            self.generic_visit(node)
-            return node
-
-        def visit_AsyncFunctionDef(self, node):
-            self.generic_visit(node)
-            return node
-
-        def visit_Module(self, node):
-            self.generic_visit(node)
-            return node
-
-        def visit_Expr(self, node):
-            if not isinstance(node.value, ast.Constant):
-                # Remove expressions that are not docstrings
-                return None
-            return node
-
-        def visit_Str(self, node):
-            # Keep docstrings, remove other strings
+        def visit_FunctionDef(self, node: ast.FunctionDef) -> Optional[ast.FunctionDef]:
+            if (
+                ast.get_docstring(node) is not None
+                and node.body
+                and isinstance(node.body[0], ast.Expr)
+                and isinstance(node.body[0].value, ast.Constant)
+            ):
+                docstring = node.body[0]
+                node.body = [docstring] + [
+                    n for n in map(self.visit, node.body[1:]) if n is not None
+                ]
+                return node
+            node.body = [n for n in map(self.visit, node.body) if n is not None]
             return node

-        def visit_Constant(self, node):
-            # For Python 3.8 and above (ast.Constant replaces ast.Str)
-            if isinstance(node.value, str):
-                # Keep docstrings
+        def visit_ClassDef(self, node: ast.ClassDef) -> Optional[ast.ClassDef]:
+            if (
+                ast.get_docstring(node) is not None
+                and node.body
+                and isinstance(node.body[0], ast.Expr)
+                and isinstance(node.body[0].value, ast.Constant)
+            ):
+                docstring = node.body[0]
+                node.body = [docstring] + [
+                    n for n in map(self.visit, node.body[1:]) if n is not None
+                ]
                 return node
-            return None
-
-        def visit_Import(self, node):
+            node.body = [n for n in map(self.visit, node.body) if n is not None]
             return node

-        def visit_ImportFrom(self, node):
+        def visit_Module(self, node: ast.Module) -> Optional[ast.Module]:
+            if (
+                ast.get_docstring(node) is not None
+                and node.body
+                and isinstance(node.body[0], ast.Expr)
+                and isinstance(node.body[0].value, ast.Constant)
+            ):
+                docstring = node.body[0]
+                node.body = [docstring] + [
+                    n for n in map(self.visit, node.body[1:]) if n is not None
+                ]
+                return node
+            node.body = [n for n in map(self.visit, node.body) if n is not None]
             return node

-        def visit_Pass(self, node):
-            return node
+        def generic_visit(self, node: ast.AST) -> ast.AST:
+            if isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
+                return ast.Pass()
+            return super().generic_visit(node)

-    transformer = RemoveCommentsAndDocstrings()
-    cleaned_ast = transformer.visit(parsed_ast)
-    ast.fix_missing_locations(cleaned_ast)
-    compressed_code = ast.unparse(cleaned_ast)
-    return compressed_code
+    try:
+        transformer = RemoveCommentsAndDocstrings()
+        cleaned_ast = transformer.visit(parsed_ast)
+        ast.fix_missing_locations(cleaned_ast)
+        # Use astroid to handle AST to source code conversion
+        astroid_module = astroid.parse(ast.dump(cleaned_ast))
+        compressed_code = cast(str, astroid_module.as_string())
+        # Remove multiple blank lines
+        compressed_code = re.sub(r"\n\s*\n\s*\n", "\n\n", compressed_code)
+        return compressed_code
+    except Exception as e:
+        print(f"Warning: Error compressing Python code: {e}")
+        return content


-def compress_code_content(content, file_extension):
+def compress_code_content(content: str, file_extension: str) -> str:
     """Compress code content based on the file extension."""
-    if file_extension == ".py":
+    if file_extension in (".py", ".pyi"):
         return compress_python_code(content)
-    else:
-        # For other languages, basic comment and blank line removal
-        return basic_compress(content, file_extension)
+    return basic_compress(content, file_extension)


-def basic_compress(content, file_extension):
+def basic_compress(content: str, file_extension: str) -> str:
     """Basic compression: remove comments and multiple blank lines."""
-    if file_extension in [".js", ".java", ".c", ".cpp", ".h", ".hpp", ".sh"]:
-        # Remove single-line comments starting with //
-        content = re.sub(r"//.*", "", content)
-        # Remove multi-line comments /* ... */
-        content = re.sub(r"/\*[\s\S]*?\*/", "", content)
-    # Remove shell script comments starting with #
-    if file_extension == ".sh":
-        content = re.sub(r"#.*", "", content)
-    # Remove multiple blank lines
-    content = re.sub(r"\n\s*\n", "\n", content)
-    return content.strip()
-
-
-def compress_text_content(content):
+    lines = content.split("\n")
+    cleaned_lines = []
+    for line in lines:
+        if file_extension in (".py", ".sh"):
+            line = re.sub(r"#.*$", "", line)
+        elif file_extension in (".js", ".java", ".c", ".cpp", ".h", ".hpp"):
+            line = re.sub(r"//.*$", "", line)
+        cleaned_lines.append(line)
+
+    content = "\n".join(cleaned_lines)
+    return re.sub(r"\n\s*\n\s*\n", "\n\n", content)
+
+
+def compress_text_content(content: str) -> str:
     """Removes multiple blank lines from text files."""
-    content = re.sub(r"\n\s*\n", "\n", content)
-    return content.strip()
+    return re.sub(r"\n\s*\n\s*\n", "\n\n", content)


-def compress_markdown_content(content):
+def compress_markdown_content(content: str) -> str:
     """Extracts code blocks from Markdown and compresses them."""
-    code_blocks = re.findall(r"```.*?\n(.*?)```", content, re.DOTALL)
-    compressed_blocks = []
-    for block in code_blocks:
-        compressed_block = basic_compress(block, "")
-        compressed_blocks.append(compressed_block)
-    return "\n".join(compressed_blocks)
+    parts = re.split(r"(```\w*\n.*?\n```)", content, flags=re.DOTALL)
+    compressed_parts = []
+    for part in parts:
+        if part.startswith("```"):
+            lang_match = re.match(r"```(\w*)\n", part)
+            lang = lang_match.group(1) if lang_match else ""
+            code = re.sub(r"```\w*\n(.*)\n```", r"\1", part, flags=re.DOTALL)
+            if lang in ("python", "py"):
+                code = compress_python_code(code)
+            else:
+                code = compress_text_content(code)
+            compressed_parts.append(f"```{lang}\n{code}\n```")
+        else:
+            compressed_parts.append(compress_text_content(part))
+    return "".join(compressed_parts)


 def generate_llms_txt(
-    output_file="llms.txt",
-    allowed_extensions=(
+    output_file: str = "llms.txt",
+    allowed_extensions: Sequence[str] = (
         ".py",
         ".js",
         ".html",
@@ -125,27 +148,48 @@ def generate_llms_txt(
         ".toml",
         ".ini",
     ),
-    max_file_size=100 * 1024,  # 100 KB
-):
+    max_file_size: int = 100 * 1024,  # 100 KB
+) -> None:
     """
-    Generates a compressed llms.txt file.
+    Generates a compressed llms.txt file optimized for LLM/AI consumption.
+
+    Args:
+        output_file: Name of the output file
+        allowed_extensions: Tuple of file extensions to process
+        max_file_size: Maximum file size in bytes to process
     """
-    current_dir = pathlib.Path(".")
-    gitignore_path = current_dir / ".gitignore"
+    current_dir: pathlib.Path = pathlib.Path(".")
+    gitignore_path: pathlib.Path = current_dir / ".gitignore"
     matches = parse_gitignore(gitignore_path) if gitignore_path.exists() else None

     with open(output_file, "w", encoding="utf-8") as outfile:
-        # Include README.md and LICENSE at the beginning if they exist
+        # Project metadata
+        outfile.write("# Project: llmstxt\n\n")
+        outfile.write("## Project Structure\n")
+        outfile.write(
+            "This file contains the compressed and processed contents of the project.\n\n"
+        )
+        outfile.write("### File Types\n")
+        outfile.write("The following file types are included:\n")
+        outfile.write("".join([f"- {ext}\n" for ext in allowed_extensions]))
+        outfile.write("\n### Special Files\n")
+
+        # Include README and LICENSE with metadata
         for special_file in ["README.md", "LICENSE", "LICENSE.txt"]:
-            special_path = current_dir / special_file
+            special_path: pathlib.Path = current_dir / special_file
             if special_path.exists():
-                outfile.write(f"# {special_file}\n\n")
+                outfile.write(f"<file>{special_file}</file>\n")
+                outfile.write("<metadata>\n")
+                outfile.write(f"path: {special_file}\n")
+                outfile.write(f"size: {special_path.stat().st_size} bytes\n")
+                outfile.write("</metadata>\n\n")
                 with open(
                     special_path, "r", encoding="utf-8", errors="replace"
                 ) as infile:
-                    content = infile.read()
-                    outfile.write(content + "\n\n")
+                    special_content: str = infile.read()
+                    outfile.write(special_content + "\n\n")

+        # Process all other files
         for file in current_dir.rglob("*"):
             if (
                 file.is_file()
@@ -158,12 +202,39 @@ def generate_llms_txt(
                     print(f"Skipping {file} as it exceeds the maximum file size.")
                     continue

-                relative_path = file.relative_to(current_dir)
-                outfile.write(f"## File: {relative_path}\n\n")
+                relative_path: pathlib.Path = file.relative_to(current_dir)
+                outfile.write(f"<file>{relative_path}</file>\n")
+                outfile.write("<metadata>\n")
+                outfile.write(f"path: {relative_path}\n")
+                outfile.write(f"type: {file.suffix.lstrip('.')}\n")
+                outfile.write(f"size: {file.stat().st_size} bytes\n")
+                outfile.write("</metadata>\n\n")

                 try:
                     with open(file, "r", encoding="utf-8", errors="replace") as infile:
-                        content = infile.read()
+                        raw_content: str = infile.read()
+
+                    # Add semantic markers based on file type
+                    if file.suffix.lower() in (".py", ".js", ".java"):
+                        outfile.write("<imports>\n")
+                        # Extract and write imports
+                        import_lines = [
+                            line
+                            for line in raw_content.split("\n")
+                            if any(
+                                imp in line.lower()
+                                for imp in [
+                                    "import ",
+                                    "from ",
+                                    "require",
+                                    "include",
+                                ]
+                            )
+                        ]
+                        if import_lines:
+                            outfile.write("\n".join(import_lines) + "\n")
+                        outfile.write("</imports>\n\n")
+
                     if file.suffix.lower() in (
                         ".py",
                         ".js",
@@ -174,12 +245,12 @@ def generate_llms_txt(
                         ".hpp",
                         ".sh",
                     ):
-                        compressed_content = compress_code_content(
-                            content, file.suffix.lower()
+                        code_content: str = compress_code_content(
+                            raw_content, file.suffix.lower()
                         )
-                        language = file.suffix.lstrip(".")
+                        language: str = file.suffix.lstrip(".")
                         outfile.write(
-                            f"```{language}\n{compressed_content}\n```\n\n"
+                            f"<code lang='{language}'>\n{code_content}\n</code>\n\n"
                         )
                     elif file.suffix.lower() in (
                         ".txt",
@@ -190,21 +261,30 @@ def generate_llms_txt(
                         ".toml",
                         ".ini",
                     ):
-                        compressed_content = compress_text_content(content)
-                        outfile.write(f"```\n{compressed_content}\n```\n\n")
+                        text_content: str = compress_text_content(raw_content)
+                        outfile.write(
+                            f"<content type='{file.suffix.lstrip('.')}'>\n"
+                        )
+                        outfile.write(f"{text_content}\n")
+                        outfile.write("</content>\n\n")
                     elif file.suffix.lower() == ".md":
-                        compressed_content = compress_markdown_content(content)
-                        outfile.write(f"```md\n{compressed_content}\n```\n\n")
+                        md_content: str = compress_markdown_content(raw_content)
+                        outfile.write("<markdown>\n")
+                        outfile.write(f"{md_content}\n")
+                        outfile.write("</markdown>\n\n")
                     else:
-                        # For other files like .html, include as-is
                         outfile.write(
-                            f"```{file.suffix.lstrip('.')}\n{content}\n```\n\n"
+                            f"<content type='{file.suffix.lstrip('.')}'>\n"
                         )
+                        outfile.write(f"{raw_content}\n")
+                        outfile.write("</content>\n\n")
                 except Exception as e:
-                    outfile.write(f"Error processing {relative_path}: {e}\n\n")
+                    outfile.write(
+                        f"<error>Error processing {relative_path}: {e}</error>\n\n"
+                    )


 if __name__ == "__main__":
-    output_filename = "llms.txt"
+    output_filename: str = "llms.txt"
     generate_llms_txt(output_filename)
     print(f"{output_filename} generated successfully in the current directory!")
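
For context, a minimal usage sketch of the generator as it stands after this commit. This is a sketch, not part of the commit: it assumes `llms.py` is importable from the repository root and that the `astroid` and `gitignore_parser` dependencies are installed; the keyword arguments follow the new `generate_llms_txt` signature above, while the chosen extensions and size limit are arbitrary example values.

```python
# Usage sketch (assumed setup: run from the repo root with astroid and
# gitignore_parser installed so that llms.py imports cleanly).
from llms import generate_llms_txt

# Write a compressed llms.txt for the current directory, limiting output to
# Python and Markdown sources and skipping files larger than 50 KB.
generate_llms_txt(
    output_file="llms.txt",
    allowed_extensions=(".py", ".md"),
    max_file_size=50 * 1024,
)
```

Running `python llms.py` with no arguments is equivalent to calling the function with its defaults, as the `__main__` block above shows.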

0 commit comments
