Skip to content

Commit 58c071a

Browse files
committed
updates
1 parent 93273db commit 58c071a

File tree

5 files changed

+325
-3
lines changed

5 files changed

+325
-3
lines changed

.github/workflows/update-llms.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,24 @@ jobs:
3636
run: |
3737
uv venv
3838
. .venv/bin/activate
39-
uv pip install git+https://github.com/ngmisl/llmstxt.git
39+
if [ "${{ github.event_name }}" = "repository_dispatch" ]; then
40+
uv pip install git+https://github.com/ngmisl/llmstxt.git
41+
else
42+
uv pip install -e .
43+
fi
4044
4145
- name: Clone target repository
4246
if: github.event_name == 'repository_dispatch'
4347
run: |
4448
git clone ${{ github.event.client_payload.repository }} target_repo
4549
cd target_repo
46-
python -m llmstxt
50+
python -c "from llmstxt import generate_llms_txt; generate_llms_txt()"
4751
4852
- name: Generate llms.txt for current repository
4953
if: github.event_name != 'repository_dispatch'
5054
run: |
5155
. .venv/bin/activate
52-
python -m llmstxt
56+
python -c "from llmstxt import generate_llms_txt; generate_llms_txt()"
5357
5458
- name: Check for changes
5559
id: changes

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
"cSpell.words": [
3+
"Aunova",
34
"celerybeat",
45
"Connor",
56
"cython",

llmstxt/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""llmstxt - Compress code files into LLM-friendly format."""
2+
3+
from .llms import generate_llms_txt
4+
5+
__version__ = "0.1.0"
6+
__all__ = ["generate_llms_txt"]

llmstxt/__main__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""Command-line interface for llmstxt."""
2+
3+
from .llms import generate_llms_txt
4+
5+
# Entry point for `python -m llmstxt`: generate llms.txt in the current
# working directory using the default settings of generate_llms_txt().
if __name__ == "__main__":
    generate_llms_txt()

llmstxt/llms.py

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
import ast
2+
import pathlib
3+
import re
4+
from typing import Optional, Sequence, cast
5+
6+
import astroid # type: ignore
7+
from gitignore_parser import parse_gitignore # type: ignore
8+
9+
10+
def compress_python_code(content: str) -> str:
    """Compress Python source while preserving docstrings.

    Comments are dropped implicitly by round-tripping through the AST;
    bare constant expressions (other than a leading docstring) are removed,
    and runs of blank lines are collapsed.

    Args:
        content: Python source text.

    Returns:
        The compressed source, or *content* unchanged if it does not parse
        or compression fails for any other reason.
    """
    try:
        parsed_ast = ast.parse(content)
    except SyntaxError:
        return content

    class RemoveCommentsAndDocstrings(ast.NodeTransformer):
        """Strip bare constant expressions but keep leading docstrings."""

        def _visit_keeping_docstring(self, node):
            # Shared logic for modules, classes and (async) functions:
            # keep a leading docstring verbatim, transform the rest of the
            # body, and drop children the transformer returns as None.
            if (
                ast.get_docstring(node) is not None
                and node.body
                and isinstance(node.body[0], ast.Expr)
                and isinstance(node.body[0].value, ast.Constant)
            ):
                docstring = node.body[0]
                node.body = [docstring] + [
                    n for n in map(self.visit, node.body[1:]) if n is not None
                ]
                return node
            node.body = [n for n in map(self.visit, node.body) if n is not None]
            return node

        def visit_FunctionDef(self, node: ast.FunctionDef) -> Optional[ast.FunctionDef]:
            return self._visit_keeping_docstring(node)

        def visit_AsyncFunctionDef(
            self, node: ast.AsyncFunctionDef
        ) -> Optional[ast.AsyncFunctionDef]:
            # Bug fix: async functions previously fell through to
            # generic_visit, which discarded their docstrings.
            return self._visit_keeping_docstring(node)

        def visit_ClassDef(self, node: ast.ClassDef) -> Optional[ast.ClassDef]:
            return self._visit_keeping_docstring(node)

        def visit_Module(self, node: ast.Module) -> Optional[ast.Module]:
            return self._visit_keeping_docstring(node)

        def generic_visit(self, node: ast.AST) -> ast.AST:
            # A bare constant expression (e.g. a stray string literal)
            # carries no behavior; replace it with `pass` so the enclosing
            # body stays syntactically non-empty.
            if isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
                return ast.Pass()
            return super().generic_visit(node)

    try:
        transformer = RemoveCommentsAndDocstrings()
        cleaned_ast = transformer.visit(parsed_ast)
        ast.fix_missing_locations(cleaned_ast)

        try:
            # ast.unparse is available on Python 3.9+.
            source = ast.unparse(cleaned_ast)  # type: ignore
        except (AttributeError, TypeError):
            # Fallback to astroid for older Python versions.
            source = cast(str, astroid.parse(ast.dump(cleaned_ast)).as_string())

        # Drop the placeholder `pass` statements introduced above.
        cleaned_lines = [line for line in source.split("\n") if line.strip() != "pass"]

        # Collapse runs of blank lines to a single blank line.
        compressed_code = "\n".join(cleaned_lines)
        compressed_code = re.sub(r"\n\s*\n\s*\n", "\n\n", compressed_code)
        return compressed_code
    except Exception as e:
        # Best-effort tool: warn and fall back to the original text rather
        # than failing the whole generation run.
        print(f"Warning: Error compressing Python code: {e}")
        return content
97+
98+
def compress_code_content(content: str, file_extension: str) -> str:
    """Dispatch compression by extension: AST-aware for Python, generic otherwise."""
    python_extensions = (".py", ".pyi")
    if file_extension in python_extensions:
        return compress_python_code(content)
    return basic_compress(content, file_extension)
104+
105+
def basic_compress(content: str, file_extension: str) -> str:
    """Basic compression: strip line comments and collapse repeated blank lines."""
    hash_comment_exts = (".py", ".sh")
    slash_comment_exts = (".js", ".java", ".c", ".cpp", ".h", ".hpp")

    def strip_line_comment(line: str) -> str:
        # Note: purely textual, so comment markers inside string literals
        # are stripped too — acceptable for a "basic" pass.
        if file_extension in hash_comment_exts:
            return re.sub(r"#.*$", "", line)
        if file_extension in slash_comment_exts:
            return re.sub(r"//.*$", "", line)
        return line

    stripped = "\n".join(strip_line_comment(line) for line in content.split("\n"))
    return re.sub(r"\n\s*\n\s*\n", "\n\n", stripped)
119+
120+
def compress_text_content(content: str) -> str:
    """Collapse runs of blank lines in *content* down to a single blank line."""
    blank_run = re.compile(r"\n\s*\n\s*\n")
    return blank_run.sub("\n\n", content)
124+
125+
def compress_markdown_content(content: str) -> str:
    """Extracts fenced code blocks from Markdown and compresses them.

    Python fences are compressed with the AST-aware pass; everything else
    (including the prose between fences) gets blank-line collapsing.
    """
    pieces = re.split(r"(```\w*\n.*?\n```)", content, flags=re.DOTALL)
    rebuilt = []
    for piece in pieces:
        if not piece.startswith("```"):
            # Plain prose between fences.
            rebuilt.append(compress_text_content(piece))
            continue
        lang_match = re.match(r"```(\w*)\n", piece)
        lang = lang_match.group(1) if lang_match else ""
        body = re.sub(r"```\w*\n(.*)\n```", r"\1", piece, flags=re.DOTALL)
        if lang in ("python", "py"):
            body = compress_python_code(body)
        else:
            body = compress_text_content(body)
        rebuilt.append(f"```{lang}\n{body}\n```")
    return "".join(rebuilt)
143+
144+
def generate_llms_txt(
145+
output_file: str = "llms.txt",
146+
allowed_extensions: Sequence[str] = (
147+
".py",
148+
".js",
149+
".html",
150+
".css",
151+
".java",
152+
".c",
153+
".cpp",
154+
".h",
155+
".hpp",
156+
".sh",
157+
".txt",
158+
".md",
159+
".json",
160+
".xml",
161+
".yaml",
162+
".yml",
163+
".toml",
164+
".ini",
165+
),
166+
max_file_size: int = 100 * 1024, # 100 KB
167+
) -> None:
168+
"""
169+
Generates a compressed llms.txt file optimized for LLM/AI consumption.
170+
171+
Args:
172+
output_file: Name of the output file
173+
allowed_extensions: Tuple of file extensions to process
174+
max_file_size: Maximum file size in bytes to process
175+
"""
176+
current_dir: pathlib.Path = pathlib.Path(".")
177+
gitignore_path: pathlib.Path = current_dir / ".gitignore"
178+
matches = parse_gitignore(gitignore_path) if gitignore_path.exists() else None
179+
180+
with open(output_file, "w", encoding="utf-8") as outfile:
181+
# Project metadata
182+
outfile.write("# Project: llmstxt\n\n")
183+
outfile.write("## Project Structure\n")
184+
outfile.write(
185+
"This file contains the compressed and processed contents of the project.\n\n"
186+
)
187+
outfile.write("### File Types\n")
188+
outfile.write("The following file types are included:\n")
189+
outfile.write("".join([f"- {ext}\n" for ext in allowed_extensions]))
190+
outfile.write("\n### Special Files\n")
191+
192+
# Include README and LICENSE with metadata
193+
for special_file in ["README.md", "LICENSE", "LICENSE.txt"]:
194+
special_path: pathlib.Path = current_dir / special_file
195+
if special_path.exists():
196+
outfile.write(f"<file>{special_file}</file>\n")
197+
outfile.write("<metadata>\n")
198+
outfile.write(f"path: {special_file}\n")
199+
outfile.write(f"size: {special_path.stat().st_size} bytes\n")
200+
outfile.write("</metadata>\n\n")
201+
with open(
202+
special_path, "r", encoding="utf-8", errors="replace"
203+
) as infile:
204+
special_content: str = infile.read()
205+
outfile.write(special_content + "\n\n")
206+
207+
# Process all other files
208+
for file in current_dir.rglob("*"):
209+
if (
210+
file.is_file()
211+
and file.suffix.lower() in allowed_extensions
212+
and not (matches and matches(str(file.relative_to(current_dir))))
213+
and file.name
214+
not in ["README.md", "LICENSE", "LICENSE.txt", output_file]
215+
):
216+
if file.stat().st_size > max_file_size:
217+
print(f"Skipping {file} as it exceeds the maximum file size.")
218+
continue
219+
220+
relative_path: pathlib.Path = file.relative_to(current_dir)
221+
outfile.write(f"<file>{relative_path}</file>\n")
222+
outfile.write("<metadata>\n")
223+
outfile.write(f"path: {relative_path}\n")
224+
outfile.write(f"type: {file.suffix.lstrip('.')}\n")
225+
outfile.write(f"size: {file.stat().st_size} bytes\n")
226+
outfile.write("</metadata>\n\n")
227+
228+
try:
229+
with open(file, "r", encoding="utf-8", errors="replace") as infile:
230+
raw_content: str = infile.read()
231+
232+
# Add semantic markers based on file type
233+
if file.suffix.lower() in (".py", ".js", ".java"):
234+
outfile.write("<imports>\n")
235+
# Extract and write imports
236+
import_lines = [
237+
line
238+
for line in raw_content.split("\n")
239+
if any(
240+
imp in line.lower()
241+
for imp in [
242+
"import ",
243+
"from ",
244+
"require",
245+
"include",
246+
]
247+
)
248+
]
249+
if import_lines:
250+
outfile.write("\n".join(import_lines) + "\n")
251+
outfile.write("</imports>\n\n")
252+
253+
if file.suffix.lower() in (
254+
".py",
255+
".js",
256+
".java",
257+
".c",
258+
".cpp",
259+
".h",
260+
".hpp",
261+
".sh",
262+
):
263+
code_content: str = compress_code_content(
264+
raw_content, file.suffix.lower()
265+
)
266+
language: str = file.suffix.lstrip(".")
267+
outfile.write(
268+
f"<code lang='{language}'>\n{code_content}\n</code>\n\n"
269+
)
270+
elif file.suffix.lower() in (
271+
".txt",
272+
".json",
273+
".xml",
274+
".yaml",
275+
".yml",
276+
".toml",
277+
".ini",
278+
):
279+
text_content: str = compress_text_content(raw_content)
280+
outfile.write(
281+
f"<content type='{file.suffix.lstrip('.')}'>\n"
282+
)
283+
outfile.write(f"{text_content}\n")
284+
outfile.write("</content>\n\n")
285+
elif file.suffix.lower() == ".md":
286+
md_content: str = compress_markdown_content(raw_content)
287+
outfile.write("<markdown>\n")
288+
outfile.write(f"{md_content}\n")
289+
outfile.write("</markdown>\n\n")
290+
else:
291+
outfile.write(
292+
f"<content type='{file.suffix.lstrip('.')}'>\n"
293+
)
294+
outfile.write(f"{raw_content}\n")
295+
outfile.write("</content>\n\n")
296+
except Exception as e:
297+
outfile.write(
298+
f"<error>Error processing {relative_path}: {e}</error>\n\n"
299+
)
300+
301+
302+
if __name__ == "__main__":
    # Script entry point: write llms.txt into the current directory.
    target_name: str = "llms.txt"
    generate_llms_txt(target_name)
    print(f"{target_name} generated successfully in the current directory!")

0 commit comments

Comments
 (0)