11import ast
22import pathlib
33import re
4+ from typing import Optional , Sequence , cast
45
6+ import astroid # type: ignore
57from gitignore_parser import parse_gitignore # type: ignore
68
79
def compress_python_code(content: str) -> str:
    """Compress Python source while preserving docstrings.

    Comments disappear implicitly by round-tripping through the AST;
    free-standing string-literal expressions that are not docstrings are
    replaced with ``pass`` so enclosing bodies can never become empty.

    Args:
        content: Python source text.

    Returns:
        The compressed source, or *content* unchanged when it cannot be
        parsed or compressed.
    """
    try:
        parsed_ast = ast.parse(content)
    except SyntaxError:
        # Not valid Python; leave the text untouched.
        return content

    class _DocstringPreservingCleaner(ast.NodeTransformer):
        """Keep leading docstrings; prune other bare constant expressions."""

        def _clean_body(self, node):
            # Preserve a leading docstring verbatim, then transform the
            # remaining statements; drop any that the transform removes.
            if (
                ast.get_docstring(node) is not None
                and node.body
                and isinstance(node.body[0], ast.Expr)
                and isinstance(node.body[0].value, ast.Constant)
            ):
                docstring = node.body[0]
                rest = [n for n in map(self.visit, node.body[1:]) if n is not None]
                node.body = [docstring] + rest
            else:
                node.body = [n for n in map(self.visit, node.body) if n is not None]
            return node

        # All docstring-bearing scopes share the same body treatment.
        visit_FunctionDef = _clean_body
        visit_AsyncFunctionDef = _clean_body
        visit_ClassDef = _clean_body
        visit_Module = _clean_body

        def generic_visit(self, node: ast.AST) -> ast.AST:
            # A bare constant expression that is not a docstring becomes
            # `pass`, keeping every body syntactically non-empty.
            if isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
                return ast.Pass()
            return super().generic_visit(node)

    try:
        cleaned_ast = _DocstringPreservingCleaner().visit(parsed_ast)
        ast.fix_missing_locations(cleaned_ast)
        # BUG FIX: the previous code fed `ast.dump(cleaned_ast)` — a debug
        # repr of the tree, not source code — to `astroid.parse`, so the
        # "compressed" output was the parsed repr, not the program.
        # `ast.unparse` regenerates real source from the cleaned tree.
        compressed_code = ast.unparse(cleaned_ast)
        # Collapse runs of blank lines down to a single blank line.
        return re.sub(r"\n\s*\n\s*\n", "\n\n", compressed_code)
    except Exception as e:
        print(f"Warning: Error compressing Python code: {e}")
        return content
6481
6582
def compress_code_content(content: str, file_extension: str) -> str:
    """Dispatch compression by file extension.

    Python sources (``.py``/``.pyi``) go through the AST-based compressor;
    everything else falls back to line-based comment stripping.
    """
    if file_extension not in (".py", ".pyi"):
        return basic_compress(content, file_extension)
    return compress_python_code(content)
7388
7489
def basic_compress(content: str, file_extension: str) -> str:
    """Basic compression: strip line comments and squeeze blank lines.

    ``#`` comments are removed for .py/.sh, ``//`` comments for the
    C-family/JS extensions; other extensions get blank-line squeezing only.
    """
    # Pick the line-comment pattern for this extension, if any.
    if file_extension in (".py", ".sh"):
        comment_re = re.compile(r"#.*$")
    elif file_extension in (".js", ".java", ".c", ".cpp", ".h", ".hpp"):
        comment_re = re.compile(r"//.*$")
    else:
        comment_re = None

    if comment_re is not None:
        stripped = [comment_re.sub("", line) for line in content.split("\n")]
        content = "\n".join(stripped)

    # Collapse runs of blank lines down to a single blank line.
    return re.sub(r"\n\s*\n\s*\n", "\n\n", content)
103+
104+
def compress_text_content(content: str) -> str:
    """Collapse runs of blank lines in text down to a single blank line."""
    blank_run = re.compile(r"\n\s*\n\s*\n")
    return blank_run.sub("\n\n", content)
94108
95109
def compress_markdown_content(content: str) -> str:
    """Compress Markdown content.

    Fenced code blocks are compressed according to their language tag
    (Python blocks through the AST compressor, anything else via
    blank-line squeezing); prose between fences is blank-line squeezed.
    """
    fence_re = re.compile(r"(```\w*\n.*?\n```)", re.DOTALL)
    pieces = []
    for chunk in fence_re.split(content):
        if not chunk.startswith("```"):
            # Prose between fences.
            pieces.append(compress_text_content(chunk))
            continue
        header = re.match(r"```(\w*)\n", chunk)
        lang = header.group(1) if header else ""
        # Peel the fences off to get the raw code body.
        body = re.sub(r"```\w*\n(.*)\n```", r"\1", chunk, flags=re.DOTALL)
        if lang in ("python", "py"):
            body = compress_python_code(body)
        else:
            body = compress_text_content(body)
        pieces.append(f"```{lang}\n{body}\n```")
    return "".join(pieces)
104127
105128
106129def generate_llms_txt (
107- output_file = "llms.txt" ,
108- allowed_extensions = (
130+ output_file : str = "llms.txt" ,
131+ allowed_extensions : Sequence [ str ] = (
109132 ".py" ,
110133 ".js" ,
111134 ".html" ,
@@ -125,27 +148,48 @@ def generate_llms_txt(
125148 ".toml" ,
126149 ".ini" ,
127150 ),
128- max_file_size = 100 * 1024 , # 100 KB
129- ):
151+ max_file_size : int = 100 * 1024 , # 100 KB
152+ ) -> None :
130153 """
131- Generates a compressed llms.txt file.
154+ Generates a compressed llms.txt file optimized for LLM/AI consumption.
155+
156+ Args:
157+ output_file: Name of the output file
158+ allowed_extensions: Tuple of file extensions to process
159+ max_file_size: Maximum file size in bytes to process
132160 """
133- current_dir = pathlib .Path ("." )
134- gitignore_path = current_dir / ".gitignore"
161+ current_dir : pathlib . Path = pathlib .Path ("." )
162+ gitignore_path : pathlib . Path = current_dir / ".gitignore"
135163 matches = parse_gitignore (gitignore_path ) if gitignore_path .exists () else None
136164
137165 with open (output_file , "w" , encoding = "utf-8" ) as outfile :
138- # Include README.md and LICENSE at the beginning if they exist
166+ # Project metadata
167+ outfile .write ("# Project: llmstxt\n \n " )
168+ outfile .write ("## Project Structure\n " )
169+ outfile .write (
170+ "This file contains the compressed and processed contents of the project.\n \n "
171+ )
172+ outfile .write ("### File Types\n " )
173+ outfile .write ("The following file types are included:\n " )
174+ outfile .write ("" .join ([f"- { ext } \n " for ext in allowed_extensions ]))
175+ outfile .write ("\n ### Special Files\n " )
176+
177+ # Include README and LICENSE with metadata
139178 for special_file in ["README.md" , "LICENSE" , "LICENSE.txt" ]:
140- special_path = current_dir / special_file
179+ special_path : pathlib . Path = current_dir / special_file
141180 if special_path .exists ():
142- outfile .write (f"# { special_file } \n \n " )
181+ outfile .write (f"<file>{ special_file } </file>\n " )
182+ outfile .write ("<metadata>\n " )
183+ outfile .write (f"path: { special_file } \n " )
184+ outfile .write (f"size: { special_path .stat ().st_size } bytes\n " )
185+ outfile .write ("</metadata>\n \n " )
143186 with open (
144187 special_path , "r" , encoding = "utf-8" , errors = "replace"
145188 ) as infile :
146- content = infile .read ()
147- outfile .write (content + "\n \n " )
189+ special_content : str = infile .read ()
190+ outfile .write (special_content + "\n \n " )
148191
192+ # Process all other files
149193 for file in current_dir .rglob ("*" ):
150194 if (
151195 file .is_file ()
@@ -158,12 +202,39 @@ def generate_llms_txt(
158202 print (f"Skipping { file } as it exceeds the maximum file size." )
159203 continue
160204
161- relative_path = file .relative_to (current_dir )
162- outfile .write (f"## File: { relative_path } \n \n " )
205+ relative_path : pathlib .Path = file .relative_to (current_dir )
206+ outfile .write (f"<file>{ relative_path } </file>\n " )
207+ outfile .write ("<metadata>\n " )
208+ outfile .write (f"path: { relative_path } \n " )
209+ outfile .write (f"type: { file .suffix .lstrip ('.' )} \n " )
210+ outfile .write (f"size: { file .stat ().st_size } bytes\n " )
211+ outfile .write ("</metadata>\n \n " )
163212
164213 try :
165214 with open (file , "r" , encoding = "utf-8" , errors = "replace" ) as infile :
166- content = infile .read ()
215+ raw_content : str = infile .read ()
216+
217+ # Add semantic markers based on file type
218+ if file .suffix .lower () in (".py" , ".js" , ".java" ):
219+ outfile .write ("<imports>\n " )
220+ # Extract and write imports
221+ import_lines = [
222+ line
223+ for line in raw_content .split ("\n " )
224+ if any (
225+ imp in line .lower ()
226+ for imp in [
227+ "import " ,
228+ "from " ,
229+ "require" ,
230+ "include" ,
231+ ]
232+ )
233+ ]
234+ if import_lines :
235+ outfile .write ("\n " .join (import_lines ) + "\n " )
236+ outfile .write ("</imports>\n \n " )
237+
167238 if file .suffix .lower () in (
168239 ".py" ,
169240 ".js" ,
@@ -174,12 +245,12 @@ def generate_llms_txt(
174245 ".hpp" ,
175246 ".sh" ,
176247 ):
177- compressed_content = compress_code_content (
178- content , file .suffix .lower ()
248+ code_content : str = compress_code_content (
249+ raw_content , file .suffix .lower ()
179250 )
180- language = file .suffix .lstrip ("." )
251+ language : str = file .suffix .lstrip ("." )
181252 outfile .write (
182- f"``` { language } \n { compressed_content } \n ``` \n \n "
253+ f"<code lang=' { language } '> \n { code_content } \n </code> \n \n "
183254 )
184255 elif file .suffix .lower () in (
185256 ".txt" ,
@@ -190,21 +261,30 @@ def generate_llms_txt(
190261 ".toml" ,
191262 ".ini" ,
192263 ):
193- compressed_content = compress_text_content (content )
194- outfile .write (f"```\n { compressed_content } \n ```\n \n " )
264+ text_content : str = compress_text_content (raw_content )
265+ outfile .write (
266+ f"<content type='{ file .suffix .lstrip ('.' )} '>\n "
267+ )
268+ outfile .write (f"{ text_content } \n " )
269+ outfile .write ("</content>\n \n " )
195270 elif file .suffix .lower () == ".md" :
196- compressed_content = compress_markdown_content (content )
197- outfile .write (f"```md\n { compressed_content } \n ```\n \n " )
271+ md_content : str = compress_markdown_content (raw_content )
272+ outfile .write ("<markdown>\n " )
273+ outfile .write (f"{ md_content } \n " )
274+ outfile .write ("</markdown>\n \n " )
198275 else :
199- # For other files like .html, include as-is
200276 outfile .write (
201- f"``` { file .suffix .lstrip ('.' )} \n { content } \n ``` \n \n "
277+ f"<content type=' { file .suffix .lstrip ('.' )} '> \n "
202278 )
279+ outfile .write (f"{ raw_content } \n " )
280+ outfile .write ("</content>\n \n " )
203281 except Exception as e :
204- outfile .write (f"Error processing { relative_path } : { e } \n \n " )
282+ outfile .write (
283+ f"<error>Error processing { relative_path } : { e } </error>\n \n "
284+ )
205285
206286
if __name__ == "__main__":
    # Script entry point: write the compressed project dump and report it.
    output_filename = "llms.txt"
    generate_llms_txt(output_filename)
    print(f"{output_filename} generated successfully in the current directory!")
0 commit comments