Skip to content

Commit 3fefe51

Browse files
committed
docs: add treesitter demo examples
1 parent 5589c91 commit 3fefe51

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

README.md

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,122 @@ fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
547547
548548
TODO
549549
550+
#### TreeSitter
551+
552+
> TreeSitter 是一个用于生成高效的自定义语法分析器的框架,由 GitHub 开发。它使用 LR(1) 解析器,这意味着它可以在 O(n) 时间内解析任何语言,而不是 O(n²) 时间。它还使用了一种称为"语法树的重用"的技术,该技术使其能够在不重新解析整个文件的情况下增量更新语法树。
554+
555+
由于 TreeSitter 已经提供了多语言的支持,你可以使用 Node.js、Rust 等语言来构建对应的插件。详细见:[TreeSitter](https://tree-sitter.github.io/tree-sitter/)。
556+
557+
根据我们的意图不同,使用 TreeSitter 也有不同的方式:
558+
559+
**解析 Symbol**
560+
561+
在代码自然语言搜索引擎 [Bloop](https://github.com/BloopAI/bloop) 中,我们使用 TreeSitter 来解析 Symbol,以实现更好的搜索质量。
562+
563+
```scheme
564+
;; methods
565+
(method_declaration
566+
name: (identifier) @hoist.definition.method)
567+
```
568+
569+
随后,根据不同的类型来决定如何显示:
570+
571+
```rust
572+
pub static JAVA: TSLanguageConfig = TSLanguageConfig {
573+
language_ids: &["Java"],
574+
file_extensions: &["java"],
575+
grammar: tree_sitter_java::language,
576+
scope_query: MemoizedQuery::new(include_str!("./scopes.scm")),
577+
hoverable_query: MemoizedQuery::new(
578+
r#"
579+
[(identifier)
580+
(type_identifier)] @hoverable
581+
"#,
582+
),
583+
namespaces: &[&[
584+
// variables
585+
"local",
586+
// functions
587+
"method",
588+
// namespacing, modules
589+
"package",
590+
"module",
591+
// types
592+
"class",
593+
"enum",
594+
"enumConstant",
595+
"record",
596+
"interface",
597+
"typedef",
598+
// misc.
599+
"label",
600+
]],
601+
};
602+
```
603+
604+
**Chunk 代码**
605+
606+
如下是 [Improving LlamaIndex’s Code Chunker by Cleaning Tree-Sitter CSTs](https://docs.sweep.dev/blogs/chunking-improvements) 中的 TreeSitter 使用方式:
609+
610+
```python
611+
from tree_sitter import Tree
612+
613+
def chunker(
614+
tree: Tree,
615+
source_code: bytes,
616+
MAX_CHARS=512 * 3,
617+
coalesce=50 # Any chunk less than 50 characters long gets coalesced with the next chunk
618+
) -> list[Span]:
619+
620+
# 1. Recursively form chunks based on the last post (https://docs.sweep.dev/blogs/chunking-2m-files)
621+
def chunk_node(node: Node) -> list[Span]:
622+
chunks: list[Span] = []
623+
current_chunk: Span = Span(node.start_byte, node.start_byte)
624+
node_children = node.children
625+
for child in node_children:
626+
if child.end_byte - child.start_byte > MAX_CHARS:
627+
chunks.append(current_chunk)
628+
current_chunk = Span(child.end_byte, child.end_byte)
629+
chunks.extend(chunk_node(child))
630+
elif child.end_byte - child.start_byte + len(current_chunk) > MAX_CHARS:
631+
chunks.append(current_chunk)
632+
current_chunk = Span(child.start_byte, child.end_byte)
633+
else:
634+
current_chunk += Span(child.start_byte, child.end_byte)
635+
chunks.append(current_chunk)
636+
return chunks
637+
chunks = chunk_node(tree.root_node)
638+
639+
# 2. Filling in the gaps
640+
for prev, curr in zip(chunks[:-1], chunks[1:]):
641+
prev.end = curr.start
642+
curr.start = tree.root_node.end_byte
643+
644+
# 3. Combining small chunks with bigger ones
645+
new_chunks = []
646+
current_chunk = Span(0, 0)
647+
for chunk in chunks:
648+
current_chunk += chunk
649+
if non_whitespace_len(current_chunk.extract(source_code)) > coalesce \
650+
and "\n" in current_chunk.extract(source_code):
651+
new_chunks.append(current_chunk)
652+
current_chunk = Span(chunk.end, chunk.end)
653+
if len(current_chunk) > 0:
654+
new_chunks.append(current_chunk)
655+
656+
# 4. Changing line numbers
657+
line_chunks = [Span(get_line_number(chunk.start, source_code),
658+
get_line_number(chunk.end, source_code)) for chunk in new_chunks]
659+
660+
# 5. Eliminating empty chunks
661+
line_chunks = [chunk for chunk in line_chunks if len(chunk) > 0]
662+
663+
return line_chunks
664+
```
665+
550666
### 度量体系设计
551667
552668
#### 常用指标

0 commit comments

Comments
 (0)