@@ -547,6 +547,122 @@ fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
547547
548548TODO
549549
550+ #### TreeSitter
551+
552+ > TreeSitter 是一个用于生成高效的自定义语法分析器的框架,由 GitHub 开发。它基于 GLR 解析算法,并支持增量解析,可以在接近线性的时间内完成解析,而不是
553+ O(n²)时间。它还使用了一种称为“语法树的重用”的技术,该技术使其能够在不重新解析整个文件的情况下更新语法树。
554+
555+ 由于 TreeSitter 已经提供了多语言的支持,你可以使用 Node.js、Rust 等语言来构建对应的插件。详见:[TreeSitter](https://tree-sitter.github.io/tree-sitter/)。
556+
557+ 根据我们的意图不同,使用 TreeSitter 也有不同的方式:
558+
559+ **解析 Symbol**
560+
561+ 在代码自然语言搜索引擎 [Bloop](https://github.com/BloopAI/bloop) 中,我们使用 TreeSitter 来解析 Symbol,以实现更好的搜索质量。
562+
563+ ```scheme
564+ ;; methods
565+ (method_declaration
566+ name: (identifier) @hoist.definition.method)
567+ ```
568+
569+ 随后,根据不同的类型来决定如何显示:
570+
571+ ```rust
572+ pub static JAVA: TSLanguageConfig = TSLanguageConfig {
573+ language_ids: &["Java"],
574+ file_extensions: &["java"],
575+ grammar: tree_sitter_java::language,
576+ scope_query: MemoizedQuery::new(include_str!("./scopes.scm")),
577+ hoverable_query: MemoizedQuery::new(
578+ r#"
579+ [(identifier)
580+ (type_identifier)] @hoverable
581+ "#,
582+ ),
583+ namespaces: &[&[
584+ // variables
585+ "local",
586+ // functions
587+ "method",
588+ // namespacing, modules
589+ "package",
590+ "module",
591+ // types
592+ "class",
593+ "enum",
594+ "enumConstant",
595+ "record",
596+ "interface",
597+ "typedef",
598+ // misc.
599+ "label",
600+ ]],
601+ };
602+ ```
603+
604+ **Chunk 代码**
605+
606+ 如下是 [Improving LlamaIndex’s Code Chunker
607+ by Cleaning Tree-Sitter CSTs](https://docs.sweep.dev/blogs/chunking-improvements) 中的
608+ TreeSitter 的使用方式:
609+
610+ ```python
611+ from tree_sitter import Tree
612+
613+ def chunker(
614+ tree: Tree,
615+ source_code: bytes,
616+ MAX_CHARS=512 * 3,
617+ coalesce=50 # Any chunk less than 50 characters long gets coalesced with the next chunk
618+ ) -> list[Span]:
619+
620+ # 1. Recursively form chunks based on the last post (https://docs.sweep.dev/blogs/chunking-2m-files)
621+ def chunk_node(node: Node) -> list[Span]:
622+ chunks: list[Span] = []
623+ current_chunk: Span = Span(node.start_byte, node.start_byte)
624+ node_children = node.children
625+ for child in node_children:
626+ if child.end_byte - child.start_byte > MAX_CHARS:
627+ chunks.append(current_chunk)
628+ current_chunk = Span(child.end_byte, child.end_byte)
629+ chunks.extend(chunk_node(child))
630+ elif child.end_byte - child.start_byte + len(current_chunk) > MAX_CHARS:
631+ chunks.append(current_chunk)
632+ current_chunk = Span(child.start_byte, child.end_byte)
633+ else:
634+ current_chunk += Span(child.start_byte, child.end_byte)
635+ chunks.append(current_chunk)
636+ return chunks
637+ chunks = chunk_node(tree.root_node)
638+
639+ # 2. Filling in the gaps
640+ for prev, curr in zip(chunks[:-1], chunks[1:]):
641+ prev.end = curr.start
642+     curr.end = tree.root_node.end_byte
643+
644+ # 3. Combining small chunks with bigger ones
645+ new_chunks = []
646+ current_chunk = Span(0, 0)
647+ for chunk in chunks:
648+ current_chunk += chunk
649+ if non_whitespace_len(current_chunk.extract(source_code)) > coalesce \
650+ and "\n" in current_chunk.extract(source_code):
651+ new_chunks.append(current_chunk)
652+ current_chunk = Span(chunk.end, chunk.end)
653+ if len(current_chunk) > 0:
654+ new_chunks.append(current_chunk)
655+
656+ # 4. Changing line numbers
657+ line_chunks = [Span(get_line_number(chunk.start, source_code),
658+ get_line_number(chunk.end, source_code)) for chunk in new_chunks]
659+
660+ # 5. Eliminating empty chunks
661+ line_chunks = [chunk for chunk in line_chunks if len(chunk) > 0]
662+
663+ return line_chunks
664+ ```
665+
550666### 度量体系设计
551667
552668#### 常用指标
0 commit comments