Skip to content

Commit b7e475d

Browse files
committed
feat: Add a configuration switch for OpenAPI documentation parsing
1 parent 3a0057e commit b7e475d

File tree

6 files changed

+55
-13
lines changed

6 files changed

+55
-13
lines changed

deploy/embber-debug.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ data:
3434
OverlapTokens: 100
3535
MaxTokensPerChunk: 1000
3636
EnableMarkdownParsing: false
37+
EnableOpenAPIParsing: false
3738
GraphTask:
3839
MaxConcurrency: 100
3940
Timeout: 18000s

deploy/embber.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ data:
3434
OverlapTokens: 100
3535
MaxTokensPerChunk: 1000
3636
EnableMarkdownParsing: false
37+
EnableOpenAPIParsing: false
3738
GraphTask:
3839
MaxConcurrency: 100
3940
Timeout: 18000s

internal/config/index.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type EmbeddingTaskConf struct {
2222
OverlapTokens int
2323
MaxTokensPerChunk int
2424
EnableMarkdownParsing bool `json:",default=false"` // 是否启用markdown文件解析
25+
EnableOpenAPIParsing bool `json:",default=false"` // 是否启用OpenAPI文档解析
2526
}
2627

2728
type GraphTaskConf struct {

internal/embedding/splitter.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ type SplitOptions struct {
2424
MaxTokensPerChunk int
2525
SlidingWindowOverlapTokens int
2626
EnableMarkdownParsing bool // 是否启用markdown文件解析
27+
EnableOpenAPIParsing bool // 是否启用OpenAPI文档解析
2728
}
2829

2930
// NewCodeSplitter 创建代码分割器
@@ -55,12 +56,15 @@ func (p *CodeSplitter) Split(codeFile *types.SourceFile) ([]*types.CodeChunk, er
5556
if language.Language == parser.Markdown && p.splitOptions.EnableMarkdownParsing {
5657
return p.splitMarkdownFile(codeFile)
5758
}
58-
if language.Language == parser.OpenAPI || language.Language == parser.Swagger {
59+
if (language.Language == parser.OpenAPI || language.Language == parser.Swagger) {
60+
if !p.splitOptions.EnableOpenAPIParsing{
61+
return nil,fmt.Errorf("openapi file parse is close")
62+
}
5963
return p.splitOpenAPIFile(codeFile)
6064
}
6165

6266
sitterParser := sitter.NewParser()
63-
67+
defer sitterParser.Close()
6468
// 设置解析器语言(复用已创建的Parser)
6569
if err := sitterParser.SetLanguage(language.SitterLanguage()); err != nil {
6670
return nil, fmt.Errorf("failed to set parser language: %w", err)

internal/embedding/splitter_test.go

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ func TestSplitOpenAPIFile(t *testing.T) {
2020
MaxTokensPerChunk: 1000,
2121
SlidingWindowOverlapTokens: 100,
2222
EnableMarkdownParsing: true,
23+
EnableOpenAPIParsing: true,
2324
}
2425
splitter, err := NewCodeSplitter(splitOptions)
2526
assert.NoError(t, err)
@@ -34,28 +35,28 @@ func TestSplitOpenAPIFile(t *testing.T) {
3435
}{
3536
{
3637
name: "OpenAPI 3.0 JSON 文件",
37-
filePath: "/home/kcx/codeWorkspace/codebase-embedder/bin/openapi3.json",
38+
filePath: "../../bin/openapi3.json",
3839
expectError: false,
3940
expectCount: 2, // /pets 和 /pets/{petId} 两个路径
4041
description: "应该成功分割 OpenAPI 3.0 JSON 文件",
4142
},
4243
{
4344
name: "OpenAPI 3.0 YAML 文件",
44-
filePath: "/home/kcx/codeWorkspace/codebase-embedder/bin/openapi3.yaml",
45+
filePath: "../../bin/openapi3.yaml",
4546
expectError: false,
4647
expectCount: 2, // /users 和 /users/{id} 两个路径
4748
description: "应该成功分割 OpenAPI 3.0 YAML 文件",
4849
},
4950
{
5051
name: "Swagger 2.0 JSON 文件",
51-
filePath: "/home/kcx/codeWorkspace/codebase-embedder/bin/swagger2.json",
52+
filePath: "../../bin/swagger2.json",
5253
expectError: false,
5354
expectCount: 14, // 14个不同的路径
5455
description: "应该成功分割 Swagger 2.0 JSON 文件",
5556
},
5657
{
5758
name: "Swagger 2.0 YAML 文件",
58-
filePath: "/home/kcx/codeWorkspace/codebase-embedder/bin/swagger2.yaml",
59+
filePath: "../../bin/swagger2.yaml",
5960
expectError: true, // 目前不支持Swagger 2.0 YAML 文件
6061
expectCount: 2, // /users 和 /users/{id} 两个路径
6162
description: "应该成功分割 Swagger 2.0 YAML 文件",
@@ -76,10 +77,11 @@ func TestSplitOpenAPIFile(t *testing.T) {
7677
Path: filepath.Base(tt.filePath),
7778
Content: content,
7879
}
79-
80+
if !splitter.splitOptions.EnableOpenAPIParsing {
81+
assert.Error(t, err, "openapi file parse is close")
82+
}
8083
// 执行分割
8184
chunks, err := splitter.splitOpenAPIFile(sourceFile)
82-
8385
// 验证结果
8486
if tt.expectError {
8587
assert.Error(t, err, tt.description)
@@ -131,6 +133,7 @@ func TestValidateOpenAPISpec(t *testing.T) {
131133
MaxTokensPerChunk: 1000,
132134
SlidingWindowOverlapTokens: 100,
133135
EnableMarkdownParsing: true,
136+
EnableOpenAPIParsing: true,
134137
}
135138
splitter, err := NewCodeSplitter(splitOptions)
136139
assert.NoError(t, err)
@@ -181,6 +184,9 @@ func TestValidateOpenAPISpec(t *testing.T) {
181184

182185
for _, tt := range tests {
183186
t.Run(tt.name, func(t *testing.T) {
187+
if !splitter.splitOptions.EnableOpenAPIParsing {
188+
assert.Error(t, err, "openapi file parse is close")
189+
}
184190
version, err := splitter.validateOpenAPISpec(tt.content, tt.filePath)
185191

186192
if tt.expectError {
@@ -199,11 +205,15 @@ func TestSplitOpenAPIFileEdgeCases(t *testing.T) {
199205
MaxTokensPerChunk: 1000,
200206
SlidingWindowOverlapTokens: 100,
201207
EnableMarkdownParsing: true,
208+
EnableOpenAPIParsing: true,
202209
}
203210
splitter, err := NewCodeSplitter(splitOptions)
204211
assert.NoError(t, err)
205212

206213
t.Run("空路径的 OpenAPI 文档", func(t *testing.T) {
214+
if !splitter.splitOptions.EnableOpenAPIParsing {
215+
assert.Error(t, err, "openapi file parse is close")
216+
}
207217
doc := map[string]interface{}{
208218
"openapi": "3.0.0",
209219
"info": map[string]interface{}{
@@ -228,6 +238,9 @@ func TestSplitOpenAPIFileEdgeCases(t *testing.T) {
228238
})
229239

230240
t.Run("单个路径的文档", func(t *testing.T) {
241+
if !splitter.splitOptions.EnableOpenAPIParsing {
242+
assert.Error(t, err, "openapi file parse is close")
243+
}
231244
doc := map[string]interface{}{
232245
"openapi": "3.0.0",
233246
"info": map[string]interface{}{
@@ -284,12 +297,16 @@ func TestComplexOpenAPIDocumentSplitting(t *testing.T) {
284297
MaxTokensPerChunk: 1000,
285298
SlidingWindowOverlapTokens: 100,
286299
EnableMarkdownParsing: true,
300+
EnableOpenAPIParsing: true,
287301
}
288302
splitter, err := NewCodeSplitter(splitOptions)
289303
assert.NoError(t, err)
290304

291305
t.Run("Swagger 2.0 JSON 完整文档分割", func(t *testing.T) {
292-
content, err := os.ReadFile("/home/kcx/codeWorkspace/codebase-embedder/bin/swagger2.json")
306+
if !splitter.splitOptions.EnableOpenAPIParsing {
307+
assert.Error(t, err, "openapi file parse is close")
308+
}
309+
content, err := os.ReadFile("../../bin/swagger2.json")
293310
assert.NoError(t, err, "应该能够读取 swagger2.json 文件")
294311

295312
sourceFile := &types.SourceFile{
@@ -338,7 +355,10 @@ func TestComplexOpenAPIDocumentSplitting(t *testing.T) {
338355
})
339356

340357
t.Run("OpenAPI 3.0 JSON 文档分割", func(t *testing.T) {
341-
content, err := os.ReadFile("/home/kcx/codeWorkspace/codebase-embedder/bin/openapi3.json")
358+
if !splitter.splitOptions.EnableOpenAPIParsing {
359+
assert.Error(t, err, "openapi file parse is close")
360+
}
361+
content, err := os.ReadFile("../../bin/openapi3.json")
342362
assert.NoError(t, err, "应该能够读取 openapi3.json 文件")
343363

344364
sourceFile := &types.SourceFile{
@@ -382,7 +402,10 @@ func TestComplexOpenAPIDocumentSplitting(t *testing.T) {
382402
})
383403

384404
t.Run("OpenAPI 3.0 YAML 文档分割", func(t *testing.T) {
385-
content, err := os.ReadFile("/home/kcx/codeWorkspace/codebase-embedder/bin/openapi3.yaml")
405+
if !splitter.splitOptions.EnableOpenAPIParsing {
406+
assert.Error(t, err, "openapi file parse is close")
407+
}
408+
content, err := os.ReadFile("../../bin/openapi3.yaml")
386409
assert.NoError(t, err, "应该能够读取 openapi3.yaml 文件")
387410

388411
sourceFile := &types.SourceFile{
@@ -426,7 +449,10 @@ func TestComplexOpenAPIDocumentSplitting(t *testing.T) {
426449
})
427450

428451
t.Run("Swagger 2.0 YAML 文档分割", func(t *testing.T) {
429-
content, err := os.ReadFile("/home/kcx/codeWorkspace/codebase-embedder/bin/swagger2.yaml")
452+
if !splitter.splitOptions.EnableOpenAPIParsing {
453+
assert.Error(t, err, "openapi file parse is close")
454+
}
455+
content, err := os.ReadFile("../../bin/swagger2.yaml")
430456
assert.NoError(t, err, "应该能够读取 swagger2.yaml 文件")
431457

432458
sourceFile := &types.SourceFile{
@@ -476,6 +502,7 @@ func TestSplitOpenAPIFileErrorCases(t *testing.T) {
476502
MaxTokensPerChunk: 1000,
477503
SlidingWindowOverlapTokens: 100,
478504
EnableMarkdownParsing: true,
505+
EnableOpenAPIParsing: true,
479506
}
480507
splitter, err := NewCodeSplitter(splitOptions)
481508
assert.NoError(t, err)
@@ -519,6 +546,9 @@ func TestSplitOpenAPIFileErrorCases(t *testing.T) {
519546

520547
for _, tt := range tests {
521548
t.Run(tt.name, func(t *testing.T) {
549+
if !splitter.splitOptions.EnableOpenAPIParsing {
550+
assert.Error(t, err, "openapi file parse is close")
551+
}
522552
sourceFile := &types.SourceFile{
523553
CodebaseId: 1,
524554
CodebasePath: "/test/path",
@@ -547,6 +577,7 @@ func TestSplitMarkdownFile(t *testing.T) {
547577
MaxTokensPerChunk: 1000,
548578
SlidingWindowOverlapTokens: 100,
549579
EnableMarkdownParsing: true,
580+
EnableOpenAPIParsing: true,
550581
}
551582
splitter, err := NewCodeSplitter(splitOptions)
552583
assert.NoError(t, err)
@@ -659,6 +690,7 @@ func TestSplitMarkdownFileEdgeCases(t *testing.T) {
659690
MaxTokensPerChunk: 1000,
660691
SlidingWindowOverlapTokens: 100,
661692
EnableMarkdownParsing: true,
693+
EnableOpenAPIParsing: true,
662694
}
663695
splitter, err := NewCodeSplitter(splitOptions)
664696
assert.NoError(t, err)
@@ -719,6 +751,7 @@ func TestSplitRealMarkdownFile(t *testing.T) {
719751
MaxTokensPerChunk: 1000,
720752
SlidingWindowOverlapTokens: 100,
721753
EnableMarkdownParsing: true,
754+
EnableOpenAPIParsing: true,
722755
}
723756
splitter, err := NewCodeSplitter(splitOptions)
724757
assert.NoError(t, err)
@@ -733,7 +766,7 @@ func TestSplitRealMarkdownFile(t *testing.T) {
733766
}{
734767
{
735768
name: "自定义知识库功能文档",
736-
filePath: "/Code/Go/zgsm-ai/codebase-embedder/docs/自定义知识库功能.md",
769+
filePath: "../../bin/自定义知识库功能.md",
737770
expectError: false,
738771
expectCount: 3, // 预期会分成3个主要部分:Epic 1、Epic 2、效果验收方案
739772
description: "应该成功分割真实的自定义知识库功能文档",
@@ -831,6 +864,7 @@ func TestSplitMarkdownFileLargeContent(t *testing.T) {
831864
MaxTokensPerChunk: 50, // 设置较小的 token 限制来测试分割
832865
SlidingWindowOverlapTokens: 10,
833866
EnableMarkdownParsing: true,
867+
EnableOpenAPIParsing: true,
834868
}
835869
splitter, err := NewCodeSplitter(splitOptions)
836870
assert.NoError(t, err)

internal/svc/service_context.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ func NewServiceContext(ctx context.Context, c config.Config) (*ServiceContext, e
8585
MaxTokensPerChunk: c.IndexTask.EmbeddingTask.MaxTokensPerChunk,
8686
SlidingWindowOverlapTokens: c.IndexTask.EmbeddingTask.OverlapTokens,
8787
EnableMarkdownParsing: c.IndexTask.EmbeddingTask.EnableMarkdownParsing,
88+
EnableOpenAPIParsing: c.IndexTask.EmbeddingTask.EnableOpenAPIParsing,
8889
})
8990
if err != nil {
9091
return nil, err

0 commit comments

Comments
 (0)