Skip to content

Commit e7e2f88

Browse files
authored
Fixes for index_config (#738)
This PR includes a couple of fixes to the config logic. - `shorten_dimensions` was dropped by mistake in #502; added it back, together with some unit tests. - In `from_index_name()`, split each key-value pair only at the first hyphen (`split("-", 1)`) to cater for property values containing hyphens, for example embedding model names (+ unit test). - Booleans were not properly decoded; added a fix + tests.
1 parent d4d208e commit e7e2f88

File tree

5 files changed

+60
-8
lines changed

5 files changed

+60
-8
lines changed

rag_experiment_accelerator/config/embedding_model_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ class EmbeddingModelConfig(BaseConfig):
77
type: str = "sentence-transformer"
88
model_name: str = "all-mpnet-base-v2"
99
dimension: int = None
10+
shorten_dimensions: bool = False

rag_experiment_accelerator/config/index_config.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,14 @@ def __from_label_properties(cls, properties: dict) -> "IndexConfig":
8989
ef_construction=int(properties[IndexKey.EF_CONSTRUCTION]),
9090
ef_search=int(properties[IndexKey.EF_SEARCH]),
9191
chunking=ChunkingConfig(
92-
preprocess=bool(properties[IndexKey.PREPROCESS]),
92+
preprocess=bool(int(properties[IndexKey.PREPROCESS])),
9393
chunk_size=int(properties[IndexKey.CHUNK_SIZE]),
9494
chunking_strategy=properties[IndexKey.CHUNKING_STRATEGY],
9595
overlap_size=int(properties[IndexKey.OVERLAP_SIZE]),
96-
generate_title=bool(properties[IndexKey.GENERATE_TITLE]),
97-
generate_summary=bool(properties[IndexKey.GENERATE_SUMMARY]),
96+
generate_title=bool(int(properties[IndexKey.GENERATE_TITLE])),
97+
generate_summary=bool(int(properties[IndexKey.GENERATE_SUMMARY])),
9898
override_content_with_summary=bool(
99-
properties[IndexKey.OVERRIDE_CONTENT_WITH_SUMMARY]
99+
int(properties[IndexKey.OVERRIDE_CONTENT_WITH_SUMMARY])
100100
),
101101
),
102102
embedding_model=EmbeddingModelConfig(
@@ -132,7 +132,7 @@ def from_index_name(cls, index_name: str) -> "IndexConfig":
132132
Reverse of index_name().
133133
"""
134134

135-
key_values = [kv.split("-") for kv in index_name.split("_")]
135+
key_values = [kv.split("-", 1) for kv in index_name.split("_")]
136136
properties = {kv[0]: kv[1].strip() for kv in key_values}
137137

138138
try:

rag_experiment_accelerator/config/tests/data/config.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@
2828
{
2929
"type": "azure",
3030
"model_name": "text-embedding-ada-002"
31+
},
32+
{
33+
"type": "azure",
34+
"model_name": "text-embedding-3-large",
35+
"dimension": 3072
36+
},
37+
{
38+
"type": "azure",
39+
"model_name": "text-embedding-3-small",
40+
"dimension": 256,
41+
"shorten_dimensions": true
3142
}
3243
],
3344
"sampling": {

rag_experiment_accelerator/config/tests/test_config.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,20 @@ def test_config_init(mock_validate_json_with_schema, mock_create_embedding_model
3232
embedding_model_1 = MagicMock()
3333
embedding_model_1.model_name.return_value = "all-MiniLM-L6-v2"
3434
embedding_model_1.dimension.return_value = 384
35+
embedding_model_1.shorten_dimensions.return_value = False
3536
embedding_model_2 = MagicMock()
3637
embedding_model_2.model_name.return_value = "text-embedding-ada-002"
3738
embedding_model_2.dimension.return_value = 1536
38-
mock_create_embedding_model.side_effect = [embedding_model_1, embedding_model_2]
39+
embedding_model_2.shorten_dimensions.return_value = False
40+
embedding_model_3 = MagicMock()
41+
embedding_model_3.model_name.return_value = "text-embedding-3-large"
42+
embedding_model_3.dimension.return_value = 3072
43+
embedding_model_3.shorten_dimensions.return_value = False
44+
embedding_model_4 = MagicMock()
45+
embedding_model_4.model_name.return_value = "text-embedding-3-small"
46+
embedding_model_4.dimension.return_value = 256
47+
embedding_model_4.shorten_dimensions.return_value = True
48+
mock_create_embedding_model.side_effect = [embedding_model_1, embedding_model_2, embedding_model_3, embedding_model_4]
3949
mock_validate_json_with_schema.return_value = (True, None)
4050

4151
config = Config.from_path(environment, config_path)
@@ -88,6 +98,15 @@ def test_config_init(mock_validate_json_with_schema, mock_create_embedding_model
8898
assert index.embedding_model[1].type == mock_embedding[1]["type"]
8999
assert index.embedding_model[1].model_name == mock_embedding[1]["model_name"]
90100

101+
assert index.embedding_model[2].type == mock_embedding[2]["type"]
102+
assert index.embedding_model[2].model_name == mock_embedding[2]["model_name"]
103+
assert index.embedding_model[2].dimension == mock_embedding[2]["dimension"]
104+
105+
assert index.embedding_model[3].type == mock_embedding[3]["type"]
106+
assert index.embedding_model[3].model_name == mock_embedding[3]["model_name"]
107+
assert index.embedding_model[3].dimension == mock_embedding[3]["dimension"]
108+
assert index.embedding_model[3].shorten_dimensions == mock_embedding[3]["shorten_dimensions"]
109+
91110
model1 = config.get_embedding_model(config.index.embedding_model[0].model_name)
92111
assert model1.model_name.return_value == "all-MiniLM-L6-v2"
93112
assert model1.dimension.return_value == 384

rag_experiment_accelerator/config/tests/test_index_config.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,33 @@ def test_index_config_to_index_name():
3333

3434

3535
def test_index_name_to_index_config():
36-
index_name = "idx-prefix_efc-3_efs-4_em-modelname_sp-10_p-0_cs-1_st-abcd_o-2_t-0_s-0_oc-0_d-100"
36+
index_name = "idx-prefix_efc-3_efs-4_em-modelname_sp-10_p-0_cs-1_st-abcd_o-2_t-0_s-1_oc-0_d-100"
3737

3838
index_config = IndexConfig.from_index_name(index_name)
3939

4040
assert index_config.index_name_prefix == "prefix"
4141
assert index_config.chunking.chunk_size == 1
4242
assert index_config.chunking.chunking_strategy == "abcd"
4343
assert index_config.chunking.overlap_size == 2
44+
assert index_config.chunking.generate_summary is True
45+
assert index_config.chunking.generate_title is False
4446
assert index_config.embedding_model.model_name == "modelname"
4547
assert index_config.embedding_model.dimension == 100
4648
assert index_config.ef_construction == 3
4749
assert index_config.ef_search == 4
4850

4951

5052
def test_index_name_to_index_config_shuffled_order():
51-
index_name = "idx-prefix_efc-3_efs-4_em-modelname_p-0_cs-1_st-abcd_o-2_t-0_s-0_oc-0_sp-10_d-100"
53+
index_name = "idx-prefix_efc-3_efs-4_em-modelname_p-0_cs-1_st-abcd_o-2_t-0_s-1_oc-0_sp-10_d-100"
5254

5355
index_config = IndexConfig.from_index_name(index_name)
5456

5557
assert index_config.index_name_prefix == "prefix"
5658
assert index_config.chunking.chunk_size == 1
5759
assert index_config.chunking.chunking_strategy == "abcd"
5860
assert index_config.chunking.overlap_size == 2
61+
assert index_config.chunking.generate_summary is True
62+
assert index_config.chunking.generate_title is False
5963
assert index_config.embedding_model.model_name == "modelname"
6064
assert index_config.embedding_model.dimension == 100
6165
assert index_config.ef_construction == 3
@@ -73,3 +77,20 @@ def test_index_name_to_index_config_missing_property():
7377
assert True
7478
else:
7579
assert False, "Expected ValueError to be thrown"
80+
81+
82+
def test_index_name_to_index_config_hyphens():
83+
index_name = (
84+
"idx-prefix_efc-3_efs-4_em-model-name_sp-10_p-0_cs-1_st-ab-cd_o-2_t-0_s-0_oc-0_d-100"
85+
)
86+
87+
index_config = IndexConfig.from_index_name(index_name)
88+
89+
assert index_config.index_name_prefix == "prefix"
90+
assert index_config.chunking.chunk_size == 1
91+
assert index_config.chunking.chunking_strategy == "ab-cd"
92+
assert index_config.chunking.overlap_size == 2
93+
assert index_config.embedding_model.model_name == "model-name"
94+
assert index_config.embedding_model.dimension == 100
95+
assert index_config.ef_construction == 3
96+
assert index_config.ef_search == 4

0 commit comments

Comments
 (0)