     is_valid_ocid,
     upload_local_to_os,
 )
+from ads.aqua.config.config import evaluation_service_config
 from ads.aqua.constants import (
     CONSOLE_LINK_RESOURCE_TYPE_MAPPING,
     EVALUATION_REPORT,
@@ -191,7 +192,7 @@ def create(
             enable_spec=True
         ).inference
         for container in inference_config.values():
-            if container.name == runtime.image[:runtime.image.rfind(":")]:
+            if container.name == runtime.image[: runtime.image.rfind(":")]:
                 eval_inference_configuration = (
                     container.spec.evaluation_configuration
                 )
@@ -416,9 +417,11 @@ def create(
                 report_path=create_aqua_evaluation_details.report_path,
                 model_parameters=create_aqua_evaluation_details.model_parameters,
                 metrics=create_aqua_evaluation_details.metrics,
-                inference_configuration=eval_inference_configuration.to_filtered_dict()
-                if eval_inference_configuration
-                else {},
+                inference_configuration=(
+                    eval_inference_configuration.to_filtered_dict()
+                    if eval_inference_configuration
+                    else {}
+                ),
             )
         ).create(**kwargs)  ## TODO: decide what parameters will be needed
         logger.debug(
@@ -901,48 +904,8 @@ def get_status(self, eval_id: str) -> dict:

     def get_supported_metrics(self) -> dict:
         """Gets a list of supported metrics for evaluation."""
-        # TODO: implement it when starting to support more metrics.
         return [
-            {
-                "use_case": ["text_generation"],
-                "key": "bertscore",
-                "name": "bertscore",
-                "description": (
-                    "BERT Score is a metric for evaluating the quality of text "
-                    "generation models, such as machine translation or summarization. "
-                    "It utilizes pre-trained BERT contextual embeddings for both the "
-                    "generated and reference texts, and then calculates the cosine "
-                    "similarity between these embeddings."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "rouge",
-                "name": "rouge",
-                "description": (
-                    "ROUGE scores compare a candidate document to a collection of "
-                    "reference documents to evaluate the similarity between them. "
-                    "The metrics range from 0 to 1, with higher scores indicating "
-                    "greater similarity. ROUGE is more suitable for models that don't "
-                    "include paraphrasing and do not generate new text units that don't "
-                    "appear in the references."
-                ),
-                "args": {},
-            },
-            {
-                "use_case": ["text_generation"],
-                "key": "bleu",
-                "name": "bleu",
-                "description": (
-                    "BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the "
-                    "quality of text which has been machine-translated from one natural language to another. "
-                    "Quality is considered to be the correspondence between a machine's output and that of a "
-                    "human: 'the closer a machine translation is to a professional human translation, "
-                    "the better it is'."
-                ),
-                "args": {},
-            },
+            item.to_dict() for item in evaluation_service_config().ui_config.metrics
         ]

     @telemetry(entry_point="plugin=evaluation&action=load_metrics", name="aqua")
@@ -1225,7 +1188,7 @@ def _delete_job_and_model(job, model):
                 f"Exception message: {ex}"
             )

-    def load_evaluation_config(self, eval_id):
+    def load_evaluation_config(self):
         """Loads evaluation config."""
         return {
             "model_params": {
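Note on the metrics change above: a minimal usage sketch (not part of this commit) of the new config-driven path. It assumes evaluation_service_config() is importable as in the added import line, that ui_config.metrics is iterable, and that each entry's to_dict() keeps roughly the same keys ("key", "name", "description") as the hardcoded list it replaces; the helper name below is hypothetical.

from ads.aqua.config.config import evaluation_service_config

def list_supported_metric_keys() -> list:
    """Return just the metric keys exposed through the evaluation service config."""
    # Mirrors the refactored get_supported_metrics: pull metric entries from the
    # centralized service config rather than a hardcoded list.
    metrics = [
        item.to_dict() for item in evaluation_service_config().ui_config.metrics
    ]
    # Assumes each metric dict still carries a "key" field, as the previous
    # hardcoded entries did.
    return [m.get("key") for m in metrics]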