Semantic similarity #13
base: main
Changes from 69 commits
@@ -0,0 +1,16 @@
```python
from automation.tasks import SemanticSimilarityGenerateTask

task = SemanticSimilarityGenerateTask(
    project_name="semantic_similarity_debug",
    task_name="semantic_generation_qwen3_14b_w4a16",
    branch="semantic_similarity",
    packages=["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
    dataset_args={"alpaca": "tatsu-lab/alpaca", "openplatypus": "garage-bAInd/Open-Platypus", "tulu": "allenai/tulu-3-sft-mixture"},
    model_id="RedHatAI/Qwen3-14B-quantized.w4a16",
    num_samples_per_dataset=330,
    max_new_tokens=1024,
    max_model_len=4096,
    semantic_similarity_args={"enable_chunked_prefill": True, "enforce_eager": True, "dtype": "auto", "device_map": "auto", "temperature": 0.0},
)

task.execute_remotely("oneshot-a100x1")
```
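The scoring example that follows compares this task's outputs against a reference task named `semantic_generation_qwen3_14b_base`. That baseline config is not included in this PR; a companion sketch might look like the following, where the unquantized `model_id` is an assumption and everything else mirrors the example above.

```python
# Hypothetical companion config (not part of this PR): generate reference outputs
# from the unquantized base model for the scoring task to compare against.
# The model_id below is an assumption.
from automation.tasks import SemanticSimilarityGenerateTask

task = SemanticSimilarityGenerateTask(
    project_name="semantic_similarity_debug",
    task_name="semantic_generation_qwen3_14b_base",
    branch="semantic_similarity",
    packages=["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
    dataset_args={"alpaca": "tatsu-lab/alpaca", "openplatypus": "garage-bAInd/Open-Platypus", "tulu": "allenai/tulu-3-sft-mixture"},
    model_id="Qwen/Qwen3-14B",  # assumed base checkpoint
    num_samples_per_dataset=330,
    max_new_tokens=1024,
    max_model_len=4096,
    semantic_similarity_args={"enable_chunked_prefill": True, "enforce_eager": True, "dtype": "auto", "temperature": 0.0},
)

task.execute_remotely("oneshot-a100x1")
```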
@@ -0,0 +1,17 @@
```python
from automation.tasks import SemanticSimilarityScoreTask

task = SemanticSimilarityScoreTask(
    project_name="semantic_similarity_debug",
    task_name="semantic_scoring_14b",
    branch="semantic_similarity",
    packages=["huggingface-hub==0.34.3", "networkx==3.4.2", "datasets==4.2.0", "rouge_score==0.1.2", "bert-score==0.3.13", "sentence-transformers==5.1.1", "matplotlib"],
    reference_model_project_name="semantic_similarity_debug",
    candidate_model_project_name="semantic_similarity_debug",
    reference_model_task_name="semantic_generation_qwen3_14b_base",
    candidate_model_task_name="semantic_generation_qwen3_14b_w4a16",
    sts_model_id="all-MiniLM-L6-v2",
    rouge_scores=["rouge1", "rougeL"],
    scoring_args={"f1": 0.75, "rouge1": 0.5, "sts": 0.75},
)

task.execute_remotely("oneshot-a100x1")
```
@@ -0,0 +1,166 @@
```python
import json
import os
import requests
from torch.cuda import device_count
from tqdm import tqdm
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

from automation.utils import kill_process_tree, parse_argument

try:
    from clearml import OutputModel, Task, Model
    clearml_available = True
except ImportError:
    clearml_available = False

RESULTS_DIR = os.path.join(os.getcwd(), "results")
os.makedirs(RESULTS_DIR, exist_ok=False)


def make_alpaca_platypus_prompt(sample):
    instruction = sample["instruction"].strip()
    input_text = sample.get("input", "").strip()
    prompt = (
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{input_text if input_text else 'N/A'}\n\n"
        f"### Response:\n"
    )
    return prompt


def make_tulu_prompt(sample):
    msgs = []
    for m in sample["messages"]:
        role = m.get("role", "user")
        content = m.get("content", "").strip()
        msgs.append(f"{role.upper()}: {content}")
    joined = "\n".join(msgs)
    prompt = f"### Conversation:\n{joined}\n\n### Response:\n"
    return prompt


def make_default_prompt(sample):
    prompt = f"### Input:\n{json.dumps(sample)}\n\n### Response:\n"
    return prompt


def semantic_similarity_generate_main(
    model_id,
    trust_remote_code,
    dataset_args,
    semantic_similarity_args,
    max_model_len,
    max_new_tokens,
    num_samples_per_dataset,
    clearml_model,
):
    from collections import defaultdict
    from huggingface_hub import snapshot_download

    all_prompts = []
    all_samples_dict = defaultdict(list)

    print(">>> Loading dataset...")
    for dataset_name, dataset_path in dataset_args.items():
        print(f">>> Loading dataset {dataset_name}...")
        dataset = load_dataset(dataset_path, split=f"train[:{num_samples_per_dataset}]")
        all_samples_dict[dataset_name].extend(dataset)

    for dataset_name, dataset_samples in all_samples_dict.items():
        print(f">>> Loading values for {dataset_name}...")
        for sample in dataset_samples:
            if dataset_name in ("alpaca", "openplatypus"):
                prompt = make_alpaca_platypus_prompt(sample)
            elif dataset_name == "tulu":
                prompt = make_tulu_prompt(sample)
            else:
                print("Using default prompt")
                prompt = make_default_prompt(sample)
            all_prompts.append(prompt)

    print("Define sampling parameters")
    sampling_params = SamplingParams(
        temperature=semantic_similarity_args.get("temperature", 0.0),
        max_tokens=max_new_tokens,
        stop=["### Instruction:", "### Input:", "### Response:"],
    )

    HUGGINGFACE_DIR = "/home"
    if clearml_model:
        HUGGINGFACE_DIR = Model(model_id).get_local_copy()
    else:
        print(">>> Downloading snapshot ...")
        snapshot_download(repo_id=model_id, local_dir=HUGGINGFACE_DIR)

    try:
        print(">>> Initializing vLLM...")
        llm = LLM(
            model=HUGGINGFACE_DIR,
            dtype=semantic_similarity_args.get("dtype", "auto"),
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=device_count(),
            enforce_eager=semantic_similarity_args.get("enforce_eager", True),
            enable_chunked_prefill=semantic_similarity_args.get("enable_chunked_prefill", True),
            max_model_len=max_model_len,
        )
```

Member (commenting on the `llm = LLM(...)` call): Why are we using the LLM class instead of vllm serve?

Collaborator (Author): The main branch has an old …
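As context for this thread, a server-based flow along the lines the reviewer mentions could look roughly like the sketch below. It is an illustration only, not part of the PR: the `vllm serve` invocation, endpoint URL, and client settings are assumptions, and the prompt simply reuses the Alpaca-style template from this file.

```python
# Illustrative sketch only (not part of this PR): query a model started with
# something like `vllm serve RedHatAI/Qwen3-14B-quantized.w4a16 --max-model-len 4096`
# through vLLM's OpenAI-compatible endpoint. The URL and api_key are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="RedHatAI/Qwen3-14B-quantized.w4a16",
    prompt=(
        "### Instruction:\nSummarize the Alpaca dataset in one sentence.\n\n"
        "### Input:\nN/A\n\n"
        "### Response:\n"
    ),
    max_tokens=1024,
    temperature=0.0,
    stop=["### Instruction:", "### Input:", "### Response:"],
)
print(completion.choices[0].text.strip())
```

The generation script under review continues: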
```python
        print("Completed the model initialization ")
        print(">>> Running vLLM generation...")
        outputs = llm.generate(all_prompts, sampling_params)
    except Exception as e:
        print(f"Error initializing LLM: {e}")
        raise  # re-raise: otherwise the return below would hit an undefined `outputs`

    return all_prompts, outputs


def main(configurations=None, args=None):
    if clearml_available:
        task = Task.current_task()
        args = task.get_parameters_as_dict(cast=True)["Args"]
        clearml_model = parse_argument(args["clearml_model"], bool)
    else:
        args = args["Args"]
        clearml_model = False

    # Parse arguments
    force_download = parse_argument(args["force_download"], bool)
    trust_remote_code = parse_argument(args["trust_remote_code"], bool)
    model_id = parse_argument(args["model_id"], str)
    max_model_len = parse_argument(args["max_model_len"], int)
    num_samples_per_dataset = parse_argument(args["num_samples_per_dataset"], int)
    max_new_tokens = parse_argument(args["max_new_tokens"], int)
    dataset_args = args.get("dataset_args", None)
    semantic_similarity_args = args.get("semantic_similarity_args", None)
    tags = args.get("tags", None)

    print(semantic_similarity_args)

    all_prompts, outputs = semantic_similarity_generate_main(
        model_id,
        trust_remote_code,
        dataset_args,
        semantic_similarity_args,
        max_model_len,
        max_new_tokens,
        num_samples_per_dataset,
        clearml_model,
    )

    OUTPUT_FILE = os.path.join(RESULTS_DIR, f"{model_id.replace('/', '_')}.jsonl")
    print(">>> Writing outputs to file...")
    with open(OUTPUT_FILE, "w") as fout:
        for idx, (prompt, output) in enumerate(zip(all_prompts, outputs)):
            response = output.outputs[0].text.strip()
            fout.write(json.dumps({
                "index": idx,
                "prompt": prompt,
                "response": response
            }) + "\n")

    print(f">>> Completed. Saved {len(outputs)} outputs to {OUTPUT_FILE}")

    if clearml_available:
        task.upload_artifact("jsonl_output", OUTPUT_FILE)


if __name__ == '__main__':
    main()
```
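Each generation task writes one JSON object per line with the keys `index`, `prompt`, and `response`. A minimal sketch for inspecting such an artifact locally follows; the file path is an assumption derived from the example config above, not something produced by this PR's code verbatim.

```python
# Minimal sketch (assumption, not part of this PR): read back the JSONL file
# written by the generation script. Each line carries "index", "prompt", "response".
import json

path = "results/RedHatAI_Qwen3-14B-quantized.w4a16.jsonl"  # assumed output path
with open(path) as fin:
    records = [json.loads(line) for line in fin]

print(len(records), "records")
print(records[0]["prompt"][:120])
print(records[0]["response"][:120])
```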
@@ -0,0 +1,158 @@
```python
import json
from bert_score import score
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

import os
from automation.utils import parse_argument

try:
    from clearml import OutputModel, Task
    clearml_available = True
except ImportError:
    clearml_available = False

SCORING_DIR = os.path.join(os.getcwd(), "scoresdirectory")
os.makedirs(SCORING_DIR, exist_ok=False)


def semantic_similarity_score_main(
    reference_file,
    candidate_file,
    sts_model_id,
    rouge_scores,
    bert_score_limit,
    rouge1_score_limit,
    sts_score_limit,
):
    # Load reference and candidate data
    with open(reference_file, "r") as f_ref, open(candidate_file, "r") as f_cand:
        reference_data = [json.loads(line) for line in f_ref]
        candidate_data = [json.loads(line) for line in f_cand]

    assert len(reference_data) == len(candidate_data), "Mismatched number of entries!"

    # Extract answers
    references = [ref.get("output") or ref["response"] for ref in reference_data]
    candidates = [cand["response"] for cand in candidate_data]

    # Load models
    sts_model = SentenceTransformer(sts_model_id)
    rouge = rouge_scorer.RougeScorer(rouge_scores, use_stemmer=True)

    # Compute BERTScore
    _, _, f1_scores = score(candidates, references, lang="en", verbose=False)
    # all_bert_f1 = [f1.item() for f1 in f1_scores]

    # Evaluate metrics
    all_rouge1_f1, all_rougeL_f1, all_sts, all_bert_f1 = [], [], [], []
    low_score_indices = []

    for i, (ref, cand, f1) in enumerate(zip(references, candidates, f1_scores)):
        emb_ref = sts_model.encode(ref, convert_to_tensor=True)
        emb_cand = sts_model.encode(cand, convert_to_tensor=True)
        raw_sts = util.cos_sim(emb_cand, emb_ref).item()
        sts = (raw_sts + 1) / 2  # Normalize to [0, 1]
        all_sts.append(sts)

        rouge_result = rouge.score(ref, cand)  # renamed to avoid shadowing the rouge_scores argument
        rouge1 = rouge_result["rouge1"].fmeasure
        rougeL = rouge_result["rougeL"].fmeasure
        all_rouge1_f1.append(rouge1)
        all_rougeL_f1.append(rougeL)

        all_bert_f1.append(f1.item())

        if f1 < bert_score_limit or rouge1 < rouge1_score_limit or sts < sts_score_limit:
            low_score_indices.append(i)

    # Compute averages
    num_samples = len(references)
    avg_bert = sum(all_bert_f1) / num_samples
    avg_rouge1 = sum(all_rouge1_f1) / num_samples
    avg_rougeL = sum(all_rougeL_f1) / num_samples
    avg_sts = sum(all_sts) / num_samples
    return avg_bert, avg_rouge1, avg_rougeL, avg_sts, low_score_indices


def main(configurations=None, args=None):
    if clearml_available:
        task = Task.current_task()
        args = task.get_parameters_as_dict(cast=True)["Args"]
    else:
        args = args["Args"]

    # Parse arguments
    clearml_model = parse_argument(args["clearml_model"], bool)
    force_download = parse_argument(args["force_download"], bool)
    trust_remote_code = parse_argument(args["trust_remote_code"], bool)
    scoring_args = args.get("scoring_args", {})
    sts_model_id = args.get("sts_model_id", "all-MiniLM-L6-v2")
    rouge_scores = args.get("rouge_scores", ["rouge1", "rougeL"])
    tags = args.get("tags", None)

    print(args)
    print(scoring_args)

    if clearml_available:
        reference_model_project_name = parse_argument(args["reference_model_project_name"], str)
        candidate_model_project_name = parse_argument(args["candidate_model_project_name"], str)
        candidate_model_task_name = parse_argument(args["candidate_model_task_name"], str)
        reference_model_task_name = parse_argument(args["reference_model_task_name"], str)
        reference_task = Task.query_tasks(
            project_name=reference_model_project_name,
            task_name=reference_model_task_name,
            task_filter={'order_by': ['-last_update'], 'status': ['completed']},
        )
        reference_task = Task.get_task(reference_task[0])
        reference_file = reference_task.artifacts['jsonl_output'].get_local_copy()

        candidate_task = Task.query_tasks(
            project_name=candidate_model_project_name,
            task_name=candidate_model_task_name,
            task_filter={'order_by': ['-last_update'], 'status': ['completed']},
        )
        candidate_task = Task.get_task(candidate_task[0])
        candidate_file = candidate_task.artifacts['jsonl_output'].get_local_copy()
    else:
        ref_model_jsonl = args.get("ref_model_jsonl")
        cand_model_jsonl = args.get("cand_model_jsonl")
        reference_file = os.path.join(SCORING_DIR, ref_model_jsonl)
        candidate_file = os.path.join(SCORING_DIR, cand_model_jsonl)

    bert_score_limit = scoring_args.get("f1", 0.75)
    rouge1_score_limit = scoring_args.get("rouge1", 0.6)
    sts_score_limit = scoring_args.get("sts", 0.75)

    avg_bert, avg_rouge1, avg_rougeL, avg_sts, low_score_indices = semantic_similarity_score_main(
        reference_file,
        candidate_file,
        sts_model_id,
        rouge_scores,
        bert_score_limit,
        rouge1_score_limit,
        sts_score_limit,
    )
    # Print summary
    print("\n=== Averages (for Google Sheets) ===")
    print("BERTScore F1 | ROUGE-1 F1 | ROUGE-L F1 | STS CosSim")
    print(f"{avg_bert:.3f} | {avg_rouge1:.3f} | {avg_rougeL:.3f} | {avg_sts:.3f}")

    print(f"\n=== Low-score indices (BERT < {bert_score_limit}, ROUGE-1 < {rouge1_score_limit}, STS < {sts_score_limit}) ===")
    print(low_score_indices)

    data = {
        "BERTScore F1": f"{avg_bert:.3f}",
        "ROUGE-1 F1": f"{avg_rouge1:.3f}",
        "ROUGE-L F1": f"{avg_rougeL:.3f}",
        "STS CosSim": f"{avg_sts:.3f}",
    }

    from pathlib import Path

    reference_file = Path(reference_file).stem.lower()
    candidate_file = Path(candidate_file).stem.lower()
    out_filename = f"scores_{reference_file}__vs__{candidate_file}.txt"
    out_filename = os.path.join(SCORING_DIR, out_filename)

    # Save results
    with open(out_filename, "w") as file:
        json.dump(data, file, indent=4)

    print(f"\nSaved results to {out_filename}")
    if clearml_available:
        task.upload_artifact("scores", data)
        task.upload_artifact("outscores", out_filename)
        print("Pushing clearml artifact")


if __name__ == '__main__':
    main()
```
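To restate the scoring rule implemented above (a summary of the code, not an addition to it): the embedding cosine similarity is shifted from $[-1, 1]$ into $[0, 1]$, and a sample index is flagged as low-scoring when any metric falls below its threshold from `scoring_args`; the symbols $\tau_{f1}$, $\tau_{rouge1}$, $\tau_{sts}$ are just names for those thresholds.

$$\mathrm{sts}_i = \frac{\cos\!\big(\mathbf{e}^{\text{cand}}_i,\ \mathbf{e}^{\text{ref}}_i\big) + 1}{2}, \qquad i \in \text{low} \iff \mathrm{BERT\ F1}_i < \tau_{f1} \ \lor\ \mathrm{ROUGE\text{-}1}_i < \tau_{rouge1} \ \lor\ \mathrm{sts}_i < \tau_{sts}$$

The reported averages are simple means of each per-sample metric over all $N$ reference/candidate pairs.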