Commits (changes shown from 69 of 123 commits)
757f02b
base semantic gen
Oct 15, 2025
95d93d2
base requirements
Oct 15, 2025
8b7bf3c
simple package
Oct 15, 2025
5e5276c
base method added
Oct 15, 2025
b8ec696
remove missing libraru inserts
Oct 15, 2025
113f0df
clean up variables
Oct 15, 2025
aa2de29
clean up input variables
Oct 15, 2025
02d6087
clean up prompt logs
Oct 15, 2025
9faf81d
fix device count
Oct 15, 2025
8e407a0
add semantic_similarity_args
Oct 15, 2025
82eaa9a
update model input vars
Oct 16, 2025
70d0ad4
added more log
Oct 16, 2025
bf7d817
initialize vllm
Oct 16, 2025
1d85fa6
download model beforehand
Oct 16, 2025
6be969d
template score
Oct 16, 2025
0798f61
update task name
Oct 16, 2025
ca9ff84
rouge score array
Oct 16, 2025
21f54d0
base scoring script
Oct 16, 2025
37b82d9
remove snapshot downlad
Oct 16, 2025
7e13853
test vllm server
Oct 16, 2025
1f30150
add requests query
Oct 16, 2025
a153996
clean libs
Oct 16, 2025
92fe4c9
fix parse issue
Oct 16, 2025
a8751c2
use start_vllm_server
Oct 16, 2025
5b9539e
test llm generate
Oct 16, 2025
2a73701
updated vllm server
Oct 16, 2025
a8547f5
add debug logging level
Oct 16, 2025
274948b
base LLM
Oct 16, 2025
051afae
try except for vllm
Oct 16, 2025
d51d3cf
use vllmm server
Oct 16, 2025
e9cd534
retry snapshot download
Oct 17, 2025
ab7fe4a
snapshot down
Oct 17, 2025
bc2897b
snapshot with download_dir
Oct 17, 2025
78b005a
add model dir
Oct 17, 2025
aac6d69
add dtype
Oct 17, 2025
73abff7
model dir
Oct 17, 2025
41bc34a
add trust remote code
Oct 17, 2025
d37541a
download safetensors
Oct 17, 2025
67fee2c
move vllm server up
Oct 17, 2025
e59e1df
use the same dir
Oct 17, 2025
bef036e
redo snapshot download
Oct 17, 2025
044cef7
trigger
Oct 17, 2025
77bc96d
combined
Oct 17, 2025
67fca56
use vllm server
Oct 17, 2025
bf80fd4
add process tree import
Oct 17, 2025
f5a21f5
add clearml conditional
Oct 17, 2025
b471178
add task import
Oct 17, 2025
08914e5
retrieve current task
Oct 17, 2025
2c3a299
output server logs
Oct 17, 2025
c1a0b3c
print vllm command
Oct 17, 2025
f83b044
output as json
Oct 20, 2025
d9b447a
output artifact
Oct 20, 2025
8ebd724
retry with python llm interface
Oct 22, 2025
b9ae4c1
reference the downloaded model
Oct 23, 2025
ecf9f4b
add results directory creation
Oct 23, 2025
05b8f0f
fix download and read
Oct 23, 2025
389a5d8
clean up repo
Oct 23, 2025
5e84115
clean up scoring and remove hardcoding
Oct 23, 2025
64c5369
add low score indices
Oct 23, 2025
bed4991
add f1 score to enum
Oct 23, 2025
29b650c
simplify output path
Oct 23, 2025
b84a102
add examples and clean up
Oct 23, 2025
c4e1aea
clean up example
Oct 23, 2025
7cd5a3a
add scoring args dict
Oct 23, 2025
d5e4210
add support for variable score limits
Oct 23, 2025
9d349e8
clearml get model_id
Oct 28, 2025
98609a2
add clearml model import
Oct 28, 2025
391ecc5
check for clearml model
Oct 29, 2025
23a5f95
reference huggingface dir
Oct 29, 2025
5feeff7
implement semantic feedback
Oct 31, 2025
4768cf8
add db path debug
Oct 31, 2025
2b3a4f7
more debug
Oct 31, 2025
6dcfd74
debug dataset_args
Oct 31, 2025
11ba9fc
hardcode dataset args
Oct 31, 2025
0da978e
update examples
Oct 31, 2025
44f0c62
moved from utils
Oct 31, 2025
3b09a3f
dataset args through parse
Oct 31, 2025
640ae0a
add more dataset arg prints
Oct 31, 2025
ae9d928
add dict flattening
Oct 31, 2025
e74e0d6
added dictionary flattening
Nov 3, 2025
0ee0288
update prompt to chat
Nov 11, 2025
8c71fac
string output prompts
Nov 11, 2025
1ae047a
moved from prompt to conversation
Nov 12, 2025
e382646
retry with tqdm
Nov 12, 2025
a3ecdb5
re-add messages list
Nov 12, 2025
9f56b0b
clean up convos
Nov 12, 2025
57a2596
add debug to know which model is being initialised
Nov 13, 2025
b36b998
add mistral exception
Nov 13, 2025
91009b9
allow existing results dir
Nov 13, 2025
3c400ec
snapshot download only
Nov 13, 2025
e21f100
add tokenizer mode for mistral
Nov 13, 2025
96e33be
llm direct from model id
Nov 13, 2025
ff351b9
add print
Nov 13, 2025
48c0e6d
fix format
Nov 14, 2025
ce27ec4
add list dir
Nov 14, 2025
aeaca8f
add params download
Nov 14, 2025
40f09ef
download to tmp
Nov 14, 2025
30b3906
move params.json
Nov 14, 2025
67a9dae
default path
Nov 14, 2025
fdd7607
default path
Nov 14, 2025
78ed285
use recursive search
Nov 14, 2025
73c2c85
use recursive search
Nov 14, 2025
305fa68
apply download of param.json to only quantized models
Nov 14, 2025
ece8320
ref hf dir
Nov 14, 2025
2006587
remove formatting
Nov 14, 2025
0ed95b0
remove formatting
Nov 14, 2025
6821871
remove formatting
Nov 14, 2025
9e87114
remove formatting
Nov 14, 2025
95d924a
remove formatting
Nov 14, 2025
f346705
remove formatting
Nov 14, 2025
ddc3dbd
remove formatting
Nov 14, 2025
5b599d9
use param.json
Nov 14, 2025
64dd157
use param.json
Nov 14, 2025
c29539f
use param.json
Nov 14, 2025
5362535
add formatting
Nov 14, 2025
5df5eb5
without params
Nov 14, 2025
c4a47b1
remove format
Nov 14, 2025
486914d
only change format for mistral base
Nov 14, 2025
2c9f47c
retrieve use role only for tulu
Nov 14, 2025
4da2ff3
clean up
Nov 14, 2025
cd74379
rerun with tulu
Nov 19, 2025
cd996af
return msg
Nov 19, 2025
2884ed5
clean tulu
Nov 19, 2025
16 changes: 16 additions & 0 deletions examples/semantic_similarity_generate.py
@@ -0,0 +1,16 @@
from automation.tasks import SemanticSimilarityGenerateTask

task = SemanticSimilarityGenerateTask(
project_name="semantic_similarity_debug",
task_name="semantic_generation_qwen3_14b_w4a16",
branch="semantic_similarity",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
dataset_args = {"alpaca": "tatsu-lab/alpaca", "openplatypus": "garage-bAInd/Open-Platypus", "tulu": "allenai/tulu-3-sft-mixture"},
model_id="RedHatAI/Qwen3-14B-quantized.w4a16",
num_samples_per_dataset=330,
max_new_tokens=1024,
max_model_len=4096,
semantic_similarity_args={"enable-chunked-prefill": True, "enforce_eager": True, "dtype" :"auto", "device_map": "auto", "temperature": 0.0},
)

task.execute_remotely("oneshot-a100x1")
17 changes: 17 additions & 0 deletions examples/semantic_similarity_score.py
@@ -0,0 +1,17 @@
from automation.tasks import SemanticSimilarityScoreTask

task = SemanticSimilarityScoreTask(
project_name="semantic_similarity_debug",
task_name="semantic_scoring_14b",
branch="semantic_similarity",
packages = ["huggingface-hub==0.34.3", "networkx==3.4.2", "datasets==4.2.0", "rouge_score==0.1.2", "bert-score==0.3.13", "sentence-transformers==5.1.1", "matplotlib"],
reference_model_project_name="semantic_similarity_debug",
candidate_model_project_name="semantic_similarity_debug",
reference_model_task_name="semantic_generation_qwen3_14b_base",
candidate_model_task_name="semantic_generation_qwen3_14b_w4a16",
sts_model_id="all-MiniLM-L6-v2",
rouge_scores=["rouge1", "rougeL"],
scoring_args={"f1": 0.75, "rouge1": 0.5, "sts": 0.75},
)

task.execute_remotely("oneshot-a100x1")
6 changes: 5 additions & 1 deletion src/automation/tasks/__init__.py
@@ -1,4 +1,6 @@
from automation.tasks.base_task import BaseTask
from automation.tasks.semantic_similarity_generate import SemanticSimilarityGenerateTask
from automation.tasks.semantic_similarity_score import SemanticSimilarityScoreTask
from automation.tasks.llmcompressor import LLMCompressorTask
from automation.tasks.lmeval import LMEvalTask
from automation.tasks.lighteval import LightEvalTask
@@ -7,9 +9,11 @@

__all__ = [
"BaseTask",
"SemanticSimilarityGenerateTask",
"SemanticSimilarityScoreTask",
"LLMCompressorTask",
"LMEvalTask",
"LightEvalTask",
"GuideLLMTask",
"DebugTask",
]
]
166 changes: 166 additions & 0 deletions src/automation/tasks/scripts/semantic_similarity_generate_script.py
@@ -0,0 +1,166 @@
import json
import os
import requests
from torch.cuda import device_count
from tqdm import tqdm
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

from automation.utils import kill_process_tree, parse_argument

try:
from clearml import OutputModel, Task, Model
clearml_available = True
except ImportError:
clearml_available = False

RESULTS_DIR = os.path.join(os.getcwd(), "results")
os.makedirs(RESULTS_DIR, exist_ok=True)  # tolerate an existing results directory on re-runs

def make_alpaca_platypus_prompt(sample):
instruction = sample["instruction"].strip()
input_text = sample.get("input", "").strip()
prompt = (
f"### Instruction:\n{instruction}\n\n"
f"### Input:\n{input_text if input_text else 'N/A'}\n\n"
f"### Response:\n"
)

return prompt


def make_tulu_prompt(sample):
msgs = []
for m in sample["messages"]:
role = m.get("role", "user")
content = m.get("content", "").strip()
msgs.append(f"{role.upper()}: {content}")
joined = "\n".join(msgs)
prompt = f"### Conversation:\n{joined}\n\n### Response:\n"

return prompt


def make_default_prompt(sample):
prompt = f"### Input:\n{json.dumps(sample)}\n\n### Response:\n"
return prompt


def semantic_similarity_generate_main(
model_id,
trust_remote_code,
dataset_args,
semantic_similarity_args,
max_model_len,
max_new_tokens,
num_samples_per_dataset,
clearml_model,
):
from collections import defaultdict
from huggingface_hub import snapshot_download

all_prompts = []
all_samples_dict = defaultdict(list)

print(">>> Loading dataset...")
for dataset_name, dataset_path in dataset_args.items():
print(f">>> Loading dataset {dataset_name}...")
dataset = load_dataset(dataset_path, split=f"train[:{num_samples_per_dataset}]")
all_samples_dict[dataset_name].extend(dataset)

for dataset_name, dataset_samples in all_samples_dict.items():
print(f">>> Loading values for {dataset_name}...")
for sample in dataset_samples:
if dataset_name == "alpaca" or (dataset_name == "openplatypus"):
prompt = make_alpaca_platypus_prompt(sample)
elif dataset_name == "tulu":
prompt = make_tulu_prompt(sample)
else:
print("Using default prompt")
prompt = make_default_prompt(sample)
all_prompts.append(prompt)

print("Define sampling parameters")
sampling_params = SamplingParams(
temperature=semantic_similarity_args.get("temperature", 0.0),
max_tokens=max_new_tokens,
stop=["### Instruction:", "### Input:", "### Response:"],
)

HUGGINGFACE_DIR = "/home"
if clearml_model:
HUGGINGFACE_DIR = Model(model_id).get_local_copy()
else:
print(">>> Downloading snapshot ...")
snapshot_download(repo_id=model_id, local_dir=HUGGINGFACE_DIR)

try:
print(">>> Initializing vLLM...")
llm = LLM(
Member: Why are we using the LLM class instead of vllm serve?

Collaborator (Author): The main branch has an old `src/automation/vllm/server.py` file with the `VLLMServer` class, but other branches use `start_vllm_server`. Also, shouldn't the output of the LLM class be identical to the vllm serve API endpoint?

(A sketch of the vllm serve alternative follows this file's diff.)

model=HUGGINGFACE_DIR,
dtype=semantic_similarity_args.get("dtype", "auto"),
trust_remote_code=trust_remote_code,
tensor_parallel_size=device_count(),
enforce_eager=semantic_similarity_args.get("enforce_eager", True),
enable_chunked_prefill=semantic_similarity_args.get("enable_chunked_prefill", True),
max_model_len=max_model_len
)
print("Completed the model initialization ")
print(">>> Running vLLM generation...")
outputs = llm.generate(all_prompts, sampling_params)
except Exception as e:
print(f"Error initializing LLM: {e}")
raise  # re-raise so the task fails loudly instead of hitting an undefined `outputs` below

return all_prompts, outputs

def main(configurations=None, args=None):
if clearml_available:
task = Task.current_task()
args = task.get_parameters_as_dict(cast=True)["Args"]
clearml_model = parse_argument(args["clearml_model"], bool)
else:
args = args["Args"]
clearml_model = False

# Parse arguments
force_download = parse_argument(args["force_download"], bool)
trust_remote_code = parse_argument(args["trust_remote_code"], bool)
model_id = parse_argument(args["model_id"], str)
max_model_len = parse_argument(args["max_model_len"], int)
num_samples_per_dataset = parse_argument(args["num_samples_per_dataset"], int)
max_new_tokens = parse_argument(args["max_new_tokens"], int)
dataset_args = args.get("dataset_args", None)
semantic_similarity_args = args.get("semantic_similarity_args", None)
tags = args.get("tags", None)

print(semantic_similarity_args)
all_prompts, outputs = semantic_similarity_generate_main(
model_id,
trust_remote_code,
dataset_args,
semantic_similarity_args,
max_model_len,
max_new_tokens,
num_samples_per_dataset,
clearml_model,
)

OUTPUT_FILE = os.path.join(RESULTS_DIR, f"{model_id.replace('/', '_')}.jsonl")
print(">>> Writing outputs to file...")
with open(OUTPUT_FILE, "w") as fout:
for idx, (prompt, output) in enumerate(zip(all_prompts, outputs)):
response = output.outputs[0].text.strip()
fout.write(json.dumps({
"index": idx,
"prompt": prompt,
"response": response
}) + "\n")

print(f">>> Completed. Saved {len(outputs)} outputs to {OUTPUT_FILE}")

if clearml_available:
task.upload_artifact("jsonl_output", OUTPUT_FILE)

if __name__ == '__main__':
main()
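Editorial aside on the review thread above (LLM class vs. vllm serve): below is a minimal sketch of how the same prompts could be sent to a separately started vLLM OpenAI-compatible server using `requests`, which this script already imports. The function name, base URL, port, and timeout are illustrative assumptions and not part of this PR; the endpoint and payload fields follow vLLM's OpenAI-compatible completions API.

# Editorial sketch (not part of this PR): query a running `vllm serve` endpoint
# instead of constructing an in-process LLM object.
import requests

def generate_via_server(prompts, model_name, base_url="http://localhost:8000", max_tokens=1024):
    responses = []
    for prompt in prompts:
        payload = {
            "model": model_name,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "stop": ["### Instruction:", "### Input:", "### Response:"],
        }
        # /v1/completions is vLLM's OpenAI-compatible completions endpoint
        r = requests.post(f"{base_url}/v1/completions", json=payload, timeout=600)
        r.raise_for_status()
        responses.append(r.json()["choices"][0]["text"].strip())
    return responses

Whether the server route or the LLM class is preferable mostly comes down to process isolation and reuse of an already running endpoint; the generated text should be equivalent for the same sampling parameters.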
158 changes: 158 additions & 0 deletions src/automation/tasks/scripts/semantic_similarity_score_script.py
@@ -0,0 +1,158 @@
import json
from bert_score import score
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import os
from automation.utils import parse_argument

try:
from clearml import OutputModel, Task
clearml_available = True
except ImportError:
clearml_available = False

SCORING_DIR = os.path.join(os.getcwd(), "scoresdirectory")
os.makedirs(SCORING_DIR, exist_ok=True)  # tolerate an existing scores directory on re-runs

def semantic_similarity_score_main(
reference_file,
candidate_file,
sts_model_id,
rouge_scores,
bert_score_limit,
rouge1_score_limit,
sts_score_limit,
):
# Load reference and candidate data
with open(reference_file, "r") as f_ref, open(candidate_file, "r") as f_cand:
reference_data = [json.loads(line) for line in f_ref]
candidate_data = [json.loads(line) for line in f_cand]

assert len(reference_data) == len(candidate_data), "Mismatched number of entries!"

# Extract answers
references = [ref.get("output") or ref["response"] for ref in reference_data]
candidates = [cand["response"] for cand in candidate_data]

# Load models
sts_model = SentenceTransformer(sts_model_id)
rouge = rouge_scorer.RougeScorer(rouge_scores, use_stemmer=True)

# Compute BERTScore
_, _, f1_scores = score(candidates, references, lang="en", verbose=False)
#all_bert_f1 = [ f1.item() for f1 in f1_scores ]

# Evaluate metrics
all_rouge1_f1, all_rougeL_f1, all_sts, all_bert_f1 = [], [], [], []
low_score_indices = []

for i, (ref, cand, f1) in enumerate(zip(references, candidates, f1_scores)):
emb_ref = sts_model.encode(ref, convert_to_tensor=True)
emb_cand = sts_model.encode(cand, convert_to_tensor=True)
raw_sts = util.cos_sim(emb_cand, emb_ref).item()
sts = (raw_sts + 1) / 2 # Normalize to [0, 1]
all_sts.append(sts)

rouge_result = rouge.score(ref, cand)  # renamed to avoid shadowing the rouge_scores argument
rouge1 = rouge_result["rouge1"].fmeasure
rougeL = rouge_result["rougeL"].fmeasure
all_rouge1_f1.append(rouge1)
all_rougeL_f1.append(rougeL)

all_bert_f1.append(f1.item())

if f1 < bert_score_limit or rouge1 < rouge1_score_limit or sts < sts_score_limit:
low_score_indices.append(i)

# Compute averages
num_samples = len(references)
avg_bert = sum(all_bert_f1) / num_samples
avg_rouge1 = sum(all_rouge1_f1) / num_samples
avg_rougeL = sum(all_rougeL_f1) / num_samples
avg_sts = sum(all_sts) / num_samples
return avg_bert, avg_rouge1, avg_rougeL, avg_sts, low_score_indices

def main(configurations=None, args=None):
if clearml_available:
task = Task.current_task()
args = task.get_parameters_as_dict(cast=True)["Args"]
else:
args = args["Args"]

# Parse arguments
clearml_model = parse_argument(args["clearml_model"], bool)
force_download = parse_argument(args["force_download"], bool)
trust_remote_code = parse_argument(args["trust_remote_code"], bool)
scoring_args = args.get("scoring_args", dict)
sts_model_id = args.get("sts_model_id", str)
rouge_scores= args.get("rouge_scores", list)
tags = args.get("tags", None)

print(args)
print(scoring_args)

if clearml_available:
reference_model_project_name = parse_argument(args["reference_model_project_name"], str)
candidate_model_project_name = parse_argument(args["candidate_model_project_name"], str)
candidate_model_task_name = parse_argument(args["candidate_model_task_name"], str)
reference_model_task_name = parse_argument(args["reference_model_task_name"], str)
reference_task = Task.query_tasks(project_name=reference_model_project_name, task_name=reference_model_task_name, task_filter={'order_by': ['-last_update'], 'status': ['completed']})
reference_task = Task.get_task(reference_task[0])
reference_file = reference_task.artifacts['jsonl_output'].get_local_copy()

candidate_task = Task.query_tasks(project_name=candidate_model_project_name, task_name=candidate_model_task_name, task_filter={'order_by': ['-last_update'], 'status': ['completed']})
candidate_task = Task.get_task(candidate_task[0])
candidate_file = candidate_task.artifacts['jsonl_output'].get_local_copy()
else:
ref_model_jsonl = args.get("ref_model_jsonl", str)
cand_model_jsonl = args.get("cand_model_jsonl", str)
reference_file = os.path.join(SCORING_DIR, ref_model_jsonl)
candidate_file = os.path.join(SCORING_DIR, cand_model_jsonl)

bert_score_limit = scoring_args.get("f1",0.75)
rouge1_score_limit = scoring_args.get("rouge1",0.6)
sts_score_limit = scoring_args.get("sts",0.75)

avg_bert, avg_rouge1, avg_rougeL, avg_sts, low_score_indices = semantic_similarity_score_main(
reference_file,
candidate_file,
sts_model_id,
rouge_scores,
bert_score_limit,
rouge1_score_limit,
sts_score_limit,
)
# Print summary
print("\n=== Averages (for Google Sheets) ===")
print("BERTScore F1 | ROUGE-1 F1 | ROUGE-L F1 | STS CosSim")
print(f"{avg_bert:.3f} | {avg_rouge1:.3f} | {avg_rougeL:.3f} | {avg_sts:.3f}")

print(f"\n=== Low-score indices (BERT < {bert_score_limit}, ROUGE-1 < {rouge1_score_limit}, STS < {sts_score_limit}) ===")
print(low_score_indices)

data = {
"BERTScore F1": f"{avg_bert:.3f}",
"ROUGE-1 F1": f"{avg_rouge1:.3f}",
"ROUGE-1 FL": f"{avg_rougeL:.3f}",
"STS CosSim": f"{avg_sts:.3f}",
}

from pathlib import Path

reference_stem = Path(reference_file).stem.lower()
candidate_stem = Path(candidate_file).stem.lower()
out_filename = f"scores_{reference_stem}__vs__{candidate_stem}.txt"
out_filename = os.path.join(SCORING_DIR, out_filename)

# Save results
with open(out_filename, "w") as file:
json.dump(data, file, indent=4)

print(f"\nSaved results to {out_filename}")
if clearml_available:
task.upload_artifact("scores", data)
task.upload_artifact("outscores", out_filename)
print("Pushing clearml artifact")

if __name__ == '__main__':
main()
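Editorial aside (not part of this PR): when ClearML is not installed, main() reads everything from args["Args"] and expects the two .jsonl files to already sit inside the scores directory. Below is a minimal local-invocation sketch under those conditions; the module import path, file names, and the assumption that parse_argument passes through already-typed booleans are all hypothetical.

# Minimal sketch of a local (no-ClearML) invocation of the scoring script.
# Assumes the package is installed so the module path resolves, and that
# parse_argument() accepts already-typed booleans; jsonl file names are placeholders.
from automation.tasks.scripts import semantic_similarity_score_script

local_args = {
    "Args": {
        "clearml_model": False,
        "force_download": False,
        "trust_remote_code": False,
        "scoring_args": {"f1": 0.75, "rouge1": 0.5, "sts": 0.75},
        "sts_model_id": "all-MiniLM-L6-v2",
        "rouge_scores": ["rouge1", "rougeL"],
        "ref_model_jsonl": "reference_model.jsonl",   # placeholder: must exist in the scores directory
        "cand_model_jsonl": "candidate_model.jsonl",  # placeholder: must exist in the scores directory
    }
}

semantic_similarity_score_script.main(args=local_args)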