Changes from all commits
96 commits
757f02b
base semantic gen
Oct 15, 2025
95d93d2
base requirements
Oct 15, 2025
8b7bf3c
simple package
Oct 15, 2025
5e5276c
base method added
Oct 15, 2025
b8ec696
remove missing library inserts
Oct 15, 2025
113f0df
clean up variables
Oct 15, 2025
aa2de29
clean up input variables
Oct 15, 2025
02d6087
clean up prompt logs
Oct 15, 2025
9faf81d
fix device count
Oct 15, 2025
8e407a0
add semantic_similarity_args
Oct 15, 2025
82eaa9a
update model input vars
Oct 16, 2025
70d0ad4
added more log
Oct 16, 2025
bf7d817
initialize vllm
Oct 16, 2025
1d85fa6
download model beforehand
Oct 16, 2025
6be969d
template score
Oct 16, 2025
0798f61
update task name
Oct 16, 2025
ca9ff84
rouge score array
Oct 16, 2025
21f54d0
base scoring script
Oct 16, 2025
37b82d9
remove snapshot download
Oct 16, 2025
7e13853
test vllm server
Oct 16, 2025
1f30150
add requests query
Oct 16, 2025
a153996
clean libs
Oct 16, 2025
92fe4c9
fix parse issue
Oct 16, 2025
a8751c2
use start_vllm_server
Oct 16, 2025
5b9539e
test llm generate
Oct 16, 2025
2a73701
updated vllm server
Oct 16, 2025
a8547f5
add debug logging level
Oct 16, 2025
274948b
base LLM
Oct 16, 2025
051afae
try except for vllm
Oct 16, 2025
d51d3cf
use vllm server
Oct 16, 2025
e9cd534
retry snapshot download
Oct 17, 2025
ab7fe4a
snapshot download
Oct 17, 2025
bc2897b
snapshot with download_dir
Oct 17, 2025
78b005a
add model dir
Oct 17, 2025
aac6d69
add dtype
Oct 17, 2025
73abff7
model dir
Oct 17, 2025
41bc34a
add trust remote code
Oct 17, 2025
d37541a
download safetensors
Oct 17, 2025
67fee2c
move vllm server up
Oct 17, 2025
e59e1df
use the same dir
Oct 17, 2025
bef036e
redo snapshot download
Oct 17, 2025
044cef7
trigger
Oct 17, 2025
77bc96d
combined
Oct 17, 2025
67fca56
use vllm server
Oct 17, 2025
bf80fd4
add process tree import
Oct 17, 2025
f5a21f5
add clearml conditional
Oct 17, 2025
b471178
add task import
Oct 17, 2025
08914e5
retrieve current task
Oct 17, 2025
2c3a299
output server logs
Oct 17, 2025
c1a0b3c
print vllm command
Oct 17, 2025
f83b044
output as json
Oct 20, 2025
d9b447a
output artifact
Oct 20, 2025
8ebd724
retry with python llm interface
Oct 22, 2025
b9ae4c1
reference the downloaded model
Oct 23, 2025
ecf9f4b
add results directory creation
Oct 23, 2025
05b8f0f
fix download and read
Oct 23, 2025
389a5d8
clean up repo
Oct 23, 2025
5e84115
clean up scoring and remove hardcoding
Oct 23, 2025
64c5369
add low score indices
Oct 23, 2025
bed4991
add f1 score to enum
Oct 23, 2025
29b650c
simplify output path
Oct 23, 2025
b84a102
add examples and clean up
Oct 23, 2025
c4e1aea
clean up example
Oct 23, 2025
7cd5a3a
add scoring args dict
Oct 23, 2025
d5e4210
add support for variable score limits
Oct 23, 2025
9d349e8
clearml get model_id
Oct 28, 2025
98609a2
add clearml model import
Oct 28, 2025
391ecc5
check for clearml model
Oct 29, 2025
23a5f95
reference huggingface dir
Oct 29, 2025
5feeff7
implement semantic feedback
Oct 31, 2025
4768cf8
add db path debug
Oct 31, 2025
2b3a4f7
more debug
Oct 31, 2025
6dcfd74
debug dataset_args
Oct 31, 2025
11ba9fc
hardcode dataset args
Oct 31, 2025
0da978e
update examples
Oct 31, 2025
44f0c62
moved from utils
Oct 31, 2025
3b09a3f
dataset args through parse
Oct 31, 2025
640ae0a
add more dataset arg prints
Oct 31, 2025
ae9d928
add dict flattening
Oct 31, 2025
e74e0d6
added dictionary flattening
Nov 3, 2025
0ee0288
update prompt to chat
Nov 11, 2025
8c71fac
string output prompts
Nov 11, 2025
1ae047a
moved from prompt to conversation
Nov 12, 2025
e382646
retry with tqdm
Nov 12, 2025
a3ecdb5
re-add messages list
Nov 12, 2025
9f56b0b
clean up convos
Nov 12, 2025
57a2596
add debug to know which model is being initialised
Nov 13, 2025
b36b998
add mistral exception
Nov 13, 2025
91009b9
allow existing results dir
Nov 13, 2025
3c400ec
snapshot download only
Nov 13, 2025
e21f100
add tokenizer mode for mistral
Nov 13, 2025
96e33be
llm direct from model id
Nov 13, 2025
ff351b9
add print
Nov 13, 2025
48c0e6d
fix format
Nov 14, 2025
ce27ec4
add list dir
Nov 14, 2025
aeaca8f
add params download
Nov 14, 2025
17 changes: 17 additions & 0 deletions examples/semantic_similarity_generate.py
@@ -0,0 +1,17 @@
from automation.tasks import SemanticSimilarityGenerateTask

task = SemanticSimilarityGenerateTask(
    project_name="semantic_similarity_debug",
    task_name="semantic_generation_qwen3_14b_w4a16_feedback",
    #task_name="semantic_generation_qwen3_14b_feedback",
    branch="semantic_similarity",
    packages=["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
    dataset_args={"tatsu-lab/alpaca": 300, "garage-bAInd/Open-Platypus": 310, "allenai/tulu-3-sft-mixture": 320},
    model_id="Qwen/Qwen3-14B",
    max_new_tokens=1024,
    max_model_len=4096,
    semantic_similarity_args={"enable_chunked_prefill": True, "enforce_eager": True, "dtype": "auto", "device_map": "auto", "temperature": 0.0},
)

task.execute_remotely("oneshot-a100x1")

20 changes: 20 additions & 0 deletions examples/semantic_similarity_score.py
@@ -0,0 +1,20 @@
from automation.tasks import SemanticSimilarityScoreTask

task = SemanticSimilarityScoreTask(
    project_name="semantic_similarity_debug",
    #task_name="semantic_scoring_14b",
    task_name="semantic_scoring_4b",
    branch="semantic_similarity",
    packages=["huggingface-hub==0.34.3", "networkx==3.4.2", "datasets==4.2.0", "rouge_score==0.1.2", "bert-score==0.3.13", "sentence-transformers==5.1.1", "matplotlib"],
    reference_model_project_name="semantic_similarity_debug",
    candidate_model_project_name="semantic_similarity_debug",
    reference_model_task_name="semantic_generation_qwen3_14b_feedback",
    #reference_model_task_name="semantic_generation_qwen3_14b_base",
    candidate_model_task_name="semantic_generation_qwen3_14b_w4a16_feedback",
    #candidate_model_task_name="semantic_generation_qwen3_14b_w4a16",
    sts_model_id="all-MiniLM-L6-v2",
    rouge_scores=["rouge1", "rougeL"],
    low_score_threshold_args={"f1": 0.79, "rouge1": 0.65, "sts": 0.71},
)

task.execute_remotely("oneshot-a100x1")
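
For orientation, here is a minimal sketch of how the three thresholded metrics above can be computed with the pinned packages; the actual SemanticSimilarityScoreTask implementation is not part of this diff, so the function and variable names below are illustrative assumptions.

import torch
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

rouge = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
sts_model = SentenceTransformer("all-MiniLM-L6-v2")

def score_pair(reference: str, candidate: str) -> dict:
    # BERTScore F1 between the candidate and reference responses.
    _, _, f1 = bert_score([candidate], [reference], lang="en")
    # ROUGE-1 F-measure (rougeL is obtained from the same scorer).
    rouge1 = rouge.score(reference, candidate)["rouge1"].fmeasure
    # STS: cosine similarity of sentence embeddings.
    embeddings = sts_model.encode([reference, candidate], convert_to_tensor=True)
    sts = util.cos_sim(embeddings[0], embeddings[1]).item()
    return {"f1": f1.item(), "rouge1": rouge1, "sts": sts}

A pair would then be flagged as low-scoring when any metric falls below its configured limit, e.g. the {"f1": 0.79, "rouge1": 0.65, "sts": 0.71} values passed via low_score_threshold_args.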
10 changes: 9 additions & 1 deletion src/automation/datasets/__init__.py
@@ -4,6 +4,10 @@
from automation.datasets.openthoughts import DATASET_PATH as OPENTHOUGHTSDATASET
from automation.datasets.utils import load_llm_messages, load_vlm_messages
from automation.datasets.fleurs import load_fleurs_dataset
from automation.datasets.tulu import make_tulu_prompt
from automation.datasets.openplatypus import make_openplatypus_prompt
from automation.datasets.alpaca import make_alpaca_prompt
from automation.datasets.defaults import make_default_prompt

SUPPORTED_DATASETS = {
    "calibration": load_calibration_dataset,
@@ -17,6 +21,10 @@
    "load_openthoughts_dataset",
    "load_llm_messages",
    "load_vlm_messages",
    "make_tulu_prompt",
    "make_openplatypus_prompt",
    "make_alpaca_prompt",
    "make_default_prompt",
    "load_fleurs_dataset",
    "SUPPORTED_DATASETS",
]
22 changes: 22 additions & 0 deletions src/automation/datasets/alpaca.py
@@ -0,0 +1,22 @@
def make_alpaca_prompt(sample):
    instruction = sample["instruction"].strip()
    input_text = sample.get("input", "").strip()

    if input_text == "":
        messages = [
            {
                "role": "user",
                "content": instruction,
            }
        ]
    else:
        messages = [
            {
                "role": "user",
                "content": f"{instruction}\n{input_text}",
            }
        ]

    return messages
12 changes: 12 additions & 0 deletions src/automation/datasets/defaults.py
@@ -0,0 +1,12 @@
import json


def make_default_prompt(sample):
    # Fallback for datasets without a dedicated prompt builder: serialize the
    # raw sample as the user message, returning the same message-list format
    # as the other make_*_prompt helpers so llm.chat() can consume it.
    messages = [
        {
            "role": "user",
            "content": json.dumps(sample),
        }
    ]

    return messages

21 changes: 21 additions & 0 deletions src/automation/datasets/openplatypus.py
@@ -0,0 +1,21 @@
def make_openplatypus_prompt(sample):
    instruction = sample["instruction"].strip()
    input_text = sample.get("input", "").strip()

    if input_text == "":
        messages = [
            {
                "role": "user",
                "content": instruction,
            }
        ]
    else:
        messages = [
            {
                "role": "user",
                "content": f"{instruction}\n{input_text}",
            }
        ]

    return messages
3 changes: 3 additions & 0 deletions src/automation/datasets/tulu.py
@@ -0,0 +1,3 @@

def make_tulu_prompt(sample):
    return sample["messages"]
6 changes: 5 additions & 1 deletion src/automation/tasks/__init__.py
@@ -1,4 +1,6 @@
from automation.tasks.base_task import BaseTask
from automation.tasks.semantic_similarity_generate import SemanticSimilarityGenerateTask
from automation.tasks.semantic_similarity_score import SemanticSimilarityScoreTask
from automation.tasks.llmcompressor import LLMCompressorTask
from automation.tasks.lmeval import LMEvalTask
from automation.tasks.lighteval import LightEvalTask
@@ -7,9 +9,11 @@

__all__ = [
    "BaseTask",
    "SemanticSimilarityGenerateTask",
    "SemanticSimilarityScoreTask",
    "LLMCompressorTask",
    "LMEvalTask",
    "LightEvalTask",
    "GuideLLMTask",
    "DebugTask",
]
150 changes: 150 additions & 0 deletions src/automation/tasks/scripts/semantic_similarity_generate_script.py
@@ -0,0 +1,150 @@
import json
import os

from torch.cuda import device_count
from datasets import load_dataset
from vllm import LLM, SamplingParams

from automation.utils import parse_argument, flatten_nested_dict
from automation.datasets.tulu import make_tulu_prompt
from automation.datasets.openplatypus import make_openplatypus_prompt
from automation.datasets.alpaca import make_alpaca_prompt
from automation.datasets.defaults import make_default_prompt

try:
    from clearml import Task, Model
    clearml_available = True
except ImportError:
    clearml_available = False

RESULTS_DIR = os.path.join(os.getcwd(), "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

def semantic_similarity_generate_main(
    model_id,
    trust_remote_code,
    dataset_args,
    semantic_similarity_args,
    max_model_len,
    max_new_tokens,
    clearml_model,
):
    from collections import defaultdict
    from huggingface_hub import snapshot_download

    all_conversations = []
    all_samples_dict = defaultdict(list)

    print(">>> Loading dataset...")
    for dataset_path, num_samples_per_dataset in dataset_args.items():
        dataset_name = dataset_path.split("/")[1].lower()
        print(f">>> Loading dataset {dataset_name}...")
        dataset = load_dataset(dataset_path, split=f"train[:{int(num_samples_per_dataset)}]")
        all_samples_dict[dataset_name].extend(dataset)

    sorted_all_samples_dict = dict(sorted(all_samples_dict.items()))

    for dataset_name, dataset_samples in sorted_all_samples_dict.items():
        print(f">>> Loading values for {dataset_name}...")
        for sample in dataset_samples:
            if dataset_name == "alpaca":
                prompt = make_alpaca_prompt(sample)
            elif dataset_name == "open-platypus":
                prompt = make_openplatypus_prompt(sample)
            elif dataset_name == "tulu-3-sft-mixture":
                prompt = make_tulu_prompt(sample)
            else:
                print("Using default prompt")
                prompt = make_default_prompt(sample)
            all_conversations.append(prompt)

    print("Define sampling parameters")
    sampling_params = SamplingParams(
        temperature=semantic_similarity_args.get("temperature", 0.0),
        max_tokens=max_new_tokens,
    )

    HUGGINGFACE_DIR = "/home"
    if clearml_model:
        HUGGINGFACE_DIR = Model(model_id).get_local_copy()
    else:
        print("Download snapshot")
        snapshot_download(repo_id=model_id, local_dir=HUGGINGFACE_DIR)
        print(os.listdir(HUGGINGFACE_DIR))
        if "mistral" in model_id.lower():
            from huggingface_hub import hf_hub_download
            # Mistral models also need params.json next to the weights; note
            # this fetches it from the Mistral-Small-3.1 repo regardless of model_id.
            hf_hub_download(repo_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503", filename="params.json", local_dir=HUGGINGFACE_DIR)

    try:
        print(f"Initializing vLLM: {model_id}...")
        llm = LLM(
Member:
Why are we using the LLM class instead of vllm serve?

Collaborator (Author):
The main branch has an old src/automation/vllm/server.py file with the class VLLMServer, but other branches use start_vllm_server.
Also, shouldn't the output of the LLM class be identical to the vllm serve API endpoint?

            model=HUGGINGFACE_DIR,
            # model = model_id if "mistral" in model_id.lower() else HUGGINGFACE_DIR,
            dtype=semantic_similarity_args.get("dtype", "auto"),
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=device_count(),
            enforce_eager=semantic_similarity_args.get("enforce_eager", True),
            enable_chunked_prefill=semantic_similarity_args.get("enable_chunked_prefill", True),
            max_model_len=max_model_len,
            # Mistral checkpoints ship consolidated weights plus params.json,
            # so they need the mistral formats; all other models use defaults.
            load_format="mistral" if "mistral" in model_id.lower() else "auto",
            config_format="mistral" if "mistral" in model_id.lower() else "auto",
            tokenizer_mode="mistral" if "mistral" in model_id.lower() else "auto",
        )
        print("Completed the model initialization")
        print(">>> Running vLLM generation...")
        outputs = llm.chat(messages=all_conversations, sampling_params=sampling_params)
    except Exception as e:
        print(f"Error initializing LLM: {e}")
        # Re-raise so the return below never runs with `outputs` undefined.
        raise

    return all_conversations, outputs
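
For reference, a minimal sketch of the server-based flow the review thread above asks about, assuming a separately launched "vllm serve <model> --port 8000" and its standard OpenAI-compatible route; the helper name and default values here are illustrative, not part of this PR:

import requests

def chat_via_vllm_server(messages, base_url="http://localhost:8000", model="Qwen/Qwen3-14B"):
    # POST one conversation to vLLM's OpenAI-compatible chat endpoint;
    # vllm serve must already be running and serving `model`.
    response = requests.post(
        f"{base_url}/v1/chat/completions",
        json={
            "model": model,
            "messages": messages,
            "temperature": 0.0,
            "max_tokens": 1024,
        },
        timeout=600,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]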


def main(configurations=None, args=None):
    if clearml_available:
        task = Task.current_task()
        args = task.get_parameters_as_dict(cast=True)["Args"]
        clearml_model = parse_argument(args["clearml_model"], bool)
    else:
        args = args["Args"]
        clearml_model = False

    # Parse arguments
    force_download = parse_argument(args["force_download"], bool)
    trust_remote_code = parse_argument(args["trust_remote_code"], bool)
    model_id = parse_argument(args["model_id"], str)
    max_model_len = parse_argument(args["max_model_len"], int)
    max_new_tokens = parse_argument(args["max_new_tokens"], int)
    dataset_args = flatten_nested_dict(parse_argument(args["dataset_args"], dict))
    # Default to {} so the .get() lookups in semantic_similarity_generate_main
    # never run against None.
    semantic_similarity_args = args.get("semantic_similarity_args") or {}
    tags = args.get("tags", None)

    all_conversations, outputs = semantic_similarity_generate_main(
        model_id,
        trust_remote_code,
        dataset_args,
        semantic_similarity_args,
        max_model_len,
        max_new_tokens,
        clearml_model,
    )

    OUTPUT_FILE = os.path.join(RESULTS_DIR, f"{model_id.replace('/', '_')}.jsonl")
    print(">>> Writing outputs to file...")
    with open(OUTPUT_FILE, "w") as fout:
        for idx, (prompt, output) in enumerate(zip(all_conversations, outputs)):
            response = output.outputs[0].text.strip()
            fout.write(json.dumps({
                "index": idx,
                "prompt": prompt,
                "response": response
            }) + "\n")

    print(f">>> Completed. Saved {len(outputs)} outputs to {OUTPUT_FILE}")

    if clearml_available:
        task.upload_artifact("jsonl_output", OUTPUT_FILE)

if __name__ == '__main__':
    main()