Merge pull request #11 from foundation-model-stack/encoder_shape_testing

JRosenkranz · web-flow · commit b91226458eb7 · 2025-03-27T10:44:49.000-04:00
added a shapes test for encoders
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py
@@ -46,39 +46,26 @@ def __download_file(url, filename):
     except requests.exceptions.RequestException as e:
         print(f"An error occurred: {e}")
 
-def sample_sharegpt_requests(
-    dataset_path: str,
+def __sample_requests(
+    prompt_list: List[str], 
     num_requests: int,
     tokenizer: BaseTokenizer,
     prompt_length_min: int = 32,
     prompt_length_max: int = 64,
     seed: Optional[int] = None
-) -> List[Tuple[str, int]]:
-    if not os.path.exists(dataset_path):
-        print("downloading share-gpt dataset as it does not exist")
-        __download_file("https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", dataset_path)
-
-    # Load the dataset.
-    with open(dataset_path, encoding='utf-8') as f:
-        dataset = json.load(f)
-    # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
+):
     # Shuffle the dataset.
     if seed is not None:
-        random.Random(seed).shuffle(dataset)
+        random.Random(seed).shuffle(prompt_list)
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for i in range(len(dataset)):
+    for i in range(len(prompt_list)):
         if len(filtered_dataset) == num_requests:
             break
 
         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
+        prompt = prompt_list[i]
         prompt_token_ids = ids_for_prompt(prompt, tokenizer)
         
         prompt_len = len(prompt_token_ids)
@@ -87,4 +74,49 @@ def sample_sharegpt_requests(
             continue
         filtered_dataset.append((prompt, prompt_len))
     
-    return filtered_dataset
+    return filtered_dataset
+    
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: BaseTokenizer,
+    prompt_length_min: int = 32,
+    prompt_length_max: int = 64,
+    seed: Optional[int] = None
+) -> List[Tuple[str, int]]:
+    if not os.path.exists(dataset_path):
+        print("downloading share-gpt dataset as it does not exist")
+        __download_file("https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", dataset_path)
+
+    # Load the dataset.
+    with open(dataset_path, encoding='utf-8') as f:
+        dataset = json.load(f)
+    # Filter out the conversations with fewer than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    dataset = [data["conversations"][0]["value"] for data in dataset]
+    
+    return __sample_requests(dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
+
+def sample_squad_v2_qa_requests(
+    dataset_path: str,
+    num_requests: int, 
+    tokenizer: BaseTokenizer, 
+    prompt_length_min: int = 32, 
+    prompt_length_max: int = 64, 
+    seed: Optional[int] = None
+) -> List[Tuple[str, int]]:
+    from datasets import load_dataset
+
+    if os.path.exists(dataset_path):
+        ds = load_dataset(dataset_path)['train']
+    else:
+        ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)['train']
+        
+    
+    ds = [f"{data['context']}\n{data['question']}" for data in ds]
+
+    return __sample_requests(ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
+    
+
diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py
@@ -9,8 +9,7 @@
 from aiu_fms_testing_utils.utils.aiu_setup import dprint
 import os
 
-if "HF_HOME" not in os.environ:
-    os.environ["HF_HOME"] = "/tmp/models/hf_cache"
+ORIGINAL_HF_HOME = os.environ.get("HF_HOME", None)
 
 # Add models to test here
 LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"
@@ -72,6 +71,11 @@ def reset_compiler():
     yield # run the test
     torch.compiler.reset()
     torch._dynamo.reset()
+    os.environ.pop('COMPILATION_MODE', None)
+    if ORIGINAL_HF_HOME is None:
+        os.environ.pop('HF_HOME', None)
+    else:
+        os.environ['HF_HOME'] = ORIGINAL_HF_HOME
 
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
     prompts_and_sizes = sample_sharegpt_requests(SHARE_GPT_DATASET_PATH, batch_size, tokenizer, int(seq_length / 2), seq_length, seed)
@@ -113,9 +117,13 @@ def __load_validation_info(model_path, batch_size, seq_length, max_new_tokens, t
     else:
         return None
 
-
 @pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens", common_shapes)
 def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens):
+    os.environ["COMPILATION_MODE"] = "offline_decoder"
+    
+    if "HF_HOME" not in os.environ:
+        os.environ["HF_HOME"] = "/tmp/models/hf_cache"
+
     dprint(f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}")
 
     if USE_MICRO_MODELS:
diff --git a/tests/models/test_encoders.py b/tests/models/test_encoders.py
@@ -0,0 +1,107 @@
+from fms.testing.comparison import ModelSignatureParams, compare_model_signatures, get_signature
+from fms.utils import tokenizers
+import pytest
+from fms.models import get_model
+from fms.utils.generation import pad_input_ids
+import itertools
+import torch
+from aiu_fms_testing_utils.utils import ids_for_prompt, sample_squad_v2_qa_requests
+from aiu_fms_testing_utils.utils.aiu_setup import dprint
+import os
+
+ORIGINAL_HF_HOME = os.environ.get("HF_HOME", None)
+
+# Add models to test here
+ROBERTA_SQUAD_V2 = "deepset/roberta-base-squad2"
+
+SQUAD_V2_DATASET_PATH = os.environ.get("SQUAD_V2_DATASET_PATH", os.path.expanduser("~/squad_v2"))
+common_model_paths = os.environ.get("FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ROBERTA_SQUAD_V2])
+common_batch_sizes = os.environ.get("FMS_TEST_SHAPES_COMMON_BATCH_SIZES", [1, 2, 4, 8])
+common_seq_lengths = os.environ.get("FMS_TEST_SHAPES_COMMON_SEQ_LENGTHS", [64, 512])
+
+# pass a custom model path list as a comma-separated str, e.g.: export FMS_TEST_SHAPES_COMMON_MODEL_PATHS="/tmp/models/roberta,/tmp/models/roberta-base-squad2"
+if isinstance(common_model_paths, str):
+    common_model_paths = common_model_paths.split(",")
+
+# pass custom common batch sizes as a comma separated str of ints
+if isinstance(common_batch_sizes, str):
+    common_batch_sizes = [int(bs) for bs in common_batch_sizes.split(",")]
+
+# pass custom common seq lengths as a comma separated str of ints
+if isinstance(common_seq_lengths, str):
+    common_seq_lengths = [int(sl) for sl in common_seq_lengths.split(",")]
+
+common_shapes = list(itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths))
+
+
+def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
+    prompts_and_sizes = sample_squad_v2_qa_requests(SQUAD_V2_DATASET_PATH, batch_size, tokenizer, int(seq_length / 2), seq_length, seed)
+    prompt_list = []
+    for prompt, _ in prompts_and_sizes:
+        prompt_list.append(ids_for_prompt(prompt, tokenizer))
+
+    input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length, is_causal_mask=False)
+    return input_ids, padding_kwargs
+
+@pytest.fixture(autouse=True)
+def reset_compiler():
+    yield # run the test
+    torch.compiler.reset()
+    torch._dynamo.reset()
+    os.environ.pop('COMPILATION_MODE', None)
+    if ORIGINAL_HF_HOME is None:
+        os.environ.pop('HF_HOME', None)
+    else:
+        os.environ['HF_HOME'] = ORIGINAL_HF_HOME
+
+encoder_paths = ["deepset/roberta-base-squad2"]
+common_encoder_shapes = list(itertools.product(encoder_paths, common_batch_sizes, common_seq_lengths))
+
+@pytest.mark.parametrize("model_path,batch_size,seq_length", common_encoder_shapes)
+def test_common_shapes(model_path, batch_size, seq_length):
+    os.environ["COMPILATION_MODE"] = "offline"
+
+    if "HF_HOME" not in os.environ:
+        os.environ["HF_HOME"] = "/tmp/models/hf_cache"
+    
+    dprint(f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}")
+
+    tokenizer = tokenizers.get_tokenizer(model_path)
+    
+    if os.path.exists(model_path):
+        model_path_kwargs = {"model_path": model_path}
+    else:
+        model_path_kwargs = {"variant": model_path}
+
+    # prepare the AIU model
+    model = get_model(
+        architecture="hf_pretrained",
+        device_type="cpu",
+        fused_weights=False,
+        **model_path_kwargs
+    )
+
+    model.eval()
+    torch.set_grad_enabled(False)
+    model.compile(backend="sendnn")
+
+    # prepare the cpu model
+    validation_model = get_model(
+        architecture="hf_pretrained",
+        device_type="cpu",
+        data_type=torch.float32,
+        fused_weights=False,
+        **model_path_kwargs
+    )
+
+    # prepare input_ids
+    input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
+
+    # warmup model
+    logits_getter_fn = lambda x: x if isinstance(x, torch.Tensor) else torch.cat(list(x), dim=-1)
+    aiu_msp = ModelSignatureParams(model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs)
+    get_signature(aiu_msp.model, aiu_msp.params, aiu_msp.inp, aiu_msp.other_params, aiu_msp.logits_getter_fn)
+
+    cpu_msp = ModelSignatureParams(validation_model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs)
+    # FIXME: Compute GPU atol/rtol
+    compare_model_signatures(cpu_msp, aiu_msp, atol=0.1, rtol=.05)
diff --git a/tests/models/test_model_expectations.py b/tests/models/test_model_expectations.py
@@ -8,39 +8,71 @@
 )
 import os
 
+if "HF_HOME" not in os.environ:
+    os.environ["HF_HOME"] = "/tmp/models/hf_cache"
+
 model_dir = os.environ.get("FMS_TESTING_MODEL_DIR", "/tmp/models")
 LLAMA_194M = f"{model_dir}/llama-194m"
 GRANITE_7B_BASE = f"{model_dir}/granite-7b-base"
 GRANITE_8B_CODE_BASE = f"{model_dir}/granite-8b-code-base"
 GRANITE_3_8B_CODE_BASE = f"{model_dir}/granite-3-8b-base"
 
 models = [LLAMA_194M, GRANITE_7B_BASE, GRANITE_8B_CODE_BASE, GRANITE_3_8B_CODE_BASE]
+mini_models = {LLAMA_194M, GRANITE_7B_BASE, GRANITE_8B_CODE_BASE, GRANITE_3_8B_CODE_BASE}
 
 class AIUModelFixtureMixin(ModelFixtureMixin):
 
     @pytest.fixture(scope="class", autouse=True)
     def uninitialized_model(self, model_id):
+        if model_id in mini_models:
+            get_model_kwargs = {"architecture": "hf_configured", "nlayers": 3}
+        else:
+            get_model_kwargs = {"architecture": "hf_pretrained"}
+
         aiu_model = get_model(
-            "hf_configured",
-            model_id,
+            variant=model_id,
             device_type="cpu",
             unfuse_weights=True,
-            nlayers=3
+            **get_model_kwargs
         )
         torch.compile(aiu_model, backend="sendnn")
         return aiu_model
-    
+
+class TestAIUModels(
+    ModelConsistencyTestSuite,
+    AIUModelFixtureMixin,
+):
+
+    # x is the main parameter for this model which is the input tensor
+    _get_signature_params = ["x"]
+
     @pytest.fixture(scope="class", autouse=True, params=models)
     def model_id(self, request):
         return request.param
 
-class TestAIUModels(
+    def test_model_unfused(self, model, signature):
+        pytest.skip("All AIU models are already unfused")
+
+
+ROBERTA_SQUAD_v2 = "deepset/roberta-base-squad2"
+tuple_output_models = [ROBERTA_SQUAD_v2]
+
+class TestAIUModelsTupleOutput(
     ModelConsistencyTestSuite,
     AIUModelFixtureMixin,
 ):
-
+    
     # x is the main parameter for this model which is the input tensor
     _get_signature_params = ["x"]
 
+    @pytest.fixture(scope="class", autouse=True, params=tuple_output_models)
+    def model_id(self, request):
+        return request.param
+    
+    @staticmethod
+    def _get_signature_logits_getter_fn(f_out) -> torch.Tensor:
+        return torch.cat([f_out[0], f_out[1]], dim=-1)
+    
     def test_model_unfused(self, model, signature):
-        pytest.skip("All AIU models are already unfused")
+        pytest.skip("All AIU models are already unfused")
+    
diff --git a/tests/resources/expectations/models.test_model_expectations.TestAIUModelsTupleOutput.roberta-base-squad2.test_model_output b/tests/resources/expectations/models.test_model_expectations.TestAIUModelsTupleOutput.roberta-base-squad2.test_model_output
@@ -0,0 +1 @@
+9.834766387939453e-07,3.5762786865234375e-07,8.940696716308594e-07,6.258487701416016e-07,8.344650268554688e-07,1.1324882507324219e-06,6.556510925292969e-07,1.2516975402832031e-06,1.6391277313232422e-06,0.0,2.384185791015625e-07,1.1324882507324219e-06,4.172325134277344e-07,9.238719940185547e-07,4.76837158203125e-07,1.1622905731201172e-06,0.2104383111000061,0.2104375958442688,0.21043795347213745,0.21043813228607178,0.21043753623962402,0.21043819189071655,0.2104378342628479,0.21043813228607178,0.21043860912322998,0.21043741703033447,0.21043741703033447,0.21043819189071655,0.21043717861175537,0.21043848991394043,0.21043795347213745,0.21043837070465088
diff --git a/tests/resources/expectations/models.test_model_expectations.TestAIUModelsTupleOutput.roberta-base-squad2.test_model_weight_keys b/tests/resources/expectations/models.test_model_expectations.TestAIUModelsTupleOutput.roberta-base-squad2.test_model_weight_keys
@@ -0,0 +1 @@
+base_model.embedding.weight,base_model.enc_norm.bias,base_model.enc_norm.weight,base_model.layers.0.attn.dense.bias,base_model.layers.0.attn.dense.weight,base_model.layers.0.attn.in_proj.qkv_fused.bias,base_model.layers.0.attn.in_proj.qkv_fused.weight,base_model.layers.0.ff_ln.bias,base_model.layers.0.ff_ln.weight,base_model.layers.0.ff_sub_layer.w1.bias,base_model.layers.0.ff_sub_layer.w1.weight,base_model.layers.0.ff_sub_layer.w2.bias,base_model.layers.0.ff_sub_layer.w2.weight,base_model.layers.0.ln.bias,base_model.layers.0.ln.weight,base_model.layers.1.attn.dense.bias,base_model.layers.1.attn.dense.weight,base_model.layers.1.attn.in_proj.qkv_fused.bias,base_model.layers.1.attn.in_proj.qkv_fused.weight,base_model.layers.1.ff_ln.bias,base_model.layers.1.ff_ln.weight,base_model.layers.1.ff_sub_layer.w1.bias,base_model.layers.1.ff_sub_layer.w1.weight,base_model.layers.1.ff_sub_layer.w2.bias,base_model.layers.1.ff_sub_layer.w2.weight,base_model.layers.1.ln.bias,base_model.layers.1.ln.weight,base_model.layers.10.attn.dense.bias,base_model.layers.10.attn.dense.weight,base_model.layers.10.attn.in_proj.qkv_fused.bias,base_model.layers.10.attn.in_proj.qkv_fused.weight,base_model.layers.10.ff_ln.bias,base_model.layers.10.ff_ln.weight,base_model.layers.10.ff_sub_layer.w1.bias,base_model.layers.10.ff_sub_layer.w1.weight,base_model.layers.10.ff_sub_layer.w2.bias,base_model.layers.10.ff_sub_layer.w2.weight,base_model.layers.10.ln.bias,base_model.layers.10.ln.weight,base_model.layers.11.attn.dense.bias,base_model.layers.11.attn.dense.weight,base_model.layers.11.attn.in_proj.qkv_fused.bias,base_model.layers.11.attn.in_proj.qkv_fused.weight,base_model.layers.11.ff_ln.bias,base_model.layers.11.ff_ln.weight,base_model.layers.11.ff_sub_layer.w1.bias,base_model.layers.11.ff_sub_layer.w1.weight,base_model.layers.11.ff_sub_layer.w2.bias,base_model.layers.11.ff_sub_layer.w2.weight,base_model.layers.11.ln.bias,base_model.layers.11.ln.weight,base_model.layers.2.attn.dense.bias,base_model.
layers.2.attn.dense.weight,base_model.layers.2.attn.in_proj.qkv_fused.bias,base_model.layers.2.attn.in_proj.qkv_fused.weight,base_model.layers.2.ff_ln.bias,base_model.layers.2.ff_ln.weight,base_model.layers.2.ff_sub_layer.w1.bias,base_model.layers.2.ff_sub_layer.w1.weight,base_model.layers.2.ff_sub_layer.w2.bias,base_model.layers.2.ff_sub_layer.w2.weight,base_model.layers.2.ln.bias,base_model.layers.2.ln.weight,base_model.layers.3.attn.dense.bias,base_model.layers.3.attn.dense.weight,base_model.layers.3.attn.in_proj.qkv_fused.bias,base_model.layers.3.attn.in_proj.qkv_fused.weight,base_model.layers.3.ff_ln.bias,base_model.layers.3.ff_ln.weight,base_model.layers.3.ff_sub_layer.w1.bias,base_model.layers.3.ff_sub_layer.w1.weight,base_model.layers.3.ff_sub_layer.w2.bias,base_model.layers.3.ff_sub_layer.w2.weight,base_model.layers.3.ln.bias,base_model.layers.3.ln.weight,base_model.layers.4.attn.dense.bias,base_model.layers.4.attn.dense.weight,base_model.layers.4.attn.in_proj.qkv_fused.bias,base_model.layers.4.attn.in_proj.qkv_fused.weight,base_model.layers.4.ff_ln.bias,base_model.layers.4.ff_ln.weight,base_model.layers.4.ff_sub_layer.w1.bias,base_model.layers.4.ff_sub_layer.w1.weight,base_model.layers.4.ff_sub_layer.w2.bias,base_model.layers.4.ff_sub_layer.w2.weight,base_model.layers.4.ln.bias,base_model.layers.4.ln.weight,base_model.layers.5.attn.dense.bias,base_model.layers.5.attn.dense.weight,base_model.layers.5.attn.in_proj.qkv_fused.bias,base_model.layers.5.attn.in_proj.qkv_fused.weight,base_model.layers.5.ff_ln.bias,base_model.layers.5.ff_ln.weight,base_model.layers.5.ff_sub_layer.w1.bias,base_model.layers.5.ff_sub_layer.w1.weight,base_model.layers.5.ff_sub_layer.w2.bias,base_model.layers.5.ff_sub_layer.w2.weight,base_model.layers.5.ln.bias,base_model.layers.5.ln.weight,base_model.layers.6.attn.dense.bias,base_model.layers.6.attn.dense.weight,base_model.layers.6.attn.in_proj.qkv_fused.bias,base_model.layers.6.attn.in_proj.qkv_fused.weight,base_model.layers.6.ff_ln.b
ias,base_model.layers.6.ff_ln.weight,base_model.layers.6.ff_sub_layer.w1.bias,base_model.layers.6.ff_sub_layer.w1.weight,base_model.layers.6.ff_sub_layer.w2.bias,base_model.layers.6.ff_sub_layer.w2.weight,base_model.layers.6.ln.bias,base_model.layers.6.ln.weight,base_model.layers.7.attn.dense.bias,base_model.layers.7.attn.dense.weight,base_model.layers.7.attn.in_proj.qkv_fused.bias,base_model.layers.7.attn.in_proj.qkv_fused.weight,base_model.layers.7.ff_ln.bias,base_model.layers.7.ff_ln.weight,base_model.layers.7.ff_sub_layer.w1.bias,base_model.layers.7.ff_sub_layer.w1.weight,base_model.layers.7.ff_sub_layer.w2.bias,base_model.layers.7.ff_sub_layer.w2.weight,base_model.layers.7.ln.bias,base_model.layers.7.ln.weight,base_model.layers.8.attn.dense.bias,base_model.layers.8.attn.dense.weight,base_model.layers.8.attn.in_proj.qkv_fused.bias,base_model.layers.8.attn.in_proj.qkv_fused.weight,base_model.layers.8.ff_ln.bias,base_model.layers.8.ff_ln.weight,base_model.layers.8.ff_sub_layer.w1.bias,base_model.layers.8.ff_sub_layer.w1.weight,base_model.layers.8.ff_sub_layer.w2.bias,base_model.layers.8.ff_sub_layer.w2.weight,base_model.layers.8.ln.bias,base_model.layers.8.ln.weight,base_model.layers.9.attn.dense.bias,base_model.layers.9.attn.dense.weight,base_model.layers.9.attn.in_proj.qkv_fused.bias,base_model.layers.9.attn.in_proj.qkv_fused.weight,base_model.layers.9.ff_ln.bias,base_model.layers.9.ff_ln.weight,base_model.layers.9.ff_sub_layer.w1.bias,base_model.layers.9.ff_sub_layer.w1.weight,base_model.layers.9.ff_sub_layer.w2.bias,base_model.layers.9.ff_sub_layer.w2.weight,base_model.layers.9.ln.bias,base_model.layers.9.ln.weight,base_model.position_embedding.weight,qa_head.bias,qa_head.weight

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+9.834766387939453e-07,3.5762786865234375e-07,8.940696716308594e-07,6.258487701416016e-07,8.344650268554688e-07,1.1324882507324219e-06,6.556510925292969e-07,1.2516975402832031e-06,1.6391277313232422e-06,0.0,2.384185791015625e-07,1.1324882507324219e-06,4.172325134277344e-07,9.238719940185547e-07,4.76837158203125e-07,1.1622905731201172e-06,0.2104383111000061,0.2104375958442688,0.21043795347213745,0.21043813228607178,0.21043753623962402,0.21043819189071655,0.2104378342628479,0.21043813228607178,0.21043860912322998,0.21043741703033447,0.21043741703033447,0.21043819189071655,0.21043717861175537,0.21043848991394043,0.21043795347213745,0.21043837070465088
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+base_model.embedding.weight,base_model.enc_norm.bias,base_model.enc_norm.weight,base_model.layers.0.attn.dense.bias,base_model.layers.0.attn.dense.weight,base_model.layers.0.attn.in_proj.qkv_fused.bias,base_model.layers.0.attn.in_proj.qkv_fused.weight,base_model.layers.0.ff_ln.bias,base_model.layers.0.ff_ln.weight,base_model.layers.0.ff_sub_layer.w1.bias,base_model.layers.0.ff_sub_layer.w1.weight,base_model.layers.0.ff_sub_layer.w2.bias,base_model.layers.0.ff_sub_layer.w2.weight,base_model.layers.0.ln.bias,base_model.layers.0.ln.weight,base_model.layers.1.attn.dense.bias,base_model.layers.1.attn.dense.weight,base_model.layers.1.attn.in_proj.qkv_fused.bias,base_model.layers.1.attn.in_proj.qkv_fused.weight,base_model.layers.1.ff_ln.bias,base_model.layers.1.ff_ln.weight,base_model.layers.1.ff_sub_layer.w1.bias,base_model.layers.1.ff_sub_layer.w1.weight,base_model.layers.1.ff_sub_layer.w2.bias,base_model.layers.1.ff_sub_layer.w2.weight,base_model.layers.1.ln.bias,base_model.layers.1.ln.weight,base_model.layers.10.attn.dense.bias,base_model.layers.10.attn.dense.weight,base_model.layers.10.attn.in_proj.qkv_fused.bias,base_model.layers.10.attn.in_proj.qkv_fused.weight,base_model.layers.10.ff_ln.bias,base_model.layers.10.ff_ln.weight,base_model.layers.10.ff_sub_layer.w1.bias,base_model.layers.10.ff_sub_layer.w1.weight,base_model.layers.10.ff_sub_layer.w2.bias,base_model.layers.10.ff_sub_layer.w2.weight,base_model.layers.10.ln.bias,base_model.layers.10.ln.weight,base_model.layers.11.attn.dense.bias,base_model.layers.11.attn.dense.weight,base_model.layers.11.attn.in_proj.qkv_fused.bias,base_model.layers.11.attn.in_proj.qkv_fused.weight,base_model.layers.11.ff_ln.bias,base_model.layers.11.ff_ln.weight,base_model.layers.11.ff_sub_layer.w1.bias,base_model.layers.11.ff_sub_layer.w1.weight,base_model.layers.11.ff_sub_layer.w2.bias,base_model.layers.11.ff_sub_layer.w2.weight,base_model.layers.11.ln.bias,base_model.layers.11.ln.weight,base_model.layers.2.attn.dense.bias,base_m
odel.layers.2.attn.dense.weight,base_model.layers.2.attn.in_proj.qkv_fused.bias,base_model.layers.2.attn.in_proj.qkv_fused.weight,base_model.layers.2.ff_ln.bias,base_model.layers.2.ff_ln.weight,base_model.layers.2.ff_sub_layer.w1.bias,base_model.layers.2.ff_sub_layer.w1.weight,base_model.layers.2.ff_sub_layer.w2.bias,base_model.layers.2.ff_sub_layer.w2.weight,base_model.layers.2.ln.bias,base_model.layers.2.ln.weight,base_model.layers.3.attn.dense.bias,base_model.layers.3.attn.dense.weight,base_model.layers.3.attn.in_proj.qkv_fused.bias,base_model.layers.3.attn.in_proj.qkv_fused.weight,base_model.layers.3.ff_ln.bias,base_model.layers.3.ff_ln.weight,base_model.layers.3.ff_sub_layer.w1.bias,base_model.layers.3.ff_sub_layer.w1.weight,base_model.layers.3.ff_sub_layer.w2.bias,base_model.layers.3.ff_sub_layer.w2.weight,base_model.layers.3.ln.bias,base_model.layers.3.ln.weight,base_model.layers.4.attn.dense.bias,base_model.layers.4.attn.dense.weight,base_model.layers.4.attn.in_proj.qkv_fused.bias,base_model.layers.4.attn.in_proj.qkv_fused.weight,base_model.layers.4.ff_ln.bias,base_model.layers.4.ff_ln.weight,base_model.layers.4.ff_sub_layer.w1.bias,base_model.layers.4.ff_sub_layer.w1.weight,base_model.layers.4.ff_sub_layer.w2.bias,base_model.layers.4.ff_sub_layer.w2.weight,base_model.layers.4.ln.bias,base_model.layers.4.ln.weight,base_model.layers.5.attn.dense.bias,base_model.layers.5.attn.dense.weight,base_model.layers.5.attn.in_proj.qkv_fused.bias,base_model.layers.5.attn.in_proj.qkv_fused.weight,base_model.layers.5.ff_ln.bias,base_model.layers.5.ff_ln.weight,base_model.layers.5.ff_sub_layer.w1.bias,base_model.layers.5.ff_sub_layer.w1.weight,base_model.layers.5.ff_sub_layer.w2.bias,base_model.layers.5.ff_sub_layer.w2.weight,base_model.layers.5.ln.bias,base_model.layers.5.ln.weight,base_model.layers.6.attn.dense.bias,base_model.layers.6.attn.dense.weight,base_model.layers.6.attn.in_proj.qkv_fused.bias,base_model.layers.6.attn.in_proj.qkv_fused.weight,base_model.layers.6.ff
_ln.bias,base_model.layers.6.ff_ln.weight,base_model.layers.6.ff_sub_layer.w1.bias,base_model.layers.6.ff_sub_layer.w1.weight,base_model.layers.6.ff_sub_layer.w2.bias,base_model.layers.6.ff_sub_layer.w2.weight,base_model.layers.6.ln.bias,base_model.layers.6.ln.weight,base_model.layers.7.attn.dense.bias,base_model.layers.7.attn.dense.weight,base_model.layers.7.attn.in_proj.qkv_fused.bias,base_model.layers.7.attn.in_proj.qkv_fused.weight,base_model.layers.7.ff_ln.bias,base_model.layers.7.ff_ln.weight,base_model.layers.7.ff_sub_layer.w1.bias,base_model.layers.7.ff_sub_layer.w1.weight,base_model.layers.7.ff_sub_layer.w2.bias,base_model.layers.7.ff_sub_layer.w2.weight,base_model.layers.7.ln.bias,base_model.layers.7.ln.weight,base_model.layers.8.attn.dense.bias,base_model.layers.8.attn.dense.weight,base_model.layers.8.attn.in_proj.qkv_fused.bias,base_model.layers.8.attn.in_proj.qkv_fused.weight,base_model.layers.8.ff_ln.bias,base_model.layers.8.ff_ln.weight,base_model.layers.8.ff_sub_layer.w1.bias,base_model.layers.8.ff_sub_layer.w1.weight,base_model.layers.8.ff_sub_layer.w2.bias,base_model.layers.8.ff_sub_layer.w2.weight,base_model.layers.8.ln.bias,base_model.layers.8.ln.weight,base_model.layers.9.attn.dense.bias,base_model.layers.9.attn.dense.weight,base_model.layers.9.attn.in_proj.qkv_fused.bias,base_model.layers.9.attn.in_proj.qkv_fused.weight,base_model.layers.9.ff_ln.bias,base_model.layers.9.ff_ln.weight,base_model.layers.9.ff_sub_layer.w1.bias,base_model.layers.9.ff_sub_layer.w1.weight,base_model.layers.9.ff_sub_layer.w2.bias,base_model.layers.9.ff_sub_layer.w2.weight,base_model.layers.9.ln.bias,base_model.layers.9.ln.weight,base_model.position_embedding.weight,qa_head.bias,qa_head.weight