Skip to content

Commit 86edc10

Browse files
committed
fix negative_subspace, add intervention nulling with locations
1 parent 0cdd268 commit 86edc10

File tree

2 files changed

+78
-2
lines changed

2 files changed

+78
-2
lines changed

tests/integration_tests/IntervenableBasicTestCase.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,54 @@ def test_customized_intervention_function_zeroout(self):
646646
intervened_outputs_fn[1].last_hidden_state,
647647
)
648648

649+
def test_nulling_intervention(self):
    """An intervention can be skipped ("nulled") for individual batch
    entries by passing ``None`` as that entry's unit location.

    Three identical base prompts are run through the same intervention;
    the middle example's location is ``None``, so its logits must match
    the un-intervened baseline while examples 0 and 2 must differ.
    """
    _, tokenizer, gpt2 = pv.create_gpt2()
    gpt2.to("cuda")

    # Batch of three identical prompts so per-example effects are comparable.
    base = tokenizer(
        ["The capital of Spain is"] * 3, return_tensors="pt"
    ).to("cuda")

    base_output = gpt2(**base)
    base_logits = pv.embed_to_distrib(
        gpt2, base_output.last_hidden_state, logits=True
    )[0]

    # Simple interpolation intervention: average of base and source
    # mlp_output activations at layer 0.
    pv_gpt2 = pv.IntervenableModel(
        {
            "layer": 0,
            "component": "mlp_output",
            "intervention": lambda b, s: b * 0.5 + s * 0.5,
        },
        model=gpt2,
    )
    pv_gpt2.set_device("cuda")

    _, intervened_outputs = pv_gpt2(
        # the base input
        base=base,
        # the source input
        sources=tokenizer(["Egypt"] * 3, return_tensors="pt").to("cuda"),
        # intervene at the 3rd token for examples 0 and 2; the None entry
        # nulls the intervention for example 1
        unit_locations={"sources->base": (0, [[[3], None, [3]]])},
    )

    intervened_logits = pv.embed_to_distrib(
        gpt2, intervened_outputs.last_hidden_state, logits=True
    )
    assert not torch.allclose(
        base_logits, intervened_logits[0]
    ), "Intervention had no effect on example 0!"
    assert torch.allclose(
        base_logits, intervened_logits[1]
    ), "Intervention was not nulled on example 1!"
    assert not torch.allclose(
        base_logits, intervened_logits[2]
    ), "Intervention had no effect on example 2!"
696+
649697
@classmethod
650698
def tearDownClass(cls):
651699
print(f"Removing testing dir {cls._test_dir}")

tests/integration_tests/InterventionWithMLPTestCase.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,35 @@ def setUpClass(cls):
4242
intervention_types=VanillaIntervention,
4343
)
4444

45+
# Config used by the negative subspace test: two linked interventions on
# the same layer-0 mlp_activation, sharing one subspace partition.
cls.test_negative_subspace_config = IntervenableConfig(
    model_type=type(cls.mlp),
    representations=[
        RepresentationConfig(
            0,  # layer
            "mlp_activation",
            "pos",  # mlp layer creates a single token reprs
            1,  # max number of units
            # partition into two sets of subspaces
            subspace_partition=[[1, 4], [0, 1]],
            # linked ones target the same subspace
            intervention_link_key=0,
        )
        # two identical, linked representations
        for _ in range(2)
    ],
    intervention_types=VanillaIntervention,
)
73+
4574
cls.test_subspace_no_intervention_link_config = (
4675
IntervenableConfig(
4776
model_type=type(cls.mlp),
@@ -149,13 +178,12 @@ def test_with_subspace_negative(self):
149178
Negative test case to check input length.
150179
"""
151180
intervenable = IntervenableModel(
152-
self.test_subspace_intervention_link_config, self.mlp
181+
self.test_negative_subspace_config, self.mlp
153182
)
154183
# golden label
155184
b_s = 10
156185
base = {"inputs_embeds": torch.rand(b_s, 1, 3)}
157186
source_1 = {"inputs_embeds": torch.rand(b_s, 1, 3)}
158-
source_2 = {"inputs_embeds": torch.rand(b_s, 1, 3)}
159187

160188
try:
161189
intervenable(

0 commit comments

Comments
 (0)