feat: update implementation of SummarizationAccuracySemanticRobustness to use Transform-based approach (#233)
danielezhu committed Mar 27, 2024
1 parent cb3b30e commit 5c931fe
Showing 13 changed files with 556 additions and 1,030 deletions.
4 changes: 0 additions & 4 deletions src/fmeval/eval_algorithms/general_semantic_robustness.py
@@ -6,9 +6,6 @@
from fmeval.constants import (
DatasetColumns,
MEAN,
BUTTER_FINGER,
RANDOM_UPPER_CASE,
WHITESPACE_ADD_REMOVE,
)
from fmeval.data_loaders.data_config import DataConfig
from fmeval.data_loaders.util import get_dataset
@@ -31,7 +28,6 @@
validate_dataset,
verify_model_determinism,
get_dataset_configs,
create_model_invocation_pipeline,
evaluate_dataset,
)
from fmeval.model_runners.composers.composers import PromptComposer
21 changes: 11 additions & 10 deletions src/fmeval/eval_algorithms/helper_models/helper_model.py
@@ -43,10 +43,6 @@ def get_helper_scores(self, text_input: str) -> Any:
:returns: model output
"""

def __reduce__(self):
"""Serializer method."""
return self.__class__, () # pragma: no cover


class ToxigenHelperModel(BaseHelperModel):
"""
@@ -65,6 +61,10 @@ def __init__(self, column_name: str = COLUMN_NAME):
self._model = pipeline("text-classification", model=self.TOXIGEN_MODEL_NAME)
self._column_name = column_name

def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._column_name,)

def get_helper_scores(self, text_input: List[str]) -> Dict[str, List[float]]: # type: ignore[override]
"""
Method to get scores from ToxigenHelper
@@ -139,6 +139,10 @@ def __init__(self, column_name: str = COLUMN_NAME):
self._tokenizer = getattr(transformers, config["tokenizer_name"]).from_pretrained(config["model_type"])
self._column_name = column_name

def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._column_name,)

def get_helper_scores(self, text_input: List[str]) -> Dict[str, List[float]]: # type: ignore[override]
"""
Method to get scores from DetoxifyHelper
@@ -195,12 +199,9 @@ def __init__(self, model_type: str): # pragma: no cover
self._bertscore = hf_evaluate.load("bertscore")
self._model_type = model_type

# Dummy call to download the model within constructor
self._bertscore.compute(
predictions=["dummy_prediction"],
references=["dummy_reference"],
model_type=self._model_type,
)
def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._model_type,)

def get_helper_scores(self, target_output: str, model_output: str) -> float: # type: ignore[override]
"""
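
The __reduce__ methods added above exist so that pickle (and therefore Ray, when these helpers are promoted to shared resources) serializes only the constructor arguments and rebuilds the model on the receiving side instead of shipping model weights. A minimal sketch of that behavior, assuming the module path shown in this diff; the column name is an arbitrary example:

    import pickle

    from fmeval.eval_algorithms.helper_models.helper_model import ToxigenHelperModel

    # __reduce__ returns (callable, args), so pickling records only the class
    # and the column_name argument; unpickling calls the constructor again,
    # which reloads the underlying text-classification pipeline.
    helper = ToxigenHelperModel(column_name="model_output")  # illustrative column name
    rebuilt = pickle.loads(pickle.dumps(helper))
    assert rebuilt._column_name == "model_output"
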
483 changes: 169 additions & 314 deletions src/fmeval/eval_algorithms/summarization_accuracy_semantic_robustness.py

Large diffs are not rendered by default.

53 changes: 52 additions & 1 deletion src/fmeval/transforms/semantic_robustness_metrics.py
@@ -1,5 +1,7 @@
import evaluate as hf_evaluate
from typing import List, Dict, Any
from typing import List, Dict, Any, Tuple

import numpy as np

from fmeval.util import require
from fmeval.transforms.common import Mean
@@ -91,3 +93,52 @@ def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
)
record[self.output_key] = wer_metric
return record


class MeanDeltaScores(Transform):
"""This transform augments an input record with mean delta scores.
Given
1) An "original score", which is a score that was computed using
an "original" i.e. unperturbed input
2) A series of "perturbed scores", which are scores computed using
perturbations of the original input
the delta score for a particular perturbed score is computed using the
formula: abs(original_score - perturbed_score), and the mean delta score
is simply the arithmetic mean of all delta scores for the series of
perturbed scores.
"""

def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
"""MeanDeltaScores initializer.
:param key_mapping: Maps an original score key to a tuple of the form
(perturbed_score_keys, output_key). output_key will be used
as the output key corresponding to the mean delta score computed
using the original score and perturbed scores.
"""
super().__init__(key_mapping)
original_score_keys = list(key_mapping.keys())
perturbed_score_keys = [key for tup in key_mapping.values() for key in tup[0]]
self.register_input_output_keys(
input_keys=original_score_keys + perturbed_score_keys,
output_keys=[tup[1] for tup in key_mapping.values()],
)
self.key_mapping = key_mapping

@validate_call
def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
"""Augment the input record with the computed mean delta scores.
:param record: The input record.
:returns: The input record with the mean delta scores added in.
"""
for original_score_key, tup in self.key_mapping.items():
perturbed_score_keys, output_key = tup
record[output_key] = np.mean(
[
abs(record[original_score_key] - record[perturbed_score_key])
for perturbed_score_key in perturbed_score_keys
]
)
return record
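
To make the MeanDeltaScores contract concrete, here is a minimal sketch of the transform applied to a hand-built record; the key names ("rouge", "delta_rouge", and so on) are invented for the example, and only the key_mapping shape of original key to (perturbed keys, output key) comes from the class above:

    from fmeval.transforms.semantic_robustness_metrics import MeanDeltaScores

    # One original score and two scores computed on perturbed inputs.
    transform = MeanDeltaScores(
        key_mapping={"rouge": (["rouge_perturbed_0", "rouge_perturbed_1"], "delta_rouge")}
    )
    record = {"rouge": 0.6, "rouge_perturbed_0": 0.5, "rouge_perturbed_1": 0.3}
    record = transform(record)
    # mean(|0.6 - 0.5|, |0.6 - 0.3|) = mean(0.1, 0.3) = 0.2
    assert abs(record["delta_rouge"] - 0.2) < 1e-9
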
4 changes: 2 additions & 2 deletions src/fmeval/transforms/summarization_accuracy_metrics.py
@@ -275,8 +275,8 @@ def compute_metric(self, target_output: str, model_output: str) -> float:
:returns: The BERT metric value.
"""
if isinstance(self.bertscore_model, BertscoreHelperModel):
return self.bertscore_model.invoke_model(target_output, model_output)
return self.bertscore_model.get_helper_scores(target_output, model_output)
else:
return ray.get( # type: ignore[return-value]
self.bertscore_model.invoke_model.remote(target_output, model_output) # type: ignore[union-attr]
self.bertscore_model.get_helper_scores.remote(target_output, model_output) # type: ignore[union-attr]
)
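
The rename above keeps the compute_metric dispatch unchanged: call the helper directly when bertscore_model is a local BertscoreHelperModel, and go through Ray when it is an actor handle produced by create_shared_resource (exercised in the integration test below). A rough sketch of the two call styles, using the ROBERTA model type from that test; treat the exact import locations as assumptions:

    import ray

    from fmeval.eval_algorithms.helper_models.helper_model import (
        BertscoreHelperModel,
        BertscoreHelperModelTypes,
    )
    from fmeval.util import create_shared_resource  # assumed location of this helper

    local_model = BertscoreHelperModel(BertscoreHelperModelTypes.ROBERTA_MODEL.value)
    # Direct, in-process call (the isinstance branch).
    score = local_model.get_helper_scores("target summary", "model summary")

    # Shared-resource call: the same method, reached through a Ray actor handle
    # (the else branch's .remote(...) plus ray.get(...)).
    shared_model = create_shared_resource(local_model)
    remote_score = ray.get(
        shared_model.get_helper_scores.remote("target summary", "model summary")
    )
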
128 changes: 63 additions & 65 deletions test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -1,7 +1,7 @@
import os
import ray
import json
import os
import pytest
import ray

from typing import NamedTuple, Dict
from pytest import approx
@@ -16,10 +16,8 @@
DELTA_ROUGE_SCORE,
DELTA_METEOR_SCORE,
DELTA_BERT_SCORE,
BUTTER_FINGER,
RANDOM_UPPER_CASE,
WHITESPACE_ADD_REMOVE,
)
from fmeval.eval_algorithms.semantic_robustness_utils import BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE
from test.integration.models.model_runners import sm_model_runner

ABS_TOL = 1e-6
@@ -43,100 +41,104 @@
)


class TestCaseEvaluate(NamedTuple):
class TestCase(NamedTuple):
config: SummarizationAccuracySemanticRobustnessConfig
expected_evaluate_sample_scores: Dict[str, float]
expected_evaluate_scores: Dict[str, float]
expected_scores: Dict[str, float]


class TestSummarizationAccuracySemanticRobustness:
@pytest.mark.parametrize(
"config, expected_evaluate_sample_scores, expected_evaluate_scores",
"config, expected_scores",
[
TestCaseEvaluate(
TestCase(
config=BUTTER_FINGER_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.063628,
DELTA_BERT_SCORE: 0.050299,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.021061,
DELTA_METEOR_SCORE: 0.046859,
DELTA_BERT_SCORE: 0.032417,
DELTA_METEOR_SCORE: 0.037836,
DELTA_BERT_SCORE: 0.024666,
},
),
TestCaseEvaluate(
TestCase(
config=RANDOM_UPPER_CASE_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.051282,
DELTA_BERT_SCORE: 0.048976,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.037362,
DELTA_METEOR_SCORE: 0.056909,
DELTA_BERT_SCORE: 0.026363,
DELTA_METEOR_SCORE: 0.064103,
DELTA_BERT_SCORE: 0.056435,
},
),
TestCaseEvaluate(
TestCase(
config=WHITESPACE_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.050657,
DELTA_BERT_SCORE: 0.037705,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.030725,
DELTA_METEOR_SCORE: 0.054234,
DELTA_BERT_SCORE: 0.026511,
DELTA_METEOR_SCORE: 0.038462,
DELTA_BERT_SCORE: 0.039566,
},
),
],
)
def test_evaluate_sample_and_evaluate(
self, config, expected_evaluate_sample_scores, expected_evaluate_scores, integration_tests_dir
):
"""
In order to reuse SummarizationAccuracySemanticRobustness objects
as much as possible (to minimize creation of BertscoreHelperModels),
we test evaluate_sample and evaluate back to back using the same eval_algo
(instead of following the convention of the other tests, where evaluate_sample
and evaluate are tested in separate methods).
"""
def test_evaluate_sample(self, config, expected_scores, integration_tests_dir):
eval_algo = SummarizationAccuracySemanticRobustness(config)
# Test evaluate_sample
with open(os.path.join(integration_tests_dir, "datasets", "gigaword_sample.jsonl")) as fh:
json_obj = json.loads(fh.readline())
model_input = json_obj["document"]
target_output = json_obj["summary"]
eval_scores = eval_algo.evaluate_sample(
model_input=model_input,
model=sm_model_runner,
target_output=target_output,
model=sm_model_runner,
)
for eval_score in eval_scores:
assert eval_score.value == approx(expected_evaluate_sample_scores[eval_score.name], abs=ABS_TOL)
assert eval_score.value == approx(expected_scores[eval_score.name], abs=ABS_TOL)

# Test evaluate
@pytest.mark.parametrize(
"config, expected_scores",
[
TestCase(
config=BUTTER_FINGER_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.023259,
DELTA_METEOR_SCORE: 0.059768,
DELTA_BERT_SCORE: 0.031421,
},
),
TestCase(
config=RANDOM_UPPER_CASE_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.032086,
DELTA_METEOR_SCORE: 0.057150,
DELTA_BERT_SCORE: 0.026943,
},
),
TestCase(
config=WHITESPACE_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.020407,
DELTA_METEOR_SCORE: 0.048702,
DELTA_BERT_SCORE: 0.026193,
},
),
],
)
def test_evaluate(self, config, expected_scores):
eval_algo = SummarizationAccuracySemanticRobustness(config)
dataset_config = DATASET_CONFIGS[GIGAWORD]
eval_output = eval_algo.evaluate(
model=sm_model_runner,
@@ -145,9 +147,5 @@ def test_evaluate_sample_and_evaluate(
num_records=20,
)[0]
for eval_score in eval_output.dataset_scores:
assert eval_score.value == approx(expected_evaluate_scores[eval_score.name], abs=ABS_TOL)

# Calling ray.shutdown() would be overkill since there are still other test cases.
# Thus, we kill only the SummarizationAccuracySingleton ray actor used by the
# current test case, to make sure that resources are cleaned up between test cases.
ray.kill(eval_algo._summarization_accuracy_eval_algo)
assert eval_score.value == approx(expected_scores[eval_score.name], abs=ABS_TOL)
ray.shutdown()
2 changes: 1 addition & 1 deletion test/integration/test_util.py
@@ -16,7 +16,7 @@ def test_create_shared_resource(self):
`remote` and `get`).
Note that the input payload and expected result are copied from
the BertscoreHelperModel.invoke_model unit test.
the BertscoreHelperModel.get_helper_scores unit test.
"""
bertscore_model = BertscoreHelperModel(BertscoreHelperModelTypes.ROBERTA_MODEL.value)
actor_handle = create_shared_resource(bertscore_model)
4 changes: 2 additions & 2 deletions test/unit/eval_algorithms/test_general_semantic_robustness.py
@@ -309,7 +309,7 @@ def test_evaluate_sample_deterministic_model(self, bertscore_model, test_case, c
(test_case.perturbed_model_output_2, None), # Output on the second perturbation
]
bertscore_model_instance = Mock(spec=BertscoreHelperModel)
bertscore_model_instance.invoke_model = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model_instance.get_helper_scores = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model.return_value = bertscore_model_instance

eval_algo = GeneralSemanticRobustness(config, use_ray=False)
@@ -339,7 +339,7 @@ def test_semantic_robustness_evaluate_sample_non_deterministic_model(self, berts
THEN the robustness score value is smaller than it would be for a deterministic model.
"""
bertscore_model_instance = Mock(spec=BertscoreHelperModel)
bertscore_model_instance.invoke_model = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model_instance.get_helper_scores = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model.return_value = bertscore_model_instance

deterministic_model = MagicMock()