feat: update implementation of SummarizationAccuracySemanticRobustness to use Transform-based approach (#233)
danielezhu committed Mar 27, 2024
1 parent cb3b30e commit 5c931fe
Showing 13 changed files with 556 additions and 1,030 deletions.
4 changes: 0 additions & 4 deletions src/fmeval/eval_algorithms/general_semantic_robustness.py
@@ -6,9 +6,6 @@
from fmeval.constants import (
DatasetColumns,
MEAN,
BUTTER_FINGER,
RANDOM_UPPER_CASE,
WHITESPACE_ADD_REMOVE,
)
from fmeval.data_loaders.data_config import DataConfig
from fmeval.data_loaders.util import get_dataset
@@ -31,7 +28,6 @@
validate_dataset,
verify_model_determinism,
get_dataset_configs,
create_model_invocation_pipeline,
evaluate_dataset,
)
from fmeval.model_runners.composers.composers import PromptComposer
21 changes: 11 additions & 10 deletions src/fmeval/eval_algorithms/helper_models/helper_model.py
@@ -43,10 +43,6 @@ def get_helper_scores(self, text_input: str) -> Any:
:returns: model output
"""

def __reduce__(self):
"""Serializer method."""
return self.__class__, () # pragma: no cover


class ToxigenHelperModel(BaseHelperModel):
"""
@@ -65,6 +61,10 @@ def __init__(self, column_name: str = COLUMN_NAME):
self._model = pipeline("text-classification", model=self.TOXIGEN_MODEL_NAME)
self._column_name = column_name

def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._column_name,)

def get_helper_scores(self, text_input: List[str]) -> Dict[str, List[float]]: # type: ignore[override]
"""
Method to get scores from ToxigenHelper
@@ -139,6 +139,10 @@ def __init__(self, column_name: str = COLUMN_NAME):
self._tokenizer = getattr(transformers, config["tokenizer_name"]).from_pretrained(config["model_type"])
self._column_name = column_name

def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._column_name,)

def get_helper_scores(self, text_input: List[str]) -> Dict[str, List[float]]: # type: ignore[override]
"""
Method to get scores from DetoxifyHelper
@@ -195,12 +199,9 @@ def __init__(self, model_type: str): # pragma: no cover
self._bertscore = hf_evaluate.load("bertscore")
self._model_type = model_type

# Dummy call to download the model within constructor
self._bertscore.compute(
predictions=["dummy_prediction"],
references=["dummy_reference"],
model_type=self._model_type,
)
def __reduce__(self):
"""Serializer method so that instances of this class can be made into shared resources."""
return self.__class__, (self._model_type,)

def get_helper_scores(self, target_output: str, model_output: str) -> float: # type: ignore[override]
"""
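
The __reduce__ methods added above exist so that pickle (and therefore Ray, when these helpers are promoted to shared resources) serializes only the constructor arguments and rebuilds the model on the receiving side instead of shipping model weights. A minimal sketch of that behavior, assuming the module path shown in this diff; the column name is an arbitrary example:

    import pickle

    from fmeval.eval_algorithms.helper_models.helper_model import ToxigenHelperModel

    # __reduce__ returns (callable, args), so pickling records only the class
    # and the column_name argument; unpickling calls the constructor again,
    # which reloads the underlying text-classification pipeline.
    helper = ToxigenHelperModel(column_name="model_output")  # illustrative column name
    rebuilt = pickle.loads(pickle.dumps(helper))
    assert rebuilt._column_name == "model_output"
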
483 changes: 169 additions & 314 deletions src/fmeval/eval_algorithms/summarization_accuracy_semantic_robustness.py

Large diffs are not rendered by default.

53 changes: 52 additions & 1 deletion src/fmeval/transforms/semantic_robustness_metrics.py
@@ -1,5 +1,7 @@
import evaluate as hf_evaluate
from typing import List, Dict, Any
from typing import List, Dict, Any, Tuple

import numpy as np

from fmeval.util import require
from fmeval.transforms.common import Mean
@@ -91,3 +93,52 @@ def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
)
record[self.output_key] = wer_metric
return record


class MeanDeltaScores(Transform):
"""This transform augments an input record with mean delta scores.
Given
1) An "original score", which is a score that was computed using
an "original" i.e. unperturbed input
2) A series of "perturbed scores", which are scores computed using
perturbations of the original input
the delta score for a particular perturbed score is computed using the
formula: abs(original_score - perturbed_score), and the mean delta score
is simply the arithmetic mean of all delta scores for the series of
perturbed scores.
"""

def __init__(self, key_mapping: Dict[str, Tuple[List[str], str]]):
"""MeanDeltaScores initializer.
:param key_mapping: Maps an original score key to a tuple of the form
(perturbed_score_keys, output_key). output_key will be used
as the output key corresponding to the mean delta score computed
using the original score and perturbed scores.
"""
super().__init__(key_mapping)
original_score_keys = list(key_mapping.keys())
perturbed_score_keys = [key for tup in key_mapping.values() for key in tup[0]]
self.register_input_output_keys(
input_keys=original_score_keys + perturbed_score_keys,
output_keys=[tup[1] for tup in key_mapping.values()],
)
self.key_mapping = key_mapping

@validate_call
def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
"""Augment the input record with the computed mean delta scores.
:param record: The input record.
:returns: The input record with the mean delta scores added in.
"""
for original_score_key, tup in self.key_mapping.items():
perturbed_score_keys, output_key = tup
record[output_key] = np.mean(
[
abs(record[original_score_key] - record[perturbed_score_key])
for perturbed_score_key in perturbed_score_keys
]
)
return record
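
To make the MeanDeltaScores contract concrete, here is a minimal sketch of the transform applied to a hand-built record; the key names ("rouge", "delta_rouge", and so on) are invented for the example, and only the key_mapping shape of original key to (perturbed keys, output key) comes from the class above:

    from fmeval.transforms.semantic_robustness_metrics import MeanDeltaScores

    # One original score and two scores computed on perturbed inputs.
    transform = MeanDeltaScores(
        key_mapping={"rouge": (["rouge_perturbed_0", "rouge_perturbed_1"], "delta_rouge")}
    )
    record = {"rouge": 0.6, "rouge_perturbed_0": 0.5, "rouge_perturbed_1": 0.3}
    record = transform(record)
    # mean(|0.6 - 0.5|, |0.6 - 0.3|) = mean(0.1, 0.3) = 0.2
    assert abs(record["delta_rouge"] - 0.2) < 1e-9
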
4 changes: 2 additions & 2 deletions src/fmeval/transforms/summarization_accuracy_metrics.py
@@ -275,8 +275,8 @@ def compute_metric(self, target_output: str, model_output: str) -> float:
:returns: The BERT metric value.
"""
if isinstance(self.bertscore_model, BertscoreHelperModel):
return self.bertscore_model.invoke_model(target_output, model_output)
return self.bertscore_model.get_helper_scores(target_output, model_output)
else:
return ray.get( # type: ignore[return-value]
self.bertscore_model.invoke_model.remote(target_output, model_output) # type: ignore[union-attr]
self.bertscore_model.get_helper_scores.remote(target_output, model_output) # type: ignore[union-attr]
)
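
The rename above keeps the compute_metric dispatch unchanged: call the helper directly when bertscore_model is a local BertscoreHelperModel, and go through Ray when it is an actor handle produced by create_shared_resource (exercised in the integration test below). A rough sketch of the two call styles, using the ROBERTA model type from that test; treat the exact import locations as assumptions:

    import ray

    from fmeval.eval_algorithms.helper_models.helper_model import (
        BertscoreHelperModel,
        BertscoreHelperModelTypes,
    )
    from fmeval.util import create_shared_resource  # assumed location of this helper

    local_model = BertscoreHelperModel(BertscoreHelperModelTypes.ROBERTA_MODEL.value)
    # Direct, in-process call (the isinstance branch).
    score = local_model.get_helper_scores("target summary", "model summary")

    # Shared-resource call: the same method, reached through a Ray actor handle
    # (the else branch's .remote(...) plus ray.get(...)).
    shared_model = create_shared_resource(local_model)
    remote_score = ray.get(
        shared_model.get_helper_scores.remote("target summary", "model summary")
    )
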
128 changes: 63 additions & 65 deletions test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -1,7 +1,7 @@
import os
import ray
import json
import os
import pytest
import ray

from typing import NamedTuple, Dict
from pytest import approx
@@ -16,10 +16,8 @@
DELTA_ROUGE_SCORE,
DELTA_METEOR_SCORE,
DELTA_BERT_SCORE,
BUTTER_FINGER,
RANDOM_UPPER_CASE,
WHITESPACE_ADD_REMOVE,
)
from fmeval.eval_algorithms.semantic_robustness_utils import BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE
from test.integration.models.model_runners import sm_model_runner

ABS_TOL = 1e-6
@@ -43,100 +41,104 @@
)


class TestCaseEvaluate(NamedTuple):
class TestCase(NamedTuple):
config: SummarizationAccuracySemanticRobustnessConfig
expected_evaluate_sample_scores: Dict[str, float]
expected_evaluate_scores: Dict[str, float]
expected_scores: Dict[str, float]


class TestSummarizationAccuracySemanticRobustness:
@pytest.mark.parametrize(
"config, expected_evaluate_sample_scores, expected_evaluate_scores",
"config, expected_scores",
[
TestCaseEvaluate(
TestCase(
config=BUTTER_FINGER_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.063628,
DELTA_BERT_SCORE: 0.050299,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.021061,
DELTA_METEOR_SCORE: 0.046859,
DELTA_BERT_SCORE: 0.032417,
DELTA_METEOR_SCORE: 0.037836,
DELTA_BERT_SCORE: 0.024666,
},
),
TestCaseEvaluate(
TestCase(
config=RANDOM_UPPER_CASE_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.051282,
DELTA_BERT_SCORE: 0.048976,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.037362,
DELTA_METEOR_SCORE: 0.056909,
DELTA_BERT_SCORE: 0.026363,
DELTA_METEOR_SCORE: 0.064103,
DELTA_BERT_SCORE: 0.056435,
},
),
TestCaseEvaluate(
TestCase(
config=WHITESPACE_CONFIG,
expected_evaluate_sample_scores={
expected_scores={
ROUGE_SCORE: 0.0,
METEOR_SCORE: 0.0,
BERT_SCORE: 0.536162,
DELTA_ROUGE_SCORE: 0.0,
DELTA_METEOR_SCORE: 0.050657,
DELTA_BERT_SCORE: 0.037705,
},
expected_evaluate_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.030725,
DELTA_METEOR_SCORE: 0.054234,
DELTA_BERT_SCORE: 0.026511,
DELTA_METEOR_SCORE: 0.038462,
DELTA_BERT_SCORE: 0.039566,
},
),
],
)
def test_evaluate_sample_and_evaluate(
self, config, expected_evaluate_sample_scores, expected_evaluate_scores, integration_tests_dir
):
"""
In order to reuse SummarizationAccuracySemanticRobustness objects
as much as possible (to minimize creation of BertscoreHelperModels),
we test evaluate_sample and evaluate back to back using the same eval_algo
(instead of following the convention of the other tests, where evaluate_sample
and evaluate are tested in separate methods).
"""
def test_evaluate_sample(self, config, expected_scores, integration_tests_dir):
eval_algo = SummarizationAccuracySemanticRobustness(config)
# Test evaluate_sample
with open(os.path.join(integration_tests_dir, "datasets", "gigaword_sample.jsonl")) as fh:
json_obj = json.loads(fh.readline())
model_input = json_obj["document"]
target_output = json_obj["summary"]
eval_scores = eval_algo.evaluate_sample(
model_input=model_input,
model=sm_model_runner,
target_output=target_output,
model=sm_model_runner,
)
for eval_score in eval_scores:
assert eval_score.value == approx(expected_evaluate_sample_scores[eval_score.name], abs=ABS_TOL)
assert eval_score.value == approx(expected_scores[eval_score.name], abs=ABS_TOL)

# Test evaluate
@pytest.mark.parametrize(
"config, expected_scores",
[
TestCase(
config=BUTTER_FINGER_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.023259,
DELTA_METEOR_SCORE: 0.059768,
DELTA_BERT_SCORE: 0.031421,
},
),
TestCase(
config=RANDOM_UPPER_CASE_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.032086,
DELTA_METEOR_SCORE: 0.057150,
DELTA_BERT_SCORE: 0.026943,
},
),
TestCase(
config=WHITESPACE_CONFIG,
expected_scores={
ROUGE_SCORE: 0.021908,
METEOR_SCORE: 0.105540,
BERT_SCORE: 0.559893,
DELTA_ROUGE_SCORE: 0.020407,
DELTA_METEOR_SCORE: 0.048702,
DELTA_BERT_SCORE: 0.026193,
},
),
],
)
def test_evaluate(self, config, expected_scores):
eval_algo = SummarizationAccuracySemanticRobustness(config)
dataset_config = DATASET_CONFIGS[GIGAWORD]
eval_output = eval_algo.evaluate(
model=sm_model_runner,
@@ -145,9 +147,5 @@ def test_evaluate_sample_and_evaluate(
num_records=20,
)[0]
for eval_score in eval_output.dataset_scores:
assert eval_score.value == approx(expected_evaluate_scores[eval_score.name], abs=ABS_TOL)

# Calling ray.shutdown() would be overkill since there are still other test cases.
# Thus, we kill only the SummarizationAccuracySingleton ray actor used by the
# current test case, to make sure that resources are cleaned up between test cases.
ray.kill(eval_algo._summarization_accuracy_eval_algo)
assert eval_score.value == approx(expected_scores[eval_score.name], abs=ABS_TOL)
ray.shutdown()
2 changes: 1 addition & 1 deletion test/integration/test_util.py
@@ -16,7 +16,7 @@ def test_create_shared_resource(self):
`remote` and `get`).
Note that the input payload and expected result are copied from
the BertscoreHelperModel.invoke_model unit test.
the BertscoreHelperModel.get_helper_scores unit test.
"""
bertscore_model = BertscoreHelperModel(BertscoreHelperModelTypes.ROBERTA_MODEL.value)
actor_handle = create_shared_resource(bertscore_model)
4 changes: 2 additions & 2 deletions test/unit/eval_algorithms/test_general_semantic_robustness.py
@@ -309,7 +309,7 @@ def test_evaluate_sample_deterministic_model(self, bertscore_model, test_case, c
(test_case.perturbed_model_output_2, None), # Output on the second perturbation
]
bertscore_model_instance = Mock(spec=BertscoreHelperModel)
bertscore_model_instance.invoke_model = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model_instance.get_helper_scores = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model.return_value = bertscore_model_instance

eval_algo = GeneralSemanticRobustness(config, use_ray=False)
@@ -339,7 +339,7 @@ def test_semantic_robustness_evaluate_sample_non_deterministic_model(self, berts
THEN the robustness score value is smaller than it would be for a deterministic model.
"""
bertscore_model_instance = Mock(spec=BertscoreHelperModel)
bertscore_model_instance.invoke_model = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model_instance.get_helper_scores = Mock(return_value=BERTSCORE_DUMMY_VALUE)
bertscore_model.return_value = bertscore_model_instance

deterministic_model = MagicMock()