forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
CompGuessWhat?! dataset (huggingface#236)
* Created data generator for CompGuessWhat?! games * Added missing datasets __init__.py * Added support for zero-shot split via multiple configuration dataset * Fixed typos in docstrings * Updated dataset_infos.json with the new configuration
- Loading branch information
Showing
5 changed files
with
712 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
from __future__ import absolute_import, division, print_function | ||
|
||
import gzip | ||
import json | ||
import os | ||
|
||
import nlp | ||
|
||
|
||
class CompguesswhatConfig(nlp.BuilderConfig):
    """BuilderConfig for the CompGuessWhat?! dataset.

    Attributes:
        gameplay_scenario: either "original" or "zero_shot" (see __init__).
        splits: mapping from split id (e.g. "train") to its gzipped JSONL filename.
        data_url: URL of the zip archive containing the per-split data files.
    """

    def __init__(self, data_url, splits, gameplay_scenario, **kwargs):
        """
        Args:
            data_url: URL of the zip archive with the per-split data files.
            splits: dict mapping split id (e.g. "train") to its data filename.
            gameplay_scenario: to specify if we want to load original CompGuessWhat?! split ('original') or
                the zero-shot reference games based on NOCAPS images ('zero_shot')
            **kwargs: keyword arguments forwarded to super.
        """
        super(CompguesswhatConfig, self).__init__(
            version=nlp.Version("0.1.0", "First CompGuessWhat?! release"), **kwargs
        )
        # BUG FIX: the message was a plain string, so '{gameplay_scenario}' was
        # printed literally instead of the offending value; it needs the f prefix.
        assert gameplay_scenario in ("original", "zero_shot"), (
            f"Invalid choice for parameter 'gameplay_scenario': {gameplay_scenario}. "
            "Valid values are ('original', 'zero_shot')."
        )

        self.gameplay_scenario = gameplay_scenario
        self.splits = splits
        self.data_url = data_url
|
||
|
||
class Compguesswhat(nlp.GeneratorBasedBuilder):
    """Dataset builder for CompGuessWhat?! with two configurations:

    - 'compguesswhat-original': games from the original GuessWhat?! dataset
      (VisualGenome reference scenes).
    - 'compguesswhat-zero_shot': zero-shot reference games based on NOCAPS images.
    """

    _CITATION = """\
        @inproceedings{suglia2020compguesswhat,
          title={CompGuessWhat?!: a Multi-task Evaluation Framework for Grounded Language Learning},
          author={Suglia, Alessandro, Konstas, Ioannis, Vanzo, Andrea, Bastianelli, Emanuele, Desmond Elliott, Stella Frank and Oliver Lemon},
          booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
          year={2020}
        }
    """

    _DESCRIPTION = """
        CompGuessWhat?! is an instance of a multi-task framework for evaluating the quality of learned neural representations,
        in particular concerning attribute grounding. Use this dataset if you want to use the set of games whose reference
        scene is an image in VisualGenome. Visit the website for more details: https://compguesswhat.github.io
    """

    BUILDER_CONFIGS = [
        CompguesswhatConfig(
            name="compguesswhat-original",
            gameplay_scenario="original",
            description="CompGuessWhat?! subset of games from the original GuessWhat?! dataset",
            data_url="https://www.dropbox.com/s/l0nc13udml6vs0w/compguesswhat-original.zip?dl=1",
            splits={
                "train": "compguesswhat.train.jsonl.gz",
                "valid": "compguesswhat.valid.jsonl.gz",
                "test": "compguesswhat.test.jsonl.gz"
            }
        ),
        CompguesswhatConfig(
            name="compguesswhat-zero_shot",
            gameplay_scenario="zero_shot",
            description="CompGuessWhat?! reference set of games for zero-shot evaluation using NOCAPS images",
            data_url="https://www.dropbox.com/s/gd46azul7o7iip4/compguesswhat-zero_shot.zip?dl=1",
            splits={
                "nd_valid": "compguesswhat.nd_valid.jsonl.gz",
                "nd_test": "compguesswhat.nd_test.jsonl.gz",
                "od_valid": "compguesswhat.od_valid.jsonl.gz",
                "od_test": "compguesswhat.od_test.jsonl.gz"
            }
        ),
    ]

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        """Return the DatasetInfo for the active configuration.

        The two gameplay scenarios carry different schemas: 'original' games
        reference VisualGenome images (int target ids, qas sequence), while
        'zero_shot' games reference Open Images/NOCAPS images (string target
        ids, per-object segment metadata, no qas).
        """
        if self.config.gameplay_scenario == "original":
            return nlp.DatasetInfo(
                # This is the description that will appear on the datasets page.
                description=self._DESCRIPTION,
                # nlp.features.FeatureConnectors
                features=nlp.Features(
                    {
                        "id": nlp.Value("int32"),
                        "target_id": nlp.Value("int32"),
                        "timestamp": nlp.Value("string"),
                        "status": nlp.Value("string"),
                        "image": {
                            "id": nlp.Value("int32"),
                            "file_name": nlp.Value("string"),
                            "flickr_url": nlp.Value("string"),
                            "coco_url": nlp.Value("string"),
                            "height": nlp.Value("int32"),
                            "width": nlp.Value("int32"),
                            "vg_id": nlp.Value("int32"),
                            "vg_url": nlp.Value("string")
                        },
                        "qas": nlp.features.Sequence({
                            "question": nlp.Value("string"),
                            "answer": nlp.Value("string"),
                            "id": nlp.Value("int32")
                        }),
                        "objects": nlp.features.Sequence({
                            "id": nlp.Value("int32"),
                            "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                            "category": nlp.Value("string"),
                            "area": nlp.Value("float32"),
                            "category_id": nlp.Value("int32"),
                            # segment is a polygon: a sequence of coordinate sequences
                            "segment": nlp.features.Sequence(nlp.features.Sequence(nlp.Value("float32")))
                        })
                    }
                ),
                # If there's a common (input, target) tuple from the features,
                # specify them here. They'll be used if as_supervised=True in
                # builder.as_dataset.
                supervised_keys=None,
                # Homepage of the dataset for documentation
                homepage="https://compguesswhat.github.io/",
                citation=self._CITATION,
            )
        elif self.config.gameplay_scenario == "zero_shot":
            return nlp.DatasetInfo(
                # This is the description that will appear on the datasets page.
                description=self._DESCRIPTION,
                # nlp.features.FeatureConnectors
                features=nlp.Features(
                    {
                        "id": nlp.Value("int32"),
                        "target_id": nlp.Value("string"),
                        "status": nlp.Value("string"),
                        "image": {
                            "id": nlp.Value("int32"),
                            "file_name": nlp.Value("string"),
                            "coco_url": nlp.Value("string"),
                            "height": nlp.Value("int32"),
                            "width": nlp.Value("int32"),
                            "license": nlp.Value("int32"),
                            "open_images_id": nlp.Value("string"),
                            "date_captured": nlp.Value("string")
                        },
                        "objects": nlp.features.Sequence({
                            "id": nlp.Value("string"),
                            "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                            "category": nlp.Value("string"),
                            "area": nlp.Value("float32"),
                            "category_id": nlp.Value("int32"),
                            "IsOccluded": nlp.Value("int32"),
                            "IsTruncated": nlp.Value("int32"),
                            # Open Images-style segmentation metadata (kept as
                            # strings, matching the upstream annotation format)
                            "segment": nlp.features.Sequence({
                                "MaskPath": nlp.Value("string"),
                                "LabelName": nlp.Value("string"),
                                "BoxID": nlp.Value("string"),
                                "BoxXMin": nlp.Value("string"),
                                "BoxXMax": nlp.Value("string"),
                                "BoxYMin": nlp.Value("string"),
                                "BoxYMax": nlp.Value("string"),
                                "PredictedIoU": nlp.Value("string"),
                                "Clicks": nlp.Value("string")
                            })
                        })
                    }
                ),
                # If there's a common (input, target) tuple from the features,
                # specify them here. They'll be used if as_supervised=True in
                # builder.as_dataset.
                supervised_keys=None,
                # Homepage of the dataset for documentation
                homepage="https://compguesswhat.github.io/",
                citation=self._CITATION,
            )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators, one per entry in config.splits.

        The downloaded archive is expected to contain files laid out as
        <config-name>/<version>/<split filename>.
        """
        dl_dir = dl_manager.download_and_extract(self.config.data_url)

        splits_gen = []

        for split_id, split_filename in self.config.splits.items():
            if self.config.gameplay_scenario == "original":
                # Map the canonical split ids onto nlp's named splits.
                if "train" in split_id:
                    split_name = nlp.Split.TRAIN
                elif "valid" in split_id:
                    split_name = nlp.Split.VALIDATION
                elif "test" in split_id:
                    split_name = nlp.Split.TEST
            else:
                # zero-shot splits (nd_/od_ valid/test) keep their custom names
                split_name = nlp.Split(split_id)

            full_split_name = "-".join(["compguesswhat", self.config.gameplay_scenario])
            splits_gen.append(
                nlp.SplitGenerator(
                    name=split_name,
                    gen_kwargs={"filepath": os.path.join(
                        dl_dir,
                        full_split_name,
                        self.VERSION.version_str,
                        split_filename
                    )},
                )
            )

        return splits_gen

    def _generate_examples(self, filepath):
        """Yields (game id, game dict) examples from a gzipped JSONL file."""

        def _extract_game_tuple(data):
            data = data.decode("utf-8")
            game = json.loads(data.strip("\n"))

            # we refactor the data structure a bit to fit with the new version
            # BUG FIX: the original read game["object_id"] unconditionally and
            # only then checked membership, so a game without 'object_id'
            # raised KeyError before its own guard could run.
            if "object_id" in game:
                game["target_id"] = game.pop("object_id")

            if "questioner_id" in game:
                del game["questioner_id"]

            return game["id"], game

        with gzip.open(filepath) as in_file:
            for data in in_file:
                yield _extract_game_tuple(data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import gzip
import json
import os
from argparse import ArgumentParser

# CLI for the dummy-data generation script; parsed in the __main__ guard below.
parser = ArgumentParser()
parser.add_argument("-d", "--data_path", type=str, required=True,
                    help="Data path containing the CompGuessWhat?! datasets (files with 'jsonl.gz' extension)")
parser.add_argument("--examples", type=int, default=5, help="Number of games to consider in the dummy dataset")

# Split id -> gzipped JSONL filename for the 'original' configuration.
original_data_files = {
    "train": "compguesswhat.train.jsonl.gz",
    "valid": "compguesswhat.valid.jsonl.gz",
    "test": "compguesswhat.test.jsonl.gz"
}

# Split id -> gzipped JSONL filename for the 'zero_shot' configuration.
# NOTE(review): 'nd'/'od' presumably mean near-domain/out-of-domain — confirm
# against the CompGuessWhat?! documentation.
zs_data_files = {
    "nd_valid": "compguesswhat.nd_valid.jsonl.gz",
    "od_valid": "compguesswhat.od_valid.jsonl.gz",
    "nd_test": "compguesswhat.nd_test.jsonl.gz",
    "od_test": "compguesswhat.od_test.jsonl.gz"
}

# Repository-relative directory holding the CompGuessWhat?! dataset script.
COMPGUESSWHAT_ROOT = "datasets/compguesswhat/"
|
||
|
||
def create_dummy_data_for_split(data_path, dataset_name, dataset_version, data_files):
    """Write truncated copies of each split file to the dummy-data directory.

    Args:
        data_path: directory containing the already-downloaded dataset files.
        dataset_name: configuration suffix, 'original' or 'zero_shot'.
        dataset_version: dataset version string, used as a path component.
        data_files: dict mapping split id to its gzipped JSONL filename.

    NOTE(review): reads the module-level `args` for the number of examples to
    keep, so this must only be called after the CLI arguments are parsed.
    """
    full_dataset_name = "-".join(["compguesswhat", dataset_name])
    dummy_data_path = os.path.join(
        COMPGUESSWHAT_ROOT,
        "dummy",
        full_dataset_name,
        dataset_version,
        "dummy_data",
        full_dataset_name,
        dataset_version,  # stray line-continuation backslash removed
    )

    # exist_ok=True avoids the check-then-create race of exists()+makedirs().
    os.makedirs(dummy_data_path, exist_ok=True)

    for split_name, split_file in data_files.items():
        print(f"Generating dummy data for split {split_name} (num. examples = {args.examples})")

        split_filepath = os.path.join(
            data_path,
            full_dataset_name,
            dataset_version,
            split_file
        )
        print(f"Reading split file {split_filepath}")
        with gzip.open(split_filepath) as in_file:
            dummy_filepath = os.path.join(
                dummy_data_path,
                split_file
            )
            with gzip.open(dummy_filepath, mode="w") as out_file:
                for i, line in enumerate(in_file):
                    # BUG FIX: 'i > args.examples' kept indices 0..examples,
                    # i.e. one game too many; '>=' writes exactly args.examples.
                    if i >= args.examples:
                        break

                    # Re-serialize to normalize whitespace in the dummy file.
                    data = json.loads(line.strip())
                    out_file.write(json.dumps(data).encode("utf-8"))
                    out_file.write(b"\n")
|
||
|
||
def main(args):
    """Generate dummy data for both CompGuessWhat?! configurations.

    `args.data_path` is the directory containing the already downloaded
    dataset files. Assumes the dataset test was successful so that
    `dataset_infos.json` exists under COMPGUESSWHAT_ROOT.

    Raises:
        ValueError: if `dataset_infos.json` is missing.
    """
    dataset_info_path = os.path.join(COMPGUESSWHAT_ROOT, "dataset_infos.json")

    if not os.path.exists(dataset_info_path):
        # Message fixed: the file checked above is 'dataset_infos.json'
        # (the original message named 'dataset_info.json' and had a typo).
        raise ValueError(
            "The file 'dataset_infos.json' doesn't exist. Make sure that you run the dataset tests via nlp-cli.")

    with open(dataset_info_path) as in_file:
        dataset_info = json.load(in_file)

    # The version recorded by the 'default' configuration drives the dummy paths.
    dataset_version = dataset_info["default"]["version"]["version_str"]

    print(f"Creating dummy data for CompGuessWhat?! {dataset_version}")

    print("Original dataset...")
    create_dummy_data_for_split(args.data_path, "original", dataset_version, original_data_files)

    print("Zero-shot dataset...")
    create_dummy_data_for_split(args.data_path, "zero_shot", dataset_version, zs_data_files)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
Oops, something went wrong.