Skip to content

Commit

Permalink
add BlendedSkillTalk dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Mariama Drame authored and mariamabarham committed Jun 3, 2020
1 parent 45d1ec0 commit 2e0a863
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 0 deletions.
151 changes: 151 additions & 0 deletions datasets/blended_skill_talk/blended_skill_talk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""TODO(blended_skill_talk): Add a description here."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import nlp
import os
import json


# BibTeX entry for the paper behind the dataset; passed to nlp.DatasetInfo as `citation`.
_CITATION = """\
@misc{smith2020evaluating,
title={Can You Put it All Together: Evaluating Conversational Agents' Ability to Blend Skills},
author={Eric Michael Smith and Mary Williamson and Kurt Shuster and Jason Weston and Y-Lan Boureau},
year={2020},
eprint={2004.08449},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Human-readable summary; passed to nlp.DatasetInfo as `description`.
_DESCRIPTION = """\
A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.
"""
# Archive handed to the download manager; train.json, valid.json and test.json
# are then read from the directory it returns.
_URL = 'http://parl.ai/downloads/blended_skill_talk/blended_skill_talk.tar.gz'

# Keys of the per-turn suggestion records; each becomes a string column of the
# 'suggestions' feature.
_TASK = ['convai2', 'empathetic_dialogues', 'wizard_of_wikipedia']


class BlendedSkillTalk(nlp.GeneratorBasedBuilder):
    """Dataset builder for the Blended Skill Talk conversations.

    Downloads and extracts the archive at ``_URL`` and exposes three splits
    read from ``train.json``, ``valid.json`` and ``test.json``.
    """

    VERSION = nlp.Version('1.0.0')

    def _info(self):
        """Return the ``nlp.DatasetInfo`` holding the schema and metadata."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                'personas': nlp.features.Sequence({
                    'persona': nlp.Value('string'),
                }),
                'additional_context': nlp.Value('string'),
                'previous_utterance': nlp.features.Sequence({
                    'previous_utterance': nlp.Value('string'),
                }),
                'context': nlp.Value('string'),
                'free_messages': nlp.features.Sequence({
                    'free_message': nlp.Value('string'),
                }),
                # NOTE(review): 'guided_messgaes' / 'guided_messgae' are
                # misspelled, but they are the column names this builder
                # publishes (and the ones recorded in dataset_infos.json).
                # Renaming them is a schema change and needs a version bump.
                'guided_messgaes': nlp.features.Sequence({
                    'guided_messgae': nlp.Value('string'),
                }),
                # One string column per task listed in _TASK.
                'suggestions': nlp.features.Sequence({
                    task: nlp.Value('string') for task in _TASK
                }),
            }),
            # No canonical (input, target) pair is declared for this dataset.
            supervised_keys=None,
            homepage='https://parl.ai/projects/bst/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and return one SplitGenerator per split.

        Args:
            dl_manager: download manager whose ``download_and_extract`` is
                called with ``_URL``.

        Returns:
            A list with the train, validation and test ``nlp.SplitGenerator``
            objects, each carrying the ``filepath`` keyword argument that is
            forwarded to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        split_files = (
            (nlp.Split.TRAIN, 'train.json'),
            (nlp.Split.VALIDATION, 'valid.json'),
            (nlp.Split.TEST, 'test.json'),
        )
        return [
            nlp.SplitGenerator(
                name=split,
                gen_kwargs={'filepath': os.path.join(data_dir, filename)},
            )
            for split, filename in split_files
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from one split file.

        Args:
            filepath: path to a JSON file containing a list of conversation
                records.

        Yields:
            Tuples of the record's position in the file and a dict matching
            the features declared in ``_info``.
        """
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding. json.load reads everything up
        # front, so the file can be closed before any example is yielded.
        with open(filepath, encoding='utf-8') as f:
            data = json.load(f)

        for id_, row in enumerate(data):
            # Only the first two lines of the second persona entry are kept.
            second_persona = row['personas'][1]
            personas = [second_persona[0], second_persona[1]]

            # Each dialog entry is indexable; element 1 is the utterance text.
            dialogs = [dialog[1] for dialog in row['dialog']]
            # Turns alternate free / guided. Free messages are truncated to
            # the number of guided ones, so an unpaired trailing turn is
            # dropped and both lists stay aligned.
            guided_messages = dialogs[1::2]
            free_messages = dialogs[0::2][:len(guided_messages)]

            context = row['context_dataset']
            # 'additional_context' is only read for wizard_of_wikipedia rows.
            add_context = row['additional_context'] if context == 'wizard_of_wikipedia' else ''
            previous_utterance = [row['free_turker_utterance'], row['guided_turker_utterance']]

            # Keep the odd-indexed suggestion records (the same positions as
            # the guided messages) and regroup them into one list per task.
            guided_suggestions = row['suggestions'][1::2]
            suggestions = {
                task: [turn[task] for turn in guided_suggestions]
                for task in _TASK
            }

            yield id_, {
                'personas': {
                    'persona': personas,
                },
                'additional_context': add_context,
                'previous_utterance': {
                    'previous_utterance': previous_utterance,
                },
                'context': context,
                'free_messages': {
                    'free_message': free_messages,
                },
                'guided_messgaes': {
                    'guided_messgae': guided_messages,
                },
                'suggestions': suggestions,
            }

1 change: 1 addition & 0 deletions datasets/blended_skill_talk/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"default": {"description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.\n", "citation": "@misc{smith2020evaluating,\n title={Can You Put it All Together: Evaluating Conversational Agents' Ability to Blend Skills},\n author={Eric Michael Smith and Mary Williamson and Kurt Shuster and Jason Weston and Y-Lan Boureau},\n year={2020},\n eprint={2004.08449},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://parl.ai/projects/bst/", "license": "", "features": {"personas": {"feature": {"persona": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_context": {"dtype": "string", "id": null, "_type": "Value"}, "previous_utterance": {"feature": {"previous_utterance": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "free_messages": {"feature": {"free_message": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "guided_messgaes": {"feature": {"guided_messgae": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "suggestions": {"feature": {"convai2": {"dtype": "string", "id": null, "_type": "Value"}, "empathetic_dialogues": {"dtype": "string", "id": null, "_type": "Value"}, "wizard_of_wikipedia": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "blended_skill_talk", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10623972, "num_examples": 4819, "dataset_name": "blended_skill_talk"}, "validation": {"name": "validation", "num_bytes": 
2238630, "num_examples": 1009, "dataset_name": "blended_skill_talk"}, "test": {"name": "test", "num_bytes": 2215477, "num_examples": 980, "dataset_name": "blended_skill_talk"}}, "download_checksums": {"http://parl.ai/downloads/blended_skill_talk/blended_skill_talk.tar.gz": {"num_bytes": 38101408, "checksum": "5fbed0068ee89e2d43b93c3ecb341e784617033efa5e8e911a219d4eda6134a6"}}, "download_size": 38101408, "dataset_size": 15078079, "size_in_bytes": 53179487}}
Binary file not shown.

0 comments on commit 2e0a863

Please sign in to comment.