fix reddit tifu dummy data (huggingface#110)
patrickvonplaten committed May 14, 2020
1 parent c1e7125 commit 3bc2297
Showing 8 changed files with 202 additions and 228 deletions.
158 changes: 75 additions & 83 deletions datasets/reclor/reclor.py
@@ -1,13 +1,13 @@
"""TODO(reclor): Add a description here."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function

import nlp
import json
from pathlib import Path
import os
from pathlib import Path

import nlp


# TODO(reclor): BibTeX citation
_CITATION = """\
@@ -32,89 +32,81 @@


class Reclor(nlp.GeneratorBasedBuilder):
"""TODO(reclor): Short description of my dataset."""
"""TODO(reclor): Short description of my dataset."""

# TODO(reclor): Set up version.
VERSION = nlp.Version('0.1.0')
MANUAL_DOWNLOAD_INSTRUCTIONS = """\
# TODO(reclor): Set up version.
VERSION = nlp.Version("0.1.0")
MANUAL_DOWNLOAD_INSTRUCTIONS = """\
To use ReClor you need to download it manually. Please go to its homepage (http://whyu.me/reclor/), fill out the Google
form, and you will receive a download link and a password to extract it. Please extract all files into one folder and use that folder path in nlp.load('reclor', data_dir='path/to/folder/folder_name')
"""

def _info(self):
# TODO(reclor): Specifies the nlp.DatasetInfo object
return nlp.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# nlp.features.FeatureConnectors
features=nlp.Features({
# These are the features of your dataset like images, labels ...
'context': nlp.Value('string'),
'question': nlp.Value('string'),
'answers': nlp.features.Sequence({
'answer': nlp.Value('string')
}),
'label': nlp.Value('string'),
"id_string": nlp.Value('string')
}),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage='http://whyu.me/reclor/',
citation=_CITATION,
)
def _info(self):
# TODO(reclor): Specifies the nlp.DatasetInfo object
return nlp.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# nlp.features.FeatureConnectors
features=nlp.Features(
{
# These are the features of your dataset like images, labels ...
"context": nlp.Value("string"),
"question": nlp.Value("string"),
"answers": nlp.features.Sequence({"answer": nlp.Value("string")}),
"label": nlp.Value("string"),
"id_string": nlp.Value("string"),
}
),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage="http://whyu.me/reclor/",
citation=_CITATION,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO(reclor): Downloads the data and defines the splits
# dl_manager is a nlp.download.DownloadManager that can be used to
# download and extract URLs
data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

if not os.path.exists(data_dir):
raise FileNotFoundError(
"{} does not exist. Make sure you insert a manual dir via `nlp.load('wikihow', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS
))
return [
nlp.SplitGenerator(
name=nlp.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
'filepath': os.path.join(data_dir, 'train.json')
},
),
nlp.SplitGenerator(
name=nlp.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
'filepath': os.path.join(data_dir, 'test.json')
},
),
nlp.SplitGenerator(
name=nlp.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
'filepath': os.path.join(data_dir, 'val.json')
},
),
]
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO(reclor): Downloads the data and defines the splits
# dl_manager is a nlp.download.DownloadManager that can be used to
# download and extract URLs
data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

def _generate_examples(self, filepath):
"""Yields examples."""
# TODO(reclor): Yields (key, example) tuples from the dataset
with open(filepath) as f:
data = json.load(f)
for id_, row in enumerate(data):
yield id_, {
'context': row['context'],
'question': row['question'],
'answers': {
'answer': row['answers']
},
'label': str(row.get('label', '')),
"id_string": row['id_string']
}
if not os.path.exists(data_dir):
raise FileNotFoundError(
"{} does not exist. Make sure you insert a manual dir via `nlp.load('wikihow', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS
)
)
return [
nlp.SplitGenerator(
name=nlp.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "train.json")},
),
nlp.SplitGenerator(
name=nlp.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "test.json")},
),
nlp.SplitGenerator(
name=nlp.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "val.json")},
),
]

def _generate_examples(self, filepath):
"""Yields examples."""
# TODO(reclor): Yields (key, example) tuples from the dataset
with open(filepath) as f:
data = json.load(f)
for id_, row in enumerate(data):
yield id_, {
"context": row["context"],
"question": row["question"],
"answers": {"answer": row["answers"]},
"label": str(row.get("label", "")),
"id_string": row["id_string"],
}
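
As a hedged usage sketch (not part of this commit; the record contents and the local path below are illustrative placeholders), the manual-download flow for ReClor works as follows: each of train.json, val.json and test.json is expected to be a JSON list of records with the fields read by _generate_examples, and the extracted folder is passed to nlp.load as described in MANUAL_DOWNLOAD_INSTRUCTIONS.

import json

import nlp

# Illustrative record shape that _generate_examples expects in train.json / val.json / test.json.
record = {
    "context": "Some reasoning passage ...",
    "question": "Which one of the following ...?",
    "answers": ["option A", "option B", "option C", "option D"],
    "label": 0,  # absent in test.json; the loader falls back to "" via row.get("label", "")
    "id_string": "train_0",
}
print(json.dumps([record]))

# Loading mirrors the string in MANUAL_DOWNLOAD_INSTRUCTIONS; the folder path is a placeholder.
reclor = nlp.load("reclor", data_dir="path/to/folder/folder_name")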
File renamed without changes.
Binary file not shown.
122 changes: 122 additions & 0 deletions datasets/reddit_tifu/reddit_tifu.py
@@ -0,0 +1,122 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Reddit TIFU dataset using tifu or tldr from subreddit tifu."""

from __future__ import absolute_import, division, print_function

import json

import nlp


_CITATION = """
@misc{kim2018abstractive,
title={Abstractive Summarization of Reddit Posts with Multi-level Memory Networks},
author={Byeongchang Kim and Hyunwoo Kim and Gunhee Kim},
year={2018},
eprint={1811.00783},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

_DESCRIPTION = """
Reddit dataset, where TIFU denotes the name of the subreddit /r/tifu.
As defined in the publication, style "short" uses the title as summary and
"long" uses the tldr as summary.
Features include:
- document: post text without tldr.
- tldr: tldr line.
- title: trimmed title without tldr.
- ups: upvotes.
- score: score.
- num_comments: number of comments.
- upvote_ratio: upvote ratio.
"""

_URL = "https://drive.google.com/uc?export=download&id=1ffWfITKFMJeqjT8loC8aiCLRNJpc_XnF"

_DOCUMENT = "documents"
_TITLE = "title"
_TLDR = "tldr"
_ADDITIONAL_FEATURES = ["ups", "num_comments", "score", "upvote_ratio"]


class RedditTifuConfig(nlp.BuilderConfig):
"""BuilderConfig for RedditTifu."""

def __init__(self, summary_key=None, **kwargs):
"""BuilderConfig for RedditTifu.
Args:
          summary_key: key of the summary field in the downloaded JSON file.
**kwargs: keyword arguments forwarded to super.
"""
        # Version 1.1.0 removes empty document and summary strings.
super(RedditTifuConfig, self).__init__(version=nlp.Version("1.1.0"), **kwargs)
self.summary_key = summary_key


class RedditTifu(nlp.GeneratorBasedBuilder):
"""Reddit TIFU Dataset."""

BUILDER_CONFIGS = [
RedditTifuConfig(name="short", summary_key=_TITLE, description="Using title as summary.",),
RedditTifuConfig(name="long", summary_key=_TLDR, description="Using TLDR as summary.",),
]

def _info(self):
features = {
"ups": nlp.Value("float32"),
"num_comments": nlp.Value("float32"),
"upvote_ratio": nlp.Value("float32"),
"score": nlp.Value("float32"),
}
features.update({k: nlp.Value("string") for k in [_DOCUMENT, _TLDR, _TITLE]})
return nlp.DatasetInfo(
description=_DESCRIPTION,
features=nlp.Features(features),
supervised_keys=(_DOCUMENT, self.config.summary_key),
homepage="https://github.com/ctr4si/MMN",
citation=_CITATION,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
dl_path = dl_manager.download_and_extract(_URL)
return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"path": dl_path},)]

def _generate_examples(self, path=None):
"""Yields examples."""
with open(path, "rb") as f:
for i, line in enumerate(f):
# keys are 'title_tokenized','permalink','title','url','num_comments',
# 'tldr'(optional),'created_utc','trimmed_title_tokenized','ups',
# 'selftext_html','score','upvote_ratio','tldr_tokenized'(optional),
# 'selftext','trimmed_title','selftext_without_tldr_tokenized',
# 'id','selftext_without_tldr'
d = json.loads(line)
r = {
_DOCUMENT: d["selftext_without_tldr"].strip(),
_TITLE: d["trimmed_title"].strip(),
_TLDR: (d["tldr"] or "").strip(),
}
r.update({k: d[k] for k in _ADDITIONAL_FEATURES})
# skip if document or summary is empty
if r[_DOCUMENT] and r[self.config.summary_key]:
yield i, r
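
Outside the diff, a minimal sketch of the JSON-lines format _generate_examples parses, using a fabricated line that carries only the fields the loader reads (real lines also contain the additional keys listed in the comment above):

import json

# Fabricated example line of the downloaded tifu JSON-lines file; all values are illustrative.
line = json.dumps(
    {
        "selftext_without_tldr": "Full post body without the tldr ...",
        "trimmed_title": "tifu by ...",
        "tldr": "short summary of the post",
        "ups": 10.0,
        "num_comments": 3.0,
        "score": 10.0,
        "upvote_ratio": 0.9,
    }
)

d = json.loads(line)
example = {
    "documents": d["selftext_without_tldr"].strip(),
    "title": d["trimmed_title"].strip(),
    "tldr": (d["tldr"] or "").strip(),
}
example.update({k: d[k] for k in ["ups", "num_comments", "score", "upvote_ratio"]})
print(example)  # mirrors one (key, example) record yielded by _generate_examples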
2 changes: 1 addition & 1 deletion datasets/wiki40b/wiki40b.py
@@ -20,9 +20,9 @@
import logging
import os

import apache_beam as beam
import tensorflow as tf

import apache_beam as beam
import nlp


7 changes: 4 additions & 3 deletions datasets/xtreme/xtreme.py
@@ -711,13 +711,14 @@ def _split_generators(self, dl_manager):

if self.config.name.startswith("PAN-X"):
path_to_manual_folder = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
panx_path = os.path.join(path_to_manual_folder, _PAN_X_FOLDER)
panx_path = os.path.join(path_to_manual_folder, _PAN_X_FOLDER)

if not os.path.exists(panx_path):
raise FileNotFoundError(
"{} does not exist. Make sure you insert a manual dir via `nlp.load('wikihow', data_dir=...)` that includes {}. Manual download instructions: {}".format(
panx_path, _PAN_X_FOLDER, self.MANUAL_DOWNLOAD_INSTRUCTIONS
))
)
)

panx_dl_dir = dl_manager.extract(panx_path)
lang = self.config.name.split(".")[1]
Binary file not shown.