Commit

make style
patrickvonplaten authored and mariamabarham committed May 15, 2020
1 parent e8e0466 commit 23edc4b
Showing 1 changed file with 84 additions and 94 deletions.
178 changes: 84 additions & 94 deletions datasets/xsum/xsum.py
@@ -16,16 +16,15 @@
 # Lint as: python3
 """XSum dataset."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import json
+import logging
 import os
 
-import logging
 import nlp
 
+
 _CITATION = """
 @article{Narayan2018DontGM,
   title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},
@@ -49,109 +48,100 @@
 'xsum-extracts-from-downloads.tar.gz' and put in manually downloaded folder.
 """
 
-_URL = "https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"
+_URL = (
+    "https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"
+)
 
 _DOCUMENT = "document"
 _SUMMARY = "summary"
 
-_REMOVE_LINES = set([
-    "Share this with\n", "Email\n", "Facebook\n", "Messenger\n", "Twitter\n",
-    "Pinterest\n", "WhatsApp\n", "Linkedin\n", "LinkedIn\n", "Copy this link\n",
-    "These are external links and will open in a new window\n"
-])
+_REMOVE_LINES = set(
+    [
+        "Share this with\n",
+        "Email\n",
+        "Facebook\n",
+        "Messenger\n",
+        "Twitter\n",
+        "Pinterest\n",
+        "WhatsApp\n",
+        "Linkedin\n",
+        "LinkedIn\n",
+        "Copy this link\n",
+        "These are external links and will open in a new window\n",
+    ]
+)
 
 
 class Xsum(nlp.GeneratorBasedBuilder):
-  """Extreme Summarization (XSum) Dataset."""
+    """Extreme Summarization (XSum) Dataset."""
 
-  # Version 1.1.0 removes web contents.
-  VERSION = nlp.Version("1.1.0")
-  SUPPORTED_VERSIONS = [nlp.Version("1.0.0", "Dataset without cleaning.")]
+    # Version 1.1.0 removes web contents.
+    VERSION = nlp.Version("1.1.0")
+    SUPPORTED_VERSIONS = [nlp.Version("1.0.0", "Dataset without cleaning.")]
 
-  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+    MANUAL_DOWNLOAD_INSTRUCTIONS = """\
 Detailed download instructions (which require running a custom script) are
 here:
 https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md . Please make sure you run download-bbc-articles.py and parse-bbc-html-data.py scripts
 """
 
-  def _info(self):
-    return nlp.DatasetInfo(
-        description=_DESCRIPTION,
-        features=nlp.Features({
-            _DOCUMENT: nlp.Value('string'),
-            _SUMMARY: nlp.Value('string'),
-        }),
-        supervised_keys=(_DOCUMENT, _SUMMARY),
-        homepage=
-        "https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
-        citation=_CITATION,
-    )
-
-  def _split_generators(self, dl_manager):
-    """Returns SplitGenerators."""
-    dl_path = dl_manager.download_and_extract(_URL)
-    import ipdb
-    ipdb.set_trace()
-    with open(dl_path, "r") as json_file:
-      split_ids = json.load(json_file)
-    downloaded_path = os.path.join(dl_manager.manual_dir, "xsum-extracts-from-downloads")
-    return [
-        nlp.SplitGenerator(
-            name=nlp.Split.TRAIN,
-            gen_kwargs={
-                "split_ids": split_ids["train"],
-                "path": downloaded_path,
-            },
-        ),
-        nlp.SplitGenerator(
-            name=nlp.Split.VALIDATION,
-            gen_kwargs={
-                "split_ids": split_ids["validation"],
-                "path": downloaded_path,
-            },
-        ),
-        nlp.SplitGenerator(
-            name=nlp.Split.TEST,
-            gen_kwargs={
-                "split_ids": split_ids["test"],
-                "path": downloaded_path,
-            },
-        ),
-    ]
-
-  def _generate_examples(self, split_ids=None, path=None):
-    """Yields examples."""
-    missing = 0
-    total_num = len(split_ids)
-    for i in split_ids:
-      filename = os.path.join(path, i + ".data")
-      print(filename)
-
-      if os.path.exists(filename):
-        with open(filename) as f:
-
-          text = "".join([
-              line for line in f.readlines()
-              if line not in _REMOVE_LINES and line.strip()
-          ])
-
-          # Each file follows below format:
-          # [XSUM]URL[XSUM]
-          # http://somelink
-          #
-          # [XSUM]INTRODUCTION[XSUM]
-          # some intro
-          #
-          # [XSUM]RESTBODY[XSUM]
-          # text line.
-          # another text line.
-          # "another text line."
-          segs = text.split("[XSUM]")
-          yield i, {_DOCUMENT: segs[6].strip(), _SUMMARY: segs[4].strip()}
-      else:
-        missing += 1
-        logging.info("id %s missing.", i)
-    if missing:
-      logging.warning("%d out of %d examples are missing.", missing, total_num)
+    def _info(self):
+        return nlp.DatasetInfo(
+            description=_DESCRIPTION,
+            features=nlp.Features({_DOCUMENT: nlp.Value("string"), _SUMMARY: nlp.Value("string"),}),
+            supervised_keys=(_DOCUMENT, _SUMMARY),
+            homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        dl_path = dl_manager.download_and_extract(_URL)
+        with open(dl_path, "r") as json_file:
+            split_ids = json.load(json_file)
+        downloaded_path = os.path.join(dl_manager.manual_dir, "xsum-extracts-from-downloads")
+        return [
+            nlp.SplitGenerator(
+                name=nlp.Split.TRAIN, gen_kwargs={"split_ids": split_ids["train"], "path": downloaded_path,},
+            ),
+            nlp.SplitGenerator(
+                name=nlp.Split.VALIDATION, gen_kwargs={"split_ids": split_ids["validation"], "path": downloaded_path,},
+            ),
+            nlp.SplitGenerator(
+                name=nlp.Split.TEST, gen_kwargs={"split_ids": split_ids["test"], "path": downloaded_path,},
+            ),
+        ]
+
+    def _generate_examples(self, split_ids=None, path=None):
+        """Yields examples."""
+        missing = 0
+        total_num = len(split_ids)
+        for i in split_ids:
+            filename = os.path.join(path, i + ".data")
+            print(filename)
+
+            if os.path.exists(filename):
+                with open(filename) as f:
+
+                    text = "".join([line for line in f.readlines() if line not in _REMOVE_LINES and line.strip()])
+
+                    # Each file follows below format:
+                    # [XSUM]URL[XSUM]
+                    # http://somelink
+                    #
+                    # [XSUM]INTRODUCTION[XSUM]
+                    # some intro
+                    #
+                    # [XSUM]RESTBODY[XSUM]
+                    # text line.
+                    # another text line.
+                    # "another text line."
+                    segs = text.split("[XSUM]")
+                    yield i, {_DOCUMENT: segs[6].strip(), _SUMMARY: segs[4].strip()}
+            else:
+                missing += 1
+                logging.info("id %s missing.", i)
+        if missing:
+            logging.warning("%d out of %d examples are missing.", missing, total_num)
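Note: the indexing in _generate_examples follows from how str.split behaves on the "[XSUM]" markers. A minimal standalone sketch, using a hypothetical sample that mirrors the format described in the code comments (real .data files come from the manual download):

# Sketch of the "[XSUM]" parsing done in _generate_examples above.
# The sample text is hypothetical.
sample = (
    "[XSUM]URL[XSUM]\n"
    "http://somelink\n"
    "\n"
    "[XSUM]INTRODUCTION[XSUM]\n"
    "some intro\n"
    "\n"
    "[XSUM]RESTBODY[XSUM]\n"
    "text line.\n"
    "another text line.\n"
)

segs = sample.split("[XSUM]")
# segs[0] is "" (text before the first marker), segs[1] == "URL",
# segs[2] is the URL, segs[3] == "INTRODUCTION", segs[4] is the intro,
# segs[5] == "RESTBODY", segs[6] is the article body.
summary = segs[4].strip()   # "some intro"  -> stored under _SUMMARY
document = segs[6].strip()  # article body  -> stored under _DOCUMENT
print(summary)
print(document)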

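For context, once the manual download has been prepared per MANUAL_DOWNLOAD_INSTRUCTIONS, loading would look roughly like the sketch below. This assumes nlp.load_dataset accepts a data_dir argument for manually downloaded data, as the library's manual-download builders did at the time; the path is a placeholder for the directory containing "xsum-extracts-from-downloads".

# Hypothetical usage sketch (not part of this commit).
import nlp

dataset = nlp.load_dataset("xsum", data_dir="/path/to/xsum/manual")
print(dataset["train"][0]["summary"])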