forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_dummy_data_autogenerate.py
103 lines (86 loc) · 4.18 KB
/
test_dummy_data_autogenerate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import shutil
from tempfile import TemporaryDirectory
from unittest import TestCase
from datasets.builder import GeneratorBasedBuilder
from datasets.commands.dummy_data import DummyDataGeneratorDownloadManager, MockDownloadManager
from datasets.features import Features, Value
from datasets.info import DatasetInfo
from datasets.splits import Split, SplitGenerator
from datasets.utils.download_manager import DownloadConfig
from datasets.utils.version import Version
# Exact text that ``test_create_xml_dummy_data`` expects to read back from the
# generated XML file. The comparison is a plain string equality, so every
# character of this literal (including whitespace and the absence of a trailing
# newline) is significant.
# NOTE(review): confirm the leading whitespace of the inner lines matches what
# ``_create_xml_dummy_data`` actually writes.
EXPECTED_XML_DUMMY_DATA = """\
<tmx version="1.4">
<header segtype="sentence" srclang="ca" />
<body>
<tu>
<tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
<tuv xml:lang="en"><seg>Content 1</seg></tuv>
</tu>
</body>
</tmx>"""
class DummyBuilder(GeneratorBasedBuilder):
    """Minimal text dataset builder backed by files in a local directory.

    The builder exposes a single string feature, ``"text"``, and two splits
    (train and test) read from ``train.txt`` and ``test.txt`` located under
    ``tmp_test_dir``.
    """

    def __init__(self, tmp_test_dir, *args, **kwargs):
        """Initialise the base builder, then remember the source directory.

        Args:
            tmp_test_dir: Directory that contains ``train.txt`` and ``test.txt``.
            *args: Forwarded unchanged to ``GeneratorBasedBuilder``.
            **kwargs: Forwarded unchanged to ``GeneratorBasedBuilder``.
        """
        super().__init__(*args, **kwargs)
        self.tmp_test_dir = tmp_test_dir

    def _info(self) -> DatasetInfo:
        """Return dataset metadata declaring one string column named ``text``."""
        schema = Features({"text": Value("string")})
        return DatasetInfo(features=schema)

    def _split_generators(self, dl_manager):
        """Resolve both source files through ``dl_manager`` and declare the splits.

        Args:
            dl_manager: Object providing ``download_and_extract``; it receives a
                dict mapping ``"train"``/``"test"`` to absolute file paths and
                its result is indexed with the same keys.

        Returns:
            A list with the train ``SplitGenerator`` followed by the test one.
        """
        source_dir = self.tmp_test_dir
        sources = {
            "train": os.path.abspath(os.path.join(source_dir, "train.txt")),
            "test": os.path.abspath(os.path.join(source_dir, "test.txt")),
        }
        resolved = dl_manager.download_and_extract(sources)
        split_keys = ((Split.TRAIN, "train"), (Split.TEST, "test"))
        return [
            SplitGenerator(split, gen_kwargs={"filepath": resolved[key]})
            for split, key in split_keys
        ]

    def _generate_examples(self, filepath, **kwargs):
        """Yield ``(line_index, {"text": stripped_line})`` for each line of ``filepath``.

        Args:
            filepath: Path of the UTF-8 text file to read.
            **kwargs: Accepted and ignored.
        """
        with open(filepath, encoding="utf-8") as handle:
            for idx, raw_line in enumerate(handle):
                example = {"text": raw_line.strip()}
                yield idx, example
class DummyDataAutoGenerationTest(TestCase):
    """Exercises dummy data auto-generation end to end with ``DummyBuilder``."""

    def test_dummy_data_autogenerate(self):
        """Auto-generate dummy data, then rebuild the dataset from that dummy data.

        Steps, all inside one temporary directory:
          1. write ``train.txt`` and ``test.txt`` (20 lines each);
          2. prepare the dataset through a ``DummyDataGeneratorDownloadManager``;
          3. auto-generate and compress dummy data limited to ``n_lines`` lines;
          4. prepare the dataset again through the mock download manager and
             check that the train split has exactly ``n_lines`` rows.
        """
        n_lines = 5

        with TemporaryDirectory() as tmp_dir:
            # Both source files receive the same 20-line content.
            for file_name in ("train.txt", "test.txt"):
                with open(os.path.join(tmp_dir, file_name), "w", encoding="utf-8") as out:
                    out.write("foo\nbar\n" * 10)

            # Point the mock manager's scripts directory inside the temp dir.
            class MockDownloadManagerWithCustomDatasetsScriptsDir(MockDownloadManager):
                datasets_scripts_dir = os.path.join(tmp_dir, "datasets")

            cache_dir = os.path.join(tmp_dir, "cache")
            os.makedirs(cache_dir, exist_ok=True)

            builder = DummyBuilder(tmp_test_dir=tmp_dir, cache_dir=cache_dir)
            mock_manager = MockDownloadManagerWithCustomDatasetsScriptsDir(
                dataset_name=builder.name,
                config=None,
                version=Version("0.0.0"),
                use_local_dummy_data=True,
                cache_dir=cache_dir,
                load_existing_dummy_data=False,  # dummy data don't exist yet
            )
            download_config = DownloadConfig(cache_dir=os.path.join(tmp_dir, "downloads"))
            generator_manager = DummyDataGeneratorDownloadManager(
                dataset_name=builder.name,
                mock_download_manager=mock_manager,
                download_config=download_config,
            )

            # First pass: build from the real files, then drop the builder's
            # cache directory before the second pass.
            builder.download_and_prepare(dl_manager=generator_manager, try_from_hf_gcs=False)
            shutil.rmtree(builder._cache_dir)

            # Produce the dummy data folder and compress it under the scripts dir.
            generator_manager.auto_generate_dummy_data_folder(n_lines=n_lines)
            dataset_script_dir = os.path.join(
                mock_manager.datasets_scripts_dir,
                mock_manager.dataset_name,
            )
            generator_manager.compress_autogenerated_dummy_data(dataset_script_dir)

            # Second pass: the dummy data exists now, so let the mock manager load it.
            mock_manager.load_existing_dummy_data = True
            builder.download_and_prepare(
                dl_manager=mock_manager,
                ignore_verifications=True,
                try_from_hf_gcs=False,
            )
            train_split = builder.as_dataset(split="train")
            self.assertEqual(len(train_split), n_lines)
            # NOTE(review): presumably drops the reference so files under tmp_dir
            # are released before the directory is removed -- confirm.
            del train_split
def test_create_xml_dummy_data(xml_file, tmp_path):
    """Check the XML dummy data written for tag ``"tu"`` with ``n_lines=1``.

    Args:
        xml_file: Source XML file (a test fixture; presumably defined in the
            project's conftest -- confirm).
        tmp_path: Directory in which the output ``file.xml`` is created.
    """
    dst_path = tmp_path / "file.xml"
    DummyDataGeneratorDownloadManager._create_xml_dummy_data(xml_file, dst_path, "tu", n_lines=1)
    # Decode explicitly as UTF-8, like every other open() in this module, so the
    # comparison does not depend on the platform's locale encoding.
    with open(dst_path, encoding="utf-8") as f:
        xml_dummy_data = f.read()
    assert xml_dummy_data == EXPECTED_XML_DUMMY_DATA