forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_dataset_scripts.py
67 lines (47 loc) · 2.79 KB
/
test_dataset_scripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
from pathlib import Path
from unittest import TestCase
import pytest
@pytest.mark.integration
class TestDatasetScripts(TestCase):
    """Static text checks applied to every ``*.py`` file found below ``./datasets``.

    The checks are regular-expression scans of each file's text; the scanned files are only read, never
    imported. ``./datasets`` is resolved against the current working directory.
    """

    # Both patterns are compiled once for the class rather than once per scanned file.
    #
    # NOTE: the unescaped `+` in `w+`, `wb+` and `ab+` is a regex quantifier, not a literal plus sign. Modes such
    # as "w+" are still excluded, because `\bw\b` already matches the `w` that precedes the `+`.
    _OPEN_WITHOUT_ENCODING_RE = re.compile(r"(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b)(?<=\s)(open)\((.*)\)")
    # re.DOTALL lets the triple-quoted-string alternative span several lines.
    _PRINT_CALL_RE = re.compile(
        r"#[^\r\n]*print\(|\"[^\r\n]*print\(|\"\"\".*?print\(.*?\"\"\"|(print\()", re.DOTALL
    )

    def _dataset_script_paths(self):
        """Return all ``*.py`` files below ``./datasets`` (recursive), sorted so reports are deterministic."""
        return sorted(Path("./datasets").absolute().glob("**/*.py"))

    def _no_encoding_on_file_open(self, filepath: str):
        r"""Find the first place where a non-binary file is opened without an explicit encoding.

        The search uses a regular expression on the file's text. Its parts are:

        (?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b): Lookahead that discards the match if `encoding`, `rb`, etc.
            appear as a whole word later on the same line (`.` does not cross newlines here).
        (?<=\s): Lookbehind that requires `open` to be preceded by one whitespace character.
        (open)\((.*)\): Capture `open` and everything inside its parentheses.

        Args:
            filepath: Path of the file to scan; it is read as UTF-8.

        Returns:
            The first regex match object, or ``None`` when nothing matches.
        """
        with open(filepath, encoding="utf-8") as input_file:
            input_text = input_file.read()
        return self._OPEN_WITHOUT_ENCODING_RE.search(input_text)

    def _no_print_statements(self, filepath: str):
        r"""Find the first `print(` call in a Python script that is not in a comment or double-quoted string.

        The regular expression has three "ignore" alternatives and one capturing alternative:

        #[^\r\n]*print\(: `print(` after a `#` on the same line. Ignored.
        \"[^\r\n]*print\(: `print(` after a double quote on the same line. Ignored.
        \"\"\".*?print\(.*?\"\"\": `print(` between two triple double quotes; re.DOTALL lets `.` match
            newlines so the string may span lines. Ignored.
        (print\(): A `print(` reached by none of the alternatives above. This is the only capturing group.

        Args:
            filepath: Path of the file to scan; it is read as UTF-8.

        Returns:
            The first match whose capturing group is set, or ``None`` when there is none.
        """
        with open(filepath, encoding="utf-8") as input_file:
            input_text = input_file.read()
        # Iterate over all matches: a plain `search` would stop at an ignored alternative if one comes first.
        # `finditer` never yields None, so only the capturing group has to be checked.
        return next(
            (match for match in self._PRINT_CALL_RE.finditer(input_text) if match.group(1) is not None),
            None,
        )

    def test_no_encoding_on_file_open(self):
        """Fail if any dataset script opens a text file without an explicit encoding.

        Raises:
            AssertionError: One message line per offending file.
        """
        offenders = [path for path in self._dataset_script_paths() if self._no_encoding_on_file_open(str(path))]
        if offenders:
            # Report every offender at once instead of stopping at the first one.
            raise AssertionError("\n".join(f"open(...) must use utf-8 encoding in {path}" for path in offenders))

    def test_no_print_statements(self):
        """Fail if any dataset script contains a `print(` call.

        Raises:
            AssertionError: One message line per offending file.
        """
        offenders = [path for path in self._dataset_script_paths() if self._no_print_statements(str(path))]
        if offenders:
            # Report every offender at once instead of stopping at the first one.
            raise AssertionError(
                "\n".join(
                    f"print statement found in {path}. Use datasets.logger/logging instead." for path in offenders
                )
            )