forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tok.py
executable file
·34 lines (24 loc) · 844 Bytes
/
tok.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import sacremoses
def main(args):
"""Tokenizes, preserving tabs"""
mt = sacremoses.MosesTokenizer(lang=args.lang)
def tok(s):
return mt.tokenize(s, return_str=True)
for line in sys.stdin:
parts = list(map(tok, line.split("\t")))
print(*parts, sep="\t", flush=True)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--lang", "-l", default="en")
parser.add_argument("--penn", "-p", action="store_true")
parser.add_argument("--fields", "-f", help="fields to tokenize")
args = parser.parse_args()
main(args)