Skip to content

Commit

Permalink
Fix tokenizer
Browse files Browse the repository at this point in the history
Set do_basic_tokenize=False so that "[speaker1]" and "[speaker2]" are tokenized correctly.
  • Loading branch information
lemon234071 committed Dec 29, 2020
1 parent cf94149 commit 5de4f3c
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ def train():
config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
tokenizer_class = BertTokenizer
if args.pretrained:
tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, do_lower_case=True)
tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, do_lower_case=True, do_basic_tokenize=False)
model = model_class.from_pretrained(args.model_checkpoint)
else:
tokenizer = tokenizer_class(os.path.join(args.model_checkpoint, "vocab.txt"), do_lower_case=True)
tokenizer = tokenizer_class(os.path.join(args.model_checkpoint, "vocab.txt"), do_lower_case=True, do_basic_tokenize=False)
config = config_class.from_json_file(os.path.join(args.model_checkpoint, CONFIG_NAME))
model = model_class(config)
model.to(args.device)
Expand Down

0 comments on commit 5de4f3c

Please sign in to comment.