Only the following labels are available: ['MS', 'EN', 'OTHERS', 'CAPITAL', 'NOT_LANG'].
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/en-substrings.json
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/ms-substrings.json
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/en-ms-substrings.json
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/en-ms-substrings-v2.json
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/ms-en-substrings.json
- https://huggingface.co/datasets/mesolitica/substring-language-detection/resolve/main/ms-en-substrings-v2.json
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus! Substring language detection},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/substring-language-detection}}
}