diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index 0238237d..f8357e4f 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -224,6 +224,10 @@ def get_stop_list_dict(languages=[]):
             lang_key = language.upper()
         stop_list_dict[lang_key] = justext.get_stoplist(language)
 
+    # List obtained from https://github.com/stopwords-iso/stopwords-th
+    from .thai_stopwords import thai_stopwords
+    stop_list_dict["THAI"] = thai_stopwords
+
     return stop_list_dict
 
 
diff --git a/nemo_curator/download/thai_stopwords.py b/nemo_curator/download/thai_stopwords.py
new file mode 100644
index 00000000..70891e5c
--- /dev/null
+++ b/nemo_curator/download/thai_stopwords.py
@@ -0,0 +1 @@
+thai_stopwords = frozenset(["กล่าว","กว่า","กัน","กับ","การ","ก็","ก่อน","ขณะ","ขอ","ของ","ขึ้น","คง","ครั้ง","ความ","คือ","จะ","จัด","จาก","จึง","ช่วง","ซึ่ง","ดัง","ด้วย","ด้าน","ตั้ง","ตั้งแต่","ตาม","ต่อ","ต่าง","ต่างๆ","ต้อง","ถึง","ถูก","ถ้า","ทั้ง","ทั้งนี้","ทาง","ทำ","ทำให้","ที่","ที่สุด","ทุก","นอกจาก","นัก","นั้น","นำ","นี้","น่า","บาง","ผล","ผ่าน","พบ","พร้อม","มา","มาก","มี","ยัง","รวม","ระหว่าง","รับ","ราย","ร่วม","ลง","วัน","ว่า","สำหรับ","สุด","ส่ง","ส่วน","หนึ่ง","หรือ","หลัง","หลังจาก","หลาย","หาก","อยาก","อยู่","อย่าง","ออก","อะไร","อาจ","อีก","เขา","เข้า","เคย","เฉพาะ","เช่น","เดียว","เดียวกัน","เนื่องจาก","เปิด","เปิดเผย","เป็น","เป็นการ","เพราะ","เพื่อ","เมื่อ","เรา","เริ่ม","เลย","เห็น","เอง","แต่","แบบ","แรก","และ","แล้ว","แห่ง","โดย","ใน","ให้","ได้","ไป","ไม่","ไว้"])