Skip to content

Commit

Permalink
add thai stopwords
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
  • Loading branch information
sarahyurick committed Oct 15, 2024
1 parent 65affd6 commit 46829e3
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 0 deletions.
4 changes: 4 additions & 0 deletions nemo_curator/download/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ def get_stop_list_dict(languages=[]):
lang_key = language.upper()
stop_list_dict[lang_key] = justext.get_stoplist(language)

# List obtained from https://github.com/stopwords-iso/stopwords-th
from .thai_stopwords import thai_stopwords
stop_list_dict["THAI"] = thai_stopwords

return stop_list_dict


Expand Down
1 change: 1 addition & 0 deletions nemo_curator/download/thai_stopwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Thai stop words, kept in an immutable set so the collection cannot be
# modified after import.
# Source list: https://github.com/stopwords-iso/stopwords-th
thai_stopwords = frozenset(
    {
        "กล่าว", "กว่า", "กัน", "กับ", "การ", "ก็", "ก่อน", "ขณะ",
        "ขอ", "ของ", "ขึ้น", "คง", "ครั้ง", "ความ", "คือ", "จะ",
        "จัด", "จาก", "จึง", "ช่วง", "ซึ่ง", "ดัง", "ด้วย", "ด้าน",
        "ตั้ง", "ตั้งแต่", "ตาม", "ต่อ", "ต่าง", "ต่างๆ", "ต้อง", "ถึง",
        "ถูก", "ถ้า", "ทั้ง", "ทั้งนี้", "ทาง", "ทำ", "ทำให้", "ที่",
        "ที่สุด", "ทุก", "นอกจาก", "นัก", "นั้น", "นำ", "นี้", "น่า",
        "บาง", "ผล", "ผ่าน", "พบ", "พร้อม", "มา", "มาก", "มี",
        "ยัง", "รวม", "ระหว่าง", "รับ", "ราย", "ร่วม", "ลง", "วัน",
        "ว่า", "สำหรับ", "สุด", "ส่ง", "ส่วน", "หนึ่ง", "หรือ", "หลัง",
        "หลังจาก", "หลาย", "หาก", "อยาก", "อยู่", "อย่าง", "ออก", "อะไร",
        "อาจ", "อีก", "เขา", "เข้า", "เคย", "เฉพาะ", "เช่น", "เดียว",
        "เดียวกัน", "เนื่องจาก", "เปิด", "เปิดเผย", "เป็น", "เป็นการ", "เพราะ", "เพื่อ",
        "เมื่อ", "เรา", "เริ่ม", "เลย", "เห็น", "เอง", "แต่", "แบบ",
        "แรก", "และ", "แล้ว", "แห่ง", "โดย", "ใน", "ให้", "ได้",
        "ไป", "ไม่", "ไว้",
    }
)

0 comments on commit 46829e3

Please sign in to comment.