Skip to content

Commit

Permalink
Handle malformed escapes in parser
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe committed Sep 9, 2024
1 parent 8c77e42 commit bf7679c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
5 changes: 4 additions & 1 deletion text/_slugify_char_map/icu_parser.peggy
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,12 @@ Char

Hex = [0-9a-fA-F]

Escape = Uescape / Xescape
Escape = Uescape / Xescape / MalformedUescape
Xescape = "\\x" h:(Hex Hex) { return hexToChar(h) }
Uescape = "\\u" h:(Hex Hex Hex Hex) { return hexToChar(h) }
// Malformed `\u` escape with the `u` missing (a backslash followed directly by
// four hex digits). Only occurs on one line in one file in the ICU repo 🤷
// https://github.com/unicode-org/icu/blob/f062f52c123f436eb1142115ba2e4e7b65a4ac8f/icu4c/source/data/translit/my_my_Latn.txt#L130
MalformedUescape = "\\" h:(Hex Hex Hex Hex) { return hexToChar(h) }

CharClass
= "[" ch:(CharRange / Char)* "]" { return ch.flat() }
Expand Down
4 changes: 2 additions & 2 deletions text/_slugify_char_map/map.json
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,7 @@
"ryo": "りょ,リョ,りょう,リョウ",
"ryu": "りゅ,リュ,りゅう,リュウ",
"s": "⒮,🄢,s,ꞩ,ſ,ꜱ,ᵴ,ᶊ,ʂ,ȿ,ẜ,ẝ,σ,ς,с,ს,ս,ס,ש,שׂ,ث,س,ص,ܣ,ܤ,ܨ,ސ,ޞ,ශ,ෂ,ස,ซ,ศ,ส,ဆ,သ,ဿ",
"sa": "ሣ,ሳ,श,ष,स,শ,ষ,স,ਸ,ਸ਼,\\0a3c,શ,ષ,સ,ଶ,ଷ,ସ,ஶ,ஷ,ஸ,శ,ష,స,ಶ,ಷ,ಸ,ശ,ഷ,സ,さ,サ,仨,卅,挱,挲,摋,撒,櫒,泧,洒,潵,灑,脎,萨,薩,虄,訯,躠,鈒,钑,隡,靸,颯,飒,馺,㒎,㚫,㪪,㽂,䊛,䙣,䬃,𠎷,𠦃,𠬙,𠮿,𠱡,𠿓,𡄳,𡐥,𡒁,𢓔,𢕬,𢫬,𢻨,𣀯,𣜂,𣬬,𥋌,𥵯,𥸗,𥻦,𦠿,𦻅,𦼧,𧀕,𧭝,𨃛,𨆂,𨐖,𨷆,𩆅,𩎕,𩐅,𩗉,𩗞,𩨞,𪠡,𫂿,𱂃,𱅂,𱗂,,,",
"sa": "ሣ,ሳ,श,ष,स,শ,ষ,স,ਸ,ਸ਼,શ,ષ,સ,ଶ,ଷ,ସ,ஶ,ஷ,ஸ,శ,ష,స,ಶ,ಷ,ಸ,ശ,ഷ,സ,さ,サ,仨,卅,挱,挲,摋,撒,櫒,泧,洒,潵,灑,脎,萨,薩,虄,訯,躠,鈒,钑,隡,靸,颯,飒,馺,㒎,㚫,㪪,㽂,䊛,䙣,䬃,𠎷,𠦃,𠬙,𠮿,𠱡,𠿓,𡄳,𡐥,𡒁,𢓔,𢕬,𢫬,𢻨,𣀯,𣜂,𣬬,𥋌,𥵯,𥸗,𥻦,𦠿,𦻅,𦼧,𧀕,𧭝,𨃛,𨆂,𨐖,𨷆,𩆅,𩎕,𩐅,𩗉,𩗞,𩨞,𪠡,𫂿,𱂃,𱅂,𱗂,,,",
"sai": "僿,嗮,嘥,噻,塞,愢,揌,毢,毸,簺,腮,賽,赛,顋,鰓,鳃,㗷,㘔,㩙,䈢,䚡,䰄,𡬉,𦞫,𪃄,𫬐,𱂲",
"san": "三,仐,伞,俕,傘,厁,叁,壭,帴,弎,散,橵,毵,毶,毿,犙,糁,糂,糝,糣,糤,繖,鏒,鏾,閐,饊,馓,鬖,㤾,㧲,㪔,㪚,䈀,䉈,䊉,䫅,䫩,𡙘,𢁘,𢕕,𣀧,𣀫,𣬛,𣮠,𣯶,𥒬,𦙱,𦙸,𦡨,𦷻,𦺻,𧗋,𧱆,𧽾,𨸃,𩀲,𩀼,𩞀,𩭹,𩯑,𫔌,𬭝,𬱬,𰏕,𰬷",
"sang": "丧,喪,嗓,搡,桑,桒,槡,磉,褬,鎟,顙,颡,䘮,䡦,䫙,𡕏,𡠏,𣉕,𣊝,𣞙,𤸯,𥔫,𦅇,𦟄,𧍨,𨢆,𩐷,𩦌,𩺞,𪔬,𫄪,𬨑,𱈎,𱮒",
Expand Down Expand Up @@ -799,7 +799,7 @@
"yay": "ြေ",
"ye": "є,ье,ѣ,የ,ዬ,ြီ,ᅨ,业,也,亪,亱,倻,僷,冶,叶,吔,啘,嘢,噎,嚈,埜,墷,壄,夜,嶪,嶫,抴,捓,捙,掖,揶,擛,擨,擪,擫,晔,暍,曄,曅,曗,曳,曵,枼,枽,椰,業,歋,殗,洂,液,漜,潱,澲,烨,燁,爗,爷,爺,璍,皣,瞱,瞸,礏,耶,腋,葉,蠮,謁,谒,邺,鄓,鄴,野,釾,鋣,鍱,鎁,鎑,鐷,铘,靥,靨,頁,页,⻚,餣,饁,馌,驜,鵺,鸈,㖡,㗼,㙒,㡋,㥷,㩎,㪑,㱉,㱌,㸣,䁆,䈎,䊦,䎨,䓉,䢡,䤳,䤶,䥟,䥡,䥺,䧨,䭇,䭎,䭟,䱒,䲜,𠀸,𠄅,𠟪,𠥇,𠱝,𡀽,𡁁,𡑀,𡛌,𡛽,𡽣,𢀘,𢉥,𢢜,𢪧,𢬍,𢱴,𣎩,𣐂,𣚋,𣚕,𣩫,𣩯,𣰛,𤑷,𤝇,𤝉,𤝱,𤳪,𥌅,𥠍,𥮧,𥯘,𦀕,𦂡,𦕆,𦠜,𦤪,𦰳,𧎭,𧏽,𧐓,𧒐,𧔦,𧗖,𨂒,𨈺,𨉅,𨶮,𨸌,𨼥,𨽀,𩉂,𩐱,𩑃,𩘏,𩜺,𩱝,𩸾,𩼋,𩼴,𪋫,𪍅,𪑦,𪒲,𫥺,𫩤,𫩫,𬑓,𬒆,𬰺,𬲼,𬳀,𰉪,𰎑,𰑸,𰓙,𰚱,𰾕,𰾩,𱇰,𱛹,𲈍,𲍿,𮳴,𮴔",
"yee": "ြီး",
"yet": "ျက်,ြက\\103a",
"yet": "ျက်,ြက်",
"yh": "ܞ",
"yi": "ї,ዪ,ይ,一,乁,乂,义,乊,乙,亄,亦,亿,以,仪,伇,伊,伿,佁,佚,佾,侇,依,俋,倚,偯,儀,億,兿,冝,刈,劓,劮,勚,勩,匇,匜,医,吚,呓,呭,呹,咦,咿,唈,噫,囈,圛,圯,坄,垼,埶,埸,墿,壱,壹,夁,夷,奕,姨,媐,嫕,嫛,嬄,嬑,嬟,宐,宜,宧,寱,寲,屹,峄,峓,崺,嶧,嶬,嶷,已,巸,帟,帠,幆,庡,廙,异,弈,弋,弌,弬,彛,彜,彝,彞,役,忆,怈,怡,怿,恞,悒,悘,悥,意,憶,懌,懿,扅,扆,抑,拸,挹,掜,揖,撎,攺,敡,敼,斁,旑,旖,易,晹,暆,曀,曎,杙,枍,枻,柂,栘,栧,栺,桋,棭,椅,椬,椸,榏,槸,檍,檥,檹,欥,欭,歝,殔,殪,殹,毅,毉,沂,沶,泆,洢,浂,浥,浳,渏,湙,溢,漪,潩,澺,瀷,炈,焲,熠,熤,熪,熼,燚,燡,燱,狋,猗,獈,玴,珆,瑿,瓵,畩,異,疑,疫,痍,痬,瘗,瘞,瘱,癔,益,眙,睪,瞖,矣,硛,礒,祎,禕,秇,移,稦,穓,竩,笖,箷,簃,籎,縊,繄,繶,繹,绎,缢,羛,羠,義,羿,翊,翌,翳,翼,耛,耴,肄,肊,胰,膉,臆,舣,艗,艤,艺,芅,苅,苡,苢,萓,萟,蓺,薏,藙,藝,蘙,虉,蚁,蛜,蛡,蛦,蜴,螔,螘,螠,蟻,衣,衤,⻂,衪,衵,袘,袣,裔,裛,裿,褹,襼,觺,訑,訲,訳,詍,詑,詒,詣,誃,誼,謻,譩,譯,議,讉,讛,议,译,诒,诣,谊,豙,豛,豷,貖,貤,貽,賹,贀,贻,跇,跠,踦,軼,輢,轙,轶,辷,迆,迤,迻,逘,逸,遗,遺,邑,郼,酏,醫,醳,醷,釔,釴,鈘,鈠,鉯,銥,鎰,鏔,鐿,钇,铱,镒,镱,陭,隿,霬,靾,頉,頤,頥,顊,顗,颐,飴,饐,饴,駅,驛,驿,骮,鮨,鯣,鳦,鶂,鶃,鶍,鷁,鷊,鷖,鷧,鷾,鸃,鹝,鹢,鹥,黓,黟,黳,齮,齸,㐌,㐹,㑊,㑜,㑥,㓷,㔴,㕈,㖂,㘁,㘈,㙠,㙪,㙯,㚤,㚦,㛄,㛕,㛳,㜋,㜒,㝖,㝣,㞔,㠖,㠯,㡫,㡼,㢞,㣇,㣻,㥋,㥴,㦉,㦤,㦾,㫊,㰘,㰝,㰻,㱅,㱞,㱲,㲼,㳑,㳖,㴁,㴒,㵝,㵩,㶠,㹫,㹭,㺿,㼢,㽈,㾨,䃜,䄁,䄩,䄬,䄿,䆿,䇩,䇵,䇼,䉗,䉝,䉨,䋚,䋵,䌻,䎈,䒾,䓃,䓈,䓹,䔟,䔬,䔱,䕍,䖁,䖊,䖌,䗑,䗟,䗷,䘝,䘸,䚷,䝘,䝝,䝯,䞅,䢃,䣡,䣧,䦴,䧅,䧇,䧧,䩟,䪰,䫑,䬁,䬥,䬮,䭂,䭞,䭲,䭿,䮊,䯆,䰙,䰯,䱌,䲑,䴊,䴬,䵝,𠂆,𠄱,𠅌,𠈶,𠍫,𠍳,𠏩,𠐀,𠓋,𠗺,𠚮,𠛃,𠜁,𠡔,𠡝,𠤕,𠤗,𠤘,𠥦,𠨾,𠩗,𠩫,𠪗,𠬤,𠮙,𠯋,𠰄,𠲔,𠲖,𠲚,𠲺,𠲻,𠶷,𠼪,𠽜,𠿣,𡄵,𡄻,𡉛,𡊁,𡊶,𡍡,𡜬,𡥁,𡬓,𡱐,𡷪,𡻣,𡼎,𡾾,𢀁,𢂒,𢂗,𢂼,𢄅,𢇙,𢇚,𢇸,𢈶,𢊘,𢍰,𢎀,𢎃,𢎉,𢏗,𢓀,𢓡,𢕷,𢖅,𢖫,𢖴,𢖺,𢗎,𢘽,𢙇,𢞉,𢡃,𢣉,𢦕,𢨮,𢨳,𢩮,𢩼,𢱁,𢷔,𣎅,𣐓,𣐵,𣐿,𣕁,𣘦,𣙛,𣚘,𣡊,𣢭,𣢷,𣤪,𣦌,𣧄,𣨟,𣫙,𣶫,𣷩,𣸘,𣿉,𤆾,𤇴,𤈙,𤑹,𤖪,𤘊,𤝳,𤝻,𤣨,𤣮,𤤺,𤥿,𤧕,𤬩,𤴧,𤶛,𤷅,𤸸,𤻂,𤼌,𥃠,𥃸,𥄻,𥄿,𥅓,𥌟,𥍴,𥏜,𥑴,𥒵,𥘒,𥘠,𥙁,𥙇,𥜃,𥜥,𥟘,𥡪,𥥌,𥥴,𥩖,𥫃,𥫜,𥫝,𥰧,𥱃,𥸊,𥹋,𥾐,𥿹,𦌩,𦎝,𦏸,𦓻,𦔜,𦔥,𦘳,𦙨,𦚟,𦟧,𦠉,𦡫,𦥱,𦨇,𦭥,𦮸,𦶂,𧃟,𧅖,𧆦,𧈻,𧉅,𧊣,𧊤,𧋏,𧑌,𧓗,𧔮,𧙡,𧜤,𧡇,𧢂,𧣟,𧣬,𧦧,𧫦,𧬇,𧮒,𧱊,𧱏,𧳁,𧷅,𧷥,𧺎,𧺝,𧾰,𨋯,𨛯,𨜶,𨜽,𨠑,𨠶,𨣠,𨣬,𨦯,𨱁,𨳷,𨹝,𨻊,𨻏,𨽹,𩂒,𩂹,𩈭,𩋌,𩍖,𩎭,𩎷,𩓧,𩔦,𩕲,𩖹,𩖾,𩗑,𩘧,𩚂,𩚇,𩛆,𩛮,𩟉,𩠂,𩡖,𩡣,𩣞,𩤒,𩥯,𩧭,𩪟,𩪣,𩮵,𩳇,𩴜,𩴮,𩷍,𩷘,𩸨,𩼨,𩾘,𩾠,𩾢,𪀓,𪀕,𪁚,𪁛,𪈨,𪎈,𪐔,𪐘,𪐣,𪒕,𪕶,𪗷,𪘃,𪘬,𪙴,𪪴,𪹀,𪽷,𫄷,𫍙,𫍟,𫍡,𫐎,𫖮,𬟁,𬤞,𬤦,𬥵,𬬩,𬭰,𬱪,𬲳,𬷼,𬺈,𭣧,𭩚,𮩞,𮬜,𰉣,𰞇,𰲹,𰳵,𰵔,𰵖,𰵥,𰶁,𰶊,𰷠,𰷪,𰹵,𰼅,𱁱,𱇬,𱉇,𱉌,𱉷,𱊄,𱊈,𱊦,𱊰,𱌷,𱌽,𲍇,𮰸,𮵠",
"yin": "ျင်,ြင်,乑,乚,⺃,侌,冘,凐,印,吟,吲,喑,噖,噾,嚚,囙,因,圁,垔,垠,垽,堙,堷,夤,姻,婣,婬,寅,尹,峾,崟,崯,嶾,廕,廴,引,愔,慇,慭,憖,憗,懚,斦,朄,栶,檃,檭,檼,櫽,歅,殥,殷,氤,泿,洇,洕,淫,淾,湚,溵,滛,濥,濦,烎,犾,狺,猌,珢,璌,瘖,瘾,癊,癮,硍,碒,磤,禋,秵,窨,筃,粌,絪,緸,胤,苂,茚,茵,荫,荶,蒑,蔩,蔭,蘟,蚓,螾,蟫,裀,訔,訚,訡,誾,諲,讔,赺,趛,輑,鄞,酳,鈏,鈝,銀,銦,铟,银,闉,阥,阴,陰,陻,隂,隐,隠,隱,霒,霠,霪,靷,鞇,音,韾,飮,飲,饮,駰,骃,鮣,鷣,齗,龂,龈,㐆,㐺,㒚,㕂,㖗,㙬,㝙,㞤,㡥,㣧,㥯,㥼,㦩,㧈,㧢,㪦,㱃,㴈,㶏,㸒,㹜,㹞,䄄,䇙,䌥,䒡,䓄,䓰,䕃,䕾,䖐,䖜,䚿,䜾,䡛,䤃,䨸,䪩,䲟,䴦,𠃊,𠖟,𠪚,𠽨,𡇂,𡈲,𡋪,𡐔,𡓓,𡓿,𡖣,𡩘,𡸛,𡼽,𢂨,𢉩,𢋻,𢌲,𢓕,𢓙,𢛦,𢝯,𢪪,𢳃,𢷍,𣓆,𣔸,𣘴,𣦫,𣱐,𣱜,𣸊,𣽮,𤂹,𤝎,𤢦,𤯸,𤵯,𤷏,𤻘,𥖵,𥤷,𥬜,𥮍,𦈑,𦈠,𦜲,𦝴,𦟘,𦻕,𦾻,𧊭,𧥸,𧦸,𧦹,𧩬,𨈧,𨋙,𨏈,𨐐,𨒦,𨓮,𨛊,𨟏,𨟴,𨡢,𨢂,𨦆,𩂢,𩂥,𩃬,𩐞,𩖄,𩚕,𩬵,𪔰,𪔽,𪘎,𪙤,𪙾,𪛊,𪺽,𫜃,𫡑,𫮜,𫷮,𬄩,𬘡,𬤇,𬮱,𬺒,𮙊,𰝋,𰺈,𰽣",
Expand Down
11 changes: 6 additions & 5 deletions text/slugify.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ function getTransliterationRe(charMap: Map<string, string>) {
return /[^\s\S]/gu;
}

const re = new RegExp(
// sort length descending to ensure longer substrings are matched first
`(?:${[...charMap.keys()].sort((a, b) => b.length - a.length).join("|")})`,
"gu",
);
// sort length descending to ensure longer substrings are matched first
const source = `(?:${
[...charMap.keys()].sort((a, b) => b.length - a.length).join("|")
})`;

const re = new RegExp(source, "gu");

transliterationReCache.set(charMap, re);
return re;
Expand Down

0 comments on commit bf7679c

Please sign in to comment.