-
Notifications
You must be signed in to change notification settings - Fork 3
/
splitters.py
63 lines (53 loc) · 2.09 KB
/
splitters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
import re
ascii_lowercase = "abcdefghijklmnopqrstuvwxyz"
ascii_uppercase = ascii_lowercase.upper()

# States w/ thanks to https://github.com/unitedstates/python-us
# Titles w/ thanks to https://github.com/nytimes/emphasis and @donohoe
# Abbreviations that are looked up for capitalized tokens. Entries are stored
# lower-cased and without their trailing period.
abbr_capped = "|".join([
    "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|"
    "neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo",  # States
    "u.s",
    "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|"
    "snr|atty|supt",  # Titles
    "ave|blvd|st|rd|hwy",  # Streets
    "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec",  # Months
    "|".join(ascii_lowercase)  # Initials
]).split("|")

# Abbreviations that are looked up for non-capitalized tokens.
# This must be a list, not the raw "a|b|c" string: with a string, `x in
# abbr_lowercase` is a substring search, so fragments such as "t", "pc" or ""
# would wrongly count as abbreviations.
abbr_lowercase = "etc|v|vs|viz|al|pct".split("|")

# Dotted acronyms that is_sentence_ender() never treats as a sentence ending.
exceptions = "U.S.|U.N.|E.U.|F.B.I.|C.I.A.".split("|")
def is_abbreviation(dotted_word):
    """Return True if *dotted_word* is a known abbreviation.

    The final character of *dotted_word* (the period, for the callers in
    this module) is dropped before the lookup:

    * if the remainder starts with an ASCII capital letter, it is
      lower-cased and looked up in ``abbr_capped``;
    * otherwise it is looked up, unchanged, in ``abbr_lowercase``.

    A token with nothing before its final character (``"."`` or ``""``)
    is never an abbreviation.
    """
    clipped = dotted_word[:-1]
    if not clipped:
        # Nothing precedes the final character; clipped[0] would raise
        # IndexError, so answer directly.
        return False
    if clipped[0] in ascii_uppercase:
        return clipped.lower() in abbr_capped
    return clipped in abbr_lowercase
def is_sentence_ender(word):
    """Return True if *word* should be treated as the last word of a sentence.

    Checks are applied in this order:

    1. an empty string is never a sentence ender;
    2. anything listed in ``exceptions`` is never a sentence ender;
    3. a word ending in "?" or "!" always is;
    4. a word containing more than one ASCII capital letter always is;
    5. a word ending in "." is, unless ``is_abbreviation`` recognises it.

    Anything else is not a sentence ender.
    """
    if not word:
        # Guard the word[-1] lookups below against an empty string.
        return False
    if word in exceptions:
        return False
    if word[-1] in ("?", "!"):
        return True
    # Strip everything except A-Z and count what is left.
    if len(re.sub(r"[^A-Z]", "", word)) > 1:
        return True
    return word[-1] == "." and not is_abbreviation(word)
def split_into_sentences(text):
    """Split *text* into a list of whitespace-stripped sentences.

    A candidate boundary is a word ending in ".", "?" or "!", optionally
    followed by closing quotes/brackets, then whitespace that is not
    followed by a lowercase ASCII letter or a dash. A candidate becomes a
    real boundary only when ``is_sentence_ender`` accepts the word. The
    text is cut immediately after the closing quotes/brackets.

    The result always has at least one element: text without any
    boundary comes back as a single stripped string.
    """
    boundary_pattern = re.compile(
        r"([\w\.'’&\]\)]+[\.\?!])"  # A word that ends with punctuation
        r"([‘’“”'\"\)\]]*)"  # Followed by optional quote/parens/etc
        r"(\s+(?![a-z\-–—]))",  # Followed by whitespace + non-(lowercase or dash)
        re.U,
    )

    # Offsets at which one sentence stops and the next one starts.
    cut_points = []
    for candidate in boundary_pattern.finditer(text):
        if is_sentence_ender(candidate.group(1)):
            # Groups 1 and 2 are contiguous from the start of the match, so
            # the end of group 2 is the position just past the closing
            # quotes/brackets.
            cut_points.append(candidate.end(2))

    pieces = []
    begin = 0
    for cut in cut_points:
        pieces.append(text[begin:cut].strip())
        begin = cut
    # Whatever follows the last boundary (possibly nothing) is the final piece.
    pieces.append(text[begin:].strip())
    return pieces