-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing1.py
162 lines (139 loc) · 5.75 KB
/
preprocessing1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import re
import os
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError
from collections import Counter
from hangul_utils import split_syllables
# Reading perimetric complexity
# Build the Hangul jamo rows and append them to the existing per-character
# complexity table, which is then indexed by character code.
jamo = pd.read_csv('PerimComplexRAW.csv', sep='\t')
# Remove only the trailing ".pnm" extension.  ``str.strip('.pnm')`` treats its
# argument as a *set of characters* and strips any of '.', 'p', 'n', 'm' from
# both ends of the value, which can eat characters of the name itself.
jamo['ImageRef'] = (
    jamo['ImageRef']
    .str.replace(r'\.pnm$', '', regex=True)
    .str.upper()
)
# ``rename`` returns a new frame, so the column assignments below do not
# write into a slice of the raw table.
jamo = jamo[['ImageRef', 'PCComplexity']].rename(
    columns={'ImageRef': 'character'})
jamo_c = pd.read_csv('jamo/results.csv')
# Assignment aligns on the default integer row index.
# NOTE(review): presumably both files list the jamo in the same order --
# confirm, otherwise the compression values end up on the wrong rows.
jamo['compressed'] = jamo_c['compressed']
# Script-level metadata, identical for every jamo row.
jamo['folder'] = 'Jamo'
jamo['LengthScript'] = jamo.shape[0]
jamo['Idiosyncratic'] = 0
jamo['Type'] = 'featural system'
jamo['Family'] = 'East Asian'
jamo['Ancestor'] = np.nan
complexity = pd.concat([pd.read_csv('data_bothC-namesOK.csv'), jamo])
complexity.to_csv('data_bothC-namesOK_J.csv')
dfs = []
# complexity = pd.read_csv('data_bothC-namesOK_J.csv', sep=',')
# Character codes are used as merge keys below, so strip stray whitespace
# before turning them into the index.
complexity['character'] = complexity['character'].str.strip()
complexity = complexity.set_index('character')
# Language codes and script folders dropped from the final dataset.
exceptions_l = ['arz', 'pes', 'prs', 'snd', 'urd', 'grt', 'grc']
exceptions_s = ['Deva', 'Limb']
def char_to_unicode(char):
    '''
    Convert char into respective unicode pointer
    in the same format as in Helena's and Olivier's
    database

    The code point of the single character ``char`` is written as
    upper-case hexadecimal, zero-padded on the left to five digits
    (code points that need six digits are returned unpadded).

    Raises TypeError (from ``ord``) if ``char`` is not a string of
    length one.
    '''
    # '05X': upper-case hex, minimum width 5, padded with zeros.
    return format(ord(char), '05X')
def convert_list_of_words(file):
    '''
    Convert the list of word frequencies to letter frequencies

    ``file`` is the path of a CSV file with (at least) the columns
    ``textfile`` (a word) and ``Freq`` (its count).  Every word is split
    into its characters and the counts are summed per character.

    Returns a DataFrame sorted by descending ``Freq`` with the columns
    ``textfile`` (the character), ``Freq``, ``unicode`` (see
    ``char_to_unicode``), ``lang`` (leading run of word characters of the
    file name) and ``file`` (the path as given).

    Raises ValueError if the file name does not start with a word
    character, so no language label can be derived from it.
    '''
    # Use the last path component, whatever the directory depth;
    # ``file.split('/')[1]`` only works for exactly "dir/name".
    filename = os.path.basename(file)
    match = re.match(r'^(\w+)', filename)
    if match is None:
        raise ValueError(
            'cannot derive a language name from file name %r' % filename)
    name = match.group(0)

    freq = pd.read_csv(file)
    # NOTE(review): read_csv's default NA parsing turns words such as "NA"
    # or "null" into NaN, which astype('str') renders as 'nan' -- confirm
    # whether that matters for the corpora processed here.
    freq['textfile'] = freq['textfile'].astype('str').apply(list)
    # One row per character occurrence, then total count per character.
    freq = (
        freq.explode('textfile')[['textfile', 'Freq']]
        .groupby('textfile')
        .sum()
        .sort_values(by='Freq', ascending=False)
        .reset_index()
    )
    freq['unicode'] = freq['textfile'].apply(char_to_unicode)
    freq['lang'] = str(name)
    freq['file'] = file
    return freq
########### Parsing bible corpora ######################################
# lgs = ['Armenian', 'Cherokee', 'Coptic', 'Thai',
# 'Telugu', 'Syriac', 'Malayalam', 'Myanmar',
# 'Kannada', 'Korean', 'Amharic', 'Cherokee']
# fls = []
# for filename in os.listdir('bibles'):
# if any(s in filename for s in lgs):
# try:
# root = ET.fromstring(open('bibles/' + filename).read())
# except ParseError:
# print(filename)
# with open('texts2/' + filename.strip('.xml') + '.txt', 'w', encoding='utf-8') as out:
# for n in root.iter('seg'):
# try:
# out.write(n.text.strip() + '\n')
# except AttributeError:
# pass
# Character counts from the plain-text files in ``texts/`` (one file per
# language; the file name minus its 4-character extension is the label).
dfs = []
for txtfile in os.listdir('texts'):
    # Exact comparison: ``txtfile not in '.DS_Store'`` would be a substring
    # test and would also skip any file whose name is a fragment of it.
    if txtfile == '.DS_Store':
        continue
    # Explicit encoding so decoding does not depend on the platform locale.
    with open('texts/' + txtfile, 'r', encoding='utf-8') as file:
        text = file.read()
    if 'chr' in txtfile:
        # Case is kept for these files (no lower-casing).
        text = ''.join(e for e in text if e.isalnum())
    elif 'kor' in txtfile:
        # NOTE(review): split_syllables presumably decomposes Hangul
        # syllable blocks into jamo -- confirm against hangul_utils.
        text = split_syllables(''.join(e for e in text if e.isalnum()))
    else:
        text = ''.join(e for e in text.lower() if e.isalnum())
    # One row per distinct character: column 'index' holds the character,
    # column 0 its count (renamed after concatenation below).
    df = pd.DataFrame.from_dict(Counter(text), orient='index').reset_index()
    df['lang'] = txtfile[:-4]
    dfs.append(df)
data_2 = pd.concat(dfs)
data_2.columns = ['textfile', 'Freq', 'lang']
data_2['unicode'] = data_2['textfile'].apply(char_to_unicode)
# Inner join on the character code: characters without a complexity entry
# are dropped here.
data_2 = data_2.set_index('unicode').merge(
    complexity, left_index=True, right_index=True).reset_index()
# ``.copy()`` so the column assignments below do not target a filtered view.
data_2 = data_2[
    ~data_2['folder'].isin(['Cyrl', 'Latn', 'Cyrs', 'Grek'])].copy()
# Relative frequency of each character within its language.
data_2['Sum_count'] = data_2['Freq'].groupby(data_2['lang']).transform('sum')
data_2['Rel_freq'] = data_2['Freq']/data_2['Sum_count']
data_2['Source'] = 'bible.com'
########### Bentz data #################################################
exceptions = ['eng-x-bible-scriptures.csv']
# Collect into a fresh list: ``dfs`` still holds the per-language frames
# built from ``texts/``, whose columns differ from the ones produced by
# convert_list_of_words and must not be concatenated with them.
bentz_dfs = [
    convert_list_of_words('FreqDists_50K/' + filename)
    for filename in os.listdir('FreqDists_50K/')
    if filename not in exceptions
]
# Concatenate frequencies from individual files
res_full = pd.concat(bentz_dfs, sort=True)
# Sum frequencies by identical characters in each language
# (cause multiple files).  Named aggregation is used because passing a
# renaming dict to SeriesGroupBy.agg raises SpecificationError in
# pandas >= 1.0.  'file' is the number of rows merged into each group.
res_full = (
    res_full.groupby(['lang', 'unicode', 'textfile'])['Freq']
    .agg(Freq='sum', file='size')
    .reset_index()
    .set_index(['unicode'])
)
# Merge with Cognition paper dataset
res_full = res_full.merge(complexity,
                          left_index=True, right_index=True).reset_index()
res_full['Sum_count'] = res_full['Freq'].groupby(res_full['lang']).transform('sum')
res_full['Rel_freq'] = res_full['Freq']/res_full['Sum_count']
# Remove cyrillic and latin characters
# (``.copy()`` so later column assignments do not target a filtered view)
res_full = res_full[~res_full['folder'].isin(['Cyrl', 'Latn', 'Cyrs'])].copy()
# Sum of probabilities (if language has occasional latin
# or cyrillic characters,
# we expect it to be at least 0.99, languages with smaller values are removed)
res_full['Sum_prob'] = res_full['Rel_freq'].groupby(res_full['lang']).transform('sum')
res_full = res_full[res_full['Sum_prob'] > 0.99]
# Removing outliers
i = res_full[(res_full['lang'] == 'pes') & (res_full['folder'] == 'Armn')].index
res_full = res_full.drop(i, axis=0)
# Count frequencies once again
res_full['Sum_count'] = res_full['Freq'].groupby(res_full['lang']).transform('sum')
res_full['Rel_freq'] = res_full['Freq']/res_full['Sum_count']
# Scripts present here but absent from the bible.com data (data_2)
diff = list(set(res_full.folder.unique()) - set(data_2.folder.unique()))
# Making the final dataset
res_full = res_full[res_full.folder.isin(diff)].copy()
res_full['Source'] = 'Bentz & Ferrer-i-Cancho, 2016'
res_full.loc[res_full.lang == 'Shavian', 'Source'] = "shavian.info"
data = pd.concat([res_full, data_2], sort=False).reset_index(drop=True)
# Drop the excluded languages and scripts.
data = data[~data.lang.isin(exceptions_l)]
data = data[~data.folder.isin(exceptions_s)].copy()
data['Unicode'] = data['textfile'].apply(char_to_unicode)
data = data.rename(columns={"Rel_freq": "Relative_frequency",
                            "compressed": "Compression",
                            "folder": "ISO_script",
                            'lang': 'ISO_language',
                            'PCComplexity': 'Perimetric_complexity',
                            'Freq': 'Frequency'})
data.reset_index(drop=True).to_csv('final.csv')