Skip to content

Commit

Permalink
Added the cleaning script, updated README
Browse files Browse the repository at this point in the history
  • Loading branch information
lechatpito committed Oct 15, 2014
1 parent 68e0444 commit bab444b
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 6 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
word2vec-api
============

Simple web service wrapping a Word2Vec as implemented in Gensim.
Simple web service providing a word embedding API. The methods are based on the Gensim Word2Vec implementation. Models are passed as parameters and must be in the Word2Vec text or binary format.

* Launching the service
```
Expand All @@ -11,6 +11,9 @@ python word2vec-api --model path/to/the/model [--host host --port 1234]
* Example calls
```
curl http://127.0.0.1:5000/wor2vec/n_similarity?ws1=Sushi&ws1=Shop&ws2=Japanese&ws2=Restaurant
curl http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese
curl http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=]
curl http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese
curl http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=]
curl http://127.0.0.1:5000/wor2vec/model?word=restaurant
```

Note: The "model" method returns a base64 encoding of the Word2Vec vector.
102 changes: 102 additions & 0 deletions clean-word2vec-text-format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/python

import codecs
import os
import gc
import shutil

SOURCE = 'glove.840B.300d.txt'
TARGET = 'glove.840B.300d--modified.txt'
TMP = 'tmp.file'

LENGTH_BY_PREFIX = [
    (0xC0, 2),  # first byte mask, total codepoint length
    (0xE0, 3),
    (0xF0, 4),
    (0xF8, 5),
    (0xFC, 6),
]


def codepoint_length(first_byte):
    """Return the byte length of the UTF-8 sequence started by ``first_byte``.

    ``first_byte`` is the integer value (0-255) of the first byte of an
    encoded character.

    Returns 1 for ASCII bytes (< 128), the length listed in
    ``LENGTH_BY_PREFIX`` for a recognised lead byte, and 0 when the byte
    matches no lead-byte mask (e.g. a continuation byte 0x80-0xBF).
    """
    if first_byte < 128:
        return 1  # ASCII
    # Every mask's bits are a superset of the shorter masks' bits, so a
    # 3-byte lead such as 0xE2 also satisfies the 2-byte mask 0xC0.  Test
    # the longest prefixes first so the most specific mask wins.
    for mask, length in reversed(LENGTH_BY_PREFIX):
        if first_byte & mask == mask:
            return length
    return 0

def read_utf8_char_and_decode(source):
    """Read one UTF-8 encoded character from ``source`` and decode it.

    ``source`` is a file-like object whose ``read(n)`` returns a byte
    string.  The first byte is read on its own, ``codepoint_length`` decides
    how many further bytes belong to the character, and the whole sequence
    is decoded as UTF-8.

    Returns the decoded unicode character, or ``u''`` when the stream is
    exhausted, when the first byte is not a valid lead byte, or when the
    byte sequence does not decode.
    """
    lead = source.read(1)
    if not lead:
        return u''  # end of stream

    char_len = codepoint_length(ord(lead))
    if not char_len:
        return u''  # not a valid first byte

    raw = lead + source.read(char_len - 1)
    try:
        return raw.decode('utf8')
    except UnicodeDecodeError:
        # Only decoding failures mean "bad character"; anything else
        # (e.g. KeyboardInterrupt) must propagate to the caller.
        return u''

# Characters that invalidate an entry when they appear inside the word:
# the empty string (EOF or an undecodable byte sequence), no-break space,
# horizontal ellipsis and the various Unicode line-break characters.
wrong_chars = frozenset([
    u'', u'\u00A0', u'\u2026', u'\u000A', u'\u000B', u'\u000C', u'\u000D',
    u'\u0085', u'\u2028', u'\u2029',
])

count = 0  # entries written to the temporary file
bad = 0    # entries rejected

# Pass 1: copy every well-formed entry from SOURCE to TMP.  The number of
# surviving entries is only known at the end, so the header line of the
# target file cannot be written yet.
with open(SOURCE, mode='r') as source:
    with codecs.open(TMP, mode='w', encoding='utf8') as tmp:
        # The first line is expected to hold two whitespace-separated
        # fields: the entry count and the vector dimension.
        line = source.readline()
        vsize, nbdim = line.split()
        vsize = int(vsize)
        print(vsize)
        print("Started ...")

        i = 0
        while i < vsize:
            if i % 100000 == 0:
                print(i)  # progress indicator
            i += 1

            # Read the word one character at a time, up to and including
            # the space that separates it from the vector components.
            s = u''
            c = u''
            word_ok = True
            while c != u' ':
                c = read_utf8_char_and_decode(source)
                if c in wrong_chars:
                    if c:
                        print('Error %s' % repr(c))
                    bad += 1
                    # Discard the remainder of the offending line.  If the
                    # offending character is the newline itself, the line
                    # has already been consumed; calling readline() then
                    # would throw away the next, unrelated entry.
                    if c != u'\n':
                        source.readline()
                    word_ok = False
                    break
                s += c
            if not word_ok:
                continue

            # The rest of the line is the vector; keep the entry only if
            # it decodes as UTF-8.
            s2 = source.readline()
            try:
                s2 = s2.decode('utf8')
            except UnicodeDecodeError:
                print("Error: %s" % s2)
                bad += 1
                continue
            count += 1
            tmp.write(s + s2)

print("%d bad words" % bad)
print("%d total word count" % count)
print("Now copying to the target file...")

# Pass 2: write the header with the real entry count, then append the
# cleaned entries.  The dimension is taken from the source header instead
# of being hard-coded, so files of any dimensionality are handled.
with codecs.open(TMP, mode='r', encoding='utf8') as tmp:
    with codecs.open(TARGET, mode='w', encoding='utf8') as target:
        target.write("%d %s\n" % (count, nbdim))
        shutil.copyfileobj(tmp, target)

os.remove(TMP)
gc.collect()

print("Done.")
9 changes: 6 additions & 3 deletions word2vec-api.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ def get(self):
try:
res = model.most_similar_cosmul(positive=pos,negative=neg,topn=t)
return res
except:
except Exception, e:
print e
print res



class Model(Resource):
def get(self):
parser = reqparse.RequestParser()
Expand All @@ -66,7 +68,8 @@ def get(self):
res = model[args['word']]
res = base64.b64encode(res)
return res
except:
except Exception, e:
print e
return

app = Flask(__name__)
Expand Down

0 comments on commit bab444b

Please sign in to comment.