From bab444b73dc140442f6140984d56f9b3e59c5884 Mon Sep 17 00:00:00 2001
From: lechatpito <francois.scharffe@3top.com>
Date: Wed, 15 Oct 2014 18:23:44 -0400
Subject: [PATCH] Added the cleaning script, updated README

---
 README.md                     |   9 ++-
 clean-word2vec-text-format.py | 102 ++++++++++++++++++++++++++++++++++
 word2vec-api.py               |   9 ++-
 3 files changed, 114 insertions(+), 6 deletions(-)
 create mode 100644 clean-word2vec-text-format.py

diff --git a/README.md b/README.md
index 0dd151b..ec4d9b9 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 word2vec-api
 ============
 
-Simple web service wrapping a Word2Vec as implemented in Gensim.
+Simple web service providing a word embedding API. The methods are based on the Gensim Word2Vec implementation. Models are passed as parameters and must be in the Word2Vec text or binary format.
 
 * Launching the service
 ```
@@ -11,6 +11,9 @@ python word2vec-api --model path/to/the/model [--host host --port 1234]
 * Example calls
 ```
 curl http://127.0.0.1:5000/wor2vec/n_similarity?ws1=Sushi&ws1=Shop&ws2=Japanese&ws2=Restaurant  
-curl  http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese   
-curl  http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=]
+curl http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese   
+curl http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=]
+curl http://127.0.0.1:5000/wor2vec/model?word=restaurant
 ```
+
+Note: The "model" method returns a base64 encoding of the Word2Vec vector.
diff --git a/clean-word2vec-text-format.py b/clean-word2vec-text-format.py
new file mode 100644
index 0000000..0062c6c
--- /dev/null
+++ b/clean-word2vec-text-format.py
@@ -0,0 +1,102 @@
+#!/usr/bin/python
+
+import codecs
+import os
+import gc
+import shutil
+
+SOURCE = 'glove.840B.300d.txt' # input file, read by the script below
+TARGET = 'glove.840B.300d--modified.txt' # output file, written at the end of the script
+TMP = 'tmp.file' # intermediate file; removed once TARGET has been written
+
+LENGTH_BY_PREFIX = [ # consulted by codepoint_length() for non-ASCII first bytes
+  (0xC0, 2), # first byte mask, total codepoint length
+  (0xE0, 3), 
+  (0xF0, 4),
+  (0xF8, 5), # NOTE(review): 5- and 6-byte forms are not valid UTF-8 under RFC 3629
+  (0xFC, 6),
+]
+
+def codepoint_length(first_byte):
+    if first_byte < 128:
+        return 1 # ASCII
+    for mask, length in LENGTH_BY_PREFIX:
+        if first_byte & mask == mask:
+            return length
+        else:
+            return 0
+
+def read_utf8_char_and_decode(source):
+    c = source.read(1)
+    if c:
+        char_len = codepoint_length(ord(c))
+    else:
+        return u''
+    if char_len:
+        c += source.read(char_len-1)
+        try:
+            c=c.decode('utf8')
+        except:
+            return u''
+    else:
+        return u''
+    return c
+
+source = open(SOURCE,mode='r')
+tmp = codecs.open(TMP,mode='w',encoding='utf8')
+line = source.readline()
+vsize, nbdim = line.split()
+vsize = int(vsize)
+print vsize
+count = 0
+bad = 0
+i = 0
+wrong_chars = [u'',u'\u00A0',u'\u2026',u'\u000A', u'\u000B', u'\u000C', u'\u000D', u'\u0085', u'\u2028', u'\u2029']
+print "Started ..."
+while i<vsize:
+    if i % 100000 == 0:
+        print i
+    i+=1
+    s = u''
+    c = u''
+    while c != u' ':
+        c = read_utf8_char_and_decode(source)
+        if c in wrong_chars:
+            if c:
+	        print 'Error %s' % repr(c)
+            bad+=1
+            source.readline()
+            break
+        else:
+            s += c
+    if c in wrong_chars:
+        continue
+    s2 = source.readline()
+    try:
+        s2 = s2.decode('utf8')
+    except:
+        print "Error: %s" % s2
+        bad += 1
+        continue
+    count += 1
+    tmp.write(s+s2)
+
+print "%d bad words" % bad
+print "%d total word count" % count
+print "Now copying to the target file..."
+
+source.close()
+tmp.close()
+
+with codecs.open(TMP,mode='r',encoding='utf8') as tmp:
+    with codecs.open(TARGET,mode='w',encoding='utf8') as target:
+        target.write("%d 300\n" % (count))
+        shutil.copyfileobj(tmp, target)
+
+tmp.close()
+target.close()
+os.remove(TMP)
+len(gc.get_objects())
+gc.collect()
+
+print("Done.")
diff --git a/word2vec-api.py b/word2vec-api.py
index f7418a6..9e80966 100644
--- a/word2vec-api.py
+++ b/word2vec-api.py
@@ -54,9 +54,11 @@ def get(self):
         try:    
             res = model.most_similar_cosmul(positive=pos,negative=neg,topn=t)
             return res
-        except:
+        except Exception, e:
+            print e
             print res
-            
+
+
 class Model(Resource):
     def get(self):
         parser = reqparse.RequestParser()
@@ -66,7 +68,8 @@ def get(self):
             res = model[args['word']]
             res = base64.b64encode(res)
             return res
-        except:
+        except Exception, e:
+            print e
             return
 
 app = Flask(__name__)