From bab444b73dc140442f6140984d56f9b3e59c5884 Mon Sep 17 00:00:00 2001 From: lechatpito Date: Wed, 15 Oct 2014 18:23:44 -0400 Subject: [PATCH] Added the cleaning script, updated README --- README.md | 9 ++- clean-word2vec-text-format.py | 102 ++++++++++++++++++++++++++++++++++ word2vec-api.py | 9 ++- 3 files changed, 114 insertions(+), 6 deletions(-) create mode 100644 clean-word2vec-text-format.py diff --git a/README.md b/README.md index 0dd151b..ec4d9b9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ word2vec-api ============ -Simple web service wrapping a Word2Vec as implemented in Gensim. +Simple web service providing a word embedding API. The methods are based on Gensim Word2Vec implementation. Models are passed as parameters and must be in the Word2Vec text or binary format. * Launching the service ``` @@ -11,6 +11,9 @@ python word2vec-api --model path/to/the/model [--host host --port 1234] * Example calls ``` curl http://127.0.0.1:5000/wor2vec/n_similarity?ws1=Sushi&ws1=Shop&ws2=Japanese&ws2=Restaurant -curl http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese -curl http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=] +curl http://127.0.0.1:5000/wor2vec/similarity?w1=Sushi&w2=Japanese +curl http://127.0.0.1:5000/wor2vec/most_similar?positive=indian&positive=food[&negative=][&topn=] +curl http://127.0.0.1:5000/wor2vec/model?word=restaurant ``` + +Note: The "model" method returns a base64 encoding of the Word2Vec vector. 
# Each entry is (leading 1 bits of a UTF-8 lead byte, total sequence length in
# bytes).  Lengths 5 and 6 are not valid UTF-8 any more, but they are kept so
# that the reader consumes such a sequence as one unit before rejecting it.
LENGTH_BY_PREFIX = [
    (0xC0, 2),
    (0xE0, 3),
    (0xF0, 4),
    (0xF8, 5),
    (0xFC, 6),
]


def codepoint_length(first_byte):
    """Return the byte length of the UTF-8 sequence started by first_byte.

    first_byte is the integer value (0-255) of the first byte of a sequence.

    Returns 1 for ASCII, 2 to 6 for a multi-byte lead byte, and 0 when the
    byte cannot start a sequence (a continuation byte 0x80-0xBF, or
    0xFE/0xFF).
    """
    if first_byte < 128:
        return 1  # ASCII
    for prefix, length in LENGTH_BY_PREFIX:
        # A lead byte is the run of 1 bits in `prefix` followed by a 0 bit.
        # Testing only the 1 bits would make every byte >= 0xC0 match the
        # first entry and report length 2, so the mask is widened by one bit
        # to check the terminating 0 as well.
        if (first_byte & ((prefix >> 1) | 0x80)) == prefix:
            return length
    return 0


def read_utf8_char_and_decode(source):
    """Read one UTF-8 encoded character from source and return it decoded.

    source is a file-like object whose read(n) returns raw bytes (a file
    opened in Python 2 'r' mode or in binary mode).

    Returns the decoded character as a unicode string.  Returns u'' at end
    of stream, when the byte read cannot start a sequence, or when the bytes
    read do not form valid UTF-8.  In the invalid cases the offending bytes
    have already been consumed, so the next call continues after them.
    """
    lead = source.read(1)
    if not lead:
        return u''  # end of stream
    char_len = codepoint_length(ord(lead))
    if not char_len:
        return u''  # stray continuation byte or invalid lead byte
    raw = lead + source.read(char_len - 1)
    try:
        return raw.decode('utf8')
    except UnicodeDecodeError:
        # Truncated, overlong or otherwise malformed sequence.
        return u''