-
Notifications
You must be signed in to change notification settings - Fork 28
/
tsv_glue.py
112 lines (82 loc) · 3.48 KB
/
tsv_glue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
'''This module offers a simple interface to the TSV files we want to store.
You will note that it doesn't use the csv module. This is because that
module has a number of annoying habits, the most significant one being that
it does not support unicode at all. (There are wrappers which solve
this, but I'd prefer to keep dependencies, particularly those that don't
come with the OS, to a minimum.) However, the final straw was the fact that
it wanted to escape quote characters even though I specified no quoting.
It's really a very simple format. We'll be OK.
WARNING: Keep the TSV dialect consistent with README.
See http://docs.python.org/library/io.html for the meaning of the buffering
parameter. '''
import collections
import io
import u
class Reader(object):
    '''Iterate over the rows of a UTF-8 encoded TSV file.

    Read rows are returned as lists, one item per tab-separated column.
    Empty strings are converted to None. Converting to numbers, etc., is
    the responsibility of the caller.'''

    __slots__ = ('filename', 'fp')

    def __init__(self, filename, buffering=-1):
        '''Open the TSV file filename for reading and return the reader
           object. buffering is passed through to io.open(). If the file
           cannot be opened (e.g., it does not exist), the exception raised
           by io.open() propagates to the caller.'''
        self.filename = filename
        self.fp = io.open(self.filename, mode='rt', buffering=buffering,
                          encoding='utf8')

    def __iter__(self):
        '''The reader is its own iterator.'''
        return self

    def close(self):
        '''Close the underlying file.'''
        self.fp.close()

    def next(self):
        '''Return the next row as a list of columns, or raise StopIteration
           at end of file.'''
        line = self.fp.readline()
        # readline() returns '' only at end of file; a blank line still
        # carries its newline. Test for EOF *before* stripping so that a
        # blank line (i.e., a row holding a single empty column) is returned
        # as [None] instead of silently ending the iteration early.
        if (line == ''):
            raise StopIteration
        line = line.rstrip('\n')
        return [(col if col != '' else None) for col in line.split('\t')]

    # Python 3 spells the iterator protocol method __next__; keep next() for
    # Python 2 and for callers that invoke it directly.
    __next__ = next
class Writer(object):
    '''Write rows to a UTF-8 encoded TSV file, one line per row. Columns are
    separated by tabs; None is written as an empty column and every other
    value is converted to text. No quoting or escaping is performed.'''

    __slots__ = ('filename', 'fp')

    def __init__(self, filename=None, fp=None, buffering=-1, clobber=False):
        '''Open a TSV file for writing and return the writer object, creating a
           new file if one does not already exist. If clobber is True, overwrite
           any existing contents, if False (the default), append.

           If fp is given, write to that already-open text stream instead;
           nothing is opened, so buffering and clobber have no effect, and
           filename is only recorded on the object.'''
        self.filename = filename
        if (fp is not None):
            self.fp = fp
        else:
            mode = 'wt' if clobber else 'at'
            self.fp = io.open(self.filename, mode=mode, buffering=buffering,
                              encoding='utf8')

    def close(self):
        '''Close the underlying stream.'''
        self.fp.close()

    def writerow(self, row):
        '''Write the iterable row as one tab-separated line.'''
        # The text type is unicode on Python 2 and str on Python 3; deriving
        # it from a literal avoids naming either builtin.
        text_type = type(u'')

        def _unicodify(s):
            if s is None:
                return u''
            elif isinstance(s, text_type):
                return s
            else:
                return text_type(s)

        # Use text literals for the separator and terminator: with byte
        # string literals an empty row produced a byte string on Python 2,
        # which the io text stream rejects with TypeError.
        self.fp.write(u'\t'.join([_unicodify(i) for i in row]) + u'\n')
class Dict(collections.defaultdict):
    '''A lazy-loading dictionary of open TSV files. Essentially:

       >>> t = tsv_glue.Dict('/tmp/foo_')
       >>> t['bar'].writerow([1,2,3])

    In the second line, if t['bar'] isn't already an open TSV file, it will
    magically become one (stored in "/tmp/foo_bar.tsv").'''

    def __init__(self, prefix, class_=Writer, buffering=-1, clobber=False):
        '''prefix is prepended to each key to build the filename; class_ is
           called with filename, buffering, and clobber keyword arguments to
           open a file the first time its key is looked up.'''
        super(Dict, self).__init__()
        self.prefix = prefix
        self.class_ = class_
        self.buffering = buffering
        self.clobber = clobber

    def __missing__(self, key):
        '''Open the file for key, store it under key, and return it.'''
        filename = self.filename_from_key(key)
        self[key] = self.class_(filename=filename, buffering=self.buffering,
                                clobber=self.clobber)
        u.l.debug('lazy opened %s' % (filename))
        return self[key]

    def close(self):
        '''Close every file opened so far. The entries stay in the
           dictionary.'''
        # values() rather than itervalues() so this runs on Python 2 and 3.
        for f in list(self.values()):
            f.close()

    def filename_from_key(self, key):
        '''Return the filename used for key: prefix + key + ".tsv".'''
        return (self.prefix + key + '.tsv')

    def iterfiles(self):
        '''Return a list of the filenames of all files opened so far.'''
        # Iterating the dict yields its keys on both Python 2 and 3.
        return [self.filename_from_key(i) for i in self]