Commit
preprocessing data
gkovacs committed Mar 14, 2018
1 parent 4e34f04 commit 059d96f
Showing 18 changed files with 741 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
wsgidav.conf
__pycache__
tmp
._.DS_Store
.DS_Store
.dropbox
.ipynb_checkpoints
cboe
Gemini_BTCUSD_1h.csv
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified RNN.ipynb
100644 → 100755
Empty file.
Empty file modified RNN_checkpoint.ipynb
100644 → 100755
Empty file.
472 changes: 472 additions & 0 deletions RNN_improved.ipynb

Large diffs are not rendered by default.

Empty file modified RNN_mixedsplit.ipynb
100644 → 100755
Empty file.
Empty file modified baseline_hourlyprices.py
100644 → 100755
Empty file.
Empty file modified baseline_orderbook.py
100644 → 100755
Empty file.
57 changes: 57 additions & 0 deletions check_for_duplicate_ids.py
@@ -0,0 +1,57 @@
#!/usr/bin/env python3

import glob
import gzip
import io
import lz4.frame
import csv
import numpy as np

#import diskcache as dc
#cache = dc.Cache('tmp')

trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']
for trading_pair in trading_pairs:
    # Output: one lz4-compressed CSV per trading pair with duplicate events removed.
    fz = io.TextIOWrapper(lz4.frame.open('cboe/' + trading_pair + '_duplicates_removed.csv.lz4', mode='wb'), encoding='utf-8')
    #fz = open('cboe/' + trading_pair + '_duplicates_removed.csv', 'wt')
    allfiles = glob.glob('cboe/lz4/' + trading_pair + '*.csv.lz4')
    if len(allfiles) == 0:
        continue

    # Read the CSV header (field names) from the first file.
    fieldnames = None
    for filename in allfiles:
        with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
            reader = csv.reader(file)
            for x in reader:
                print(x)
                fieldnames = x
                break
        break

    # First pass: find the largest Event ID so a boolean seen-array can be sized.
    maxid = 0
    for filename in allfiles:
        print(filename)
        with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for x in reader:
                id = int(x['Event ID'])
                maxid = max(id, maxid)
    seen_ids = np.full(maxid + 1, False, dtype=bool)

    writer = csv.DictWriter(fz, fieldnames)
    writer.writeheader()

    # Second pass: write each row only the first time its Event ID is seen.
    #ids = set()
    allfiles = glob.glob('cboe/lz4/' + trading_pair + '*.csv.lz4')
    for filename in allfiles:
        print(filename)
        with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for x in reader:
                id = int(x['Event ID'])
                if seen_ids[id]:
                    continue
                seen_ids[id] = True
                #if id in cache:
                #    continue
                #cache[id] = True
                writer.writerow(x)
    fz.close()  # close so the lz4 frame trailer is written and buffers are flushed
12 changes: 12 additions & 0 deletions convert_to_lz4.py
@@ -0,0 +1,12 @@
#!/usr/bin/env python3

from glob import glob
from plumbum.cmd import gunzip, lz4, rm

# Recompress the gzip-compressed order book CSVs as lz4 (level 9), deleting the intermediate CSVs.
for x in glob('cboe/lz4/*.csv.gz'):
    print(x)
    gunzip[x]()
    csvfile = x.replace('.csv.gz', '.csv')
    outfile = x.replace('.csv.gz', '.csv.lz4')
    (lz4['-9', csvfile] > outfile)()
    rm(csvfile)
Empty file modified iterate_over_data.py
100644 → 100755
Empty file.
55 changes: 55 additions & 0 deletions print_duplicate_ids_entries.py
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

import paratext
import pandas
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
import copy

'''
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4'
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz'
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath)))
#df = pandas.read_csv(filepath)
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read())
print((df))
'''

from glob import glob
from plumbum.cmd import rm
import sys

trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']

# Debugging script: scan the parquet files for each trading pair and, on the first
# duplicated Event ID, print both occurrences (row and source file) and exit.
for trading_pair in trading_pairs:
    allfiles = sorted(glob(f'cboe/parquet/{trading_pair}*.parquet'))

    id_to_row = {}
    id_to_filesrc = {}

    for x in allfiles:
        outfile = x.replace('cboe/parquet/', 'cboe/parquet_nodups/')
        print(outfile)
        table = pq.read_table(x).to_pandas()
        def is_duplicate(row):
            id = row['Event ID']
            #if id == 343:
            #    print(row)
            retval = id in id_to_row
            if retval:
                print(x)
                print(row)
                print(id_to_filesrc[id])
                print(id_to_row[id])
                sys.exit()
            else:
                id_to_row[id] = copy.copy(row)
                id_to_filesrc[id] = copy.copy(x)
            return retval
        table['isduplicate'] = table.apply(is_duplicate, axis=1)
        table = table.query('isduplicate == False')
        del table['isduplicate']
26 changes: 26 additions & 0 deletions read_parquet.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3

import paratext
import pandas
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa

'''
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4'
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz'
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath)))
#df = pandas.read_csv(filepath)
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read())
print((df))
'''

from glob import glob
from plumbum.cmd import rm

# Smoke test: confirm each parquet file is readable by loading just the Event ID column.
for x in glob('cboe/parquet/*.parquet'):
    print(x)
    table = pq.read_table(x, columns=["Event ID"])
53 changes: 53 additions & 0 deletions remove_duplicate_ids_parquet.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

import paratext
import pandas
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np

'''
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4'
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz'
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath)))
#df = pandas.read_csv(filepath)
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read())
print((df))
'''

from glob import glob
from plumbum.cmd import rm

trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']

# For each trading pair: size a boolean array to the max Event ID, then stream through
# the parquet files in order, dropping rows whose Event ID was already seen, and write
# the de-duplicated tables to cboe/parquet_nodups/.
for trading_pair in trading_pairs:
    allfiles = sorted(glob(f'cboe/parquet/{trading_pair}*.parquet'))

    print(f'trading pair {trading_pair} finding max id')
    maxid = 0

    for x in allfiles:
        table = pq.read_table(x, columns=['Event ID']).to_pandas()
        curmax = table['Event ID'].max()
        maxid = max(maxid, curmax)

    print(f'max id for {trading_pair} is {maxid}')
    seen_ids = np.full(maxid + 1, False, dtype=bool)

    for x in allfiles:
        outfile = x.replace('cboe/parquet/', 'cboe/parquet_nodups/')
        print(outfile)
        table = pq.read_table(x).to_pandas()
        def is_duplicate(row):
            id = row['Event ID']
            retval = seen_ids[id]
            if not retval:
                seen_ids[id] = True
            return retval
        table['isduplicate'] = table.apply(is_duplicate, axis=1)
        table = table.query('isduplicate == False')
        del table['isduplicate']
        pq.write_table(pa.Table.from_pandas(table), outfile, compression='snappy')
Empty file modified simple_baselines.ipynb
100644 → 100755
Empty file.
Empty file modified simple_baselines_mixedsplit.ipynb
100644 → 100755
Empty file.
29 changes: 29 additions & 0 deletions to_parquet.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

import paratext
import pandas
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa

'''
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4'
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz'
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath)))
#df = pandas.read_csv(filepath)
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read())
print((df))
'''

from glob import glob
from plumbum.cmd import rm

# Convert each lz4-compressed CSV to a snappy-compressed parquet file, then delete the CSV.
for x in glob('cboe/parquet/*.csv.lz4'):
    print(x)
    df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(x)), low_memory=False)
    table = pa.Table.from_pandas(df)
    outfile = x.replace('.csv.lz4', '.parquet')
    pq.write_table(table, outfile, compression='snappy')
    rm(x)
28 changes: 28 additions & 0 deletions view_as_csv.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python3

import paratext
import pandas
import lz4.frame
import gzip
import io
import pyarrow.parquet as pq
import pyarrow as pa

'''
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4'
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz'
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath)))
#df = pandas.read_csv(filepath)
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read())
print((df))
'''

from glob import glob
from plumbum.cmd import rm

import sys
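# Usage: ./view_as_csv.py <file.parquet>
# Reads the given parquet file with pyarrow and prints its contents to stdout as CSV.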


filename = sys.argv[1]
df = pq.read_table(filename).to_pandas()
print(df.to_csv())
