-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
741 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
wsgidav.conf | ||
__pycache__ | ||
tmp | ||
._.DS_Store | ||
.DS_Store | ||
.dropbox | ||
.ipynb_checkpoints | ||
cboe | ||
Gemini_BTCUSD_1h.csv |
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import glob | ||
import gzip | ||
import io | ||
import lz4.frame | ||
import csv | ||
import numpy as np | ||
|
||
#import diskcache as dc | ||
#cache = dc.Cache('tmp') | ||
|
||
# Merge all LZ4-compressed CSV files of each trading pair into one
# LZ4-compressed CSV, keeping only the first row seen for every 'Event ID'.
trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']
for trading_pair in trading_pairs:
    allfiles = glob.glob('cboe/lz4/' + trading_pair + '*.csv.lz4')
    if not allfiles:
        # Nothing to merge: skip before an output file gets created/truncated.
        continue

    # The output columns come from the header row of the first input file
    # that actually has one.
    fieldnames = None
    for filename in allfiles:
        with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
            fieldnames = next(csv.reader(file), None)
        if fieldnames is not None:
            print(fieldnames)
            break
    if fieldnames is None:
        # Every input file is empty, so there is no header and no row to write.
        continue

    # Pass 1: find the largest event id so the "seen" bitmap can be sized
    # up front (one bool per possible id instead of a Python set).
    max_event_id = 0
    for filename in allfiles:
        print(filename)
        with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
            for row in csv.DictReader(file):
                max_event_id = max(max_event_id, int(row['Event ID']))
    seen_ids = np.full(max_event_id + 1, False, dtype=bool)

    # Pass 2: copy rows to the output, dropping ids that were already written.
    # The same file list as pass 1 is reused so no id can exceed the bitmap.
    # The `with` block guarantees the LZ4 frame is flushed and finalised.
    out_path = 'cboe/' + trading_pair + '_duplicates_removed.csv.lz4'
    with io.TextIOWrapper(lz4.frame.open(out_path, mode='wb'),
                          encoding='utf-8', newline='') as fz:
        # newline='' lets the csv module control line endings itself.
        writer = csv.DictWriter(fz, fieldnames)
        writer.writeheader()
        for filename in allfiles:
            print(filename)
            with io.TextIOWrapper(lz4.frame.open(filename, 'rb'), encoding='utf-8') as file:
                for row in csv.DictReader(file):
                    event_id = int(row['Event ID'])
                    if seen_ids[event_id]:
                        continue
                    seen_ids[event_id] = True
                    writer.writerow(row)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from glob import glob | ||
from plumbum.cmd import gunzip, lz4, rm | ||
|
||
# Re-compress every gzip'd CSV found under cboe/lz4/ with lz4 at level 9.
for gz_path in glob('cboe/lz4/*.csv.gz'):
    print(gz_path)
    csvfile = gz_path.replace('.csv.gz', '.csv')
    outfile = gz_path.replace('.csv.gz', '.csv.lz4')
    # gunzip is expected to leave the plain .csv next to the archive.
    gunzip[gz_path]()
    # Run `lz4 -9 <csv>` with its stdout redirected into the .csv.lz4 file.
    compress = lz4['-9', csvfile] > outfile
    compress()
    # The intermediate uncompressed CSV is no longer needed.
    rm(csvfile)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import paratext | ||
import pandas | ||
import lz4.frame | ||
import gzip | ||
import io | ||
import pyarrow.parquet as pq | ||
import pyarrow as pa | ||
import numpy as np | ||
import copy | ||
|
||
''' | ||
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4' | ||
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz' | ||
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath))) | ||
#df = pandas.read_csv(filepath) | ||
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read()) | ||
print((df)) | ||
''' | ||
|
||
from glob import glob | ||
from plumbum.cmd import rm | ||
import sys | ||
|
||
# Diagnostic pass: walk the Parquet files of each trading pair in sorted
# order and stop the program at the first 'Event ID' that occurs twice,
# printing both occurrences together with the files they came from.
trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']

for trading_pair in trading_pairs:
    allfiles = sorted(glob(f'cboe/parquet/{trading_pair}*.parquet'))

    # First row seen for each event id, and the file that row was read from.
    id_to_row = {}
    id_to_filesrc = {}

    for parquet_path in allfiles:
        outfile = parquet_path.replace('cboe/parquet/', 'cboe/parquet_nodups/')
        print(outfile)
        table = pq.read_table(parquet_path).to_pandas()

        def is_duplicate(row):
            """Record a first-time event id; report and exit on a repeated one."""
            event_id = row['Event ID']
            if event_id not in id_to_row:
                id_to_row[event_id] = copy.copy(row)
                id_to_filesrc[event_id] = copy.copy(parquet_path)
                return False
            # Duplicate found: show the current occurrence, then the first one.
            print(parquet_path)
            print(row)
            print(id_to_filesrc[event_id])
            print(id_to_row[event_id])
            sys.exit()

        table['isduplicate'] = table.apply(is_duplicate, axis=1)
        table = table.query('isduplicate == False')
        del table['isduplicate']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import paratext | ||
import pandas | ||
import lz4.frame | ||
import gzip | ||
import io | ||
import pyarrow.parquet as pq | ||
import pyarrow as pa | ||
|
||
''' | ||
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4' | ||
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz' | ||
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath))) | ||
#df = pandas.read_csv(filepath) | ||
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read()) | ||
print((df)) | ||
''' | ||
|
||
from glob import glob | ||
from plumbum.cmd import rm | ||
|
||
# Load only the 'Event ID' column of every Parquet file under cboe/parquet/.
for parquet_path in glob('cboe/parquet/*.parquet'):
    print(parquet_path)
    table = pq.read_table(parquet_path, columns=["Event ID"])
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import paratext | ||
import pandas | ||
import lz4.frame | ||
import gzip | ||
import io | ||
import pyarrow.parquet as pq | ||
import pyarrow as pa | ||
import numpy as np | ||
|
||
''' | ||
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4' | ||
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz' | ||
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath))) | ||
#df = pandas.read_csv(filepath) | ||
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read()) | ||
print((df)) | ||
''' | ||
|
||
from glob import glob | ||
from plumbum.cmd import rm | ||
|
||
# For each trading pair, rewrite its Parquet files into cboe/parquet_nodups/
# keeping only the first row seen (across files, in sorted file order) for
# every 'Event ID'.
trading_pairs = ['BTCUSD', 'ETHUSD', 'ETHBTC']

for trading_pair in trading_pairs:
    allfiles = sorted(glob(f'cboe/parquet/{trading_pair}*.parquet'))

    print(f'trading pair {trading_pair} finding max id')
    maxid = 0

    # Pass 1: the largest event id determines the size of the "seen" bitmap.
    for x in allfiles:
        event_ids = pq.read_table(x, columns=['Event ID']).to_pandas()['Event ID']
        if len(event_ids):
            # An empty file would give max() == NaN, so it is skipped.
            maxid = max(maxid, int(event_ids.max()))

    print(f'max id for {trading_pair} is {maxid}')
    seen_ids = np.full(maxid + 1, False, dtype=bool)

    # Pass 2: drop rows whose id was written by an earlier file or appears
    # earlier in the same file, then write the remainder.
    for x in allfiles:
        outfile = x.replace('cboe/parquet/', 'cboe/parquet_nodups/')
        print(outfile)
        table = pq.read_table(x).to_pandas()
        event_ids = table['Event ID']
        ids = event_ids.values
        # Vectorised form of the row-by-row check: a row is a duplicate when
        # its id is already marked in seen_ids, or when an earlier row of this
        # same table carries the same id (duplicated() keeps the first one).
        is_dup = seen_ids[ids] | event_ids.duplicated().values
        seen_ids[ids] = True
        table = table[~is_dup]
        pq.write_table(pa.Table.from_pandas(table), outfile, compression='snappy')
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import paratext | ||
import pandas | ||
import lz4.frame | ||
import gzip | ||
import io | ||
import pyarrow.parquet as pq | ||
import pyarrow as pa | ||
|
||
''' | ||
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4' | ||
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz' | ||
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath))) | ||
#df = pandas.read_csv(filepath) | ||
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read()) | ||
print((df)) | ||
''' | ||
|
||
from glob import glob | ||
from plumbum.cmd import rm | ||
|
||
# Convert every LZ4-compressed CSV under cboe/parquet/ into a snappy-compressed
# Parquet file with the same base name, then delete the source file.
for x in glob('cboe/parquet/*.csv.lz4'):
    print(x)
    # Close the LZ4 stream deterministically once the CSV is parsed, instead
    # of leaving one open handle per file until garbage collection.
    with io.TextIOWrapper(lz4.frame.open(x)) as csv_text:
        df = pandas.read_csv(csv_text, low_memory=False)
    table = pa.Table.from_pandas(df)
    outfile = x.replace('.csv.lz4', '.parquet')
    pq.write_table(table, outfile, compression='snappy')
    # The source is removed only after the Parquet file has been written.
    rm(x)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import paratext | ||
import pandas | ||
import lz4.frame | ||
import gzip | ||
import io | ||
import pyarrow.parquet as pq | ||
import pyarrow as pa | ||
|
||
''' | ||
filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.lz4' | ||
#filepath = 'cboe/lz4_test/BTCUSD_order_book_20170627.csv.gz' | ||
df = pandas.read_csv(io.TextIOWrapper(lz4.frame.open(filepath))) | ||
#df = pandas.read_csv(filepath) | ||
#df = paratext.load_csv_to_pandas(gzip.open(filepath).read()) | ||
print((df)) | ||
''' | ||
|
||
from glob import glob | ||
from plumbum.cmd import rm | ||
|
||
import sys | ||
|
||
|
||
# Print the Parquet file named by the first command-line argument as CSV.
filename = sys.argv[1]
parquet_table = pq.read_table(filename)
df = parquet_table.to_pandas()
csv_text = df.to_csv()
print(csv_text)