vpc-ccg · baraaorabi · Apr 7, 2021 · Feb 12, 2021 · Feb 12, 2021 · Feb 17, 2021
diff --git a/.gitignore b/.gitignore
@@ -53,3 +53,4 @@ freddie_dbg
 *.mat
 *.data
 .vscode/
+gurobi.lic
diff --git a/.gitmodules b/.gitmodules
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018 Hach Lab for Computational Cancer Genomics
+Copyright (c) 2021 Hach Lab for Computational Cancer Genomics
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/cluster.json b/cluster.json
diff --git a/config.yaml b/config.yaml
@@ -1,24 +1,27 @@
 outpath:
-    test/dev-out/
+    test/benchmark/
 
 gurobi:
     license: gurobi.lic
     timeout: 15
 
-samples:
-    N_sim:
-        reads:
-            - /groups/hachgrp/projects/dev-ltr-simulator/analysis/ltr-sim-dev/reads/P000S000.L-non.fastq
-        seq_type: ont1d
-        data_type: sim
-        gtf: whole_genome
-
 exec:
     align    : py/freddie_align.py
     split    : py/freddie_split.py
     segment  : py/freddie_segment.py
     cluster  : py/freddie_cluster.py
     isoforms : py/freddie_isoforms.py
 
+samples:
+        seq_type : ont1d
+        ref      : homo_sapiens
+        reads    :
+            - extern/LTR-sim/output/reads/22Rv1.L-non.fastq
+        reads_info    :
+            - extern/LTR-sim/output/reads/22Rv1.L-non.tsv
 references:
-    dna_desalt : /groups/hachgrp/annotations/DNA/97/deSALT.index
+    homo_sapiens:
+        annot      : extern/LTR-sim/refs/homo_sapiens/homo_sapiens.annot.gtf
+        genome     : extern/LTR-sim/refs/homo_sapiens/homo_sapiens.dna.fa
+        genome_fai : extern/LTR-sim/refs/homo_sapiens/homo_sapiens.dna.fa.fai
+        desalt_idx : test/mapping/homo_sapiens.dna.desalt_idx
diff --git a/environment.yml → envs/freddie.yml b/environment.yml → envs/freddie.yml
@@ -1,19 +1,17 @@
-name: freddie
+name: freddie_bench_freddie
 channels:
   - conda-forge
   - bioconda
   - defaults
   - anaconda
   - gurobi
 dependencies:
-  - python>=3.6
-  - snakemake>=5
-  - desalt=1.5.4
-  - pysam>=0.15
+  - desalt==1.5.4
+  - gurobi>=9.0
+  - minimap2>=2.16
+  - networkx>=2
   - numpy>=1.16
+  - pysam>=0.15
+  - python>=3.6
   - scikit-learn>=0.20
-  - scipy>=1.2.1
-  - networkx>=2
-  - gurobi>=9.0
-  - matplotlib>=3
-  - pypdf2>=1.26
+  - scipy>=1.2.1
diff --git a/py/freddie_cluster.py b/py/freddie_cluster.py
@@ -244,8 +244,8 @@ def partition_reads(tint):
 
 
 def preprocess_ilp(tint, ilp_settings):
-    print('Preproessing ILP with {} read reps and the following settings:\n{}'.format(
-        len(tint['read_reps']), ilp_settings))
+    # print('Preprocessing ILP with {} read reps and the following settings:\n{}'.format(
+    #     len(tint['read_reps']), ilp_settings))
     read_reps = tint['read_reps']
     N = len(read_reps)
     M = len(tint['segs'])
@@ -588,7 +588,7 @@ def run_ilp(tint, remaining_rids, incomp_rids, ilp_settings, log_prefix):
 
     isoforms = {k: dict()
                 for k in range(ISOFORM_INDEX_START, ilp_settings['K'])}
-    print('STATUS: {}'.format(ILP_ISOFORMS_STATUS))
+    # print('STATUS: {}'.format(ILP_ISOFORMS_STATUS))
     # if ILP_ISOFORMS_STATUS == GRB.Status.TIME_LIMIT:
     #     status = 'TIME_LIMIT'
     if ILP_ISOFORMS_STATUS != GRB.Status.OPTIMAL:
@@ -709,27 +709,27 @@ def cluster_tint(cluster_args):
     assert len(tints) == 1
     tint = list(tints.values())[0]
 
-    print('# Clustering tint {}'.format(tint['id']))
+    # print('# Clustering tint {}'.format(tint['id']))
     if logs_dir != None:
         os.makedirs('{}/{}'.format(logs_dir, tint['id']), exist_ok=True)
         timeout_log = open(
             '{}/{}/timeout.log'.format(logs_dir, tint['id']), 'w+')
     preprocess_ilp(tint, ilp_settings)
     partition_reads(tint)
-    print('# Paritions ({}) sizes: {}\n'.format(
-        len(tint['partitions']), [len(p) for p in tint['partitions']]))
+    # print('# Partitions ({}) sizes: {}\n'.format(
+    #     len(tint['partitions']), [len(p) for p in tint['partitions']]))
     tint['isoforms'] = list()
     tint['garbage_rids'] = list()
     for partition, (remaining_rids, incomp_rids) in enumerate(tint['partitions']):
         for rid in remaining_rids:
             for ridx in tint['read_reps'][rid]:
                 tint['reads'][ridx]['partition'] = partition
-        print(
-            '==========\ntint {}: Running {}-th partition...'.format(tint['id'], partition))
+        # print(
+        #     '==========\ntint {}: Running {}-th partition...'.format(tint['id'], partition))
         for round_num in range(ilp_settings['max_rounds']):
             actual_remaining_rids_len = sum(len(tint['read_reps'][i]) for i in remaining_rids)
-            print('==========\ntint {}: Running {}-th round with {} read reps and {} actual reads...'.format(
-                tint['id'], round_num, len(remaining_rids), actual_remaining_rids_len))
+            # print('==========\ntint {}: Running {}-th round with {} read reps and {} actual reads...'.format(
+            #     tint['id'], round_num, len(remaining_rids), actual_remaining_rids_len))
             if actual_remaining_rids_len < min_isoform_size:
                 break
             ILP_ISOFORMS_STATUS, status, round_isoforms = run_ilp(
@@ -748,12 +748,12 @@ def cluster_tint(cluster_args):
             number_of_clustered_reads = 0
             for i in round_isoforms.values():
                 number_of_clustered_reads += sum([len(tint['read_reps'][rid]) for i in round_isoforms.values() for rid in i['rid_to_corrections'].keys()])
-            print('Number of clustered reads:', number_of_clustered_reads)
+            # print('Number of clustered reads:', number_of_clustered_reads)
             if number_of_clustered_reads < min_isoform_size:
                 break
             for k, isoform in round_isoforms.items():
-                print('Isoform {} size: {}'.format(
-                    k, len(isoform['rid_to_corrections'])))
+                # print('Isoform {} size: {}'.format(
+                #     k, len(isoform['rid_to_corrections'])))
                 if sum(len(tint['read_reps'][rid]) for rid in isoform['rid_to_corrections'].keys()) < min_isoform_size:
                     continue
                 tint['isoforms'].append(isoform)
@@ -763,9 +763,9 @@ def cluster_tint(cluster_args):
                     for ridx in tint['read_reps'][rid]:
                         tint['reads'][ridx]['corrections'] = corrections
                         tint['reads'][ridx]['isoform'] = len(tint['isoforms'])-1
-            print('------->')
-            print('Remaining reads: {}\n'.format(len(remaining_rids)))
-            print('<-------')
+            # print('------->')
+            # print('Remaining reads: {}\n'.format(len(remaining_rids)))
+            # print('<-------')
         tint['garbage_rids'].extend(sorted(remaining_rids))
     if logs_dir != None:
         timeout_log.close()