Merge pull request #155 from changebio/master

update region extraction pipeline
cumc · Apr 29, 2022 · ba8abd1 · ba8abd1
2 parents 5bf3cbe + 3daef3a
commit ba8abd1
Showing 1 changed file with 21 additions and 15 deletions.
diff --git a/GWAS/Region_Extraction.ipynb b/GWAS/Region_Extraction.ipynb
@@ -48,7 +48,7 @@
     "Make sure you install the pre-requisited before running this notebook:\n",
     "\n",
     "```\n",
-    "pip install LDtoolsets\n",
+    "pip install cugg\n",
     "```"
    ]
   },
@@ -71,7 +71,7 @@
     "- `--pheno-path`, the path of a phenotype. Only for one genotype data. If `None`, only `pld` will be calculated.\n",
     "    - The phenotype file should have a column with the name `IID`, which is used to represent the sample ID.\n",
     "- `--sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n",
-    "    - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n",
+    "    - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n",
     "- `--unrelated-samples`, the file path of unrelated samples with a column named `IID`. If `None`, all samples will be considered unrelative.  \n",
     "- `--cwd`, the path of output directory\n",
     "\n",
@@ -81,7 +81,7 @@
     "    - The first column is chromosome ID, the 2nd file is genotype for that chromosome.\n",
     "    - When chromosome ID is 0, it implies that the genotype file contains all the genotypes.\n",
     "- `--imp-sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n",
-    "    - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n",
+    "    - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n",
     "- `--imp-ref`, the reference genome if exome genotype and imputed genotype are different. If `None`, The two genotype data will be considered from the same  "
    ]
   },
@@ -155,12 +155,10 @@
     "parameter: geno_path = path\n",
     "# Phenotype path\n",
     "parameter: pheno_path = path\n",
-    "# Sample file path, for bgen format\n",
-    "parameter: bgen_sample_path = path('.')\n",
     "# Path to summary stats file\n",
     "parameter: sumstats_path = path\n",
-    "# Path to summary stats format configuration\n",
-    "parameter: format_config_path = path('.')\n",
+    "# Sample file path, for bgen format\n",
+    "parameter: bgen_sample_path = path()\n",
     "# Path to samples of unrelated individuals\n",
     "parameter: unrelated_samples = path()\n",
     "# imputed Genotype file inventory\n",
@@ -170,7 +168,7 @@
     "# Number of tasks to run in each job on cluster\n",
     "parameter: job_size = int\n",
     "# The reference genome of imputed genotype data\n",
-    "parameter: imp_ref = str\n",
+    "parameter: imp_ref = str('')\n",
     "parameter: walltime = '12h'\n",
     "parameter: mem = '60G'\n",
     "fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')\n",
@@ -189,18 +187,18 @@
    "outputs": [],
    "source": [
     "[default_1 (export utils script)]\n",
-    "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('LDtools')\n",
+    "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('cugg')\n",
     "parameter: scan_window = 500000\n",
     "output: f'{cwd:a}/utils.py'\n",
     "report: expand = '${ }', output=f'{cwd:a}/utils.py'\n",
     "    import pandas as pd\n",
     "    import numpy as np\n",
     "    import dask.array as da\n",
-    "    from LDtools.liftover import *\n",
-    "    from LDtools.genodata import *\n",
-    "    from LDtools.sumstat import *\n",
-    "    from LDtools.ldmatrix import *\n",
-    "    from LDtools.utils import *\n",
+    "    from cugg.liftover import *\n",
+    "    from cugg.genodata import *\n",
+    "    from cugg.sumstat import *\n",
+    "    from cugg.ldmatrix import *\n",
+    "    from cugg.utils import *\n",
     "\n",
     "\n",
     "    def main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD,bgen_sample_path):\n",
@@ -344,16 +342,24 @@
     "    imp_sumstats_path = ${_input[5]:r}\n",
     "    bgen_sample_path = ${_input[6]:r}\n",
     "    imp_ref =  '${imp_ref}'\n",
+    "\n",
+    "    if not imp_ref:\n",
+    "        imp_ref=None\n",
+    "\n",
+    "    if not os.path.isfile(bgen_sample_path):\n",
+    "        bgen_sample_path=None\n",
+    "        print('If the genotype data is bgen format, please provide the path of bgen sample')\n",
     "    \n",
     "    input_format_config = ${format_config_path:r} if ${format_config_path.is_file()} else None\n",
     "\n",
     "    chrom = \"${_regions[0]}\"\n",
     "    # Load genotype file for the region of interest\n",
     "    geno_inventory = dict([x.strip().split() for x in open(input_geno_path).readlines() if x.strip()])\n",
-    "    if yml_path.is_file(): \n",
+    "    if os.path.isfile(imp_geno_path): \n",
     "        imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n",
     "    else:\n",
     "        imp_geno_inventory={'0':None,chrom:None}\n",
+    "        imp_sumstats_path = None\n",
     "    \n",
     "    if chrom.startswith('chr'):\n",
     "        chrom = chrom[3:]\n",