Skip to content

Commit

Permalink
Merge pull request #155 from changebio/master
Browse files Browse the repository at this point in the history
update region extraction pipeline
  • Loading branch information
gaow authored Apr 29, 2022
2 parents 5bf3cbe + 3daef3a commit ba8abd1
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions GWAS/Region_Extraction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"Make sure you install the pre-requisited before running this notebook:\n",
"\n",
"```\n",
"pip install LDtoolsets\n",
"pip install cugg\n",
"```"
]
},
Expand All @@ -71,7 +71,7 @@
"- `--pheno-path`, the path of a phenotype. Only for one genotype data. If `None`, only `pld` will be calculated.\n",
" - The phenotype file should have a column with the name `IID`, which is used to represent the sample ID.\n",
"- `--sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n",
" - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n",
" - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n",
"- `--unrelated-samples`, the file path of unrelated samples with a column named `IID`. If `None`, all samples will be considered unrelative. \n",
"- `--cwd`, the path of output directory\n",
"\n",
Expand All @@ -81,7 +81,7 @@
" - The first column is chromosome ID, the 2nd file is genotype for that chromosome.\n",
" - When chromosome ID is 0, it implies that the genotype file contains all the genotypes.\n",
"- `--imp-sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n",
" - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n",
" - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n",
"- `--imp-ref`, the reference genome if exome genotype and imputed genotype are different. If `None`, The two genotype data will be considered from the same "
]
},
Expand Down Expand Up @@ -155,12 +155,10 @@
"parameter: geno_path = path\n",
"# Phenotype path\n",
"parameter: pheno_path = path\n",
"# Sample file path, for bgen format\n",
"parameter: bgen_sample_path = path('.')\n",
"# Path to summary stats file\n",
"parameter: sumstats_path = path\n",
"# Path to summary stats format configuration\n",
"parameter: format_config_path = path('.')\n",
"# Sample file path, for bgen format\n",
"parameter: bgen_sample_path = path()\n",
"# Path to samples of unrelated individuals\n",
"parameter: unrelated_samples = path()\n",
"# imputed Genotype file inventory\n",
Expand All @@ -170,7 +168,7 @@
"# Number of tasks to run in each job on cluster\n",
"parameter: job_size = int\n",
"# The reference genome of imputed genotype data\n",
"parameter: imp_ref = str\n",
"parameter: imp_ref = str('')\n",
"parameter: walltime = '12h'\n",
"parameter: mem = '60G'\n",
"fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')\n",
Expand All @@ -189,18 +187,18 @@
"outputs": [],
"source": [
"[default_1 (export utils script)]\n",
"depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('LDtools')\n",
"depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('cugg')\n",
"parameter: scan_window = 500000\n",
"output: f'{cwd:a}/utils.py'\n",
"report: expand = '${ }', output=f'{cwd:a}/utils.py'\n",
" import pandas as pd\n",
" import numpy as np\n",
" import dask.array as da\n",
" from LDtools.liftover import *\n",
" from LDtools.genodata import *\n",
" from LDtools.sumstat import *\n",
" from LDtools.ldmatrix import *\n",
" from LDtools.utils import *\n",
" from cugg.liftover import *\n",
" from cugg.genodata import *\n",
" from cugg.sumstat import *\n",
" from cugg.ldmatrix import *\n",
" from cugg.utils import *\n",
"\n",
"\n",
" def main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD,bgen_sample_path):\n",
Expand Down Expand Up @@ -344,16 +342,24 @@
" imp_sumstats_path = ${_input[5]:r}\n",
" bgen_sample_path = ${_input[6]:r}\n",
" imp_ref = '${imp_ref}'\n",
"\n",
" if not imp_ref:\n",
" imp_ref=None\n",
"\n",
" if not os.path.isfile(bgen_sample_path):\n",
" bgen_sample_path=None\n",
" print('If the genotype data is bgen format, please provide the path of bgen sample')\n",
" \n",
" input_format_config = ${format_config_path:r} if ${format_config_path.is_file()} else None\n",
"\n",
" chrom = \"${_regions[0]}\"\n",
" # Load genotype file for the region of interest\n",
" geno_inventory = dict([x.strip().split() for x in open(input_geno_path).readlines() if x.strip()])\n",
" if yml_path.is_file(): \n",
" if os.path.isfile(imp_geno_path): \n",
" imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n",
" else:\n",
" imp_geno_inventory={'0':None,chrom:None}\n",
" imp_sumstats_path = None\n",
" \n",
" if chrom.startswith('chr'):\n",
" chrom = chrom[3:]\n",
Expand Down

0 comments on commit ba8abd1

Please sign in to comment.