From 88f044076a55f7f45c98a1e06aada03308a64299 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 11 Aug 2022 15:39:11 +0100 Subject: [PATCH 01/10] evidence string automation WIP --- bin/evidence_string_generation.py | 2 +- docs/generate-evidence-strings.md | 60 ++----- .../consequence_type.py | 2 + .../evidence_string_generation/pipeline.nf | 159 ++++++++++++++++++ 4 files changed, 173 insertions(+), 50 deletions(-) create mode 100644 eva_cttv_pipeline/evidence_string_generation/pipeline.nf diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py index c3c2fd45..ee69cc62 100755 --- a/bin/evidence_string_generation.py +++ b/bin/evidence_string_generation.py @@ -10,7 +10,7 @@ parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True) parser.add_argument('--out', help='Output directory', required=True) parser.add_argument('--include-structural', help='Use structural variants consequence prediction pipeline', - action='store_true', default=False, required=False) + action='store_true', default=True, required=False) if __name__ == '__main__': diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 49bcd3b0..d13d44e4 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -22,67 +22,29 @@ export OT_SCHEMA_VERSION=2.2.6 ``` ## 1. Process data -The protocol is automated. See specific section comments for details. + +First create the directory structure for holding all files for the current batch. ```bash -# Create directory structure for holding all files for the current batch. export BATCH_ROOT=${BATCH_ROOT_BASE}/batch-${OT_RELEASE} mkdir -p ${BATCH_ROOT} cd ${BATCH_ROOT} mkdir -p clinvar gene_mapping evidence_strings logs +``` -# Download ClinVar data. We always use the most recent XML dump, which contains all data for the release. -wget \ - -O ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz - -# Download the Open Targets JSON schema. -wget \ - -O ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \ - https://raw.githubusercontent.com/opentargets/json_schema/${OT_SCHEMA_VERSION}/opentargets.json - -# Run ClinVar variants through VEP and map them to genes and functional consequences. -${BSUB_CMDLINE} -K -M 10G \ - -o ${BATCH_ROOT}/logs/consequence_vep.out \ - -e ${BATCH_ROOT}/logs/consequence_vep.err \ - bash ${CODE_ROOT}/consequence_prediction/run_consequence_mapping.sh \ - ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv - -# Generate the evidence strings for submission to Open Targets. -${BSUB_CMDLINE} -K -M 10G \ - -o ${BATCH_ROOT}/logs/evidence_string_generation.out \ - -e ${BATCH_ROOT}/logs/evidence_string_generation.err \ - python3 ${CODE_ROOT}/bin/evidence_string_generation.py \ - --clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - --efo-mapping ${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ - --gene-mapping ${BATCH_ROOT}/gene_mapping/consequences_vep.tsv \ - --ot-schema ${BATCH_ROOT}/evidence_strings/opentargets-${OT_SCHEMA_VERSION}.json \ - --out ${BATCH_ROOT}/evidence_strings/ \ - --include-structural - -# Check that the generated evidence strings do not contain any duplicated evidence strings. 
-# For every evidence string, we group the value of fields datatypeId, studyId, -# targetFromSourceId, variantId, variantFunctionalConsequenceId and diseaseFromSourceMappedId, -# all separated by tabs, sorted and saved at duplicates.tsv if found duplicated. -jq --arg sep $'\t' -jr \ - '.datatypeId,$sep,.studyId,$sep,.targetFromSourceId,$sep,.variantId,$sep,.variantFunctionalConsequenceId,$sep,.diseaseFromSourceMappedId,$sep,.diseaseFromSource,"\n"' \ - ${BATCH_ROOT}/evidence_strings/evidence_strings.json \ - | sort | uniq -d > ${BATCH_ROOT}/evidence_strings/duplicates.tsv - -# Convert MedGen and OMIM cross-references into ZOOMA format. -${BSUB_CMDLINE} -K \ - -o ${BATCH_ROOT}/logs/traits_to_zooma_format.out \ - -e ${BATCH_ROOT}/logs/traits_to_zooma_format.err \ - python3 ${CODE_ROOT}/bin/traits_to_zooma_format.py \ - --clinvar-xml ${BATCH_ROOT}/clinvar/ClinVarFullRelease_00-latest.xml.gz \ - --zooma-feedback ${BATCH_ROOT}/clinvar/clinvar_xrefs.txt +Then from `BATCH_ROOT` run the automated pipeline: +```bash +nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \ + --batch_root ${BATCH_ROOT} \ + --schema ${OT_SCHEMA_VERSION} \ + -c ~/opentargets-nextflow.config + -resume ``` ## 2. Manual follow-up actions ### Check that generated evidence strings do not contain any duplicates -The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the file `${BATCH_ROOT}/evidence_strings/duplicates.tsv` should be empty. Check that this is the case. +The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the automated pipeline should fail with an error if duplicates are detected. A repeated evidence string will have identical values for these five fields: * **datatypeId** - Identifier of the type of data we are associating, varying between somatic and non-somatic ClinVar records (*e.g.* ``somatic_mutation`` or ``genetic_association`` respectively). 
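If a manual spot check is ever wanted in addition to the automated failure, the same key fields can be extracted with jq and filtered for repeats. A minimal sketch, assuming the evidence strings have already been generated under `${BATCH_ROOT}` (the path is illustrative; the pipeline's `checkDuplicates` process runs an equivalent command automatically):

```bash
# Emit the duplicate-defining fields of every evidence string, tab-separated,
# and keep only repeated combinations; empty output means no duplicates.
jq --arg sep $'\t' -jr \
  '.datatypeId,$sep,.studyId,$sep,.targetFromSourceId,$sep,.variantId,$sep,.variantFunctionalConsequenceId,$sep,.diseaseFromSourceMappedId,$sep,.diseaseFromSource,"\n"' \
  ${BATCH_ROOT}/evidence_strings/evidence_strings.json \
  | sort | uniq -d
```
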
diff --git a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py index d2b10379..08bb812b 100644 --- a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py +++ b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py @@ -20,6 +20,8 @@ def process_consequence_type_dataframes(*dataframes): """ consequence_type_dict = defaultdict(list) for consequences_dataframe in dataframes: + if consequences_dataframe is None: + continue for row in consequences_dataframe.itertuples(): variant_id = row[1] ensembl_gene_id = row[2] diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf new file mode 100644 index 00000000..0f0e16bc --- /dev/null +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -0,0 +1,159 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + + +def helpMessage() { + log.info""" + + Params: + --batch_root Directory for current batch + --schema Open Targets JSON schema version + --clinvar ClinVar XML file (optional, will download latest if omitted) + """ +} + +params.help = null +params.batch_root = null +params.schema = null +params.clinvar = null + +if (params.help) { + exit 0, helpMessage() +} +if (!params.batch_root || !params.schema) { + exit 1, helpMessage() +} +batchRoot = params.batch_root + +workflow { + if (params.clinvar != null) { + clinvarXml = Channel.fromPath(params.clinvar) + } else { + clinvarXml = downloadClinvar() + } + downloadJsonSchema() + + runSnp(clinvarXml) + generateEvidence(clinvarXml, + downloadJsonSchema.out.jsonSchema, + runSnp.out.consequenceMappingsSnp) + checkDuplicates(generateEvidence.out.evidenceStrings) + + convertXrefs(clinvarXml) +} + +process downloadClinvar { + output: + path "clinvar.xml.gz", emit: clinvarXml + + script: + """ + wget -O clinvar.xml.gz \ + https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz + """ +} + +process downloadJsonSchema { + output: + path "opentargets-${params.schema}.json", emit: jsonSchema + + script: + """ + wget -O opentargets-${params.schema}.json \ + https://raw.githubusercontent.com/opentargets/json_schema/${params.schema}/opentargets.json + """ +} + +process runSnp { + input: + path clinvarXml + + output: + path "consequences_snp.tsv", emit: consequenceMappingsSnp + + script: + """ + \${PYTHON_BIN} "\${CODE_ROOT}/consequence_prediction/extract_variants_for_vep.py" --clinvar-xml ${clinvarXml} \ + | sort -u \ + | parallel \ + --halt now,fail=1 `# If any job fails, kill the remaining ones immediately and report failure` \ + --pipe `# Input is read from STDIN and split by chunks` \ + -j 20 `# Number of concurrent workers` \ + -N 200 `# Number of records (lines) per worker` \ + --tmpdir . 
`# Store temporary files in the current directory to avoid /tmp overflow` \ + \${PYTHON_BIN} "\${CODE_ROOT}/consequence_prediction/vep_mapping_pipeline/consequence_mapping.py" \ + | sort -u > consequences_snp.tsv + + """ + // TODO logs for this +} + +process generateEvidence { + publishDir "${batchRoot}/logs", + overwrite: true, + mode: "copy", + pattern: "*.log" + + publishDir "${batchRoot}/evidence_strings", + overwrite: true, + mode: "copy", + pattern: "*.json" + + input: + path clinvarXml + path jsonSchema + path consequenceMappings + + output: + path "evidence_strings.json", emit: evidenceStrings + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/evidence_string_generation.py \ + --clinvar-xml ${clinvarXml} \ + --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ + --gene-mapping ${consequenceMappings} \ + --ot-schema ${jsonSchema} \ + --out . \ + > evidence_string_generation.log + """ +} + +process checkDuplicates { + input: + path evidenceStrings + + script: + """ + (jq --arg sep \$'\t' -jr \ + '.datatypeId,\$sep,.studyId,\$sep,.targetFromSourceId,\$sep,.variantId,\$sep,.variantFunctionalConsequenceId,\$sep,.diseaseFromSourceMappedId,\sep,.diseaseFromSource,"\n"' \ + ${evidenceStrings} | sort | uniq -d > duplicates.tsv) \ + || [[ -z duplicates.tsv ]] + """ +} + +process convertXrefs { + publishDir "${batchRoot}/logs", + overwrite: true, + mode: "copy", + pattern: "*.log" + + publishDir "${batchRoot}/clinvar", + overwrite: true, + mode: "copy", + pattern: "*.txt" + + input: + path clinvarXml + + output: + path "clinvar_xrefs.txt", emit: clinvarXrefs + + """ + \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \ + --clinvar-xml ${clinvarXml} \ + --zooma-feedback clinvar_xrefs.txt \ + > traits_to_zooma_format.log + """ +} \ No newline at end of file From 04b6eac9dbf81796db33f8c168dce10344718bbd Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 22 Aug 2022 14:23:40 +0100 Subject: [PATCH 02/10] fixing logging and adding docs --- docs/build.md | 8 +++++ docs/environment.md | 12 +++++-- docs/generate-evidence-strings.md | 8 ++--- .../evidence_string_generation/pipeline.nf | 34 ++++++++++--------- 4 files changed, 38 insertions(+), 24 deletions(-) diff --git a/docs/build.md b/docs/build.md index 5900aa84..9003b71d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -52,6 +52,14 @@ export PYTHONPATH=${INSTALL_PATH} The installed Python version can then be called with either `python` or `python3`. You can also use either `pip` or `pip3` to install packages into this local distribution. +## Nextflow installation + +The evidence string generation pipeline uses Nextflow, which itself relies on Java. You can install in the current directory as follows: +```bash +wget -qO- https://get.nextflow.io | bash +``` +You can then include this in your `$PATH` variable if necessary, or invoke the executable directly. For more details on installing Nextflow, see the [documentation](https://www.nextflow.io/docs/latest/getstarted.html). + ## Deploying local OLS installation During the preparation of 2019_04 release, which had to be synchronized with EFO v3, OLS had to be deployed locally because the production deployment of OLS on www.ebi.ac.uk/ols only supported EFO v2 at the time. This can be done using the following command (substitute the image version as appropriate): diff --git a/docs/environment.md b/docs/environment.md index e7125a35..a66e5d9c 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -3,7 +3,7 @@ 1. 
Log in to the LSF cluster, where all data processing must take place. 1. Using a `become` command, switch to a common EVA production user instead of your personal account. 1. Adjust and execute the commands below. They will set up the environment, fetch and build the code. Notes: - - The first five variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md). + - The first six variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md). - By modifying the `GIT_REMOTE` and `GIT_BRANCH` variables, you can run an arbitrary version of the pipeline. This can be used for development and debugging. By default it will fetch the master branch from the main pipeline repository. - Running these commands will overwrite any local changes you had in the repository copy on the cluster. @@ -17,6 +17,9 @@ export PYTHON_INSTALL_PATH= # Location of bcftools installation path export BCFTOOLS_INSTALL_PATH= +# Location of Nextflow installation path +export NEXTFLOW_INSTALL_PATH= + # The directory where subdirectories for each batch will be created export BATCH_ROOT_BASE= @@ -26,8 +29,8 @@ export FTP_PATH_BASE= # Base bsub command line for all commands. export BSUB_CMDLINE="bsub" -# Setting up Python paths -export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:$PATH +# Setting up paths +export PATH=${PYTHON_INSTALL_PATH}:${BCFTOOLS_INSTALL_PATH}:${NEXTFLOW_INSTALL_PATH}:$PATH export PYTHONPATH=${PYTHON_INSTALL_PATH} # External service paths @@ -44,4 +47,7 @@ source env/bin/activate python3 -m pip -q install --upgrade pip setuptools python3 -m pip -q install -r requirements.txt python3 setup.py install + +# Location of Python executable, pointing to the virtualenv +export PYTHON_BIN=${CODE_ROOT}/env/bin/python ``` diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index d13d44e4..5a73ae1b 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -23,21 +23,19 @@ export OT_SCHEMA_VERSION=2.2.6 ## 1. Process data -First create the directory structure for holding all files for the current batch. +The protocol is automated. See specific section comments for details. ```bash +# Create directory structure for holding all files for the current batch. export BATCH_ROOT=${BATCH_ROOT_BASE}/batch-${OT_RELEASE} mkdir -p ${BATCH_ROOT} cd ${BATCH_ROOT} mkdir -p clinvar gene_mapping evidence_strings logs -``` -Then from `BATCH_ROOT` run the automated pipeline: -```bash +# Run the nextflow pipeline, resuming execution if necessary. nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \ --batch_root ${BATCH_ROOT} \ --schema ${OT_SCHEMA_VERSION} \ - -c ~/opentargets-nextflow.config -resume ``` diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index 0f0e16bc..cff26cc9 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -5,9 +5,10 @@ nextflow.enable.dsl=2 def helpMessage() { log.info""" + Generate ClinVar evidence strings for Open Targets. 
Params: - --batch_root Directory for current batch + --batch_root Directory for current batch --schema Open Targets JSON schema version --clinvar ClinVar XML file (optional, will download latest if omitted) """ @@ -66,6 +67,14 @@ process downloadJsonSchema { } process runSnp { + clusterOptions "-o ${batchRoot}/logs/consequence_vep.out \ + -e ${batchRoot}/logs/consequence_vep.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + input: path clinvarXml @@ -86,14 +95,11 @@ process runSnp { | sort -u > consequences_snp.tsv """ - // TODO logs for this } process generateEvidence { - publishDir "${batchRoot}/logs", - overwrite: true, - mode: "copy", - pattern: "*.log" + clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \ + -e ${batchRoot}/logs/evidence_string_generation.err" publishDir "${batchRoot}/evidence_strings", overwrite: true, @@ -115,8 +121,7 @@ process generateEvidence { --efo-mapping \${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv \ --gene-mapping ${consequenceMappings} \ --ot-schema ${jsonSchema} \ - --out . \ - > evidence_string_generation.log + --out . """ } @@ -134,10 +139,8 @@ process checkDuplicates { } process convertXrefs { - publishDir "${batchRoot}/logs", - overwrite: true, - mode: "copy", - pattern: "*.log" + clusterOptions "-o ${batchRoot}/logs/traits_to_zooma_format.out \ + -e ${batchRoot}/logs/traits_to_zooma_format.err" publishDir "${batchRoot}/clinvar", overwrite: true, @@ -152,8 +155,7 @@ process convertXrefs { """ \${PYTHON_BIN} \${CODE_ROOT}/bin/traits_to_zooma_format.py \ - --clinvar-xml ${clinvarXml} \ - --zooma-feedback clinvar_xrefs.txt \ - > traits_to_zooma_format.log + --clinvar-xml ${clinvarXml} \ + --zooma-feedback clinvar_xrefs.txt """ -} \ No newline at end of file +} From 422a3bc270eb79e1d830cbd76e66cb9bb20fe083 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 22 Aug 2022 14:45:45 +0100 Subject: [PATCH 03/10] cleaning up and added comments to nextflow --- docs/generate-evidence-strings.md | 3 +-- .../evidence_string_generation/pipeline.nf | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 5a73ae1b..39fe4618 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -22,7 +22,6 @@ export OT_SCHEMA_VERSION=2.2.6 ``` ## 1. Process data - The protocol is automated. See specific section comments for details. ```bash @@ -32,7 +31,7 @@ mkdir -p ${BATCH_ROOT} cd ${BATCH_ROOT} mkdir -p clinvar gene_mapping evidence_strings logs -# Run the nextflow pipeline, resuming execution if necessary. +# Run the nextflow pipeline, resuming execution of previous attempt if possible. nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline.nf \ --batch_root ${BATCH_ROOT} \ --schema ${OT_SCHEMA_VERSION} \ diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index cff26cc9..5c2e0d35 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -27,6 +27,9 @@ if (!params.batch_root || !params.schema) { } batchRoot = params.batch_root +/* + * Main workflow + */ workflow { if (params.clinvar != null) { clinvarXml = Channel.fromPath(params.clinvar) @@ -44,6 +47,9 @@ workflow { convertXrefs(clinvarXml) } +/* + * Download ClinVar data, using the most recent XML dump. 
+ */ process downloadClinvar { output: path "clinvar.xml.gz", emit: clinvarXml @@ -55,6 +61,9 @@ process downloadClinvar { """ } +/* + * Download the Open Targets JSON schema. + */ process downloadJsonSchema { output: path "opentargets-${params.schema}.json", emit: jsonSchema @@ -66,6 +75,10 @@ process downloadJsonSchema { """ } +/* + * Run simple variants (SNPs and other variants with complete coordinates) through VEP and map them + * to genes and functional consequences. + */ process runSnp { clusterOptions "-o ${batchRoot}/logs/consequence_vep.out \ -e ${batchRoot}/logs/consequence_vep.err" @@ -97,6 +110,9 @@ process runSnp { """ } +/* + * Generate the evidence strings for submission to Open Targets. + */ process generateEvidence { clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \ -e ${batchRoot}/logs/evidence_string_generation.err" @@ -125,6 +141,9 @@ process generateEvidence { """ } +/* + * Check that the generated evidence strings do not contain any duplicated evidence strings. + */ process checkDuplicates { input: path evidenceStrings @@ -138,6 +157,9 @@ process checkDuplicates { """ } +/* + * Convert MedGen and OMIM cross-references into ZOOMA format. + */ process convertXrefs { clusterOptions "-o ${batchRoot}/logs/traits_to_zooma_format.out \ -e ${batchRoot}/logs/traits_to_zooma_format.err" From 08fb4eb7b98c1e4b55d80a1c2c379f9d1d95548d Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 22 Aug 2022 17:35:46 +0100 Subject: [PATCH 04/10] refactor consequence mapping to occur in parallel in nextflow --- .../run_repeat_expansion_variants.py | 2 +- .../run_structural_variants.py | 18 ++++ .../structural_variants/pipeline.py | 13 ++- .../clinvar_to_evidence_strings.py | 32 ++----- .../consequence_type.py | 20 ---- .../evidence_string_generation/pipeline.nf | 93 ++++++++++++++++++- .../expected_genetics_evidence_string.json | 1 + ...pected_multiple_names_evidence_string.json | 1 + .../expected_somatic_evidence_string.json | 1 + .../test_clinvar_to_evidence_strings.py | 7 +- .../test_consequence_type.py | 12 --- 11 files changed, 134 insertions(+), 66 deletions(-) create mode 100755 consequence_prediction/run_structural_variants.py diff --git a/consequence_prediction/run_repeat_expansion_variants.py b/consequence_prediction/run_repeat_expansion_variants.py index abc04d4b..e9e2ecaa 100755 --- a/consequence_prediction/run_repeat_expansion_variants.py +++ b/consequence_prediction/run_repeat_expansion_variants.py @@ -14,7 +14,7 @@ help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' ) parser.add_argument( - '--output-dataframe', required=True, + '--output-dataframe', required=False, help='File to output full dataframe for subsequent analysis and debugging.' ) args = parser.parse_args() diff --git a/consequence_prediction/run_structural_variants.py b/consequence_prediction/run_structural_variants.py new file mode 100755 index 00000000..cd2117c2 --- /dev/null +++ b/consequence_prediction/run_structural_variants.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +"""A wrapper script for running the repeat expansion pipeline.""" + +import argparse +import structural_variants.pipeline + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--clinvar-xml', required=True, + help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' +) +parser.add_argument( + '--output-consequences', required=True, + help='File to output functional consequences to. 
Format is compatible with the main VEP mapping pipeline.' +) + +args = parser.parse_args() +structural_variants.pipeline.main(args.clinvar_xml, args.output_consequences) diff --git a/consequence_prediction/structural_variants/pipeline.py b/consequence_prediction/structural_variants/pipeline.py index 232ec150..214f7ca7 100644 --- a/consequence_prediction/structural_variants/pipeline.py +++ b/consequence_prediction/structural_variants/pipeline.py @@ -68,7 +68,16 @@ def get_vep_results(clinvar_xml): return vep_results -def main(clinvar_xml): +def generate_consequences_file(consequences, output_consequences): + """Output final table.""" + if consequences.empty: + logger.info('There are no records ready for output') + return + # Write the consequences table. This is used by the main evidence string generation pipeline. + consequences.to_csv(output_consequences, sep='\t', index=False, header=False) + + +def main(clinvar_xml, output_consequences=None): vep_results = get_vep_results(clinvar_xml) results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) variant_data = [] @@ -82,4 +91,6 @@ def main(clinvar_xml): # Return as a dataframe to be compatible with repeat expansion pipeline consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm')) + if output_consequences is not None: + generate_consequences_file(consequences, output_consequences) return consequences diff --git a/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py b/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py index 00ad502a..5f4ca7e4 100644 --- a/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py +++ b/eva_cttv_pipeline/evidence_string_generation/clinvar_to_evidence_strings.py @@ -8,8 +8,6 @@ import jsonschema -from consequence_prediction.repeat_expansion_variants import pipeline as repeat_pipeline -from consequence_prediction.structural_variants import pipeline as structural_pipeline from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io from eva_cttv_pipeline.evidence_string_generation import consequence_type as CT @@ -113,29 +111,20 @@ def validate_evidence_string(ev_string, ot_schema_contents): sys.exit(1) -def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, - include_structural=False): +def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out): os.makedirs(dir_out, exist_ok=True) string_to_efo_mappings = load_efo_mapping(efo_mapping_file) - - repeat_consequences = repeat_pipeline.main(clinvar_xml_file) - if include_structural: - structural_consequences = structural_pipeline.main(clinvar_xml_file) - complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences, structural_consequences) - else: - complex_consequences = CT.process_consequence_type_dataframes(repeat_consequences) - variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file, complex_consequences) + variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) report = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, - output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), - include_structural=include_structural) + output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) print(report.collate_report()) 
report.write_unmapped_terms(dir_out) def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, - output_evidence_strings, include_structural): + output_evidence_strings): report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') @@ -160,7 +149,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele # origins, (2) EFO mappings, and (3) genes where the variant has effect. grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins) - consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings, include_structural) + consequence_types = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) @@ -192,8 +181,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings grouped_allele_origins, grouped_diseases, consequence_types): disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, - disease_mapped_efo_id, consequence_attributes, - include_structural=include_structural) + disease_mapped_efo_id, consequence_attributes) # Validate and immediately output the evidence string (not keeping everything in memory). validate_evidence_string(evidence_string, ot_schema_contents) @@ -219,7 +207,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings def generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, disease_mapped_efo_id, - consequence_attributes, include_structural=False): + consequence_attributes): """Generates an evidence string based on ClinVar record and some additional attributes.""" is_somatic = allele_origins == ['somatic'] evidence_string = { @@ -268,7 +256,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea # required by the Open Targets JSON schema. 'diseaseFromSourceMappedId': disease_mapped_efo_id.split('/')[-1] if disease_mapped_efo_id else None, } - if include_structural and clinvar_record.measure.preferred_current_hgvs: + if clinvar_record.measure.preferred_current_hgvs: evidence_string['variantHgvsId'] = clinvar_record.measure.preferred_current_hgvs.text # Remove the attributes with empty values (either None or empty lists). @@ -276,7 +264,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea return evidence_string -def get_consequence_types(clinvar_record_measure, consequence_type_dict, include_structural=False): +def get_consequence_types(clinvar_record_measure, consequence_type_dict): """Returns the list of functional consequences for a given ClinVar record measure. This is the place where ClinVar records are paired with the information about gene and functional consequences. 
@@ -317,7 +305,7 @@ def get_consequence_types(clinvar_record_measure, consequence_type_dict, include return consequence_type_dict[coord_id] # If there's also no complete coordinates, pair using HGVS - if include_structural and clinvar_record_measure.preferred_current_hgvs: + if clinvar_record_measure.preferred_current_hgvs: hgvs_id = clinvar_record_measure.preferred_current_hgvs.text if hgvs_id in consequence_type_dict: consequences = consequence_type_dict[hgvs_id] diff --git a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py index 08bb812b..f224f1d9 100644 --- a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py +++ b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py @@ -13,25 +13,6 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term): consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term))) -def process_consequence_type_dataframes(*dataframes): - """ - Return a dictionary of consequence information extracted from one or more dataframes. - Assumes all dataframes are in the same format. - """ - consequence_type_dict = defaultdict(list) - for consequences_dataframe in dataframes: - if consequences_dataframe is None: - continue - for row in consequences_dataframe.itertuples(): - variant_id = row[1] - ensembl_gene_id = row[2] - so_term = row[4] - - process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term) - - return consequence_type_dict - - def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None): """ Return a dictionary of consequence information extracted from the given file. @@ -90,7 +71,6 @@ class SoTerm(object): Represents a sequence ontology term belonging to a consequence type object. Holds information on accession and rank. """ - so_accession_name_dict = get_so_accession_dict() ranked_so_names_list = get_severity_ranking() diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index 5c2e0d35..574c5387 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -27,8 +27,9 @@ if (!params.batch_root || !params.schema) { } batchRoot = params.batch_root + /* - * Main workflow + * Main workflow. */ workflow { if (params.clinvar != null) { @@ -38,10 +39,16 @@ workflow { } downloadJsonSchema() - runSnp(clinvarXml) + runSnpIndel(clinvarXml) + runRepeat(clinvarXml) + runStructural(clinvarXml) + combineConsequences(runSnpIndel.out.consequencesSnp, + runRepeat.out.consequencesRepeat, + runStructural.out.consequencesStructural) + generateEvidence(clinvarXml, downloadJsonSchema.out.jsonSchema, - runSnp.out.consequenceMappingsSnp) + combineConsequences.out.consequencesCombined) checkDuplicates(generateEvidence.out.evidenceStrings) convertXrefs(clinvarXml) @@ -79,7 +86,7 @@ process downloadJsonSchema { * Run simple variants (SNPs and other variants with complete coordinates) through VEP and map them * to genes and functional consequences. */ -process runSnp { +process runSnpIndel { clusterOptions "-o ${batchRoot}/logs/consequence_vep.out \ -e ${batchRoot}/logs/consequence_vep.err" @@ -92,7 +99,7 @@ process runSnp { path clinvarXml output: - path "consequences_snp.tsv", emit: consequenceMappingsSnp + path "consequences_snp.tsv", emit: consequencesSnp script: """ @@ -106,7 +113,83 @@ process runSnp { --tmpdir . 
`# Store temporary files in the current directory to avoid /tmp overflow` \ \${PYTHON_BIN} "\${CODE_ROOT}/consequence_prediction/vep_mapping_pipeline/consequence_mapping.py" \ | sort -u > consequences_snp.tsv + """ +} + +/* + * Extract repeat expansion variants from ClinVar and map them to genes. + */ +process runRepeat { + clusterOptions "-o ${batchRoot}/logs/consequence_repeat.out \ + -e ${batchRoot}/logs/consequence_repeat.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + + input: + path clinvarXml + + output: + path "consequences_repeat.tsv", emit: consequencesRepeat + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/consequence_prediction/run_repeat_expansion_variants.py \ + --clinvar-xml ${clinvarXml} \ + --output-consequences consequences_repeat.tsv + + # create an empty file if nothing generated + [[ -f consequences_repeat.tsv ]] || touch consequences_repeat.tsv + """ +} + +/* + * Run consequence and gene mapping for structural variants (i.e. no complete coordinates and not + * known repeat expansions). + */ +process runStructural { + clusterOptions "-o ${batchRoot}/logs/consequence_structural.out \ + -e ${batchRoot}/logs/consequence_structural.err" + + publishDir "${batchRoot}/gene_mapping", + overwrite: true, + mode: "copy", + pattern: "*.tsv" + + input: + path clinvarXml + + output: + path "consequences_structural.tsv", emit: consequencesStructural + + script: + """ + \${PYTHON_BIN} \${CODE_ROOT}/consequence_prediction/run_structural_variants.py \ + --clinvar-xml ${clinvarXml} \ + --output-consequences consequences_structural.tsv + + # create an empty file if nothing generated + [[ -f consequences_structural.tsv ]] || touch consequences_structural.tsv + """ +} +/* + * Unite results of consequence mapping. 
+ */ +process combineConsequences { + input: + path consequencesSnp + path consequencesRepeat + path consequencesStructural + + output: + path "consequences_combined.tsv", emit: consequencesCombined + + script: + """ + cat ${consequencesRepeat} ${consequencesSnp} ${consequencesStructural} > consequences_combined.tsv """ } diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json index 47ea7401..8245a994 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_genetics_evidence_string.json @@ -22,6 +22,7 @@ "studyId": "RCV000002127", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000014.9:g.67729209A>G", "variantId": "14_67729209_A_G", "variantRsId": "rs28940313" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json index 5740864d..a7a398f2 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_multiple_names_evidence_string.json @@ -23,6 +23,7 @@ "studyId": "RCV000415158", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000007.14:g.94423102G>A", "variantId": "7_94423102_G_A", "variantRsId": "rs1057518967" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json index a42b3944..8dee46cf 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json +++ b/tests/eva_cttv_pipeline/evidence_string_generation/resources/expected_somatic_evidence_string.json @@ -22,6 +22,7 @@ "studyId": "RCV000002127", "targetFromSourceId": "ENSG00000139988", "variantFunctionalConsequenceId": "SO_0001583", + "variantHgvsId": "NC_000014.9:g.67729209A>G", "variantId": "14_67729209_A_G", "variantRsId": "rs28940313" } \ No newline at end of file diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py index 39e73329..37dd45a5 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_clinvar_to_evidence_strings.py @@ -130,14 +130,11 @@ def test_structural_variant_consequences(self): structural_crm = config.get_test_clinvar_record('test_structural_record.xml.gz').measure consequences = [CT.ConsequenceType('ENSG00000075151', CT.SoTerm('splice_polypyrimidine_tract_variant'))] consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences} - - # only get consequences from HGVS if include_structural is True - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict, False) == [] - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict, 
True) == consequences + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, consequence_dict) == consequences # don't get consequences if there are more than MAX_TARGET_GENES long_consequence_dict = {structural_crm.preferred_current_hgvs.text: consequences * (MAX_TARGET_GENES+1)} - assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict, True) == [] + assert clinvar_to_evidence_strings.get_consequence_types(structural_crm, long_consequence_dict) == [] class TestGenerateEvidenceStringTest: diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py b/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py index 0cfbd92e..7ecc591d 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/test_consequence_type.py @@ -25,18 +25,6 @@ def test_process_consequence_type_file_tsv(): assert consequence_type_dict["14:67729241:C:T"][0] == test_consequence_type -def test_process_consequence_type_dataframes(): - dataframe_1 = pd.DataFrame( - [('NC_000011.10:g.5226797_5226798insGCC', 'ENSG00000244734', 'HBB', 'coding_sequence_variant')], - columns=('VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm')) - dataframe_2 = pd.DataFrame( - [('RCV001051772', 'ENSG00000130711', 'PRDM12', 'trinucleotide_repeat_expansion')], - columns=('1', '2', '3', '4')) # column names can be anything - consequence_type_dict = CT.process_consequence_type_dataframes(dataframe_1, dataframe_2) - assert consequence_type_dict['NC_000011.10:g.5226797_5226798insGCC'][0].ensembl_gene_id == 'ENSG00000244734' - assert consequence_type_dict['RCV001051772'][0].ensembl_gene_id == 'ENSG00000130711' - - def test_ensembl_so_term(): so_term = CT.SoTerm('stop_gained') assert so_term.accession == 'SO_0001587' From fd51e321324783588205c60287fcf389afef1768 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 23 Aug 2022 11:25:29 +0100 Subject: [PATCH 05/10] update json schema version --- docs/generate-evidence-strings.md | 2 +- tests/eva_cttv_pipeline/evidence_string_generation/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 39fe4618..12cbbb0b 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -18,7 +18,7 @@ Next, set up the protocol-specific environment: export OT_RELEASE=YYYY-MM # Open Targets JSON schema version. -export OT_SCHEMA_VERSION=2.2.6 +export OT_SCHEMA_VERSION=2.2.7 ``` ## 1. 
Process data diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/config.py b/tests/eva_cttv_pipeline/evidence_string_generation/config.py index c7d7593e..2e2bf49a 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/config.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/config.py @@ -2,7 +2,7 @@ from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io -OT_SCHEMA_VERSION = "2.2.6" +OT_SCHEMA_VERSION = "2.2.7" test_dir = os.path.dirname(__file__) efo_mapping_file = os.path.join(test_dir, 'resources', 'string_to_ontology_mappings.tsv') From 72319973235fc9cb14f1079b7c388e8e83c34816 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 23 Aug 2022 13:53:12 +0100 Subject: [PATCH 06/10] remove include_structural param --- bin/evidence_string_generation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py index ee69cc62..c24e8f20 100755 --- a/bin/evidence_string_generation.py +++ b/bin/evidence_string_generation.py @@ -9,12 +9,10 @@ parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True) parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True) parser.add_argument('--out', help='Output directory', required=True) -parser.add_argument('--include-structural', help='Use structural variants consequence prediction pipeline', - action='store_true', default=True, required=False) if __name__ == '__main__': args = parser.parse_args() clinvar_to_evidence_strings.launch_pipeline( clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping, - ot_schema_file=args.ot_schema, dir_out=args.out, include_structural=args.include_structural) + ot_schema_file=args.ot_schema, dir_out=args.out) From 869540e8f112acd2aea36574861f7938ec40827a Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 25 Aug 2022 10:24:42 +0100 Subject: [PATCH 07/10] bump pipeline version and json schema version --- docs/generate-evidence-strings.md | 2 +- setup.py | 2 +- tests/eva_cttv_pipeline/evidence_string_generation/config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 12cbbb0b..04382060 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -18,7 +18,7 @@ Next, set up the protocol-specific environment: export OT_RELEASE=YYYY-MM # Open Targets JSON schema version. -export OT_SCHEMA_VERSION=2.2.7 +export OT_SCHEMA_VERSION=2.2.8 ``` ## 1. Process data diff --git a/setup.py b/setup.py index 8ad34e3b..5d136262 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def get_requires(): setup(name='eva_cttv_pipeline', - version='2.5.4', + version='2.6.0', packages=find_packages(), install_requires=get_requires(), #! 
TBD: list as a dependency subpackage 'clinvar_xml_utils.clinvar_xml_utils.clinvar_xml_utils' diff --git a/tests/eva_cttv_pipeline/evidence_string_generation/config.py b/tests/eva_cttv_pipeline/evidence_string_generation/config.py index 2e2bf49a..40192d6b 100644 --- a/tests/eva_cttv_pipeline/evidence_string_generation/config.py +++ b/tests/eva_cttv_pipeline/evidence_string_generation/config.py @@ -2,7 +2,7 @@ from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io -OT_SCHEMA_VERSION = "2.2.7" +OT_SCHEMA_VERSION = "2.2.8" test_dir = os.path.dirname(__file__) efo_mapping_file = os.path.join(test_dir, 'resources', 'string_to_ontology_mappings.tsv') From 9435ded1fbf6c9a4547dbad8f9ff54438ae80d23 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 26 Aug 2022 09:27:19 +0100 Subject: [PATCH 08/10] rename snp consequences log for consistency --- eva_cttv_pipeline/evidence_string_generation/pipeline.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index 574c5387..b60480d8 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -87,8 +87,8 @@ process downloadJsonSchema { * to genes and functional consequences. */ process runSnpIndel { - clusterOptions "-o ${batchRoot}/logs/consequence_vep.out \ - -e ${batchRoot}/logs/consequence_vep.err" + clusterOptions "-o ${batchRoot}/logs/consequence_snp.out \ + -e ${batchRoot}/logs/consequence_snp.err" publishDir "${batchRoot}/gene_mapping", overwrite: true, From 03e93db71c6efb4ee9c66f886dce50f848ced865 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 1 Sep 2022 08:51:37 +0100 Subject: [PATCH 09/10] address review comments --- docs/environment.md | 2 +- docs/generate-evidence-strings.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/environment.md b/docs/environment.md index a66e5d9c..788b9acd 100644 --- a/docs/environment.md +++ b/docs/environment.md @@ -1,6 +1,6 @@ # Setting up the common environment -1. Log in to the LSF cluster, where all data processing must take place. +1. Log in to the LSF cluster (currently `codon`), where all data processing must take place. 1. Using a `become` command, switch to a common EVA production user instead of your personal account. 1. Adjust and execute the commands below. They will set up the environment, fetch and build the code. Notes: - The first six variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md). diff --git a/docs/generate-evidence-strings.md b/docs/generate-evidence-strings.md index 04382060..e69e6e86 100644 --- a/docs/generate-evidence-strings.md +++ b/docs/generate-evidence-strings.md @@ -38,9 +38,7 @@ nextflow run ${CODE_ROOT}/eva_cttv_pipeline/evidence_string_generation/pipeline. -resume ``` -## 2. Manual follow-up actions - -### Check that generated evidence strings do not contain any duplicates +### Note on duplication checks The algorithm used for generating the evidence strings should not allow any duplicate values to be emitted, and the automated pipeline should fail with an error if duplicates are detected. 
A repeated evidence string will have identical values for these five fields: @@ -53,6 +51,8 @@ A repeated evidence string will have identical values for these five fields: Nevertheless, we also report evidence strings in which ``diseaseFromSourceMappedId`` may be empty (``diseaseFromSourceMappedId: null``) - i.e. the phenotype has not been mapped to an ontology yet. Therefore, to check for duplicates we also take into account the field ``diseaseFromSource``, which is the string describing the phenotype within ClinVar records (and is never missing in any evidence string). +## 2. Manual follow-up actions + ### Update summary metrics After the evidence strings have been generated, summary metrics need to be updated in the Google Sheets [table](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/) on the “Raw statistics” sheet. From a5ec64431ad1d5bacd9b7cd6a7bd3e911986693f Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 5 Sep 2022 13:51:28 +0100 Subject: [PATCH 10/10] update checkDuplicates --- eva_cttv_pipeline/evidence_string_generation/pipeline.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf index b60480d8..bda65227 100644 --- a/eva_cttv_pipeline/evidence_string_generation/pipeline.nf +++ b/eva_cttv_pipeline/evidence_string_generation/pipeline.nf @@ -233,10 +233,10 @@ process checkDuplicates { script: """ - (jq --arg sep \$'\t' -jr \ - '.datatypeId,\$sep,.studyId,\$sep,.targetFromSourceId,\$sep,.variantId,\$sep,.variantFunctionalConsequenceId,\$sep,.diseaseFromSourceMappedId,\sep,.diseaseFromSource,"\n"' \ - ${evidenceStrings} | sort | uniq -d > duplicates.tsv) \ - || [[ -z duplicates.tsv ]] + jq --arg sep \$'\t' -jr \ + '.datatypeId,\$sep,.studyId,\$sep,.targetFromSourceId,\$sep,.variantId,\$sep,.variantFunctionalConsequenceId,\$sep,.diseaseFromSourceMappedId,\$sep,.diseaseFromSource,"\n"' \ + ${evidenceStrings} | sort | uniq -d > duplicates.tsv + [[ ! -s duplicates.tsv ]] """ }
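
The final patch corrects two problems in `checkDuplicates`: the jq filter was missing a `\$` before one `sep` reference, and the earlier construct could not actually fail on duplicates, because `uniq -d` succeeds whether or not it finds repeats and the fallback `[[ -z duplicates.tsv ]]` tests the length of the literal string rather than the contents of the file. With `[[ ! -s duplicates.tsv ]]` as the last command of the script block, the process exits non-zero whenever any duplicate keys were written, which is what makes the Nextflow run fail. A minimal sketch of the guard's behaviour, using throwaway files rather than real pipeline output:

```bash
# Illustrative only: how the exit-status guard reacts to empty vs non-empty results.
: > duplicates.tsv                         # empty file, i.e. no duplicates found
[[ ! -s duplicates.tsv ]] && echo "empty -> test passes, process succeeds"

printf 'dup\trecord\n' > duplicates.tsv    # non-empty file, i.e. duplicates found
[[ ! -s duplicates.tsv ]] || echo "non-empty -> test fails, process exits non-zero"
```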