Skip to content

Commit

Permalink
Merge pull request #30 from tcezard/EVA3553_get_md5
Browse files Browse the repository at this point in the history
EVA-3553 - Calculate md5 of VCF files and add them to the JSON metadata
  • Loading branch information
tcezard authored May 6, 2024
2 parents 9a0aca0 + 30b27e3 commit 9fb8832
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 1 deletion.
35 changes: 35 additions & 0 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ workflow {
check_vcf_valid(vcf_channel)
check_vcf_reference(vcf_channel)

generate_md5_digests(vcf_files)
collect_md5(generate_md5_digests.out.md5_digest.collect())


// Metadata conversion
if (params.metadata_xlsx && !params.metadata_json){
convert_xlsx_2_json(joinBasePath(params.metadata_xlsx))
Expand Down Expand Up @@ -146,6 +150,37 @@ process check_vcf_reference {
"""
}

process generate_md5_digests {
    // Compute the MD5 digest of one VCF file and write it to "<vcf_file>.md5".

    input:
    path vcf_file

    output:
    path("${vcf_file}.md5"), emit: md5_digest

    script:
    // md5sum is run on the output of readlink instead of the staged name, so the digest
    // line records the link target and the file can be resolved by path rather than by name.
    // NOTE(review): this assumes the input is staged as a symlink - confirm.
    """
    md5sum `readlink ${vcf_file}` > ${vcf_file}.md5
    """
}

process collect_md5 {
    // Concatenate the received digest files into md5sums.txt, which is copied to output_dir.
    publishDir output_dir, mode: "copy", overwrite: true

    input:
    path file_digests

    output:
    path("md5sums.txt"), emit: md5_digest_log

    script:
    """
    cat ${file_digests} > md5sums.txt
    """
}


process convert_xlsx_2_json {
publishDir output_dir,
Expand Down
4 changes: 4 additions & 0 deletions eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ def __init__(self, mapping_file, output_dir, metadata_json=None,
def _validate(self):
self.run_docker_validator()

@staticmethod
def _validation_file_path_for(file_path):
    """Return ``file_path`` prefixed with ``container_validation_dir`` and a ``/`` separator."""
    return '{}/{}'.format(container_validation_dir, file_path)

def get_docker_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
docker_cmd = (
Expand Down
50 changes: 50 additions & 0 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import csv
import datetime
import glob
import json
import logging
import os
import re
Expand Down Expand Up @@ -57,6 +58,12 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.sub_config.backup()
self.sub_config.write()

@property
def metadata_json_post_validation(self):
    """Return the metadata JSON path to use once validation has run.

    The explicitly configured ``self.metadata_json`` wins when it is set. Otherwise the
    result of resolving ``metadata.json`` inside ``self.output_dir`` is returned
    (presumably the file generated from the spreadsheet - confirm against the workflow).
    """
    if not self.metadata_json:
        generated_json = os.path.join(self.output_dir, 'metadata.json')
        return resolve_single_file_path(generated_json)
    return self.metadata_json

@staticmethod
def _run_quiet_command(command_description, command, **kwargs):
return run_command_with_output(command_description, command, stdout_log_level=logging.DEBUG,
Expand Down Expand Up @@ -90,6 +97,10 @@ def report(self):
def _validate(self):
raise NotImplementedError

@staticmethod
def _validation_file_path_for(file_path):
    """Return the path used to look up ``file_path`` in validation outputs.

    This implementation is the identity mapping: the path is returned unchanged.
    """
    validation_path = file_path
    return validation_path

def verify_files_present(self):
# verify mapping file exists
if not os.path.exists(self.mapping_file):
Expand Down Expand Up @@ -330,6 +341,7 @@ def _collect_metadata_results(self):
self._parse_biovalidator_validation_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
self._collect_md5sum_to_metadata()

def _load_spreadsheet_conversion_errors(self):
errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'metadata_conversion_errors.yml'))
Expand Down Expand Up @@ -450,6 +462,44 @@ def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
if attributes_dict[attribute] == json_attribute:
return attribute

def _collect_md5sum_to_metadata(self):
    """Add the md5 digest of each VCF file to the ``files`` section of the metadata JSON.

    Digests are read from ``md5sums.txt`` in the output directory, one ``<md5> <path>``
    line per file. Every ``files`` entry whose ``fileType`` is ``vcf`` receives an ``md5``
    key. The digest is looked up first by the path returned from
    ``_validation_file_path_for`` and then by the bare file name; an empty string is
    stored when neither matches. Entries of other types are kept unchanged. The metadata
    JSON file is then rewritten in place.

    Errors raised while loading or processing the JSON are reported through
    ``self.error`` and the metadata file is left untouched.
    """
    md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'md5sums.txt'))
    file_path_2_md5 = {}
    file_name_2_md5 = {}
    if md5sum_file:
        with open(md5sum_file) as open_file:
            for line in open_file:
                # The digest is the first token: the rest of the line is the file path
                md5sum, _, vcf_file = line.strip().partition(' ')
                vcf_file = vcf_file.lstrip()
                if not md5sum or not vcf_file:
                    # Blank or truncated line: nothing usable to record
                    continue
                file_path_2_md5[vcf_file] = md5sum
                file_name_2_md5[os.path.basename(vcf_file)] = md5sum

    # Resolve the property once: each access may hit the file system
    metadata_json = self.metadata_json_post_validation
    if not metadata_json:
        return

    # Stays None unless the metadata was loaded and updated successfully
    json_data = None
    with open(metadata_json) as open_file:
        try:
            loaded_data = json.load(open_file)
            file_rows = []
            files_from_metadata = loaded_data.get('files', [])
            if files_from_metadata:
                for file_dict in files_from_metadata:
                    if file_dict.get('fileType') == 'vcf':
                        file_name = file_dict.get('fileName')
                        file_path = self._validation_file_path_for(file_name)
                        file_dict['md5'] = (
                            file_path_2_md5.get(file_path)
                            or file_name_2_md5.get(file_name)
                            or ''
                        )
                    file_rows.append(file_dict)
            else:
                # NOTE(review): the message mentions analysis aliases although this method
                # does not inspect them - confirm the intended wording.
                self.error('No file found in metadata and multiple analysis alias exist: '
                           'cannot infer the relationship between files and analysis alias')
            loaded_data['files'] = file_rows
            json_data = loaded_data
        except Exception as e:
            # Skip adding the md5: best effort, the validation results are still usable
            self.error('Error while loading or parsing metadata json: ' + str(e))

    if json_data:
        with open(metadata_json, 'w') as open_file:
            json.dump(json_data, open_file)


def create_reports(self):
report_html = generate_html_report(self.results, self.validation_date, self.project_title)
file_path = os.path.join(self.output_dir, 'report.html')
Expand Down
8 changes: 7 additions & 1 deletion tests/test_docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def test_validate(self):
# run validation in docker
self.validator.validate()

# assert vcf checks
vcf_format_dir = os.path.join(self.validator.output_dir, 'vcf_format')
self.assertTrue(os.path.exists(vcf_format_dir))

Expand Down Expand Up @@ -123,6 +122,13 @@ def test_validate(self):
}
self.assert_sample_checker(self.validator._sample_check_yaml, expected_checker)

with open(self.validator.metadata_json_post_validation) as open_file:
json_data = json.load(open_file)
assert json_data.get('files') == [
{'analysisAlias': 'AA', 'fileName': 'input_passed.vcf', 'fileType': 'vcf',
'md5': '96a80c9368cc3c37095c86fbe6044fb2'}
]

def test_validate_from_excel(self):
self.validator_from_excel.validate()
self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))

0 comments on commit 9fb8832

Please sign in to comment.