From bfcd4afaf3b9aeb538e5e713cb52df6ba0348270 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Mon, 14 Mar 2016 12:12:59 -0500 Subject: [PATCH 1/7] first checked in --- src/toil_scripts/gatk_concordance/__init__.py | 0 .../gatk_concordance/gatk_concordance.py | 103 ++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 src/toil_scripts/gatk_concordance/__init__.py create mode 100644 src/toil_scripts/gatk_concordance/gatk_concordance.py diff --git a/src/toil_scripts/gatk_concordance/__init__.py b/src/toil_scripts/gatk_concordance/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py new file mode 100644 index 00000000..6abf43f3 --- /dev/null +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python2.7 + +import argparse +import multiprocessing +import os +import subprocess + +from toil.job import Job + +from toil_scripts import download_from_s3_url +from toil_scripts.batch_alignment.bwa_alignment import upload_to_s3, docker_call + +def build_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--reference', required=True, help="Reference Genome URL") + parser.add_argument('-i', '--reference_index', required=True, help="Reference Genome index (.fai) URL") + parser.add_argument('-e', '--eval_vcf', required=True, help="VCF file URL to evaluate") + parser.add_argument('-c', '--comp_vcf', required=True, help="VCF file URL to compare against") + parser.add_argument('-o', '--output_dir', required=True, help="Output directory S3 URL") + return parser + + +# copied from germline.py +def download_url_c(job, url, filename): + work_dir = job.fileStore.getLocalTempDir() + file_path = os.path.join(work_dir, filename) + if not os.path.exists(file_path): + if url.startswith('s3:'): + download_from_s3_url(file_path, url) + else: + try: + subprocess.check_call(['curl', '-fs', '--retry', '5', '--create-dir', url, '-o', file_path]) + except subprocess.CalledProcessError as cpe: + raise RuntimeError( + '\nNecessary file could not be acquired: %s. Got error "%s". Check input URL' % (url, cpe)) + except OSError: + raise RuntimeError('Failed to find "curl". Install via "apt-get install curl"') + assert os.path.exists(file_path) + return job.fileStore.writeGlobalFile(file_path) + + +# copied from germline.py +def read_from_filestore_c(job, work_dir, ids, *filenames): + for filename in filenames: + if not os.path.exists(os.path.join(work_dir, filename)): + job.fileStore.readGlobalFile(ids[filename], os.path.join(work_dir, filename)) + + +def batch_start(job, input_args): + shared_files = ['ref.fa', 'ref.fa.fai', 'eval.vcf', 'comp.vcf'] + shared_ids = {} + for file_name in shared_files: + url = input_args[file_name] + shared_ids[file_name] = job.addChildJobFn(download_url_c, url, file_name).rv() + job.addFollowOnJobFn(concordance, shared_ids, input_args) + + +def concordance(job, shared_ids, input_args): + """ + Evaluate concordance between two VCFs with GATK. + """ + work_dir = job.fileStore.getLocalTempDir() + input_files = ['ref.fa', 'ref.fa.fai', 'eval.vcf', 'comp.vcf'] + read_from_filestore_c(job, work_dir, shared_ids, *input_files) + + eval_name = input_args['eval.vcf'].split('/')[-1] + comp_name = input_args['comp.vcf'].split('/')[-1] + output = '%s_vs_%s.concordance' % (eval_name, comp_name) + command = ['-T', 'GenotypeConcordance', + '-R', 'ref.fa', + '--eval', 'eval.vcf', + '--comp', 'comp.vcf', + '-o', output + ] + try: + docker_call(work_dir = work_dir, + tool_parameters = command, + tool = 'quay.io/ucsc_cgl/gatk', + sudo = input_args['sudo']) + except: + sys.stderr.write("Running concordance with %s in %s failed." % ( + " ".join(command), work_dir)) + raise + + upload_to_s3(work_dir, input_args, output) + + +if __name__ == '__main__': + args_parser = build_parser() + Job.Runner.addToilOptions(args_parser) + args = args_parser.parse_args() + + inputs = {'ref.fa': args.reference, + 'ref.fa.fai': args.reference_index, + 'eval.vcf', args.eval_vcf, + 'comp.vcf', args.comp_vcf, + 's3_dir', args.output_dir, + 'uuid': None, + 'cpu_count': str(multiprocessing.cpu_count()), + 'ssec': None, + 'sudo': False} + + Job.Runner.startToil(Job.wrapJobFn(batch_start, inputs), args) From 47352a0f552776e3fd80bd9eb286b999824f1e65 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Mon, 14 Mar 2016 13:18:55 -0500 Subject: [PATCH 2/7] addressing review comments --- .../gatk_concordance/gatk_concordance.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index 6abf43f3..0f1fde12 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -70,17 +70,12 @@ def concordance(job, shared_ids, input_args): '-R', 'ref.fa', '--eval', 'eval.vcf', '--comp', 'comp.vcf', - '-o', output - ] - try: - docker_call(work_dir = work_dir, - tool_parameters = command, - tool = 'quay.io/ucsc_cgl/gatk', - sudo = input_args['sudo']) - except: - sys.stderr.write("Running concordance with %s in %s failed." % ( - " ".join(command), work_dir)) - raise + '-o', output] + + docker_call(work_dir = work_dir, + tool_parameters = command, + tool = 'quay.io/ucsc_cgl/gatk', + sudo = input_args['sudo']) upload_to_s3(work_dir, input_args, output) From 8a4f767fc2ef215648cd93a98e8515a260c8b8ea Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 15 Mar 2016 11:34:07 -0500 Subject: [PATCH 3/7] minor syntax errors --- src/toil_scripts/gatk_concordance/gatk_concordance.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index 0f1fde12..28ba95a5 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -14,8 +14,8 @@ def build_parser(): parser = argparse.ArgumentParser() parser.add_argument('-r', '--reference', required=True, help="Reference Genome URL") parser.add_argument('-i', '--reference_index', required=True, help="Reference Genome index (.fai) URL") - parser.add_argument('-e', '--eval_vcf', required=True, help="VCF file URL to evaluate") - parser.add_argument('-c', '--comp_vcf', required=True, help="VCF file URL to compare against") + parser.add_argument('-1', '--eval_vcf', required=True, help="VCF file URL to evaluate") + parser.add_argument('-2', '--comp_vcf', required=True, help="VCF file URL to compare against") parser.add_argument('-o', '--output_dir', required=True, help="Output directory S3 URL") return parser @@ -87,9 +87,9 @@ def concordance(job, shared_ids, input_args): inputs = {'ref.fa': args.reference, 'ref.fa.fai': args.reference_index, - 'eval.vcf', args.eval_vcf, - 'comp.vcf', args.comp_vcf, - 's3_dir', args.output_dir, + 'eval.vcf': args.eval_vcf, + 'comp.vcf': args.comp_vcf, + 's3_dir': args.output_dir, 'uuid': None, 'cpu_count': str(multiprocessing.cpu_count()), 'ssec': None, From a0a20e5b05d60e365654487f8ce0323e01637b77 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 15 Mar 2016 15:18:21 -0500 Subject: [PATCH 4/7] adding ref.fa.dict parameter --- .../gatk_concordance/gatk_concordance.py | 37 ++++--------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index 28ba95a5..69ff0981 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -9,49 +9,25 @@ from toil_scripts import download_from_s3_url from toil_scripts.batch_alignment.bwa_alignment import upload_to_s3, docker_call +from toil_scripts.gatk_germline.germline import download_url, read_from_filestore_hc def build_parser(): parser = argparse.ArgumentParser() parser.add_argument('-r', '--reference', required=True, help="Reference Genome URL") parser.add_argument('-i', '--reference_index', required=True, help="Reference Genome index (.fai) URL") + parser.add_argument('-d', '--reference_dict', required=True, help="Reference Genome sequence dictionary (.dict) URL") parser.add_argument('-1', '--eval_vcf', required=True, help="VCF file URL to evaluate") parser.add_argument('-2', '--comp_vcf', required=True, help="VCF file URL to compare against") parser.add_argument('-o', '--output_dir', required=True, help="Output directory S3 URL") return parser -# copied from germline.py -def download_url_c(job, url, filename): - work_dir = job.fileStore.getLocalTempDir() - file_path = os.path.join(work_dir, filename) - if not os.path.exists(file_path): - if url.startswith('s3:'): - download_from_s3_url(file_path, url) - else: - try: - subprocess.check_call(['curl', '-fs', '--retry', '5', '--create-dir', url, '-o', file_path]) - except subprocess.CalledProcessError as cpe: - raise RuntimeError( - '\nNecessary file could not be acquired: %s. Got error "%s". Check input URL' % (url, cpe)) - except OSError: - raise RuntimeError('Failed to find "curl". Install via "apt-get install curl"') - assert os.path.exists(file_path) - return job.fileStore.writeGlobalFile(file_path) - - -# copied from germline.py -def read_from_filestore_c(job, work_dir, ids, *filenames): - for filename in filenames: - if not os.path.exists(os.path.join(work_dir, filename)): - job.fileStore.readGlobalFile(ids[filename], os.path.join(work_dir, filename)) - - def batch_start(job, input_args): - shared_files = ['ref.fa', 'ref.fa.fai', 'eval.vcf', 'comp.vcf'] + shared_files = ['ref.fa', 'ref.fa.fai', 'ref.fa.dict', 'eval.vcf', 'comp.vcf'] shared_ids = {} for file_name in shared_files: url = input_args[file_name] - shared_ids[file_name] = job.addChildJobFn(download_url_c, url, file_name).rv() + shared_ids[file_name] = job.addChildJobFn(download_url, url, file_name).rv() job.addFollowOnJobFn(concordance, shared_ids, input_args) @@ -60,8 +36,8 @@ def concordance(job, shared_ids, input_args): Evaluate concordance between two VCFs with GATK. """ work_dir = job.fileStore.getLocalTempDir() - input_files = ['ref.fa', 'ref.fa.fai', 'eval.vcf', 'comp.vcf'] - read_from_filestore_c(job, work_dir, shared_ids, *input_files) + input_files = ['ref.fa', 'ref.fa.fai', 'ref.fa.dict', 'eval.vcf', 'comp.vcf'] + read_from_filestore_hc(job, work_dir, shared_ids, *input_files) eval_name = input_args['eval.vcf'].split('/')[-1] comp_name = input_args['comp.vcf'].split('/')[-1] @@ -87,6 +63,7 @@ def concordance(job, shared_ids, input_args): inputs = {'ref.fa': args.reference, 'ref.fa.fai': args.reference_index, + 'ref.fa.dict': args.reference_dict, 'eval.vcf': args.eval_vcf, 'comp.vcf': args.comp_vcf, 's3_dir': args.output_dir, From 8900a287e7ffb394a3c861077f7ac6cafb3c979f Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 15 Mar 2016 15:42:11 -0500 Subject: [PATCH 5/7] ref.fa.dict --> ref.dict --- src/toil_scripts/gatk_concordance/gatk_concordance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index 69ff0981..36fa0d3f 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -23,7 +23,7 @@ def build_parser(): def batch_start(job, input_args): - shared_files = ['ref.fa', 'ref.fa.fai', 'ref.fa.dict', 'eval.vcf', 'comp.vcf'] + shared_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf'] shared_ids = {} for file_name in shared_files: url = input_args[file_name] @@ -36,7 +36,7 @@ def concordance(job, shared_ids, input_args): Evaluate concordance between two VCFs with GATK. """ work_dir = job.fileStore.getLocalTempDir() - input_files = ['ref.fa', 'ref.fa.fai', 'ref.fa.dict', 'eval.vcf', 'comp.vcf'] + input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf'] read_from_filestore_hc(job, work_dir, shared_ids, *input_files) eval_name = input_args['eval.vcf'].split('/')[-1] @@ -63,7 +63,7 @@ def concordance(job, shared_ids, input_args): inputs = {'ref.fa': args.reference, 'ref.fa.fai': args.reference_index, - 'ref.fa.dict': args.reference_dict, + 'ref.dict': args.reference_dict, 'eval.vcf': args.eval_vcf, 'comp.vcf': args.comp_vcf, 's3_dir': args.output_dir, From a39489a5f53a0a3ef715025078a1a866418704dc Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Fri, 18 Mar 2016 10:14:27 -0500 Subject: [PATCH 6/7] adding eval_label and comp_label --- src/toil_scripts/gatk_concordance/gatk_concordance.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index 36fa0d3f..d76d4454 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -17,7 +17,9 @@ def build_parser(): parser.add_argument('-i', '--reference_index', required=True, help="Reference Genome index (.fai) URL") parser.add_argument('-d', '--reference_dict', required=True, help="Reference Genome sequence dictionary (.dict) URL") parser.add_argument('-1', '--eval_vcf', required=True, help="VCF file URL to evaluate") + parser.add_argument('-y', '--eval_label', required=False, help="Optional label for VCF file to evaluate, defaults to file name") parser.add_argument('-2', '--comp_vcf', required=True, help="VCF file URL to compare against") + parser.add_argument('-z', '--comp_label', required=False, help="Optional label for VCF file to compare against, defaults to file name") parser.add_argument('-o', '--output_dir', required=True, help="Output directory S3 URL") return parser @@ -39,11 +41,13 @@ def concordance(job, shared_ids, input_args): input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf'] read_from_filestore_hc(job, work_dir, shared_ids, *input_files) - eval_name = input_args['eval.vcf'].split('/')[-1] - comp_name = input_args['comp.vcf'].split('/')[-1] + # todo: is this the best way to do this? + eval_name = input_args['eval_label'] if 'eval_label' in input_args else input_args['eval.vcf'].split('/')[-1] + comp_name = input_args['comp_label'] if 'comp_label' in input_args else input_args['comp.vcf'].split('/')[-1] output = '%s_vs_%s.concordance' % (eval_name, comp_name) command = ['-T', 'GenotypeConcordance', '-R', 'ref.fa', + '-nct', input_args['cpu_count'], '--eval', 'eval.vcf', '--comp', 'comp.vcf', '-o', output] @@ -65,9 +69,12 @@ def concordance(job, shared_ids, input_args): 'ref.fa.fai': args.reference_index, 'ref.dict': args.reference_dict, 'eval.vcf': args.eval_vcf, + 'eval_label': args.eval_label, 'comp.vcf': args.comp_vcf, + 'comp_label': args.comp_label, 's3_dir': args.output_dir, 'uuid': None, + # todo: this is req'd by gatk and by upload_to_s3 'cpu_count': str(multiprocessing.cpu_count()), 'ssec': None, 'sudo': False} From 88c46251443a9c24eaf2308fb3f8e8dbcd930940 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Fri, 18 Mar 2016 18:01:34 -0500 Subject: [PATCH 7/7] fix label defaults --- .../gatk_concordance/gatk_concordance.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/toil_scripts/gatk_concordance/gatk_concordance.py b/src/toil_scripts/gatk_concordance/gatk_concordance.py index d76d4454..cae32c1c 100644 --- a/src/toil_scripts/gatk_concordance/gatk_concordance.py +++ b/src/toil_scripts/gatk_concordance/gatk_concordance.py @@ -8,19 +8,20 @@ from toil.job import Job from toil_scripts import download_from_s3_url -from toil_scripts.batch_alignment.bwa_alignment import upload_to_s3, docker_call +from toil_scripts.batch_alignment.bwa_alignment import upload_or_move, docker_call from toil_scripts.gatk_germline.germline import download_url, read_from_filestore_hc def build_parser(): parser = argparse.ArgumentParser() - parser.add_argument('-r', '--reference', required=True, help="Reference Genome URL") - parser.add_argument('-i', '--reference_index', required=True, help="Reference Genome index (.fai) URL") - parser.add_argument('-d', '--reference_dict', required=True, help="Reference Genome sequence dictionary (.dict) URL") - parser.add_argument('-1', '--eval_vcf', required=True, help="VCF file URL to evaluate") - parser.add_argument('-y', '--eval_label', required=False, help="Optional label for VCF file to evaluate, defaults to file name") - parser.add_argument('-2', '--comp_vcf', required=True, help="VCF file URL to compare against") - parser.add_argument('-z', '--comp_label', required=False, help="Optional label for VCF file to compare against, defaults to file name") - parser.add_argument('-o', '--output_dir', required=True, help="Output directory S3 URL") + parser.add_argument('-r', '--reference', required=True, help='Reference Genome URL') + parser.add_argument('-i', '--reference_index', required=True, help='Reference Genome index (.fai) URL') + parser.add_argument('-d', '--reference_dict', required=True, help='Reference Genome sequence dictionary (.dict) URL') + parser.add_argument('-1', '--eval_vcf', required=True, help='VCF file URL to evaluate') + parser.add_argument('-y', '--eval_label', required=False, help='Optional label for VCF file to evaluate, defaults to file name') + parser.add_argument('-2', '--comp_vcf', required=True, help='VCF file URL to compare against') + parser.add_argument('-z', '--comp_label', required=False, help='Optional label for VCF file to compare against, defaults to file name') + parser.add_argument('-o', '--output_dir', required=False, default=None, help='Local output directory, use one of this option or --s3_dir') + parser.add_argument('-3', '--s3_dir', required=False, default=None, help='S3 output directory, starting with bucket name, use one of this option or --output_dir') return parser @@ -41,9 +42,8 @@ def concordance(job, shared_ids, input_args): input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf'] read_from_filestore_hc(job, work_dir, shared_ids, *input_files) - # todo: is this the best way to do this? - eval_name = input_args['eval_label'] if 'eval_label' in input_args else input_args['eval.vcf'].split('/')[-1] - comp_name = input_args['comp_label'] if 'comp_label' in input_args else input_args['comp.vcf'].split('/')[-1] + eval_name = input_args.get('eval_label', os.path.basename(input_args['eval.vcf'])) + comp_name = input_args.get('comp_label', os.path.basename(input_args['comp.vcf'])) output = '%s_vs_%s.concordance' % (eval_name, comp_name) command = ['-T', 'GenotypeConcordance', '-R', 'ref.fa', @@ -57,7 +57,7 @@ def concordance(job, shared_ids, input_args): tool = 'quay.io/ucsc_cgl/gatk', sudo = input_args['sudo']) - upload_to_s3(work_dir, input_args, output) + upload_or_move(work_dir, input_args, output) if __name__ == '__main__': @@ -72,7 +72,8 @@ def concordance(job, shared_ids, input_args): 'eval_label': args.eval_label, 'comp.vcf': args.comp_vcf, 'comp_label': args.comp_label, - 's3_dir': args.output_dir, + 'output_dir': args.output_dir, + 's3_dir': args.s3_dir, 'uuid': None, # todo: this is req'd by gatk and by upload_to_s3 'cpu_count': str(multiprocessing.cpu_count()),