diff --git a/Snakefile b/Snakefile index 318df52..5751211 100644 --- a/Snakefile +++ b/Snakefile @@ -33,13 +33,13 @@ configfile: Path("config/parameters.yaml") # Use Python functions to automatically detect batches of genomes fasta files # in the input directory as 'BATCHES' -INPUT_DIR = config["input_directory"] +WORK_DIR = config["working_directory"] -BATCH_PATHS = list(Path(INPUT_DIR).glob("batch_*")) +BATCH_PATHS = list((Path(WORK_DIR) / "assemblies").glob("atb.assembly.*")) for batch in BATCH_PATHS: assert Path(batch).is_dir(), f"Batches must be directories, got {batch}" -BATCHES = [batch.stem for batch in BATCH_PATHS] +BATCHES = [batch.name for batch in BATCH_PATHS] OUTPUT_DIR = config["output_directory"] @@ -50,15 +50,15 @@ OUTPUT_DIR = config["output_directory"] rule all: input: # Multilocus Sequence Types (ST) for Campylobacter - "data/processed/mlst_table.tsv", + OUTPUT_DIR + "mlst_table.tsv", # Virus and plasmid predictions per contig - "data/processed/genomad_predictions.csv", - "data/processed/jaeger_predictions.csv", + OUTPUT_DIR + "genomad_predictions.csv", + OUTPUT_DIR + "jaeger_predictions.csv", # Phage defence systems - "data/processed/padloc_table.csv", + OUTPUT_DIR + "padloc_table.csv", # Concatenated CCTyper output expand( - OUTPUT_DIR + "cctyper/{batch}/{filename}-{batch}.tab", + WORK_DIR + "cctyper/{batch}/{filename}-{batch}.tab", batch=BATCHES, filename=[ "CRISPR_Cas", @@ -68,31 +68,31 @@ rule all: "cas_operons", ], ), - expand(OUTPUT_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), + expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), # Combined CCTyper output as CSV + BED files - expand(OUTPUT_DIR + "cctyper/{batch}/parsed", batch=BATCHES), + expand(WORK_DIR + "cctyper/{batch}/parsed", batch=BATCHES), # CCTyper CRISPR spacer cluster analysis report - OUTPUT_DIR + "cctyper/spacer_cluster_summary.tsv", + WORK_DIR + "cctyper/spacer_cluster_summary.tsv", # Cluster unique CRISPR spacers - OUTPUT_DIR + "cctyper/all_spacers-clustered.clstr", - OUTPUT_DIR + "crispridentify/all_spacers-clustered.clstr", - "data/processed/all_spacers_table_identify.tsv", - "data/processed/all_spacers_table.tsv", + WORK_DIR + "cctyper/all_spacers-clustered.clstr", + WORK_DIR + "crispridentify/all_spacers-clustered.clstr", + OUTPUT_DIR + "all_spacers_table_identify.tsv", + OUTPUT_DIR + "all_spacers_table.tsv", # Extracted CRISPR arrays (as fasta) - expand(OUTPUT_DIR + "arrays/{batch}/complete", batch=BATCHES), + expand(WORK_DIR + "arrays/{batch}/complete", batch=BATCHES), #CRISPRidentify output - expand(OUTPUT_DIR + "crispridentify/{batch}/complete", batch=BATCHES), + expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES), #concatenated CRISPRidentify output - OUTPUT_DIR + "crispridentify/complete_summary.csv", - OUTPUT_DIR + "crispridentify/all_spacers.fa", + WORK_DIR + "crispridentify/complete_summary.csv", + WORK_DIR + "crispridentify/all_spacers.fa", #merged CRISPRidentify and CCtyper output - "data/processed/all_CRISPRS_with_identify.tab", + OUTPUT_DIR + "all_CRISPRS_with_identify.tab", #spacepharer output - "data/processed/phage_matches.tsv", - "data/processed/plasmid_matches.tsv", + OUTPUT_DIR + "phage_matches.tsv", + OUTPUT_DIR + "plasmid_matches.tsv", #KMA output - OUTPUT_DIR + "kma/output/CRISPR.frag.gz", - OUTPUT_DIR + "kma/CRISPR_alignment" + WORK_DIR + "kma/output/CRISPR.frag.gz", + WORK_DIR + "kma/CRISPR_alignment", ### Step 3: Define processing steps that generate the output ### @@ -100,7 +100,7 @@ rule all: rule download_mlst_database: output: - OUTPUT_DIR + "mlst/campylobacter.db", + WORK_DIR + "mlst/campylobacter.db", params: species=config["mlst"]["species"], conda: @@ -118,10 +118,10 @@ claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1 rule mlst: input: - batch=INPUT_DIR + "{batch}/", - db=OUTPUT_DIR + "mlst/campylobacter.db", + batch=WORK_DIR + "assemblies/{batch}/", + db=WORK_DIR + "mlst/campylobacter.db", output: - OUTPUT_DIR + "mlst/{batch}/complete", + WORK_DIR + "mlst/{batch}/complete", conda: "envs/pymlst.yaml" threads: config["mlst"]["threads"] @@ -141,9 +141,9 @@ touch {output} rule concatenate_mlst_batches: input: - OUTPUT_DIR + "mlst/{batch}/complete", + WORK_DIR + "mlst/{batch}/complete", output: - OUTPUT_DIR + "mlst/{batch}-concatenated.tsv", + WORK_DIR + "mlst/{batch}-concatenated.tsv", threads: config["mlst"]["threads"] log: "log/concatenate_mlst/{batch}.txt", @@ -154,18 +154,18 @@ rule concatenate_mlst_batches: echo -e "Genome\tST" > {output} find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'tail -n 1 {{}} | cut -f 1-2 >> {output}' + 'tail -n +2 {{}} | cut -f 1-2 >> {output}' """ rule concatenate_mlst_all: input: - expand(OUTPUT_DIR + "mlst/{batch}-concatenated.tsv", batch=BATCHES) + expand(WORK_DIR + "mlst/{batch}-concatenated.tsv", batch=BATCHES), output: - "data/processed/mlst_table.tsv" + OUTPUT_DIR + "mlst_table.tsv", threads: 1 log: - "log/concatenate_mlst_all.txt" + "log/concatenate_mlst_all.txt", benchmark: "log/benchmark/concatenate_mlst_all.txt" shell: @@ -178,11 +178,11 @@ sed --separate 1d ${{batches[@]}} >> {output} rule crisprcastyper: input: - batch=INPUT_DIR + "{batch}/", + batch=WORK_DIR + "assemblies/{batch}/", output: - OUTPUT_DIR + "cctyper/{batch}/complete", + WORK_DIR + "cctyper/{batch}/complete", params: - out_dir=OUTPUT_DIR + "cctyper/{batch}/", + out_dir=WORK_DIR + "cctyper/{batch}/", conda: "envs/cctyper.yaml" threads: config["cctyper"]["threads"] @@ -203,7 +203,7 @@ touch {output} rule download_padloc_database: output: - OUTPUT_DIR + "padloc/database", + WORK_DIR + "padloc/database", conda: "envs/padloc.yaml" threads: 1 @@ -220,10 +220,10 @@ touch {output} rule padloc: input: - batch=INPUT_DIR + "{batch}/", - db=OUTPUT_DIR + "padloc/database", + batch=WORK_DIR + "assemblies/{batch}/", + db=WORK_DIR + "padloc/database", output: - OUTPUT_DIR + "padloc/{batch}/complete", + WORK_DIR + "padloc/{batch}/complete", conda: "envs/padloc.yaml" threads: config["padloc"]["threads"] @@ -243,9 +243,9 @@ touch {output} rule concatenate_padloc_batches: input: - OUTPUT_DIR + "padloc/{batch}/complete", + WORK_DIR + "padloc/{batch}/complete", output: - OUTPUT_DIR + "padloc/{batch}-concatenated.csv", + WORK_DIR + "padloc/{batch}-concatenated.csv", threads: config["padloc"]["threads"] log: "log/concatenate_padloc/{batch}.txt", @@ -256,18 +256,18 @@ rule concatenate_padloc_batches: file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") ) head -1 ${{file_array[0]}} > {output} parallel --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'tail -n 1 {{}} >> {output}' ::: ${{file_array[@]}} + 'tail -n +2 {{}} >> {output}' ::: ${{file_array[@]}} """ rule concatenate_padloc_all: input: - expand(OUTPUT_DIR + "padloc/{batch}-concatenated.csv", batch=BATCHES) + expand(WORK_DIR + "padloc/{batch}-concatenated.csv", batch=BATCHES), output: - "data/processed/padloc_table.csv" + OUTPUT_DIR + "padloc_table.csv", threads: 1 log: - "log/concatenate_padloc_all.txt" + "log/concatenate_padloc_all.txt", benchmark: "log/benchmark/concatenate_padloc_all.txt" shell: @@ -280,9 +280,9 @@ sed --separate 1d ${{batches[@]}} >> {output} rule parse_cctyper: input: - OUTPUT_DIR + "cctyper/{batch}/complete", + WORK_DIR + "cctyper/{batch}/complete", output: - OUTPUT_DIR + "cctyper/{batch}/parsed", + WORK_DIR + "cctyper/{batch}/parsed", conda: "envs/pandas.yaml" threads: config["parse_cctyper"]["threads"] @@ -302,9 +302,9 @@ touch {output} rule extract_sequences: input: - OUTPUT_DIR + "cctyper/{batch}/parsed", + WORK_DIR + "cctyper/{batch}/parsed", output: - OUTPUT_DIR + "cctyper/{batch}/subseq", + WORK_DIR + "cctyper/{batch}/subseq", conda: "envs/seqkit.yaml" threads: config["extract_sequences"]["threads"] @@ -324,19 +324,17 @@ touch {output} rule collect_cctyper: input: - cctyper=OUTPUT_DIR + "cctyper/{batch}/complete", - parser=OUTPUT_DIR + "cctyper/{batch}/parsed", - output: - crispr_cas=OUTPUT_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", - crisprs_all=OUTPUT_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", - crisprs_near_cas=OUTPUT_DIR + "cctyper/{batch}/crisprs_near_cas-{batch}.tab", - crisprs_orphan=OUTPUT_DIR + "cctyper/{batch}/crisprs_orphan-{batch}.tab", - spacers=OUTPUT_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", - cas_putative=temp( - OUTPUT_DIR + "cctyper/{batch}/cas_operons_putative-{batch}.tab" - ), - cas=OUTPUT_DIR + "cctyper/{batch}/cas_operons-{batch}.tab", - csv=OUTPUT_DIR + "cctyper/{batch}/CRISPR-Cas-{batch}.csv", + cctyper=WORK_DIR + "cctyper/{batch}/complete", + parser=WORK_DIR + "cctyper/{batch}/parsed", + output: + crispr_cas=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", + crisprs_all=WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", + crisprs_near_cas=WORK_DIR + "cctyper/{batch}/crisprs_near_cas-{batch}.tab", + crisprs_orphan=WORK_DIR + "cctyper/{batch}/crisprs_orphan-{batch}.tab", + spacers=WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", + cas_putative=temp(WORK_DIR + "cctyper/{batch}/cas_operons_putative-{batch}.tab"), + cas=WORK_DIR + "cctyper/{batch}/cas_operons-{batch}.tab", + csv=WORK_DIR + "cctyper/{batch}/CRISPR-Cas-{batch}.csv", threads: 1 log: "log/cctyper/collect_{batch}.txt", @@ -354,9 +352,9 @@ find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat { rule extract_crispr_cas_locations: input: - OUTPUT_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", + WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", output: - OUTPUT_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", + WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", threads: 1 log: "log/extract_crispr_cas_location/{batch}.txt", @@ -370,12 +368,12 @@ python bin/create_CCTyper_bedfile.py -i {input} -o {output} > {log} 2>&1 rule extract_crispr_array: input: - batch=INPUT_DIR + "{batch}/", - bed=OUTPUT_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", + batch=WORK_DIR + "assemblies/{batch}/", + bed=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", output: - OUTPUT_DIR + "arrays/{batch}/complete", + WORK_DIR + "arrays/{batch}/complete", params: - out_dir=OUTPUT_DIR + "arrays/{batch}/", + out_dir=WORK_DIR + "arrays/{batch}/", conda: "envs/seqkit.yaml" threads: config["extract_arrays"]["threads"] @@ -399,9 +397,9 @@ touch {output} rule concatenate_all_spacers: input: - expand(OUTPUT_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), + expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), output: - OUTPUT_DIR + "cctyper/all_spacers.fa", + WORK_DIR + "cctyper/all_spacers.fa", threads: 1 log: "log/concatenate_all_spacers.txt", @@ -415,19 +413,19 @@ cat {input} > {output} 2> {log} rule cluster_all_spacers: input: - OUTPUT_DIR + "cctyper/all_spacers.fa", + WORK_DIR + "cctyper/all_spacers.fa", output: clusters=expand( - OUTPUT_DIR + "cctyper/all_spacers-clustered-{cutoff}.clstr", + WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}.clstr", cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], ), spacers=expand( - OUTPUT_DIR + "cctyper/all_spacers-clustered-{cutoff}", + WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}", cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], ), - summary=OUTPUT_DIR + "cctyper/spacer_cluster_summary.tsv", + summary=WORK_DIR + "cctyper/spacer_cluster_summary.tsv", params: - output_dir=OUTPUT_DIR + "cctyper/", + WORK_DIR=WORK_DIR + "cctyper/", log_dir="log/spacer_clustering/", conda: "envs/cdhit.yaml" @@ -440,18 +438,18 @@ rule cluster_all_spacers: """ bash bin/cluster_all_spacers.sh\ {input}\ - {params.output_dir}\ + {params.WORK_DIR}\ {params.log_dir} > {log} 2>&1 """ rule cluster_unique_spacers: input: - OUTPUT_DIR + "cctyper/all_spacers.fa", + WORK_DIR + "cctyper/all_spacers.fa", output: - clusters=OUTPUT_DIR + "cctyper/all_spacers-clustered.clstr", - spacers=OUTPUT_DIR + "cctyper/all_spacers-clustered", - distribution=OUTPUT_DIR + "cctyper/all_spacers-clustered-distribution.tsv", + clusters=WORK_DIR + "cctyper/all_spacers-clustered.clstr", + spacers=WORK_DIR + "cctyper/all_spacers-clustered", + distribution=WORK_DIR + "cctyper/all_spacers-clustered-distribution.tsv", conda: "envs/cdhit.yaml" threads: 1 @@ -473,10 +471,10 @@ plot_len1.pl {output.clusters}\ rule create_crispr_cluster_table: input: - clstr=OUTPUT_DIR + "cctyper/all_spacers-clustered.clstr", - fasta=OUTPUT_DIR + "cctyper/all_spacers.fa", + clstr=WORK_DIR + "cctyper/all_spacers-clustered.clstr", + fasta=WORK_DIR + "cctyper/all_spacers.fa", output: - "data/processed/all_spacers_table.tsv", + OUTPUT_DIR + "all_spacers_table.tsv", conda: "envs/pyfaidx.yaml" threads: 1 @@ -490,12 +488,12 @@ rule create_crispr_cluster_table: rule crispridentify: input: - OUTPUT_DIR + "cctyper/{batch}/subseq", + WORK_DIR + "cctyper/{batch}/subseq", output: - OUTPUT_DIR + "crispridentify/{batch}/complete", + WORK_DIR + "crispridentify/{batch}/complete", params: - out_dir=OUTPUT_DIR + "crispridentify/{batch}", - arrays=OUTPUT_DIR + "cctyper/{batch}", + out_dir=WORK_DIR + "crispridentify/{batch}", + arrays=WORK_DIR + "cctyper/{batch}", conda: "envs/crispridentify.yml" threads: config["crispridentify"]["threads"] @@ -518,21 +516,21 @@ rule crispridentify: rule merge_crispridentify_batches: input: - expand(OUTPUT_DIR + "crispridentify/{batch}/complete", batch=BATCHES), + expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES), params: spacers_crispr=expand( - OUTPUT_DIR + WORK_DIR + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta", batch=BATCHES, ), summary_crispr=expand( - OUTPUT_DIR + WORK_DIR + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv", batch=BATCHES, ), output: - spacers_crispr=OUTPUT_DIR + "crispridentify/all_spacers.fa", - summary_crispr=OUTPUT_DIR + "crispridentify/complete_summary.csv", + spacers_crispr=WORK_DIR + "crispridentify/all_spacers.fa", + summary_crispr=WORK_DIR + "crispridentify/complete_summary.csv", threads: 1 log: "log/merge_crispridentify_batches.txt", @@ -555,16 +553,18 @@ rule merge_crispridentify_batches: rule merge_cctyper_identify: input: - identify=OUTPUT_DIR + "crispridentify/complete_summary.csv", - cctyper=expand(OUTPUT_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES) - output: - "data/processed/all_CRISPRS_with_identify.tab" + identify=WORK_DIR + "crispridentify/complete_summary.csv", + cctyper=expand( + WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES + ), + output: + OUTPUT_DIR + "all_CRISPRS_with_identify.tab", params: tmp1="tmp_file1", - tmp2="tmp_file2" + tmp2="tmp_file2", threads: 1 log: - "log/merge_cctyper_identify" + "log/merge_cctyper_identify", shell: """ first=True @@ -617,11 +617,11 @@ rule merge_cctyper_identify: rule cluster_unique_spacers_crispridentify: input: - OUTPUT_DIR + "crispridentify/all_spacers.fa", + WORK_DIR + "crispridentify/all_spacers.fa", output: - clusters=OUTPUT_DIR + "crispridentify/all_spacers-clustered.clstr", - spacers=OUTPUT_DIR + "crispridentify/all_spacers-clustered", - distribution=OUTPUT_DIR + "crispridentify/all_spacers-clustered-distribution.tsv", + clusters=WORK_DIR + "crispridentify/all_spacers-clustered.clstr", + spacers=WORK_DIR + "crispridentify/all_spacers-clustered", + distribution=WORK_DIR + "crispridentify/all_spacers-clustered-distribution.tsv", conda: "envs/cdhit.yaml" threads: 1 @@ -640,12 +640,13 @@ plot_len1.pl {output.clusters}\ > {output.distribution} """ + rule create_crispr_cluster_table_identify: input: - clstr=OUTPUT_DIR + "crispridentify/all_spacers-clustered.clstr", - fasta=OUTPUT_DIR + "crispridentify/all_spacers.fa", + clstr=WORK_DIR + "crispridentify/all_spacers-clustered.clstr", + fasta=WORK_DIR + "crispridentify/all_spacers.fa", output: - "data/processed/all_spacers_table_identify.tsv", + OUTPUT_DIR + "all_spacers_table_identify.tsv", conda: "envs/pyfaidx.yaml" threads: 1 @@ -656,14 +657,15 @@ rule create_crispr_cluster_table_identify: script: "bin/make_cluster_table_identify.py" + rule concatenate_batches: input: - INPUT_DIR + "{batch}" + WORK_DIR + "assemblies/{batch}", output: - temp(OUTPUT_DIR + "{batch}.fasta") + temp(WORK_DIR + "assemblies/{batch}.fasta"), threads: 1 log: - "log/concatenate_{batch}.txt" + "log/concatenate_{batch}.txt", benchmark: "log/benchmark/concatenate_{batch}.txt" shell: @@ -671,19 +673,20 @@ rule concatenate_batches: cat {input}/*.fa > {output} 2> {log} """ + rule genomad: input: - fasta=OUTPUT_DIR + "{batch}.fasta", + fasta=WORK_DIR + "assemblies/{batch}.fasta", db=config["genomad_database"], output: - aggregated_classification=OUTPUT_DIR + aggregated_classification=WORK_DIR + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", - plasmid_summary=OUTPUT_DIR + plasmid_summary=WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", - virus_summary=OUTPUT_DIR + virus_summary=WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", params: - output_dir=OUTPUT_DIR + "genomad/{batch}/", + work_dir=WORK_DIR + "genomad/{batch}/", conda: "envs/genomad.yaml" threads: config["genomad"]["threads"] @@ -694,38 +697,43 @@ rule genomad: shell: """ genomad end-to-end -t {threads} --cleanup --enable-score-calibration\ - {input.fasta} {params.output_dir} {input.db} > {log} 2>&1 + {input.fasta} {params.work_dir} {input.db} > {log} 2>&1 """ rule collect_genomad_predictions: input: - aggregated_classification=expand(OUTPUT_DIR - + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", - batch = BATCHES), - plasmid_summary=expand(OUTPUT_DIR - + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", - batch = BATCHES), - virus_summary=expand(OUTPUT_DIR - + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", - batch = BATCHES), + aggregated_classification=expand( + WORK_DIR + + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", + batch=BATCHES, + ), + plasmid_summary=expand( + WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", + batch=BATCHES, + ), + virus_summary=expand( + WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", + batch=BATCHES, + ), output: - "data/processed/genomad_predictions.csv" + OUTPUT_DIR + "genomad_predictions.csv", conda: "envs/tidy_here.yaml" threads: 1 log: - "log/collect_genomad_predictions.txt" + "log/collect_genomad_predictions.txt", benchmark: "log/benchmark/collect_genomad_predictions.txt" - script: "bin/collect_genomad_predictions.R" + script: + "bin/collect_genomad_predictions.R" rule jaeger: input: - batch=INPUT_DIR + "{batch}/", + batch=WORK_DIR + "assemblies/{batch}/", output: - OUTPUT_DIR + "jaeger/{batch}/complete", + WORK_DIR + "jaeger/{batch}/complete", conda: "envs/jaeger.yaml" threads: config["jaeger"]["threads"] @@ -745,42 +753,43 @@ touch {output} rule collect_jaeger_batch: input: - OUTPUT_DIR + "jaeger/{batch}/complete" + WORK_DIR + "jaeger/{batch}/complete", output: - OUTPUT_DIR + "jaeger/{batch}/jaeger-{batch}.csv" + WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv", params: - batch="{batch}" + batch="{batch}", conda: "envs/tidy_here.yaml" threads: 1 log: - "log/collect_jaeger_{batch}.txt" + "log/collect_jaeger_{batch}.txt", benchmark: "log/benchmark/collect_jaeger_{batch}.txt" - script: "bin/collect_jaeger_batch.R" + script: + "bin/collect_jaeger_batch.R" rule collect_jaeger_predictions: input: - expand(OUTPUT_DIR + "jaeger/{batch}/jaeger-{batch}.csv", - batch = BATCHES), + expand(WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES), output: - "data/processed/jaeger_predictions.csv" + OUTPUT_DIR + "jaeger_predictions.csv", threads: 1 log: - "log/collect_jaeger_predictions.txt" + "log/collect_jaeger_predictions.txt", benchmark: "log/benchmark/collect_jaeger_predictions.txt" - script: "bin/collect_jaeger_predictions.sh" + script: + "bin/collect_jaeger_predictions.sh" rule spacepharer_spacer_setup: input: - spacers=OUTPUT_DIR + "crispridentify/all_spacers.fa", + spacers=WORK_DIR + "crispridentify/all_spacers.fa", output: - spacer_DB=OUTPUT_DIR + "spacepharer/DB_CRISPR/querysetDB", + spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", params: - tmp_folder=OUTPUT_DIR + "spacepharer/tmpFolder", + tmp_folder=WORK_DIR + "spacepharer/tmpFolder", conda: "envs/spacepharer.yml" threads: 48 @@ -798,10 +807,10 @@ rule spacepharer_spacer_setup: rule spacepharer_phage_setup: output: - phage_DB=OUTPUT_DIR + "spacepharer/phage_DB/targetsetDB", - phage_control_DB=OUTPUT_DIR + "spacepharer/phage_DB/controlsetDB", + phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB", + phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB", params: - tmp_folder=OUTPUT_DIR + "spacepharer/tmpFolder", + tmp_folder=WORK_DIR + "spacepharer/tmpFolder", DB=config["spacepharer_phage_database"] + "*.fasta", conda: "envs/spacepharer.yml" @@ -821,14 +830,14 @@ rule spacepharer_phage_setup: rule spacepharer_phage: input: - spacer_DB=OUTPUT_DIR + "spacepharer/DB_CRISPR/querysetDB", - phage_DB=OUTPUT_DIR + "spacepharer/phage_DB/targetsetDB", - phage_control_DB=OUTPUT_DIR + "spacepharer/phage_DB/controlsetDB", + spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", + phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB", + phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB", output: - result=OUTPUT_DIR + "spacepharer/predicted_phage_matches.tsv", - result_sanitised=OUTPUT_DIR + "spacepharer/predicted_phage_matches_san.tsv", + result=WORK_DIR + "spacepharer/predicted_phage_matches.tsv", + result_sanitised=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv", params: - tmp_folder=OUTPUT_DIR + "spacepharer/tmpFolder", + tmp_folder=WORK_DIR + "spacepharer/tmpFolder", conda: "envs/spacepharer.yml" threads: 48 @@ -848,10 +857,10 @@ rule spacepharer_plasmid_setup: input: DB=config["spacepharer_plasmid_database"] + "sequences.fasta", output: - DB=OUTPUT_DIR + "spacepharer/plasmid_DB/targetsetDB", - control_DB=OUTPUT_DIR + "spacepharer/plasmid_DB/controlsetDB", + DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB", + control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB", params: - tmp_folder=OUTPUT_DIR + "spacepharer/tmpFolder", + tmp_folder=WORK_DIR + "spacepharer/tmpFolder", conda: "envs/spacepharer.yml" threads: 48 @@ -870,14 +879,14 @@ rule spacepharer_plasmid_setup: rule spacepharer_plasmid: input: - phage_DB=OUTPUT_DIR + "spacepharer/plasmid_DB/targetsetDB", - phage_control_DB=OUTPUT_DIR + "spacepharer/plasmid_DB/controlsetDB", - spacer_DB=OUTPUT_DIR + "spacepharer/DB_CRISPR/querysetDB", + phage_DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB", + phage_control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB", + spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", output: - result=OUTPUT_DIR + "spacepharer/predicted_plasmid_matches.tsv", - result_sanitised=OUTPUT_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", + result=WORK_DIR + "spacepharer/predicted_plasmid_matches.tsv", + result_sanitised=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", params: - tmp_folder=OUTPUT_DIR + "spacepharer/tmpFolder", + tmp_folder=WORK_DIR + "spacepharer/tmpFolder", conda: "envs/spacepharer.yml" threads: 48 @@ -892,48 +901,35 @@ rule spacepharer_plasmid: rm -r {params.tmp_folder} >> {log} 2>&1 """ + rule create_spacepharer_table: input: - phage=OUTPUT_DIR + "spacepharer/predicted_phage_matches_san.tsv", + phage=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv", meta_phage=config["spacepharer_phage_database"], - plasmid=OUTPUT_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", - meta_plasmid=config["spacepharer_plasmid_database"] + plasmid=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", + meta_plasmid=config["spacepharer_plasmid_database"], output: - phage="data/processed/phage_matches.tsv", - plasmid="data/processed/plasmid_matches.tsv" + phage=OUTPUT_DIR + "phage_matches.tsv", + plasmid=OUTPUT_DIR + "plasmid_matches.tsv", threads: 1 log: - "log/create_spacepharer_table.txt" - shell: - """ - echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > {output.phage} - while read line; do - ID=$(echo $line | cut -f 2 -d " ") - metadata_match=$(grep -w "$ID" {input.meta_phage}/merged_metadata.tsv | cut -f 2-7) - echo -e "$line $metadata_match" >> {output.phage} - done < {input.phage} - - echo "starting plasmid" - echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\ttaxonomy" > {output.plasmid} - while read line; do - ID=$(echo $line | cut -f 2 -d " ") - nuccore_match=$(grep -w "$ID" {input.meta_plasmid}/nuccore.csv | cut -f 13 -d ",") - taxonomy_match=$(grep -w "^$nuccore_match" {input.meta_plasmid}/taxonomy.csv | cut -f 3 -d ",") - echo -e "$line $taxonomy_match" >> {output.plasmid} - done < {input.plasmid} - """ + "log/create_spacepharer_table.txt", + script: + "bin/create_spacepharer_table.sh" + rule kma_indexing: input: - spacers=OUTPUT_DIR + "crispridentify/all_spacers.fa" + spacers=WORK_DIR + "crispridentify/all_spacers.fa", output: - indexed_spacers=OUTPUT_DIR + "kma/spacer_DB/spacers.name" + indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name", params: - OUTPUT_DIR + "kma/spacer_DB/spacers" + WORK_DIR + "kma/spacer_DB/spacers", conda: "envs/kma.yaml" threads: 12 - log: "log/kma/kma_index.txt" + log: + "log/kma/kma_index.txt", benchmark: "log/benchmark/kma/kma_index.txt" shell: @@ -941,21 +937,22 @@ rule kma_indexing: kma index -i {input.spacers} -o {params} > {log} 2>&1 """ + rule kma: input: - genomes=expand(INPUT_DIR + "{batch}/", batch=BATCHES), - indexed_spacers=OUTPUT_DIR + "kma/spacer_DB/spacers.name" + genomes=expand(WORK_DIR + "assemblies/{batch}/", batch=BATCHES), + indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name", output: - OUTPUT_DIR + "kma/output/CRISPR.frag.gz" + WORK_DIR + "kma/output/CRISPR.frag.gz", params: - output=OUTPUT_DIR + "kma/output/CRISPR", - indexed_spacers=OUTPUT_DIR + "kma/spacer_DB/spacers", - spacers=OUTPUT_DIR + "crispridentify/all_spacers.fa" + output=WORK_DIR + "kma/output/CRISPR", + indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers", + spacers=WORK_DIR + "crispridentify/all_spacers.fa", conda: "envs/kma.yaml" threads: 24 log: - "log/kma/kma.txt" + "log/kma/kma.txt", benchmark: "log/benchmark/kma/kma.txt" shell: @@ -967,13 +964,14 @@ rule kma: rm tmp_file all_genomes.txt """ + rule collect_kma: - input: - OUTPUT_DIR + "kma/output/CRISPR.frag.gz" + input: + WORK_DIR + "kma/output/CRISPR.frag.gz", output: - OUTPUT_DIR + "kma/CRISPR_alignment" + WORK_DIR + "kma/CRISPR_alignment", log: - "log/kma/collect_kma.txt" + "log/kma/collect_kma.txt", benchmark: "log/benchmark/kma/collect_kma.txt" shell: diff --git a/bin/collect_genomad_predictions.R b/bin/collect_genomad_predictions.R index 154e4da..0cc783e 100644 --- a/bin/collect_genomad_predictions.R +++ b/bin/collect_genomad_predictions.R @@ -2,8 +2,10 @@ # Find output files for geNomad and create one overall prediction report. -library(tidyverse) -library(here) +suppressPackageStartupMessages({ + library(tidyverse) + library(here) +}) genomad_scores_files <- snakemake@input[["aggregated_classification"]] genomad_plasmid_files <- snakemake@input[["plasmid_summary"]] diff --git a/bin/collect_jaeger_batch.R b/bin/collect_jaeger_batch.R index 7a36b21..167e0aa 100644 --- a/bin/collect_jaeger_batch.R +++ b/bin/collect_jaeger_batch.R @@ -3,8 +3,10 @@ # Concatenate Jaeger output files per batch while selecting only the most # relevant, easy-to-use columns. -library(here) -library(tidyverse) +suppressPackageStartupMessages({ + library(tidyverse) + library(here) +}) # Read all output files in a batch (one per genome) jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv")) diff --git a/bin/create_spacepharer_table.sh b/bin/create_spacepharer_table.sh new file mode 100644 index 0000000..888b286 --- /dev/null +++ b/bin/create_spacepharer_table.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +## this script uses the output from the two spacepharer matches created from the spacepharer rules. This script assumes that Phagescope and PLSDB are the used databases. +## Overall the script takes every match and extracts the ID which is directly taking from the database and then tries to match this back to the metadata and attach this to the match. + +#spacepharer does not include its own column names and expected metadata column names are also manually added +echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > ${snakemake_output[phage]} +while read line; do + ID=$(echo $line | cut -f 2) + metadata_match=$(grep -w "$ID" ${snakemake_input[meta_phage]}/merged_metadata.tsv | cut -f 2-7) + echo -e "$line\t$metadata_match" >> ${snakemake_output[phage]} +done < ${snakemake_input[phage]} + +echo "starting plasmid" +echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\ttaxonomy" > ${snakemake_output[plasmid]} +while read line; do + ID=$(echo $line | cut -f 2) + #PLSDB uses a metadata system where there are many different files for differing purposes. taxonomy.csv uses a different ID so the taxonomy ID needs to be collected from nuccore and then matched to taxonomy + nuccore_match=$(grep -w "$ID" ${snakemake_input[meta_plasmid]}/nuccore.csv | cut -f 13 -d ",") + taxonomy_match=$(grep -w "^$nuccore_match" ${snakemake_input[meta_plasmid]}/taxonomy.csv | cut -f 3 -d ",") + echo -e "$line\t$taxonomy_match" >> ${snakemake_output[plasmid]} +done < ${snakemake_input[plasmid]} diff --git a/bin/download_ATB_metadata.sh b/bin/download_ATB_metadata.sh index 995b46c..62e9b38 100755 --- a/bin/download_ATB_metadata.sh +++ b/bin/download_ATB_metadata.sh @@ -42,4 +42,7 @@ download_if_not_present ${output_dir}checkm2.tsv.gz https://osf.io/download/289f download_if_not_present ${output_dir}species_calls.tsv.gz https://osf.io/download/7t9qd/ # Get a list of all files: +download_if_not_present ${output_dir}all_atb_files.tsv https://osf.io/download/r6gcp/ + +# And a list with sample - batch matches download_if_not_present ${output_dir}file_list.all.20240805.tsv.gz https://osf.io/download/dw3h7 diff --git a/bin/download_bakta_annotations.sh b/bin/download_bakta_annotations.sh new file mode 100644 index 0000000..b2ebc02 --- /dev/null +++ b/bin/download_bakta_annotations.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n' + +# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/ + +part=${1:-"update"} + +if [ "${part}" == "update" ] +then + download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv) + +elif [ "${part}" == "original" ] +then + download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv) + +elif [ "${part}" == "all" ] +then + download_list=$(cat data/ATB/batches_to_download.tsv) + +else + download_list="" + echo "Unknown argument provided! Please use 'all', 'original', or 'update'." + echo "Or none to use the default (=update)." +fi + +for line in ${download_list} +do + filename=$(echo ${line} | cut -f 1 | sed -e 's/assembly/bakta/') + url=$(grep ${filename} data/ATB/all_atb_files.tsv | cut -f 4) + checksum=$(grep ${filename} data/ATB/all_atb_files.tsv | cut -f 5) + echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}" + + mkdir -p data/tmp/ATB + + outputfile="data/tmp/ATB/${filename}" + + # If the output file is not a file of size greater than zero + if [ ! -s ${outputfile} ] + then + # Then download it + wget -O ${outputfile} ${url} + else + echo "${outputfile} has been downloaded before!" + fi + + # Check the md5sum + if echo ${checksum} ${outputfile} | md5sum -c --status; + then + echo "OK: md5sum for ${outputfile} is correct!" + else + # If it is wrong, delete the file + echo "BAD: md5sum for ${outputfile} is incorrect... Deleting file" + rm ${outputfile} + fi + + # Extract the batch number from the file name + batchdir="data/tmp/annotations" + mkdir -p ${batchdir} + + # If the batch directory has not been made yet + if [ ! -d "${batchdir}${outputfile/.tar.xz/}" ] + then + echo "Extracting ${outputfile}!" + + # Decompress the XZ archive and send the output to the specified directory. + tar -Jxf ${outputfile} -C ${batchdir} + + else + echo "Bakta files have been extracted previously!" + fi +done diff --git a/bin/download_genomes.sh b/bin/download_genomes.sh index f3caf7f..09b9153 100644 --- a/bin/download_genomes.sh +++ b/bin/download_genomes.sh @@ -3,6 +3,16 @@ set -euo pipefail IFS=$'\n' # Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# This script does: +# 1. Look for the batches of interest, +# 2. download them from ATB, +# 3. check their md5 checksum (file integrity) +# 4. extract to 'data/tmp/assemblies' + +# Note: If the md5sum does not match (for example, the file is incomplete or missing), +# this script does not download the archive again. You would have to re-run the script +# to download missing batches. (It checks file presence before downloading, so you +# will not download complete/correct files again.) part=${1:-"update"} @@ -52,19 +62,16 @@ do fi # Extract the batch number from the file name - batchnumber=$(basename ${outputfile} | cut -f 6 -d '.') - batchdir="data/tmp/ATB/batch_${batchnumber}" + batchdir="data/tmp/assemblies/" + mkdir -p ${batchdir} # If the batch directory has not been made yet - if [ ! -d ${batchdir} ] + if [ ! -d "${batchdir}${outputfile/.tar.xz/}" ] then echo "Extracting ${outputfile}!" - mkdir -p ${batchdir} - - # Decompress the XZ archive, send the output to the specified directory, - # and strip the leading directory from within the XZ archive. - tar -Jxf ${outputfile} -C ${batchdir} --strip-components 1 + # Decompress the XZ archive and send the output to the specified directory. + tar -Jxf ${outputfile} -C ${batchdir} else echo "Fasta files have been extracted previously!" diff --git a/bin/extract_crispr-cas_from_fasta.sh b/bin/extract_crispr-cas_from_fasta.sh index 1414625..a012e77 100644 --- a/bin/extract_crispr-cas_from_fasta.sh +++ b/bin/extract_crispr-cas_from_fasta.sh @@ -4,7 +4,7 @@ cctyper_dir=$1 sample_name=$(basename ${cctyper_dir}) batch_name=$(basename $(dirname ${cctyper_dir})) -fasta_file="data/tmp/ATB/${batch_name}/${sample_name}.fa" +fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa" bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") ) diff --git a/bin/prepare_genomes.sh b/bin/prepare_genomes.sh index 0200c98..a7b4329 100644 --- a/bin/prepare_genomes.sh +++ b/bin/prepare_genomes.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +set -euo pipefail +IFS=$'\n' + +# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/ ## Prepare genome sequences for CRISPR analysis ## @@ -42,22 +46,62 @@ echo "----------" # and cut column 1 which contains the accession IDs echo "Step 2: extracting sample accession IDs of species of interest" species_samples_file="${output_dir}all_samples_of_interest.txt" + +echo -e "Species of interest:\n$(cat config/species_of_interest.txt)" + +total_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | wc -l) +hq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l) +lq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l) + +echo -e "Total_genomes\t${total_genomes}" > data/tmp/total_genomes_of_interest.tsv +echo -e "High-quality_genomes\t${hq_genomes}" >> data/tmp/total_genomes_of_interest.tsv +echo -e "Low-quality_genomes\t${lq_genomes}" >> data/tmp/total_genomes_of_interest.tsv + +echo "ATB contains ${total_genomes} genomes of your species of interest." +echo "Of those, ${lq_genomes} are labeled as low-quality, which are not included for further analyses." +echo "That means, ${hq_genomes} are available to work with." + +echo +echo "Also see the file data/tmp/total_genomes_of_interest.tsv to see these" +echo "numbers in table form (tab-separated values)." +echo + +# Use no further grep options to match anything that contains the species names, +# including subspecies and lineages. Exclude low-quality 'HQ field == F'. zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz |\ grep -v -e "F$" | cut -f 1 > ${species_samples_file} echo "Done extracting sample names!" ls -lh ${species_samples_file} -echo "Contains: $(zless ${species_samples_file} | wc -l) entries" +echo "Contains: $(wc -l ${species_samples_file}) entries" echo "----------" # Collect the number of genomes included for each species of interest zgrep -f ${species_samples_file} -w ${output_dir}species_calls.tsv.gz |\ cut -f 2 | sort | uniq -c | sort -nr > ${output_dir}number_of_genomes_per_species.txt +# Count the number of genomes of interest in the 'original' and 'update' releases: +original_genomes=$(zgrep -w -f ${species_samples_file}\ + ${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\ + grep -c "r0.2") +update_genomes=$(zgrep -w -f ${species_samples_file}\ + ${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\ + grep -c "incr_release") + +echo -e "Genomes_in_original_release\t${original_genomes}" >> data/tmp/total_genomes_of_interest.tsv +echo -e "Genomes_in_update_release\t${update_genomes}" >> data/tmp/total_genomes_of_interest.tsv + ## Step 3: Filter metadata to the species of interest echo "Step 3: Filtering metadata for species of interest" -zgrep -w -f ${species_samples_file} ${output_dir}ena_metadata.20240801.tsv.gz |\ - gzip > ${output_dir}ena_metadata.20240801-filtered.tsv.gz +# Include the header for simpler processing in e.g. R: +zless ${output_dir}ena_metadata.20240801.tsv.gz | head -1 | gzip >\ + ${output_dir}ena_metadata.20240801-filtered.tsv.gz || true +# N.B. the 'or true' is inserted to avoid the non-zero exit status of head + +# Use 'grep -w' to match only whole words; don't match longer accession numbers that +# contain the sample accession of interest! +zgrep -w -f "${species_samples_file}" "${output_dir}ena_metadata.20240801.tsv.gz" |\ + gzip >> ${output_dir}ena_metadata.20240801-filtered.tsv.gz echo "Done filtering!" ls -lh ${output_dir}ena_metadata.20240801-filtered.tsv.gz @@ -76,30 +120,64 @@ echo -e "Filename\tDownload_URL\tmd5sum" head -3 ${output_dir}batches_to_download.tsv echo "----------" +## Step 4.2: Find out what genomes are in these batches and make a list of +# genomes to exclude (low-quality and other species). +# To get there, first collect all samples in the batches to download: +zgrep -f ${output_dir}batches_to_download.tsv ${output_dir}file_list.all.20240805.tsv.gz |\ + cut -f 1 > ${output_dir}samples_in_batches.txt + +# Now take out the samples of interest: +grep -v -w -f ${species_samples_file} ${output_dir}samples_in_batches.txt >\ + ${output_dir}samples_not_of_interest.txt + +# And for these 'not of interest genomes', give a summary of their +# numbers: +zgrep -w -f ${output_dir}samples_not_of_interest.txt\ + ${output_dir}file_list.all.20240805.tsv.gz | cut -f 2 | sort | uniq -c | sort -nr >\ + data/tmp/other_genomes-numbers.txt + +# And get the list of batches + sample accessions, to facilitate removal: +zgrep -w -f ${output_dir}samples_not_of_interest.txt\ + ${output_dir}file_list.all.20240805.tsv.gz |\ + cut -f 4 > data/tmp/samples_to_remove.txt + +echo "In the batches to download are $(wc -l ${output_dir}samples_not_of_interest.txt)" +echo "samples that have low-quality genomes or species other than the species of interest." +echo "These are summarised in:" +ls -lh ${output_dir}samples_not_of_interest.txt data/tmp/other_genomes-numbers.txt data/tmp/samples_to_remove.txt + ## Step 5: Download genome sequences echo "Step 5: Download genome sequences" bash bin/download_genomes.sh ${part} echo "Finished downloading!" -echo "The batches have been written to 'data/tmp/ATB/batch_*'" +echo "The batch archives have been written to 'data/tmp/ATB/'" +echo "and their contents extracted to 'data/tmp/assemblies/'" echo "----------" -# Step 6: Remove genomes of other species +## Step 6: Remove genomes of other species echo "Step 6: remove genomes of species other than species of interest" -echo "Files before filtering:" -for batch in data/tmp/ATB/batch_* +for fasta in $(cat data/tmp/samples_to_remove.txt) do - echo "$(basename ${batch}):" - ls ${batch} | wc -l +# Verbose remove: tell what is being removed + rm -fv "data/tmp/assemblies/${fasta}" done +echo "----------" -bash bin/remove_other_species.sh +## Step 7: Download functional annotations (Bakta) +echo "Step 7: download functional (gene) annotations" +bash bin/download_bakta_annotations.sh ${part} +echo "Finished downloading!" +echo "The batches have been downloaded to 'data/tmp/ATB/' and extracted in 'data/tmp/annotations'" +echo "----------" -echo "Files after filtering:" -for batch in data/tmp/ATB/batch_* +## Step 7.1: Also remove annotation files for non-of-interest samples +# 1: adjust the file basename from 'assembly' to 'bakta' +echo "Step 7.1: remove genome annotations of other/low-quality samples" +for file in $(sed 's|atb.assembly.|atb.bakta.|g' data/tmp/samples_to_remove.txt) do - echo "$(basename ${batch}):" - ls ${batch} | wc -l + # 2: add 'annotations' subdirectory + bakta_dir="data/tmp/annotations/" + # 3: exchange '.fa' extension to 'bakta.json' + json="${bakta_dir}${file/.fa/.bakta.json}" + rm -fv ${json} done -rmdir --ignore-fail-on-non-empty data/tmp/ATB/batch_* -echo "----------" -echo "Genome files have been prepared!" diff --git a/bin/remove_other_species.sh b/bin/remove_other_species.sh deleted file mode 100644 index 18e7237..0000000 --- a/bin/remove_other_species.sh +++ /dev/null @@ -1,31 +0,0 @@ -#! /usr/bin/env bash -set -euo pipefail -IFS=$'\n' - -downloaded_samples="data/tmp/all_downloaded_samples.txt" -other_species_samples="data/tmp/other_species-samples.txt" - -# Get all accession IDs from all batches -for batch in data/tmp/ATB/batch_* -do - ls ${batch} | sed 's/.fa//g' -done > ${downloaded_samples} - -# Compare the accession IDs with those from your species of interest - -# saving only the other ones (not of interest) -comm -23 <(sort ${downloaded_samples}) <(sort data/ATB/all_samples_of_interest.txt) > ${other_species_samples} - -echo -e "Removing $(wc -l ${other_species_samples}) fasta files..." - -# Use the list of non-Campylobacters to remove unnecessary fasta files -while read sample; -do - rm data/tmp/ATB/batch_*/${sample}.fa -done < ${other_species_samples} - -# For the curious: make a list of species that were downloaded -zgrep -f ${other_species_samples} -w data/ATB/species_calls.tsv.gz |\ - tee data/tmp/other_species_calls.tsv |\ - cut -f 2 | sort | uniq -c | sort -nr > data/tmp/other_species_numbers.txt - -echo "Done!" diff --git a/config/parameters.yaml b/config/parameters.yaml index de289d0..7686a41 100755 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -2,8 +2,8 @@ ### 1. Input/output parameters ### -input_directory: "data/tmp/ATB/" -output_directory: "data/tmp/" +working_directory: "data/tmp/" +output_directory: "data/processed/" genomad_database: "data/genomad_db/" spacepharer_phage_database: "data/raw/phagescope/" diff --git a/doc/CRISPR_refinement.md b/doc/CRISPR_refinement.md index 8d297d1..a820ebf 100644 --- a/doc/CRISPR_refinement.md +++ b/doc/CRISPR_refinement.md @@ -53,11 +53,47 @@ this two-step approach. ## 2.2 Re-evaluation of the CRISPR arrays -(Brief description of how CRISPRidentify works) +Using the arrays parsed from CCTyper, CRISPRidentify has two seperate steps to +come to its final conclusion: Candidate generation and Candidate evaluation. +In Candidate generation, CRISPRidentify uses Vmatch to find putative repeat pairs, +which by default need to be between 21 and 55 nucleotides long and 18-78 nucleotides apart. +This process is relatively sensitive and usually more than one repeat candidate is generated. +All the repeat candidates are then aligned. This alignment is created from a +maximum element and a minimum element. The maximum element is the largest repeat +string generated from the most common nucleotides in each base of all candidates. +The minimum element is generated from the most common substring of all repeats, +this also by definition has 100% identity as a substring of the maximum element. + +Every possible repeat is then generated between the maximum and minimum element +and put alongside the matches found by Vmatch and has duplicates filtered out, +forming the set of repeat candidates. This set of candidates is then even further +extended by omitting up to 3 nucleotides on each side of the repeats, generating +an additional 15 candidates per repeat. In essence, many possible repeats are +considered for analysis. + +CRISPR array candidates are created by string searching the different repeats +in the provided sequence and attempts to minimise the number of editing +operations needed for the consensus repeat, while still allowing mutations +in the repeats to be detected. ## 2.3 Calculating CRISPR confidence -(Brief description of what the machine learning thing does) +After all CRISPR array candidates are generated, they are evaluated by +CRISPRidentify's internal scoring system. This scoring system was created +by considering 13 features that can predict array viability in multiple ways. +The 13 features are listed in +[Supplementary file 1](https://academic.oup.com/nar/article/49/4/e20/6027817?login=false#supplementary-data), +table S2 (page 20). +Performing feature subset selection on all combinations of these 13 features, +three models containing 8, 9 and 10 of the 13 features achieved similar accuracy. +By default, CRISPRidentify uses the average of these three models to score the candidate arrays. + +The scoring is divided into three possible categories. +0-0.4 are low scoring candidates which are unlikely to be CRISPR. +0.4-0.75 are possible candidate CRISPR arrays. +0.75-1.0 are Bona-Fide CRISPR arrays and are very likely to be valid CRISPR arrays. +In cases where CRISPR array candidates are overlapping but both Bona-Fide, +the lower scoring arrays are instead put into the alternative candidate category. ## 2.4 Combining the output with CCTyper's diff --git a/doc/extra_functions.md b/doc/extra_functions.md index 203b33e..3c822b9 100644 --- a/doc/extra_functions.md +++ b/doc/extra_functions.md @@ -38,3 +38,10 @@ be used to identify 'free' phages as well as integrated prophages. We map the identified CRISPR spacers back to the input genomes and use these plasmid/phage predictions to estimate the targets of the CRISPRs. + +## Mapping spacers back to input genomes + +... +By using the KMA flag `-hmm`, the output files (`*.frag.gz`) become easier +to interpret: it add columns with (1) target name, (2) start and (3) stop +positions. With this option enabled, KMA "uses a HMM to assign template". diff --git a/doc/prepare_genomes.md b/doc/prepare_genomes.md index 06ba45c..3eabeff 100644 --- a/doc/prepare_genomes.md +++ b/doc/prepare_genomes.md @@ -23,10 +23,12 @@ And it does the following things: 4. [Identify the right batches](#4-find-batches-that-contain-species-of-interest) -5. [Download genome batches](#5-download-the-genomes) +5. [Download genomes in batches](#5-download-the-genomes) 6. [Remove non-target species](#6-remove-other-species) +7. [Download functional annotations in batches](#7-download-functional-annotations) + ## 1. Download ATB metadata The script starts by calling a separate script: `bin/download_ATB_metadata.sh`. @@ -47,9 +49,11 @@ This script downloads: 6. Species calls (an interpretation of the Sylph and CheckM2 output) - three columns: accession ID, species name and high-quality (T/F) -7. A list of all files - - this is actually a table with accession IDs, species name, which +7. Two lists of 'all' files + - One is actually a table with accession IDs, species name, which ATB batch file they are part of, batch MD5 checksum and file size + - The other has no sample accessions, but lists all 'projects' within + ATB along with its respective filename, URL, MD5 checksum and filesize. ### Output files @@ -65,6 +69,7 @@ You then get: 6 checkm2.tsv.gz 7 species_calls.tsv.gz 8 file_list.all.20240805.tsv.gz +9 all_atb_files.tsv ``` ## 2. Look up accession IDs of species of interest @@ -111,6 +116,13 @@ the species of interest, the script: 5. and stores this in a separate file: `data/ATB/batches_to_download.tsv` +Next to the genomes of interest, these batches also contain lower-quality +genomes and sometimes different species are put together in a batch. +To remove these 'not of interest'-genomes and exclude them from further +analyses, the script also makes a list of their sample accession IDs so +that they can be removed after downloading. +Also see [step 6](#6-remove-other-species). + ## 5. Download the genomes The actual downloading of the genomes happens here! :material-file-download: @@ -153,32 +165,40 @@ AllTheBacteria has its own Some batches may contain more than only the species of interest. In those cases, FASTA files that contain genomes from other species are deleted. -To do this, the final script `bin/remove_other_species.sh` is called. -This script compares the accession IDs (samples) of the species of interest -with the accession IDs present in the downloaded batches to -compose a list of samples with other species (`data/tmp/other_species-samples.txt`). -It counts how many samples have other species and then the corresponding -FASTA files are removed. -For curious users, the other species names and numbers are stored as: -`data/tmp/other_species_calls.tsv` and `data/tmp/other_species_numbers.txt`. +This is based on sample accession IDs listed for each batch: +accessions that do not match high-quality genomes of the species of interest +are automatically removed. +For the curious, a list of the other genomes is stored as: +`data/tmp/other_genomes-numbers.txt`. +These samples are removed and this list shows what was in there. + +## 7. Download functional annotations + +For each genome assembled in ATB, functional (gene) annotations have also +been generated using [Bakta](https://github.com/oschwengers/bakta). +ATB stores only the JSON files, also wrapped in batches, and the download +script also automatically downloads them so they may be used in your +analyses! ## General remarks - The whole process is set up as Bash script, and uses GNU tools such as `grep` and `wget`. This should work on most Unix-like systems. -- The script can use a command-line option to donwload genomes from the +- The script can use a command-line option to download genomes from the complete ATB, only the first version, or the incremental update. For this, it uses options 'all', 'original' or 'update', respectively. By default, the script uses 'update', which has the smallest file sizes. -The option can be provided as: +The option can be provided as, for example: ```bash bash bin/prepare_genomes.sh all ``` -This option is most relevant to `bin/download_genomes.sh`. +This option is most relevant to `bin/download_genomes.sh` [(Step 5)](#5-download-the-genomes) +and `bin/download_bakta_annotations.sh` +[(step 7)](#7-download-functional-annotations). - The script reads whether files exist already. If they are already there, they will not be downloaded again. diff --git a/results/Descriptive_statistics.qmd b/results/Descriptive_statistics.qmd index 8b56e9a..14bae32 100644 --- a/results/Descriptive_statistics.qmd +++ b/results/Descriptive_statistics.qmd @@ -201,12 +201,12 @@ The output of CCTyper only contains sample accession IDs that can be used to identify samples. Import a minimal set of ENA metadata and species identifications by Sylph: -```{r import_metadata} -#| fig-width: 5 -#| fig-height: 1 +```{r} #| label: fig-number_of_species -#| fig-cap: "Number of species present in dataset" -#| message: false +#| message: true +#| fig-cap: Number of species present in dataset +#| fig-height: 1 +#| fig-width: 5 campylobacter_metadata_file <- here("data/processed/Campylobacter_ATB_metadata.tsv") @@ -218,52 +218,89 @@ if(file.exists(campylobacter_metadata_file)) { } else { message("Creating Campylobacter metadata file...") - total_campylobacter_metadata <- read_delim( - here("data/ATB/ena_metadata.20240801.selection-only_Campylobacter.tsv.gz"), - delim = "\t", - col_names = F, - show_col_types = F) - - older_campylobacter_metadata <- read_delim( - here("data/ATB/ena_metadata.0.2.20240606.selection-only_Campylobacter.tsv.gz"), + metadata_df <- read_delim( + here("data/ATB/ena_metadata.20240801-filtered.tsv.gz"), delim = "\t", col_names = F, show_col_types = F) - # Remove the older entries to keep only the Campylobacter genomes - # from the incremental update - campylobacter_metadata <- total_campylobacter_metadata %>% - filter(! X2 %in% older_campylobacter_metadata$X1) - # Take the column names from the table before grepping - colnames(campylobacter_metadata) <- read_delim( - file = here("data/ATB/ena_metadata.20240801.selection.tsv.gz"), + colnames(metadata_df) <- read_delim( + file = here("data/ATB/ena_metadata.20240801.tsv.gz"), n_max = 1, delim = "\t", show_col_types = F ) %>% colnames() # Remove columns that contain only NA values - campylobacter_metadata <- campylobacter_metadata %>% + metadata_df <- metadata_df %>% select_if(~sum(!is.na(.)) > 0) + # Filter metadata according to AllTheBacteria selection criteria + metadata_df <- metadata_df %>% + filter( + instrument_platform == "ILLUMINA" & + library_strategy == "WGS" & + library_source == "GENOMIC" & + library_layout == "PAIRED" + ) + + # Lookup samples contained in current folder + current_sample_files <- list.files( + here("data/tmp/ATB"), + recursive = T, include.dirs = F + ) + current_samples <- current_sample_files[ + current_sample_files %>% grepl( + pattern = "batch_[0-9]*\\/.*\\.fa$", x = .) + ] %>% + gsub(pattern = "batch_[0-9]*\\/", replacement = "", x = .) %>% + gsub(pattern = "\\.fa", replacement = "", x = .) + + metadata_df <- metadata_df %>% + filter(sample_accession %in% current_samples) + + # Match metadata with species calls simplified_species <- read_delim( file = here("data/ATB/sylph.tsv.gz"), delim = "\t", show_col_types = F) %>% select(Sample, Species) - campylobacter_metadata <- left_join( - x = campylobacter_metadata, - y = simplified_species, - by = c("sample_accession" = "Sample") - ) - # Remove all genomes whose taxonomic annotation (species) does - # not start with 'Campylobacter' - campylobacter_metadata <- campylobacter_metadata %>% + # not start with 'Campylobacter_D [jejuni/coli]' + simplified_species <- simplified_species %>% filter(grepl("Campylobacter_D jejuni|Campylobacter_D coli", Species)) + campylobacter_metadata <- inner_join( + x = simplified_species, + y = metadata_df, + by = c("Sample" = "sample_accession"), + multiple = "last" + ) %>% + distinct() + + ### NOTE: By picking the last sample from the metadata, I circumvent + # the 'issue' of having multiple runs per sample. This happens when there + # are multiple runs from the same sample, e.g. SAMN04579139. + # These samples may be very interesting to analyse, but would have to be + # downloaded and assembled separate from ATB, as ATB has only one assembly + # per sample accession! + # Since ATB does not mention this explicitly, and downloads paired-end + # reads for each run accession, I assume they would just download each + # run one by one and assemble them, so that in the end the last one + # will overwrite the 'sample' assembly. + # + # This issue has been raised on GitHub: + # https://github.com/AllTheBacteria/AllTheBacteria/issues/35 + # and a file has been provided to match samples with runs. + # Use this if necessary. + # run_lookup <- read_delim(file = here("data/ATB/status.202408.plus_runs.tsv.gz")) %>% select(Sample, Run) %>% filter(Run != "unknown") + # Another part, however, is in a separate file: + # run_lookup_2 <- read_delim(file = here("data/ATB/File9_all_metadata_ena_661K.txt.gz")) %>% select(sample_id, run_accession) + # (In this part, multiple runs seem to have been assembled together, + # making things more complicated.) + write_delim(x = campylobacter_metadata, file = campylobacter_metadata_file, delim = "\t") @@ -298,29 +335,6 @@ stacked_species_plot <- ggplot(data = campylobacter_species_numbers, stacked_species_plot ``` -```{r new_metadata_import} -# Read metadata without column names -test_metadata <- read_delim(here("data/ATB/ena_metadata.20240801-filtered.tsv.gz"), - col_names = F) - -# Collect the column names from the original metadata file -test_metadata_colnames <- read.table( - here("data/ATB/ena_metadata.20240801.tsv.gz"), - nrows = 1 -) - -current_samples <- list.files( - here("data/tmp/ATB/"), - recursive = T, include.dirs = F -) - -# Insert column names -colnames(test_metadata) <- test_metadata_colnames[1,] - -# Remove columns with only NA values -test_metadata <- test_metadata[colSums(!is.na(test_metadata)) > 0] -``` - # CRISPR arrays per genome {#crispr-arrays-per-genome} We have identified CRISPRs in all *Campylobacter jejuni* and *C. coli*