diff --git a/README.md b/README.md index 2a7dc81..10a6dec 100755 --- a/README.md +++ b/README.md @@ -26,51 +26,41 @@ which includes a quick start guide as well as a detailed step-by-step descriptio ## Release roadmap -- Version 0.1 updates: - - - Updates in documentation of implemented functions and output files -(e.g., expanding documentation on the use of CRISPRidentify) - - - Functions to concatenate and combine existing outputs - - - Code clean-up (moving long commands to separate scripts, -applying standardised formatting, remove unnecessary code) - -- Versions 0.2 and further: - - - New functionality (e.g., all-vs-all genome comparisons) - -## To do - -- [x] Add CRISPRidentify to workflow - - - [x] And make sure it works on a clean install - -- [x] Combine results from CCTyper with CRISPRidentify - -- [ ] Make and/or correct scripts for combining results into 'Output files' (write to `data/processed/`) - - - [x] Concatenate MLST results - - - [x] Enable spacer table creation script in Snakefile (add to `rule all`) - -- [x] Collect and combine results from geNomad and Jaeger - -- [x] Map spacers to genomes and phage/plasmid databases - -- [x] Add PADLOC for identifying other anti-phage systems - -- [ ] Write documentation for output files - -- [x] Rewrite 'Problems encountered' into a rationale for our tool selection (as separate document) - -- [x] Write detailed and technical step-by-step description of the workflow - - - [ ] While reviewing the workflow, remove unnecessary pieces and clean-up where possible - -- [x] Setup MkDocs-powered documentation (at least locally, integrate with GitHub pages later) - -(_Note to self: Remove this list when finished and use issues or roadmap instead!_) +- Version 0.2: tidy code + - remove outdated steps + - move long commands in Snakefile to separate script + - (re)apply linting (Black, Styler) + - apply 'bash strict mode' and suppress R messages + - move parts of Snakefile to separate scripts? + +- Version 0.3: solid foundation + - validate proper functioning of CCTyper + CRISPRidentify + - adjust helper scripts where necessary + - correct scripts for making tables, integrate with Snakemake and test! + - CRISPR spacer table (#24) + - CRISPR-Cas locus table + - (make sure every analysis part produces an output: include in 'rule all') + +- Version 0.4: clear documentation + - review and update README and docs + +- Future additions: + - genome deduplication (dRep) + - CRISPR spacer target prediction + - map to + - masked ATB genomes + - PLSDB + - PhageScope + - VIRE + - MEGAISurv metagenomes + - mini-benchmark different mapping algorithms + - Sassy + - KMA + - SpacePHARER + - (where feasible) connect spacer hits with functional annotations! + - Integrate downstream analyses with Snakemake? + - run RMarkdown/Quarto notebooks automatically + - build a database like [this spacerdb](https://spacers.jgi.doe.gov/database/overview/)? ## Workflow description diff --git a/Snakefile b/Snakefile deleted file mode 100644 index 5751211..0000000 --- a/Snakefile +++ /dev/null @@ -1,987 +0,0 @@ -""" -Author: Sam Nooij -Organisation: Utrecht University -Department: Clinical Infectiology (KLIF), Infectious Diseases & Immunology, - Biomolecular Health Sciences, Faculty of Veterinary Medicine -Date: 2024-11-06 - -Workflow for testing CRISPR analysis options -In contrast to the 'regular' Snakefile workflow, which works -per genome file, this workflow works per batch and runs GNU -parallel to parallelise processing of the genomes within -each batch. - - -Input: Fasta files of Campylobacter whole-genomes -Output: (various) - -Example use: - $ snakemake --profile config - -N.B. Variables are set in the configuration files under `config`. -""" - -from pathlib import Path -import functools -import operator - -### Step 1: Import configuration file ### - - -configfile: Path("config/parameters.yaml") - - -# Use Python functions to automatically detect batches of genomes fasta files -# in the input directory as 'BATCHES' -WORK_DIR = config["working_directory"] - -BATCH_PATHS = list((Path(WORK_DIR) / "assemblies").glob("atb.assembly.*")) -for batch in BATCH_PATHS: - assert Path(batch).is_dir(), f"Batches must be directories, got {batch}" - -BATCHES = [batch.name for batch in BATCH_PATHS] - -OUTPUT_DIR = config["output_directory"] - - -### Step 2: Specify output files ### - - -rule all: - input: - # Multilocus Sequence Types (ST) for Campylobacter - OUTPUT_DIR + "mlst_table.tsv", - # Virus and plasmid predictions per contig - OUTPUT_DIR + "genomad_predictions.csv", - OUTPUT_DIR + "jaeger_predictions.csv", - # Phage defence systems - OUTPUT_DIR + "padloc_table.csv", - # Concatenated CCTyper output - expand( - WORK_DIR + "cctyper/{batch}/{filename}-{batch}.tab", - batch=BATCHES, - filename=[ - "CRISPR_Cas", - "crisprs_all", - "crisprs_near_cas", - "crisprs_orphan", - "cas_operons", - ], - ), - expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), - # Combined CCTyper output as CSV + BED files - expand(WORK_DIR + "cctyper/{batch}/parsed", batch=BATCHES), - # CCTyper CRISPR spacer cluster analysis report - WORK_DIR + "cctyper/spacer_cluster_summary.tsv", - # Cluster unique CRISPR spacers - WORK_DIR + "cctyper/all_spacers-clustered.clstr", - WORK_DIR + "crispridentify/all_spacers-clustered.clstr", - OUTPUT_DIR + "all_spacers_table_identify.tsv", - OUTPUT_DIR + "all_spacers_table.tsv", - # Extracted CRISPR arrays (as fasta) - expand(WORK_DIR + "arrays/{batch}/complete", batch=BATCHES), - #CRISPRidentify output - expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES), - #concatenated CRISPRidentify output - WORK_DIR + "crispridentify/complete_summary.csv", - WORK_DIR + "crispridentify/all_spacers.fa", - #merged CRISPRidentify and CCtyper output - OUTPUT_DIR + "all_CRISPRS_with_identify.tab", - #spacepharer output - OUTPUT_DIR + "phage_matches.tsv", - OUTPUT_DIR + "plasmid_matches.tsv", - #KMA output - WORK_DIR + "kma/output/CRISPR.frag.gz", - WORK_DIR + "kma/CRISPR_alignment", - - -### Step 3: Define processing steps that generate the output ### - - -rule download_mlst_database: - output: - WORK_DIR + "mlst/campylobacter.db", - params: - species=config["mlst"]["species"], - conda: - "envs/pymlst.yaml" - threads: 1 - log: - "log/download_mlst_database.txt", - benchmark: - "log/benchmark/download_mlst_database.txt" - shell: - """ -claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1 - """ - - -rule mlst: - input: - batch=WORK_DIR + "assemblies/{batch}/", - db=WORK_DIR + "mlst/campylobacter.db", - output: - WORK_DIR + "mlst/{batch}/complete", - conda: - "envs/pymlst.yaml" - threads: config["mlst"]["threads"] - log: - "log/mlst/{batch}.txt", - benchmark: - "log/benchmark/mlst/{batch}.txt" - shell: - """ -find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ - parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - claMLST search {input.db} {{}} -o "$(dirname {output})/{{/.}}.txt" > {log} 2>&1 - -touch {output} - """ - - -rule concatenate_mlst_batches: - input: - WORK_DIR + "mlst/{batch}/complete", - output: - WORK_DIR + "mlst/{batch}-concatenated.tsv", - threads: config["mlst"]["threads"] - log: - "log/concatenate_mlst/{batch}.txt", - benchmark: - "log/benchmark/concatenate_mlst/{batch}.txt" - shell: - """ -echo -e "Genome\tST" > {output} -find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\ - parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'tail -n +2 {{}} | cut -f 1-2 >> {output}' - """ - - -rule concatenate_mlst_all: - input: - expand(WORK_DIR + "mlst/{batch}-concatenated.tsv", batch=BATCHES), - output: - OUTPUT_DIR + "mlst_table.tsv", - threads: 1 - log: - "log/concatenate_mlst_all.txt", - benchmark: - "log/benchmark/concatenate_mlst_all.txt" - shell: - """ -batches=( {input} ) -head -1 ${{batches[0]}} > {output} -sed --separate 1d ${{batches[@]}} >> {output} - """ - - -rule crisprcastyper: - input: - batch=WORK_DIR + "assemblies/{batch}/", - output: - WORK_DIR + "cctyper/{batch}/complete", - params: - out_dir=WORK_DIR + "cctyper/{batch}/", - conda: - "envs/cctyper.yaml" - threads: config["cctyper"]["threads"] - log: - "log/cctyper/{batch}.txt", - benchmark: - "log/benchmark/cctyper/{batch}.txt" - shell: - """ -find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ - parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'rm -rf "{params.out_dir}{{/.}}" &&\ - cctyper -t 1 {{}} "{params.out_dir}{{/.}}"' > {log} 2>&1 - -touch {output} - """ - - -rule download_padloc_database: - output: - WORK_DIR + "padloc/database", - conda: - "envs/padloc.yaml" - threads: 1 - log: - "log/download_padloc_database.txt", - benchmark: - "log/benchmark/download_padloc_database.txt" - shell: - """ -padloc --db-install v2.0.0 -touch {output} - """ - - -rule padloc: - input: - batch=WORK_DIR + "assemblies/{batch}/", - db=WORK_DIR + "padloc/database", - output: - WORK_DIR + "padloc/{batch}/complete", - conda: - "envs/padloc.yaml" - threads: config["padloc"]["threads"] - log: - "log/padloc/{batch}.txt", - benchmark: - "log/benchmark/padloc/{batch}.txt" - shell: - """ -find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ - parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'mkdir -p "$(dirname {output})/{{/.}}" && padloc --cpu 1 --fna {{}} --outdir "$(dirname {output})/{{/.}}"' > {log} 2>&1 - -touch {output} - """ - - -rule concatenate_padloc_batches: - input: - WORK_DIR + "padloc/{batch}/complete", - output: - WORK_DIR + "padloc/{batch}-concatenated.csv", - threads: config["padloc"]["threads"] - log: - "log/concatenate_padloc/{batch}.txt", - benchmark: - "log/benchmark/concatenate_padloc/{batch}.txt" - shell: - """ -file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") ) -head -1 ${{file_array[0]}} > {output} -parallel --jobs {threads} --retry-failed --halt='now,fail=1'\ - 'tail -n +2 {{}} >> {output}' ::: ${{file_array[@]}} - """ - - -rule concatenate_padloc_all: - input: - expand(WORK_DIR + "padloc/{batch}-concatenated.csv", batch=BATCHES), - output: - OUTPUT_DIR + "padloc_table.csv", - threads: 1 - log: - "log/concatenate_padloc_all.txt", - benchmark: - "log/benchmark/concatenate_padloc_all.txt" - shell: - """ -batches=( {input} ) -head -1 ${{batches[0]}} > {output} -sed --separate 1d ${{batches[@]}} >> {output} - """ - - -rule parse_cctyper: - input: - WORK_DIR + "cctyper/{batch}/complete", - output: - WORK_DIR + "cctyper/{batch}/parsed", - conda: - "envs/pandas.yaml" - threads: config["parse_cctyper"]["threads"] - log: - "log/parse_cctyper/{batch}.txt", - benchmark: - "log/benchmark/parse_cctyper/{batch}.txt" - shell: - """ -find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\ -parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - python bin/cctyper_extender.py -d {{.}} > {log} 2>&1 - -touch {output} - """ - - -rule extract_sequences: - input: - WORK_DIR + "cctyper/{batch}/parsed", - output: - WORK_DIR + "cctyper/{batch}/subseq", - conda: - "envs/seqkit.yaml" - threads: config["extract_sequences"]["threads"] - log: - "log/extract_sequences/{batch}.txt", - benchmark: - "log/benchmark/extract_sequences/{batch}.txt" - shell: - """ -find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\ -parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - bash bin/extract_crispr-cas_from_fasta.sh {{}} > {log} 2>&1 - -touch {output} - """ - - -rule collect_cctyper: - input: - cctyper=WORK_DIR + "cctyper/{batch}/complete", - parser=WORK_DIR + "cctyper/{batch}/parsed", - output: - crispr_cas=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", - crisprs_all=WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", - crisprs_near_cas=WORK_DIR + "cctyper/{batch}/crisprs_near_cas-{batch}.tab", - crisprs_orphan=WORK_DIR + "cctyper/{batch}/crisprs_orphan-{batch}.tab", - spacers=WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", - cas_putative=temp(WORK_DIR + "cctyper/{batch}/cas_operons_putative-{batch}.tab"), - cas=WORK_DIR + "cctyper/{batch}/cas_operons-{batch}.tab", - csv=WORK_DIR + "cctyper/{batch}/CRISPR-Cas-{batch}.csv", - threads: 1 - log: - "log/cctyper/collect_{batch}.txt", - benchmark: - "log/benchmark/cctyper/collect_{batch}.txt" - shell: - """ -bash bin/concatenate_cctyper_output.sh $(dirname {input.cctyper}) > {log} 2>&1 -echo "\n========================" >> {log} -bash bin/concatenate_cctyper_csv.sh $(dirname {input.parser}) >> {log} 2>&1 - -find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat {{}} + > {output.spacers} 2>> {log} - """ - - -rule extract_crispr_cas_locations: - input: - WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab", - output: - WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", - threads: 1 - log: - "log/extract_crispr_cas_location/{batch}.txt", - benchmark: - "log/benchmark/extract_crispr_cas_location/{batch}.txt" - shell: - """ -python bin/create_CCTyper_bedfile.py -i {input} -o {output} > {log} 2>&1 - """ - - -rule extract_crispr_array: - input: - batch=WORK_DIR + "assemblies/{batch}/", - bed=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed", - output: - WORK_DIR + "arrays/{batch}/complete", - params: - out_dir=WORK_DIR + "arrays/{batch}/", - conda: - "envs/seqkit.yaml" - threads: config["extract_arrays"]["threads"] - log: - "log/extract_crispr_array/{batch}.txt", - benchmark: - "log/benchmark/extract_crispr_array/{batch}.txt" - "" - shell: - """ -cut -f 1 -d '.' {input.bed} | parallel --jobs {threads} --retry-failed\ - --halt='now,fail=1'\ - 'if [ -e "{input.batch}/{{}}.fa" ];\ - then seqkit subseq --bed {input.bed} "{input.batch}/{{}}.fa"\ - -o "{params.out_dir}{{}}.fa";\ - fi' > {log} 2>&1 - -touch {output} - """ - - -rule concatenate_all_spacers: - input: - expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), - output: - WORK_DIR + "cctyper/all_spacers.fa", - threads: 1 - log: - "log/concatenate_all_spacers.txt", - benchmark: - "log/benchmark/concatenate_all_spacers.txt" - shell: - """ -cat {input} > {output} 2> {log} - """ - - -rule cluster_all_spacers: - input: - WORK_DIR + "cctyper/all_spacers.fa", - output: - clusters=expand( - WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}.clstr", - cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], - ), - spacers=expand( - WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}", - cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], - ), - summary=WORK_DIR + "cctyper/spacer_cluster_summary.tsv", - params: - WORK_DIR=WORK_DIR + "cctyper/", - log_dir="log/spacer_clustering/", - conda: - "envs/cdhit.yaml" - threads: 1 - log: - "log/cluster_all_spacers.txt", - benchmark: - "log/benchmark/cluster_all_spacers.txt" - shell: - """ -bash bin/cluster_all_spacers.sh\ - {input}\ - {params.WORK_DIR}\ - {params.log_dir} > {log} 2>&1 - """ - - -rule cluster_unique_spacers: - input: - WORK_DIR + "cctyper/all_spacers.fa", - output: - clusters=WORK_DIR + "cctyper/all_spacers-clustered.clstr", - spacers=WORK_DIR + "cctyper/all_spacers-clustered", - distribution=WORK_DIR + "cctyper/all_spacers-clustered-distribution.tsv", - conda: - "envs/cdhit.yaml" - threads: 1 - log: - "log/cluster_unique_spacers.txt", - benchmark: - "log/benchmark/cluster_unique_spacers.txt" - shell: - """ -cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\ - -i {input} -o {output.spacers} > {log} 2>&1 - -plot_len1.pl {output.clusters}\ - 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\ - 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\ - > {output.distribution} - """ - - -rule create_crispr_cluster_table: - input: - clstr=WORK_DIR + "cctyper/all_spacers-clustered.clstr", - fasta=WORK_DIR + "cctyper/all_spacers.fa", - output: - OUTPUT_DIR + "all_spacers_table.tsv", - conda: - "envs/pyfaidx.yaml" - threads: 1 - log: - "log/create_crispr_cluster_table.txt", - benchmark: - "log/benchmark/create_crispr_cluster_table.txt" - script: - "bin/make_cluster_table.py" - - -rule crispridentify: - input: - WORK_DIR + "cctyper/{batch}/subseq", - output: - WORK_DIR + "crispridentify/{batch}/complete", - params: - out_dir=WORK_DIR + "crispridentify/{batch}", - arrays=WORK_DIR + "cctyper/{batch}", - conda: - "envs/crispridentify.yml" - threads: config["crispridentify"]["threads"] - log: - "log/crispridentify/{batch}.txt", - benchmark: - "log/benchmark/crispridentify/{batch}.txt" - shell: - """ - - cd bin/CRISPRidentify - find ../../{params.arrays}/*/fasta/CRISPR_arrays-with_flanks.fasta -size +0c -print0 | \ - parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ - python CRISPRidentify.py --file {{}} --result_folder "../../{params.out_dir}/{{/.}}" --fasta_report True --strand False > ../../{log} 2>&1 - - touch ../../{output} - - """ - - -rule merge_crispridentify_batches: - input: - expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES), - params: - spacers_crispr=expand( - WORK_DIR - + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta", - batch=BATCHES, - ), - summary_crispr=expand( - WORK_DIR - + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv", - batch=BATCHES, - ), - output: - spacers_crispr=WORK_DIR + "crispridentify/all_spacers.fa", - summary_crispr=WORK_DIR + "crispridentify/complete_summary.csv", - threads: 1 - log: - "log/merge_crispridentify_batches.txt", - benchmark: - "log/benchmark/merge_crispridentify_batches.txt" - shell: - """ - cat {params.spacers_crispr} > {output.spacers_crispr} - for summary in {params.summary_crispr} ; do header=$(head -n 1 "$summary"); if [ "$header" == "No arrays found" ]; - then - continue; - else - echo $header | tee {output.summary_crispr}; - break; - fi - done - for summary in {params.summary_crispr} ; do tail -n +2 "$summary" >> {output.summary_crispr} ; done - """ - - -rule merge_cctyper_identify: - input: - identify=WORK_DIR + "crispridentify/complete_summary.csv", - cctyper=expand( - WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES - ), - output: - OUTPUT_DIR + "all_CRISPRS_with_identify.tab", - params: - tmp1="tmp_file1", - tmp2="tmp_file2", - threads: 1 - log: - "log/merge_cctyper_identify", - shell: - """ - first=True - for summary in {input.cctyper} ; do - if [ $first == True ]; - then - cat $summary > {params.tmp1} - first=False - else - tail -n +2 $summary >> {params.tmp1} - fi - done - - header=$(head -n 1 {input.identify} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t") - tail -n +2 {input.identify} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t" > {params.tmp2} - first=True - while read line; do - if [ $first == True ]; - then - first=False - echo -e "$line\t$header" > {output} - else - sample=$(echo -e "$line" | cut -f 1) - start_cc=$(echo -e "$line" | cut -f 3) - start_id=$(expr "$start_cc" + 1) - match=$(grep "${{sample}}_$start_id" {params.tmp2} || true) - if [ -z "$match" ]; then - echo -e "$line" >> {output} - else - while read line2; do - if [ "$start_cc" -lt 5000 ]; - then - echo -e "$line\t$match" >> {output} - else - start=$(echo -e "$line2" | cut -f 2) - start=$(expr "$start" + "$start_cc" - 5000) - length=$(echo -e "$line2" | cut -f 4) - end=$(expr "$length" + "$start" - 1) - begin=$(echo -e "$line2" | cut -f 1) - rest=$(echo -e "$line2" | cut -f 4-9) - echo -e "$line\t$begin\t$start\t$end\t$rest" >> {output} - fi - done <<< "$match" - fi - fi - done < {params.tmp1} - rm -f {params.tmp1} {params.tmp2} - """ - - -rule cluster_unique_spacers_crispridentify: - input: - WORK_DIR + "crispridentify/all_spacers.fa", - output: - clusters=WORK_DIR + "crispridentify/all_spacers-clustered.clstr", - spacers=WORK_DIR + "crispridentify/all_spacers-clustered", - distribution=WORK_DIR + "crispridentify/all_spacers-clustered-distribution.tsv", - conda: - "envs/cdhit.yaml" - threads: 1 - log: - "log/cluster_unique_spacers_identify.txt", - benchmark: - "log/benchmark/cluster_unique_spacers_identify.txt" - shell: - """ -cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\ - -i {input} -o {output.spacers} > {log} 2>&1 - -plot_len1.pl {output.clusters}\ - 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\ - 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\ - > {output.distribution} - """ - - -rule create_crispr_cluster_table_identify: - input: - clstr=WORK_DIR + "crispridentify/all_spacers-clustered.clstr", - fasta=WORK_DIR + "crispridentify/all_spacers.fa", - output: - OUTPUT_DIR + "all_spacers_table_identify.tsv", - conda: - "envs/pyfaidx.yaml" - threads: 1 - log: - "log/create_crispr_cluster_table_identify.txt", - benchmark: - "log/benchmark/create_crispr_cluster_table_identify.txt" - script: - "bin/make_cluster_table_identify.py" - - -rule concatenate_batches: - input: - WORK_DIR + "assemblies/{batch}", - output: - temp(WORK_DIR + "assemblies/{batch}.fasta"), - threads: 1 - log: - "log/concatenate_{batch}.txt", - benchmark: - "log/benchmark/concatenate_{batch}.txt" - shell: - """ -cat {input}/*.fa > {output} 2> {log} - """ - - -rule genomad: - input: - fasta=WORK_DIR + "assemblies/{batch}.fasta", - db=config["genomad_database"], - output: - aggregated_classification=WORK_DIR - + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", - plasmid_summary=WORK_DIR - + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", - virus_summary=WORK_DIR - + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", - params: - work_dir=WORK_DIR + "genomad/{batch}/", - conda: - "envs/genomad.yaml" - threads: config["genomad"]["threads"] - log: - "log/genomad/{batch}.txt", - benchmark: - "log/benchmark/genomad/{batch}.txt" - shell: - """ -genomad end-to-end -t {threads} --cleanup --enable-score-calibration\ - {input.fasta} {params.work_dir} {input.db} > {log} 2>&1 - """ - - -rule collect_genomad_predictions: - input: - aggregated_classification=expand( - WORK_DIR - + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", - batch=BATCHES, - ), - plasmid_summary=expand( - WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", - batch=BATCHES, - ), - virus_summary=expand( - WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", - batch=BATCHES, - ), - output: - OUTPUT_DIR + "genomad_predictions.csv", - conda: - "envs/tidy_here.yaml" - threads: 1 - log: - "log/collect_genomad_predictions.txt", - benchmark: - "log/benchmark/collect_genomad_predictions.txt" - script: - "bin/collect_genomad_predictions.R" - - -rule jaeger: - input: - batch=WORK_DIR + "assemblies/{batch}/", - output: - WORK_DIR + "jaeger/{batch}/complete", - conda: - "envs/jaeger.yaml" - threads: config["jaeger"]["threads"] - log: - "log/jaeger/{batch}.txt", - benchmark: - "log/benchmark/jaeger/{batch}.txt" - shell: - """ -parallel --jobs {threads} --retry-failed --halt='now,fail=1'\ - jaeger run -p --workers 1 -i {{}} -o $(dirname {output}) --overwrite\ - > {log} 2>&1 ::: {input.batch}/*.fa - -touch {output} - """ - - -rule collect_jaeger_batch: - input: - WORK_DIR + "jaeger/{batch}/complete", - output: - WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv", - params: - batch="{batch}", - conda: - "envs/tidy_here.yaml" - threads: 1 - log: - "log/collect_jaeger_{batch}.txt", - benchmark: - "log/benchmark/collect_jaeger_{batch}.txt" - script: - "bin/collect_jaeger_batch.R" - - -rule collect_jaeger_predictions: - input: - expand(WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES), - output: - OUTPUT_DIR + "jaeger_predictions.csv", - threads: 1 - log: - "log/collect_jaeger_predictions.txt", - benchmark: - "log/benchmark/collect_jaeger_predictions.txt" - script: - "bin/collect_jaeger_predictions.sh" - - -rule spacepharer_spacer_setup: - input: - spacers=WORK_DIR + "crispridentify/all_spacers.fa", - output: - spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", - params: - tmp_folder=WORK_DIR + "spacepharer/tmpFolder", - conda: - "envs/spacepharer.yml" - threads: 48 - log: - "log/spacepharer/spacepharer_spacer_setup.txt", - benchmark: - "log/benchmark/spacepharer/spacepharer_spacer_setup.txt" - shell: - """ - spacer_DB=$(dirname {output.spacer_DB}) - rm -rf $spacer_DB/* > {log} 2>&1 - spacepharer createsetdb {input.spacers} {output.spacer_DB} {params.tmp_folder} --extractorf-spacer 1 --threads {threads} >> {log} 2>&1 - """ - - -rule spacepharer_phage_setup: - output: - phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB", - phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB", - params: - tmp_folder=WORK_DIR + "spacepharer/tmpFolder", - DB=config["spacepharer_phage_database"] + "*.fasta", - conda: - "envs/spacepharer.yml" - threads: 48 - log: - "log/spacepharer/spacepharer_phage_setup.txt", - benchmark: - "log/benchmark/spacepharer/spacepharer_setup.txt" - shell: - """ - phage_DB=$(dirname {output.phage_DB}) - rm -rf $phage_DB/* > {log} 2>&1 - spacepharer createsetdb {params.DB} {output.phage_DB} {params.tmp_folder} --threads {threads} >> {log} 2>&1 - spacepharer createsetdb {params.DB} {output.phage_control_DB} {params.tmp_folder} --reverse-fragments 1 --threads {threads} >> {log} 2>&1 - """ - - -rule spacepharer_phage: - input: - spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", - phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB", - phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB", - output: - result=WORK_DIR + "spacepharer/predicted_phage_matches.tsv", - result_sanitised=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv", - params: - tmp_folder=WORK_DIR + "spacepharer/tmpFolder", - conda: - "envs/spacepharer.yml" - threads: 48 - log: - "log/spacepharer/spacepharer_phage.txt", - benchmark: - "log/benchmark/spacepharer/spacepharer_phage.txt" - shell: - """ - spacepharer predictmatch {input.spacer_DB} {input.phage_DB} {input.phage_control_DB} {output.result} {params.tmp_folder} --threads {threads} > {log} 2>&1 - grep -v "#" {output.result} > {output.result_sanitised} - rm -r {params.tmp_folder} >> {log} 2>&1 - """ - - -rule spacepharer_plasmid_setup: - input: - DB=config["spacepharer_plasmid_database"] + "sequences.fasta", - output: - DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB", - control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB", - params: - tmp_folder=WORK_DIR + "spacepharer/tmpFolder", - conda: - "envs/spacepharer.yml" - threads: 48 - log: - "log/spacepharer/spacepharer_plasmid_setup.txt", - benchmark: - "log/benchmark/spacepharer/spacepharer_plasmid_setup.txt" - shell: - """ - plasmid_DB=$(dirname {output.DB}) - rm -f $plasmid_DB/* > {log} 2>&1 - spacepharer createsetdb {input.DB} {output.DB} {params.tmp_folder} --threads {threads} >> {log} 2>&1 - spacepharer createsetdb {input.DB} {output.control_DB} {params.tmp_folder} --reverse-fragments 1 --threads {threads} >> {log} 2>&1 - """ - - -rule spacepharer_plasmid: - input: - phage_DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB", - phage_control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB", - spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB", - output: - result=WORK_DIR + "spacepharer/predicted_plasmid_matches.tsv", - result_sanitised=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", - params: - tmp_folder=WORK_DIR + "spacepharer/tmpFolder", - conda: - "envs/spacepharer.yml" - threads: 48 - log: - "log/spacepharer/spacepharer_phage.txt", - benchmark: - "log/benchmark/spacepharer/spacepharer_phage.txt" - shell: - """ - spacepharer predictmatch {input.spacer_DB} {input.phage_DB} {input.phage_control_DB} {output.result} {params.tmp_folder} --threads {threads} > {log} 2>&1 - grep -v "#" {output.result} > {output.result_sanitised} - rm -r {params.tmp_folder} >> {log} 2>&1 - """ - - -rule create_spacepharer_table: - input: - phage=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv", - meta_phage=config["spacepharer_phage_database"], - plasmid=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv", - meta_plasmid=config["spacepharer_plasmid_database"], - output: - phage=OUTPUT_DIR + "phage_matches.tsv", - plasmid=OUTPUT_DIR + "plasmid_matches.tsv", - threads: 1 - log: - "log/create_spacepharer_table.txt", - script: - "bin/create_spacepharer_table.sh" - - -rule kma_indexing: - input: - spacers=WORK_DIR + "crispridentify/all_spacers.fa", - output: - indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name", - params: - WORK_DIR + "kma/spacer_DB/spacers", - conda: - "envs/kma.yaml" - threads: 12 - log: - "log/kma/kma_index.txt", - benchmark: - "log/benchmark/kma/kma_index.txt" - shell: - """ - kma index -i {input.spacers} -o {params} > {log} 2>&1 - """ - - -rule kma: - input: - genomes=expand(WORK_DIR + "assemblies/{batch}/", batch=BATCHES), - indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name", - output: - WORK_DIR + "kma/output/CRISPR.frag.gz", - params: - output=WORK_DIR + "kma/output/CRISPR", - indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers", - spacers=WORK_DIR + "crispridentify/all_spacers.fa", - conda: - "envs/kma.yaml" - threads: 24 - log: - "log/kma/kma.txt", - benchmark: - "log/benchmark/kma/kma.txt" - shell: - """ - grep ">" {params.spacers} | cut -f 2 -d ">" | cut -f 1 -d "-" | sort -u > tmp_file - find -L {input.genomes} -mindepth 1 -maxdepth 1 -type f -name "*.fa" > all_genomes.txt - genomes=$(grep -x ".*[0-9]\\.fa" all_genomes.txt | grep -v -f tmp_file) - kma -hmm -i $genomes -o {params.output} -t_db {params.indexed_spacers} > {log} 2>&1 - rm tmp_file all_genomes.txt - """ - - -rule collect_kma: - input: - WORK_DIR + "kma/output/CRISPR.frag.gz", - output: - WORK_DIR + "kma/CRISPR_alignment", - log: - "log/kma/collect_kma.txt", - benchmark: - "log/benchmark/kma/collect_kma.txt" - shell: - """ - echo -e "spacer\tgenome" > {output} - zcat {input} | cut -f 6,7 | cut -f 1 -d " " > tmp_file - while read line; do - match=$(echo $line | cut -f 2) - crispr=$(echo $line | cut -f 1 | cut -f 1,6,7,10,11 -d "_") - echo -e "$crispr\t$match" >> {output} - done < tmp_file - rm tmp_file - """ diff --git a/bin/create_CCTyper_bedfile.py b/bin/create_CCTyper_bedfile.py deleted file mode 100644 index 2ac384c..0000000 --- a/bin/create_CCTyper_bedfile.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 - -# Read the tabular output file 'CRISPR_Cas.tab' from CCTyper -# and convert it to a BED file. This way, seqkit can be easily -# used to extract the CRISPR-Cas array and its flanking regions. - -import argparse - -def parse_arguments(): - """ - Parse argument from the command line: - -i/--input = input file (CRISPR_Cas.tab) - -o/--output = output file (BED file) - -h/--help = show help - """ - parser = argparse.ArgumentParser( - prog="CCTyper_to_bedfile", - description="Convert the CRISPR_Cas.tab output file" - "from CCTyper to a BED file.", - ) - - required = parser.add_argument_group("Required arguments") - - required.add_argument("-i", "--input", - dest = "input", required = True, type = str, - help = "CCTyper CRISPR_Cas.tab file", - ) - required.add_argument("-o", "--output", - dest = "output", required = True, type = str, - help = "BED file to write output to") - - args = parser.parse_args() - - return args - -def convert_tab_to_bed(inputfile, outputfile): - """ - """ - with open(outputfile, 'w') as outfile: - with open(inputfile, 'r') as infile: - # Skip the first line, as it is the header - infile.readline() - for line in infile: - # Split by tab - elements = line.split("\t") - - # Collect required information - contig = elements[0] - locus_tag = elements[1] - start_and_stop = elements[2].split(",") - start = start_and_stop[0].strip(" [") - stop = start_and_stop[1].strip(" ]") - - # Combine in single line - bed_string = "%s\t%s\t%s\t%s\n" % (contig, start, stop, locus_tag) - - # Write to the output file - outfile.write(bed_string) - return 0 - -def main(): - arguments = parse_arguments() - - message = ( - "\n" - "These are the files you have provided:\n" - " INPUT:\n" - "{0}\n" - " OUTPUT:\n" - "{1}".format( - arguments.input, - arguments.output - ) - ) - - print(message) - - exit( - convert_tab_to_bed(inputfile = arguments.input, - outputfile = arguments.output) - ) - -if __name__=="__main__": - exit(main()) \ No newline at end of file diff --git a/bin/download_ATB_metadata.sh b/bin/download_ATB_metadata.sh index 62e9b38..0909864 100755 --- a/bin/download_ATB_metadata.sh +++ b/bin/download_ATB_metadata.sh @@ -1,4 +1,6 @@ #! /usr/bin/env bash +set -euo pipefail +IFS=$'\n' # Read the output directory from the command-line, with # 'data/ATB/' set as default. diff --git a/bin/download_bakta_annotations.sh b/bin/download_bakta_annotations.sh index b2ebc02..c75820b 100644 --- a/bin/download_bakta_annotations.sh +++ b/bin/download_bakta_annotations.sh @@ -5,18 +5,20 @@ IFS=$'\n' # Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/ part=${1:-"update"} +atb_dir=${2:-"resources/ATB/"} +output_dir="${atb_dir}archives/" if [ "${part}" == "update" ] then - download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv) + download_list=$(grep "incr_release" ${atb_dir}/batches_to_download.tsv) elif [ "${part}" == "original" ] then - download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv) + download_list=$(grep -v "incr_release" ${atb_dir}/batches_to_download.tsv) elif [ "${part}" == "all" ] then - download_list=$(cat data/ATB/batches_to_download.tsv) + download_list=$(cat ${atb_dir}/batches_to_download.tsv) else download_list="" @@ -24,6 +26,8 @@ else echo "Or none to use the default (=update)." fi +mkdir -p ${output_dir} + for line in ${download_list} do filename=$(echo ${line} | cut -f 1 | sed -e 's/assembly/bakta/') @@ -31,9 +35,7 @@ do checksum=$(grep ${filename} data/ATB/all_atb_files.tsv | cut -f 5) echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}" - mkdir -p data/tmp/ATB - - outputfile="data/tmp/ATB/${filename}" + outputfile="${output_dir}${filename}" # If the output file is not a file of size greater than zero if [ ! -s ${outputfile} ] @@ -55,7 +57,7 @@ do fi # Extract the batch number from the file name - batchdir="data/tmp/annotations" + batchdir="${atb_dir}/annotations" mkdir -p ${batchdir} # If the batch directory has not been made yet diff --git a/bin/download_genomes.sh b/bin/download_genomes.sh index 09b9153..7665bd2 100644 --- a/bin/download_genomes.sh +++ b/bin/download_genomes.sh @@ -15,22 +15,26 @@ IFS=$'\n' # will not download complete/correct files again.) part=${1:-"update"} +atb_dir=${2:-"resources/ATB/"} +output_dir="${atb_dir}archives/" if [ "${part}" == "update" ] then - download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv) + download_list=$(grep "incr_release" ${atb_dir}/batches_to_download.tsv) elif [ "${part}" == "original" ] then - download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv) + download_list=$(grep -v "incr_release" ${atb_dir}/batches_to_download.tsv) elif [ "${part}" == "all" ] then - download_list=$(cat data/ATB/batches_to_download.tsv) + download_list=$(cat ${atb_dir}/batches_to_download.tsv) else download_list="" echo "Unknown argument provided! Please use 'all', 'original', or 'update'." echo "Or none to use the default (=update)." fi +mkdir -p ${output_dir} + for line in ${download_list} do filename=$(echo ${line} | cut -f 1) @@ -38,9 +42,7 @@ do checksum=$(echo ${line} | cut -f 3) echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}" - mkdir -p data/tmp/ATB - - outputfile="data/tmp/ATB/${filename}" + outputfile="${output_dir}${filename}" # If the output file is not a file of size greater than zero if [ ! -s ${outputfile} ] @@ -62,7 +64,7 @@ do fi # Extract the batch number from the file name - batchdir="data/tmp/assemblies/" + batchdir="${atb_dir}/assemblies/" mkdir -p ${batchdir} # If the batch directory has not been made yet diff --git a/bin/download_spacepharer_database.sh b/bin/download_spacepharer_database.sh deleted file mode 100644 index b1ba51a..0000000 --- a/bin/download_spacepharer_database.sh +++ /dev/null @@ -1,86 +0,0 @@ -#! bin/bash - -## This script downloads the phage and plasmid databases: Phagescope (https://phagescope.deepomics.org/) and PLSDB (https://ccb-microbe.cs.uni-saarland.de/plsdb2025/). -## They are then also extracted and specifically for Phagescope merged into one database for use in Spacepharer. -## -## Note: RUN FROM THE BASE FOLDER AS IT WILL NOT WORK OTHERWISE -## Usage: download_spacepharer_database.sh [threads] -## [threads]: the amount of threads used for extracting and merging phagescope data. can at maximum use 14 threads, default is 1. -threads="${1:-"1"}" -#PLSDB download -mkdir -p data/raw/PLSDB -if [ -f data/raw/PLSDB/download_meta.tar.gz ]; then - echo "Already downloaded PLSDB, skipping..." -else - echo "Downloading PLSDB" - wget -P data/raw/PLSDB https://ccb-microbe.cs.uni-saarland.de/plsdb2025/download_meta.tar.gz -fi - -echo "Extracting download" -if [ -f data/raw/PLSDB/sequences.fasta.bz2 ]; then - echo "already extracted, skipping..." -else - tar -xzf data/raw/PLSDB/download_meta.tar.gz -C data/raw/PLSDB/ -fi -echo "Unzipping sequences" -bzip2 -d data/raw/PLSDB/sequences.fasta.bz2 - -echo "correcting metadata delims" -sed -i -E ':a;s/"([^"]*),([^"]*)"/"\1\2"/g;ta' nuccore.csv - -#Phagescope download -mkdir -p data/raw/phagescope - -echo "Downloading Phagescope databases" - -for DB in "Genbank" "RefSeq" "DDBJ" "EMBL" "PhagesDB" "GPD" "GVD" "MGV" "TemPhD" "CHVD" "IGVD" "IMG_VR" "GOV2" "STV" - do - if test -f "data/raw/phagescope/$DB.tar.gz"; then - echo "Already downloaded $DB, skipping..." - else - echo "Downloading $DB" - wget -O "data/raw/phagescope/$DB.tar.gz" "https://phageapi.deepomics.org/download/phage/fasta/?datasource=$DB" - fi -done - -echo "Extracting databases" -parallel --jobs "$threads" 'DB={}; path=data/raw/phagescope; [ -f "$path/$DB.fasta" ] || [ -d "$path/$DB" ] && \ -echo "$DB already extracted, skipping..." || \ -( echo "extracting $DB"; tar -xzf "$path/$DB.tar.gz" -C "$path/" )' ::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD CHVD IGVD IMG_VR GOV2 STV - -echo "merging phagescope sequences" -parallel --jobs "$threads" 'DB={}; path=data/raw/phagescope; [ -f "${path}/${DB}.fasta" ] && \ -echo "$DB already merged, skipping..." || \ -( echo "merging $DB"; genomes=$(find "$path/$DB" -type f -name "*.fasta"); > $path/$DB.fasta ; for files in $genomes; do cat $files >> $path/$DB.fasta; done)' \ -::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD - - -echo "Downloading Phagescope metadata" - -for DB in "genbank" "refseq" "ddbj" "embl" "phagesdb" "gpd" "gvd" "mgv" "temphd" "chvd" "igvd" "img_vr" "gov2" "stv" - do - if [ -f "data/raw/phagescope/${DB}_phage_meta_data.tsv" ]; then - echo "Already downloaded $DB, skipping..." - else - echo "Downloading $DB" - wget -P data/raw/phagescope/ "https://phageapi.deepomics.org/files/Download/Phage_meta_data/${DB}_phage_meta_data.tsv" - fi -done - -echo "Merging phagescope metadata" -first=true -if test -f "data/raw/phagescope/merged_metadata.tsv"; then - echo "Already merged metadata, skipping..." -else - for DB in "genbank" "refseq" "ddbj" "embl" "phagesdb" "gpd" "gvd" "mgv" "temphd" "chvd" "igvd" "img_vr" "gov2" "stv"; - do - echo "Merging $DB" - if [ "$first" == true ]; then - cat "data/raw/phagescope/${DB}_phage_meta_data.tsv" > data/raw/phagescope/merged_metadata.tsv - first=false - else - tail -n +2 "data/raw/phagescope/${DB}_phage_meta_data.tsv" >> data/raw/phagescope/merged_metadata.tsv - fi - done -fi -echo "Done!" diff --git a/bin/make_cluster_table.py b/bin/make_cluster_table.py deleted file mode 100644 index 3e0fdba..0000000 --- a/bin/make_cluster_table.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 - -import re -from pyfaidx import Fasta - -cluster_file = snakemake.input["clstr"] -fasta_file = snakemake.input["fasta"] -table_file = snakemake.output[0] - - -def read_clusters(clstr, table, fasta): - """ - Read a CD-HIT generated file of clusters and the fasta file that was - used by CD-HIT to generate a tab-separated report of the clusters. - - clstr: the CD-HIT output file with .clstr extensin - table: output file to write tab-separated output file to - fasta: fasta file with input sequences - """ - sequence_dict = find_sequences(fasta) - - HEADER = "Genome\tContig\tLocus\tCluster\tLength\tCluster_representative\tSequence\tIdentity\tStrand\n" - cluster_regex = r"(>Cluster *)(\d*)" - locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_\d+:\d+)... (.*)$" - - with open(table, "w") as outfile: - outfile.write(HEADER) - - with open(clstr, "r") as infile: - for line in infile: - line = line.strip() - - if line.startswith(">"): - # Extract the digits from the cluster ID - cluster = re.search(cluster_regex, line).group(2) - - elif len(line) > 1: - # Use RegEx to extract information - crispr_info = re.search(locus_regex, line) - - member_nr = crispr_info.group(1) # not used - length = crispr_info.group(2) - genome = crispr_info.group(3) - locus = crispr_info.group(4) - full_locus = "%s.%s" % (genome, locus) - contig = locus.split("_")[0] - extra = crispr_info.group(5) - - sequence = sequence_dict[full_locus] - - # Check the final group for representative ('*') or other - if extra == "*": - representative = full_locus - strand = "NA" - identity = "NA" - else: - strand_and_identity = extra.split("/") - strand = strand_and_identity[0].replace("at ", "") - identity = strand_and_identity[1] - - # Write the information to the output file - crispr_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( - genome, - contig, - full_locus, - cluster, - length, - representative, - sequence, - identity, - strand, - ) - outfile.write(crispr_line) - - # If the line does not start with '>' or have length > 1, stop - else: - break - - return 0 - - -def find_sequences(fasta_file): - """ - Look up the DNA sequence in a fasta file and return as dictionary. - """ - sequence_dict = Fasta(fasta_file, duplicate_action="first") - return sequence_dict - - -if __name__ == "__main__": - exit(read_clusters(clstr=cluster_file, table=table_file, fasta=fasta_file)) diff --git a/bin/make_cluster_table_identify.py b/bin/make_cluster_table_identify.py deleted file mode 100644 index df06eaf..0000000 --- a/bin/make_cluster_table_identify.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 - -## This is a slightly adjusted script from bin/make_cluster_table.py as the identify input is different. - -import re -from pyfaidx import Fasta - -cluster_file = snakemake.input["clstr"] -fasta_file = snakemake.input["fasta"] -table_file = snakemake.output[0] - - -def read_clusters(clstr, table, fasta): - """ - Read a CD-HIT generated file of clusters and the fasta file that was - used by CD-HIT to generate a tab-separated report of the clusters. - - clstr: the CD-HIT output file with .clstr extensin - table: output file to write tab-separated output file to - fasta: fasta file with input sequences - """ - sequence_dict = find_sequences(fasta) - - HEADER = "Genome\tContig\tLocus\tCluster\tLength\tCluster_representative\tSequence\tIdentity\tStrand\n" - cluster_regex = r"(>Cluster *)(\d*)" - locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d]+_.+)\.\.\. (.*)$" - - with open(table, "w") as outfile: - outfile.write(HEADER) - - with open(clstr, "r") as infile: - for line in infile: - line = line.strip() - - if line.startswith(">"): - # Extract the digits from the cluster ID - cluster = re.search(cluster_regex, line).group(2) - - elif len(line) > 1: - # Use RegEx to extract information - crispr_info = re.search(locus_regex, line) - - member_nr = crispr_info.group(1) # not used - length = crispr_info.group(2) - genome = crispr_info.group(3) - locus = crispr_info.group(4) - full_locus = "%s-%s" % (genome, locus) - contig = locus.split("_")[0] - crispr = locus.split("_")[6] - spacer = locus.split("_")[10] - full_locus_readable = "%s-%s_%s:%s" % ( - genome, - contig, - crispr, - spacer, - ) - extra = crispr_info.group(5) - - sequence = sequence_dict[full_locus] - - # Check the final group for representative ('*') or other - if extra == "*": - representative = full_locus_readable - strand = "NA" - identity = "NA" - else: - strand_and_identity = extra.split("/") - strand = strand_and_identity[0].replace("at ", "") - identity = strand_and_identity[1] - - # Write the information to the output file - crispr_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( - genome, - contig, - full_locus_readable, - cluster, - length, - representative, - sequence, - identity, - strand, - ) - outfile.write(crispr_line) - - # If the line does not start with '>' or have length > 1, stop - else: - break - - return 0 - - -def find_sequences(fasta_file): - """ - Look up the DNA sequence in a fasta file and return as dictionary. - """ - sequence_dict = Fasta(fasta_file, duplicate_action="first") - return sequence_dict - - -if __name__ == "__main__": - exit(read_clusters(clstr=cluster_file, table=table_file, fasta=fasta_file)) diff --git a/bin/prepare_genomes.sh b/bin/prepare_genomes.sh index a7b4329..98cdfcc 100644 --- a/bin/prepare_genomes.sh +++ b/bin/prepare_genomes.sh @@ -24,16 +24,57 @@ IFS=$'\n' ## for this script. ## ## Usage: -## $ bash prepare_genomes.sh [part] +## $ bash prepare_genomes.sh -p/--part [part] -d/--directory [directory] -h/--help ## (where [part] = 'all', 'original', 'update' (=default; smallest size)) -part=${1:-"update"} +# Set global (default) variables +part="update" +output_dir="resources/ATB/" +species_of_interest="config/species_of_interest.txt" +genomes_of_interest="${output_dir}stats/total_genomes_of_interest.tsv" +bakta_dir="${output_dir}annotations/" + +print_help() { + # Display help for this script + echo "Prepare genomes from AllTheBacteria for use with CRISPRscape" + echo + echo "Syntax: prepare_genomes.sh -p [part] -d [directory] [-h]" + echo "Options:" + echo "-p/--part Select which part of AllTheBacteria to download" + echo " for the selected species ('all', 'original', or" + echo " 'update', default=update)" + echo "-d/--directory Directory in which to download the files" + echo " (default=resources/ATB/)" + echo "-h/--help Print this help message" + echo + exit 0 +} + +while [[ $# -gt 0 ]] +do + case "$1" in + -p|--part ) part="$2" + shift + shift + ;; + -d|--directory ) output_dir="$2" + shift + shift + ;; + -h|--help ) print_help + ;; + * ) + echo "Unknown option: $1" + print_help + esac +done echo "Preparing to download genomes from set '${part}'" +echo "And storing them in the directory: ${output_dir}." +echo "----------" ## Step 1: download metadata echo "Step 1: downloading metadata" -output_dir="data/ATB/" bash bin/download_ATB_metadata.sh ${output_dir} echo "Done downloading metadata!" @@ -47,28 +88,30 @@ echo "----------" echo "Step 2: extracting sample accession IDs of species of interest" species_samples_file="${output_dir}all_samples_of_interest.txt" -echo -e "Species of interest:\n$(cat config/species_of_interest.txt)" +echo -e "Species of interest:\n$(cat ${species_of_interest})" + +total_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | wc -l) +hq_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l) +lq_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l) -total_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | wc -l) -hq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l) -lq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l) +mkdir -p "${output_dir}/stats" -echo -e "Total_genomes\t${total_genomes}" > data/tmp/total_genomes_of_interest.tsv -echo -e "High-quality_genomes\t${hq_genomes}" >> data/tmp/total_genomes_of_interest.tsv -echo -e "Low-quality_genomes\t${lq_genomes}" >> data/tmp/total_genomes_of_interest.tsv +echo -e "Total_genomes\t${total_genomes}" > "${genomes_of_interest}" +echo -e "High-quality_genomes\t${hq_genomes}" >> "${genomes_of_interest}" +echo -e "Low-quality_genomes\t${lq_genomes}" >> "${genomes_of_interest}" echo "ATB contains ${total_genomes} genomes of your species of interest." echo "Of those, ${lq_genomes} are labeled as low-quality, which are not included for further analyses." echo "That means, ${hq_genomes} are available to work with." echo -echo "Also see the file data/tmp/total_genomes_of_interest.tsv to see these" +echo "Also see the file ${genomes_of_interest} to see these" echo "numbers in table form (tab-separated values)." echo # Use no further grep options to match anything that contains the species names, # including subspecies and lineages. Exclude low-quality 'HQ field == F'. -zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz |\ +zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz |\ grep -v -e "F$" | cut -f 1 > ${species_samples_file} echo "Done extracting sample names!" @@ -88,8 +131,8 @@ update_genomes=$(zgrep -w -f ${species_samples_file}\ ${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\ grep -c "incr_release") -echo -e "Genomes_in_original_release\t${original_genomes}" >> data/tmp/total_genomes_of_interest.tsv -echo -e "Genomes_in_update_release\t${update_genomes}" >> data/tmp/total_genomes_of_interest.tsv +echo -e "Genomes_in_original_release\t${original_genomes}" >> "${genomes_of_interest}" +echo -e "Genomes_in_update_release\t${update_genomes}" >> "${genomes_of_interest}" ## Step 3: Filter metadata to the species of interest echo "Step 3: Filtering metadata for species of interest" @@ -134,50 +177,54 @@ grep -v -w -f ${species_samples_file} ${output_dir}samples_in_batches.txt >\ # numbers: zgrep -w -f ${output_dir}samples_not_of_interest.txt\ ${output_dir}file_list.all.20240805.tsv.gz | cut -f 2 | sort | uniq -c | sort -nr >\ - data/tmp/other_genomes-numbers.txt + ${output_dir}stats/other_genomes-numbers.txt # And get the list of batches + sample accessions, to facilitate removal: zgrep -w -f ${output_dir}samples_not_of_interest.txt\ ${output_dir}file_list.all.20240805.tsv.gz |\ - cut -f 4 > data/tmp/samples_to_remove.txt + cut -f 4 > ${output_dir}samples_to_remove.txt echo "In the batches to download are $(wc -l ${output_dir}samples_not_of_interest.txt)" echo "samples that have low-quality genomes or species other than the species of interest." echo "These are summarised in:" -ls -lh ${output_dir}samples_not_of_interest.txt data/tmp/other_genomes-numbers.txt data/tmp/samples_to_remove.txt +ls -lh ${output_dir}samples_not_of_interest.txt ${output_dir}stats/other_genomes-numbers.txt ${output_dir}samples_to_remove.txt ## Step 5: Download genome sequences echo "Step 5: Download genome sequences" -bash bin/download_genomes.sh ${part} +bash bin/download_genomes.sh ${part} ${output_dir} echo "Finished downloading!" -echo "The batch archives have been written to 'data/tmp/ATB/'" -echo "and their contents extracted to 'data/tmp/assemblies/'" +echo "The batch archives have been written to '${output_dir}'" +echo "and their contents extracted to '${output_dir}assemblies/'" echo "----------" ## Step 6: Remove genomes of other species echo "Step 6: remove genomes of species other than species of interest" -for fasta in $(cat data/tmp/samples_to_remove.txt) +for fasta in $(cat ${output_dir}samples_to_remove.txt) do # Verbose remove: tell what is being removed - rm -fv "data/tmp/assemblies/${fasta}" + rm -fv "${output_dir}assemblies/${fasta}" done echo "----------" ## Step 7: Download functional annotations (Bakta) echo "Step 7: download functional (gene) annotations" -bash bin/download_bakta_annotations.sh ${part} +bash bin/download_bakta_annotations.sh ${part} ${output_dir} echo "Finished downloading!" -echo "The batches have been downloaded to 'data/tmp/ATB/' and extracted in 'data/tmp/annotations'" +echo "The batches have been downloaded to '${output_dir}' and extracted in '${output_dir}annotations'" echo "----------" ## Step 7.1: Also remove annotation files for non-of-interest samples -# 1: adjust the file basename from 'assembly' to 'bakta' echo "Step 7.1: remove genome annotations of other/low-quality samples" -for file in $(sed 's|atb.assembly.|atb.bakta.|g' data/tmp/samples_to_remove.txt) +# adjust the file basename from 'assembly' to 'bakta' +for file in $(sed 's|atb.assembly.|atb.bakta.|g' ${output_dir}samples_to_remove.txt) do - # 2: add 'annotations' subdirectory - bakta_dir="data/tmp/annotations/" - # 3: exchange '.fa' extension to 'bakta.json' + # add 'annotations' subdirectory and exchange '.fa' extension to 'bakta.json' json="${bakta_dir}${file/.fa/.bakta.json}" rm -fv ${json} done + +echo -e "\n-----\nPlease review the files under ${output_dir}:\n" + +ls -lh "${output_dir}" + +echo -e "-----\nDone!\n-----" diff --git a/config/parameters.yaml b/config/parameters.yaml index 7686a41..f161b4d 100755 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -1,14 +1,20 @@ # Snakemake config file for CRISPR screening workflow -### 1. Input/output parameters ### - -working_directory: "data/tmp/" -output_directory: "data/processed/" - -genomad_database: "data/genomad_db/" -spacepharer_phage_database: "data/raw/phagescope/" -spacepharer_plasmid_database: "data/raw/PLSDB/" -### 2. Tool-specific parameters ### +PhageScope_databases: + - "Genbank" + - "RefSeq" + - "DDBJ" + - "EMBL" + - "PhagesDB" + - "GPD" + - "GVD" + - "MGV" + - "TemPhD" + - "CHVD" + - "IGVD" + - "IMG_VR" + - "GOV2" + - "STV" mlst: threads: 20 @@ -37,3 +43,12 @@ jaeger: crispridentify: threads: 20 + +download_spacepharer_databases: + threads: 4 + +spacepharer: + threads: 20 + +kma: + threads: 16 \ No newline at end of file diff --git a/doc/allthebacteria.md b/doc/allthebacteria.md index b59506f..9dcbd8e 100755 --- a/doc/allthebacteria.md +++ b/doc/allthebacteria.md @@ -9,29 +9,34 @@ After downloading all metadata using the bundled script, you have: ``` bash $ ls -sh data/ATB/ total 1.4G +460K all_atb_files.tsv 679M ena_metadata.20240801.tsv.gz 74M assembly-stats.tsv.gz 18M file_list.all.20240805.tsv.gz 63M checkm2.tsv.gz 5.5M sample_list.txt.gz 371M ena_metadata.0.2.20240606.tsv.gz 17M species_calls.tsv.gz 104M sylph.tsv.gz ``` +- A list of files `all_atb_files.tsv` 460kB + - Lists all batch and metadata files present in AllTheBacteria with their + respective file name, url, md5 checksum and file size (not per sample) + - Assembly statistics `assembly-stats.tsv.gz` 74MB - - Lists per sample accession the total length, number of contigs, N50 and - more statistics of the assembly. + - Lists per sample accession the total length, number of contigs, N50 and + more statistics of the assembly. - CheckM2 results `checkm2.tsv.gz` 63MB - - Lists per sample accession the results of CheckM2 including assembly - completeness and contamination in percentages. + - Lists per sample accession the results of CheckM2 including assembly + completeness and contamination in percentages. - ENA sample metadata `ena_metadata.20240801.tsv.gz` 677MB - - Lists per sample all the metadata that have been deposited in the European - Nucleotide Archive - this is a table with over 100 columns! + - Lists per sample all the metadata that have been deposited in the European + Nucleotide Archive - this is a table with over 100 columns! - File list `file_list.all.20240805.tsv.gz` 17MB - - This file lists per sample accession the corresponding batch in which it is - archived, with download URL, md5sum and file size of the batch archive. - It can be used to identify which batch archives contain species of interest, - e.g. + - This file lists per sample accession the corresponding batch in which it is + archived, with download URL, md5sum and file size of the batch archive. + It can be used to identify which batch archives contain species of interest, + e.g.: ```bash zgrep -f data/ATB/all_samples_of_interest.txt\ @@ -43,14 +48,14 @@ the script `bin/prepare_genomes.sh`. Also see the [manual](manual.md#download-input-genomes).) - Sample list `sample_list.txt.gz` 5.4MB - - This file simply lists all the sample accessions that are present in the dataset. + - This file simply lists all the sample accessions that are present in the dataset. - Species calls `species_calls.tsv.gz` 17MB - - Lists per sample accession the species identified and whether or not it is of - high-quality (T/F). - This file can be used to identify which sample accessions contain - high-quality genomes of the species of interest, - e.g. + - Lists per sample accession the species identified and whether or not it is of + high-quality (T/F). + This file can be used to identify which sample accessions contain + high-quality genomes of the species of interest, + e.g.: ```bash zless species_calls.tsv.gz | grep "Campylobacter_D jejuni" |\ @@ -58,8 +63,8 @@ zless species_calls.tsv.gz | grep "Campylobacter_D jejuni" |\ ``` - Sylph results `sylph.tsv.gz` 103MB - - Lists per sample accession the output from Sylph, including relative - abundance (%) Average Nucleotide Identity score (%) and assigned species name. + - Lists per sample accession the output from Sylph, including relative + abundance (%) Average Nucleotide Identity score (%) and assigned species name. ## Note on file sizes per batch diff --git a/results/Metadata_and_spacer_summary.html b/results/Metadata_and_spacer_summary.html deleted file mode 100644 index 14f6a50..0000000 --- a/results/Metadata_and_spacer_summary.html +++ /dev/null @@ -1,951 +0,0 @@ - - - - - - - - - - -Metadata and spacer summary - - - - - - - - - - - - - - - - - - - -
- -
- -
-
-

Metadata and spacer summary

-
- - -
-
Author
-
Affiliation
- -
-

Gijswillem van Walt Meijer

-
-
-
-
- -
- - - - -
- - - -
- - -
-

CRISPR spacers in Campylobacter

-
-

Summary of metadata and spacers of CRISPR arrays in Campylobacter genomes from AllTheBacteria

-

As described in Descriptive_statistics.html, a large amount of bacteria of Campylobacter jejuni and coli have been downloaded from AllTheBacteria (ATB) including their metadata. These genomes have then been screened for CRISPR-cas operons using CCTyper (version 1.8.0). This has identified a large amount of spacers (figure 1) which has been further explored in Descriptive_statistics.html on its own. However, for the intended research, the origin of the Bacteria will need to be known so that it can potentially be linked to specific spacers or patterns. For this the metadata of ATB is vital. But with cursory inspection (figure 2), many of the origins are missing, unclear and without standardization.

-

In this report, we summarize the spacers data and metadata and discuss certain problems that remain within the data.

-

First relevant data will be imported.

-
-
-Code -
#all spacers from CCtyper were combined into a tsv file as FASTA format
-all_spacers <- read_tsv(here("joining CCtyper/all_spacers.tsv"), show_col_types=FALSE)
-names(all_spacers) <- c("Sample_accession", "Spacer_sequence")
-
-#contig index are extracted from sample accession and put into their own column for pattern analysis
-all_spacers[3] <- str_extract(all_spacers$Sample_accession, "\\d*:\\d*")
-all_spacers <- separate(all_spacers, 3, into = c("Array_index", "Spacer_index"))
-all_spacers["Contig_accession"] <- str_extract(all_spacers$Sample_accession, "contig\\d*")
-
-#clean up sample accession
-all_spacers$Sample_accession <- str_extract(all_spacers$Sample_accession, "(?<=>)\\w*")
-
-#these spacers also include repeats and can be orphans or connected to a cas operon, this is added to the all spacer frame.
-orphan_crispr <- read_tsv(here("joining CCtyper/crisprs_orphan-concatenated.tab"), show_col_types = F)[,c(3, 6)] %>% mutate("orphan?" = TRUE) %>% separate(col = 1, sep = "\\.|\\_", into = c("Sample_accession", "Contig_accession", "Array_index"))
-orphan_crispr$Contig_accession <- str_extract(orphan_crispr$Contig_accession, "^\\w*")
-
-
-crispr_cas <- read_tsv(here("joining CCtyper/crisprs_near_cas-concatenated.tab"), show_col_types = F)[,c(3, 6)] %>% mutate("orphan?" = FALSE) %>% separate(col = 1, sep = "\\.|\\_", into = c("Sample_accession", "Contig_accession", "Array_index"))
-crispr_cas$Contig_accession <- str_extract(crispr_cas$Contig_accession, "^\\w*")
-
-all_spacers <- left_join(all_spacers, bind_rows(orphan_crispr, crispr_cas))
-
-#ENA metadata is downloaded from ATB and was filtered for original analysis in descriptive_statistics
-colnames <- read_table(here("metadata project/colnames_metadata_filtered.txt"), col_names = F, show_col_types = F)
-metadata_source <-
-  read_tsv(here("metadata project/ena_metadata.20240801.selection-only_Campylobacter.tsv"),
-           col_names = colnames$X2, show_col_types = FALSE)
-
-#further filtering metadata as not all are needed for this analysis
-metadata <- select(metadata_source, c(sample_accession, scientific_name, isolation_source, host))
-
-#in the metadata there is a distinction between unclear data and data that is simply missing/unspecified. missing data is turned into NA and data that is present in isolation source but not in host is copied to host.
-missingno <- metadata %>%
-  mutate(across(c(isolation_source, host), ~str_to_lower(.x)))%>%
-  replace_with_na(replace = list(isolation_source = c(
-    "missing", "other", "not collected", "no source specified", "mising"
-  ), host = c(
-    "missing", "other", "not collected", "no source specified", "mising"
-  ))) %>% mutate(host = ifelse(is.na(host), isolation_source, host))
-
-#There are also bacteria that are incorrectly noted in the ENA metadata compared to the ATB designation of species
-simplified_species <- read_delim(
-    file = here("metadata project/sylph.tsv.gz"),
-    delim = "\t",
-    show_col_types = F) %>%
-    select(Sample, Species)
-
-simplified_species <- simplified_species %>% filter(str_detect(Species,"Campylobacter_D (jejuni|coli)")) %>% mutate(Species = str_replace_all(Species, "_(A|B|C|D)", ""))
-
-merged_metadata <- left_join(missingno, simplified_species, by = c("sample_accession" = "Sample")) %>% distinct(sample_accession, .keep_all = TRUE)
-
-
-


-

-


-

-
-
-

Metadata analysis

-
-

Highlighting the importance of FAIR metadata handling

-
-
-Code -
cat("10 examples of 'host' metadata")
-
-
-
10 examples of 'host' metadata
-
-
-
-
-Code -
as.data.frame(list(Hosts = unique(missingno$host)[2:11]))
-
-
-
                Hosts
-1               feces
-2                calf
-3          sus scrofa
-4       gallus gallus
-5        homo sapiens
-6     chicken carcass
-7             poultry
-8  raw intact chicken
-9                food
-10     chicken breast
-
-
-
-
-Code -
cat("10 examples of 'isolation source' metadata")
-
-
-
10 examples of 'isolation source' metadata
-
-
-
-
-Code -
as.data.frame(list(isolation_source = unique(missingno$isolation_source)[2:11]))
-
-
-
               isolation_source
-1                         feces
-2               chicken carcass
-3                          farm
-4            raw intact chicken
-5                         human
-6                          food
-7                chicken breast
-8  animal-chicken-young chicken
-9     animal-swine-market swine
-10      animal-cattle-dairy cow
-
-
-

This shows some examples of how this metadata has inconsistent notation of origins. Many problems occur when trying to parse this data as is, for example ‘raw intact chicken’ and ‘chicken carcass’ describe the same host species. This additional information is not useful for ‘host’ metadata but is for ‘isolation source’, yet information in isolation source is also lacking as the distinction between broiler and meat producing chickens is sometimes difficult to discern. Unfortunately this alongside spelling errors (‘boivine’) and unclear designations (‘food’), makes the metadata require a significant amount of processing. Not to mention the amount of data that potentially could have information regarding origin, but was never noted. This in turn highlights the importance of the FAIR principles for research.

-
-
-

Consolidating origins

-
-
-Code -
#TODO: keep discussing and updating categories, data crawl through original studies to find actual hosts and investigate why about a 1000 samples are lost from left_join.
-
-#consolidates the many differing inputs of host into more managable categories: Meat producing poultry, Adult cattle, layers, veal calves, pets, small ruminants, pigs, surface water and water birds, wild birds, non-water environment and human. 
-
-#this last category consists of all species not part of these initial categories, or unclear enough to not be useful.
-definedmeta <- merged_metadata %>%
-  mutate(category = host) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(field|environment|pasture|sediment|soil|surfaces|crates).*",
-      "dry environment"
-    )
-  ) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(calf|veal).*",
-      "calves"
-    )
-  ) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(taurus|cattle|milk|boi?vine|cow|heifer|steer|beef|dairy|indicus).*",
-      "adult cattle"
-    )
-  ) %>% 
-  mutate(
-    category = ifelse(str_detect(
-      isolation_source,
-      ".*((?<!v)egg|layer).*"), yes = "broiler", no = host)
-    ) %>%
-  mutate(category = str_replace(category, ".*(water|lagoon|sewage|river|wetland|fulica\\satra|platyrhynchos|duck).*", "surface water and water birds")) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(pheasant|phasianus|gallopavo|hen|chi[ec]ken|turkey|broiler|gallus|cb-|drumsticks|poultry).*",
-      "meat producing poultry"
-    )
-  ) %>%
-  mutate(category = str_replace(
-    category,
-    ".*(sus\\sscrofa|porcine|pig|swine|sow|pork).*",
-    "swine/pig"
-  )) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(puppy|canii?ne|cat|feline|dog|kitten|pet|familiaris)(?!tle).*",
-      "pet animals"
-    )
-  ) %>%
-  mutate(category = str_replace(category, ".*(human|clinical|guillain|sapiens).*", "human")) %>%
-  mutate(
-    category = str_replace(
-      category,
-      ".*(avian|cloaca|(?<!water\\s)bird|columba livia|crow|corvus|dove).*",
-      "wild avian"
-    )
-  ) %>%
-  mutate(category = str_replace(category, ".*(sheep|aries|ovine|goat|hircus).*", "small ruminants")) %>%
-  mutate(category = str_replace(category, ".*(human|sapiens).*", "human")) %>%
-  mutate(
-    category = str_replace(
-      category,
-      "^(?!(dry environment|calves|adult cattle|broiler|meat producing poultry|swine/pig|pet animals|wild avian|small ruminants|food|feces|human|surface water and water birds)).*",
-      "other/undetermined"
-    ))
-#calculate frequency of categories 
-origincounts <- definedmeta %>% group_by(category) %>% summarise(total_n = n()) %>% arrange(desc(total_n))
-
-#calculate percentages and create labels
-processedcount <-
-  origincounts %>% mutate(percentage = round(total_n / sum(total_n) * 100, 3))
-processedcount$labels <- paste(processedcount$category, "\n", processedcount$total_n, "(", processedcount$percentage, "%)") 
-processedcount <- processedcount %>% arrange(desc(total_n))
-summed_N <- as.character(sum(processedcount$total_n))
-
-
-treemap(
-  processedcount,
-  index = c("labels"),
-  vSize = "total_n",  
-  vColor = "total_n",  
-  draw = TRUE,
-  border.col = "black",
-  title = paste("Treemap of Categories with total N:", sum(processedcount$total_n)),
-  fontsize.title = 16)
-
-
-
-
-

-
-
-
-
-Code -
#same calculations but without NA
-processedcount <-
-  origincounts[-1,] %>% mutate(percentage = round(total_n / sum(total_n) * 100, 3))
-processedcount$labels <- paste(processedcount$category, "\n", processedcount$total_n, "(", processedcount$percentage, "%)") 
-processedcount <- processedcount %>% arrange(desc(total_n))
-
-
-treemap(
-  processedcount,
-  index = c("labels"),
-  vSize = "total_n",  
-  vColor = "total_n",  
-  draw = TRUE,
-  border.col = "black",
-  title = paste("Treemap of Categories (without NA) with total N:", sum(processedcount$total_n)),
-  fontsize.title = 16)
-
-
-
-
-

-
-
-
-
-
-
-Code -
processedcount[,1:3]
-
-
-
# A tibble: 13 × 3
-   host                   total_n percentage
-   <chr>                    <int>      <dbl>
- 1 meat producing poultry   39772     50.2  
- 2 human                    12773     16.1  
- 3 adult cattle              9170     11.6  
- 4 other/undetermined        8215     10.4  
- 5 swine/pig                 3668      4.63 
- 6 food                      2974      3.75 
- 7 dry environment            775      0.978
- 8 small ruminants            689      0.87 
- 9 wild avian                 451      0.569
-10 pet animals                247      0.312
-11 calves                     182      0.23 
-12 feces                      180      0.227
-13 egg layer                  140      0.177
-
-
-

This is a visualization of the categories that this document will work with going forward. As can be noticed, poultry is abundant with other species quickly falling off. Additionally a large amount of the host and isolation_source data is NA, considered to be mostly unusable for source attribution. Though this is still a large amount of data still to use, lets see how this is divided between jejuni and coli

-
-
-

Campylobacter species

-
-
-Code -
#same calculations as above, but now also additionally grouped on species.
-
-origincounts_jejuni <- definedmeta %>% group_by(category, Species) %>% filter(as.character(Species) == "Campylobacter jejuni") %>% summarise(total_n = n()) %>% arrange(desc(total_n))
-
-summed_N <- sum(origincounts_jejuni[-1,]$total_n)
-processedcount_jejuni <-
-  origincounts_jejuni[-1,] %>% mutate(percentage = round(total_n / summed_N * 100, 3))
-processedcount_jejuni$labels <- paste(processedcount_jejuni$category, "\n", processedcount_jejuni$total_n, "(", processedcount_jejuni$percentage, "%)") 
-processedcount_jejuni <- processedcount_jejuni %>% arrange(desc(total_n))
-
-
-
-
-treemap(
-  processedcount_jejuni,
-  index = c("labels"),
-  vSize = "total_n",  
-  vColor = "total_n",  
-  draw = TRUE,
-  border.col = "black",
-  title = paste("Treemap of C. jejuni Categories (without NA) with total N:", sum(processedcount_jejuni$total_n)),
-  fontsize.title = 16)
-
-
-
-
-

-
-
-
-
-
-
-Code -
origincounts_coli <- definedmeta %>% group_by(category, Species) %>% filter(as.character(Species) == "Campylobacter coli") %>% summarise(total_n = n()) %>% arrange(desc(total_n))
-
-
-summed_N <- sum(origincounts_coli[-2,]$total_n)
-processedcount_coli <-
-  origincounts_coli[-2,] %>% mutate(percentage = round(total_n / summed_N * 100, 3))
-processedcount_coli$labels <- paste(processedcount_coli$category, "\n", processedcount_coli$total_n, "(", processedcount_coli$percentage, "%)") 
-processedcount_coli <- processedcount_coli %>% arrange(desc(total_n))
-
-treemap(
-  processedcount_coli,
-  index = c("labels"),
-  vSize = "total_n",  
-  vColor = "total_n",  
-  draw = TRUE,
-  border.col = "black",
-  title = paste("Treemap of C. Coli Categories (without NA) with total N:", sum(processedcount_coli$total_n)),
-  fontsize.title = 16)
-
-
-
-
-

-
-
-
-
-

As can be seen, meat producing poultry is most frequent in both species. Though interestingly, swine are much more frequent within Campylobacter coli than in C. jejuni. Next lets look at the spacers found

-
-
-
-

Spacer analysis with metadata

-
-
-Code -
#join the two files
-merged_data <- inner_join(all_spacers, definedmeta, by = c("Sample_accession" = "sample_accession"))
-merged_data <- unique(merged_data)
-
-
-#condensing spacer sequences into unique sequences for faster calculation and performing an initial count of spacers
-counts_spacer <- merged_data %>% group_by(Spacer_sequence, category) %>% count()
-
-#some spacer sequences are directly complementary to each other and so the same spacer
-dna <- DNAStringSet(counts_spacer$Spacer_sequence)
-reverse_complements <- reverseComplement(dna)
-
-
-processed <- logical(length(dna))
-for (i in seq_along(dna)) {
-  if (processed[i]) next
-  match <- which(as.character(dna) == as.character(reverse_complements[i]))
-  if (length(match) > 0) {
-    processed[match] <- TRUE
-    dna[match] <- reverse_complements[match]
-  }
-}
-
-dna_list <- as.character(dna)
-counts_spacer$Spacer_sequence <- dna_list
-
-counts_spacer_host <- counts_spacer %>% group_by(Spacer_sequence, category) %>% summarise(n = sum(n)) %>% arrange(desc(n))
-counts_spacer_hostless <- counts_spacer %>% group_by(Spacer_sequence) %>% summarise(n = sum(n)) %>% arrange(desc(n))
-
-merged_data <- rename(merged_data, c("scientific_name" = "ENA_species", "Species" = "ATB_species"))
-
-merged_data_NAless <- merged_data %>% filter(!is.na(category))
-counts_spacer_NAless <- counts_spacer_host %>% filter(!is.na(category)) %>% arrange(desc(n))
-
-
-head(counts_spacer_NAless)
-
-
-
# A tibble: 6 × 3
-# Groups:   Spacer_sequence [6]
-  Spacer_sequence                category                   n
-  <chr>                          <chr>                  <int>
-1 AATAATGGCTAAATATTTCATGAGAATGGA meat producing poultry   479
-2 AACCGCCAAGCTCTTTTAAAAACTGCCATA meat producing poultry   231
-3 GCTTTAGGAAATGCTTTAAAACGCTTTGGA meat producing poultry   208
-4 GTTTCTTTCTTATTTACTCTATACTCTAAA meat producing poultry   206
-5 CAATGTTTTGTCAAGTTTCAAGCGAAGGCG meat producing poultry   184
-6 AAACTTTTTACAGCTTTGTAGAATATATAA meat producing poultry   175
-
-
-

As described in Descriptive_statistics, many spacers detected by CCtyper were similar to each other. Some were reverse complements of each other or were identical except for additional nucleotides in one. Additionally some spacers have high similarity to each other with minor mutations. This table above has accounted for reverse complements and merged them into the count, however finding a solution for the other similarities has not been successful so far.
-
-

-
-
- -
- - -
- - - - - \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000..dc3677c --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,109 @@ +""" +Author: Sam Nooij +Organisation: Utrecht University +Department: Clinical Infectiology (KLIF), Infectious Diseases & Immunology, + Biomolecular Health Sciences, Faculty of Veterinary Medicine +Date: 2024-11-06 + +Workflow for testing CRISPR analysis options +In contrast to the 'regular' Snakefile workflow, which works +per genome file, this workflow works per batch and runs GNU +parallel to parallelise processing of the genomes within +each batch. + + +Input: Fasta files of Campylobacter whole-genomes +Output: (various) + +Example use: + $ snakemake --profile config + +N.B. Variables are set in the configuration files under `config`. +""" + +# Step 1: Load configuration +# --------------------------------------------------------- + + +configfile: Path("config/parameters.yaml") + + +# Log messages +onstart: + print("\n--- Starting analysis! ---\n") + + +onsuccess: + print("\n --- Workflow finished! ---\n") + + +onerror: + print("\n --- An error occurred! ---\n") + + +# Step 2: Load rules that do the work +# --------------------------------------------------------- +include: "rules/helper_rules.smk" +include: "rules/screen_crispr-cas.smk" +include: "rules/refine_crispr-cas.smk" +include: "rules/classify_genomes.smk" +include: "rules/identify_defences.smk" +include: "rules/map_spacers.smk" + + +# Step 3: specify target outputs +# --------------------------------------------------------- + + +rule all: + input: + ## Extra outputs (non-CRISPR) + # Multilocus Sequence Types (ST) for Campylobacter + "results/mlst_table.tsv", + # Virus and plasmid predictions per contig + "results/genomad_predictions.csv", + "results/jaeger_predictions.csv", + # Phage defence systems + "results/padloc_table.csv", + ## CRISPR-related outputs: 1. pre-screening + # Concatenated CCTyper output (CRISPR arrays and spacers) + expand( + "results/cctyper/{batch}/{filename}-{batch}.tab", + batch=BATCHES, + filename=[ + "CRISPR_Cas", + "crisprs_all", + "crisprs_near_cas", + "crisprs_orphan", + "cas_operons", + ], + ), + expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), + # CCTyper CRISPR spacer cluster analysis report + "results/cctyper/spacer_cluster_summary.tsv", + # Cluster unique CRISPR spacers + "results/cctyper/all_spacers-clustered.clstr", + "results/all_spacers_table.tsv", + # CRISPR output 2. CRISPRidentify output (refinement) + expand( + "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta", + batch=BATCHES, + ), + expand( + "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv", + batch=BATCHES, + ), + # Concatenated CRISPRidentify output + "results/crispridentify/complete_summary.csv", + "results/crispridentify/all_spacers.fa", + "results/crispridentify/all_spacers-clustered.clstr", + "results/all_spacers_table_identify.tsv", + # Merged CRISPRidentify and CCtyper output + "results/all_CRISPRS_with_identify.tab", + # Spacepharer output (spacer targets, putative protospacers) + "results/phage_matches.tsv", + "results/plasmid_matches.tsv", + # KMA output (mapping spacers to input Campy genomes) + "results/kma/output/CRISPR.frag.gz", + "results/kma/CRISPR_alignment", + default_target: True diff --git a/workflow/envs/bash.yaml b/workflow/envs/bash.yaml new file mode 100644 index 0000000..196651f --- /dev/null +++ b/workflow/envs/bash.yaml @@ -0,0 +1,8 @@ +name: bash + +dependencies: + - bash=5.1.16 + - parallel=20251122 + +channels: + - conda-forge diff --git a/envs/cctyper.yaml b/workflow/envs/cctyper.yaml similarity index 100% rename from envs/cctyper.yaml rename to workflow/envs/cctyper.yaml diff --git a/envs/cdhit.yaml b/workflow/envs/cdhit.yaml similarity index 100% rename from envs/cdhit.yaml rename to workflow/envs/cdhit.yaml diff --git a/workflow/envs/crispridentify.yaml b/workflow/envs/crispridentify.yaml new file mode 100644 index 0000000..cc5fef9 --- /dev/null +++ b/workflow/envs/crispridentify.yaml @@ -0,0 +1,48 @@ +name: crispr_identify_env +channels: + - conda-forge + - bioconda + - defaults + - biobuilds + - r + - axfeh +dependencies: + - python==3.7.6 + - pip + - python_abi=3.7 + - biopython=1.76 + - h5py=2.10.0 + - hdf5=1.10.6 + - hmmer=3.3 + - numpy==1.18.1 + - pandas=1.0.3 + - matplotlib=3.1.3 + - perl=5.26.2 + - perl-threaded=5.26.0 + - perl-yaml=1.29 + - prodigal==2.6.3 + - dill=0.3.3 + - protobuf=3.13.0.1 + - regex=2019.03.09 + - pyasn1=0.4.8 + - pycparser=2.20 + - networkx=2.5 + - pyjwt=1.7.1 + - pyparsing=2.4.7 + - pyqt=5.9.2 + - pysocks=1.7.1 + - python-dateutil=2.8.1 + - pytz=2020.1 + - pyyaml=5.3.1 + - scikit-learn==0.22.1 + - scipy=1.4.1 + - requests=2.28.1 + - viennarna==2.4.15 + - pyopenssl=22.0.0 + - certifi=2022.12.7 + - vmatch==2.3.0 + - clustalo==1.2.3 + - blast==2.5.0 + - libffi=3.2.1 + - pip: + - python-Levenshtein diff --git a/envs/genomad.yaml b/workflow/envs/genomad.yaml similarity index 100% rename from envs/genomad.yaml rename to workflow/envs/genomad.yaml diff --git a/envs/jaeger.yaml b/workflow/envs/jaeger.yaml similarity index 100% rename from envs/jaeger.yaml rename to workflow/envs/jaeger.yaml diff --git a/envs/kma.yaml b/workflow/envs/kma.yaml similarity index 100% rename from envs/kma.yaml rename to workflow/envs/kma.yaml diff --git a/envs/padloc.yaml b/workflow/envs/padloc.yaml similarity index 100% rename from envs/padloc.yaml rename to workflow/envs/padloc.yaml diff --git a/envs/pandas.yaml b/workflow/envs/pandas.yaml similarity index 100% rename from envs/pandas.yaml rename to workflow/envs/pandas.yaml diff --git a/envs/pyfaidx.yaml b/workflow/envs/pyfaidx_pandas.yaml similarity index 66% rename from envs/pyfaidx.yaml rename to workflow/envs/pyfaidx_pandas.yaml index e7dc681..b1d3a92 100644 --- a/envs/pyfaidx.yaml +++ b/workflow/envs/pyfaidx_pandas.yaml @@ -1,4 +1,4 @@ -name: pyfaidx +name: pyfaidx_pandas channels: - conda-forge @@ -6,3 +6,4 @@ channels: dependencies: - pyfaidx=0.8.1.3 + - pandas=2.2.3 diff --git a/envs/pymlst.yaml b/workflow/envs/pymlst.yaml similarity index 100% rename from envs/pymlst.yaml rename to workflow/envs/pymlst.yaml diff --git a/envs/seqkit.yaml b/workflow/envs/seqkit.yaml similarity index 100% rename from envs/seqkit.yaml rename to workflow/envs/seqkit.yaml diff --git a/envs/spacepharer.yml b/workflow/envs/spacepharer.yaml similarity index 100% rename from envs/spacepharer.yml rename to workflow/envs/spacepharer.yaml diff --git a/envs/tidy_here.yaml b/workflow/envs/tidy_here.yaml similarity index 100% rename from envs/tidy_here.yaml rename to workflow/envs/tidy_here.yaml diff --git a/results/Assess_clustered_spacers.qmd b/workflow/notebooks/Assess_clustered_spacers.qmd similarity index 100% rename from results/Assess_clustered_spacers.qmd rename to workflow/notebooks/Assess_clustered_spacers.qmd diff --git a/results/Descriptive_statistics.qmd b/workflow/notebooks/Descriptive_statistics.qmd similarity index 100% rename from results/Descriptive_statistics.qmd rename to workflow/notebooks/Descriptive_statistics.qmd diff --git a/results/Metadata_and_spacer_summary.qmd b/workflow/notebooks/Metadata_and_spacer_summary.qmd similarity index 100% rename from results/Metadata_and_spacer_summary.qmd rename to workflow/notebooks/Metadata_and_spacer_summary.qmd diff --git a/workflow/rules/classify_genomes.smk b/workflow/rules/classify_genomes.smk new file mode 100644 index 0000000..c92603a --- /dev/null +++ b/workflow/rules/classify_genomes.smk @@ -0,0 +1,215 @@ +### Classify genomes +## 1: determine multilocus sequence type (MLST) + + +rule download_mlst_database: + output: + "resources/mlst/campylobacter.db", + params: + species=config["mlst"]["species"], + conda: + "../envs/pymlst.yaml" + threads: 1 + log: + "log/download_mlst_database.txt", + benchmark: + "log/benchmark/download_mlst_database.txt" + shell: + """ +claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1 + """ + + +rule mlst: + input: + batch="resources/ATB/assemblies/{batch}/", + db="resources/mlst/campylobacter.db", + output: + "results/mlst/{batch}/complete", + conda: + "../envs/pymlst.yaml" + threads: config["mlst"]["threads"] + log: + "log/mlst/{batch}.txt", + benchmark: + "log/benchmark/mlst/{batch}.txt" + shell: + """ +find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ + parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + claMLST search {input.db} {{}} -o "$(dirname {output})/{{/.}}.txt" > {log} 2>&1 + +touch {output} + """ + + +rule concatenate_mlst_batches: + input: + "results/mlst/{batch}/complete", + output: + "results/mlst/{batch}-concatenated.tsv", + conda: + "../envs/bash.yaml" + threads: config["mlst"]["threads"] + log: + "log/concatenate_mlst/{batch}.txt", + benchmark: + "log/benchmark/concatenate_mlst/{batch}.txt" + shell: + """ +echo -e "Genome\tST" > {output} +find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\ + parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + 'tail -n +2 {{}} | cut -f 1-2 >> {output}' + """ + + +rule concatenate_mlst_all: + input: + expand("results/mlst/{batch}-concatenated.tsv", batch=BATCHES), + output: + "results/mlst_table.tsv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/concatenate_mlst_all.txt", + benchmark: + "log/benchmark/concatenate_mlst_all.txt" + shell: + """ +batches=( {input} ) +head -1 ${{batches[0]}} > {output} +sed --separate 1d ${{batches[@]}} >> {output} + """ + + +## 2. identify whether contig derive from a chromosome, plasmid or virus +# Using both geNomad (chromosome/plasmid/virus) + + +rule download_genomad_database: + output: + db=directory("resources/genomad_db"), + conda: + "../envs/genomad.yaml" + threads: 1 + log: + "log/download_genomad_database.txt", + benchmark: + "log/benchmark/download_genomad_database.txt" + shell: + """ +mkdir -p $(dirname {output.db}) +genomad download-database $(dirname {output.db}) > {log} 2>&1 + """ + + +rule genomad: + input: + fasta="resources/ATB/assemblies/{batch}.fasta", + db="resources/genomad_db", + output: + aggregated_classification="results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", + plasmid_summary="results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", + virus_summary="results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", + params: + work_dir=subpath(output.aggregated_classification, ancestor=2), + conda: + "../envs/genomad.yaml" + threads: config["genomad"]["threads"] + log: + "log/genomad/{batch}.txt", + benchmark: + "log/benchmark/genomad/{batch}.txt" + shell: + """ +genomad end-to-end -t {threads} --cleanup --enable-score-calibration\ + {input.fasta} {params.work_dir} {input.db} > {log} 2>&1 + """ + + +rule collect_genomad_predictions: + input: + aggregated_classification=expand( + "results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv", + batch=BATCHES, + ), + plasmid_summary=expand( + "results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv", + batch=BATCHES, + ), + virus_summary=expand( + "results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv", + batch=BATCHES, + ), + output: + "results/genomad_predictions.csv", + conda: + "../envs/tidy_here.yaml" + threads: 1 + log: + "log/collect_genomad_predictions.txt", + benchmark: + "log/benchmark/collect_genomad_predictions.txt" + script: + "../scripts/collect_genomad_predictions.R" + + +# And Jaeger (virus (phage/prophage) or not) + + +rule jaeger: + input: + batch="resources/ATB/assemblies/{batch}/", + output: + "results/jaeger/{batch}/complete", + conda: + "../envs/jaeger.yaml" + threads: config["jaeger"]["threads"] + log: + "log/jaeger/{batch}.txt", + benchmark: + "log/benchmark/jaeger/{batch}.txt" + shell: + """ +parallel --jobs {threads} --retry-failed --halt='now,fail=1'\ + jaeger run -p --workers 1 -i {{}} -o $(dirname {output}) --overwrite\ + > {log} 2>&1 ::: {input.batch}/*.fa + +touch {output} + """ + + +rule collect_jaeger_batch: + input: + "results/jaeger/{batch}/complete", + output: + "results/jaeger/{batch}/jaeger-{batch}.csv", + params: + batch=subpath(output[0], parent=True), + conda: + "../envs/tidy_here.yaml" + threads: 1 + log: + "log/collect_jaeger_{batch}.txt", + benchmark: + "log/benchmark/collect_jaeger_{batch}.txt" + script: + "../scripts/collect_jaeger_batch.R" + + +rule collect_jaeger_predictions: + input: + expand("results/jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES), + output: + "results/jaeger_predictions.csv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/collect_jaeger_predictions.txt", + benchmark: + "log/benchmark/collect_jaeger_predictions.txt" + script: + "../scripts/collect_jaeger_predictions.sh" diff --git a/workflow/rules/helper_rules.smk b/workflow/rules/helper_rules.smk new file mode 100644 index 0000000..558ecb1 --- /dev/null +++ b/workflow/rules/helper_rules.smk @@ -0,0 +1,39 @@ +## General helper functions: define input and output +from pathlib import Path + +# Use Python functions to automatically detect batches of genomes fasta files +# in the input directory as 'BATCHES' +BATCH_PATHS = list(Path("resources/ATB/assemblies").glob("atb.assembly.*")) + +# Make sure there is at least one batch directory: +assert len(BATCH_PATHS) > 0, ( + "-- No input (batch) directories found in resources/ATB/assemblies.\n" + "Please run the script bin/prepare_genomes.sh to prepare input. --\n" +) + +# And make sure they are actually directories +for batch in BATCH_PATHS: + assert Path(batch).is_dir(), f"-- Batches must be directories, got {batch} --" + +BATCHES = [batch.name for batch in BATCH_PATHS] + + +## Helper rules (not fitting any particular goal) + + +rule concatenate_batches: + input: + "resources/ATB/assemblies/{batch}", + output: + temp("resources/ATB/assemblies/{batch}.fasta"), + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/concatenate_{batch}.txt", + benchmark: + "log/benchmark/concatenate_{batch}.txt" + shell: + """ +cat {input}/*.fa > {output} 2> {log} + """ diff --git a/workflow/rules/identify_defences.smk b/workflow/rules/identify_defences.smk new file mode 100644 index 0000000..6923b08 --- /dev/null +++ b/workflow/rules/identify_defences.smk @@ -0,0 +1,89 @@ +## Identify anti-phage defence systems using PADLOC + + +rule download_padloc_database: + output: + directory=directory("resources/padloc_db"), + cm=directory("resources/padloc_db/cm"), + cm_meta="resources/padloc_db/cm_meta.txt", + hmm=directory("resources/padloc_db/hmm"), + hmm_meta="resources/padloc_db/hmm_meta.txt", + sys=directory("resources/padloc_db/sys"), + sys_meta="resources/padloc_db/sys_meta.txt", + system_info="resources/padloc_db/system_info.md", + conda: + "../envs/padloc.yaml" + threads: 1 + log: + "log/download_padloc_database.txt", + benchmark: + "log/benchmark/download_padloc_database.txt" + shell: + """ +padloc --data resources/padloc_db --db-install v2.0.0 > {log} 2>&1 + """ + + +rule padloc: + input: + batch="resources/ATB/assemblies/{batch}/", + db="resources/padloc_db", + output: + "results/padloc/{batch}/complete", + conda: + "../envs/padloc.yaml" + threads: config["padloc"]["threads"] + log: + "log/padloc/{batch}.txt", + benchmark: + "log/benchmark/padloc/{batch}.txt" + shell: + """ +find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ + parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + 'mkdir -p "$(dirname {output})/{{/.}}" && padloc --data {input.db}\ + --cpu 1 --fna {{}} --outdir "$(dirname {output})/{{/.}}"' > {log} 2>&1 + +touch {output} + """ + + +rule concatenate_padloc_batches: + input: + "results/padloc/{batch}/complete", + output: + "results/padloc/{batch}-concatenated.csv", + conda: + "../envs/bash.yaml" + threads: config["padloc"]["threads"] + log: + "log/concatenate_padloc/{batch}.txt", + benchmark: + "log/benchmark/concatenate_padloc/{batch}.txt" + shell: + """ +file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") ) +head -1 ${{file_array[0]}} > {output} +parallel --jobs {threads} --retry-failed --halt='now,fail=1'\ + 'tail -n +2 {{}} >> {output}' ::: ${{file_array[@]}} + """ + + +rule concatenate_padloc_all: + input: + expand("results/padloc/{batch}-concatenated.csv", batch=BATCHES), + output: + "results/padloc_table.csv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/concatenate_padloc_all.txt", + benchmark: + "log/benchmark/concatenate_padloc_all.txt" + shell: + """ +batches=( {input} ) +head -1 ${{batches[0]}} > {output} +sed --separate 1d ${{batches[@]}} >> {output} + """ diff --git a/workflow/rules/map_spacers.smk b/workflow/rules/map_spacers.smk new file mode 100644 index 0000000..3f6414c --- /dev/null +++ b/workflow/rules/map_spacers.smk @@ -0,0 +1,295 @@ +### Map spacers to putative targets/protospacers +## 2. By using SpacePHARER + + +rule spacepharer_spacer_setup: + input: + spacers="results/crispridentify/all_spacers.fa", + output: + spacer_DB="results/spacepharer/DB_CRISPR/querysetDB", + params: + tmp_folder=subpath(output.spacer_DB, parent=True), + conda: + "../envs/spacepharer.yaml" + threads: 48 + log: + "log/spacepharer/spacepharer_spacer_setup.txt", + benchmark: + "log/benchmark/spacepharer/spacepharer_spacer_setup.txt" + shell: + """ +spacer_DB=$(dirname {output.spacer_DB}) +rm -rf $spacer_DB/* > {log} 2>&1 + +spacepharer createsetdb {input.spacers} {output.spacer_DB}\ + "{params.tmp_folder}/tmpFolder" --extractorf-spacer 1\ + --threads {threads} >> {log} 2>&1 + """ + + +## First to bacteriophages + + +rule download_phage_database: + output: + phage_dir=directory("resources/phagescope"), + phage_archives=expand( + "resources/phagescope/{database}.tar.gz", + database=config["PhageScope_databases"], + ), + phage_fasta=expand( + "resources/phagescope/{database}.fasta", + database=config["PhageScope_databases"], + ), + combined_meta="resources/phagescope/phagescope_metadata.tsv", + params: + databases=config["PhageScope_databases"], + conda: + "../envs/bash.yaml" + threads: config["download_spacepharer_databases"]["threads"] + log: + out="log/download_phage_database.out", + err="log/download_phage_database.err", + benchmark: + "log/benchmark/download_phage_database.txt" + script: + "../scripts/download_phage_database.sh" + + +rule spacepharer_phage_setup: + input: + db=collect( + "resources/phagescope/{database}.fasta", + database=[ + "DDBJ", + "EMBL", + "Genbank", + "GPD", + "GVD", + "MGV", + "PhagesDB", + "RefSeq", + "TemPhD", + ], + ), + output: + phage_DB="results/spacepharer/phage_DB/targetsetDB", + phage_control_DB="results/spacepharer/phage_DB/controlsetDB", + params: + tmp_folder=subpath(output.phage_DB, ancestor=2), + conda: + "../envs/spacepharer.yaml" + threads: config["spacepharer"]["threads"] + log: + "log/spacepharer/spacepharer_phage_setup.txt", + benchmark: + "log/benchmark/spacepharer/spacepharer_setup.txt" + shell: + """ +phage_DB=$(dirname {output.phage_DB}) +rm -rf $phage_DB/* > {log} 2>&1 + +spacepharer createsetdb {input.db} {output.phage_DB}\ + "{params.tmp_folder}/tmpFolder" --threads {threads} >> {log} 2>&1 +spacepharer createsetdb {input.db} {output.phage_control_DB}\ + "{params.tmp_folder}/tmpFolder" --reverse-fragments 1 --threads {threads} >> {log} 2>&1 + """ + + +rule spacepharer_phage: + input: + spacer_DB="results/spacepharer/DB_CRISPR/querysetDB", + phage_DB="results/spacepharer/phage_DB/targetsetDB", + phage_control_DB="results/spacepharer/phage_DB/controlsetDB", + output: + result="results/spacepharer/predicted_phage_matches.tsv", + result_sanitised="results/spacepharer/predicted_phage_matches_san.tsv", + params: + tmp_folder="results/spacepharer/tmpFolder", + conda: + "../envs/spacepharer.yaml" + threads: config["spacepharer"]["threads"] + log: + "log/spacepharer/spacepharer_phage.txt", + benchmark: + "log/benchmark/spacepharer/spacepharer_phage.txt" + shell: + """ +spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\ + {input.phage_control_DB} {output.result} {params.tmp_folder}\ + --threads {threads} > {log} 2>&1 + +grep -v "#" {output.result} > {output.result_sanitised} +rm -r {params.tmp_folder} >> {log} 2>&1 + """ + + +## Then also to plasmids + + +rule download_plasmid_database: + output: + plasmid_dir=directory("resources/PLSDB"), + plasmid_fasta="resources/PLSDB/sequences.fasta", + plasmid_nuccore="resources/PLSDB/nuccore.csv", + plasmid_taxonomy="resources/PLSDB/taxonomy.csv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + out="log/download_plasmid_database.out", + err="log/download_plasmid_database.err", + benchmark: + "log/benchmark/download_plasmid_database.txt" + script: + "../scripts/download_plasmid_database.sh" + + +rule spacepharer_plasmid_setup: + input: + db="resources/PLSDB/sequences.fasta", + output: + DB="results/spacepharer/plasmid_DB/targetsetDB", + control_DB="results/spacepharer/plasmid_DB/controlsetDB", + params: + tmp_folder="results/spacepharer/tmpFolder", + conda: + "../envs/spacepharer.yaml" + threads: config["spacepharer"]["threads"] + log: + "log/spacepharer/spacepharer_plasmid_setup.txt", + benchmark: + "log/benchmark/spacepharer/spacepharer_plasmid_setup.txt" + shell: + """ +plasmid_DB=$(dirname {output.DB}) +rm -f $plasmid_DB/* > {log} 2>&1 + +spacepharer createsetdb {input.db} {output.DB} {params.tmp_folder}\ + --threads {threads} >> {log} 2>&1 +spacepharer createsetdb {input.db} {output.control_DB} {params.tmp_folder}\ + --reverse-fragments 1 --threads {threads} >> {log} 2>&1 + """ + + +rule spacepharer_plasmid: + input: + phage_DB="results/spacepharer/plasmid_DB/targetsetDB", + phage_control_DB="results/spacepharer/plasmid_DB/controlsetDB", + spacer_DB="results/spacepharer/DB_CRISPR/querysetDB", + output: + result="results/spacepharer/predicted_plasmid_matches.tsv", + result_sanitised="results/spacepharer/predicted_plasmid_matches_san.tsv", + params: + tmp_folder="results/spacepharer/tmpFolder", + conda: + "../envs/spacepharer.yaml" + threads: config["spacepharer"]["threads"] + log: + "log/spacepharer/spacepharer_phage.txt", + benchmark: + "log/benchmark/spacepharer/spacepharer_phage.txt" + shell: + """ +spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\ + {input.phage_control_DB} {output.result} {params.tmp_folder}\ + --threads {threads} > {log} 2>&1 + +grep -v "#" {output.result} > {output.result_sanitised} +rm -r {params.tmp_folder} >> {log} 2>&1 + """ + + +rule create_spacepharer_table: + input: + phage="results/spacepharer/predicted_phage_matches_san.tsv", + phage_meta="resources/phagescope/merged_metadata.tsv", + plasmid="results/spacepharer/predicted_plasmid_matches_san.tsv", + plasmid_nuccore="resources/PLSDB/nuccore.csv", + plasmid_taxonomy="resources/PLSDB/taxonomy.csv", + output: + phage="results/phage_matches.tsv", + plasmid="results/plasmid_matches.tsv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/create_spacepharer_table.txt", + script: + "../scripts/create_spacepharer_table.sh" + + +## 2. By using KMA to the input genomes + + +rule kma_indexing: + input: + spacers="results/crispridentify/all_spacers.fa", + output: + indexed_spacers="results/kma/spacer_DB/spacers.name", + params: + "results/kma/spacer_DB/spacers", + conda: + "../envs/kma.yaml" + threads: config["kma"]["threads"] + log: + "log/kma/kma_index.txt", + benchmark: + "log/benchmark/kma/kma_index.txt" + shell: + """ +kma index -i {input.spacers} -o {params} > {log} 2>&1 + """ + + +rule kma: + input: + genomes=expand("resources/ATB/assemblies/{batch}/", batch=BATCHES), + indexed_spacers="results/kma/spacer_DB/spacers.name", + output: + "results/kma/output/CRISPR.frag.gz", + params: + output=subpath(output[0], parent=True), + indexed_spacers=subpath(input.indexed_spacers, parent=True), + spacers="results/crispridentify/all_spacers.fa", + conda: + "../envs/kma.yaml" + threads: config["kma"]["threads"] + log: + "log/kma/kma.txt", + benchmark: + "log/benchmark/kma/kma.txt" + shell: + """ +grep ">" {params.spacers} | cut -f 2 -d ">" | cut -f 1 -d "-" | sort -u > tmp_file +find -L {input.genomes} -mindepth 1 -maxdepth 1 -type f -name "*.fa" > all_genomes.txt +genomes=$(grep -x ".*[0-9]\\.fa" all_genomes.txt | grep -v -f tmp_file) + +kma -hmm -i $genomes -o {params.output} -t_db "{params.indexed_spacers}/spacers" > {log} 2>&1 +rm tmp_file all_genomes.txt + """ + + +rule collect_kma: + input: + "results/kma/output/CRISPR.frag.gz", + output: + "results/kma/CRISPR_alignment", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/kma/collect_kma.txt", + benchmark: + "log/benchmark/kma/collect_kma.txt" + shell: + """ +echo -e "spacer\tgenome" > {output} +zcat {input} | cut -f 6,7 | cut -f 1 -d " " > tmp_file +while read line; do + match=$(echo $line | cut -f 2) + crispr=$(echo $line | cut -f 1 | cut -f 1,6,7,10,11 -d "_") + echo -e "$crispr\t$match" >> {output} +done < tmp_file +rm tmp_file + """ diff --git a/workflow/rules/refine_crispr-cas.smk b/workflow/rules/refine_crispr-cas.smk new file mode 100644 index 0000000..c0835d1 --- /dev/null +++ b/workflow/rules/refine_crispr-cas.smk @@ -0,0 +1,113 @@ +### Refine CRISPR-Cas identifation + + +rule crispridentify: + input: + "results/cctyper/{batch}/subseq", + output: + spacers="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta", + summary="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv", + params: + out_dir=subpath(output[0], parent=True), + arrays=subpath(input[0], parent=True), + conda: + "../envs/crispridentify.yaml" + threads: config["crispridentify"]["threads"] + log: + "log/crispridentify/{batch}.txt", + benchmark: + "log/benchmark/crispridentify/{batch}.txt" + shell: + """ +cd bin/CRISPRidentify + +find ../../{params.arrays}/*/fasta/CRISPR_arrays-with_flanks.fasta -size +0c -print0 |\ + parallel -0 --jobs {threads} --retry-failed --halt='now,fail,1'\ + 'python CRISPRidentify.py --file {{}}\ + --result_folder "../../{params.out_dir}/"\ + --fasta_report True --strand False' > ../../{log} 2>&1 + """ + + +rule merge_crispridentify_batches: + input: + spacers_crispr=expand( + "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta", + batch=BATCHES, + ), + summary_crispr=expand( + "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv", + batch=BATCHES, + ), + output: + spacers_crispr="results/crispridentify/all_spacers.fa", + summary_crispr="results/crispridentify/complete_summary.csv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/merge_crispridentify_batches.txt", + benchmark: + "log/benchmark/merge_crispridentify_batches.txt" + script: + "../scripts/merge_crispridentify_batches.sh" + + +rule merge_cctyper_identify: + input: + identify="results/crispridentify/complete_summary.csv", + cctyper=expand("results/cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES), + output: + table="results/all_CRISPRS_with_identify.tab", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/merge_cctyper_identify.txt", + benchmark: + "log/benchmark/merge_cctyper_identify.txt" + script: + "../scripts/merge_cctyper_crispridentify.sh" + + +rule cluster_unique_spacers_crispridentify: + input: + "results/crispridentify/all_spacers.fa", + output: + clusters="results/crispridentify/all_spacers-clustered.clstr", + spacers="results/crispridentify/all_spacers-clustered", + distribution="results/crispridentify/all_spacers-clustered-distribution.tsv", + conda: + "../envs/cdhit.yaml" + threads: 1 + log: + "log/cluster_unique_spacers_identify.txt", + benchmark: + "log/benchmark/cluster_unique_spacers_identify.txt" + shell: + """ +cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\ + -i {input} -o {output.spacers} > {log} 2>&1 + +plot_len1.pl {output.clusters}\ + 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\ + 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\ + > {output.distribution} + """ + + +rule create_crispr_cluster_table_identify: + input: + clstr="results/crispridentify/all_spacers-clustered.clstr", + fasta="results/crispridentify/all_spacers.fa", + output: + "results/all_spacers_table_identify.tsv", + conda: + "../envs/pyfaidx_pandas.yaml" + threads: 1 + log: + "log/create_crispr_cluster_table_identify.txt", + benchmark: + "log/benchmark/create_crispr_cluster_table_identify.txt" + script: + "../scripts/make_cluster_table_identify.py" diff --git a/workflow/rules/screen_crispr-cas.smk b/workflow/rules/screen_crispr-cas.smk new file mode 100644 index 0000000..7d78f56 --- /dev/null +++ b/workflow/rules/screen_crispr-cas.smk @@ -0,0 +1,195 @@ +### Screen for CRISPR-Cas +## Run CCTyper and parse/merge its output + + +rule crisprcastyper: + input: + batch="resources/ATB/assemblies/{batch}/", + output: + "results/cctyper/{batch}/complete", + params: + out_dir=subpath(output[0], parent=True), + conda: + "../envs/cctyper.yaml" + threads: config["cctyper"]["threads"] + log: + "log/cctyper/{batch}.txt", + benchmark: + "log/benchmark/cctyper/{batch}.txt" + shell: + """ +find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\ + parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + 'rm -rf "{params.out_dir}{{/.}}" &&\ + cctyper -t 1 {{}} "{params.out_dir}/{{/.}}"' > {log} 2>&1 + +touch {output} + """ + + +rule parse_cctyper: + input: + "results/cctyper/{batch}/complete", + output: + "results/cctyper/{batch}/parsed", + conda: + "../envs/pandas.yaml" + threads: config["parse_cctyper"]["threads"] + log: + "log/parse_cctyper/{batch}.txt", + benchmark: + "log/benchmark/parse_cctyper/{batch}.txt" + shell: + """ +find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\ +parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + python workflow/scripts/cctyper_extender.py -d {{.}} > {log} 2>&1 + +touch {output} + """ + + +rule extract_sequences: + input: + flag="results/cctyper/{batch}/parsed", + genomes="resources/ATB/assemblies/{batch}", + output: + "results/cctyper/{batch}/subseq", + conda: + "../envs/seqkit.yaml" + threads: config["extract_sequences"]["threads"] + log: + "log/extract_sequences/{batch}.txt", + benchmark: + "log/benchmark/extract_sequences/{batch}.txt" + shell: + """ +find $(dirname {input.flag}) -mindepth 1 -maxdepth 1 -type d -print0 |\ +parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\ + bash workflow/scripts/extract_crispr-cas_from_fasta.sh {{}} {input.genomes} > {log} 2>&1 + +touch {output} + """ + + +rule collect_cctyper: + input: + cctyper="results/cctyper/{batch}/complete", + parser="results/cctyper/{batch}/parsed", + output: + crispr_cas="results/cctyper/{batch}/CRISPR_Cas-{batch}.tab", + crisprs_all="results/cctyper/{batch}/crisprs_all-{batch}.tab", + crisprs_near_cas="results/cctyper/{batch}/crisprs_near_cas-{batch}.tab", + crisprs_orphan="results/cctyper/{batch}/crisprs_orphan-{batch}.tab", + spacers="results/cctyper/{batch}/all_spacers-{batch}.fa", + cas_putative="results/cctyper/{batch}/cas_operons_putative-{batch}.tab", + cas="results/cctyper/{batch}/cas_operons-{batch}.tab", + csv="results/cctyper/{batch}/CRISPR-Cas-{batch}.csv", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/cctyper/collect_{batch}.txt", + benchmark: + "log/benchmark/cctyper/collect_{batch}.txt" + shell: + """ +bash workflow/scripts/concatenate_cctyper_output.sh $(dirname {input.cctyper}) > {log} 2>&1 +echo "\n========================" >> {log} +bash workflow/scripts/concatenate_cctyper_csv.sh $(dirname {input.parser}) >> {log} 2>&1 + +find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat {{}} + > {output.spacers} 2>> {log} + """ + + +rule concatenate_all_spacers: + input: + expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES), + output: + "results/cctyper/all_spacers.fa", + conda: + "../envs/bash.yaml" + threads: 1 + log: + "log/concatenate_all_spacers.txt", + benchmark: + "log/benchmark/concatenate_all_spacers.txt" + shell: + """ +cat {input} > {output} 2> {log} + """ + + +rule cluster_all_spacers: + input: + "results/cctyper/all_spacers.fa", + output: + clusters=expand( + "results/cctyper/all_spacers-clustered-{cutoff}.clstr", + cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], + ), + spacers=expand( + "results/cctyper/all_spacers-clustered-{cutoff}", + cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81], + ), + summary="results/cctyper/spacer_cluster_summary.tsv", + params: + work_dir=subpath(input[0], parent=True), + log_dir="log/spacer_clustering", + conda: + "../envs/cdhit.yaml" + threads: 1 + log: + "log/cluster_all_spacers.txt", + benchmark: + "log/benchmark/cluster_all_spacers.txt" + shell: + """ +bash workflow/scripts/cluster_all_spacers.sh\ + {input}\ + {params.work_dir}\ + {params.log_dir} > {log} 2>&1 + """ + + +rule cluster_unique_spacers: + input: + "results/cctyper/all_spacers.fa", + output: + clusters="results/cctyper/all_spacers-clustered.clstr", + spacers="results/cctyper/all_spacers-clustered", + distribution="results/cctyper/all_spacers-clustered-distribution.tsv", + conda: + "../envs/cdhit.yaml" + threads: 1 + log: + "log/cluster_unique_spacers.txt", + benchmark: + "log/benchmark/cluster_unique_spacers.txt" + shell: + """ +cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\ + -i {input} -o {output.spacers} > {log} 2>&1 + +plot_len1.pl {output.clusters}\ + 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\ + 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\ + > {output.distribution} + """ + + +rule create_crispr_cluster_table: + input: + clstr="results/cctyper/all_spacers-clustered.clstr", + fasta="results/cctyper/all_spacers.fa", + output: + "results/all_spacers_table.tsv", + conda: + "../envs/pyfaidx_pandas.yaml" + threads: 1 + log: + "log/create_crispr_cluster_table.txt", + benchmark: + "log/benchmark/create_crispr_cluster_table.txt" + script: + "../scripts/make_cluster_table.py" diff --git a/bin/cctyper_extender.py b/workflow/scripts/cctyper_extender.py similarity index 100% rename from bin/cctyper_extender.py rename to workflow/scripts/cctyper_extender.py diff --git a/bin/cluster_all_spacers.sh b/workflow/scripts/cluster_all_spacers.sh similarity index 100% rename from bin/cluster_all_spacers.sh rename to workflow/scripts/cluster_all_spacers.sh diff --git a/bin/collect_genomad_predictions.R b/workflow/scripts/collect_genomad_predictions.R similarity index 90% rename from bin/collect_genomad_predictions.R rename to workflow/scripts/collect_genomad_predictions.R index 0cc783e..7356717 100644 --- a/bin/collect_genomad_predictions.R +++ b/workflow/scripts/collect_genomad_predictions.R @@ -17,13 +17,13 @@ read_stats <- function(filename, name_position) { sample_name <- str_split_1(string = filename, pattern = "/") %>% tail(name_position) %>% head(1) - + df <- read_delim( file = filename, show_col_types = FALSE ) %>% mutate(batch = sample_name) - + return(df) } @@ -34,10 +34,11 @@ genomad_scores <- do.call( rename("contig" = "seq_name") genomad_scores <- genomad_scores %>% - mutate(genome = gsub(pattern = "\\.contig[0-9]*", - replacement = "", - x = contig) - ) + mutate(genome = gsub( + pattern = "\\.contig[0-9]*", + replacement = "", + x = contig + )) plasmid_classifications <- do.call( rbind, @@ -78,10 +79,10 @@ genomad_df <- left_join( !is.na(plasmid_topology) ~ "plasmid", !is.na(virus_topology) ~ "virus", TRUE ~ "chromosome" - ) + ) ) write_csv( x = genomad_df, - file = snakemake@output[[1]] #here("data", "processed", "genomad_predictions.csv") -) \ No newline at end of file + file = snakemake@output[[1]] # here("data", "processed", "genomad_predictions.csv") +) diff --git a/bin/collect_jaeger_batch.R b/workflow/scripts/collect_jaeger_batch.R similarity index 61% rename from bin/collect_jaeger_batch.R rename to workflow/scripts/collect_jaeger_batch.R index 167e0aa..ef50d11 100644 --- a/bin/collect_jaeger_batch.R +++ b/workflow/scripts/collect_jaeger_batch.R @@ -9,7 +9,7 @@ suppressPackageStartupMessages({ }) # Read all output files in a batch (one per genome) -jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv")) +jaeger_files <- Sys.glob(paths = here(snakemake@params[["batch"]], "*", "*_default_jaeger.tsv")) # Concatenate them all in one dataframe jaeger_df <- do.call( @@ -20,15 +20,21 @@ jaeger_df <- do.call( # Simplify the dataframe by extracting essential info jaeger_df_simple <- jaeger_df %>% mutate( - contig = gsub(pattern = " .*", - replacement = "", - x = contig_id), - accession_id = gsub(pattern = ".contig[0-9]*", - replacement = "", - x = contig) + contig = gsub( + pattern = " .*", + replacement = "", + x = contig_id + ), + accession_id = gsub( + pattern = ".contig[0-9]*", + replacement = "", + x = contig + ) ) %>% select(accession_id, contig, length, prediction, reliability_score, prophage_contam) # And write to a CSV file (which can easily be concatenated with a script) -write_csv(x = jaeger_df_simple, - file = snakemake@output[[1]]) \ No newline at end of file +write_csv( + x = jaeger_df_simple, + file = snakemake@output[[1]] +) diff --git a/bin/collect_jaeger_predictions.sh b/workflow/scripts/collect_jaeger_predictions.sh similarity index 100% rename from bin/collect_jaeger_predictions.sh rename to workflow/scripts/collect_jaeger_predictions.sh diff --git a/bin/concatenate_cctyper_csv.sh b/workflow/scripts/concatenate_cctyper_csv.sh similarity index 100% rename from bin/concatenate_cctyper_csv.sh rename to workflow/scripts/concatenate_cctyper_csv.sh diff --git a/bin/concatenate_cctyper_output.sh b/workflow/scripts/concatenate_cctyper_output.sh similarity index 100% rename from bin/concatenate_cctyper_output.sh rename to workflow/scripts/concatenate_cctyper_output.sh diff --git a/bin/create_spacepharer_table.sh b/workflow/scripts/create_spacepharer_table.sh similarity index 81% rename from bin/create_spacepharer_table.sh rename to workflow/scripts/create_spacepharer_table.sh index 888b286..ecf3812 100644 --- a/bin/create_spacepharer_table.sh +++ b/workflow/scripts/create_spacepharer_table.sh @@ -7,7 +7,7 @@ echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > ${snakemake_output[phage]} while read line; do ID=$(echo $line | cut -f 2) - metadata_match=$(grep -w "$ID" ${snakemake_input[meta_phage]}/merged_metadata.tsv | cut -f 2-7) + metadata_match=$(grep -w "$ID" "${snakemake_input[phage_meta]}" | cut -f 2-7) echo -e "$line\t$metadata_match" >> ${snakemake_output[phage]} done < ${snakemake_input[phage]} @@ -16,7 +16,7 @@ echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end while read line; do ID=$(echo $line | cut -f 2) #PLSDB uses a metadata system where there are many different files for differing purposes. taxonomy.csv uses a different ID so the taxonomy ID needs to be collected from nuccore and then matched to taxonomy - nuccore_match=$(grep -w "$ID" ${snakemake_input[meta_plasmid]}/nuccore.csv | cut -f 13 -d ",") - taxonomy_match=$(grep -w "^$nuccore_match" ${snakemake_input[meta_plasmid]}/taxonomy.csv | cut -f 3 -d ",") + nuccore_match=$(grep -w "$ID" "${snakemake_input[plasmid_nuccore]}" | cut -f 13 -d ",") + taxonomy_match=$(grep -w "^$nuccore_match" "${snakemake_input[plasmid_taxonomy]}" | cut -f 3 -d ",") echo -e "$line\t$taxonomy_match" >> ${snakemake_output[plasmid]} done < ${snakemake_input[plasmid]} diff --git a/workflow/scripts/download_phage_database.sh b/workflow/scripts/download_phage_database.sh new file mode 100644 index 0000000..f84fd19 --- /dev/null +++ b/workflow/scripts/download_phage_database.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +## This script downloads and prepares the phage database: Phagescope +## (https://phagescope.deepomics.org/) for use with SpacePHARER. +set -euo pipefail + +exec > "${snakemake_log[out]}" # send stdout to a log file +exec 2> "${snakemake_log[err]}" # also send stderr to a log file + +. "${snakemake[scriptdir]}/utils.sh" + +threads=${snakemake[threads]} +phage_dir="${snakemake_output[phage_dir]}" +databases=(${snakemake_params[databases]}) + +message "Downloading PhageScope databases (archived fasta files)" +message "This comprises ${#databases[@]} parts, as specified in 'config/parameters.yaml'" +echo "-----" + +for DB in "${databases[@]}" +do + message "Downloading ${DB}" + wget -O "${phage_dir}/${DB}.tar.gz"\ + "https://phageapi.deepomics.org/download/phage/fasta/?datasource=${DB}" +done + +echo "-----" +message "Extracting databases\n" + +# Extracts the .tar.gz archives to subdirectories with separate fasta files. + +parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\ + [ -f "${phage_dir}/${DB}.fasta" ] || [ -d "${phage_dir}/${DB}" ] && \ + message "${DB} already extracted, skipping..." || \ + ( message "Extracting ${DB}"; tar -xzf "${phage_dir}/${DB}.tar.gz" -C "${phage_dir}/" )'\ + ::: "${databases[@]}" ::: ${phage_dir} + +echo "-----" +message "Concatenating PhageScope sequences\n" + +# Concatenate separate fasta files in one long file, +# and then remove the directory with the separate files. + +parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\ + ( message "Concatenating ${DB}"; genomes=$(find "${phage_dir}/${DB}" -type f -name "*.fasta");\ + > "${phage_dir}/${DB}.fasta";\ + for files in ${genomes}; do cat ${files} >> "${phage_dir}/${DB}.fasta"; done; + rm -r "${phage_dir}/${DB}")' \ + ::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD ::: ${phage_dir} + +echo "-----" +message "Downloading Phagescope metadata\n" + +# Download metadata as tab-separated values (TSV) text file. + +for DB in "${databases[@]}" +do + metadata_file="${phage_dir}/${DB}_phage_metadata.tsv" + message "Downloading ${DB}" + wget -O "${metadata_file}" "https://phageapi.deepomics.org/files/Download/Phage_meta_data/${DB,,}_phage_meta_data.tsv" + # With ${DB,,} the string variable is converted to lowercase +done + +echo "-----" +message "Concatenating PhageScope metadata files\n" + +# Make one long file with all metadata from the different databases. +combined_metadata="${snakemake_output[combined_meta]}" + +# Rename the first metadata file into the concatenated file +mv "${phage_dir}/${databases[0]}_phage_metadata.tsv" ${combined_metadata} + +# Then, for all the other metadata files +for DB in "${databases[@]:1}" +do + metadata_file="${phage_dir}/${DB}_phage_metadata.tsv" + # Take all but the first line (header) + tail -n +2 "${metadata_file}" >> "${combined_metadata}" + # and remove (clean-up, avoid keeping duplicated data) + rm ${metadata_file} +done + +message "--- Done! ---" diff --git a/workflow/scripts/download_plasmid_database.sh b/workflow/scripts/download_plasmid_database.sh new file mode 100644 index 0000000..440596c --- /dev/null +++ b/workflow/scripts/download_plasmid_database.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +## This script downloads and prepares the plasmid database: PLSDB +## (https://ccb-microbe.cs.uni-saarland.de/plsdb2025/) for use with SpacePHARER. +set -euo pipefail + +exec > "${snakemake_log[out]}" # send all stdout to a log file +exec 2> "${snakemake_log[err]}" # send stderr to separate log file + +. "${snakemake[scriptdir]}/utils.sh" + +plasmid_dir=${snakemake_output[plasmid_dir]} +plasmid_archive="${plasmid_dir}/download_meta.tar.gz" + +message "Downloading PLSDB plasmid database" +echo "-----" + +wget -P "${plasmid_dir}" https://ccb-microbe.cs.uni-saarland.de/plsdb2025/download_meta.tar.gz + +message "Extracting PLSDB" +tar -xzf "${plasmid_archive}" -C "${plasmid_dir}" + +message "Extracting sequences" +bzip2 -d "${plasmid_dir}/sequences.fasta.bz2" + +message "Adjusting metadata delimiters" +sed -i -E ':a;s/"([^"]*),([^"]*)"/"\1\2"/g;ta' "${plasmid_dir}/nuccore.csv" + +echo -e "-----\nDone!\n-----" diff --git a/bin/extract_crispr-cas_from_fasta.sh b/workflow/scripts/extract_crispr-cas_from_fasta.sh similarity index 89% rename from bin/extract_crispr-cas_from_fasta.sh rename to workflow/scripts/extract_crispr-cas_from_fasta.sh index a012e77..177b8c7 100644 --- a/bin/extract_crispr-cas_from_fasta.sh +++ b/workflow/scripts/extract_crispr-cas_from_fasta.sh @@ -3,8 +3,8 @@ cctyper_dir=$1 sample_name=$(basename ${cctyper_dir}) -batch_name=$(basename $(dirname ${cctyper_dir})) -fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa" +genome_dir=$2 +fasta_file="${genome_dir}/${sample_name}.fa" bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") ) diff --git a/workflow/scripts/make_cluster_table.py b/workflow/scripts/make_cluster_table.py new file mode 100644 index 0000000..11b9d84 --- /dev/null +++ b/workflow/scripts/make_cluster_table.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +################################################################ +# Read CD-HIT's output cluster file, along with the FASTA used # +# to generate it to create a table of clustered sequences. # +# - Aim of the program: # +# 1) read the cluster file # +# a) create a cluster-based dictionary (cluster #, # +# representative) # +# b) create a sequence-based dictionary (ID, length, # +# representative or % identity and strand) # +# 2) combine the two dictionaries into a dataframe with # +# columns: genome, contig, locus, cluster ID, length, # +# longest_sequence (CD-HIT representative), sequence, # +# identity (%), strand # +# 3) for each cluster, also mark the shortest and most common# +# sequence (as alternative representatives) # +# 4) write the dataframe to a tab-separated table file # +################################################################ + +import re +from pyfaidx import Fasta +import pandas as pd + +cluster_file = snakemake.input["clstr"] +fasta_file = snakemake.input["fasta"] +table_file = snakemake.output[0] + + +def read_clstr_file(inputfile=str): + """ + Read a CD-HIT generated .clstr file and parse its elements into + two dictionaries: 1) cluster ID + representative (locus ID), + 2) sequences (locus ID) with length and + representative/identity + """ + cluster_dict = {"Cluster": [], "Longest_sequence": []} + locus_dict = { + "Genome": [], + "Contig": [], + "Locus": [], + "Full_locus": [], + "Length": [], + "Cluster": [], + "Strand_to_longest": [], + "Identity_to_longest": [], + } + + cluster_regex = r"(>Cluster *)(\d*)" + locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_\d+:\d+)... (.*)$" + + with open(inputfile, "r") as infile: + for line in infile: + line = line.strip() # Remove funny characters + + if line.startswith(">"): + # Extract the digits from the cluster ID + cluster = re.search(cluster_regex, line).group(2) + + elif len(line) > 1: + # Use RegEx to extract information + crispr_info = re.search(locus_regex, line) + + member_nr = crispr_info.group(1) # not used + length = int(crispr_info.group(2).rstrip("nt")) + genome = crispr_info.group(3) + locus = crispr_info.group(4) + full_locus = f"{genome}.{locus}" + contig = locus.split("_")[0] + extra = crispr_info.group(5) + + # Check the final group for representative ('*') or other + if extra == "*": + strand = "NA" + identity = "NA" + cluster_dict["Cluster"].append(cluster) + cluster_dict["Longest_sequence"].append(full_locus) + + else: + strand_and_identity = extra.split("/") + strand = strand_and_identity[0].replace("at ", "") + identity = strand_and_identity[1] + + locus_dict["Genome"].append(genome) + locus_dict["Contig"].append(contig) + locus_dict["Locus"].append(locus) + locus_dict["Full_locus"].append(full_locus) + locus_dict["Length"].append(length) + locus_dict["Cluster"].append(cluster) + locus_dict["Strand_to_longest"].append(strand) + locus_dict["Identity_to_longest"].append(identity) + + # If the line does not start with '>' or have length > 1, stop + else: + break + + return cluster_dict, locus_dict + + +def generate_sequence_df(fasta=str, ids=list): + """ + Given a fasta file and list of identifiers, create a dataframe + of DNA sequences that can be merged with the cluster/locus dataframe. + """ + sequence_dict = Fasta(fasta, duplicate_action="first") + sequence_list = [] + for locus in ids: + sequence_list.append(sequence_dict[locus][:].seq) + + return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list}) + + +def main(): + """ + Main function, running the whole script. + """ + # Read clstr file, store as dictionaries + cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file) + + # Convert dictionaries to dataframes + cluster_df = pd.DataFrame(cluster_dict) + locus_df = pd.DataFrame(locus_dict) + + # Merge dataframes + combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster") + + # Add sequences + sequence_df = generate_sequence_df(fasta=fasta_file, ids=locus_df["Full_locus"]) + + combined_with_sequences = combined_df.merge( + sequence_df, how="inner", on="Full_locus" + ) + + ## Not yet implemented: + # Find shortest sequence per cluster + # Find most common sequence per cluster + + # Save as tab-separated text file + combined_with_sequences.to_csv(table_file, sep="\t", index=False) + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/workflow/scripts/make_cluster_table_identify.py b/workflow/scripts/make_cluster_table_identify.py new file mode 100644 index 0000000..237361f --- /dev/null +++ b/workflow/scripts/make_cluster_table_identify.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +################################################################ +# Read CD-HIT's output cluster file, along with the FASTA used # +# to generate it to create a table of clustered sequences. # +# - Aim of the program: # +# 1) read the cluster file # +# a) create a cluster-based dictionary (cluster #, # +# representative) # +# b) create a sequence-based dictionary (ID, length, # +# representative or % identity and strand) # +# 2) combine the two dictionaries into a dataframe with # +# columns: genome, contig, locus, cluster ID, length, # +# longest_sequence (CD-HIT representative), sequence, # +# identity (%), strand # +# 3) for each cluster, also mark the shortest and most common# +# sequence (as alternative representatives) # +# 4) write the dataframe to a tab-separated table file # +################################################################ + +## This is a slightly adjusted script from make_cluster_table.py as the CRISPRidentify output is different. + +import re +from pyfaidx import Fasta +import pandas as pd + +cluster_file = snakemake.input["clstr"] +fasta_file = snakemake.input["fasta"] +table_file = snakemake.output[0] + + +def read_clstr_file(inputfile=str): + """ + Read a CD-HIT generated .clstr file and parse its elements into + two dictionaries: 1) cluster ID + representative (locus ID), + 2) sequences (locus ID) with length and + representative/identity + """ + cluster_dict = {"Cluster": [], "Longest_sequence": []} + locus_dict = { + "Genome": [], + "Contig": [], + "Locus": [], + "Full_locus": [], + "Length": [], + "Cluster": [], + "Strand_to_longest": [], + "Identity_to_longest": [], + } + + cluster_regex = r"(>Cluster *)(\d*)" + locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_.+)\.\.\. (.*)$" + + with open(inputfile, "r") as infile: + for line in infile: + line = line.strip() # Remove funny characters + + if line.startswith(">"): + # Extract the digits from the cluster ID + cluster = re.search(cluster_regex, line).group(2) + + elif len(line) > 1: + # Use RegEx to extract information + crispr_info = re.search(locus_regex, line) + + member_nr = crispr_info.group(1) # not used + length = int(crispr_info.group(2).rstrip("nt")) + genome = crispr_info.group(3) + locus = crispr_info.group(4) + full_locus = f"{genome}-{locus}" + contig = locus.split("_")[0] + extra = crispr_info.group(5) + + # Check the final group for representative ('*') or other + if extra == "*": + strand = "NA" + identity = "NA" + cluster_dict["Cluster"].append(cluster) + cluster_dict["Longest_sequence"].append(full_locus) + + else: + strand_and_identity = extra.split("/") + strand = strand_and_identity[0].replace("at ", "") + identity = strand_and_identity[1] + + locus_dict["Genome"].append(genome) + locus_dict["Contig"].append(contig) + locus_dict["Locus"].append(locus) + locus_dict["Full_locus"].append(full_locus) + locus_dict["Length"].append(length) + locus_dict["Cluster"].append(cluster) + locus_dict["Strand_to_longest"].append(strand) + locus_dict["Identity_to_longest"].append(identity) + + # If the line does not start with '>' or have length > 1, stop + else: + break + + return cluster_dict, locus_dict + + +def generate_sequence_df(fasta=str, ids=list): + """ + Given a fasta file and list of identifiers, create a dataframe + of DNA sequences that can be merged with the cluster/locus dataframe. + """ + sequence_dict = Fasta(fasta, duplicate_action="first") + sequence_list = [] + for locus in ids: + sequence_list.append(sequence_dict[locus][:].seq) + + return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list}) + + +def main(): + """ + Main function, running the whole script. + """ + # Read clstr file, store as dictionaries + cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file) + + # Convert dictionaries to dataframes + cluster_df = pd.DataFrame(cluster_dict) + locus_df = pd.DataFrame(locus_dict) + + # Merge dataframes + combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster") + + # Add sequences + sequence_df = generate_sequence_df(fasta=fasta_file, ids=locus_df["Full_locus"]) + + combined_with_sequences = combined_df.merge( + sequence_df, how="inner", on="Full_locus" + ) + + ## Not yet implemented: + # Find shortest sequence per cluster + # Find most common sequence per cluster + + # Save as tab-separated text file + combined_with_sequences.to_csv(table_file, sep="\t", index=False) + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/workflow/scripts/merge_cctyper_crispridentify.sh b/workflow/scripts/merge_cctyper_crispridentify.sh new file mode 100644 index 0000000..e59393d --- /dev/null +++ b/workflow/scripts/merge_cctyper_crispridentify.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +exec 2> "${snakemake_log[0]}" # send all stderr to the log file + +cctyper=(${snakemake_input[cctyper]}) + +first=True +for summary in "${cctyper[@]}" +do + # If it is the first, copy the whole file (including header) + if [ "${first}" == True ] + then + cat "${summary}" > tmp_file1 + first=False + # otherwise, only take the 'contents', without header + else + tail -n +2 "${summary}" >> tmp_file1 + fi + # And write it all in one concatenated temporary file: 'tmp_file1' +done + +# For CRISPRidentify, collect the header as variable +header=$(head -n 1 ${snakemake_input[identify]} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t") +# and write the contents of the 'complete summary' to another temporary file: 'tmp_file2' +tail -n +2 ${snakemake_input[identify]} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t" > tmp_file2 + +first=True + +# Start reading 'tmp_file1' = CCTyper CRISPRs +while read line +do + if [ "${first}" == True ] + # For the first line... + then + first=False + # ...concatenate the header with CRISPRidentify's header + echo -e "$line\t$header" > ${snakemake_output[table]} + else + + # For all other lines, + sample=$(echo -e "${line}" | cut -f 1) + start_cc=$(echo -e "${line}" | cut -f 3) + start_id=$(expr "${start_cc}" + 1) + # see if the sample has a match in CRISPRidentify (tmp_file2) + match=$(grep "${sample}_${start_id}" tmp_file2 || true) + + if [ -z "${match}" ] + then + # If there's *no* match (-z = lenth 0) + echo -e "${line}" >> ${snakemak_output[table]} + # Just copy the line from CCTyper + else + # but if there is a match, + while read line2 + do + if [ "${start_cc}" -lt 5000 ]; + # check if the reported start position is greater than 5000 (the length of the flanking regions), + # and match the lines from CCTyper (line) and CRISPRidentify (match/line2) accordingly + then + echo -e "${line}\t${match}" >> ${snakemake_output[table]} + else + start=$(echo -e "${line2}" | cut -f 2) + start=$(expr "${start}" + "${start_cc}" - 5000) + length=$(echo -e "${line2}" | cut -f 4) + end=$(expr "${length}" + "${start}" - 1) + begin=$(echo -e "${line2}" | cut -f 1) + rest=$(echo -e "${line2}" | cut -f 4-9) + echo -e "${line}\t${begin}\t${start}\t${end}\t${rest}" >> ${snakemake_output[table]} + fi + done <<< "${match}" + fi + fi +done < tmp_file1 + +# Remove the temporary files +rm -f tmp_file1 tmp_file2 diff --git a/workflow/scripts/merge_crispridentify_batches.sh b/workflow/scripts/merge_crispridentify_batches.sh new file mode 100644 index 0000000..caca261 --- /dev/null +++ b/workflow/scripts/merge_crispridentify_batches.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +exec 2> "${snakemake_log[0]}" # send all stderr to the log file + +spacers_crispr=(${snakemake_input[spacers_crispr]}) +summary_crispr=(${snakemake_input[summary_crispr]}) + +> ${snakemake_output[spacers_crispr]} +for spacers in "${spacers_crispr[@]}" +do + cat "${spacers}" >> ${snakemake_output[spacers_crispr]} +done + +for summary in "${summary_crispr[@]}" +do + header=$(head -n 1 "${summary}") + if [ "${header}" == "No arrays found" ] + then + continue + else + echo "${header}" > ${snakemake_output[summary_crispr]} + break + fi +done + +for summary in "${summary_crispr[@]}" +do + tail -n +2 "${summary}" >> ${snakemake_output[summary_crispr]} +done