Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
398 changes: 198 additions & 200 deletions Snakefile

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions bin/collect_genomad_predictions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

# Find output files for geNomad and create one overall prediction report.

library(tidyverse)
library(here)
suppressPackageStartupMessages({
library(tidyverse)
library(here)
})

genomad_scores_files <- snakemake@input[["aggregated_classification"]]
genomad_plasmid_files <- snakemake@input[["plasmid_summary"]]
Expand Down
6 changes: 4 additions & 2 deletions bin/collect_jaeger_batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
# Concatenate Jaeger output files per batch while selecting only the most
# relevant, easy-to-use columns.

library(here)
library(tidyverse)
suppressPackageStartupMessages({
library(tidyverse)
library(here)
})

# Read all output files in a batch (one per genome)
jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
Expand Down
22 changes: 22 additions & 0 deletions bin/create_spacepharer_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

## This script uses the output of the two SpacePharer searches created by the
## spacepharer rules. It assumes that PhageScope and PLSDB are the databases used.
## For every match it extracts the ID (taken directly from the database), looks
## that ID up in the database metadata, and attaches the metadata to the match.
##
## Note: deliberately no 'set -e' here — a match without metadata (grep finds
## nothing) should yield an empty annotation, not abort the script.

# SpacePharer does not emit its own column names, so the header (including the
# manually added metadata column names) is written first.
echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > "${snakemake_output[phage]}"
while IFS= read -r line; do
    # Quote "$line": unquoted, word splitting collapses the tab separators to
    # spaces, and the tab-delimited 'cut -f 2' would then return the whole
    # line instead of the phage accession.
    ID=$(echo "$line" | cut -f 2)
    metadata_match=$(grep -w "$ID" "${snakemake_input[meta_phage]}/merged_metadata.tsv" | cut -f 2-7)
    echo -e "$line\t$metadata_match" >> "${snakemake_output[phage]}"
done < "${snakemake_input[phage]}"

echo "starting plasmid"
# NOTE(review): the plasmid header reuses the 'phage_*' column names — confirm
# this is intentional for downstream consumers before renaming.
echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\ttaxonomy" > "${snakemake_output[plasmid]}"
while IFS= read -r line; do
    ID=$(echo "$line" | cut -f 2)
    # PLSDB spreads its metadata over several files with differing IDs:
    # nuccore.csv maps the match ID to a taxonomy ID (column 13), which is
    # then looked up in taxonomy.csv to get the taxonomy (column 3).
    nuccore_match=$(grep -w "$ID" "${snakemake_input[meta_plasmid]}/nuccore.csv" | cut -f 13 -d ",")
    taxonomy_match=$(grep -w "^$nuccore_match" "${snakemake_input[meta_plasmid]}/taxonomy.csv" | cut -f 3 -d ",")
    echo -e "$line\t$taxonomy_match" >> "${snakemake_output[plasmid]}"
done < "${snakemake_input[plasmid]}"
3 changes: 3 additions & 0 deletions bin/download_ATB_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,7 @@ download_if_not_present ${output_dir}checkm2.tsv.gz https://osf.io/download/289f
download_if_not_present ${output_dir}species_calls.tsv.gz https://osf.io/download/7t9qd/

# Get a list of all files:
download_if_not_present ${output_dir}all_atb_files.tsv https://osf.io/download/r6gcp/

# And a list with sample - batch matches
download_if_not_present ${output_dir}file_list.all.20240805.tsv.gz https://osf.io/download/dw3h7
72 changes: 72 additions & 0 deletions bin/download_bakta_annotations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/
# (IFS is newline-only so that ${line} below keeps its tab separators intact
# for 'cut'.)

# Download the Bakta (gene annotation) archives from ATB for the batches listed
# in data/ATB/batches_to_download.tsv, check their md5 checksums, and extract
# them to data/tmp/annotations/.
#
# Usage: bash bin/download_bakta_annotations.sh [update|original|all]
# Default ('update'): only the incremental-release batches.

part=${1:-"update"}

if [ "${part}" == "update" ]
then
download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv)

elif [ "${part}" == "original" ]
then
download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv)

elif [ "${part}" == "all" ]
then
download_list=$(cat data/ATB/batches_to_download.tsv)

else
download_list=""
echo "Unknown argument provided! Please use 'all', 'original', or 'update'."
echo "Or none to use the default (=update)."
fi

for line in ${download_list}
do
    # Column 1 holds the assembly archive name; the matching Bakta archive has
    # the same name with 'assembly' replaced by 'bakta'.
    filename=$(echo ${line} | cut -f 1 | sed -e 's/assembly/bakta/')
    # Look up the download URL (column 4) and md5sum (column 5) for the archive.
    url=$(grep "${filename}" data/ATB/all_atb_files.tsv | cut -f 4)
    checksum=$(grep "${filename}" data/ATB/all_atb_files.tsv | cut -f 5)
    echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}"

    mkdir -p data/tmp/ATB

    outputfile="data/tmp/ATB/${filename}"

    # If the output file is not a file of size greater than zero
    if [ ! -s ${outputfile} ]
    then
        # Then download it
        wget -O ${outputfile} ${url}
    else
        echo "${outputfile} has been downloaded before!"
    fi

    # Check the md5sum. 'md5sum -c' expects lines of the form "HASH  FILE"
    # (two spaces between the hash and the file name).
    if echo "${checksum}  ${outputfile}" | md5sum -c --status;
    then
        echo "OK: md5sum for ${outputfile} is correct!"
    else
        # If it is wrong, delete the file and skip extraction for this batch;
        # otherwise tar would fail on the missing file and, under 'set -e',
        # abort the whole run. Re-run the script to download it again.
        echo "BAD: md5sum for ${outputfile} is incorrect... Deleting file"
        rm ${outputfile}
        continue
    fi

    batchdir="data/tmp/annotations"
    mkdir -p ${batchdir}

    # Skip extraction if this batch was already extracted (assumes the archive
    # unpacks a top-level directory named like the archive — TODO confirm).
    if [ ! -d "${batchdir}/$(basename "${outputfile}" .tar.xz)" ]
    then
        echo "Extracting ${outputfile}!"

        # Decompress the XZ archive and send the output to the specified directory.
        tar -Jxf ${outputfile} -C ${batchdir}

    else
        echo "Bakta files have been extracted previously!"
    fi
done
23 changes: 15 additions & 8 deletions bin/download_genomes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@ set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/
# This script does:
# 1. Look for the batches of interest,
# 2. download them from ATB,
# 3. check their md5 checksum (file integrity)
# 4. extract to 'data/tmp/assemblies'

# Note: if a downloaded archive's md5sum does not match (e.g. the download was
# incomplete), the script deletes that file but does not retry the download in
# the same run — re-run the script to fetch the missing batches. Files that are
# already present and intact are skipped, so nothing is downloaded twice.

part=${1:-"update"}

Expand Down Expand Up @@ -52,19 +62,16 @@ do
fi

# Extract the batch number from the file name
batchnumber=$(basename ${outputfile} | cut -f 6 -d '.')
batchdir="data/tmp/ATB/batch_${batchnumber}"
batchdir="data/tmp/assemblies/"
mkdir -p ${batchdir}

# If the batch directory has not been made yet
if [ ! -d ${batchdir} ]
if [ ! -d "${batchdir}${outputfile/.tar.xz/}" ]
then
echo "Extracting ${outputfile}!"

mkdir -p ${batchdir}

# Decompress the XZ archive, send the output to the specified directory,
# and strip the leading directory from within the XZ archive.
tar -Jxf ${outputfile} -C ${batchdir} --strip-components 1
# Decompress the XZ archive and send the output to the specified directory.
tar -Jxf ${outputfile} -C ${batchdir}

else
echo "Fasta files have been extracted previously!"
Expand Down
2 changes: 1 addition & 1 deletion bin/extract_crispr-cas_from_fasta.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ cctyper_dir=$1

sample_name=$(basename ${cctyper_dir})
batch_name=$(basename $(dirname ${cctyper_dir}))
fasta_file="data/tmp/ATB/${batch_name}/${sample_name}.fa"
fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa"

bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") )

Expand Down
112 changes: 95 additions & 17 deletions bin/prepare_genomes.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/

## Prepare genome sequences for CRISPR analysis
##
Expand Down Expand Up @@ -42,22 +46,62 @@ echo "----------"
# and cut column 1 which contains the accession IDs
echo "Step 2: extracting sample accession IDs of species of interest"
species_samples_file="${output_dir}all_samples_of_interest.txt"

echo -e "Species of interest:\n$(cat config/species_of_interest.txt)"

total_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | wc -l)
hq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l)
lq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l)

echo -e "Total_genomes\t${total_genomes}" > data/tmp/total_genomes_of_interest.tsv
echo -e "High-quality_genomes\t${hq_genomes}" >> data/tmp/total_genomes_of_interest.tsv
echo -e "Low-quality_genomes\t${lq_genomes}" >> data/tmp/total_genomes_of_interest.tsv

echo "ATB contains ${total_genomes} genomes of your species of interest."
echo "Of those, ${lq_genomes} are labeled as low-quality, which are not included for further analyses."
echo "That means, ${hq_genomes} are available to work with."

echo
echo "Also see the file data/tmp/total_genomes_of_interest.tsv to see these"
echo "numbers in table form (tab-separated values)."
echo

# Use no further grep options to match anything that contains the species names,
# including subspecies and lineages. Exclude low-quality 'HQ field == F'.
zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz |\
grep -v -e "F$" | cut -f 1 > ${species_samples_file}

echo "Done extracting sample names!"
ls -lh ${species_samples_file}
echo "Contains: $(zless ${species_samples_file} | wc -l) entries"
echo "Contains: $(wc -l ${species_samples_file}) entries"
echo "----------"

# Collect the number of genomes included for each species of interest
zgrep -f ${species_samples_file} -w ${output_dir}species_calls.tsv.gz |\
cut -f 2 | sort | uniq -c | sort -nr > ${output_dir}number_of_genomes_per_species.txt

# Count the number of genomes of interest in the 'original' and 'update' releases:
original_genomes=$(zgrep -w -f ${species_samples_file}\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\
grep -c "r0.2")
update_genomes=$(zgrep -w -f ${species_samples_file}\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\
grep -c "incr_release")

echo -e "Genomes_in_original_release\t${original_genomes}" >> data/tmp/total_genomes_of_interest.tsv
echo -e "Genomes_in_update_release\t${update_genomes}" >> data/tmp/total_genomes_of_interest.tsv

## Step 3: Filter metadata to the species of interest
echo "Step 3: Filtering metadata for species of interest"
zgrep -w -f ${species_samples_file} ${output_dir}ena_metadata.20240801.tsv.gz |\
gzip > ${output_dir}ena_metadata.20240801-filtered.tsv.gz
# Include the header for simpler processing in e.g. R:
zless ${output_dir}ena_metadata.20240801.tsv.gz | head -1 | gzip >\
${output_dir}ena_metadata.20240801-filtered.tsv.gz || true
# N.B. the 'or true' is inserted to avoid the non-zero exit status of head

# Use 'grep -w' to match only whole words; don't match longer accession numbers that
# contain the sample accession of interest!
zgrep -w -f "${species_samples_file}" "${output_dir}ena_metadata.20240801.tsv.gz" |\
gzip >> ${output_dir}ena_metadata.20240801-filtered.tsv.gz

echo "Done filtering!"
ls -lh ${output_dir}ena_metadata.20240801-filtered.tsv.gz
Expand All @@ -76,30 +120,64 @@ echo -e "Filename\tDownload_URL\tmd5sum"
head -3 ${output_dir}batches_to_download.tsv
echo "----------"

## Step 4.2: Find out what genomes are in these batches and make a list of
# genomes to exclude (low-quality and other species).
# To get there, first collect all samples in the batches to download:
zgrep -f ${output_dir}batches_to_download.tsv ${output_dir}file_list.all.20240805.tsv.gz |\
cut -f 1 > ${output_dir}samples_in_batches.txt

# Now take out the samples of interest:
grep -v -w -f ${species_samples_file} ${output_dir}samples_in_batches.txt >\
${output_dir}samples_not_of_interest.txt

# And for these 'not of interest genomes', give a summary of their
# numbers:
zgrep -w -f ${output_dir}samples_not_of_interest.txt\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 2 | sort | uniq -c | sort -nr >\
data/tmp/other_genomes-numbers.txt

# And get the list of batches + sample accessions, to facilitate removal:
zgrep -w -f ${output_dir}samples_not_of_interest.txt\
${output_dir}file_list.all.20240805.tsv.gz |\
cut -f 4 > data/tmp/samples_to_remove.txt

echo "In the batches to download are $(wc -l ${output_dir}samples_not_of_interest.txt)"
echo "samples that have low-quality genomes or species other than the species of interest."
echo "These are summarised in:"
ls -lh ${output_dir}samples_not_of_interest.txt data/tmp/other_genomes-numbers.txt data/tmp/samples_to_remove.txt

## Step 5: Download genome sequences
echo "Step 5: Download genome sequences"
bash bin/download_genomes.sh ${part}
echo "Finished downloading!"
echo "The batches have been written to 'data/tmp/ATB/batch_*'"
echo "The batch archives have been written to 'data/tmp/ATB/'"
echo "and their contents extracted to 'data/tmp/assemblies/'"
echo "----------"

# Step 6: Remove genomes of other species
## Step 6: Remove genomes of other species
echo "Step 6: remove genomes of species other than species of interest"
echo "Files before filtering:"
for batch in data/tmp/ATB/batch_*
for fasta in $(cat data/tmp/samples_to_remove.txt)
do
echo "$(basename ${batch}):"
ls ${batch} | wc -l
# Verbose remove: tell what is being removed
rm -fv "data/tmp/assemblies/${fasta}"
done
echo "----------"

bash bin/remove_other_species.sh
## Step 7: Download functional annotations (Bakta)
echo "Step 7: download functional (gene) annotations"
bash bin/download_bakta_annotations.sh ${part}
echo "Finished downloading!"
echo "The batches have been downloaded to 'data/tmp/ATB/' and extracted in 'data/tmp/annotations'"
echo "----------"

echo "Files after filtering:"
for batch in data/tmp/ATB/batch_*
## Step 7.1: Also remove annotation files for samples that are not of interest
# 1: adjust the file basename from 'assembly' to 'bakta'
echo "Step 7.1: remove genome annotations of other/low-quality samples"
for file in $(sed 's|atb.assembly.|atb.bakta.|g' data/tmp/samples_to_remove.txt)
do
echo "$(basename ${batch}):"
ls ${batch} | wc -l
# 2: add 'annotations' subdirectory
bakta_dir="data/tmp/annotations/"
# 3: exchange '.fa' extension to 'bakta.json'
json="${bakta_dir}${file/.fa/.bakta.json}"
rm -fv ${json}
done
rmdir --ignore-fail-on-non-empty data/tmp/ATB/batch_*
echo "----------"
echo "Genome files have been prepared!"
31 changes: 0 additions & 31 deletions bin/remove_other_species.sh

This file was deleted.

4 changes: 2 additions & 2 deletions config/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

### 1. Input/output parameters ###

input_directory: "data/tmp/ATB/"
output_directory: "data/tmp/"
working_directory: "data/tmp/"
output_directory: "data/processed/"

genomad_database: "data/genomad_db/"
spacepharer_phage_database: "data/raw/phagescope/"
Expand Down
Loading