Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
398 changes: 198 additions & 200 deletions Snakefile

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions bin/collect_genomad_predictions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

# Find output files for geNomad and create one overall prediction report.

library(tidyverse)
library(here)
suppressPackageStartupMessages({
library(tidyverse)
library(here)
})

genomad_scores_files <- snakemake@input[["aggregated_classification"]]
genomad_plasmid_files <- snakemake@input[["plasmid_summary"]]
Expand Down
6 changes: 4 additions & 2 deletions bin/collect_jaeger_batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
# Concatenate Jaeger output files per batch while selecting only the most
# relevant, easy-to-use columns.

library(here)
library(tidyverse)
suppressPackageStartupMessages({
library(tidyverse)
library(here)
})

# Read all output files in a batch (one per genome)
jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
Expand Down
22 changes: 22 additions & 0 deletions bin/create_spacepharer_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

## This script uses the output of the two SpacePharer searches created by the
## spacepharer rules. It assumes that PhageScope and PLSDB are the databases used.
## For every match it extracts the ID (taken directly from the database), looks
## that ID up in the database metadata, and attaches the metadata to the match.
##
## Note: deliberately no 'set -e' here — a match without metadata (grep finds
## nothing) should yield an empty annotation, not abort the script.

# SpacePharer does not emit its own column names, so the header (including the
# manually added metadata column names) is written first.
echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > "${snakemake_output[phage]}"
while IFS= read -r line; do
    # Quote "$line": unquoted, word splitting collapses the tab separators to
    # spaces, and the tab-delimited 'cut -f 2' would then return the whole
    # line instead of the phage accession.
    ID=$(echo "$line" | cut -f 2)
    metadata_match=$(grep -w "$ID" "${snakemake_input[meta_phage]}/merged_metadata.tsv" | cut -f 2-7)
    echo -e "$line\t$metadata_match" >> "${snakemake_output[phage]}"
done < "${snakemake_input[phage]}"

echo "starting plasmid"
# NOTE(review): the plasmid header reuses the 'phage_*' column names — confirm
# this is intentional for downstream consumers before renaming.
echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\ttaxonomy" > "${snakemake_output[plasmid]}"
while IFS= read -r line; do
    ID=$(echo "$line" | cut -f 2)
    # PLSDB spreads its metadata over several files with differing IDs:
    # nuccore.csv maps the match ID to a taxonomy ID (column 13), which is
    # then looked up in taxonomy.csv to get the taxonomy (column 3).
    nuccore_match=$(grep -w "$ID" "${snakemake_input[meta_plasmid]}/nuccore.csv" | cut -f 13 -d ",")
    taxonomy_match=$(grep -w "^$nuccore_match" "${snakemake_input[meta_plasmid]}/taxonomy.csv" | cut -f 3 -d ",")
    echo -e "$line\t$taxonomy_match" >> "${snakemake_output[plasmid]}"
done < "${snakemake_input[plasmid]}"
3 changes: 3 additions & 0 deletions bin/download_ATB_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,7 @@ download_if_not_present ${output_dir}checkm2.tsv.gz https://osf.io/download/289f
download_if_not_present ${output_dir}species_calls.tsv.gz https://osf.io/download/7t9qd/

# Get a list of all files:
download_if_not_present ${output_dir}all_atb_files.tsv https://osf.io/download/r6gcp/

# And a list with sample - batch matches
download_if_not_present ${output_dir}file_list.all.20240805.tsv.gz https://osf.io/download/dw3h7
72 changes: 72 additions & 0 deletions bin/download_bakta_annotations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/
# (IFS is newline-only so that ${line} below keeps its tab separators intact
# for 'cut'.)

# Download the Bakta (gene annotation) archives from ATB for the batches listed
# in data/ATB/batches_to_download.tsv, check their md5 checksums, and extract
# them to data/tmp/annotations/.
#
# Usage: bash bin/download_bakta_annotations.sh [update|original|all]
# Default ('update'): only the incremental-release batches.

part=${1:-"update"}

if [ "${part}" == "update" ]
then
download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv)

elif [ "${part}" == "original" ]
then
download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv)

elif [ "${part}" == "all" ]
then
download_list=$(cat data/ATB/batches_to_download.tsv)

else
download_list=""
echo "Unknown argument provided! Please use 'all', 'original', or 'update'."
echo "Or none to use the default (=update)."
fi

for line in ${download_list}
do
    # Column 1 holds the assembly archive name; the matching Bakta archive has
    # the same name with 'assembly' replaced by 'bakta'.
    filename=$(echo ${line} | cut -f 1 | sed -e 's/assembly/bakta/')
    # Look up the download URL (column 4) and md5sum (column 5) for the archive.
    url=$(grep "${filename}" data/ATB/all_atb_files.tsv | cut -f 4)
    checksum=$(grep "${filename}" data/ATB/all_atb_files.tsv | cut -f 5)
    echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}"

    mkdir -p data/tmp/ATB

    outputfile="data/tmp/ATB/${filename}"

    # If the output file is not a file of size greater than zero
    if [ ! -s ${outputfile} ]
    then
        # Then download it
        wget -O ${outputfile} ${url}
    else
        echo "${outputfile} has been downloaded before!"
    fi

    # Check the md5sum. 'md5sum -c' expects lines of the form "HASH  FILE"
    # (two spaces between the hash and the file name).
    if echo "${checksum}  ${outputfile}" | md5sum -c --status;
    then
        echo "OK: md5sum for ${outputfile} is correct!"
    else
        # If it is wrong, delete the file and skip extraction for this batch;
        # otherwise tar would fail on the missing file and, under 'set -e',
        # abort the whole run. Re-run the script to download it again.
        echo "BAD: md5sum for ${outputfile} is incorrect... Deleting file"
        rm ${outputfile}
        continue
    fi

    batchdir="data/tmp/annotations"
    mkdir -p ${batchdir}

    # Skip extraction if this batch was already extracted (assumes the archive
    # unpacks a top-level directory named like the archive — TODO confirm).
    if [ ! -d "${batchdir}/$(basename "${outputfile}" .tar.xz)" ]
    then
        echo "Extracting ${outputfile}!"

        # Decompress the XZ archive and send the output to the specified directory.
        tar -Jxf ${outputfile} -C ${batchdir}

    else
        echo "Bakta files have been extracted previously!"
    fi
done
23 changes: 15 additions & 8 deletions bin/download_genomes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@ set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/
# This script does:
# 1. Look for the batches of interest,
# 2. download them from ATB,
# 3. check their md5 checksum (file integrity)
# 4. extract to 'data/tmp/assemblies'

# Note: if a downloaded archive's md5sum does not match (e.g. the download was
# incomplete), the script deletes that file but does not retry the download in
# the same run — re-run the script to fetch the missing batches. Files that are
# already present and intact are skipped, so nothing is downloaded twice.

part=${1:-"update"}

Expand Down Expand Up @@ -52,19 +62,16 @@ do
fi

# Extract the batch number from the file name
batchnumber=$(basename ${outputfile} | cut -f 6 -d '.')
batchdir="data/tmp/ATB/batch_${batchnumber}"
batchdir="data/tmp/assemblies/"
mkdir -p ${batchdir}

# If the batch directory has not been made yet
if [ ! -d ${batchdir} ]
if [ ! -d "${batchdir}${outputfile/.tar.xz/}" ]
then
echo "Extracting ${outputfile}!"

mkdir -p ${batchdir}

# Decompress the XZ archive, send the output to the specified directory,
# and strip the leading directory from within the XZ archive.
tar -Jxf ${outputfile} -C ${batchdir} --strip-components 1
# Decompress the XZ archive and send the output to the specified directory.
tar -Jxf ${outputfile} -C ${batchdir}

else
echo "Fasta files have been extracted previously!"
Expand Down
2 changes: 1 addition & 1 deletion bin/extract_crispr-cas_from_fasta.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ cctyper_dir=$1

sample_name=$(basename ${cctyper_dir})
batch_name=$(basename $(dirname ${cctyper_dir}))
fasta_file="data/tmp/ATB/${batch_name}/${sample_name}.fa"
fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa"

bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") )

Expand Down
112 changes: 95 additions & 17 deletions bin/prepare_genomes.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n'

# Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/

## Prepare genome sequences for CRISPR analysis
##
Expand Down Expand Up @@ -42,22 +46,62 @@ echo "----------"
# and cut column 1 which contains the accession IDs
echo "Step 2: extracting sample accession IDs of species of interest"
species_samples_file="${output_dir}all_samples_of_interest.txt"

echo -e "Species of interest:\n$(cat config/species_of_interest.txt)"

total_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | wc -l)
hq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l)
lq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l)

echo -e "Total_genomes\t${total_genomes}" > data/tmp/total_genomes_of_interest.tsv
echo -e "High-quality_genomes\t${hq_genomes}" >> data/tmp/total_genomes_of_interest.tsv
echo -e "Low-quality_genomes\t${lq_genomes}" >> data/tmp/total_genomes_of_interest.tsv

echo "ATB contains ${total_genomes} genomes of your species of interest."
echo "Of those, ${lq_genomes} are labeled as low-quality, which are not included for further analyses."
echo "That means, ${hq_genomes} are available to work with."

echo
echo "Also see the file data/tmp/total_genomes_of_interest.tsv to see these"
echo "numbers in table form (tab-separated values)."
echo

# Use no further grep options to match anything that contains the species names,
# including subspecies and lineages. Exclude low-quality 'HQ field == F'.
zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz |\
grep -v -e "F$" | cut -f 1 > ${species_samples_file}

echo "Done extracting sample names!"
ls -lh ${species_samples_file}
echo "Contains: $(zless ${species_samples_file} | wc -l) entries"
echo "Contains: $(wc -l ${species_samples_file}) entries"
echo "----------"

# Collect the number of genomes included for each species of interest
zgrep -f ${species_samples_file} -w ${output_dir}species_calls.tsv.gz |\
cut -f 2 | sort | uniq -c | sort -nr > ${output_dir}number_of_genomes_per_species.txt

# Count the number of genomes of interest in the 'original' and 'update' releases:
original_genomes=$(zgrep -w -f ${species_samples_file}\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\
grep -c "r0.2")
update_genomes=$(zgrep -w -f ${species_samples_file}\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\
grep -c "incr_release")

echo -e "Genomes_in_original_release\t${original_genomes}" >> data/tmp/total_genomes_of_interest.tsv
echo -e "Genomes_in_update_release\t${update_genomes}" >> data/tmp/total_genomes_of_interest.tsv

## Step 3: Filter metadata to the species of interest
echo "Step 3: Filtering metadata for species of interest"
zgrep -w -f ${species_samples_file} ${output_dir}ena_metadata.20240801.tsv.gz |\
gzip > ${output_dir}ena_metadata.20240801-filtered.tsv.gz
# Include the header for simpler processing in e.g. R:
zless ${output_dir}ena_metadata.20240801.tsv.gz | head -1 | gzip >\
${output_dir}ena_metadata.20240801-filtered.tsv.gz || true
# N.B. the 'or true' is inserted to avoid the non-zero exit status of head

# Use 'grep -w' to match only whole words; don't match longer accession numbers that
# contain the sample accession of interest!
zgrep -w -f "${species_samples_file}" "${output_dir}ena_metadata.20240801.tsv.gz" |\
gzip >> ${output_dir}ena_metadata.20240801-filtered.tsv.gz

echo "Done filtering!"
ls -lh ${output_dir}ena_metadata.20240801-filtered.tsv.gz
Expand All @@ -76,30 +120,64 @@ echo -e "Filename\tDownload_URL\tmd5sum"
head -3 ${output_dir}batches_to_download.tsv
echo "----------"

## Step 4.2: Find out what genomes are in these batches and make a list of
# genomes to exclude (low-quality and other species).
# To get there, first collect all samples in the batches to download:
zgrep -f ${output_dir}batches_to_download.tsv ${output_dir}file_list.all.20240805.tsv.gz |\
cut -f 1 > ${output_dir}samples_in_batches.txt

# Now take out the samples of interest:
grep -v -w -f ${species_samples_file} ${output_dir}samples_in_batches.txt >\
${output_dir}samples_not_of_interest.txt

# And for these 'not of interest genomes', give a summary of their
# numbers:
zgrep -w -f ${output_dir}samples_not_of_interest.txt\
${output_dir}file_list.all.20240805.tsv.gz | cut -f 2 | sort | uniq -c | sort -nr >\
data/tmp/other_genomes-numbers.txt

# And get the list of batches + sample accessions, to facilitate removal:
zgrep -w -f ${output_dir}samples_not_of_interest.txt\
${output_dir}file_list.all.20240805.tsv.gz |\
cut -f 4 > data/tmp/samples_to_remove.txt

echo "In the batches to download are $(wc -l ${output_dir}samples_not_of_interest.txt)"
echo "samples that have low-quality genomes or species other than the species of interest."
echo "These are summarised in:"
ls -lh ${output_dir}samples_not_of_interest.txt data/tmp/other_genomes-numbers.txt data/tmp/samples_to_remove.txt

## Step 5: Download genome sequences
echo "Step 5: Download genome sequences"
bash bin/download_genomes.sh ${part}
echo "Finished downloading!"
echo "The batches have been written to 'data/tmp/ATB/batch_*'"
echo "The batch archives have been written to 'data/tmp/ATB/'"
echo "and their contents extracted to 'data/tmp/assemblies/'"
echo "----------"

# Step 6: Remove genomes of other species
## Step 6: Remove genomes of other species
echo "Step 6: remove genomes of species other than species of interest"
echo "Files before filtering:"
for batch in data/tmp/ATB/batch_*
for fasta in $(cat data/tmp/samples_to_remove.txt)
do
echo "$(basename ${batch}):"
ls ${batch} | wc -l
# Verbose remove: tell what is being removed
rm -fv "data/tmp/assemblies/${fasta}"
done
echo "----------"

bash bin/remove_other_species.sh
## Step 7: Download functional annotations (Bakta)
echo "Step 7: download functional (gene) annotations"
bash bin/download_bakta_annotations.sh ${part}
echo "Finished downloading!"
echo "The batches have been downloaded to 'data/tmp/ATB/' and extracted in 'data/tmp/annotations'"
echo "----------"

echo "Files after filtering:"
for batch in data/tmp/ATB/batch_*
## Step 7.1: Also remove annotation files for samples that are not of interest
# 1: adjust the file basename from 'assembly' to 'bakta'
echo "Step 7.1: remove genome annotations of other/low-quality samples"
for file in $(sed 's|atb.assembly.|atb.bakta.|g' data/tmp/samples_to_remove.txt)
do
echo "$(basename ${batch}):"
ls ${batch} | wc -l
# 2: add 'annotations' subdirectory
bakta_dir="data/tmp/annotations/"
# 3: exchange '.fa' extension to 'bakta.json'
json="${bakta_dir}${file/.fa/.bakta.json}"
rm -fv ${json}
done
rmdir --ignore-fail-on-non-empty data/tmp/ATB/batch_*
echo "----------"
echo "Genome files have been prepared!"
31 changes: 0 additions & 31 deletions bin/remove_other_species.sh

This file was deleted.

4 changes: 2 additions & 2 deletions config/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

### 1. Input/output parameters ###

input_directory: "data/tmp/ATB/"
output_directory: "data/tmp/"
working_directory: "data/tmp/"
output_directory: "data/processed/"

genomad_database: "data/genomad_db/"
spacepharer_phage_database: "data/raw/phagescope/"
Expand Down
Loading