Summary of metadata and spacers of CRISPR arrays in Campylobacter genomes from AllTheBacteria
-
As described in Descriptive_statistics.html, a large amount of bacteria of Campylobacter jejuni and coli have been downloaded from AllTheBacteria (ATB) including their metadata. These genomes have then been screened for CRISPR-cas operons using CCTyper (version 1.8.0). This has identified a large amount of spacers (figure 1) which has been further explored in Descriptive_statistics.html on its own. However, for the intended research, the origin of the Bacteria will need to be known so that it can potentially be linked to specific spacers or patterns. For this the metadata of ATB is vital. But with cursory inspection (figure 2), many of the origins are missing, unclear and without standardization.
-
In this report, we summarize the spacers data and metadata and discuss certain problems that remain within the data.
-
First relevant data will be imported.
-
-
-Code
-
#all spacers from CCtyper were combined into a tsv file as FASTA format
-all_spacers <-read_tsv(here("joining CCtyper/all_spacers.tsv"), show_col_types=FALSE)
-names(all_spacers) <-c("Sample_accession", "Spacer_sequence")
-
-#contig index are extracted from sample accession and put into their own column for pattern analysis
-all_spacers[3] <-str_extract(all_spacers$Sample_accession, "\\d*:\\d*")
-all_spacers <-separate(all_spacers, 3, into =c("Array_index", "Spacer_index"))
-all_spacers["Contig_accession"] <-str_extract(all_spacers$Sample_accession, "contig\\d*")
-
-#clean up sample accession
-all_spacers$Sample_accession <-str_extract(all_spacers$Sample_accession, "(?<=>)\\w*")
-
-#these spacers also include repeats and can be orphans or connected to a cas operon, this is added to the all spacer frame.
-orphan_crispr <-read_tsv(here("joining CCtyper/crisprs_orphan-concatenated.tab"), show_col_types = F)[,c(3, 6)] %>%mutate("orphan?"=TRUE) %>%separate(col =1, sep ="\\.|\\_", into =c("Sample_accession", "Contig_accession", "Array_index"))
-orphan_crispr$Contig_accession <-str_extract(orphan_crispr$Contig_accession, "^\\w*")
-
-
-crispr_cas <-read_tsv(here("joining CCtyper/crisprs_near_cas-concatenated.tab"), show_col_types = F)[,c(3, 6)] %>%mutate("orphan?"=FALSE) %>%separate(col =1, sep ="\\.|\\_", into =c("Sample_accession", "Contig_accession", "Array_index"))
-crispr_cas$Contig_accession <-str_extract(crispr_cas$Contig_accession, "^\\w*")
-
-all_spacers <-left_join(all_spacers, bind_rows(orphan_crispr, crispr_cas))
-
-#ENA metadata is downloaded from ATB and was filtered for original analysis in descriptive_statistics
-colnames <-read_table(here("metadata project/colnames_metadata_filtered.txt"), col_names = F, show_col_types = F)
-metadata_source <-
-read_tsv(here("metadata project/ena_metadata.20240801.selection-only_Campylobacter.tsv"),
-col_names = colnames$X2, show_col_types =FALSE)
-
-#further filtering metadata as not all are needed for this analysis
-metadata <-select(metadata_source, c(sample_accession, scientific_name, isolation_source, host))
-
-#in the metadata there is a distinction between unclear data and data that is simply missing/unspecified. missing data is turned into NA and data that is present in isolation source but not in host is copied to host.
-missingno <- metadata %>%
-mutate(across(c(isolation_source, host), ~str_to_lower(.x)))%>%
-replace_with_na(replace =list(isolation_source =c(
-"missing", "other", "not collected", "no source specified", "mising"
- ), host =c(
-"missing", "other", "not collected", "no source specified", "mising"
- ))) %>%mutate(host =ifelse(is.na(host), isolation_source, host))
-
-#There are also bacteria that are incorrectly noted in the ENA metadata compared to the ATB designation of species
-simplified_species <-read_delim(
-file =here("metadata project/sylph.tsv.gz"),
-delim ="\t",
-show_col_types = F) %>%
-select(Sample, Species)
-
-simplified_species <- simplified_species %>%filter(str_detect(Species,"Campylobacter_D (jejuni|coli)")) %>%mutate(Species =str_replace_all(Species, "_(A|B|C|D)", ""))
-
-merged_metadata <-left_join(missingno, simplified_species, by =c("sample_accession"="Sample")) %>%distinct(sample_accession, .keep_all =TRUE)
-
-
-
-
-
-
-
-
-
Metadata analysis
-
-
Highlighting the importance of FAIR metadata handling
isolation_source
-1 feces
-2 chicken carcass
-3 farm
-4 raw intact chicken
-5 human
-6 food
-7 chicken breast
-8 animal-chicken-young chicken
-9 animal-swine-market swine
-10 animal-cattle-dairy cow
-
-
-
This shows some examples of how this metadata has inconsistent notation of origins. Many problems occur when trying to parse this data as is, for example ‘raw intact chicken’ and ‘chicken carcass’ describe the same host species. This additional information is not useful for ‘host’ metadata but is for ‘isolation source’, yet information in isolation source is also lacking as the distinction between broiler and meat producing chickens is sometimes difficult to discern. Unfortunately this alongside spelling errors (‘boivine’) and unclear designations (‘food’), makes the metadata require a significant amount of processing. Not to mention the amount of data that potentially could have information regarding origin, but was never noted. This in turn highlights the importance of the FAIR principles for research.
-
-
-
Consolidating origins
-
-
-Code
-
#TODO: keep discussing and updating categories, data crawl through original studies to find actual hosts and investigate why about a 1000 samples are lost from left_join.
-
-#consolidates the many differing inputs of host into more managable categories: Meat producing poultry, Adult cattle, layers, veal calves, pets, small ruminants, pigs, surface water and water birds, wild birds, non-water environment and human.
-
-#this last category consists of all species not part of these initial categories, or unclear enough to not be useful.
-definedmeta <- merged_metadata %>%
-mutate(category = host) %>%
-mutate(
-category =str_replace(
- category,
-".*(field|environment|pasture|sediment|soil|surfaces|crates).*",
-"dry environment"
- )
- ) %>%
-mutate(
-category =str_replace(
- category,
-".*(calf|veal).*",
-"calves"
- )
- ) %>%
-mutate(
-category =str_replace(
- category,
-".*(taurus|cattle|milk|boi?vine|cow|heifer|steer|beef|dairy|indicus).*",
-"adult cattle"
- )
- ) %>%
-mutate(
-category =ifelse(str_detect(
- isolation_source,
-".*((?<!v)egg|layer).*"), yes ="broiler", no = host)
- ) %>%
-mutate(category =str_replace(category, ".*(water|lagoon|sewage|river|wetland|fulica\\satra|platyrhynchos|duck).*", "surface water and water birds")) %>%
-mutate(
-category =str_replace(
- category,
-".*(pheasant|phasianus|gallopavo|hen|chi[ec]ken|turkey|broiler|gallus|cb-|drumsticks|poultry).*",
-"meat producing poultry"
- )
- ) %>%
-mutate(category =str_replace(
- category,
-".*(sus\\sscrofa|porcine|pig|swine|sow|pork).*",
-"swine/pig"
- )) %>%
-mutate(
-category =str_replace(
- category,
-".*(puppy|canii?ne|cat|feline|dog|kitten|pet|familiaris)(?!tle).*",
-"pet animals"
- )
- ) %>%
-mutate(category =str_replace(category, ".*(human|clinical|guillain|sapiens).*", "human")) %>%
-mutate(
-category =str_replace(
- category,
-".*(avian|cloaca|(?<!water\\s)bird|columba livia|crow|corvus|dove).*",
-"wild avian"
- )
- ) %>%
-mutate(category =str_replace(category, ".*(sheep|aries|ovine|goat|hircus).*", "small ruminants")) %>%
-mutate(category =str_replace(category, ".*(human|sapiens).*", "human")) %>%
-mutate(
-category =str_replace(
- category,
-"^(?!(dry environment|calves|adult cattle|broiler|meat producing poultry|swine/pig|pet animals|wild avian|small ruminants|food|feces|human|surface water and water birds)).*",
-"other/undetermined"
- ))
-#calculate frequency of categories
-origincounts <- definedmeta %>%group_by(category) %>%summarise(total_n =n()) %>%arrange(desc(total_n))
-
-#calculate percentages and create labels
-processedcount <-
- origincounts %>%mutate(percentage =round(total_n /sum(total_n) *100, 3))
-processedcount$labels <-paste(processedcount$category, "\n", processedcount$total_n, "(", processedcount$percentage, "%)")
-processedcount <- processedcount %>%arrange(desc(total_n))
-summed_N <-as.character(sum(processedcount$total_n))
-
-
-treemap(
- processedcount,
-index =c("labels"),
-vSize ="total_n",
-vColor ="total_n",
-draw =TRUE,
-border.col ="black",
-title =paste("Treemap of Categories with total N:", sum(processedcount$total_n)),
-fontsize.title =16)
-
-
-
-
-
-
-
-
-
-Code
-
#same calculations but without NA
-processedcount <-
- origincounts[-1,] %>%mutate(percentage =round(total_n /sum(total_n) *100, 3))
-processedcount$labels <-paste(processedcount$category, "\n", processedcount$total_n, "(", processedcount$percentage, "%)")
-processedcount <- processedcount %>%arrange(desc(total_n))
-
-
-treemap(
- processedcount,
-index =c("labels"),
-vSize ="total_n",
-vColor ="total_n",
-draw =TRUE,
-border.col ="black",
-title =paste("Treemap of Categories (without NA) with total N:", sum(processedcount$total_n)),
-fontsize.title =16)
This is a visualization of the categories that this document will work with going forward. As can be noticed, poultry is abundant with other species quickly falling off. Additionally a large amount of the host and isolation_source data is NA, considered to be mostly unusable for source attribution. Though this is still a large amount of data still to use, lets see how this is divided between jejuni and coli
-
-
-
Campylobacter species
-
-
-Code
-
#same calculations as above, but now also additionally grouped on species.
-
-origincounts_jejuni <- definedmeta %>%group_by(category, Species) %>%filter(as.character(Species) =="Campylobacter jejuni") %>%summarise(total_n =n()) %>%arrange(desc(total_n))
-
-summed_N <-sum(origincounts_jejuni[-1,]$total_n)
-processedcount_jejuni <-
- origincounts_jejuni[-1,] %>%mutate(percentage =round(total_n / summed_N *100, 3))
-processedcount_jejuni$labels <-paste(processedcount_jejuni$category, "\n", processedcount_jejuni$total_n, "(", processedcount_jejuni$percentage, "%)")
-processedcount_jejuni <- processedcount_jejuni %>%arrange(desc(total_n))
-
-
-
-
-treemap(
- processedcount_jejuni,
-index =c("labels"),
-vSize ="total_n",
-vColor ="total_n",
-draw =TRUE,
-border.col ="black",
-title =paste("Treemap of C. jejuni Categories (without NA) with total N:", sum(processedcount_jejuni$total_n)),
-fontsize.title =16)
As can be seen, meat producing poultry is most frequent in both species. Though interestingly, swine are much more frequent within Campylobacter coli than in C.jejuni. Next lets look at the spacers found
-
-
-
-
Spacer analysis with metadata
-
-
-Code
-
#join the two files
-merged_data <-inner_join(all_spacers, definedmeta, by =c("Sample_accession"="sample_accession"))
-merged_data <-unique(merged_data)
-
-
-#condensing spacer sequences into unique sequences for faster calculation and performing an initial count of spacers
-counts_spacer <- merged_data %>%group_by(Spacer_sequence, category) %>%count()
-
-#some spacer sequences are directly complementary to each other and so the same spacer
-dna <-DNAStringSet(counts_spacer$Spacer_sequence)
-reverse_complements <-reverseComplement(dna)
-
-
-processed <-logical(length(dna))
-for (i inseq_along(dna)) {
-if (processed[i]) next
- match <-which(as.character(dna) ==as.character(reverse_complements[i]))
-if (length(match) >0) {
- processed[match] <-TRUE
- dna[match] <- reverse_complements[match]
- }
-}
-
-dna_list <-as.character(dna)
-counts_spacer$Spacer_sequence <- dna_list
-
-counts_spacer_host <- counts_spacer %>%group_by(Spacer_sequence, category) %>%summarise(n =sum(n)) %>%arrange(desc(n))
-counts_spacer_hostless <- counts_spacer %>%group_by(Spacer_sequence) %>%summarise(n =sum(n)) %>%arrange(desc(n))
-
-merged_data <-rename(merged_data, c("scientific_name"="ENA_species", "Species"="ATB_species"))
-
-merged_data_NAless <- merged_data %>%filter(!is.na(category))
-counts_spacer_NAless <- counts_spacer_host %>%filter(!is.na(category)) %>%arrange(desc(n))
-
-
-head(counts_spacer_NAless)
As described in Descriptive_statistics, many spacers detected by CCtyper were similar to each other. Some were reverse complements of each other or were identical except for additional nucleotides in one. Additionally some spacers have high similarity to each other with minor mutations. This table above has accounted for reverse complements and merged them into the count, however finding a solution for the other similarities has not been successful so far.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..dc3677c
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,109 @@
+"""
+Author: Sam Nooij
+Organisation: Utrecht University
+Department: Clinical Infectiology (KLIF), Infectious Diseases & Immunology,
+ Biomolecular Health Sciences, Faculty of Veterinary Medicine
+Date: 2024-11-06
+
+Workflow for testing CRISPR analysis options
+In contrast to the 'regular' Snakefile workflow, which works
+per genome file, this workflow works per batch and runs GNU
+parallel to parallelise processing of the genomes within
+each batch.
+
+
+Input: Fasta files of Campylobacter whole-genomes
+Output: (various)
+
+Example use:
+ $ snakemake --profile config
+
+N.B. Variables are set in the configuration files under `config`.
+"""
+
+# Step 1: Load configuration
+# ---------------------------------------------------------
+
+
+configfile: Path("config/parameters.yaml")
+
+
+# Log messages
+onstart:
+ print("\n--- Starting analysis! ---\n")
+
+
+onsuccess:
+ print("\n --- Workflow finished! ---\n")
+
+
+onerror:
+ print("\n --- An error occurred! ---\n")
+
+
+# Step 2: Load rules that do the work
+# ---------------------------------------------------------
+include: "rules/helper_rules.smk"
+include: "rules/screen_crispr-cas.smk"
+include: "rules/refine_crispr-cas.smk"
+include: "rules/classify_genomes.smk"
+include: "rules/identify_defences.smk"
+include: "rules/map_spacers.smk"
+
+
+# Step 3: specify target outputs
+# ---------------------------------------------------------
+
+
+rule all:
+ input:
+ ## Extra outputs (non-CRISPR)
+ # Multilocus Sequence Types (ST) for Campylobacter
+ "results/mlst_table.tsv",
+ # Virus and plasmid predictions per contig
+ "results/genomad_predictions.csv",
+ "results/jaeger_predictions.csv",
+ # Phage defence systems
+ "results/padloc_table.csv",
+ ## CRISPR-related outputs: 1. pre-screening
+ # Concatenated CCTyper output (CRISPR arrays and spacers)
+ expand(
+ "results/cctyper/{batch}/{filename}-{batch}.tab",
+ batch=BATCHES,
+ filename=[
+ "CRISPR_Cas",
+ "crisprs_all",
+ "crisprs_near_cas",
+ "crisprs_orphan",
+ "cas_operons",
+ ],
+ ),
+ expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
+ # CCTyper CRISPR spacer cluster analysis report
+ "results/cctyper/spacer_cluster_summary.tsv",
+ # Cluster unique CRISPR spacers
+ "results/cctyper/all_spacers-clustered.clstr",
+ "results/all_spacers_table.tsv",
+ # CRISPR output 2. CRISPRidentify output (refinement)
+ expand(
+ "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+ batch=BATCHES,
+ ),
+ expand(
+ "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+ batch=BATCHES,
+ ),
+ # Concatenated CRISPRidentify output
+ "results/crispridentify/complete_summary.csv",
+ "results/crispridentify/all_spacers.fa",
+ "results/crispridentify/all_spacers-clustered.clstr",
+ "results/all_spacers_table_identify.tsv",
+ # Merged CRISPRidentify and CCtyper output
+ "results/all_CRISPRS_with_identify.tab",
+ # Spacepharer output (spacer targets, putative protospacers)
+ "results/phage_matches.tsv",
+ "results/plasmid_matches.tsv",
+ # KMA output (mapping spacers to input Campy genomes)
+ "results/kma/output/CRISPR.frag.gz",
+ "results/kma/CRISPR_alignment",
+ default_target: True
diff --git a/workflow/envs/bash.yaml b/workflow/envs/bash.yaml
new file mode 100644
index 0000000..196651f
--- /dev/null
+++ b/workflow/envs/bash.yaml
@@ -0,0 +1,8 @@
+name: bash
+
+dependencies:
+ - bash=5.1.16
+ - parallel=20251122
+
+channels:
+ - conda-forge
diff --git a/envs/cctyper.yaml b/workflow/envs/cctyper.yaml
similarity index 100%
rename from envs/cctyper.yaml
rename to workflow/envs/cctyper.yaml
diff --git a/envs/cdhit.yaml b/workflow/envs/cdhit.yaml
similarity index 100%
rename from envs/cdhit.yaml
rename to workflow/envs/cdhit.yaml
diff --git a/workflow/envs/crispridentify.yaml b/workflow/envs/crispridentify.yaml
new file mode 100644
index 0000000..cc5fef9
--- /dev/null
+++ b/workflow/envs/crispridentify.yaml
@@ -0,0 +1,48 @@
+name: crispr_identify_env
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+ - biobuilds
+ - r
+ - axfeh
+dependencies:
+ - python==3.7.6
+ - pip
+ - python_abi=3.7
+ - biopython=1.76
+ - h5py=2.10.0
+ - hdf5=1.10.6
+ - hmmer=3.3
+ - numpy==1.18.1
+ - pandas=1.0.3
+ - matplotlib=3.1.3
+ - perl=5.26.2
+ - perl-threaded=5.26.0
+ - perl-yaml=1.29
+ - prodigal==2.6.3
+ - dill=0.3.3
+ - protobuf=3.13.0.1
+ - regex=2019.03.09
+ - pyasn1=0.4.8
+ - pycparser=2.20
+ - networkx=2.5
+ - pyjwt=1.7.1
+ - pyparsing=2.4.7
+ - pyqt=5.9.2
+ - pysocks=1.7.1
+ - python-dateutil=2.8.1
+ - pytz=2020.1
+ - pyyaml=5.3.1
+ - scikit-learn==0.22.1
+ - scipy=1.4.1
+ - requests=2.28.1
+ - viennarna==2.4.15
+ - pyopenssl=22.0.0
+ - certifi=2022.12.7
+ - vmatch==2.3.0
+ - clustalo==1.2.3
+ - blast==2.5.0
+ - libffi=3.2.1
+ - pip:
+ - python-Levenshtein
diff --git a/envs/genomad.yaml b/workflow/envs/genomad.yaml
similarity index 100%
rename from envs/genomad.yaml
rename to workflow/envs/genomad.yaml
diff --git a/envs/jaeger.yaml b/workflow/envs/jaeger.yaml
similarity index 100%
rename from envs/jaeger.yaml
rename to workflow/envs/jaeger.yaml
diff --git a/envs/kma.yaml b/workflow/envs/kma.yaml
similarity index 100%
rename from envs/kma.yaml
rename to workflow/envs/kma.yaml
diff --git a/envs/padloc.yaml b/workflow/envs/padloc.yaml
similarity index 100%
rename from envs/padloc.yaml
rename to workflow/envs/padloc.yaml
diff --git a/envs/pandas.yaml b/workflow/envs/pandas.yaml
similarity index 100%
rename from envs/pandas.yaml
rename to workflow/envs/pandas.yaml
diff --git a/envs/pyfaidx.yaml b/workflow/envs/pyfaidx_pandas.yaml
similarity index 66%
rename from envs/pyfaidx.yaml
rename to workflow/envs/pyfaidx_pandas.yaml
index e7dc681..b1d3a92 100644
--- a/envs/pyfaidx.yaml
+++ b/workflow/envs/pyfaidx_pandas.yaml
@@ -1,4 +1,4 @@
-name: pyfaidx
+name: pyfaidx_pandas
channels:
- conda-forge
@@ -6,3 +6,4 @@ channels:
dependencies:
- pyfaidx=0.8.1.3
+ - pandas=2.2.3
diff --git a/envs/pymlst.yaml b/workflow/envs/pymlst.yaml
similarity index 100%
rename from envs/pymlst.yaml
rename to workflow/envs/pymlst.yaml
diff --git a/envs/seqkit.yaml b/workflow/envs/seqkit.yaml
similarity index 100%
rename from envs/seqkit.yaml
rename to workflow/envs/seqkit.yaml
diff --git a/envs/spacepharer.yml b/workflow/envs/spacepharer.yaml
similarity index 100%
rename from envs/spacepharer.yml
rename to workflow/envs/spacepharer.yaml
diff --git a/envs/tidy_here.yaml b/workflow/envs/tidy_here.yaml
similarity index 100%
rename from envs/tidy_here.yaml
rename to workflow/envs/tidy_here.yaml
diff --git a/results/Assess_clustered_spacers.qmd b/workflow/notebooks/Assess_clustered_spacers.qmd
similarity index 100%
rename from results/Assess_clustered_spacers.qmd
rename to workflow/notebooks/Assess_clustered_spacers.qmd
diff --git a/results/Descriptive_statistics.qmd b/workflow/notebooks/Descriptive_statistics.qmd
similarity index 100%
rename from results/Descriptive_statistics.qmd
rename to workflow/notebooks/Descriptive_statistics.qmd
diff --git a/results/Metadata_and_spacer_summary.qmd b/workflow/notebooks/Metadata_and_spacer_summary.qmd
similarity index 100%
rename from results/Metadata_and_spacer_summary.qmd
rename to workflow/notebooks/Metadata_and_spacer_summary.qmd
diff --git a/workflow/rules/classify_genomes.smk b/workflow/rules/classify_genomes.smk
new file mode 100644
index 0000000..c92603a
--- /dev/null
+++ b/workflow/rules/classify_genomes.smk
@@ -0,0 +1,215 @@
+### Classify genomes
+## 1: determine multilocus sequence type (MLST)
+
+
+rule download_mlst_database:
+ output:
+ "resources/mlst/campylobacter.db",
+ params:
+ species=config["mlst"]["species"],
+ conda:
+ "../envs/pymlst.yaml"
+ threads: 1
+ log:
+ "log/download_mlst_database.txt",
+ benchmark:
+ "log/benchmark/download_mlst_database.txt"
+ shell:
+ """
+claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1
+ """
+
+
+rule mlst:
+ input:
+ batch="resources/ATB/assemblies/{batch}/",
+ db="resources/mlst/campylobacter.db",
+ output:
+ "results/mlst/{batch}/complete",
+ conda:
+ "../envs/pymlst.yaml"
+ threads: config["mlst"]["threads"]
+ log:
+ "log/mlst/{batch}.txt",
+ benchmark:
+ "log/benchmark/mlst/{batch}.txt"
+ shell:
+ """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ claMLST search {input.db} {{}} -o "$(dirname {output})/{{/.}}.txt" > {log} 2>&1
+
+touch {output}
+ """
+
+
+rule concatenate_mlst_batches:
+ input:
+ "results/mlst/{batch}/complete",
+ output:
+ "results/mlst/{batch}-concatenated.tsv",
+ conda:
+ "../envs/bash.yaml"
+ threads: config["mlst"]["threads"]
+ log:
+ "log/concatenate_mlst/{batch}.txt",
+ benchmark:
+ "log/benchmark/concatenate_mlst/{batch}.txt"
+ shell:
+ """
+echo -e "Genome\tST" > {output}
+find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'tail -n +2 {{}} | cut -f 1-2 >> {output}'
+ """
+
+
+rule concatenate_mlst_all:
+ input:
+ expand("results/mlst/{batch}-concatenated.tsv", batch=BATCHES),
+ output:
+ "results/mlst_table.tsv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/concatenate_mlst_all.txt",
+ benchmark:
+ "log/benchmark/concatenate_mlst_all.txt"
+ shell:
+ """
+batches=( {input} )
+head -1 ${{batches[0]}} > {output}
+sed --separate 1d ${{batches[@]}} >> {output}
+ """
+
+
+## 2. identify whether contig derive from a chromosome, plasmid or virus
+# Using both geNomad (chromosome/plasmid/virus)
+
+
+rule download_genomad_database:
+ output:
+ db=directory("resources/genomad_db"),
+ conda:
+ "../envs/genomad.yaml"
+ threads: 1
+ log:
+ "log/download_genomad_database.txt",
+ benchmark:
+ "log/benchmark/download_genomad_database.txt"
+ shell:
+ """
+mkdir -p $(dirname {output.db})
+genomad download-database $(dirname {output.db}) > {log} 2>&1
+ """
+
+
+rule genomad:
+ input:
+ fasta="resources/ATB/assemblies/{batch}.fasta",
+ db="resources/genomad_db",
+ output:
+ aggregated_classification="results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
+ plasmid_summary="results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
+ virus_summary="results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
+ params:
+ work_dir=subpath(output.aggregated_classification, ancestor=2),
+ conda:
+ "../envs/genomad.yaml"
+ threads: config["genomad"]["threads"]
+ log:
+ "log/genomad/{batch}.txt",
+ benchmark:
+ "log/benchmark/genomad/{batch}.txt"
+ shell:
+ """
+genomad end-to-end -t {threads} --cleanup --enable-score-calibration\
+ {input.fasta} {params.work_dir} {input.db} > {log} 2>&1
+ """
+
+
+rule collect_genomad_predictions:
+ input:
+ aggregated_classification=expand(
+ "results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
+ batch=BATCHES,
+ ),
+ plasmid_summary=expand(
+ "results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
+ batch=BATCHES,
+ ),
+ virus_summary=expand(
+ "results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
+ batch=BATCHES,
+ ),
+ output:
+ "results/genomad_predictions.csv",
+ conda:
+ "../envs/tidy_here.yaml"
+ threads: 1
+ log:
+ "log/collect_genomad_predictions.txt",
+ benchmark:
+ "log/benchmark/collect_genomad_predictions.txt"
+ script:
+ "../scripts/collect_genomad_predictions.R"
+
+
+# And Jaeger (virus (phage/prophage) or not)
+
+
+rule jaeger:
+ input:
+ batch="resources/ATB/assemblies/{batch}/",
+ output:
+ "results/jaeger/{batch}/complete",
+ conda:
+ "../envs/jaeger.yaml"
+ threads: config["jaeger"]["threads"]
+ log:
+ "log/jaeger/{batch}.txt",
+ benchmark:
+ "log/benchmark/jaeger/{batch}.txt"
+ shell:
+ """
+parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
+ jaeger run -p --workers 1 -i {{}} -o $(dirname {output}) --overwrite\
+ > {log} 2>&1 ::: {input.batch}/*.fa
+
+touch {output}
+ """
+
+
+rule collect_jaeger_batch:
+ input:
+ "results/jaeger/{batch}/complete",
+ output:
+ "results/jaeger/{batch}/jaeger-{batch}.csv",
+ params:
+ batch=subpath(output[0], parent=True),
+ conda:
+ "../envs/tidy_here.yaml"
+ threads: 1
+ log:
+ "log/collect_jaeger_{batch}.txt",
+ benchmark:
+ "log/benchmark/collect_jaeger_{batch}.txt"
+ script:
+ "../scripts/collect_jaeger_batch.R"
+
+
+rule collect_jaeger_predictions:
+ input:
+ expand("results/jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES),
+ output:
+ "results/jaeger_predictions.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/collect_jaeger_predictions.txt",
+ benchmark:
+ "log/benchmark/collect_jaeger_predictions.txt"
+ script:
+ "../scripts/collect_jaeger_predictions.sh"
diff --git a/workflow/rules/helper_rules.smk b/workflow/rules/helper_rules.smk
new file mode 100644
index 0000000..558ecb1
--- /dev/null
+++ b/workflow/rules/helper_rules.smk
@@ -0,0 +1,39 @@
+## General helper functions: define input and output
+from pathlib import Path
+
+# Use Python functions to automatically detect batches of genomes fasta files
+# in the input directory as 'BATCHES'
+BATCH_PATHS = list(Path("resources/ATB/assemblies").glob("atb.assembly.*"))
+
+# Make sure there is at least one batch directory:
+assert len(BATCH_PATHS) > 0, (
+ "-- No input (batch) directories found in resources/ATB/assemblies.\n"
+ "Please run the script bin/prepare_genomes.sh to prepare input. --\n"
+)
+
+# And make sure they are actually directories
+for batch in BATCH_PATHS:
+ assert Path(batch).is_dir(), f"-- Batches must be directories, got {batch} --"
+
+BATCHES = [batch.name for batch in BATCH_PATHS]
+
+
+## Helper rules (not fitting any particular goal)
+
+
+rule concatenate_batches:
+ input:
+ "resources/ATB/assemblies/{batch}",
+ output:
+ temp("resources/ATB/assemblies/{batch}.fasta"),
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/concatenate_{batch}.txt",
+ benchmark:
+ "log/benchmark/concatenate_{batch}.txt"
+ shell:
+ """
+cat {input}/*.fa > {output} 2> {log}
+ """
diff --git a/workflow/rules/identify_defences.smk b/workflow/rules/identify_defences.smk
new file mode 100644
index 0000000..6923b08
--- /dev/null
+++ b/workflow/rules/identify_defences.smk
@@ -0,0 +1,89 @@
+## Identify anti-phage defence systems using PADLOC
+
+
+rule download_padloc_database:
+ output:
+ directory=directory("resources/padloc_db"),
+ cm=directory("resources/padloc_db/cm"),
+ cm_meta="resources/padloc_db/cm_meta.txt",
+ hmm=directory("resources/padloc_db/hmm"),
+ hmm_meta="resources/padloc_db/hmm_meta.txt",
+ sys=directory("resources/padloc_db/sys"),
+ sys_meta="resources/padloc_db/sys_meta.txt",
+ system_info="resources/padloc_db/system_info.md",
+ conda:
+ "../envs/padloc.yaml"
+ threads: 1
+ log:
+ "log/download_padloc_database.txt",
+ benchmark:
+ "log/benchmark/download_padloc_database.txt"
+ shell:
+ """
+padloc --data resources/padloc_db --db-install v2.0.0 > {log} 2>&1
+ """
+
+
+rule padloc:
+ input:
+ batch="resources/ATB/assemblies/{batch}/",
+ db="resources/padloc_db",
+ output:
+ "results/padloc/{batch}/complete",
+ conda:
+ "../envs/padloc.yaml"
+ threads: config["padloc"]["threads"]
+ log:
+ "log/padloc/{batch}.txt",
+ benchmark:
+ "log/benchmark/padloc/{batch}.txt"
+ shell:
+ """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'mkdir -p "$(dirname {output})/{{/.}}" && padloc --data {input.db}\
+ --cpu 1 --fna {{}} --outdir "$(dirname {output})/{{/.}}"' > {log} 2>&1
+
+touch {output}
+ """
+
+
+rule concatenate_padloc_batches:
+ input:
+ "results/padloc/{batch}/complete",
+ output:
+ "results/padloc/{batch}-concatenated.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: config["padloc"]["threads"]
+ log:
+ "log/concatenate_padloc/{batch}.txt",
+ benchmark:
+ "log/benchmark/concatenate_padloc/{batch}.txt"
+ shell:
+ """
+file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") )
+head -1 ${{file_array[0]}} > {output}
+parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'tail -n +2 {{}} >> {output}' ::: ${{file_array[@]}}
+ """
+
+
+rule concatenate_padloc_all:
+ input:
+ expand("results/padloc/{batch}-concatenated.csv", batch=BATCHES),
+ output:
+ "results/padloc_table.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/concatenate_padloc_all.txt",
+ benchmark:
+ "log/benchmark/concatenate_padloc_all.txt"
+ shell:
+ """
+batches=( {input} )
+head -1 ${{batches[0]}} > {output}
+sed --separate 1d ${{batches[@]}} >> {output}
+ """
diff --git a/workflow/rules/map_spacers.smk b/workflow/rules/map_spacers.smk
new file mode 100644
index 0000000..3f6414c
--- /dev/null
+++ b/workflow/rules/map_spacers.smk
@@ -0,0 +1,295 @@
+### Map spacers to putative targets/protospacers
+## 2. By using SpacePHARER
+
+
+rule spacepharer_spacer_setup:
+ input:
+ spacers="results/crispridentify/all_spacers.fa",
+ output:
+ spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+ params:
+ tmp_folder=subpath(output.spacer_DB, parent=True),
+ conda:
+ "../envs/spacepharer.yaml"
+ threads: 48
+ log:
+ "log/spacepharer/spacepharer_spacer_setup.txt",
+ benchmark:
+ "log/benchmark/spacepharer/spacepharer_spacer_setup.txt"
+ shell:
+ """
+spacer_DB=$(dirname {output.spacer_DB})
+rm -rf $spacer_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.spacers} {output.spacer_DB}\
+ "{params.tmp_folder}/tmpFolder" --extractorf-spacer 1\
+ --threads {threads} >> {log} 2>&1
+ """
+
+
+## First to bacteriophages
+
+
+rule download_phage_database:
+ output:
+ phage_dir=directory("resources/phagescope"),
+ phage_archives=expand(
+ "resources/phagescope/{database}.tar.gz",
+ database=config["PhageScope_databases"],
+ ),
+ phage_fasta=expand(
+ "resources/phagescope/{database}.fasta",
+ database=config["PhageScope_databases"],
+ ),
+ combined_meta="resources/phagescope/phagescope_metadata.tsv",
+ params:
+ databases=config["PhageScope_databases"],
+ conda:
+ "../envs/bash.yaml"
+ threads: config["download_spacepharer_databases"]["threads"]
+ log:
+ out="log/download_phage_database.out",
+ err="log/download_phage_database.err",
+ benchmark:
+ "log/benchmark/download_phage_database.txt"
+ script:
+ "../scripts/download_phage_database.sh"
+
+
+rule spacepharer_phage_setup:
+ input:
+ db=collect(
+ "resources/phagescope/{database}.fasta",
+ database=[
+ "DDBJ",
+ "EMBL",
+ "Genbank",
+ "GPD",
+ "GVD",
+ "MGV",
+ "PhagesDB",
+ "RefSeq",
+ "TemPhD",
+ ],
+ ),
+ output:
+ phage_DB="results/spacepharer/phage_DB/targetsetDB",
+ phage_control_DB="results/spacepharer/phage_DB/controlsetDB",
+ params:
+ tmp_folder=subpath(output.phage_DB, ancestor=2),
+ conda:
+ "../envs/spacepharer.yaml"
+ threads: config["spacepharer"]["threads"]
+ log:
+ "log/spacepharer/spacepharer_phage_setup.txt",
+ benchmark:
+ "log/benchmark/spacepharer/spacepharer_setup.txt"
+ shell:
+ """
+phage_DB=$(dirname {output.phage_DB})
+rm -rf $phage_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.db} {output.phage_DB}\
+ "{params.tmp_folder}/tmpFolder" --threads {threads} >> {log} 2>&1
+spacepharer createsetdb {input.db} {output.phage_control_DB}\
+ "{params.tmp_folder}/tmpFolder" --reverse-fragments 1 --threads {threads} >> {log} 2>&1
+ """
+
+
+rule spacepharer_phage:
+ input:
+ spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+ phage_DB="results/spacepharer/phage_DB/targetsetDB",
+ phage_control_DB="results/spacepharer/phage_DB/controlsetDB",
+ output:
+ result="results/spacepharer/predicted_phage_matches.tsv",
+ result_sanitised="results/spacepharer/predicted_phage_matches_san.tsv",
+ params:
+ tmp_folder="results/spacepharer/tmpFolder",
+ conda:
+ "../envs/spacepharer.yaml"
+ threads: config["spacepharer"]["threads"]
+ log:
+ "log/spacepharer/spacepharer_phage.txt",
+ benchmark:
+ "log/benchmark/spacepharer/spacepharer_phage.txt"
+ shell:
+ """
+spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\
+ {input.phage_control_DB} {output.result} {params.tmp_folder}\
+ --threads {threads} > {log} 2>&1
+
+grep -v "#" {output.result} > {output.result_sanitised}
+rm -r {params.tmp_folder} >> {log} 2>&1
+ """
+
+
+## Then also to plasmids
+
+
+rule download_plasmid_database:
+ output:
+ plasmid_dir=directory("resources/PLSDB"),
+ plasmid_fasta="resources/PLSDB/sequences.fasta",
+ plasmid_nuccore="resources/PLSDB/nuccore.csv",
+ plasmid_taxonomy="resources/PLSDB/taxonomy.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ out="log/download_plasmid_database.out",
+ err="log/download_plasmid_database.err",
+ benchmark:
+ "log/benchmark/download_plasmid_database.txt"
+ script:
+ "../scripts/download_plasmid_database.sh"
+
+
+rule spacepharer_plasmid_setup:
+ input:
+ db="resources/PLSDB/sequences.fasta",
+ output:
+ DB="results/spacepharer/plasmid_DB/targetsetDB",
+ control_DB="results/spacepharer/plasmid_DB/controlsetDB",
+ params:
+ tmp_folder="results/spacepharer/tmpFolder",
+ conda:
+ "../envs/spacepharer.yaml"
+ threads: config["spacepharer"]["threads"]
+ log:
+ "log/spacepharer/spacepharer_plasmid_setup.txt",
+ benchmark:
+ "log/benchmark/spacepharer/spacepharer_plasmid_setup.txt"
+ shell:
+ """
+plasmid_DB=$(dirname {output.DB})
+rm -f $plasmid_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.db} {output.DB} {params.tmp_folder}\
+ --threads {threads} >> {log} 2>&1
+spacepharer createsetdb {input.db} {output.control_DB} {params.tmp_folder}\
+ --reverse-fragments 1 --threads {threads} >> {log} 2>&1
+ """
+
+
+rule spacepharer_plasmid:
+ input:
+ phage_DB="results/spacepharer/plasmid_DB/targetsetDB",
+ phage_control_DB="results/spacepharer/plasmid_DB/controlsetDB",
+ spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+ output:
+ result="results/spacepharer/predicted_plasmid_matches.tsv",
+ result_sanitised="results/spacepharer/predicted_plasmid_matches_san.tsv",
+ params:
+ tmp_folder="results/spacepharer/tmpFolder",
+ conda:
+ "../envs/spacepharer.yaml"
+ threads: config["spacepharer"]["threads"]
+ log:
+ "log/spacepharer/spacepharer_phage.txt",
+ benchmark:
+ "log/benchmark/spacepharer/spacepharer_phage.txt"
+ shell:
+ """
+spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\
+ {input.phage_control_DB} {output.result} {params.tmp_folder}\
+ --threads {threads} > {log} 2>&1
+
+grep -v "#" {output.result} > {output.result_sanitised}
+rm -r {params.tmp_folder} >> {log} 2>&1
+ """
+
+
+rule create_spacepharer_table:
+ input:
+ phage="results/spacepharer/predicted_phage_matches_san.tsv",
+ phage_meta="resources/phagescope/merged_metadata.tsv",
+ plasmid="results/spacepharer/predicted_plasmid_matches_san.tsv",
+ plasmid_nuccore="resources/PLSDB/nuccore.csv",
+ plasmid_taxonomy="resources/PLSDB/taxonomy.csv",
+ output:
+ phage="results/phage_matches.tsv",
+ plasmid="results/plasmid_matches.tsv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/create_spacepharer_table.txt",
+ script:
+ "../scripts/create_spacepharer_table.sh"
+
+
+## 2. By using KMA to the input genomes
+
+
+rule kma_indexing:
+ input:
+ spacers="results/crispridentify/all_spacers.fa",
+ output:
+ indexed_spacers="results/kma/spacer_DB/spacers.name",
+ params:
+ "results/kma/spacer_DB/spacers",
+ conda:
+ "../envs/kma.yaml"
+ threads: config["kma"]["threads"]
+ log:
+ "log/kma/kma_index.txt",
+ benchmark:
+ "log/benchmark/kma/kma_index.txt"
+ shell:
+ """
+kma index -i {input.spacers} -o {params} > {log} 2>&1
+ """
+
+
+rule kma:
+ input:
+ genomes=expand("resources/ATB/assemblies/{batch}/", batch=BATCHES),
+ indexed_spacers="results/kma/spacer_DB/spacers.name",
+ output:
+ "results/kma/output/CRISPR.frag.gz",
+ params:
+ output=subpath(output[0], parent=True),
+ indexed_spacers=subpath(input.indexed_spacers, parent=True),
+ spacers="results/crispridentify/all_spacers.fa",
+ conda:
+ "../envs/kma.yaml"
+ threads: config["kma"]["threads"]
+ log:
+ "log/kma/kma.txt",
+ benchmark:
+ "log/benchmark/kma/kma.txt"
+ shell:
+ """
+grep ">" {params.spacers} | cut -f 2 -d ">" | cut -f 1 -d "-" | sort -u > tmp_file
+find -L {input.genomes} -mindepth 1 -maxdepth 1 -type f -name "*.fa" > all_genomes.txt
+genomes=$(grep -x ".*[0-9]\\.fa" all_genomes.txt | grep -v -f tmp_file)
+
+kma -hmm -i $genomes -o {params.output} -t_db "{params.indexed_spacers}/spacers" > {log} 2>&1
+rm tmp_file all_genomes.txt
+ """
+
+
+rule collect_kma:
+ input:
+ "results/kma/output/CRISPR.frag.gz",
+ output:
+ "results/kma/CRISPR_alignment",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/kma/collect_kma.txt",
+ benchmark:
+ "log/benchmark/kma/collect_kma.txt"
+ shell:
+ """
+echo -e "spacer\tgenome" > {output}
+zcat {input} | cut -f 6,7 | cut -f 1 -d " " > tmp_file
+while read line; do
+ match=$(echo $line | cut -f 2)
+ crispr=$(echo $line | cut -f 1 | cut -f 1,6,7,10,11 -d "_")
+ echo -e "$crispr\t$match" >> {output}
+done < tmp_file
+rm tmp_file
+ """
diff --git a/workflow/rules/refine_crispr-cas.smk b/workflow/rules/refine_crispr-cas.smk
new file mode 100644
index 0000000..c0835d1
--- /dev/null
+++ b/workflow/rules/refine_crispr-cas.smk
@@ -0,0 +1,113 @@
+### Refine CRISPR-Cas identifation
+
+
+rule crispridentify:
+ input:
+ "results/cctyper/{batch}/subseq",
+ output:
+ spacers="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+ summary="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+ params:
+ out_dir=subpath(output[0], parent=True),
+ arrays=subpath(input[0], parent=True),
+ conda:
+ "../envs/crispridentify.yaml"
+ threads: config["crispridentify"]["threads"]
+ log:
+ "log/crispridentify/{batch}.txt",
+ benchmark:
+ "log/benchmark/crispridentify/{batch}.txt"
+ shell:
+ """
+cd bin/CRISPRidentify
+
+find ../../{params.arrays}/*/fasta/CRISPR_arrays-with_flanks.fasta -size +0c -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail,1'\
+ 'python CRISPRidentify.py --file {{}}\
+ --result_folder "../../{params.out_dir}/"\
+ --fasta_report True --strand False' > ../../{log} 2>&1
+ """
+
+
+rule merge_crispridentify_batches:
+ input:
+ spacers_crispr=expand(
+ "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+ batch=BATCHES,
+ ),
+ summary_crispr=expand(
+ "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+ batch=BATCHES,
+ ),
+ output:
+ spacers_crispr="results/crispridentify/all_spacers.fa",
+ summary_crispr="results/crispridentify/complete_summary.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/merge_crispridentify_batches.txt",
+ benchmark:
+ "log/benchmark/merge_crispridentify_batches.txt"
+ script:
+ "../scripts/merge_crispridentify_batches.sh"
+
+
+rule merge_cctyper_identify:
+ input:
+ identify="results/crispridentify/complete_summary.csv",
+ cctyper=expand("results/cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES),
+ output:
+ table="results/all_CRISPRS_with_identify.tab",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/merge_cctyper_identify.txt",
+ benchmark:
+ "log/benchmark/merge_cctyper_identify.txt"
+ script:
+ "../scripts/merge_cctyper_crispridentify.sh"
+
+
+rule cluster_unique_spacers_crispridentify:
+ input:
+ "results/crispridentify/all_spacers.fa",
+ output:
+ clusters="results/crispridentify/all_spacers-clustered.clstr",
+ spacers="results/crispridentify/all_spacers-clustered",
+ distribution="results/crispridentify/all_spacers-clustered-distribution.tsv",
+ conda:
+ "../envs/cdhit.yaml"
+ threads: 1
+ log:
+ "log/cluster_unique_spacers_identify.txt",
+ benchmark:
+ "log/benchmark/cluster_unique_spacers_identify.txt"
+ shell:
+ """
+cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
+ -i {input} -o {output.spacers} > {log} 2>&1
+
+plot_len1.pl {output.clusters}\
+ 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
+ 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
+ > {output.distribution}
+ """
+
+
+rule create_crispr_cluster_table_identify:
+ input:
+ clstr="results/crispridentify/all_spacers-clustered.clstr",
+ fasta="results/crispridentify/all_spacers.fa",
+ output:
+ "results/all_spacers_table_identify.tsv",
+ conda:
+ "../envs/pyfaidx_pandas.yaml"
+ threads: 1
+ log:
+ "log/create_crispr_cluster_table_identify.txt",
+ benchmark:
+ "log/benchmark/create_crispr_cluster_table_identify.txt"
+ script:
+ "../scripts/make_cluster_table_identify.py"
diff --git a/workflow/rules/screen_crispr-cas.smk b/workflow/rules/screen_crispr-cas.smk
new file mode 100644
index 0000000..7d78f56
--- /dev/null
+++ b/workflow/rules/screen_crispr-cas.smk
@@ -0,0 +1,195 @@
+### Screen for CRISPR-Cas
+## Run CCTyper and parse/merge its output
+
+
+rule crisprcastyper:
+ input:
+ batch="resources/ATB/assemblies/{batch}/",
+ output:
+ "results/cctyper/{batch}/complete",
+ params:
+ out_dir=subpath(output[0], parent=True),
+ conda:
+ "../envs/cctyper.yaml"
+ threads: config["cctyper"]["threads"]
+ log:
+ "log/cctyper/{batch}.txt",
+ benchmark:
+ "log/benchmark/cctyper/{batch}.txt"
+ shell:
+ """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'rm -rf "{params.out_dir}{{/.}}" &&\
+ cctyper -t 1 {{}} "{params.out_dir}/{{/.}}"' > {log} 2>&1
+
+touch {output}
+ """
+
+
+rule parse_cctyper:
+ input:
+ "results/cctyper/{batch}/complete",
+ output:
+ "results/cctyper/{batch}/parsed",
+ conda:
+ "../envs/pandas.yaml"
+ threads: config["parse_cctyper"]["threads"]
+ log:
+ "log/parse_cctyper/{batch}.txt",
+ benchmark:
+ "log/benchmark/parse_cctyper/{batch}.txt"
+ shell:
+ """
+find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\
+parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ python workflow/scripts/cctyper_extender.py -d {{.}} > {log} 2>&1
+
+touch {output}
+ """
+
+
+rule extract_sequences:
+ input:
+ flag="results/cctyper/{batch}/parsed",
+ genomes="resources/ATB/assemblies/{batch}",
+ output:
+ "results/cctyper/{batch}/subseq",
+ conda:
+ "../envs/seqkit.yaml"
+ threads: config["extract_sequences"]["threads"]
+ log:
+ "log/extract_sequences/{batch}.txt",
+ benchmark:
+ "log/benchmark/extract_sequences/{batch}.txt"
+ shell:
+ """
+find $(dirname {input.flag}) -mindepth 1 -maxdepth 1 -type d -print0 |\
+parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ bash workflow/scripts/extract_crispr-cas_from_fasta.sh {{}} {input.genomes} > {log} 2>&1
+
+touch {output}
+ """
+
+
+rule collect_cctyper:
+ input:
+ cctyper="results/cctyper/{batch}/complete",
+ parser="results/cctyper/{batch}/parsed",
+ output:
+ crispr_cas="results/cctyper/{batch}/CRISPR_Cas-{batch}.tab",
+ crisprs_all="results/cctyper/{batch}/crisprs_all-{batch}.tab",
+ crisprs_near_cas="results/cctyper/{batch}/crisprs_near_cas-{batch}.tab",
+ crisprs_orphan="results/cctyper/{batch}/crisprs_orphan-{batch}.tab",
+ spacers="results/cctyper/{batch}/all_spacers-{batch}.fa",
+ cas_putative="results/cctyper/{batch}/cas_operons_putative-{batch}.tab",
+ cas="results/cctyper/{batch}/cas_operons-{batch}.tab",
+ csv="results/cctyper/{batch}/CRISPR-Cas-{batch}.csv",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/cctyper/collect_{batch}.txt",
+ benchmark:
+ "log/benchmark/cctyper/collect_{batch}.txt"
+ shell:
+ """
+bash workflow/scripts/concatenate_cctyper_output.sh $(dirname {input.cctyper}) > {log} 2>&1
+echo "\n========================" >> {log}
+bash workflow/scripts/concatenate_cctyper_csv.sh $(dirname {input.parser}) >> {log} 2>&1
+
+find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat {{}} + > {output.spacers} 2>> {log}
+ """
+
+
+rule concatenate_all_spacers:
+ input:
+ expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
+ output:
+ "results/cctyper/all_spacers.fa",
+ conda:
+ "../envs/bash.yaml"
+ threads: 1
+ log:
+ "log/concatenate_all_spacers.txt",
+ benchmark:
+ "log/benchmark/concatenate_all_spacers.txt"
+ shell:
+ """
+cat {input} > {output} 2> {log}
+ """
+
+
+rule cluster_all_spacers:
+ input:
+ "results/cctyper/all_spacers.fa",
+ output:
+ clusters=expand(
+ "results/cctyper/all_spacers-clustered-{cutoff}.clstr",
+ cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
+ ),
+ spacers=expand(
+ "results/cctyper/all_spacers-clustered-{cutoff}",
+ cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
+ ),
+ summary="results/cctyper/spacer_cluster_summary.tsv",
+ params:
+ work_dir=subpath(input[0], parent=True),
+ log_dir="log/spacer_clustering",
+ conda:
+ "../envs/cdhit.yaml"
+ threads: 1
+ log:
+ "log/cluster_all_spacers.txt",
+ benchmark:
+ "log/benchmark/cluster_all_spacers.txt"
+ shell:
+ """
+bash workflow/scripts/cluster_all_spacers.sh\
+ {input}\
+ {params.work_dir}\
+ {params.log_dir} > {log} 2>&1
+ """
+
+
+rule cluster_unique_spacers:
+ input:
+ "results/cctyper/all_spacers.fa",
+ output:
+ clusters="results/cctyper/all_spacers-clustered.clstr",
+ spacers="results/cctyper/all_spacers-clustered",
+ distribution="results/cctyper/all_spacers-clustered-distribution.tsv",
+ conda:
+ "../envs/cdhit.yaml"
+ threads: 1
+ log:
+ "log/cluster_unique_spacers.txt",
+ benchmark:
+ "log/benchmark/cluster_unique_spacers.txt"
+ shell:
+ """
+cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
+ -i {input} -o {output.spacers} > {log} 2>&1
+
+plot_len1.pl {output.clusters}\
+ 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
+ 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
+ > {output.distribution}
+ """
+
+
+rule create_crispr_cluster_table:
+ input:
+ clstr="results/cctyper/all_spacers-clustered.clstr",
+ fasta="results/cctyper/all_spacers.fa",
+ output:
+ "results/all_spacers_table.tsv",
+ conda:
+ "../envs/pyfaidx_pandas.yaml"
+ threads: 1
+ log:
+ "log/create_crispr_cluster_table.txt",
+ benchmark:
+ "log/benchmark/create_crispr_cluster_table.txt"
+ script:
+ "../scripts/make_cluster_table.py"
diff --git a/bin/cctyper_extender.py b/workflow/scripts/cctyper_extender.py
similarity index 100%
rename from bin/cctyper_extender.py
rename to workflow/scripts/cctyper_extender.py
diff --git a/bin/cluster_all_spacers.sh b/workflow/scripts/cluster_all_spacers.sh
similarity index 100%
rename from bin/cluster_all_spacers.sh
rename to workflow/scripts/cluster_all_spacers.sh
diff --git a/bin/collect_genomad_predictions.R b/workflow/scripts/collect_genomad_predictions.R
similarity index 90%
rename from bin/collect_genomad_predictions.R
rename to workflow/scripts/collect_genomad_predictions.R
index 0cc783e..7356717 100644
--- a/bin/collect_genomad_predictions.R
+++ b/workflow/scripts/collect_genomad_predictions.R
@@ -17,13 +17,13 @@ read_stats <- function(filename, name_position) {
sample_name <- str_split_1(string = filename, pattern = "/") %>%
tail(name_position) %>%
head(1)
-
+
df <- read_delim(
file = filename,
show_col_types = FALSE
) %>%
mutate(batch = sample_name)
-
+
return(df)
}
@@ -34,10 +34,11 @@ genomad_scores <- do.call(
rename("contig" = "seq_name")
genomad_scores <- genomad_scores %>%
- mutate(genome = gsub(pattern = "\\.contig[0-9]*",
- replacement = "",
- x = contig)
- )
+ mutate(genome = gsub(
+ pattern = "\\.contig[0-9]*",
+ replacement = "",
+ x = contig
+ ))
plasmid_classifications <- do.call(
rbind,
@@ -78,10 +79,10 @@ genomad_df <- left_join(
!is.na(plasmid_topology) ~ "plasmid",
!is.na(virus_topology) ~ "virus",
TRUE ~ "chromosome"
- )
+ )
)
write_csv(
x = genomad_df,
- file = snakemake@output[[1]] #here("data", "processed", "genomad_predictions.csv")
-)
\ No newline at end of file
+ file = snakemake@output[[1]] # here("data", "processed", "genomad_predictions.csv")
+)
diff --git a/bin/collect_jaeger_batch.R b/workflow/scripts/collect_jaeger_batch.R
similarity index 61%
rename from bin/collect_jaeger_batch.R
rename to workflow/scripts/collect_jaeger_batch.R
index 167e0aa..ef50d11 100644
--- a/bin/collect_jaeger_batch.R
+++ b/workflow/scripts/collect_jaeger_batch.R
@@ -9,7 +9,7 @@ suppressPackageStartupMessages({
})
# Read all output files in a batch (one per genome)
-jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
+jaeger_files <- Sys.glob(paths = here(snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
# Concatenate them all in one dataframe
jaeger_df <- do.call(
@@ -20,15 +20,21 @@ jaeger_df <- do.call(
# Simplify the dataframe by extracting essential info
jaeger_df_simple <- jaeger_df %>%
mutate(
- contig = gsub(pattern = " .*",
- replacement = "",
- x = contig_id),
- accession_id = gsub(pattern = ".contig[0-9]*",
- replacement = "",
- x = contig)
+ contig = gsub(
+ pattern = " .*",
+ replacement = "",
+ x = contig_id
+ ),
+ accession_id = gsub(
+ pattern = ".contig[0-9]*",
+ replacement = "",
+ x = contig
+ )
) %>%
select(accession_id, contig, length, prediction, reliability_score, prophage_contam)
# And write to a CSV file (which can easily be concatenated with a script)
-write_csv(x = jaeger_df_simple,
- file = snakemake@output[[1]])
\ No newline at end of file
+write_csv(
+ x = jaeger_df_simple,
+ file = snakemake@output[[1]]
+)
diff --git a/bin/collect_jaeger_predictions.sh b/workflow/scripts/collect_jaeger_predictions.sh
similarity index 100%
rename from bin/collect_jaeger_predictions.sh
rename to workflow/scripts/collect_jaeger_predictions.sh
diff --git a/bin/concatenate_cctyper_csv.sh b/workflow/scripts/concatenate_cctyper_csv.sh
similarity index 100%
rename from bin/concatenate_cctyper_csv.sh
rename to workflow/scripts/concatenate_cctyper_csv.sh
diff --git a/bin/concatenate_cctyper_output.sh b/workflow/scripts/concatenate_cctyper_output.sh
similarity index 100%
rename from bin/concatenate_cctyper_output.sh
rename to workflow/scripts/concatenate_cctyper_output.sh
diff --git a/bin/create_spacepharer_table.sh b/workflow/scripts/create_spacepharer_table.sh
similarity index 81%
rename from bin/create_spacepharer_table.sh
rename to workflow/scripts/create_spacepharer_table.sh
index 888b286..ecf3812 100644
--- a/bin/create_spacepharer_table.sh
+++ b/workflow/scripts/create_spacepharer_table.sh
@@ -7,7 +7,7 @@
echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > ${snakemake_output[phage]}
while read line; do
ID=$(echo $line | cut -f 2)
- metadata_match=$(grep -w "$ID" ${snakemake_input[meta_phage]}/merged_metadata.tsv | cut -f 2-7)
+ metadata_match=$(grep -w "$ID" "${snakemake_input[phage_meta]}" | cut -f 2-7)
echo -e "$line\t$metadata_match" >> ${snakemake_output[phage]}
done < ${snakemake_input[phage]}
@@ -16,7 +16,7 @@ echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end
while read line; do
ID=$(echo $line | cut -f 2)
#PLSDB uses a metadata system where there are many different files for differing purposes. taxonomy.csv uses a different ID so the taxonomy ID needs to be collected from nuccore and then matched to taxonomy
- nuccore_match=$(grep -w "$ID" ${snakemake_input[meta_plasmid]}/nuccore.csv | cut -f 13 -d ",")
- taxonomy_match=$(grep -w "^$nuccore_match" ${snakemake_input[meta_plasmid]}/taxonomy.csv | cut -f 3 -d ",")
+ nuccore_match=$(grep -w "$ID" "${snakemake_input[plasmid_nuccore]}" | cut -f 13 -d ",")
+ taxonomy_match=$(grep -w "^$nuccore_match" "${snakemake_input[plasmid_taxonomy]}" | cut -f 3 -d ",")
echo -e "$line\t$taxonomy_match" >> ${snakemake_output[plasmid]}
done < ${snakemake_input[plasmid]}
diff --git a/workflow/scripts/download_phage_database.sh b/workflow/scripts/download_phage_database.sh
new file mode 100644
index 0000000..f84fd19
--- /dev/null
+++ b/workflow/scripts/download_phage_database.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+## This script downloads and prepares the phage database: Phagescope
+## (https://phagescope.deepomics.org/) for use with SpacePHARER.
+set -euo pipefail
+
+exec > "${snakemake_log[out]}" # send stdout to a log file
+exec 2> "${snakemake_log[err]}" # also send stderr to a log file
+
+. "${snakemake[scriptdir]}/utils.sh"
+
+threads=${snakemake[threads]}
+phage_dir="${snakemake_output[phage_dir]}"
+databases=(${snakemake_params[databases]})
+
+message "Downloading PhageScope databases (archived fasta files)"
+message "This comprises ${#databases[@]} parts, as specified in 'config/parameters.yaml'"
+echo "-----"
+
+for DB in "${databases[@]}"
+do
+ message "Downloading ${DB}"
+ wget -O "${phage_dir}/${DB}.tar.gz"\
+ "https://phageapi.deepomics.org/download/phage/fasta/?datasource=${DB}"
+done
+
+echo "-----"
+message "Extracting databases\n"
+
+# Extracts the .tar.gz archives to subdirectories with separate fasta files.
+
+parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\
+ [ -f "${phage_dir}/${DB}.fasta" ] || [ -d "${phage_dir}/${DB}" ] && \
+ message "${DB} already extracted, skipping..." || \
+ ( message "Extracting ${DB}"; tar -xzf "${phage_dir}/${DB}.tar.gz" -C "${phage_dir}/" )'\
+ ::: "${databases[@]}" ::: ${phage_dir}
+
+echo "-----"
+message "Concatenating PhageScope sequences\n"
+
+# Concatenate separate fasta files in one long file,
+# and then remove the directory with the separate files.
+
+parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\
+ ( message "Concatenating ${DB}"; genomes=$(find "${phage_dir}/${DB}" -type f -name "*.fasta");\
+ > "${phage_dir}/${DB}.fasta";\
+ for files in ${genomes}; do cat ${files} >> "${phage_dir}/${DB}.fasta"; done;
+ rm -r "${phage_dir}/${DB}")' \
+ ::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD ::: ${phage_dir}
+
+echo "-----"
+message "Downloading Phagescope metadata\n"
+
+# Download metadata as tab-separated values (TSV) text file.
+
+for DB in "${databases[@]}"
+do
+ metadata_file="${phage_dir}/${DB}_phage_metadata.tsv"
+ message "Downloading ${DB}"
+ wget -O "${metadata_file}" "https://phageapi.deepomics.org/files/Download/Phage_meta_data/${DB,,}_phage_meta_data.tsv"
+ # With ${DB,,} the string variable is converted to lowercase
+done
+
+echo "-----"
+message "Concatenating PhageScope metadata files\n"
+
+# Make one long file with all metadata from the different databases.
+combined_metadata="${snakemake_output[combined_meta]}"
+
+# Rename the first metadata file into the concatenated file
+mv "${phage_dir}/${databases[0]}_phage_metadata.tsv" ${combined_metadata}
+
+# Then, for all the other metadata files
+for DB in "${databases[@]:1}"
+do
+ metadata_file="${phage_dir}/${DB}_phage_metadata.tsv"
+ # Take all but the first line (header)
+ tail -n +2 "${metadata_file}" >> "${combined_metadata}"
+ # and remove (clean-up, avoid keeping duplicated data)
+ rm ${metadata_file}
+done
+
+message "--- Done! ---"
diff --git a/workflow/scripts/download_plasmid_database.sh b/workflow/scripts/download_plasmid_database.sh
new file mode 100644
index 0000000..440596c
--- /dev/null
+++ b/workflow/scripts/download_plasmid_database.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+## This script downloads and prepares the plasmid database: PLSDB
+## (https://ccb-microbe.cs.uni-saarland.de/plsdb2025/) for use with SpacePHARER.
+set -euo pipefail
+
+exec > "${snakemake_log[out]}" # send all stdout to a log file
+exec 2> "${snakemake_log[err]}" # send stderr to separate log file
+
+. "${snakemake[scriptdir]}/utils.sh"
+
+plasmid_dir=${snakemake_output[plasmid_dir]}
+plasmid_archive="${plasmid_dir}/download_meta.tar.gz"
+
+message "Downloading PLSDB plasmid database"
+echo "-----"
+
+wget -P "${plasmid_dir}" https://ccb-microbe.cs.uni-saarland.de/plsdb2025/download_meta.tar.gz
+
+message "Extracting PLSDB"
+tar -xzf "${plasmid_archive}" -C "${plasmid_dir}"
+
+message "Extracting sequences"
+bzip2 -d "${plasmid_dir}/sequences.fasta.bz2"
+
+message "Adjusting metadata delimiters"
+sed -i -E ':a;s/"([^"]*),([^"]*)"/"\1\2"/g;ta' "${plasmid_dir}/nuccore.csv"
+
+echo -e "-----\nDone!\n-----"
diff --git a/bin/extract_crispr-cas_from_fasta.sh b/workflow/scripts/extract_crispr-cas_from_fasta.sh
similarity index 89%
rename from bin/extract_crispr-cas_from_fasta.sh
rename to workflow/scripts/extract_crispr-cas_from_fasta.sh
index a012e77..177b8c7 100644
--- a/bin/extract_crispr-cas_from_fasta.sh
+++ b/workflow/scripts/extract_crispr-cas_from_fasta.sh
@@ -3,8 +3,8 @@
cctyper_dir=$1
sample_name=$(basename ${cctyper_dir})
-batch_name=$(basename $(dirname ${cctyper_dir}))
-fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa"
+genome_dir=$2
+fasta_file="${genome_dir}/${sample_name}.fa"
bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") )
diff --git a/workflow/scripts/make_cluster_table.py b/workflow/scripts/make_cluster_table.py
new file mode 100644
index 0000000..11b9d84
--- /dev/null
+++ b/workflow/scripts/make_cluster_table.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+################################################################
+# Read CD-HIT's output cluster file, along with the FASTA used #
+# to generate it to create a table of clustered sequences. #
+# - Aim of the program: #
+# 1) read the cluster file #
+# a) create a cluster-based dictionary (cluster #, #
+# representative) #
+# b) create a sequence-based dictionary (ID, length, #
+# representative or % identity and strand) #
+# 2) combine the two dictionaries into a dataframe with #
+# columns: genome, contig, locus, cluster ID, length, #
+# longest_sequence (CD-HIT representative), sequence, #
+# identity (%), strand #
+# 3) for each cluster, also mark the shortest and most common#
+# sequence (as alternative representatives) #
+# 4) write the dataframe to a tab-separated table file #
+################################################################
+
+import re
+from pyfaidx import Fasta
+import pandas as pd
+
+cluster_file = snakemake.input["clstr"]
+fasta_file = snakemake.input["fasta"]
+table_file = snakemake.output[0]
+
+
+def read_clstr_file(inputfile=str):
+ """
+ Read a CD-HIT generated .clstr file and parse its elements into
+ two dictionaries: 1) cluster ID + representative (locus ID),
+ 2) sequences (locus ID) with length and
+ representative/identity
+ """
+ cluster_dict = {"Cluster": [], "Longest_sequence": []}
+ locus_dict = {
+ "Genome": [],
+ "Contig": [],
+ "Locus": [],
+ "Full_locus": [],
+ "Length": [],
+ "Cluster": [],
+ "Strand_to_longest": [],
+ "Identity_to_longest": [],
+ }
+
+ cluster_regex = r"(>Cluster *)(\d*)"
+ locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_\d+:\d+)... (.*)$"
+
+ with open(inputfile, "r") as infile:
+ for line in infile:
+ line = line.strip() # Remove funny characters
+
+ if line.startswith(">"):
+ # Extract the digits from the cluster ID
+ cluster = re.search(cluster_regex, line).group(2)
+
+ elif len(line) > 1:
+ # Use RegEx to extract information
+ crispr_info = re.search(locus_regex, line)
+
+ member_nr = crispr_info.group(1) # not used
+ length = int(crispr_info.group(2).rstrip("nt"))
+ genome = crispr_info.group(3)
+ locus = crispr_info.group(4)
+ full_locus = f"{genome}.{locus}"
+ contig = locus.split("_")[0]
+ extra = crispr_info.group(5)
+
+ # Check the final group for representative ('*') or other
+ if extra == "*":
+ strand = "NA"
+ identity = "NA"
+ cluster_dict["Cluster"].append(cluster)
+ cluster_dict["Longest_sequence"].append(full_locus)
+
+ else:
+ strand_and_identity = extra.split("/")
+ strand = strand_and_identity[0].replace("at ", "")
+ identity = strand_and_identity[1]
+
+ locus_dict["Genome"].append(genome)
+ locus_dict["Contig"].append(contig)
+ locus_dict["Locus"].append(locus)
+ locus_dict["Full_locus"].append(full_locus)
+ locus_dict["Length"].append(length)
+ locus_dict["Cluster"].append(cluster)
+ locus_dict["Strand_to_longest"].append(strand)
+ locus_dict["Identity_to_longest"].append(identity)
+
+ # If the line does not start with '>' or have length > 1, stop
+ else:
+ break
+
+ return cluster_dict, locus_dict
+
+
+def generate_sequence_df(fasta=str, ids=list):
+ """
+ Given a fasta file and list of identifiers, create a dataframe
+ of DNA sequences that can be merged with the cluster/locus dataframe.
+ """
+ sequence_dict = Fasta(fasta, duplicate_action="first")
+ sequence_list = []
+ for locus in ids:
+ sequence_list.append(sequence_dict[locus][:].seq)
+
+ return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list})
+
+
+def main():
+ """
+ Main function, running the whole script.
+ """
+ # Read clstr file, store as dictionaries
+ cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file)
+
+ # Convert dictionaries to dataframes
+ cluster_df = pd.DataFrame(cluster_dict)
+ locus_df = pd.DataFrame(locus_dict)
+
+ # Merge dataframes
+ combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster")
+
+ # Add sequences
+ sequence_df = generate_sequence_df(fasta=fasta_file, ids=locus_df["Full_locus"])
+
+ combined_with_sequences = combined_df.merge(
+ sequence_df, how="inner", on="Full_locus"
+ )
+
+ ## Not yet implemented:
+ # Find shortest sequence per cluster
+ # Find most common sequence per cluster
+
+ # Save as tab-separated text file
+ combined_with_sequences.to_csv(table_file, sep="\t", index=False)
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/workflow/scripts/make_cluster_table_identify.py b/workflow/scripts/make_cluster_table_identify.py
new file mode 100644
index 0000000..237361f
--- /dev/null
+++ b/workflow/scripts/make_cluster_table_identify.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+
+################################################################
+# Read CD-HIT's output cluster file, along with the FASTA used #
+# to generate it to create a table of clustered sequences. #
+# - Aim of the program: #
+# 1) read the cluster file #
+# a) create a cluster-based dictionary (cluster #, #
+# representative) #
+# b) create a sequence-based dictionary (ID, length, #
+# representative or % identity and strand) #
+# 2) combine the two dictionaries into a dataframe with #
+# columns: genome, contig, locus, cluster ID, length, #
+# longest_sequence (CD-HIT representative), sequence, #
+# identity (%), strand #
+# 3) for each cluster, also mark the shortest and most common#
+# sequence (as alternative representatives) #
+# 4) write the dataframe to a tab-separated table file #
+################################################################
+
+## This is a slightly adjusted script from make_cluster_table.py as the CRISPRidentify output is different.
+
+import re
+from pyfaidx import Fasta
+import pandas as pd
+
+cluster_file = snakemake.input["clstr"]
+fasta_file = snakemake.input["fasta"]
+table_file = snakemake.output[0]
+
+
+def read_clstr_file(inputfile=str):
+ """
+ Read a CD-HIT generated .clstr file and parse its elements into
+ two dictionaries: 1) cluster ID + representative (locus ID),
+ 2) sequences (locus ID) with length and
+ representative/identity
+ """
+ cluster_dict = {"Cluster": [], "Longest_sequence": []}
+ locus_dict = {
+ "Genome": [],
+ "Contig": [],
+ "Locus": [],
+ "Full_locus": [],
+ "Length": [],
+ "Cluster": [],
+ "Strand_to_longest": [],
+ "Identity_to_longest": [],
+ }
+
+ cluster_regex = r"(>Cluster *)(\d*)"
+ locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_.+)\.\.\. (.*)$"
+
+ with open(inputfile, "r") as infile:
+ for line in infile:
+ line = line.strip() # Remove funny characters
+
+ if line.startswith(">"):
+ # Extract the digits from the cluster ID
+ cluster = re.search(cluster_regex, line).group(2)
+
+ elif len(line) > 1:
+ # Use RegEx to extract information
+ crispr_info = re.search(locus_regex, line)
+
+ member_nr = crispr_info.group(1) # not used
+ length = int(crispr_info.group(2).rstrip("nt"))
+ genome = crispr_info.group(3)
+ locus = crispr_info.group(4)
+ full_locus = f"{genome}-{locus}"
+ contig = locus.split("_")[0]
+ extra = crispr_info.group(5)
+
+ # Check the final group for representative ('*') or other
+ if extra == "*":
+ strand = "NA"
+ identity = "NA"
+ cluster_dict["Cluster"].append(cluster)
+ cluster_dict["Longest_sequence"].append(full_locus)
+
+ else:
+ strand_and_identity = extra.split("/")
+ strand = strand_and_identity[0].replace("at ", "")
+ identity = strand_and_identity[1]
+
+ locus_dict["Genome"].append(genome)
+ locus_dict["Contig"].append(contig)
+ locus_dict["Locus"].append(locus)
+ locus_dict["Full_locus"].append(full_locus)
+ locus_dict["Length"].append(length)
+ locus_dict["Cluster"].append(cluster)
+ locus_dict["Strand_to_longest"].append(strand)
+ locus_dict["Identity_to_longest"].append(identity)
+
+ # If the line does not start with '>' or have length > 1, stop
+ else:
+ break
+
+ return cluster_dict, locus_dict
+
+
+def generate_sequence_df(fasta=str, ids=list):
+ """
+ Given a fasta file and list of identifiers, create a dataframe
+ of DNA sequences that can be merged with the cluster/locus dataframe.
+ """
+ sequence_dict = Fasta(fasta, duplicate_action="first")
+ sequence_list = []
+ for locus in ids:
+ sequence_list.append(sequence_dict[locus][:].seq)
+
+ return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list})
+
+
+def main():
+ """
+ Main function, running the whole script.
+ """
+ # Read clstr file, store as dictionaries
+ cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file)
+
+ # Convert dictionaries to dataframes
+ cluster_df = pd.DataFrame(cluster_dict)
+ locus_df = pd.DataFrame(locus_dict)
+
+ # Merge dataframes
+ combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster")
+
+ # Add sequences
+ sequence_df = generate_sequence_df(fasta=fasta_file, ids=locus_df["Full_locus"])
+
+ combined_with_sequences = combined_df.merge(
+ sequence_df, how="inner", on="Full_locus"
+ )
+
+ ## Not yet implemented:
+ # Find shortest sequence per cluster
+ # Find most common sequence per cluster
+
+ # Save as tab-separated text file
+ combined_with_sequences.to_csv(table_file, sep="\t", index=False)
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/workflow/scripts/merge_cctyper_crispridentify.sh b/workflow/scripts/merge_cctyper_crispridentify.sh
new file mode 100644
index 0000000..e59393d
--- /dev/null
+++ b/workflow/scripts/merge_cctyper_crispridentify.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+exec 2> "${snakemake_log[0]}" # send all stderr to the log file
+
+cctyper=(${snakemake_input[cctyper]})
+
+first=True
+for summary in "${cctyper[@]}"
+do
+ # If it is the first, copy the whole file (including header)
+ if [ "${first}" == True ]
+ then
+ cat "${summary}" > tmp_file1
+ first=False
+ # otherwise, only take the 'contents', without header
+ else
+ tail -n +2 "${summary}" >> tmp_file1
+ fi
+ # And write it all in one concatenated temporary file: 'tmp_file1'
+done
+
+# For CRISPRidentify, collect the header as variable
+header=$(head -n 1 ${snakemake_input[identify]} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t")
+# and write the contents of the 'complete summary' to another temporary file: 'tmp_file2'
+tail -n +2 ${snakemake_input[identify]} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t" > tmp_file2
+
+first=True
+
+# Start reading 'tmp_file1' = CCTyper CRISPRs
+while read line
+do
+ if [ "${first}" == True ]
+ # For the first line...
+ then
+ first=False
+ # ...concatenate the header with CRISPRidentify's header
+ echo -e "$line\t$header" > ${snakemake_output[table]}
+ else
+
+ # For all other lines,
+ sample=$(echo -e "${line}" | cut -f 1)
+ start_cc=$(echo -e "${line}" | cut -f 3)
+ start_id=$(expr "${start_cc}" + 1)
+ # see if the sample has a match in CRISPRidentify (tmp_file2)
+ match=$(grep "${sample}_${start_id}" tmp_file2 || true)
+
+ if [ -z "${match}" ]
+ then
+ # If there's *no* match (-z = lenth 0)
+ echo -e "${line}" >> ${snakemak_output[table]}
+ # Just copy the line from CCTyper
+ else
+ # but if there is a match,
+ while read line2
+ do
+ if [ "${start_cc}" -lt 5000 ];
+ # check if the reported start position is greater than 5000 (the length of the flanking regions),
+ # and match the lines from CCTyper (line) and CRISPRidentify (match/line2) accordingly
+ then
+ echo -e "${line}\t${match}" >> ${snakemake_output[table]}
+ else
+ start=$(echo -e "${line2}" | cut -f 2)
+ start=$(expr "${start}" + "${start_cc}" - 5000)
+ length=$(echo -e "${line2}" | cut -f 4)
+ end=$(expr "${length}" + "${start}" - 1)
+ begin=$(echo -e "${line2}" | cut -f 1)
+ rest=$(echo -e "${line2}" | cut -f 4-9)
+ echo -e "${line}\t${begin}\t${start}\t${end}\t${rest}" >> ${snakemake_output[table]}
+ fi
+ done <<< "${match}"
+ fi
+ fi
+done < tmp_file1
+
+# Remove the temporary files
+rm -f tmp_file1 tmp_file2
diff --git a/workflow/scripts/merge_crispridentify_batches.sh b/workflow/scripts/merge_crispridentify_batches.sh
new file mode 100644
index 0000000..caca261
--- /dev/null
+++ b/workflow/scripts/merge_crispridentify_batches.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+exec 2> "${snakemake_log[0]}" # send all stderr to the log file
+
+spacers_crispr=(${snakemake_input[spacers_crispr]})
+summary_crispr=(${snakemake_input[summary_crispr]})
+
+> ${snakemake_output[spacers_crispr]}
+for spacers in "${spacers_crispr[@]}"
+do
+ cat "${spacers}" >> ${snakemake_output[spacers_crispr]}
+done
+
+for summary in "${summary_crispr[@]}"
+do
+ header=$(head -n 1 "${summary}")
+ if [ "${header}" == "No arrays found" ]
+ then
+ continue
+ else
+ echo "${header}" > ${snakemake_output[summary_crispr]}
+ break
+ fi
+done
+
+for summary in "${summary_crispr[@]}"
+do
+ tail -n +2 "${summary}" >> ${snakemake_output[summary_crispr]}
+done