diff --git a/README.md b/README.md
index 2a7dc81..10a6dec 100755
--- a/README.md
+++ b/README.md
@@ -26,51 +26,41 @@ which includes a quick start guide as well as a detailed step-by-step descriptio
 
 ## Release roadmap
 
-- Version 0.1 updates:
-
-    - Updates in documentation of implemented functions and output files
-(e.g., expanding documentation on the use of CRISPRidentify)
-
-    - Functions to concatenate and combine existing outputs
-
-    - Code clean-up (moving long commands to separate scripts,
-applying standardised formatting, remove unnecessary code)
-
-- Versions 0.2 and further:
-
-    - New functionality (e.g., all-vs-all genome comparisons)
-
-## To do
-
-- [x] Add CRISPRidentify to workflow
-
-   - [x] And make sure it works on a clean install
-
-- [x] Combine results from CCTyper with CRISPRidentify
-
-- [ ] Make and/or correct scripts for combining results into 'Output files' (write to `data/processed/`)
-
-   - [x] Concatenate MLST results
-
-   - [x] Enable spacer table creation script in Snakefile (add to `rule all`)
-
-- [x] Collect and combine results from geNomad and Jaeger
-
-- [x] Map spacers to genomes and phage/plasmid databases
-
-- [x] Add PADLOC for identifying other anti-phage systems
-
-- [ ] Write documentation for output files
-
-- [x] Rewrite 'Problems encountered' into a rationale for our tool selection (as separate document)
-
-- [x] Write detailed and technical step-by-step description of the workflow
-
-    - [ ] While reviewing the workflow, remove unnecessary pieces and clean-up where possible
-
-- [x] Setup MkDocs-powered documentation (at least locally, integrate with GitHub pages later)
-
-(_Note to self: Remove this list when finished and use issues or roadmap instead!_)
+- Version 0.2: tidy code
+   - remove outdated steps
+   - move long commands in Snakefile to separate scripts
+   - (re)apply code formatting (Black for Python, styler for R)
+   - apply 'bash strict mode' and suppress R messages
+   - move parts of Snakefile to separate scripts?
+
+- Version 0.3: solid foundation
+   - validate proper functioning of CCTyper + CRISPRidentify
+      - adjust helper scripts where necessary
+   - correct scripts for making tables, integrate with Snakemake and test!
+      - CRISPR spacer table (#24)
+      - CRISPR-Cas locus table
+      - (make sure every analysis part produces an output: include in 'rule all')
+
+- Version 0.4: clear documentation
+   - review and update README and docs
+
+- Future additions:
+   - genome deduplication (dRep)
+   - CRISPR spacer target prediction
+      - map spacers to:
+         - masked ATB genomes
+         - PLSDB
+         - PhageScope
+         - VIRE
+         - MEGAISurv metagenomes
+      - mini-benchmark different mapping algorithms
+         - Sassy
+         - KMA
+         - SpacePHARER
+      - (where feasible) connect spacer hits with functional annotations!
+   - integrate downstream analyses with Snakemake?
+      - run RMarkdown/Quarto notebooks automatically
+   - build a database like [this spacerdb](https://spacers.jgi.doe.gov/database/overview/)?
 
 ## Workflow description
 
diff --git a/Snakefile b/Snakefile
deleted file mode 100644
index 5751211..0000000
--- a/Snakefile
+++ /dev/null
@@ -1,987 +0,0 @@
-"""
-Author: Sam Nooij
-Organisation: Utrecht University
-Department: Clinical Infectiology (KLIF), Infectious Diseases & Immunology,
-  Biomolecular Health Sciences, Faculty of Veterinary Medicine
-Date: 2024-11-06
-
-Workflow for testing CRISPR analysis options
-In contrast to the 'regular' Snakefile workflow, which works
-per genome file, this workflow works per batch and runs GNU
-parallel to parallelise processing of the genomes within
-each batch.
-
-
-Input: Fasta files of Campylobacter whole-genomes
-Output: (various)
-
-Example use:
-    $ snakemake --profile config
-
-N.B. Variables are set in the configuration files under `config`.
-"""
-
-from pathlib import Path
-import functools
-import operator
-
-### Step 1: Import configuration file ###
-
-
-configfile: Path("config/parameters.yaml")
-
-
-# Use Python functions to automatically detect batches of genomes fasta files
-# in the input directory as 'BATCHES'
-WORK_DIR = config["working_directory"]
-
-BATCH_PATHS = list((Path(WORK_DIR) / "assemblies").glob("atb.assembly.*"))
-for batch in BATCH_PATHS:
-    assert Path(batch).is_dir(), f"Batches must be directories, got {batch}"
-
-BATCHES = [batch.name for batch in BATCH_PATHS]
-
-OUTPUT_DIR = config["output_directory"]
-
-
-### Step 2: Specify output files ###
-
-
-rule all:
-    input:
-        # Multilocus Sequence Types (ST) for Campylobacter
-        OUTPUT_DIR + "mlst_table.tsv",
-        # Virus and plasmid predictions per contig
-        OUTPUT_DIR + "genomad_predictions.csv",
-        OUTPUT_DIR + "jaeger_predictions.csv",
-        # Phage defence systems
-        OUTPUT_DIR + "padloc_table.csv",
-        # Concatenated CCTyper output
-        expand(
-            WORK_DIR + "cctyper/{batch}/{filename}-{batch}.tab",
-            batch=BATCHES,
-            filename=[
-                "CRISPR_Cas",
-                "crisprs_all",
-                "crisprs_near_cas",
-                "crisprs_orphan",
-                "cas_operons",
-            ],
-        ),
-        expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
-        # Combined CCTyper output as CSV + BED files
-        expand(WORK_DIR + "cctyper/{batch}/parsed", batch=BATCHES),
-        # CCTyper CRISPR spacer cluster analysis report
-        WORK_DIR + "cctyper/spacer_cluster_summary.tsv",
-        # Cluster unique CRISPR spacers
-        WORK_DIR + "cctyper/all_spacers-clustered.clstr",
-        WORK_DIR + "crispridentify/all_spacers-clustered.clstr",
-        OUTPUT_DIR + "all_spacers_table_identify.tsv",
-        OUTPUT_DIR + "all_spacers_table.tsv",
-        # Extracted CRISPR arrays (as fasta)
-        expand(WORK_DIR + "arrays/{batch}/complete", batch=BATCHES),
-        #CRISPRidentify output
-        expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES),
-        #concatenated CRISPRidentify output
-        WORK_DIR + "crispridentify/complete_summary.csv",
-        WORK_DIR + "crispridentify/all_spacers.fa",
-        #merged CRISPRidentify and CCtyper output
-        OUTPUT_DIR + "all_CRISPRS_with_identify.tab",
-        #spacepharer output
-        OUTPUT_DIR + "phage_matches.tsv",
-        OUTPUT_DIR + "plasmid_matches.tsv",
-        #KMA output
-        WORK_DIR + "kma/output/CRISPR.frag.gz",
-        WORK_DIR + "kma/CRISPR_alignment",
-
-
-### Step 3: Define processing steps that generate the output ###
-
-
-rule download_mlst_database:
-    output:
-        WORK_DIR + "mlst/campylobacter.db",
-    params:
-        species=config["mlst"]["species"],
-    conda:
-        "envs/pymlst.yaml"
-    threads: 1
-    log:
-        "log/download_mlst_database.txt",
-    benchmark:
-        "log/benchmark/download_mlst_database.txt"
-    shell:
-        """
-claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1
-        """
-
-
-rule mlst:
-    input:
-        batch=WORK_DIR + "assemblies/{batch}/",
-        db=WORK_DIR + "mlst/campylobacter.db",
-    output:
-        WORK_DIR + "mlst/{batch}/complete",
-    conda:
-        "envs/pymlst.yaml"
-    threads: config["mlst"]["threads"]
-    log:
-        "log/mlst/{batch}.txt",
-    benchmark:
-        "log/benchmark/mlst/{batch}.txt"
-    shell:
-        """
-find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
- parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
- claMLST search {input.db} {{}} -o "$(dirname {output})/{{/.}}.txt" > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule concatenate_mlst_batches:
-    input:
-        WORK_DIR + "mlst/{batch}/complete",
-    output:
-        WORK_DIR + "mlst/{batch}-concatenated.tsv",
-    threads: config["mlst"]["threads"]
-    log:
-        "log/concatenate_mlst/{batch}.txt",
-    benchmark:
-        "log/benchmark/concatenate_mlst/{batch}.txt"
-    shell:
-        """
-echo -e "Genome\tST" > {output}
-find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\
- parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
- 'tail -n +2 {{}} | cut -f 1-2 >> {output}'
-        """
-
-
-rule concatenate_mlst_all:
-    input:
-        expand(WORK_DIR + "mlst/{batch}-concatenated.tsv", batch=BATCHES),
-    output:
-        OUTPUT_DIR + "mlst_table.tsv",
-    threads: 1
-    log:
-        "log/concatenate_mlst_all.txt",
-    benchmark:
-        "log/benchmark/concatenate_mlst_all.txt"
-    shell:
-        """
-batches=( {input} )
-head -1 ${{batches[0]}} > {output}
-sed --separate 1d ${{batches[@]}} >> {output}
-        """
-
-
-rule crisprcastyper:
-    input:
-        batch=WORK_DIR + "assemblies/{batch}/",
-    output:
-        WORK_DIR + "cctyper/{batch}/complete",
-    params:
-        out_dir=WORK_DIR + "cctyper/{batch}/",
-    conda:
-        "envs/cctyper.yaml"
-    threads: config["cctyper"]["threads"]
-    log:
-        "log/cctyper/{batch}.txt",
-    benchmark:
-        "log/benchmark/cctyper/{batch}.txt"
-    shell:
-        """
-find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
- parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
- 'rm -rf "{params.out_dir}{{/.}}" &&\
- cctyper -t 1 {{}} "{params.out_dir}{{/.}}"' > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule download_padloc_database:
-    output:
-        WORK_DIR + "padloc/database",
-    conda:
-        "envs/padloc.yaml"
-    threads: 1
-    log:
-        "log/download_padloc_database.txt",
-    benchmark:
-        "log/benchmark/download_padloc_database.txt"
-    shell:
-        """
-padloc --db-install v2.0.0
-touch {output}
-        """
-
-
-rule padloc:
-    input:
-        batch=WORK_DIR + "assemblies/{batch}/",
-        db=WORK_DIR + "padloc/database",
-    output:
-        WORK_DIR + "padloc/{batch}/complete",
-    conda:
-        "envs/padloc.yaml"
-    threads: config["padloc"]["threads"]
-    log:
-        "log/padloc/{batch}.txt",
-    benchmark:
-        "log/benchmark/padloc/{batch}.txt"
-    shell:
-        """
-find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
- parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
- 'mkdir -p "$(dirname {output})/{{/.}}" && padloc --cpu 1 --fna {{}} --outdir "$(dirname {output})/{{/.}}"' > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule concatenate_padloc_batches:
-    input:
-        WORK_DIR + "padloc/{batch}/complete",
-    output:
-        WORK_DIR + "padloc/{batch}-concatenated.csv",
-    threads: config["padloc"]["threads"]
-    log:
-        "log/concatenate_padloc/{batch}.txt",
-    benchmark:
-        "log/benchmark/concatenate_padloc/{batch}.txt"
-    shell:
-        """
-file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") )
-head -1 ${{file_array[0]}} > {output}
-parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
- 'tail -n +2 {{}} >> {output}' ::: ${{file_array[@]}}
-        """
-
-
-rule concatenate_padloc_all:
-    input:
-        expand(WORK_DIR + "padloc/{batch}-concatenated.csv", batch=BATCHES),
-    output:
-        OUTPUT_DIR + "padloc_table.csv",
-    threads: 1
-    log:
-        "log/concatenate_padloc_all.txt",
-    benchmark:
-        "log/benchmark/concatenate_padloc_all.txt"
-    shell:
-        """
-batches=( {input} )
-head -1 ${{batches[0]}} > {output}
-sed --separate 1d ${{batches[@]}} >> {output}
-        """
-
-
-rule parse_cctyper:
-    input:
-        WORK_DIR + "cctyper/{batch}/complete",
-    output:
-        WORK_DIR + "cctyper/{batch}/parsed",
-    conda:
-        "envs/pandas.yaml"
-    threads: config["parse_cctyper"]["threads"]
-    log:
-        "log/parse_cctyper/{batch}.txt",
-    benchmark:
-        "log/benchmark/parse_cctyper/{batch}.txt"
-    shell:
-        """
-find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\
-parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
-    python bin/cctyper_extender.py -d {{.}} > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule extract_sequences:
-    input:
-        WORK_DIR + "cctyper/{batch}/parsed",
-    output:
-        WORK_DIR + "cctyper/{batch}/subseq",
-    conda:
-        "envs/seqkit.yaml"
-    threads: config["extract_sequences"]["threads"]
-    log:
-        "log/extract_sequences/{batch}.txt",
-    benchmark:
-        "log/benchmark/extract_sequences/{batch}.txt"
-    shell:
-        """
-find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\
-parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
-    bash bin/extract_crispr-cas_from_fasta.sh {{}} > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule collect_cctyper:
-    input:
-        cctyper=WORK_DIR + "cctyper/{batch}/complete",
-        parser=WORK_DIR + "cctyper/{batch}/parsed",
-    output:
-        crispr_cas=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab",
-        crisprs_all=WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab",
-        crisprs_near_cas=WORK_DIR + "cctyper/{batch}/crisprs_near_cas-{batch}.tab",
-        crisprs_orphan=WORK_DIR + "cctyper/{batch}/crisprs_orphan-{batch}.tab",
-        spacers=WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa",
-        cas_putative=temp(WORK_DIR + "cctyper/{batch}/cas_operons_putative-{batch}.tab"),
-        cas=WORK_DIR + "cctyper/{batch}/cas_operons-{batch}.tab",
-        csv=WORK_DIR + "cctyper/{batch}/CRISPR-Cas-{batch}.csv",
-    threads: 1
-    log:
-        "log/cctyper/collect_{batch}.txt",
-    benchmark:
-        "log/benchmark/cctyper/collect_{batch}.txt"
-    shell:
-        """
-bash bin/concatenate_cctyper_output.sh $(dirname {input.cctyper}) > {log} 2>&1
-echo "\n========================" >> {log}
-bash bin/concatenate_cctyper_csv.sh $(dirname {input.parser}) >> {log} 2>&1
-
-find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat {{}} + > {output.spacers} 2>> {log}
-        """
-
-
-rule extract_crispr_cas_locations:
-    input:
-        WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.tab",
-    output:
-        WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed",
-    threads: 1
-    log:
-        "log/extract_crispr_cas_location/{batch}.txt",
-    benchmark:
-        "log/benchmark/extract_crispr_cas_location/{batch}.txt"
-    shell:
-        """
-python bin/create_CCTyper_bedfile.py -i {input} -o {output} > {log} 2>&1
-        """
-
-
-rule extract_crispr_array:
-    input:
-        batch=WORK_DIR + "assemblies/{batch}/",
-        bed=WORK_DIR + "cctyper/{batch}/CRISPR_Cas-{batch}.bed",
-    output:
-        WORK_DIR + "arrays/{batch}/complete",
-    params:
-        out_dir=WORK_DIR + "arrays/{batch}/",
-    conda:
-        "envs/seqkit.yaml"
-    threads: config["extract_arrays"]["threads"]
-    log:
-        "log/extract_crispr_array/{batch}.txt",
-    benchmark:
-        "log/benchmark/extract_crispr_array/{batch}.txt"
-        ""
-    shell:
-        """
-cut -f 1 -d '.' {input.bed} | parallel --jobs {threads} --retry-failed\
- --halt='now,fail=1'\
- 'if [ -e "{input.batch}/{{}}.fa" ];\
- then seqkit subseq --bed {input.bed} "{input.batch}/{{}}.fa"\
- -o "{params.out_dir}{{}}.fa";\
- fi' > {log} 2>&1
-
-touch {output}
-        """
-
-
-rule concatenate_all_spacers:
-    input:
-        expand(WORK_DIR + "cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
-    output:
-        WORK_DIR + "cctyper/all_spacers.fa",
-    threads: 1
-    log:
-        "log/concatenate_all_spacers.txt",
-    benchmark:
-        "log/benchmark/concatenate_all_spacers.txt"
-    shell:
-        """
-cat {input} > {output} 2> {log}
-        """
-
-
-rule cluster_all_spacers:
-    input:
-        WORK_DIR + "cctyper/all_spacers.fa",
-    output:
-        clusters=expand(
-            WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}.clstr",
-            cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
-        ),
-        spacers=expand(
-            WORK_DIR + "cctyper/all_spacers-clustered-{cutoff}",
-            cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
-        ),
-        summary=WORK_DIR + "cctyper/spacer_cluster_summary.tsv",
-    params:
-        WORK_DIR=WORK_DIR + "cctyper/",
-        log_dir="log/spacer_clustering/",
-    conda:
-        "envs/cdhit.yaml"
-    threads: 1
-    log:
-        "log/cluster_all_spacers.txt",
-    benchmark:
-        "log/benchmark/cluster_all_spacers.txt"
-    shell:
-        """
-bash bin/cluster_all_spacers.sh\
-    {input}\
-    {params.WORK_DIR}\
-    {params.log_dir} > {log} 2>&1
-        """
-
-
-rule cluster_unique_spacers:
-    input:
-        WORK_DIR + "cctyper/all_spacers.fa",
-    output:
-        clusters=WORK_DIR + "cctyper/all_spacers-clustered.clstr",
-        spacers=WORK_DIR + "cctyper/all_spacers-clustered",
-        distribution=WORK_DIR + "cctyper/all_spacers-clustered-distribution.tsv",
-    conda:
-        "envs/cdhit.yaml"
-    threads: 1
-    log:
-        "log/cluster_unique_spacers.txt",
-    benchmark:
-        "log/benchmark/cluster_unique_spacers.txt"
-    shell:
-        """
-cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
- -i {input} -o {output.spacers} > {log} 2>&1
-
-plot_len1.pl {output.clusters}\
- 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
- 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
- > {output.distribution}
-        """
-
-
-rule create_crispr_cluster_table:
-    input:
-        clstr=WORK_DIR + "cctyper/all_spacers-clustered.clstr",
-        fasta=WORK_DIR + "cctyper/all_spacers.fa",
-    output:
-        OUTPUT_DIR + "all_spacers_table.tsv",
-    conda:
-        "envs/pyfaidx.yaml"
-    threads: 1
-    log:
-        "log/create_crispr_cluster_table.txt",
-    benchmark:
-        "log/benchmark/create_crispr_cluster_table.txt"
-    script:
-        "bin/make_cluster_table.py"
-
-
-rule crispridentify:
-    input:
-        WORK_DIR + "cctyper/{batch}/subseq",
-    output:
-        WORK_DIR + "crispridentify/{batch}/complete",
-    params:
-        out_dir=WORK_DIR + "crispridentify/{batch}",
-        arrays=WORK_DIR + "cctyper/{batch}",
-    conda:
-        "envs/crispridentify.yml"
-    threads: config["crispridentify"]["threads"]
-    log:
-        "log/crispridentify/{batch}.txt",
-    benchmark:
-        "log/benchmark/crispridentify/{batch}.txt"
-    shell:
-        """
-    
-    cd bin/CRISPRidentify
-    find ../../{params.arrays}/*/fasta/CRISPR_arrays-with_flanks.fasta -size +0c -print0 | \
-    parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
-    python CRISPRidentify.py --file {{}} --result_folder "../../{params.out_dir}/{{/.}}" --fasta_report True --strand False > ../../{log} 2>&1   
-
-    touch ../../{output}
-    
-        """
-
-
-rule merge_crispridentify_batches:
-    input:
-        expand(WORK_DIR + "crispridentify/{batch}/complete", batch=BATCHES),
-    params:
-        spacers_crispr=expand(
-            WORK_DIR
-            + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
-            batch=BATCHES,
-        ),
-        summary_crispr=expand(
-            WORK_DIR
-            + "crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
-            batch=BATCHES,
-        ),
-    output:
-        spacers_crispr=WORK_DIR + "crispridentify/all_spacers.fa",
-        summary_crispr=WORK_DIR + "crispridentify/complete_summary.csv",
-    threads: 1
-    log:
-        "log/merge_crispridentify_batches.txt",
-    benchmark:
-        "log/benchmark/merge_crispridentify_batches.txt"
-    shell:
-        """
-    cat {params.spacers_crispr} > {output.spacers_crispr}
-    for summary in {params.summary_crispr} ; do header=$(head -n 1 "$summary"); if [ "$header" == "No arrays found" ];
-    then
-        continue;
-    else 
-        echo $header | tee {output.summary_crispr};
-        break;
-    fi
-    done
-    for summary in {params.summary_crispr} ; do tail -n +2 "$summary" >> {output.summary_crispr} ; done
-        """
-
-
-rule merge_cctyper_identify:
-    input:
-        identify=WORK_DIR + "crispridentify/complete_summary.csv",
-        cctyper=expand(
-            WORK_DIR + "cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES
-        ),
-    output:
-        OUTPUT_DIR + "all_CRISPRS_with_identify.tab",
-    params:
-        tmp1="tmp_file1",
-        tmp2="tmp_file2",
-    threads: 1
-    log:
-        "log/merge_cctyper_identify",
-    shell:
-        """
-    first=True
-    for summary in {input.cctyper} ; do
-        if [ $first == True ];
-        then
-            cat $summary > {params.tmp1}
-            first=False
-        else
-            tail -n +2 $summary >> {params.tmp1}
-        fi
-    done
-
-    header=$(head -n 1 {input.identify} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t")
-    tail -n +2 {input.identify} | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t" > {params.tmp2}
-    first=True
-    while read line; do
-        if [ $first == True ];
-        then
-            first=False
-            echo -e "$line\t$header" > {output}
-        else
-            sample=$(echo -e "$line" | cut -f 1)
-            start_cc=$(echo -e "$line" | cut -f 3)
-            start_id=$(expr "$start_cc" + 1)
-            match=$(grep "${{sample}}_$start_id" {params.tmp2} || true)
-            if [ -z "$match" ]; then
-                echo -e "$line" >> {output}
-            else
-                while read line2; do
-                    if [ "$start_cc" -lt 5000 ];
-                    then
-                        echo -e "$line\t$match" >> {output}
-                    else
-                        start=$(echo -e "$line2" | cut -f 2)
-                        start=$(expr "$start" + "$start_cc" - 5000)
-                        length=$(echo -e "$line2" | cut -f 4)
-                        end=$(expr "$length" + "$start" - 1)
-                        begin=$(echo -e "$line2" | cut -f 1)
-                        rest=$(echo -e "$line2" | cut -f 4-9)
-                        echo -e "$line\t$begin\t$start\t$end\t$rest" >> {output}
-                    fi
-                done <<< "$match"
-            fi
-        fi
-    done < {params.tmp1}
-    rm -f {params.tmp1} {params.tmp2}
-        """
-
-
-rule cluster_unique_spacers_crispridentify:
-    input:
-        WORK_DIR + "crispridentify/all_spacers.fa",
-    output:
-        clusters=WORK_DIR + "crispridentify/all_spacers-clustered.clstr",
-        spacers=WORK_DIR + "crispridentify/all_spacers-clustered",
-        distribution=WORK_DIR + "crispridentify/all_spacers-clustered-distribution.tsv",
-    conda:
-        "envs/cdhit.yaml"
-    threads: 1
-    log:
-        "log/cluster_unique_spacers_identify.txt",
-    benchmark:
-        "log/benchmark/cluster_unique_spacers_identify.txt"
-    shell:
-        """
-cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
- -i {input} -o {output.spacers} > {log} 2>&1
-
-plot_len1.pl {output.clusters}\
- 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
- 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
- > {output.distribution}
-        """
-
-
-rule create_crispr_cluster_table_identify:
-    input:
-        clstr=WORK_DIR + "crispridentify/all_spacers-clustered.clstr",
-        fasta=WORK_DIR + "crispridentify/all_spacers.fa",
-    output:
-        OUTPUT_DIR + "all_spacers_table_identify.tsv",
-    conda:
-        "envs/pyfaidx.yaml"
-    threads: 1
-    log:
-        "log/create_crispr_cluster_table_identify.txt",
-    benchmark:
-        "log/benchmark/create_crispr_cluster_table_identify.txt"
-    script:
-        "bin/make_cluster_table_identify.py"
-
-
-rule concatenate_batches:
-    input:
-        WORK_DIR + "assemblies/{batch}",
-    output:
-        temp(WORK_DIR + "assemblies/{batch}.fasta"),
-    threads: 1
-    log:
-        "log/concatenate_{batch}.txt",
-    benchmark:
-        "log/benchmark/concatenate_{batch}.txt"
-    shell:
-        """
-cat {input}/*.fa > {output} 2> {log}
-        """
-
-
-rule genomad:
-    input:
-        fasta=WORK_DIR + "assemblies/{batch}.fasta",
-        db=config["genomad_database"],
-    output:
-        aggregated_classification=WORK_DIR
-        + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
-        plasmid_summary=WORK_DIR
-        + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
-        virus_summary=WORK_DIR
-        + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
-    params:
-        work_dir=WORK_DIR + "genomad/{batch}/",
-    conda:
-        "envs/genomad.yaml"
-    threads: config["genomad"]["threads"]
-    log:
-        "log/genomad/{batch}.txt",
-    benchmark:
-        "log/benchmark/genomad/{batch}.txt"
-    shell:
-        """
-genomad end-to-end -t {threads} --cleanup --enable-score-calibration\
- {input.fasta} {params.work_dir} {input.db} > {log} 2>&1
-        """
-
-
-rule collect_genomad_predictions:
-    input:
-        aggregated_classification=expand(
-            WORK_DIR
-            + "genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
-            batch=BATCHES,
-        ),
-        plasmid_summary=expand(
-            WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
-            batch=BATCHES,
-        ),
-        virus_summary=expand(
-            WORK_DIR + "genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
-            batch=BATCHES,
-        ),
-    output:
-        OUTPUT_DIR + "genomad_predictions.csv",
-    conda:
-        "envs/tidy_here.yaml"
-    threads: 1
-    log:
-        "log/collect_genomad_predictions.txt",
-    benchmark:
-        "log/benchmark/collect_genomad_predictions.txt"
-    script:
-        "bin/collect_genomad_predictions.R"
-
-
-rule jaeger:
-    input:
-        batch=WORK_DIR + "assemblies/{batch}/",
-    output:
-        WORK_DIR + "jaeger/{batch}/complete",
-    conda:
-        "envs/jaeger.yaml"
-    threads: config["jaeger"]["threads"]
-    log:
-        "log/jaeger/{batch}.txt",
-    benchmark:
-        "log/benchmark/jaeger/{batch}.txt"
-    shell:
-        """
-parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
- jaeger run -p --workers 1 -i {{}} -o $(dirname {output}) --overwrite\
- > {log} 2>&1 ::: {input.batch}/*.fa
-
-touch {output}
-        """
-
-
-rule collect_jaeger_batch:
-    input:
-        WORK_DIR + "jaeger/{batch}/complete",
-    output:
-        WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv",
-    params:
-        batch="{batch}",
-    conda:
-        "envs/tidy_here.yaml"
-    threads: 1
-    log:
-        "log/collect_jaeger_{batch}.txt",
-    benchmark:
-        "log/benchmark/collect_jaeger_{batch}.txt"
-    script:
-        "bin/collect_jaeger_batch.R"
-
-
-rule collect_jaeger_predictions:
-    input:
-        expand(WORK_DIR + "jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES),
-    output:
-        OUTPUT_DIR + "jaeger_predictions.csv",
-    threads: 1
-    log:
-        "log/collect_jaeger_predictions.txt",
-    benchmark:
-        "log/benchmark/collect_jaeger_predictions.txt"
-    script:
-        "bin/collect_jaeger_predictions.sh"
-
-
-rule spacepharer_spacer_setup:
-    input:
-        spacers=WORK_DIR + "crispridentify/all_spacers.fa",
-    output:
-        spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB",
-    params:
-        tmp_folder=WORK_DIR + "spacepharer/tmpFolder",
-    conda:
-        "envs/spacepharer.yml"
-    threads: 48
-    log:
-        "log/spacepharer/spacepharer_spacer_setup.txt",
-    benchmark:
-        "log/benchmark/spacepharer/spacepharer_spacer_setup.txt"
-    shell:
-        """
-        spacer_DB=$(dirname {output.spacer_DB})
-        rm -rf $spacer_DB/* > {log} 2>&1 
-        spacepharer createsetdb {input.spacers} {output.spacer_DB} {params.tmp_folder} --extractorf-spacer 1 --threads {threads} >> {log} 2>&1
-        """
-
-
-rule spacepharer_phage_setup:
-    output:
-        phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB",
-        phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB",
-    params:
-        tmp_folder=WORK_DIR + "spacepharer/tmpFolder",
-        DB=config["spacepharer_phage_database"] + "*.fasta",
-    conda:
-        "envs/spacepharer.yml"
-    threads: 48
-    log:
-        "log/spacepharer/spacepharer_phage_setup.txt",
-    benchmark:
-        "log/benchmark/spacepharer/spacepharer_setup.txt"
-    shell:
-        """
-        phage_DB=$(dirname {output.phage_DB})
-        rm -rf $phage_DB/* > {log} 2>&1
-        spacepharer createsetdb {params.DB} {output.phage_DB} {params.tmp_folder} --threads {threads} >> {log} 2>&1
-        spacepharer createsetdb {params.DB} {output.phage_control_DB} {params.tmp_folder} --reverse-fragments 1 --threads {threads} >> {log} 2>&1
-        """
-
-
-rule spacepharer_phage:
-    input:
-        spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB",
-        phage_DB=WORK_DIR + "spacepharer/phage_DB/targetsetDB",
-        phage_control_DB=WORK_DIR + "spacepharer/phage_DB/controlsetDB",
-    output:
-        result=WORK_DIR + "spacepharer/predicted_phage_matches.tsv",
-        result_sanitised=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv",
-    params:
-        tmp_folder=WORK_DIR + "spacepharer/tmpFolder",
-    conda:
-        "envs/spacepharer.yml"
-    threads: 48
-    log:
-        "log/spacepharer/spacepharer_phage.txt",
-    benchmark:
-        "log/benchmark/spacepharer/spacepharer_phage.txt"
-    shell:
-        """
-        spacepharer predictmatch {input.spacer_DB} {input.phage_DB} {input.phage_control_DB} {output.result} {params.tmp_folder} --threads {threads} > {log} 2>&1
-        grep -v "#" {output.result} > {output.result_sanitised} 
-        rm -r {params.tmp_folder} >> {log} 2>&1
-        """
-
-
-rule spacepharer_plasmid_setup:
-    input:
-        DB=config["spacepharer_plasmid_database"] + "sequences.fasta",
-    output:
-        DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB",
-        control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB",
-    params:
-        tmp_folder=WORK_DIR + "spacepharer/tmpFolder",
-    conda:
-        "envs/spacepharer.yml"
-    threads: 48
-    log:
-        "log/spacepharer/spacepharer_plasmid_setup.txt",
-    benchmark:
-        "log/benchmark/spacepharer/spacepharer_plasmid_setup.txt"
-    shell:
-        """
-        plasmid_DB=$(dirname {output.DB})
-        rm -f $plasmid_DB/* > {log} 2>&1
-        spacepharer createsetdb {input.DB} {output.DB} {params.tmp_folder} --threads {threads} >> {log} 2>&1
-        spacepharer createsetdb {input.DB} {output.control_DB} {params.tmp_folder} --reverse-fragments 1 --threads {threads} >> {log} 2>&1
-        """
-
-
-rule spacepharer_plasmid:
-    input:
-        phage_DB=WORK_DIR + "spacepharer/plasmid_DB/targetsetDB",
-        phage_control_DB=WORK_DIR + "spacepharer/plasmid_DB/controlsetDB",
-        spacer_DB=WORK_DIR + "spacepharer/DB_CRISPR/querysetDB",
-    output:
-        result=WORK_DIR + "spacepharer/predicted_plasmid_matches.tsv",
-        result_sanitised=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv",
-    params:
-        tmp_folder=WORK_DIR + "spacepharer/tmpFolder",
-    conda:
-        "envs/spacepharer.yml"
-    threads: 48
-    log:
-        "log/spacepharer/spacepharer_phage.txt",
-    benchmark:
-        "log/benchmark/spacepharer/spacepharer_phage.txt"
-    shell:
-        """
-        spacepharer predictmatch {input.spacer_DB} {input.phage_DB} {input.phage_control_DB} {output.result} {params.tmp_folder} --threads {threads} > {log} 2>&1
-        grep -v "#" {output.result} > {output.result_sanitised} 
-        rm -r {params.tmp_folder} >> {log} 2>&1
-        """
-
-
-rule create_spacepharer_table:
-    input:
-        phage=WORK_DIR + "spacepharer/predicted_phage_matches_san.tsv",
-        meta_phage=config["spacepharer_phage_database"],
-        plasmid=WORK_DIR + "spacepharer/predicted_plasmid_matches_san.tsv",
-        meta_plasmid=config["spacepharer_plasmid_database"],
-    output:
-        phage=OUTPUT_DIR + "phage_matches.tsv",
-        plasmid=OUTPUT_DIR + "plasmid_matches.tsv",
-    threads: 1
-    log:
-        "log/create_spacepharer_table.txt",
-    script:
-        "bin/create_spacepharer_table.sh"
-
-
-rule kma_indexing:
-    input:
-        spacers=WORK_DIR + "crispridentify/all_spacers.fa",
-    output:
-        indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name",
-    params:
-        WORK_DIR + "kma/spacer_DB/spacers",
-    conda:
-        "envs/kma.yaml"
-    threads: 12
-    log:
-        "log/kma/kma_index.txt",
-    benchmark:
-        "log/benchmark/kma/kma_index.txt"
-    shell:
-        """
-        kma index -i {input.spacers} -o {params} > {log} 2>&1
-        """
-
-
-rule kma:
-    input:
-        genomes=expand(WORK_DIR + "assemblies/{batch}/", batch=BATCHES),
-        indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers.name",
-    output:
-        WORK_DIR + "kma/output/CRISPR.frag.gz",
-    params:
-        output=WORK_DIR + "kma/output/CRISPR",
-        indexed_spacers=WORK_DIR + "kma/spacer_DB/spacers",
-        spacers=WORK_DIR + "crispridentify/all_spacers.fa",
-    conda:
-        "envs/kma.yaml"
-    threads: 24
-    log:
-        "log/kma/kma.txt",
-    benchmark:
-        "log/benchmark/kma/kma.txt"
-    shell:
-        """     
-        grep ">" {params.spacers} | cut -f 2 -d ">" | cut -f 1 -d "-" | sort -u > tmp_file
-        find -L {input.genomes} -mindepth 1 -maxdepth 1 -type f -name "*.fa" > all_genomes.txt
-        genomes=$(grep -x ".*[0-9]\\.fa" all_genomes.txt | grep -v -f tmp_file)
-        kma -hmm -i $genomes -o {params.output} -t_db {params.indexed_spacers} > {log} 2>&1
-        rm tmp_file all_genomes.txt
-        """
-
-
-rule collect_kma:
-    input:
-        WORK_DIR + "kma/output/CRISPR.frag.gz",
-    output:
-        WORK_DIR + "kma/CRISPR_alignment",
-    log:
-        "log/kma/collect_kma.txt",
-    benchmark:
-        "log/benchmark/kma/collect_kma.txt"
-    shell:
-        """
-        echo -e "spacer\tgenome" > {output}
-        zcat {input} | cut -f 6,7 | cut -f 1 -d " " > tmp_file
-        while read line; do
-            match=$(echo $line | cut -f 2)
-            crispr=$(echo $line | cut -f 1 | cut -f 1,6,7,10,11 -d "_")
-            echo -e "$crispr\t$match" >> {output}
-        done < tmp_file
-        rm tmp_file
-        """
diff --git a/bin/create_CCTyper_bedfile.py b/bin/create_CCTyper_bedfile.py
deleted file mode 100644
index 2ac384c..0000000
--- a/bin/create_CCTyper_bedfile.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python3
-
-# Read the tabular output file 'CRISPR_Cas.tab' from CCTyper
-# and convert it to a BED file. This way, seqkit can be easily
-# used to extract the CRISPR-Cas array and its flanking regions.
-
-import argparse
-
-def parse_arguments():
-    """
-    Parse argument from the command line:
-     -i/--input = input file (CRISPR_Cas.tab)
-     -o/--output = output file (BED file)
-     -h/--help = show help
-    """
-    parser = argparse.ArgumentParser(
-                        prog="CCTyper_to_bedfile",
-                        description="Convert the CRISPR_Cas.tab output file"
-                        "from CCTyper to a BED file.",
-    )
-
-    required = parser.add_argument_group("Required arguments")
-
-    required.add_argument("-i", "--input",
-    dest = "input", required = True, type = str,
-    help = "CCTyper CRISPR_Cas.tab file",
-    )
-    required.add_argument("-o", "--output",
-    dest = "output", required = True, type = str,
-    help = "BED file to write output to")
-
-    args = parser.parse_args()
-
-    return args
-
-def convert_tab_to_bed(inputfile, outputfile):
-    """
-    """
-    with open(outputfile, 'w') as outfile:
-        with open(inputfile, 'r') as infile:
-            # Skip the first line, as it is the header
-            infile.readline()
-            for line in infile:
-                # Split by tab
-                elements = line.split("\t")
-
-                # Collect required information
-                contig = elements[0]
-                locus_tag = elements[1]
-                start_and_stop = elements[2].split(",")
-                start = start_and_stop[0].strip(" [")
-                stop = start_and_stop[1].strip(" ]")
-
-                # Combine in single line
-                bed_string = "%s\t%s\t%s\t%s\n" % (contig, start, stop, locus_tag)
-
-                # Write to the output file
-                outfile.write(bed_string)
-    return 0
-
-def main():
-    arguments = parse_arguments()
-
-    message = (
-        "\n"
-        "These are the files you have provided:\n"
-        "  INPUT:\n"
-        "{0}\n"
-        "  OUTPUT:\n"
-        "{1}".format(
-            arguments.input,
-            arguments.output
-        )
-    )
-
-    print(message)
-
-    exit(
-        convert_tab_to_bed(inputfile = arguments.input,
-                           outputfile = arguments.output)
-    )
-
-if __name__=="__main__":
-    exit(main())
\ No newline at end of file
diff --git a/bin/download_ATB_metadata.sh b/bin/download_ATB_metadata.sh
index 62e9b38..0909864 100755
--- a/bin/download_ATB_metadata.sh
+++ b/bin/download_ATB_metadata.sh
@@ -1,4 +1,6 @@
 #! /usr/bin/env bash
+set -euo pipefail
+IFS=$'\n'
 
 # Read the output directory from the command-line, with
 #  'data/ATB/' set as default.
diff --git a/bin/download_bakta_annotations.sh b/bin/download_bakta_annotations.sh
index b2ebc02..c75820b 100644
--- a/bin/download_bakta_annotations.sh
+++ b/bin/download_bakta_annotations.sh
@@ -5,18 +5,20 @@ IFS=$'\n'
 # Above thanks to Aaron Maxwell: http://redsymbol.net/articles/unofficial-bash-strict-mode/
 
 part=${1:-"update"}
+atb_dir=${2:-"resources/ATB/"}
+output_dir="${atb_dir%/}/archives/"  # works whether or not atb_dir ends in a slash
 
 if [ "${part}" == "update" ]
 then
-    download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv)
+    download_list=$(grep "incr_release" ${atb_dir}/batches_to_download.tsv)
 
 elif [ "${part}" == "original" ]
 then
-    download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv)
+    download_list=$(grep -v "incr_release" ${atb_dir}/batches_to_download.tsv)
 
 elif [ "${part}" == "all" ]
 then
-    download_list=$(cat data/ATB/batches_to_download.tsv)
+    download_list=$(cat ${atb_dir}/batches_to_download.tsv)
 
 else
     download_list=""
@@ -24,6 +26,8 @@ else
     echo "Or none to use the default (=update)."
 fi
 
+mkdir -p ${output_dir}
+
 for line in ${download_list}
 do
     filename=$(echo ${line} | cut -f 1 | sed -e 's/assembly/bakta/')
@@ -31,9 +35,7 @@ do
     checksum=$(grep ${filename} data/ATB/all_atb_files.tsv | cut -f 5)
     echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}"
 
-    mkdir -p data/tmp/ATB
-
-    outputfile="data/tmp/ATB/${filename}"
+    outputfile="${output_dir}${filename}"
 
     # If the output file is not a file of size greater than zero
     if [ ! -s ${outputfile} ]
@@ -55,7 +57,7 @@ do
     fi
 
     # Extract the batch number from the file name
-    batchdir="data/tmp/annotations"
+    batchdir="${atb_dir}/annotations"
     mkdir -p ${batchdir}
 
     # If the batch directory has not been made yet
diff --git a/bin/download_genomes.sh b/bin/download_genomes.sh
index 09b9153..7665bd2 100644
--- a/bin/download_genomes.sh
+++ b/bin/download_genomes.sh
@@ -15,22 +15,26 @@ IFS=$'\n'
 # will not download complete/correct files again.)
 
 part=${1:-"update"}
+atb_dir=${2:-"resources/ATB/"}
+output_dir="${atb_dir%/}/archives/"  # works whether or not atb_dir ends in a slash
 
 if [ "${part}" == "update" ]
 then
-    download_list=$(grep "incr_release" data/ATB/batches_to_download.tsv)
+    download_list=$(grep "incr_release" ${atb_dir}/batches_to_download.tsv)
 elif [ "${part}" == "original" ]
 then
-    download_list=$(grep -v "incr_release" data/ATB/batches_to_download.tsv)
+    download_list=$(grep -v "incr_release" ${atb_dir}/batches_to_download.tsv)
 elif [ "${part}" == "all" ]
 then
-    download_list=$(cat data/ATB/batches_to_download.tsv)
+    download_list=$(cat ${atb_dir}/batches_to_download.tsv)
 else
     download_list=""
     echo "Unknown argument provided! Please use 'all', 'original', or 'update'."
     echo "Or none to use the default (=update)."
 fi
 
+mkdir -p ${output_dir}
+
 for line in ${download_list}
 do
     filename=$(echo ${line} | cut -f 1)
@@ -38,9 +42,7 @@ do
     checksum=$(echo ${line} | cut -f 3)
     echo -e "Filename: ${filename}\tURL: ${url}\tmd5sum: ${checksum}"
 
-    mkdir -p data/tmp/ATB
-
-    outputfile="data/tmp/ATB/${filename}"
+    outputfile="${output_dir}${filename}"
 
     # If the output file is not a file of size greater than zero
     if [ ! -s ${outputfile} ]
@@ -62,7 +64,7 @@ do
     fi
 
     # Extract the batch number from the file name
-    batchdir="data/tmp/assemblies/"
+    batchdir="${atb_dir}/assemblies/"
     mkdir -p ${batchdir}
 
     # If the batch directory has not been made yet
diff --git a/bin/download_spacepharer_database.sh b/bin/download_spacepharer_database.sh
deleted file mode 100644
index b1ba51a..0000000
--- a/bin/download_spacepharer_database.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#! bin/bash
-
-## This script downloads the phage and plasmid databases: Phagescope (https://phagescope.deepomics.org/) and PLSDB (https://ccb-microbe.cs.uni-saarland.de/plsdb2025/).
-## They are then also extracted and specifically for Phagescope merged into one database for use in Spacepharer. 
-##
-## Note: RUN FROM THE BASE FOLDER AS IT WILL NOT WORK OTHERWISE
-## Usage: download_spacepharer_database.sh [threads]
-## [threads]: the amount of threads used for extracting and merging phagescope data. can at maximum use 14 threads, default is 1. 
-threads="${1:-"1"}"
-#PLSDB download
-mkdir -p data/raw/PLSDB
-if  [ -f data/raw/PLSDB/download_meta.tar.gz ]; then
-    echo "Already downloaded PLSDB, skipping..."
-else
-    echo "Downloading PLSDB"
-    wget -P data/raw/PLSDB https://ccb-microbe.cs.uni-saarland.de/plsdb2025/download_meta.tar.gz
-fi
-
-echo "Extracting download"
-if [ -f data/raw/PLSDB/sequences.fasta.bz2 ]; then
-    echo "already extracted, skipping..."
-else
-    tar -xzf data/raw/PLSDB/download_meta.tar.gz -C data/raw/PLSDB/
-fi
-echo "Unzipping sequences"
-bzip2 -d data/raw/PLSDB/sequences.fasta.bz2
-
-echo "correcting metadata delims"
-sed -i -E ':a;s/"([^"]*),([^"]*)"/"\1\2"/g;ta' nuccore.csv
-
-#Phagescope download
-mkdir -p data/raw/phagescope
-
-echo "Downloading Phagescope databases"
-
-for DB in "Genbank" "RefSeq" "DDBJ" "EMBL" "PhagesDB" "GPD" "GVD" "MGV" "TemPhD" "CHVD" "IGVD" "IMG_VR" "GOV2" "STV"
-    do
-    if test -f "data/raw/phagescope/$DB.tar.gz"; then
-        echo "Already downloaded $DB, skipping..."
-    else
-        echo "Downloading $DB"
-        wget -O "data/raw/phagescope/$DB.tar.gz"  "https://phageapi.deepomics.org/download/phage/fasta/?datasource=$DB"
-    fi
-done
-
-echo "Extracting databases"
-parallel --jobs "$threads" 'DB={}; path=data/raw/phagescope; [ -f "$path/$DB.fasta" ] || [ -d "$path/$DB" ] && \
-echo "$DB already extracted, skipping..." || \
-( echo "extracting $DB"; tar -xzf "$path/$DB.tar.gz" -C "$path/" )' ::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD CHVD IGVD IMG_VR GOV2 STV
-
-echo "merging phagescope sequences"
-parallel --jobs "$threads" 'DB={}; path=data/raw/phagescope; [ -f "${path}/${DB}.fasta" ] && \
-echo "$DB already merged, skipping..." || \
-( echo "merging $DB"; genomes=$(find "$path/$DB" -type f -name "*.fasta"); > $path/$DB.fasta ; for files in $genomes; do cat $files >> $path/$DB.fasta; done)' \
-::: Genbank RefSeq DDBJ EMBL PhagesDB GPD GVD MGV TemPhD
-
-
-echo "Downloading Phagescope metadata"
-
-for DB in "genbank" "refseq" "ddbj" "embl" "phagesdb" "gpd" "gvd" "mgv" "temphd" "chvd" "igvd" "img_vr" "gov2" "stv"
-    do
-    if [ -f "data/raw/phagescope/${DB}_phage_meta_data.tsv" ]; then
-        echo "Already downloaded $DB, skipping..."
-    else
-        echo "Downloading $DB"
-        wget -P data/raw/phagescope/ "https://phageapi.deepomics.org/files/Download/Phage_meta_data/${DB}_phage_meta_data.tsv"
-    fi
-done
-
-echo "Merging phagescope metadata"
-first=true
-if test -f "data/raw/phagescope/merged_metadata.tsv"; then
-    echo "Already merged metadata, skipping..."
-else
-    for DB in "genbank" "refseq" "ddbj" "embl" "phagesdb" "gpd" "gvd" "mgv" "temphd" "chvd" "igvd" "img_vr" "gov2" "stv";
-        do
-        echo "Merging $DB"
-        if [ "$first" == true ]; then
-            cat "data/raw/phagescope/${DB}_phage_meta_data.tsv" > data/raw/phagescope/merged_metadata.tsv
-            first=false
-        else
-            tail -n +2 "data/raw/phagescope/${DB}_phage_meta_data.tsv" >> data/raw/phagescope/merged_metadata.tsv
-        fi
-    done 
-fi
-echo "Done!"
diff --git a/bin/make_cluster_table.py b/bin/make_cluster_table.py
deleted file mode 100644
index 3e0fdba..0000000
--- a/bin/make_cluster_table.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-from pyfaidx import Fasta
-
-cluster_file = snakemake.input["clstr"]
-fasta_file = snakemake.input["fasta"]
-table_file = snakemake.output[0]
-
-
-def read_clusters(clstr, table, fasta):
-    """
-    Read a CD-HIT generated file of clusters and the fasta file that was
-    used by CD-HIT to generate a tab-separated report of the clusters.
-
-        clstr: the CD-HIT output file with .clstr extensin
-        table: output file to write tab-separated output file to
-        fasta: fasta file with input sequences
-    """
-    sequence_dict = find_sequences(fasta)
-
-    HEADER = "Genome\tContig\tLocus\tCluster\tLength\tCluster_representative\tSequence\tIdentity\tStrand\n"
-    cluster_regex = r"(>Cluster *)(\d*)"
-    locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_\d+:\d+)... (.*)$"
-
-    with open(table, "w") as outfile:
-        outfile.write(HEADER)
-
-        with open(clstr, "r") as infile:
-            for line in infile:
-                line = line.strip()
-
-                if line.startswith(">"):
-                    # Extract the digits from the cluster ID
-                    cluster = re.search(cluster_regex, line).group(2)
-
-                elif len(line) > 1:
-                    # Use RegEx to extract information
-                    crispr_info = re.search(locus_regex, line)
-
-                    member_nr = crispr_info.group(1)  # not used
-                    length = crispr_info.group(2)
-                    genome = crispr_info.group(3)
-                    locus = crispr_info.group(4)
-                    full_locus = "%s.%s" % (genome, locus)
-                    contig = locus.split("_")[0]
-                    extra = crispr_info.group(5)
-
-                    sequence = sequence_dict[full_locus]
-
-                    # Check the final group for representative ('*') or other
-                    if extra == "*":
-                        representative = full_locus
-                        strand = "NA"
-                        identity = "NA"
-                    else:
-                        strand_and_identity = extra.split("/")
-                        strand = strand_and_identity[0].replace("at ", "")
-                        identity = strand_and_identity[1]
-
-                    # Write the information to the output file
-                    crispr_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
-                        genome,
-                        contig,
-                        full_locus,
-                        cluster,
-                        length,
-                        representative,
-                        sequence,
-                        identity,
-                        strand,
-                    )
-                    outfile.write(crispr_line)
-
-                # If the line does not start with '>' or have length > 1, stop
-                else:
-                    break
-
-    return 0
-
-
-def find_sequences(fasta_file):
-    """
-    Look up the DNA sequence in a fasta file and return as dictionary.
-    """
-    sequence_dict = Fasta(fasta_file, duplicate_action="first")
-    return sequence_dict
-
-
-if __name__ == "__main__":
-    exit(read_clusters(clstr=cluster_file, table=table_file, fasta=fasta_file))
diff --git a/bin/make_cluster_table_identify.py b/bin/make_cluster_table_identify.py
deleted file mode 100644
index df06eaf..0000000
--- a/bin/make_cluster_table_identify.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-
-## This is a slightly adjusted script from bin/make_cluster_table.py as the identify input is different.
-
-import re
-from pyfaidx import Fasta
-
-cluster_file = snakemake.input["clstr"]
-fasta_file = snakemake.input["fasta"]
-table_file = snakemake.output[0]
-
-
-def read_clusters(clstr, table, fasta):
-    """
-    Read a CD-HIT generated file of clusters and the fasta file that was
-    used by CD-HIT to generate a tab-separated report of the clusters.
-
-        clstr: the CD-HIT output file with .clstr extensin
-        table: output file to write tab-separated output file to
-        fasta: fasta file with input sequences
-    """
-    sequence_dict = find_sequences(fasta)
-
-    HEADER = "Genome\tContig\tLocus\tCluster\tLength\tCluster_representative\tSequence\tIdentity\tStrand\n"
-    cluster_regex = r"(>Cluster *)(\d*)"
-    locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d]+_.+)\.\.\. (.*)$"
-
-    with open(table, "w") as outfile:
-        outfile.write(HEADER)
-
-        with open(clstr, "r") as infile:
-            for line in infile:
-                line = line.strip()
-
-                if line.startswith(">"):
-                    # Extract the digits from the cluster ID
-                    cluster = re.search(cluster_regex, line).group(2)
-
-                elif len(line) > 1:
-                    # Use RegEx to extract information
-                    crispr_info = re.search(locus_regex, line)
-
-                    member_nr = crispr_info.group(1)  # not used
-                    length = crispr_info.group(2)
-                    genome = crispr_info.group(3)
-                    locus = crispr_info.group(4)
-                    full_locus = "%s-%s" % (genome, locus)
-                    contig = locus.split("_")[0]
-                    crispr = locus.split("_")[6]
-                    spacer = locus.split("_")[10]
-                    full_locus_readable = "%s-%s_%s:%s" % (
-                        genome,
-                        contig,
-                        crispr,
-                        spacer,
-                    )
-                    extra = crispr_info.group(5)
-
-                    sequence = sequence_dict[full_locus]
-
-                    # Check the final group for representative ('*') or other
-                    if extra == "*":
-                        representative = full_locus_readable
-                        strand = "NA"
-                        identity = "NA"
-                    else:
-                        strand_and_identity = extra.split("/")
-                        strand = strand_and_identity[0].replace("at ", "")
-                        identity = strand_and_identity[1]
-
-                    # Write the information to the output file
-                    crispr_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
-                        genome,
-                        contig,
-                        full_locus_readable,
-                        cluster,
-                        length,
-                        representative,
-                        sequence,
-                        identity,
-                        strand,
-                    )
-                    outfile.write(crispr_line)
-
-                # If the line does not start with '>' or have length > 1, stop
-                else:
-                    break
-
-    return 0
-
-
-def find_sequences(fasta_file):
-    """
-    Look up the DNA sequence in a fasta file and return as dictionary.
-    """
-    sequence_dict = Fasta(fasta_file, duplicate_action="first")
-    return sequence_dict
-
-
-if __name__ == "__main__":
-    exit(read_clusters(clstr=cluster_file, table=table_file, fasta=fasta_file))
diff --git a/bin/prepare_genomes.sh b/bin/prepare_genomes.sh
index a7b4329..98cdfcc 100644
--- a/bin/prepare_genomes.sh
+++ b/bin/prepare_genomes.sh
@@ -24,16 +24,57 @@ IFS=$'\n'
 ##  for this script.
 ##
 ## Usage:
-## $ bash prepare_genomes.sh [part]
+## $ bash prepare_genomes.sh -p/--part [part] -d/--directory [directory] -h/--help
 ##  (where [part] = 'all', 'original', 'update' (=default; smallest size))
 
-part=${1:-"update"}
+# Set global (default) variables (paths derived from output_dir follow option parsing)
+part="update"
+output_dir="resources/ATB/"
+species_of_interest="config/species_of_interest.txt"
+
+print_help() {
+  # Display help for this script, then exit with status $1 (default: 0)
+  echo "Prepare genomes from AllTheBacteria for use with CRISPRscape"
+  echo
+  echo "Syntax: prepare_genomes.sh -p [part] -d [directory] [-h]"
+  echo "Options:"
+  echo "-p/--part      Select which part of AllTheBacteria to download"
+  echo "               for the selected species ('all', 'original', or"
+  echo "               'update', default=update)"
+  echo "-d/--directory Directory in which to download the files"
+  echo "               (default=resources/ATB/)"
+  echo "-h/--help      Print this help message"
+  echo
+  exit "${1:-0}"
+}
+
+while [[ $# -gt 0 ]]
+do
+  case "$1" in
+    -p|--part ) part="${2:?Option $1 requires a value}"
+    shift 2
+    ;;
+    -d|--directory ) output_dir="${2:?Option $1 requires a value}"
+    shift 2
+    ;;
+    -h|--help ) print_help
+    ;;
+    * )
+    echo "Unknown option: $1"
+    print_help 1
+  esac
+done
+# Derive paths after parsing so -d is honoured; keep exactly one trailing slash
+output_dir="${output_dir%/}/"
+genomes_of_interest="${output_dir}stats/total_genomes_of_interest.tsv"
+bakta_dir="${output_dir}annotations/"
 
 echo "Preparing to download genomes from set '${part}'"
+echo "And storing them in the directory: ${output_dir}."
+echo "----------"
 
 ## Step 1: download metadata
 echo "Step 1: downloading metadata"
-output_dir="data/ATB/"
 bash bin/download_ATB_metadata.sh ${output_dir}
 
 echo "Done downloading metadata!"
@@ -47,28 +88,30 @@ echo "----------"
 echo "Step 2: extracting sample accession IDs of species of interest"
 species_samples_file="${output_dir}all_samples_of_interest.txt"
 
-echo -e "Species of interest:\n$(cat config/species_of_interest.txt)"
+echo -e "Species of interest:\n$(cat ${species_of_interest})"
+
+total_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | wc -l)
+hq_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l)
+lq_genomes=$(zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l)
 
-total_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | wc -l)
-hq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "T$" | wc -l)
-lq_genomes=$(zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz | grep "F$" | wc -l)
+mkdir -p "${output_dir}/stats"
 
-echo -e "Total_genomes\t${total_genomes}" > data/tmp/total_genomes_of_interest.tsv
-echo -e "High-quality_genomes\t${hq_genomes}" >> data/tmp/total_genomes_of_interest.tsv
-echo -e "Low-quality_genomes\t${lq_genomes}" >> data/tmp/total_genomes_of_interest.tsv
+echo -e "Total_genomes\t${total_genomes}" > "${genomes_of_interest}"
+echo -e "High-quality_genomes\t${hq_genomes}" >> "${genomes_of_interest}"
+echo -e "Low-quality_genomes\t${lq_genomes}" >> "${genomes_of_interest}"
 
 echo "ATB contains ${total_genomes} genomes of your species of interest."
 echo "Of those, ${lq_genomes} are labeled as low-quality, which are not included for further analyses."
 echo "That means, ${hq_genomes} are available to work with."
 
 echo
-echo "Also see the file data/tmp/total_genomes_of_interest.tsv to see these"
+echo "Also see the file ${genomes_of_interest} to see these"
 echo "numbers in table form (tab-separated values)."
 echo
 
 # Use no further grep options to match anything that contains the species names,
 # including subspecies and lineages. Exclude low-quality 'HQ field == F'.
-zgrep -f config/species_of_interest.txt ${output_dir}species_calls.tsv.gz |\
+zgrep -f ${species_of_interest} ${output_dir}species_calls.tsv.gz |\
  grep -v -e "F$" | cut -f 1 > ${species_samples_file}
 
 echo "Done extracting sample names!"
@@ -88,8 +131,8 @@ update_genomes=$(zgrep -w -f ${species_samples_file}\
  ${output_dir}file_list.all.20240805.tsv.gz | cut -f 5 |\
   grep -c "incr_release")
 
-echo -e "Genomes_in_original_release\t${original_genomes}" >> data/tmp/total_genomes_of_interest.tsv
-echo -e "Genomes_in_update_release\t${update_genomes}" >> data/tmp/total_genomes_of_interest.tsv
+echo -e "Genomes_in_original_release\t${original_genomes}" >> "${genomes_of_interest}"
+echo -e "Genomes_in_update_release\t${update_genomes}" >> "${genomes_of_interest}"
 
 ## Step 3: Filter metadata to the species of interest
 echo "Step 3: Filtering metadata for species of interest"
@@ -134,50 +177,54 @@ grep -v -w -f ${species_samples_file} ${output_dir}samples_in_batches.txt >\
 # numbers:
 zgrep -w -f ${output_dir}samples_not_of_interest.txt\
  ${output_dir}file_list.all.20240805.tsv.gz | cut -f 2 | sort | uniq -c | sort -nr >\
- data/tmp/other_genomes-numbers.txt
+ ${output_dir}stats/other_genomes-numbers.txt
 
 # And get the list of batches + sample accessions, to facilitate removal:
 zgrep -w -f ${output_dir}samples_not_of_interest.txt\
  ${output_dir}file_list.all.20240805.tsv.gz |\
- cut -f 4 > data/tmp/samples_to_remove.txt
+ cut -f 4 > ${output_dir}samples_to_remove.txt
 
 echo "In the batches to download are $(wc -l ${output_dir}samples_not_of_interest.txt)"
 echo "samples that have low-quality genomes or species other than the species of interest."
 echo "These are summarised in:"
-ls -lh ${output_dir}samples_not_of_interest.txt data/tmp/other_genomes-numbers.txt data/tmp/samples_to_remove.txt
+ls -lh ${output_dir}samples_not_of_interest.txt ${output_dir}stats/other_genomes-numbers.txt ${output_dir}samples_to_remove.txt
 
 ## Step 5: Download genome sequences
 echo "Step 5: Download genome sequences"
-bash bin/download_genomes.sh ${part}
+bash bin/download_genomes.sh ${part} ${output_dir}
 echo "Finished downloading!"
-echo "The batch archives have been written to 'data/tmp/ATB/'"
-echo "and their contents extracted to 'data/tmp/assemblies/'"
+echo "The batch archives have been written to '${output_dir}archives/'"
+echo "and their contents extracted to '${output_dir}assemblies/'"
 echo "----------"
 
 ## Step 6: Remove genomes of other species
 echo "Step 6: remove genomes of species other than species of interest"
-for fasta in $(cat data/tmp/samples_to_remove.txt)
+for fasta in $(cat ${output_dir}samples_to_remove.txt)
 do
 # Verbose remove: tell what is being removed
-    rm -fv "data/tmp/assemblies/${fasta}"
+    rm -fv "${output_dir}assemblies/${fasta}"
 done
 echo "----------"
 
 ## Step 7: Download functional annotations (Bakta)
 echo "Step 7: download functional (gene) annotations"
-bash bin/download_bakta_annotations.sh ${part}
+bash bin/download_bakta_annotations.sh ${part} ${output_dir}
 echo "Finished downloading!"
-echo "The batches have been downloaded to 'data/tmp/ATB/' and extracted in 'data/tmp/annotations'"
+echo "The batches have been downloaded to '${output_dir}archives/' and extracted in '${output_dir}annotations'"
 echo "----------"
 
 ## Step 7.1: Also remove annotation files for non-of-interest samples
-# 1: adjust the file basename from 'assembly' to 'bakta'
 echo "Step 7.1: remove genome annotations of other/low-quality samples"
-for file in $(sed 's|atb.assembly.|atb.bakta.|g' data/tmp/samples_to_remove.txt)
+# adjust the file basename from 'assembly' to 'bakta'
+for file in $(sed 's|atb.assembly.|atb.bakta.|g' ${output_dir}samples_to_remove.txt)
 do
-    # 2: add 'annotations' subdirectory
-    bakta_dir="data/tmp/annotations/"
-    # 3: exchange '.fa' extension to 'bakta.json'
+    # add 'annotations' subdirectory and exchange '.fa' extension to 'bakta.json'
     json="${bakta_dir}${file/.fa/.bakta.json}"
     rm -fv ${json}
 done
+
+echo -e "\n-----\nPlease review the files under ${output_dir}:\n"
+
+ls -lh "${output_dir}"
+
+echo -e "-----\nDone!\n-----"
diff --git a/config/parameters.yaml b/config/parameters.yaml
index 7686a41..f161b4d 100755
--- a/config/parameters.yaml
+++ b/config/parameters.yaml
@@ -1,14 +1,20 @@
 # Snakemake config file for CRISPR screening workflow
 
-### 1. Input/output parameters ###
-
-working_directory: "data/tmp/"
-output_directory: "data/processed/"
-
-genomad_database: "data/genomad_db/"
-spacepharer_phage_database: "data/raw/phagescope/"
-spacepharer_plasmid_database: "data/raw/PLSDB/"
-### 2. Tool-specific parameters ###
+PhageScope_databases:
+  - "Genbank"
+  - "RefSeq"
+  - "DDBJ"
+  - "EMBL"
+  - "PhagesDB"
+  - "GPD"
+  - "GVD"
+  - "MGV"
+  - "TemPhD"
+  - "CHVD"
+  - "IGVD"
+  - "IMG_VR"
+  - "GOV2"
+  - "STV"
 
 mlst:
   threads: 20
@@ -37,3 +43,12 @@ jaeger:
 
 crispridentify:
   threads: 20
+
+download_spacepharer_databases:
+  threads: 4
+
+spacepharer:
+  threads: 20
+
+kma:
+  threads: 16
\ No newline at end of file
diff --git a/doc/allthebacteria.md b/doc/allthebacteria.md
index b59506f..9dcbd8e 100755
--- a/doc/allthebacteria.md
+++ b/doc/allthebacteria.md
@@ -9,29 +9,34 @@ After downloading all metadata using the bundled script, you have:
 ``` bash
 $ ls -sh data/ATB/
 total 1.4G
+460K all_atb_files.tsv
 679M ena_metadata.20240801.tsv.gz            74M assembly-stats.tsv.gz
  18M file_list.all.20240805.tsv.gz           63M checkm2.tsv.gz
 5.5M sample_list.txt.gz                     371M ena_metadata.0.2.20240606.tsv.gz
 17M species_calls.tsv.gz                    104M sylph.tsv.gz
 ```
 
+- A list of files `all_atb_files.tsv` 460kB
+    - Lists all batch and metadata files present in AllTheBacteria with their
+    respective file name, URL, md5 checksum and file size (not per sample).
+
 - Assembly statistics `assembly-stats.tsv.gz` 74MB
-  - Lists per sample accession the total length, number of contigs, N50 and
-   more statistics of the assembly.
+    - Lists per sample accession the total length, number of contigs, N50 and
+     more statistics of the assembly.
 
 - CheckM2 results `checkm2.tsv.gz` 63MB
-  - Lists per sample accession the results of CheckM2 including assembly
-   completeness and contamination in percentages.
+    - Lists per sample accession the results of CheckM2 including assembly
+     completeness and contamination in percentages.
 
 - ENA sample metadata `ena_metadata.20240801.tsv.gz` 677MB
-  - Lists per sample all the metadata that have been deposited in the European
-   Nucleotide Archive - this is a table with over 100 columns!
+    - Lists per sample all the metadata that have been deposited in the European
+     Nucleotide Archive - this is a table with over 100 columns!
 
 - File list `file_list.all.20240805.tsv.gz` 17MB
-  - This file lists per sample accession the corresponding batch in which it is
-   archived, with download URL, md5sum and file size of the batch archive.
-    It can be used to identify which batch archives contain species of interest,
-    e.g.
+    - This file lists per sample accession the corresponding batch in which it is
+     archived, with download URL, md5sum and file size of the batch archive.
+      It can be used to identify which batch archives contain species of interest,
+      e.g.:
 
 ```bash
 zgrep -f data/ATB/all_samples_of_interest.txt\
@@ -43,14 +48,14 @@ the script `bin/prepare_genomes.sh`. Also see the
 [manual](manual.md#download-input-genomes).)
 
 - Sample list `sample_list.txt.gz` 5.4MB
-  - This file simply lists all the sample accessions that are present in the dataset.
+    - This file simply lists all the sample accessions that are present in the dataset.
 
 - Species calls `species_calls.tsv.gz` 17MB
-  - Lists per sample accession the species identified and whether or not it is of
-   high-quality (T/F).
-    This file can be used to identify which sample accessions contain
-     high-quality genomes of the species of interest,
-    e.g.
+    - Lists per sample accession the species identified and whether or not it is of
+     high quality (T/F).
+      This file can be used to identify which sample accessions contain
+       high-quality genomes of the species of interest,
+      e.g.:
 
 ```bash
 zless species_calls.tsv.gz | grep "Campylobacter_D jejuni" |\
@@ -58,8 +63,8 @@ zless species_calls.tsv.gz | grep "Campylobacter_D jejuni" |\
 ```
 
 - Sylph results `sylph.tsv.gz` 103MB
-  - Lists per sample accession the output from Sylph, including relative
-   abundance (%) Average Nucleotide Identity score (%) and assigned species name.
+    - Lists per sample accession the output from Sylph, including relative
+     abundance (%), Average Nucleotide Identity score (%) and assigned species name.
 
 ## Note on file sizes per batch
 
diff --git a/results/Metadata_and_spacer_summary.html b/results/Metadata_and_spacer_summary.html
deleted file mode 100644
index 14f6a50..0000000
--- a/results/Metadata_and_spacer_summary.html
+++ /dev/null
@@ -1,951 +0,0 @@
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
-
-<meta charset="utf-8">
-<meta name="generator" content="quarto-1.5.57">
-
-<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
-
-<meta name="author" content="Gijswillem van Walt Meijer">
-
-<title>Metadata and spacer summary</title>
-<style>
-code{white-space: pre-wrap;}
-span.smallcaps{font-variant: small-caps;}
-div.columns{display: flex; gap: min(4vw, 1.5em);}
-div.column{flex: auto; overflow-x: auto;}
-div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
-ul.task-list{list-style: none;}
-ul.task-list li input[type="checkbox"] {
-  width: 0.8em;
-  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
-  vertical-align: middle;
-}
-/* CSS for syntax highlighting */
-pre > code.sourceCode { white-space: pre; position: relative; }
-pre > code.sourceCode > span { line-height: 1.25; }
-pre > code.sourceCode > span:empty { height: 1.2em; }
-.sourceCode { overflow: visible; }
-code.sourceCode > span { color: inherit; text-decoration: inherit; }
-div.sourceCode { margin: 1em 0; }
-pre.sourceCode { margin: 0; }
-@media screen {
-div.sourceCode { overflow: auto; }
-}
-@media print {
-pre > code.sourceCode { white-space: pre-wrap; }
-pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
-}
-pre.numberSource code
-  { counter-reset: source-line 0; }
-pre.numberSource code > span
-  { position: relative; left: -4em; counter-increment: source-line; }
-pre.numberSource code > span > a:first-child::before
-  { content: counter(source-line);
-    position: relative; left: -1em; text-align: right; vertical-align: baseline;
-    border: none; display: inline-block;
-    -webkit-touch-callout: none; -webkit-user-select: none;
-    -khtml-user-select: none; -moz-user-select: none;
-    -ms-user-select: none; user-select: none;
-    padding: 0 4px; width: 4em;
-  }
-pre.numberSource { margin-left: 3em;  padding-left: 4px; }
-div.sourceCode
-  {   }
-@media screen {
-pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
-}
-</style>
-
-
-<script src="Metadata_and_spacer_summary_files/libs/clipboard/clipboard.min.js"></script>
-<script src="Metadata_and_spacer_summary_files/libs/quarto-html/quarto.js"></script>
-<script src="Metadata_and_spacer_summary_files/libs/quarto-html/popper.min.js"></script>
-<script src="Metadata_and_spacer_summary_files/libs/quarto-html/tippy.umd.min.js"></script>
-<script src="Metadata_and_spacer_summary_files/libs/quarto-html/anchor.min.js"></script>
-<link href="Metadata_and_spacer_summary_files/libs/quarto-html/tippy.css" rel="stylesheet">
-<link href="Metadata_and_spacer_summary_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
-<script src="Metadata_and_spacer_summary_files/libs/bootstrap/bootstrap.min.js"></script>
-<link href="Metadata_and_spacer_summary_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
-<link href="Metadata_and_spacer_summary_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
-
-
-</head>
-
-<body class="fullcontent">
-
-<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
-
-<main class="content" id="quarto-document-content">
-
-<header id="title-block-header" class="quarto-title-block default">
-<div class="quarto-title">
-<h1 class="title">Metadata and spacer summary</h1>
-</div>
-
-
-<div class="quarto-title-meta-author">
-  <div class="quarto-title-meta-heading">Author</div>
-  <div class="quarto-title-meta-heading">Affiliation</div>
-  
-    <div class="quarto-title-meta-contents">
-    <p class="author">Gijswillem van Walt Meijer <a href="mailto:g.s.a.vanwaltmeijer [at] uu.nl" class="quarto-title-author-email"><i class="bi bi-envelope"></i></a> </p>
-  </div>
-  <div class="quarto-title-meta-contents">
-      </div>
-  </div>
-
-<div class="quarto-title-meta">
-
-      
-  
-    
-  </div>
-  
-
-
-</header>
-
-
-<section id="crispr-spacers-in-campylobacter" class="level1">
-<h1>CRISPR spacers in <em>Campylobacter</em></h1>
-<section id="summary-of-metadata-and-spacers-of-crispr-arrays-in-campylobacter-genomes-from-allthebacteria" class="level3">
-<h3 class="anchored" data-anchor-id="summary-of-metadata-and-spacers-of-crispr-arrays-in-campylobacter-genomes-from-allthebacteria">Summary of metadata and spacers of CRISPR arrays in Campylobacter genomes from <a href="https://allthebacteria.readthedocs.io/" title="AllTheBacteria documentation">AllTheBacteria</a></h3>
-<p>As described in <em>Descriptive_statistics.html,</em> a large amount of bacteria of <em>Campylobacter jejuni</em> and <em>coli</em> have been downloaded from AllTheBacteria (ATB) including their metadata. These genomes have then been screened for CRISPR-cas operons using <a href="https://www.biorxiv.org/content/10.1101/2020.05.15.097824v1" title="Russel et al., 2020. bioRxiv">CCTyper</a> (version 1.8.0). This has identified a large amount of spacers (figure 1) which has been further explored in <em>Descriptive_statistics.html</em> on its own. However, for the intended research, the origin of the Bacteria will need to be known so that it can potentially be linked to specific spacers or patterns. For this the metadata of ATB is vital. But with cursory inspection (figure 2), many of the origins are missing, unclear and without standardization.</p>
-<p>In this report, we summarize the spacers data and metadata and discuss certain problems that remain within the data.</p>
-<p>First relevant data will be imported.</p>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co">#all spacers from CCtyper were combined into a tsv file as FASTA format</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a>all_spacers <span class="ot">&lt;-</span> <span class="fu">read_tsv</span>(<span class="fu">here</span>(<span class="st">"joining CCtyper/all_spacers.tsv"</span>), <span class="at">show_col_types=</span><span class="cn">FALSE</span>)</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">names</span>(all_spacers) <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">"Sample_accession"</span>, <span class="st">"Spacer_sequence"</span>)</span>
-<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="co">#contig index are extracted from sample accession and put into their own column for pattern analysis</span></span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a>all_spacers[<span class="dv">3</span>] <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(all_spacers<span class="sc">$</span>Sample_accession, <span class="st">"</span><span class="sc">\\</span><span class="st">d*:</span><span class="sc">\\</span><span class="st">d*"</span>)</span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a>all_spacers <span class="ot">&lt;-</span> <span class="fu">separate</span>(all_spacers, <span class="dv">3</span>, <span class="at">into =</span> <span class="fu">c</span>(<span class="st">"Array_index"</span>, <span class="st">"Spacer_index"</span>))</span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a>all_spacers[<span class="st">"Contig_accession"</span>] <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(all_spacers<span class="sc">$</span>Sample_accession, <span class="st">"contig</span><span class="sc">\\</span><span class="st">d*"</span>)</span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="co">#clean up sample accession</span></span>
-<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a>all_spacers<span class="sc">$</span>Sample_accession <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(all_spacers<span class="sc">$</span>Sample_accession, <span class="st">"(?&lt;=&gt;)</span><span class="sc">\\</span><span class="st">w*"</span>)</span>
-<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co">#these spacers also include repeats and can be orphans or connected to a cas operon, this is added to the all spacer frame.</span></span>
-<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a>orphan_crispr <span class="ot">&lt;-</span> <span class="fu">read_tsv</span>(<span class="fu">here</span>(<span class="st">"joining CCtyper/crisprs_orphan-concatenated.tab"</span>), <span class="at">show_col_types =</span> F)[,<span class="fu">c</span>(<span class="dv">3</span>, <span class="dv">6</span>)] <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="st">"orphan?"</span> <span class="ot">=</span> <span class="cn">TRUE</span>) <span class="sc">%&gt;%</span> <span class="fu">separate</span>(<span class="at">col =</span> <span class="dv">1</span>, <span class="at">sep =</span> <span class="st">"</span><span class="sc">\\</span><span class="st">.|</span><span class="sc">\\</span><span class="st">_"</span>, <span class="at">into =</span> <span class="fu">c</span>(<span class="st">"Sample_accession"</span>, <span class="st">"Contig_accession"</span>, <span class="st">"Array_index"</span>))</span>
-<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a>orphan_crispr<span class="sc">$</span>Contig_accession <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(orphan_crispr<span class="sc">$</span>Contig_accession, <span class="st">"^</span><span class="sc">\\</span><span class="st">w*"</span>)</span>
-<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a>crispr_cas <span class="ot">&lt;-</span> <span class="fu">read_tsv</span>(<span class="fu">here</span>(<span class="st">"joining CCtyper/crisprs_near_cas-concatenated.tab"</span>), <span class="at">show_col_types =</span> F)[,<span class="fu">c</span>(<span class="dv">3</span>, <span class="dv">6</span>)] <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="st">"orphan?"</span> <span class="ot">=</span> <span class="cn">FALSE</span>) <span class="sc">%&gt;%</span> <span class="fu">separate</span>(<span class="at">col =</span> <span class="dv">1</span>, <span class="at">sep =</span> <span class="st">"</span><span class="sc">\\</span><span class="st">.|</span><span class="sc">\\</span><span class="st">_"</span>, <span class="at">into =</span> <span class="fu">c</span>(<span class="st">"Sample_accession"</span>, <span class="st">"Contig_accession"</span>, <span class="st">"Array_index"</span>))</span>
-<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a>crispr_cas<span class="sc">$</span>Contig_accession <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(crispr_cas<span class="sc">$</span>Contig_accession, <span class="st">"^</span><span class="sc">\\</span><span class="st">w*"</span>)</span>
-<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a>all_spacers <span class="ot">&lt;-</span> <span class="fu">left_join</span>(all_spacers, <span class="fu">bind_rows</span>(orphan_crispr, crispr_cas))</span>
-<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="co">#ENA metadata is downloaded from ATB and was filtered for original analysis in descriptive_statistics</span></span>
-<span id="cb1-24"><a href="#cb1-24" aria-hidden="true" tabindex="-1"></a>colnames <span class="ot">&lt;-</span> <span class="fu">read_table</span>(<span class="fu">here</span>(<span class="st">"metadata project/colnames_metadata_filtered.txt"</span>), <span class="at">col_names =</span> F, <span class="at">show_col_types =</span> F)</span>
-<span id="cb1-25"><a href="#cb1-25" aria-hidden="true" tabindex="-1"></a>metadata_source <span class="ot">&lt;-</span></span>
-<span id="cb1-26"><a href="#cb1-26" aria-hidden="true" tabindex="-1"></a>  <span class="fu">read_tsv</span>(<span class="fu">here</span>(<span class="st">"metadata project/ena_metadata.20240801.selection-only_Campylobacter.tsv"</span>),</span>
-<span id="cb1-27"><a href="#cb1-27" aria-hidden="true" tabindex="-1"></a>           <span class="at">col_names =</span> colnames<span class="sc">$</span>X2, <span class="at">show_col_types =</span> <span class="cn">FALSE</span>)</span>
-<span id="cb1-28"><a href="#cb1-28" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-29"><a href="#cb1-29" aria-hidden="true" tabindex="-1"></a><span class="co">#further filtering metadata as not all are needed for this analysis</span></span>
-<span id="cb1-30"><a href="#cb1-30" aria-hidden="true" tabindex="-1"></a>metadata <span class="ot">&lt;-</span> <span class="fu">select</span>(metadata_source, <span class="fu">c</span>(sample_accession, scientific_name, isolation_source, host))</span>
-<span id="cb1-31"><a href="#cb1-31" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-32"><a href="#cb1-32" aria-hidden="true" tabindex="-1"></a><span class="co">#in the metadata there is a distinction between unclear data and data that is simply missing/unspecified. missing data is turned into NA and data that is present in isolation source but not in host is copied to host.</span></span>
-<span id="cb1-33"><a href="#cb1-33" aria-hidden="true" tabindex="-1"></a>missingno <span class="ot">&lt;-</span> metadata <span class="sc">%&gt;%</span></span>
-<span id="cb1-34"><a href="#cb1-34" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="fu">across</span>(<span class="fu">c</span>(isolation_source, host), <span class="sc">~</span><span class="fu">str_to_lower</span>(.x)))<span class="sc">%&gt;%</span></span>
-<span id="cb1-35"><a href="#cb1-35" aria-hidden="true" tabindex="-1"></a>  <span class="fu">replace_with_na</span>(<span class="at">replace =</span> <span class="fu">list</span>(<span class="at">isolation_source =</span> <span class="fu">c</span>(</span>
-<span id="cb1-36"><a href="#cb1-36" aria-hidden="true" tabindex="-1"></a>    <span class="st">"missing"</span>, <span class="st">"other"</span>, <span class="st">"not collected"</span>, <span class="st">"no source specified"</span>, <span class="st">"mising"</span></span>
-<span id="cb1-37"><a href="#cb1-37" aria-hidden="true" tabindex="-1"></a>  ), <span class="at">host =</span> <span class="fu">c</span>(</span>
-<span id="cb1-38"><a href="#cb1-38" aria-hidden="true" tabindex="-1"></a>    <span class="st">"missing"</span>, <span class="st">"other"</span>, <span class="st">"not collected"</span>, <span class="st">"no source specified"</span>, <span class="st">"mising"</span></span>
-<span id="cb1-39"><a href="#cb1-39" aria-hidden="true" tabindex="-1"></a>  ))) <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">host =</span> <span class="fu">ifelse</span>(<span class="fu">is.na</span>(host), isolation_source, host))</span>
-<span id="cb1-40"><a href="#cb1-40" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-41"><a href="#cb1-41" aria-hidden="true" tabindex="-1"></a><span class="co">#There are also bacteria that are incorrectly noted in the ENA metadata compared to the ATB designation of species</span></span>
-<span id="cb1-42"><a href="#cb1-42" aria-hidden="true" tabindex="-1"></a>simplified_species <span class="ot">&lt;-</span> <span class="fu">read_delim</span>(</span>
-<span id="cb1-43"><a href="#cb1-43" aria-hidden="true" tabindex="-1"></a>    <span class="at">file =</span> <span class="fu">here</span>(<span class="st">"metadata project/sylph.tsv.gz"</span>),</span>
-<span id="cb1-44"><a href="#cb1-44" aria-hidden="true" tabindex="-1"></a>    <span class="at">delim =</span> <span class="st">"</span><span class="sc">\t</span><span class="st">"</span>,</span>
-<span id="cb1-45"><a href="#cb1-45" aria-hidden="true" tabindex="-1"></a>    <span class="at">show_col_types =</span> F) <span class="sc">%&gt;%</span></span>
-<span id="cb1-46"><a href="#cb1-46" aria-hidden="true" tabindex="-1"></a>    <span class="fu">select</span>(Sample, Species)</span>
-<span id="cb1-47"><a href="#cb1-47" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-48"><a href="#cb1-48" aria-hidden="true" tabindex="-1"></a>simplified_species <span class="ot">&lt;-</span> simplified_species <span class="sc">%&gt;%</span> <span class="fu">filter</span>(<span class="fu">str_detect</span>(Species,<span class="st">"Campylobacter_D (jejuni|coli)"</span>)) <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">Species =</span> <span class="fu">str_replace_all</span>(Species, <span class="st">"_(A|B|C|D)"</span>, <span class="st">""</span>))</span>
-<span id="cb1-49"><a href="#cb1-49" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-50"><a href="#cb1-50" aria-hidden="true" tabindex="-1"></a>merged_metadata <span class="ot">&lt;-</span> <span class="fu">left_join</span>(missingno, simplified_species, <span class="at">by =</span> <span class="fu">c</span>(<span class="st">"sample_accession"</span> <span class="ot">=</span> <span class="st">"Sample"</span>)) <span class="sc">%&gt;%</span> <span class="fu">distinct</span>(sample_accession, <span class="at">.keep_all =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-</div>
-<p><br>
-</p>
-<p><br>
-</p>
-</section>
-<section id="metadata-analysis" class="level2">
-<h2 class="anchored" data-anchor-id="metadata-analysis">Metadata analysis</h2>
-<section id="highlighting-the-importance-of-fair-metadata-handling" class="level3">
-<h3 class="anchored" data-anchor-id="highlighting-the-importance-of-fair-metadata-handling">Highlighting the importance of FAIR metadata handling</h3>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">cat</span>(<span class="st">"10 examples of 'host' metadata"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>10 examples of 'host' metadata</code></pre>
-</div>
-</div>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">as.data.frame</span>(<span class="fu">list</span>(<span class="at">Hosts =</span> <span class="fu">unique</span>(missingno<span class="sc">$</span>host)[<span class="dv">2</span><span class="sc">:</span><span class="dv">11</span>]))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>                Hosts
-1               feces
-2                calf
-3          sus scrofa
-4       gallus gallus
-5        homo sapiens
-6     chicken carcass
-7             poultry
-8  raw intact chicken
-9                food
-10     chicken breast</code></pre>
-</div>
-</div>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">cat</span>(<span class="st">"10 examples of 'isolation source' metadata"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>10 examples of 'isolation source' metadata</code></pre>
-</div>
-</div>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">as.data.frame</span>(<span class="fu">list</span>(<span class="at">isolation_source =</span> <span class="fu">unique</span>(missingno<span class="sc">$</span>isolation_source)[<span class="dv">2</span><span class="sc">:</span><span class="dv">11</span>]))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>               isolation_source
-1                         feces
-2               chicken carcass
-3                          farm
-4            raw intact chicken
-5                         human
-6                          food
-7                chicken breast
-8  animal-chicken-young chicken
-9     animal-swine-market swine
-10      animal-cattle-dairy cow</code></pre>
-</div>
-</div>
-<p>This shows some examples of how this metadata has inconsistent notation of origins. Many problems occur when trying to parse this data as is, for example ‘raw intact chicken’ and ‘chicken carcass’ describe the same host species. This additional information is not useful for ‘host’ metadata but is for ‘isolation source’, yet information in isolation source is also lacking as the distinction between broiler and meat producing chickens is sometimes difficult to discern. Unfortunately this alongside spelling errors (‘boivine’) and unclear designations (‘food’), makes the metadata require a significant amount of processing. Not to mention the amount of data that potentially could have information regarding origin, but was never noted. This in turn highlights the importance of the FAIR principles for research.</p>
-</section>
-<section id="consolidating-origins" class="level3">
-<h3 class="anchored" data-anchor-id="consolidating-origins">Consolidating origins</h3>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co">#TODO: keep discussing and updating categories, data crawl through original studies to find actual hosts and investigate why about a 1000 samples are lost from left_join.</span></span>
-<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="co">#consolidates the many differing inputs of host into more managable categories: Meat producing poultry, Adult cattle, layers, veal calves, pets, small ruminants, pigs, surface water and water birds, wild birds, non-water environment and human. </span></span>
-<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a><span class="co">#this last category consists of all species not part of these initial categories, or unclear enough to not be useful.</span></span>
-<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a>definedmeta <span class="ot">&lt;-</span> merged_metadata <span class="sc">%&gt;%</span></span>
-<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> host) <span class="sc">%&gt;%</span></span>
-<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(field|environment|pasture|sediment|soil|surfaces|crates).*"</span>,</span>
-<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a>      <span class="st">"dry environment"</span></span>
-<span id="cb10-13"><a href="#cb10-13" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-14"><a href="#cb10-14" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-15"><a href="#cb10-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-16"><a href="#cb10-16" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-17"><a href="#cb10-17" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-18"><a href="#cb10-18" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(calf|veal).*"</span>,</span>
-<span id="cb10-19"><a href="#cb10-19" aria-hidden="true" tabindex="-1"></a>      <span class="st">"calves"</span></span>
-<span id="cb10-20"><a href="#cb10-20" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-21"><a href="#cb10-21" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-22"><a href="#cb10-22" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-23"><a href="#cb10-23" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-24"><a href="#cb10-24" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-25"><a href="#cb10-25" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(taurus|cattle|milk|boi?vine|cow|heifer|steer|beef|dairy|indicus).*"</span>,</span>
-<span id="cb10-26"><a href="#cb10-26" aria-hidden="true" tabindex="-1"></a>      <span class="st">"adult cattle"</span></span>
-<span id="cb10-27"><a href="#cb10-27" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-28"><a href="#cb10-28" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span> </span>
-<span id="cb10-29"><a href="#cb10-29" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-30"><a href="#cb10-30" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">ifelse</span>(<span class="fu">str_detect</span>(</span>
-<span id="cb10-31"><a href="#cb10-31" aria-hidden="true" tabindex="-1"></a>      isolation_source,</span>
-<span id="cb10-32"><a href="#cb10-32" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*((?&lt;!v)egg|layer).*"</span>), <span class="at">yes =</span> <span class="st">"broiler"</span>, <span class="at">no =</span> host)</span>
-<span id="cb10-33"><a href="#cb10-33" aria-hidden="true" tabindex="-1"></a>    ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-34"><a href="#cb10-34" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> <span class="fu">str_replace</span>(category, <span class="st">".*(water|lagoon|sewage|river|wetland|fulica</span><span class="sc">\\</span><span class="st">satra|platyrhynchos|duck).*"</span>, <span class="st">"surface water and water birds"</span>)) <span class="sc">%&gt;%</span></span>
-<span id="cb10-35"><a href="#cb10-35" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-36"><a href="#cb10-36" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-37"><a href="#cb10-37" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-38"><a href="#cb10-38" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(pheasant|phasianus|gallopavo|hen|chi[ec]ken|turkey|broiler|gallus|cb-|drumsticks|poultry).*"</span>,</span>
-<span id="cb10-39"><a href="#cb10-39" aria-hidden="true" tabindex="-1"></a>      <span class="st">"meat producing poultry"</span></span>
-<span id="cb10-40"><a href="#cb10-40" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-41"><a href="#cb10-41" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-42"><a href="#cb10-42" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-43"><a href="#cb10-43" aria-hidden="true" tabindex="-1"></a>    category,</span>
-<span id="cb10-44"><a href="#cb10-44" aria-hidden="true" tabindex="-1"></a>    <span class="st">".*(sus</span><span class="sc">\\</span><span class="st">sscrofa|porcine|pig|swine|sow|pork).*"</span>,</span>
-<span id="cb10-45"><a href="#cb10-45" aria-hidden="true" tabindex="-1"></a>    <span class="st">"swine/pig"</span></span>
-<span id="cb10-46"><a href="#cb10-46" aria-hidden="true" tabindex="-1"></a>  )) <span class="sc">%&gt;%</span></span>
-<span id="cb10-47"><a href="#cb10-47" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-48"><a href="#cb10-48" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-49"><a href="#cb10-49" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-50"><a href="#cb10-50" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(puppy|canii?ne|cat|feline|dog|kitten|pet|familiaris)(?!tle).*"</span>,</span>
-<span id="cb10-51"><a href="#cb10-51" aria-hidden="true" tabindex="-1"></a>      <span class="st">"pet animals"</span></span>
-<span id="cb10-52"><a href="#cb10-52" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-53"><a href="#cb10-53" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-54"><a href="#cb10-54" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> <span class="fu">str_replace</span>(category, <span class="st">".*(human|clinical|guillain|sapiens).*"</span>, <span class="st">"human"</span>)) <span class="sc">%&gt;%</span></span>
-<span id="cb10-55"><a href="#cb10-55" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-56"><a href="#cb10-56" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-57"><a href="#cb10-57" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-58"><a href="#cb10-58" aria-hidden="true" tabindex="-1"></a>      <span class="st">".*(avian|cloaca|(?&lt;!water</span><span class="sc">\\</span><span class="st">s)bird|columba livia|crow|corvus|dove).*"</span>,</span>
-<span id="cb10-59"><a href="#cb10-59" aria-hidden="true" tabindex="-1"></a>      <span class="st">"wild avian"</span></span>
-<span id="cb10-60"><a href="#cb10-60" aria-hidden="true" tabindex="-1"></a>    )</span>
-<span id="cb10-61"><a href="#cb10-61" aria-hidden="true" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
-<span id="cb10-62"><a href="#cb10-62" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> <span class="fu">str_replace</span>(category, <span class="st">".*(sheep|aries|ovine|goat|hircus).*"</span>, <span class="st">"small ruminants"</span>)) <span class="sc">%&gt;%</span></span>
-<span id="cb10-63"><a href="#cb10-63" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">category =</span> <span class="fu">str_replace</span>(category, <span class="st">".*(human|sapiens).*"</span>, <span class="st">"human"</span>)) <span class="sc">%&gt;%</span></span>
-<span id="cb10-64"><a href="#cb10-64" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(</span>
-<span id="cb10-65"><a href="#cb10-65" aria-hidden="true" tabindex="-1"></a>    <span class="at">category =</span> <span class="fu">str_replace</span>(</span>
-<span id="cb10-66"><a href="#cb10-66" aria-hidden="true" tabindex="-1"></a>      category,</span>
-<span id="cb10-67"><a href="#cb10-67" aria-hidden="true" tabindex="-1"></a>      <span class="st">"^(?!(dry environment|calves|adult cattle|broiler|meat producing poultry|swine/pig|pet animals|wild avian|small ruminants|food|feces|human|surface water and water birds)).*"</span>,</span>
-<span id="cb10-68"><a href="#cb10-68" aria-hidden="true" tabindex="-1"></a>      <span class="st">"other/undetermined"</span></span>
-<span id="cb10-69"><a href="#cb10-69" aria-hidden="true" tabindex="-1"></a>    ))</span>
-<span id="cb10-70"><a href="#cb10-70" aria-hidden="true" tabindex="-1"></a><span class="co">#calculate frequency of categories </span></span>
-<span id="cb10-71"><a href="#cb10-71" aria-hidden="true" tabindex="-1"></a>origincounts <span class="ot">&lt;-</span> definedmeta <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(category) <span class="sc">%&gt;%</span> <span class="fu">summarise</span>(<span class="at">total_n =</span> <span class="fu">n</span>()) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb10-72"><a href="#cb10-72" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-73"><a href="#cb10-73" aria-hidden="true" tabindex="-1"></a><span class="co">#calculate percentages and create labels</span></span>
-<span id="cb10-74"><a href="#cb10-74" aria-hidden="true" tabindex="-1"></a>processedcount <span class="ot">&lt;-</span></span>
-<span id="cb10-75"><a href="#cb10-75" aria-hidden="true" tabindex="-1"></a>  origincounts <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">percentage =</span> <span class="fu">round</span>(total_n <span class="sc">/</span> <span class="fu">sum</span>(total_n) <span class="sc">*</span> <span class="dv">100</span>, <span class="dv">3</span>))</span>
-<span id="cb10-76"><a href="#cb10-76" aria-hidden="true" tabindex="-1"></a>processedcount<span class="sc">$</span>labels <span class="ot">&lt;-</span> <span class="fu">paste</span>(processedcount<span class="sc">$</span>category, <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>, processedcount<span class="sc">$</span>total_n, <span class="st">"("</span>, processedcount<span class="sc">$</span>percentage, <span class="st">"%)"</span>) </span>
-<span id="cb10-77"><a href="#cb10-77" aria-hidden="true" tabindex="-1"></a>processedcount <span class="ot">&lt;-</span> processedcount <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb10-78"><a href="#cb10-78" aria-hidden="true" tabindex="-1"></a>summed_N <span class="ot">&lt;-</span> <span class="fu">as.character</span>(<span class="fu">sum</span>(processedcount<span class="sc">$</span>total_n))</span>
-<span id="cb10-79"><a href="#cb10-79" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-80"><a href="#cb10-80" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-81"><a href="#cb10-81" aria-hidden="true" tabindex="-1"></a><span class="fu">treemap</span>(</span>
-<span id="cb10-82"><a href="#cb10-82" aria-hidden="true" tabindex="-1"></a>  processedcount,</span>
-<span id="cb10-83"><a href="#cb10-83" aria-hidden="true" tabindex="-1"></a>  <span class="at">index =</span> <span class="fu">c</span>(<span class="st">"labels"</span>),</span>
-<span id="cb10-84"><a href="#cb10-84" aria-hidden="true" tabindex="-1"></a>  <span class="at">vSize =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb10-85"><a href="#cb10-85" aria-hidden="true" tabindex="-1"></a>  <span class="at">vColor =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb10-86"><a href="#cb10-86" aria-hidden="true" tabindex="-1"></a>  <span class="at">draw =</span> <span class="cn">TRUE</span>,</span>
-<span id="cb10-87"><a href="#cb10-87" aria-hidden="true" tabindex="-1"></a>  <span class="at">border.col =</span> <span class="st">"black"</span>,</span>
-<span id="cb10-88"><a href="#cb10-88" aria-hidden="true" tabindex="-1"></a>  <span class="at">title =</span> <span class="fu">paste</span>(<span class="st">"Treemap of Categories with total N:"</span>, <span class="fu">sum</span>(processedcount<span class="sc">$</span>total_n)),</span>
-<span id="cb10-89"><a href="#cb10-89" aria-hidden="true" tabindex="-1"></a>  <span class="at">fontsize.title =</span> <span class="dv">16</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="Metadata_and_spacer_summary_files/figure-html/unnamed-chunk-5-1.png" class="img-fluid figure-img" width="672"></p>
-</figure>
-</div>
-</div>
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co">#same calculations but without NA</span></span>
-<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>processedcount <span class="ot">&lt;-</span></span>
-<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>  origincounts[<span class="sc">-</span><span class="dv">1</span>,] <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">percentage =</span> <span class="fu">round</span>(total_n <span class="sc">/</span> <span class="fu">sum</span>(total_n) <span class="sc">*</span> <span class="dv">100</span>, <span class="dv">3</span>))</span>
-<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a>processedcount<span class="sc">$</span>labels <span class="ot">&lt;-</span> <span class="fu">paste</span>(processedcount<span class="sc">$</span>category, <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>, processedcount<span class="sc">$</span>total_n, <span class="st">"("</span>, processedcount<span class="sc">$</span>percentage, <span class="st">"%)"</span>) </span>
-<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a>processedcount <span class="ot">&lt;-</span> processedcount <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a><span class="fu">treemap</span>(</span>
-<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a>  processedcount,</span>
-<span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a>  <span class="at">index =</span> <span class="fu">c</span>(<span class="st">"labels"</span>),</span>
-<span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a>  <span class="at">vSize =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb11-12"><a href="#cb11-12" aria-hidden="true" tabindex="-1"></a>  <span class="at">vColor =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb11-13"><a href="#cb11-13" aria-hidden="true" tabindex="-1"></a>  <span class="at">draw =</span> <span class="cn">TRUE</span>,</span>
-<span id="cb11-14"><a href="#cb11-14" aria-hidden="true" tabindex="-1"></a>  <span class="at">border.col =</span> <span class="st">"black"</span>,</span>
-<span id="cb11-15"><a href="#cb11-15" aria-hidden="true" tabindex="-1"></a>  <span class="at">title =</span> <span class="fu">paste</span>(<span class="st">"Treemap of Categories (without NA) with total N:"</span>, <span class="fu">sum</span>(processedcount<span class="sc">$</span>total_n)),</span>
-<span id="cb11-16"><a href="#cb11-16" aria-hidden="true" tabindex="-1"></a>  <span class="at">fontsize.title =</span> <span class="dv">16</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="Metadata_and_spacer_summary_files/figure-html/unnamed-chunk-5-2.png" class="img-fluid figure-img" width="672"></p>
-</figure>
-</div>
-</div>
-</div>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>processedcount[,<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code># A tibble: 13 × 3
-   host                   total_n percentage
-   &lt;chr&gt;                    &lt;int&gt;      &lt;dbl&gt;
- 1 meat producing poultry   39772     50.2  
- 2 human                    12773     16.1  
- 3 adult cattle              9170     11.6  
- 4 other/undetermined        8215     10.4  
- 5 swine/pig                 3668      4.63 
- 6 food                      2974      3.75 
- 7 dry environment            775      0.978
- 8 small ruminants            689      0.87 
- 9 wild avian                 451      0.569
-10 pet animals                247      0.312
-11 calves                     182      0.23 
-12 feces                      180      0.227
-13 egg layer                  140      0.177</code></pre>
-</div>
-</div>
-<p>This is a visualization of the categories that this document will work with going forward. As can be noticed, poultry is abundant with other species quickly falling off. Additionally a large amount of the host and isolation_source data is NA, considered to be mostly unusable for source attribution. Though this is still a large amount of data still to use, lets see how this is divided between <em>jejuni</em> and <em>coli</em></p>
-</section>
-<section id="campylobacter-species" class="level3">
-<h3 class="anchored" data-anchor-id="campylobacter-species">Campylobacter species</h3>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb14"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="co">#same calculations as above, but now also additionally grouped on species.</span></span>
-<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>origincounts_jejuni <span class="ot">&lt;-</span> definedmeta <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(category, Species) <span class="sc">%&gt;%</span> <span class="fu">filter</span>(<span class="fu">as.character</span>(Species) <span class="sc">==</span> <span class="st">"Campylobacter jejuni"</span>) <span class="sc">%&gt;%</span> <span class="fu">summarise</span>(<span class="at">total_n =</span> <span class="fu">n</span>()) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a>summed_N <span class="ot">&lt;-</span> <span class="fu">sum</span>(origincounts_jejuni[<span class="sc">-</span><span class="dv">1</span>,]<span class="sc">$</span>total_n)</span>
-<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a>processedcount_jejuni <span class="ot">&lt;-</span></span>
-<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a>  origincounts_jejuni[<span class="sc">-</span><span class="dv">1</span>,] <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">percentage =</span> <span class="fu">round</span>(total_n <span class="sc">/</span> summed_N <span class="sc">*</span> <span class="dv">100</span>, <span class="dv">3</span>))</span>
-<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a>processedcount_jejuni<span class="sc">$</span>labels <span class="ot">&lt;-</span> <span class="fu">paste</span>(processedcount_jejuni<span class="sc">$</span>category, <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>, processedcount_jejuni<span class="sc">$</span>total_n, <span class="st">"("</span>, processedcount_jejuni<span class="sc">$</span>percentage, <span class="st">"%)"</span>) </span>
-<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a>processedcount_jejuni <span class="ot">&lt;-</span> processedcount_jejuni <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="fu">treemap</span>(</span>
-<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a>  processedcount_jejuni,</span>
-<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a>  <span class="at">index =</span> <span class="fu">c</span>(<span class="st">"labels"</span>),</span>
-<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a>  <span class="at">vSize =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a>  <span class="at">vColor =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a>  <span class="at">draw =</span> <span class="cn">TRUE</span>,</span>
-<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a>  <span class="at">border.col =</span> <span class="st">"black"</span>,</span>
-<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a>  <span class="at">title =</span> <span class="fu">paste</span>(<span class="st">"Treemap of C. jejuni Categories (without NA) with total N:"</span>, <span class="fu">sum</span>(processedcount_jejuni<span class="sc">$</span>total_n)),</span>
-<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a>  <span class="at">fontsize.title =</span> <span class="dv">16</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="Metadata_and_spacer_summary_files/figure-html/unnamed-chunk-7-1.png" class="img-fluid figure-img" width="672"></p>
-</figure>
-</div>
-</div>
-</div>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>origincounts_coli <span class="ot">&lt;-</span> definedmeta <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(category, Species) <span class="sc">%&gt;%</span> <span class="fu">filter</span>(<span class="fu">as.character</span>(Species) <span class="sc">==</span> <span class="st">"Campylobacter coli"</span>) <span class="sc">%&gt;%</span> <span class="fu">summarise</span>(<span class="at">total_n =</span> <span class="fu">n</span>()) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a>summed_N <span class="ot">&lt;-</span> <span class="fu">sum</span>(origincounts_coli[<span class="sc">-</span><span class="dv">2</span>,]<span class="sc">$</span>total_n)</span>
-<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a>processedcount_coli <span class="ot">&lt;-</span></span>
-<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a>  origincounts_coli[<span class="sc">-</span><span class="dv">2</span>,] <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">percentage =</span> <span class="fu">round</span>(total_n <span class="sc">/</span> summed_N <span class="sc">*</span> <span class="dv">100</span>, <span class="dv">3</span>))</span>
-<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a>processedcount_coli<span class="sc">$</span>labels <span class="ot">&lt;-</span> <span class="fu">paste</span>(processedcount_coli<span class="sc">$</span>category, <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>, processedcount_coli<span class="sc">$</span>total_n, <span class="st">"("</span>, processedcount_coli<span class="sc">$</span>percentage, <span class="st">"%)"</span>) </span>
-<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a>processedcount_coli <span class="ot">&lt;-</span> processedcount_coli <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(total_n))</span>
-<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="fu">treemap</span>(</span>
-<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a>  processedcount_coli,</span>
-<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a>  <span class="at">index =</span> <span class="fu">c</span>(<span class="st">"labels"</span>),</span>
-<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a>  <span class="at">vSize =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a>  <span class="at">vColor =</span> <span class="st">"total_n"</span>,  </span>
-<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a>  <span class="at">draw =</span> <span class="cn">TRUE</span>,</span>
-<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a>  <span class="at">border.col =</span> <span class="st">"black"</span>,</span>
-<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a>  <span class="at">title =</span> <span class="fu">paste</span>(<span class="st">"Treemap of C. Coli Categories (without NA) with total N:"</span>, <span class="fu">sum</span>(processedcount_coli<span class="sc">$</span>total_n)),</span>
-<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a>  <span class="at">fontsize.title =</span> <span class="dv">16</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="Metadata_and_spacer_summary_files/figure-html/unnamed-chunk-8-1.png" class="img-fluid figure-img" width="672"></p>
-</figure>
-</div>
-</div>
-</div>
-<p>As can be seen, meat producing poultry is most frequent in both species. Though interestingly, swine are much more frequent within <em>Campylobacter coli</em> than in <em>C.</em> <em>jejuni</em>. Next lets look at the spacers found</p>
-</section>
-</section>
-<section id="spacer-analysis-with-metadata" class="level2">
-<h2 class="anchored" data-anchor-id="spacer-analysis-with-metadata">Spacer analysis with metadata</h2>
-<div class="cell">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="co">#join the two files</span></span>
-<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>merged_data <span class="ot">&lt;-</span> <span class="fu">inner_join</span>(all_spacers, definedmeta, <span class="at">by =</span> <span class="fu">c</span>(<span class="st">"Sample_accession"</span> <span class="ot">=</span> <span class="st">"sample_accession"</span>))</span>
-<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>merged_data <span class="ot">&lt;-</span> <span class="fu">unique</span>(merged_data)</span>
-<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="co">#condensing spacer sequences into unique sequences for faster calculation and performing an initial count of spacers</span></span>
-<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a>counts_spacer <span class="ot">&lt;-</span> merged_data <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(Spacer_sequence, category) <span class="sc">%&gt;%</span> <span class="fu">count</span>()</span>
-<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a><span class="co">#some spacer sequences are directly complementary to each other and so the same spacer</span></span>
-<span id="cb16-10"><a href="#cb16-10" aria-hidden="true" tabindex="-1"></a>dna <span class="ot">&lt;-</span> <span class="fu">DNAStringSet</span>(counts_spacer<span class="sc">$</span>Spacer_sequence)</span>
-<span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a>reverse_complements <span class="ot">&lt;-</span> <span class="fu">reverseComplement</span>(dna)</span>
-<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a>processed <span class="ot">&lt;-</span> <span class="fu">logical</span>(<span class="fu">length</span>(dna))</span>
-<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="fu">seq_along</span>(dna)) {</span>
-<span id="cb16-16"><a href="#cb16-16" aria-hidden="true" tabindex="-1"></a>  <span class="cf">if</span> (processed[i]) <span class="cf">next</span></span>
-<span id="cb16-17"><a href="#cb16-17" aria-hidden="true" tabindex="-1"></a>  match <span class="ot">&lt;-</span> <span class="fu">which</span>(<span class="fu">as.character</span>(dna) <span class="sc">==</span> <span class="fu">as.character</span>(reverse_complements[i]))</span>
-<span id="cb16-18"><a href="#cb16-18" aria-hidden="true" tabindex="-1"></a>  <span class="cf">if</span> (<span class="fu">length</span>(match) <span class="sc">&gt;</span> <span class="dv">0</span>) {</span>
-<span id="cb16-19"><a href="#cb16-19" aria-hidden="true" tabindex="-1"></a>    processed[match] <span class="ot">&lt;-</span> <span class="cn">TRUE</span></span>
-<span id="cb16-20"><a href="#cb16-20" aria-hidden="true" tabindex="-1"></a>    dna[match] <span class="ot">&lt;-</span> reverse_complements[match]</span>
-<span id="cb16-21"><a href="#cb16-21" aria-hidden="true" tabindex="-1"></a>  }</span>
-<span id="cb16-22"><a href="#cb16-22" aria-hidden="true" tabindex="-1"></a>}</span>
-<span id="cb16-23"><a href="#cb16-23" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-24"><a href="#cb16-24" aria-hidden="true" tabindex="-1"></a>dna_list <span class="ot">&lt;-</span> <span class="fu">as.character</span>(dna)</span>
-<span id="cb16-25"><a href="#cb16-25" aria-hidden="true" tabindex="-1"></a>counts_spacer<span class="sc">$</span>Spacer_sequence <span class="ot">&lt;-</span> dna_list</span>
-<span id="cb16-26"><a href="#cb16-26" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-27"><a href="#cb16-27" aria-hidden="true" tabindex="-1"></a>counts_spacer_host <span class="ot">&lt;-</span> counts_spacer <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(Spacer_sequence, category) <span class="sc">%&gt;%</span> <span class="fu">summarise</span>(<span class="at">n =</span> <span class="fu">sum</span>(n)) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(n))</span>
-<span id="cb16-28"><a href="#cb16-28" aria-hidden="true" tabindex="-1"></a>counts_spacer_hostless <span class="ot">&lt;-</span> counts_spacer <span class="sc">%&gt;%</span> <span class="fu">group_by</span>(Spacer_sequence) <span class="sc">%&gt;%</span> <span class="fu">summarise</span>(<span class="at">n =</span> <span class="fu">sum</span>(n)) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(n))</span>
-<span id="cb16-29"><a href="#cb16-29" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-30"><a href="#cb16-30" aria-hidden="true" tabindex="-1"></a>merged_data <span class="ot">&lt;-</span> <span class="fu">rename</span>(merged_data, <span class="fu">c</span>(<span class="st">"scientific_name"</span> <span class="ot">=</span> <span class="st">"ENA_species"</span>, <span class="st">"Species"</span> <span class="ot">=</span> <span class="st">"ATB_species"</span>))</span>
-<span id="cb16-31"><a href="#cb16-31" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-32"><a href="#cb16-32" aria-hidden="true" tabindex="-1"></a>merged_data_NAless <span class="ot">&lt;-</span> merged_data <span class="sc">%&gt;%</span> <span class="fu">filter</span>(<span class="sc">!</span><span class="fu">is.na</span>(category))</span>
-<span id="cb16-33"><a href="#cb16-33" aria-hidden="true" tabindex="-1"></a>counts_spacer_NAless <span class="ot">&lt;-</span> counts_spacer_host <span class="sc">%&gt;%</span> <span class="fu">filter</span>(<span class="sc">!</span><span class="fu">is.na</span>(category)) <span class="sc">%&gt;%</span> <span class="fu">arrange</span>(<span class="fu">desc</span>(n))</span>
-<span id="cb16-34"><a href="#cb16-34" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-35"><a href="#cb16-35" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb16-36"><a href="#cb16-36" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(counts_spacer_NAless)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code># A tibble: 6 × 3
-# Groups:   Spacer_sequence [6]
-  Spacer_sequence                category                   n
-  &lt;chr&gt;                          &lt;chr&gt;                  &lt;int&gt;
-1 AATAATGGCTAAATATTTCATGAGAATGGA meat producing poultry   479
-2 AACCGCCAAGCTCTTTTAAAAACTGCCATA meat producing poultry   231
-3 GCTTTAGGAAATGCTTTAAAACGCTTTGGA meat producing poultry   208
-4 GTTTCTTTCTTATTTACTCTATACTCTAAA meat producing poultry   206
-5 CAATGTTTTGTCAAGTTTCAAGCGAAGGCG meat producing poultry   184
-6 AAACTTTTTACAGCTTTGTAGAATATATAA meat producing poultry   175</code></pre>
-</div>
-</div>
-<p>As described in <em>Descriptive_statistics</em>, many spacers detected by CCtyper were similar to each other. Some were reverse complements of each other or were identical except for additional nucleotides in one. Additionally some spacers have high similarity to each other with minor mutations. This table above has accounted for reverse complements and merged them into the count, however finding a solution for the other similarities has not been successful so far.<br>
-<br>
-</p>
-</section>
-</section>
-
-</main>
-<!-- /main column -->
-<script id="quarto-html-after-body" type="application/javascript">
-window.document.addEventListener("DOMContentLoaded", function (event) {
-  const toggleBodyColorMode = (bsSheetEl) => {
-    const mode = bsSheetEl.getAttribute("data-mode");
-    const bodyEl = window.document.querySelector("body");
-    if (mode === "dark") {
-      bodyEl.classList.add("quarto-dark");
-      bodyEl.classList.remove("quarto-light");
-    } else {
-      bodyEl.classList.add("quarto-light");
-      bodyEl.classList.remove("quarto-dark");
-    }
-  }
-  const toggleBodyColorPrimary = () => {
-    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
-    if (bsSheetEl) {
-      toggleBodyColorMode(bsSheetEl);
-    }
-  }
-  toggleBodyColorPrimary();  
-  const icon = "";
-  const anchorJS = new window.AnchorJS();
-  anchorJS.options = {
-    placement: 'right',
-    icon: icon
-  };
-  anchorJS.add('.anchored');
-  const isCodeAnnotation = (el) => {
-    for (const clz of el.classList) {
-      if (clz.startsWith('code-annotation-')) {                     
-        return true;
-      }
-    }
-    return false;
-  }
-  const onCopySuccess = function(e) {
-    // button target
-    const button = e.trigger;
-    // don't keep focus
-    button.blur();
-    // flash "checked"
-    button.classList.add('code-copy-button-checked');
-    var currentTitle = button.getAttribute("title");
-    button.setAttribute("title", "Copied!");
-    let tooltip;
-    if (window.bootstrap) {
-      button.setAttribute("data-bs-toggle", "tooltip");
-      button.setAttribute("data-bs-placement", "left");
-      button.setAttribute("data-bs-title", "Copied!");
-      tooltip = new bootstrap.Tooltip(button, 
-        { trigger: "manual", 
-          customClass: "code-copy-button-tooltip",
-          offset: [0, -8]});
-      tooltip.show();    
-    }
-    setTimeout(function() {
-      if (tooltip) {
-        tooltip.hide();
-        button.removeAttribute("data-bs-title");
-        button.removeAttribute("data-bs-toggle");
-        button.removeAttribute("data-bs-placement");
-      }
-      button.setAttribute("title", currentTitle);
-      button.classList.remove('code-copy-button-checked');
-    }, 1000);
-    // clear code selection
-    e.clearSelection();
-  }
-  const getTextToCopy = function(trigger) {
-      const codeEl = trigger.previousElementSibling.cloneNode(true);
-      for (const childEl of codeEl.children) {
-        if (isCodeAnnotation(childEl)) {
-          childEl.remove();
-        }
-      }
-      return codeEl.innerText;
-  }
-  const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
-    text: getTextToCopy
-  });
-  clipboard.on('success', onCopySuccess);
-  if (window.document.getElementById('quarto-embedded-source-code-modal')) {
-    // For code content inside modals, clipBoardJS needs to be initialized with a container option
-    // TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
-    const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
-      text: getTextToCopy,
-      container: window.document.getElementById('quarto-embedded-source-code-modal')
-    });
-    clipboardModal.on('success', onCopySuccess);
-  }
-    var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
-    var mailtoRegex = new RegExp(/^mailto:/);
-      var filterRegex = new RegExp('/' + window.location.host + '/');
-    var isInternal = (href) => {
-        return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
-    }
-    // Inspect non-navigation links and adorn them if external
- 	var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
-    for (var i=0; i<links.length; i++) {
-      const link = links[i];
-      if (!isInternal(link.href)) {
-        // undo the damage that might have been done by quarto-nav.js in the case of
-        // links that we want to consider external
-        if (link.dataset.originalHref !== undefined) {
-          link.href = link.dataset.originalHref;
-        }
-      }
-    }
-  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
-    const config = {
-      allowHTML: true,
-      maxWidth: 500,
-      delay: 100,
-      arrow: false,
-      appendTo: function(el) {
-          return el.parentElement;
-      },
-      interactive: true,
-      interactiveBorder: 10,
-      theme: 'quarto',
-      placement: 'bottom-start',
-    };
-    if (contentFn) {
-      config.content = contentFn;
-    }
-    if (onTriggerFn) {
-      config.onTrigger = onTriggerFn;
-    }
-    if (onUntriggerFn) {
-      config.onUntrigger = onUntriggerFn;
-    }
-    window.tippy(el, config); 
-  }
-  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
-  for (var i=0; i<noterefs.length; i++) {
-    const ref = noterefs[i];
-    tippyHover(ref, function() {
-      // use id or data attribute instead here
-      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
-      try { href = new URL(href).hash; } catch {}
-      const id = href.replace(/^#\/?/, "");
-      const note = window.document.getElementById(id);
-      if (note) {
-        return note.innerHTML;
-      } else {
-        return "";
-      }
-    });
-  }
-  const xrefs = window.document.querySelectorAll('a.quarto-xref');
-  const processXRef = (id, note) => {
-    // Strip column container classes
-    const stripColumnClz = (el) => {
-      el.classList.remove("page-full", "page-columns");
-      if (el.children) {
-        for (const child of el.children) {
-          stripColumnClz(child);
-        }
-      }
-    }
-    stripColumnClz(note)
-    if (id === null || id.startsWith('sec-')) {
-      // Special case sections, only their first couple elements
-      const container = document.createElement("div");
-      if (note.children && note.children.length > 2) {
-        container.appendChild(note.children[0].cloneNode(true));
-        for (let i = 1; i < note.children.length; i++) {
-          const child = note.children[i];
-          if (child.tagName === "P" && child.innerText === "") {
-            continue;
-          } else {
-            container.appendChild(child.cloneNode(true));
-            break;
-          }
-        }
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(container);
-        }
-        return container.innerHTML
-      } else {
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(note);
-        }
-        return note.innerHTML;
-      }
-    } else {
-      // Remove any anchor links if they are present
-      const anchorLink = note.querySelector('a.anchorjs-link');
-      if (anchorLink) {
-        anchorLink.remove();
-      }
-      if (window.Quarto?.typesetMath) {
-        window.Quarto.typesetMath(note);
-      }
-      // TODO in 1.5, we should make sure this works without a callout special case
-      if (note.classList.contains("callout")) {
-        return note.outerHTML;
-      } else {
-        return note.innerHTML;
-      }
-    }
-  }
-  for (var i=0; i<xrefs.length; i++) {
-    const xref = xrefs[i];
-    tippyHover(xref, undefined, function(instance) {
-      instance.disable();
-      let url = xref.getAttribute('href');
-      let hash = undefined; 
-      if (url.startsWith('#')) {
-        hash = url;
-      } else {
-        try { hash = new URL(url).hash; } catch {}
-      }
-      if (hash) {
-        const id = hash.replace(/^#\/?/, "");
-        const note = window.document.getElementById(id);
-        if (note !== null) {
-          try {
-            const html = processXRef(id, note.cloneNode(true));
-            instance.setContent(html);
-          } finally {
-            instance.enable();
-            instance.show();
-          }
-        } else {
-          // See if we can fetch this
-          fetch(url.split('#')[0])
-          .then(res => res.text())
-          .then(html => {
-            const parser = new DOMParser();
-            const htmlDoc = parser.parseFromString(html, "text/html");
-            const note = htmlDoc.getElementById(id);
-            if (note !== null) {
-              const html = processXRef(id, note);
-              instance.setContent(html);
-            } 
-          }).finally(() => {
-            instance.enable();
-            instance.show();
-          });
-        }
-      } else {
-        // See if we can fetch a full url (with no hash to target)
-        // This is a special case and we should probably do some content thinning / targeting
-        fetch(url)
-        .then(res => res.text())
-        .then(html => {
-          const parser = new DOMParser();
-          const htmlDoc = parser.parseFromString(html, "text/html");
-          const note = htmlDoc.querySelector('main.content');
-          if (note !== null) {
-            // This should only happen for chapter cross references
-            // (since there is no id in the URL)
-            // remove the first header
-            if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
-              note.children[0].remove();
-            }
-            const html = processXRef(null, note);
-            instance.setContent(html);
-          } 
-        }).finally(() => {
-          instance.enable();
-          instance.show();
-        });
-      }
-    }, function(instance) {
-    });
-  }
-      let selectedAnnoteEl;
-      const selectorForAnnotation = ( cell, annotation) => {
-        let cellAttr = 'data-code-cell="' + cell + '"';
-        let lineAttr = 'data-code-annotation="' +  annotation + '"';
-        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
-        return selector;
-      }
-      const selectCodeLines = (annoteEl) => {
-        const doc = window.document;
-        const targetCell = annoteEl.getAttribute("data-target-cell");
-        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
-        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
-        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
-        const lineIds = lines.map((line) => {
-          return targetCell + "-" + line;
-        })
-        let top = null;
-        let height = null;
-        let parent = null;
-        if (lineIds.length > 0) {
-            //compute the position of the single el (top and bottom and make a div)
-            const el = window.document.getElementById(lineIds[0]);
-            top = el.offsetTop;
-            height = el.offsetHeight;
-            parent = el.parentElement.parentElement;
-          if (lineIds.length > 1) {
-            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
-            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
-            height = bottom - top;
-          }
-          if (top !== null && height !== null && parent !== null) {
-            // cook up a div (if necessary) and position it 
-            let div = window.document.getElementById("code-annotation-line-highlight");
-            if (div === null) {
-              div = window.document.createElement("div");
-              div.setAttribute("id", "code-annotation-line-highlight");
-              div.style.position = 'absolute';
-              parent.appendChild(div);
-            }
-            div.style.top = top - 2 + "px";
-            div.style.height = height + 4 + "px";
-            div.style.left = 0;
-            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
-            if (gutterDiv === null) {
-              gutterDiv = window.document.createElement("div");
-              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
-              gutterDiv.style.position = 'absolute';
-              const codeCell = window.document.getElementById(targetCell);
-              const gutter = codeCell.querySelector('.code-annotation-gutter');
-              gutter.appendChild(gutterDiv);
-            }
-            gutterDiv.style.top = top - 2 + "px";
-            gutterDiv.style.height = height + 4 + "px";
-          }
-          selectedAnnoteEl = annoteEl;
-        }
-      };
-      const unselectCodeLines = () => {
-        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
-        elementsIds.forEach((elId) => {
-          const div = window.document.getElementById(elId);
-          if (div) {
-            div.remove();
-          }
-        });
-        selectedAnnoteEl = undefined;
-      };
-        // Handle positioning of the toggle
-    window.addEventListener(
-      "resize",
-      throttle(() => {
-        elRect = undefined;
-        if (selectedAnnoteEl) {
-          selectCodeLines(selectedAnnoteEl);
-        }
-      }, 10)
-    );
-    function throttle(fn, ms) {
-    let throttle = false;
-    let timer;
-      return (...args) => {
-        if(!throttle) { // first call gets through
-            fn.apply(this, args);
-            throttle = true;
-        } else { // all the others get throttled
-            if(timer) clearTimeout(timer); // cancel #2
-            timer = setTimeout(() => {
-              fn.apply(this, args);
-              timer = throttle = false;
-            }, ms);
-        }
-      };
-    }
-      // Attach click handler to the DT
-      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
-      for (const annoteDlNode of annoteDls) {
-        annoteDlNode.addEventListener('click', (event) => {
-          const clickedEl = event.target;
-          if (clickedEl !== selectedAnnoteEl) {
-            unselectCodeLines();
-            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
-            if (activeEl) {
-              activeEl.classList.remove('code-annotation-active');
-            }
-            selectCodeLines(clickedEl);
-            clickedEl.classList.add('code-annotation-active');
-          } else {
-            // Unselect the line
-            unselectCodeLines();
-            clickedEl.classList.remove('code-annotation-active');
-          }
-        });
-      }
-  const findCites = (el) => {
-    const parentEl = el.parentElement;
-    if (parentEl) {
-      const cites = parentEl.dataset.cites;
-      if (cites) {
-        return {
-          el,
-          cites: cites.split(' ')
-        };
-      } else {
-        return findCites(el.parentElement)
-      }
-    } else {
-      return undefined;
-    }
-  };
-  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
-  for (var i=0; i<bibliorefs.length; i++) {
-    const ref = bibliorefs[i];
-    const citeInfo = findCites(ref);
-    if (citeInfo) {
-      tippyHover(citeInfo.el, function() {
-        var popup = window.document.createElement('div');
-        citeInfo.cites.forEach(function(cite) {
-          var citeDiv = window.document.createElement('div');
-          citeDiv.classList.add('hanging-indent');
-          citeDiv.classList.add('csl-entry');
-          var biblioDiv = window.document.getElementById('ref-' + cite);
-          if (biblioDiv) {
-            citeDiv.innerHTML = biblioDiv.innerHTML;
-          }
-          popup.appendChild(citeDiv);
-        });
-        return popup.innerHTML;
-      });
-    }
-  }
-});
-</script>
-</div> <!-- /content -->
-
-
-
-
-</body></html>
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
new file mode 100644
index 0000000..dc3677c
--- /dev/null
+++ b/workflow/Snakefile
@@ -0,0 +1,109 @@
+"""
+Author: Sam Nooij
+Organisation: Utrecht University
+Department: Clinical Infectiology (KLIF), Infectious Diseases & Immunology,
+  Biomolecular Health Sciences, Faculty of Veterinary Medicine
+Date: 2024-11-06
+
+Workflow for testing CRISPR analysis options
+In contrast to the 'regular' Snakefile workflow, which works
+per genome file, this workflow works per batch and runs GNU
+parallel to parallelise processing of the genomes within
+each batch.
+
+
+Input: Fasta files of Campylobacter whole-genomes
+Output: (various)
+
+Example use:
+    $ snakemake --profile config
+
+N.B. Variables are set in the configuration files under `config`.
+"""
+
+# Step 1: Load configuration
+# ---------------------------------------------------------
+
+
+configfile: Path("config/parameters.yaml")
+
+
+# Log messages
+onstart:
+    print("\n--- Starting analysis! ---\n")
+
+
+onsuccess:
+    print("\n --- Workflow finished! ---\n")
+
+
+onerror:
+    print("\n --- An error occurred! ---\n")
+
+
+# Step 2: Load rules that do the work
+# ---------------------------------------------------------
+include: "rules/helper_rules.smk"
+include: "rules/screen_crispr-cas.smk"
+include: "rules/refine_crispr-cas.smk"
+include: "rules/classify_genomes.smk"
+include: "rules/identify_defences.smk"
+include: "rules/map_spacers.smk"
+
+
+# Step 3: specify target outputs
+# ---------------------------------------------------------
+
+
+rule all:
+    input:
+        ## Extra outputs (non-CRISPR)
+        # Multilocus Sequence Types (ST) for Campylobacter
+        "results/mlst_table.tsv",
+        # Virus and plasmid predictions per contig
+        "results/genomad_predictions.csv",
+        "results/jaeger_predictions.csv",
+        # Phage defence systems
+        "results/padloc_table.csv",
+        ## CRISPR-related outputs: 1. pre-screening
+        # Concatenated CCTyper output (CRISPR arrays and spacers)
+        expand(
+            "results/cctyper/{batch}/{filename}-{batch}.tab",
+            batch=BATCHES,
+            filename=[
+                "CRISPR_Cas",
+                "crisprs_all",
+                "crisprs_near_cas",
+                "crisprs_orphan",
+                "cas_operons",
+            ],
+        ),
+        expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
+        # CCTyper CRISPR spacer cluster analysis report
+        "results/cctyper/spacer_cluster_summary.tsv",
+        # Cluster unique CRISPR spacers
+        "results/cctyper/all_spacers-clustered.clstr",
+        "results/all_spacers_table.tsv",
+        # CRISPR output 2. CRISPRidentify output (refinement)
+        expand(
+            "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+            batch=BATCHES,
+        ),
+        expand(
+            "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+            batch=BATCHES,
+        ),
+        # Concatenated CRISPRidentify output
+        "results/crispridentify/complete_summary.csv",
+        "results/crispridentify/all_spacers.fa",
+        "results/crispridentify/all_spacers-clustered.clstr",
+        "results/all_spacers_table_identify.tsv",
+        # Merged CRISPRidentify and CCTyper output
+        "results/all_CRISPRS_with_identify.tab",
+        # SpacePHARER output (spacer targets, putative protospacers)
+        "results/phage_matches.tsv",
+        "results/plasmid_matches.tsv",
+        # KMA output (mapping spacers to input Campy genomes)
+        "results/kma/output/CRISPR.frag.gz",
+        "results/kma/CRISPR_alignment",
+    default_target: True
diff --git a/workflow/envs/bash.yaml b/workflow/envs/bash.yaml
new file mode 100644
index 0000000..196651f
--- /dev/null
+++ b/workflow/envs/bash.yaml
@@ -0,0 +1,8 @@
+name: bash
+
+dependencies:
+ - bash=5.1.16
+ - parallel=20251122
+
+channels:
+ - conda-forge
diff --git a/envs/cctyper.yaml b/workflow/envs/cctyper.yaml
similarity index 100%
rename from envs/cctyper.yaml
rename to workflow/envs/cctyper.yaml
diff --git a/envs/cdhit.yaml b/workflow/envs/cdhit.yaml
similarity index 100%
rename from envs/cdhit.yaml
rename to workflow/envs/cdhit.yaml
diff --git a/workflow/envs/crispridentify.yaml b/workflow/envs/crispridentify.yaml
new file mode 100644
index 0000000..cc5fef9
--- /dev/null
+++ b/workflow/envs/crispridentify.yaml
@@ -0,0 +1,48 @@
+name: crispr_identify_env
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+  - biobuilds
+  - r
+  - axfeh
+dependencies:
+  - python==3.7.6
+  - pip
+  - python_abi=3.7
+  - biopython=1.76
+  - h5py=2.10.0
+  - hdf5=1.10.6
+  - hmmer=3.3
+  - numpy==1.18.1
+  - pandas=1.0.3
+  - matplotlib=3.1.3
+  - perl=5.26.2
+  - perl-threaded=5.26.0
+  - perl-yaml=1.29
+  - prodigal==2.6.3
+  - dill=0.3.3
+  - protobuf=3.13.0.1
+  - regex=2019.03.09
+  - pyasn1=0.4.8
+  - pycparser=2.20
+  - networkx=2.5
+  - pyjwt=1.7.1
+  - pyparsing=2.4.7
+  - pyqt=5.9.2
+  - pysocks=1.7.1
+  - python-dateutil=2.8.1
+  - pytz=2020.1
+  - pyyaml=5.3.1
+  - scikit-learn==0.22.1
+  - scipy=1.4.1
+  - requests=2.28.1
+  - viennarna==2.4.15
+  - pyopenssl=22.0.0
+  - certifi=2022.12.7
+  - vmatch==2.3.0
+  - clustalo==1.2.3
+  - blast==2.5.0
+  - libffi=3.2.1
+  - pip:
+    - python-Levenshtein
diff --git a/envs/genomad.yaml b/workflow/envs/genomad.yaml
similarity index 100%
rename from envs/genomad.yaml
rename to workflow/envs/genomad.yaml
diff --git a/envs/jaeger.yaml b/workflow/envs/jaeger.yaml
similarity index 100%
rename from envs/jaeger.yaml
rename to workflow/envs/jaeger.yaml
diff --git a/envs/kma.yaml b/workflow/envs/kma.yaml
similarity index 100%
rename from envs/kma.yaml
rename to workflow/envs/kma.yaml
diff --git a/envs/padloc.yaml b/workflow/envs/padloc.yaml
similarity index 100%
rename from envs/padloc.yaml
rename to workflow/envs/padloc.yaml
diff --git a/envs/pandas.yaml b/workflow/envs/pandas.yaml
similarity index 100%
rename from envs/pandas.yaml
rename to workflow/envs/pandas.yaml
diff --git a/envs/pyfaidx.yaml b/workflow/envs/pyfaidx_pandas.yaml
similarity index 66%
rename from envs/pyfaidx.yaml
rename to workflow/envs/pyfaidx_pandas.yaml
index e7dc681..b1d3a92 100644
--- a/envs/pyfaidx.yaml
+++ b/workflow/envs/pyfaidx_pandas.yaml
@@ -1,4 +1,4 @@
-name: pyfaidx
+name: pyfaidx_pandas
 
 channels:
  - conda-forge
@@ -6,3 +6,4 @@ channels:
 
 dependencies:
  - pyfaidx=0.8.1.3
+ - pandas=2.2.3
diff --git a/envs/pymlst.yaml b/workflow/envs/pymlst.yaml
similarity index 100%
rename from envs/pymlst.yaml
rename to workflow/envs/pymlst.yaml
diff --git a/envs/seqkit.yaml b/workflow/envs/seqkit.yaml
similarity index 100%
rename from envs/seqkit.yaml
rename to workflow/envs/seqkit.yaml
diff --git a/envs/spacepharer.yml b/workflow/envs/spacepharer.yaml
similarity index 100%
rename from envs/spacepharer.yml
rename to workflow/envs/spacepharer.yaml
diff --git a/envs/tidy_here.yaml b/workflow/envs/tidy_here.yaml
similarity index 100%
rename from envs/tidy_here.yaml
rename to workflow/envs/tidy_here.yaml
diff --git a/results/Assess_clustered_spacers.qmd b/workflow/notebooks/Assess_clustered_spacers.qmd
similarity index 100%
rename from results/Assess_clustered_spacers.qmd
rename to workflow/notebooks/Assess_clustered_spacers.qmd
diff --git a/results/Descriptive_statistics.qmd b/workflow/notebooks/Descriptive_statistics.qmd
similarity index 100%
rename from results/Descriptive_statistics.qmd
rename to workflow/notebooks/Descriptive_statistics.qmd
diff --git a/results/Metadata_and_spacer_summary.qmd b/workflow/notebooks/Metadata_and_spacer_summary.qmd
similarity index 100%
rename from results/Metadata_and_spacer_summary.qmd
rename to workflow/notebooks/Metadata_and_spacer_summary.qmd
diff --git a/workflow/rules/classify_genomes.smk b/workflow/rules/classify_genomes.smk
new file mode 100644
index 0000000..c92603a
--- /dev/null
+++ b/workflow/rules/classify_genomes.smk
@@ -0,0 +1,215 @@
+### Classify genomes
+## 1: determine multilocus sequence type (MLST)
+
+
+rule download_mlst_database:
+    output:
+        "resources/mlst/campylobacter.db",
+    params:
+        species=config["mlst"]["species"],
+    conda:
+        "../envs/pymlst.yaml"
+    threads: 1
+    log:
+        "log/download_mlst_database.txt",
+    benchmark:
+        "log/benchmark/download_mlst_database.txt"
+    shell:
+        """
+claMLST import -r pubmlst --no-prompt {output} {params.species} > {log} 2>&1
+        """
+
+
+rule mlst:
+    input:
+        batch="resources/ATB/assemblies/{batch}/",
+        db="resources/mlst/campylobacter.db",
+    output:
+        "results/mlst/{batch}/complete",
+    conda:
+        "../envs/pymlst.yaml"
+    threads: config["mlst"]["threads"]
+    log:
+        "log/mlst/{batch}.txt",
+    benchmark:
+        "log/benchmark/mlst/{batch}.txt"
+    shell:
+        """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ claMLST search {input.db} {{}} -o "$(dirname {output})/{{/.}}.txt" > {log} 2>&1
+
+touch {output}
+        """
+
+
+rule concatenate_mlst_batches:
+    input:
+        "results/mlst/{batch}/complete",
+    output:
+        "results/mlst/{batch}-concatenated.tsv",
+    conda:
+        "../envs/bash.yaml"
+    threads: config["mlst"]["threads"]
+    log:
+        "log/concatenate_mlst/{batch}.txt",
+    benchmark:
+        "log/benchmark/concatenate_mlst/{batch}.txt"
+    shell:
+        """
+echo -e "Genome\tST" > {output}  # header line; the per-genome rows are appended below
+find $(dirname {input}) -mindepth 1 -maxdepth 1 -type f -name "*.txt" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'tail -n +2 {{}} | cut -f 1-2' >> {output} 2> {log}  # single writer: jobs no longer append concurrently
+        """
+
+
+rule concatenate_mlst_all:
+    input:
+        expand("results/mlst/{batch}-concatenated.tsv", batch=BATCHES),
+    output:
+        "results/mlst_table.tsv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/concatenate_mlst_all.txt",
+    benchmark:
+        "log/benchmark/concatenate_mlst_all.txt"
+    shell:
+        """
+batches=( {input} )
+head -1 ${{batches[0]}} > {output}
+sed --separate 1d ${{batches[@]}} >> {output}
+        """
+
+
+## 2. Identify whether contigs derive from a chromosome, plasmid or virus
+# Using both geNomad (chromosome/plasmid/virus)
+
+
+rule download_genomad_database:
+    output:
+        db=directory("resources/genomad_db"),
+    conda:
+        "../envs/genomad.yaml"
+    threads: 1
+    log:
+        "log/download_genomad_database.txt",
+    benchmark:
+        "log/benchmark/download_genomad_database.txt"
+    shell:
+        """
+mkdir -p $(dirname {output.db})
+genomad download-database $(dirname {output.db}) > {log} 2>&1
+        """
+
+
+rule genomad:
+    input:
+        fasta="resources/ATB/assemblies/{batch}.fasta",
+        db="resources/genomad_db",
+    output:
+        aggregated_classification="results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
+        plasmid_summary="results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
+        virus_summary="results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
+    params:
+        work_dir=subpath(output.aggregated_classification, ancestor=2),
+    conda:
+        "../envs/genomad.yaml"
+    threads: config["genomad"]["threads"]
+    log:
+        "log/genomad/{batch}.txt",
+    benchmark:
+        "log/benchmark/genomad/{batch}.txt"
+    shell:
+        """
+genomad end-to-end -t {threads} --cleanup --enable-score-calibration\
+ {input.fasta} {params.work_dir} {input.db} > {log} 2>&1
+        """
+
+
+rule collect_genomad_predictions:
+    input:
+        aggregated_classification=expand(
+            "results/genomad/{batch}/{batch}_aggregated_classification/{batch}_aggregated_classification.tsv",
+            batch=BATCHES,
+        ),
+        plasmid_summary=expand(
+            "results/genomad/{batch}/{batch}_summary/{batch}_plasmid_summary.tsv",
+            batch=BATCHES,
+        ),
+        virus_summary=expand(
+            "results/genomad/{batch}/{batch}_summary/{batch}_virus_summary.tsv",
+            batch=BATCHES,
+        ),
+    output:
+        "results/genomad_predictions.csv",
+    conda:
+        "../envs/tidy_here.yaml"
+    threads: 1
+    log:
+        "log/collect_genomad_predictions.txt",
+    benchmark:
+        "log/benchmark/collect_genomad_predictions.txt"
+    script:
+        "../scripts/collect_genomad_predictions.R"
+
+
+# And Jaeger (virus (phage/prophage) or not)
+
+
+rule jaeger:
+    input:
+        batch="resources/ATB/assemblies/{batch}/",
+    output:
+        "results/jaeger/{batch}/complete",
+    conda:
+        "../envs/jaeger.yaml"
+    threads: config["jaeger"]["threads"]
+    log:
+        "log/jaeger/{batch}.txt",
+    benchmark:
+        "log/benchmark/jaeger/{batch}.txt"
+    shell:
+        """
+parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
+ jaeger run -p --workers 1 -i {{}} -o $(dirname {output}) --overwrite\
+ > {log} 2>&1 ::: {input.batch}/*.fa
+
+touch {output}
+        """
+
+
+rule collect_jaeger_batch:
+    input:
+        "results/jaeger/{batch}/complete",
+    output:
+        "results/jaeger/{batch}/jaeger-{batch}.csv",
+    params:
+        batch=subpath(output[0], parent=True),
+    conda:
+        "../envs/tidy_here.yaml"
+    threads: 1
+    log:
+        "log/collect_jaeger_{batch}.txt",
+    benchmark:
+        "log/benchmark/collect_jaeger_{batch}.txt"
+    script:
+        "../scripts/collect_jaeger_batch.R"
+
+
+rule collect_jaeger_predictions:
+    input:
+        expand("results/jaeger/{batch}/jaeger-{batch}.csv", batch=BATCHES),
+    output:
+        "results/jaeger_predictions.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/collect_jaeger_predictions.txt",
+    benchmark:
+        "log/benchmark/collect_jaeger_predictions.txt"
+    script:
+        "../scripts/collect_jaeger_predictions.sh"
diff --git a/workflow/rules/helper_rules.smk b/workflow/rules/helper_rules.smk
new file mode 100644
index 0000000..558ecb1
--- /dev/null
+++ b/workflow/rules/helper_rules.smk
@@ -0,0 +1,39 @@
+## General helper functions: define input and output
+from pathlib import Path
+
+# Use Python functions to automatically detect batches of genome fasta files
+# in the input directory as 'BATCHES'
+BATCH_PATHS = list(Path("resources/ATB/assemblies").glob("atb.assembly.*"))
+
+# Make sure there is at least one batch directory:
+assert len(BATCH_PATHS) > 0, (
+    "-- No input (batch) directories found in resources/ATB/assemblies.\n"
+    "Please run the script bin/prepare_genomes.sh to prepare input. --\n"
+)
+
+# And make sure they are actually directories
+for batch in BATCH_PATHS:
+    assert Path(batch).is_dir(), f"-- Batches must be directories, got {batch} --"
+
+BATCHES = [batch.name for batch in BATCH_PATHS]
+
+
+## Helper rules (not fitting any particular goal)
+
+
+rule concatenate_batches:
+    input:
+        "resources/ATB/assemblies/{batch}",
+    output:
+        temp("resources/ATB/assemblies/{batch}.fasta"),
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/concatenate_{batch}.txt",
+    benchmark:
+        "log/benchmark/concatenate_{batch}.txt"
+    shell:
+        """
+cat {input}/*.fa > {output} 2> {log}
+        """
diff --git a/workflow/rules/identify_defences.smk b/workflow/rules/identify_defences.smk
new file mode 100644
index 0000000..6923b08
--- /dev/null
+++ b/workflow/rules/identify_defences.smk
@@ -0,0 +1,89 @@
+## Identify anti-phage defence systems using PADLOC
+
+
+rule download_padloc_database:
+    output:
+        directory=directory("resources/padloc_db"),
+        cm=directory("resources/padloc_db/cm"),
+        cm_meta="resources/padloc_db/cm_meta.txt",
+        hmm=directory("resources/padloc_db/hmm"),
+        hmm_meta="resources/padloc_db/hmm_meta.txt",
+        sys=directory("resources/padloc_db/sys"),
+        sys_meta="resources/padloc_db/sys_meta.txt",
+        system_info="resources/padloc_db/system_info.md",
+    conda:
+        "../envs/padloc.yaml"
+    threads: 1
+    log:
+        "log/download_padloc_database.txt",
+    benchmark:
+        "log/benchmark/download_padloc_database.txt"
+    shell:
+        """
+padloc --data resources/padloc_db --db-install v2.0.0 > {log} 2>&1
+        """
+
+
+rule padloc:
+    input:
+        batch="resources/ATB/assemblies/{batch}/",
+        db="resources/padloc_db",
+    output:
+        "results/padloc/{batch}/complete",
+    conda:
+        "../envs/padloc.yaml"
+    threads: config["padloc"]["threads"]
+    log:
+        "log/padloc/{batch}.txt",
+    benchmark:
+        "log/benchmark/padloc/{batch}.txt"
+    shell:
+        """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'mkdir -p "$(dirname {output})/{{/.}}" && padloc --data {input.db}\
+  --cpu 1 --fna {{}} --outdir "$(dirname {output})/{{/.}}"' > {log} 2>&1
+
+touch {output}
+        """
+
+
+rule concatenate_padloc_batches:
+    input:
+        "results/padloc/{batch}/complete",
+    output:
+        "results/padloc/{batch}-concatenated.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: config["padloc"]["threads"]
+    log:
+        "log/concatenate_padloc/{batch}.txt",
+    benchmark:
+        "log/benchmark/concatenate_padloc/{batch}.txt"
+    shell:
+        """
+file_array=( $(find $(dirname {input}) -mindepth 2 -maxdepth 2 -type f -name "*_padloc.csv") )  # per-genome tables
+head -1 ${{file_array[0]}} > {output}  # reuse the header of the first table found
+parallel --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'tail -n +2 {{}}' ::: ${{file_array[@]}} >> {output} 2> {log}  # single writer: jobs no longer append concurrently
+        """
+
+
+rule concatenate_padloc_all:
+    input:
+        expand("results/padloc/{batch}-concatenated.csv", batch=BATCHES),
+    output:
+        "results/padloc_table.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/concatenate_padloc_all.txt",
+    benchmark:
+        "log/benchmark/concatenate_padloc_all.txt"
+    shell:
+        """
+batches=( {input} )
+head -1 ${{batches[0]}} > {output}
+sed --separate 1d ${{batches[@]}} >> {output}
+        """
diff --git a/workflow/rules/map_spacers.smk b/workflow/rules/map_spacers.smk
new file mode 100644
index 0000000..3f6414c
--- /dev/null
+++ b/workflow/rules/map_spacers.smk
@@ -0,0 +1,295 @@
+### Map spacers to putative targets/protospacers
+## 1. By using SpacePHARER
+
+
+rule spacepharer_spacer_setup:
+    input:
+        spacers="results/crispridentify/all_spacers.fa",
+    output:
+        spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+    params:
+        tmp_folder=subpath(output.spacer_DB, parent=True),
+    conda:
+        "../envs/spacepharer.yaml"
+    threads: 48
+    log:
+        "log/spacepharer/spacepharer_spacer_setup.txt",
+    benchmark:
+        "log/benchmark/spacepharer/spacepharer_spacer_setup.txt"
+    shell:
+        """
+spacer_DB=$(dirname {output.spacer_DB})
+rm -rf $spacer_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.spacers} {output.spacer_DB}\
+ "{params.tmp_folder}/tmpFolder" --extractorf-spacer 1\
+ --threads {threads} >> {log} 2>&1
+        """
+
+
+## First to bacteriophages
+
+
+rule download_phage_database:
+    output:
+        phage_dir=directory("resources/phagescope"),
+        phage_archives=expand(
+            "resources/phagescope/{database}.tar.gz",
+            database=config["PhageScope_databases"],
+        ),
+        phage_fasta=expand(
+            "resources/phagescope/{database}.fasta",
+            database=config["PhageScope_databases"],
+        ),
+        combined_meta="resources/phagescope/phagescope_metadata.tsv",
+    params:
+        databases=config["PhageScope_databases"],
+    conda:
+        "../envs/bash.yaml"
+    threads: config["download_spacepharer_databases"]["threads"]
+    log:
+        out="log/download_phage_database.out",
+        err="log/download_phage_database.err",
+    benchmark:
+        "log/benchmark/download_phage_database.txt"
+    script:
+        "../scripts/download_phage_database.sh"
+
+
+rule spacepharer_phage_setup:
+    input:
+        db=collect(
+            "resources/phagescope/{database}.fasta",
+            database=[
+                "DDBJ",
+                "EMBL",
+                "Genbank",
+                "GPD",
+                "GVD",
+                "MGV",
+                "PhagesDB",
+                "RefSeq",
+                "TemPhD",
+            ],
+        ),
+    output:
+        phage_DB="results/spacepharer/phage_DB/targetsetDB",
+        phage_control_DB="results/spacepharer/phage_DB/controlsetDB",
+    params:
+        tmp_folder=subpath(output.phage_DB, ancestor=2),
+    conda:
+        "../envs/spacepharer.yaml"
+    threads: config["spacepharer"]["threads"]
+    log:
+        "log/spacepharer/spacepharer_phage_setup.txt",
+    benchmark:
+        "log/benchmark/spacepharer/spacepharer_setup.txt"
+    shell:
+        """
+phage_DB=$(dirname {output.phage_DB})
+rm -rf $phage_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.db} {output.phage_DB}\
+ "{params.tmp_folder}/tmpFolder" --threads {threads} >> {log} 2>&1
+spacepharer createsetdb {input.db} {output.phage_control_DB}\
+ "{params.tmp_folder}/tmpFolder" --reverse-fragments 1 --threads {threads} >> {log} 2>&1
+        """
+
+
+rule spacepharer_phage:
+    input:
+        spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+        phage_DB="results/spacepharer/phage_DB/targetsetDB",
+        phage_control_DB="results/spacepharer/phage_DB/controlsetDB",
+    output:
+        result="results/spacepharer/predicted_phage_matches.tsv",
+        result_sanitised="results/spacepharer/predicted_phage_matches_san.tsv",
+    params:
+        tmp_folder="results/spacepharer/tmpFolder",
+    conda:
+        "../envs/spacepharer.yaml"
+    threads: config["spacepharer"]["threads"]
+    log:
+        "log/spacepharer/spacepharer_phage.txt",
+    benchmark:
+        "log/benchmark/spacepharer/spacepharer_phage.txt"
+    shell:
+        """
+spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\
+ {input.phage_control_DB} {output.result} {params.tmp_folder}\
+ --threads {threads} > {log} 2>&1
+
+grep -v "#" {output.result} > {output.result_sanitised}
+rm -r {params.tmp_folder} >> {log} 2>&1
+        """
+
+
+## Then also to plasmids
+
+
+rule download_plasmid_database:
+    output:
+        plasmid_dir=directory("resources/PLSDB"),
+        plasmid_fasta="resources/PLSDB/sequences.fasta",
+        plasmid_nuccore="resources/PLSDB/nuccore.csv",
+        plasmid_taxonomy="resources/PLSDB/taxonomy.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        out="log/download_plasmid_database.out",
+        err="log/download_plasmid_database.err",
+    benchmark:
+        "log/benchmark/download_plasmid_database.txt"
+    script:
+        "../scripts/download_plasmid_database.sh"
+
+
+rule spacepharer_plasmid_setup:
+    input:
+        db="resources/PLSDB/sequences.fasta",
+    output:
+        DB="results/spacepharer/plasmid_DB/targetsetDB",
+        control_DB="results/spacepharer/plasmid_DB/controlsetDB",
+    params:
+        tmp_folder="results/spacepharer/tmpFolder_plasmid",  # not shared with the phage rules, which delete their tmp folder
+    conda:
+        "../envs/spacepharer.yaml"
+    threads: config["spacepharer"]["threads"]
+    log:
+        "log/spacepharer/spacepharer_plasmid_setup.txt",
+    benchmark:
+        "log/benchmark/spacepharer/spacepharer_plasmid_setup.txt"
+    shell:
+        """
+plasmid_DB=$(dirname {output.DB})
+rm -f $plasmid_DB/* > {log} 2>&1
+
+spacepharer createsetdb {input.db} {output.DB} {params.tmp_folder}\
+ --threads {threads} >> {log} 2>&1
+spacepharer createsetdb {input.db} {output.control_DB} {params.tmp_folder}\
+ --reverse-fragments 1 --threads {threads} >> {log} 2>&1
+        """
+
+
+rule spacepharer_plasmid:
+    input:
+        phage_DB="results/spacepharer/plasmid_DB/targetsetDB",
+        phage_control_DB="results/spacepharer/plasmid_DB/controlsetDB",
+        spacer_DB="results/spacepharer/DB_CRISPR/querysetDB",
+    output:
+        result="results/spacepharer/predicted_plasmid_matches.tsv",
+        result_sanitised="results/spacepharer/predicted_plasmid_matches_san.tsv",
+    params:
+        tmp_folder="results/spacepharer/tmpFolder_plasmid",  # own tmp folder: it is removed below and must not be shared with the phage rules
+    conda:
+        "../envs/spacepharer.yaml"
+    threads: config["spacepharer"]["threads"]
+    log:
+        "log/spacepharer/spacepharer_plasmid.txt",
+    benchmark:
+        "log/benchmark/spacepharer/spacepharer_plasmid.txt"
+    shell:
+        """
+spacepharer predictmatch {input.spacer_DB} {input.phage_DB}\
+ {input.phage_control_DB} {output.result} {params.tmp_folder}\
+  --threads {threads} > {log} 2>&1
+
+grep -v "#" {output.result} > {output.result_sanitised}
+rm -r {params.tmp_folder} >> {log} 2>&1
+        """
+
+
+rule create_spacepharer_table:
+    input:
+        phage="results/spacepharer/predicted_phage_matches_san.tsv",
+        phage_meta="resources/phagescope/phagescope_metadata.tsv",  # 'combined_meta' output of download_phage_database
+        plasmid="results/spacepharer/predicted_plasmid_matches_san.tsv",
+        plasmid_nuccore="resources/PLSDB/nuccore.csv",
+        plasmid_taxonomy="resources/PLSDB/taxonomy.csv",
+    output:
+        phage="results/phage_matches.tsv",
+        plasmid="results/plasmid_matches.tsv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/create_spacepharer_table.txt",
+    script:
+        "../scripts/create_spacepharer_table.sh"
+
+
+## 2. By using KMA to the input genomes
+
+
+rule kma_indexing:
+    input:
+        spacers="results/crispridentify/all_spacers.fa",
+    output:
+        indexed_spacers="results/kma/spacer_DB/spacers.name",
+    params:
+        "results/kma/spacer_DB/spacers",
+    conda:
+        "../envs/kma.yaml"
+    threads: config["kma"]["threads"]
+    log:
+        "log/kma/kma_index.txt",
+    benchmark:
+        "log/benchmark/kma/kma_index.txt"
+    shell:
+        """
+kma index -i {input.spacers} -o {params} > {log} 2>&1
+        """
+
+
+rule kma:
+    input:
+        genomes=expand("resources/ATB/assemblies/{batch}/", batch=BATCHES),
+        indexed_spacers="results/kma/spacer_DB/spacers.name",
+    output:
+        "results/kma/output/CRISPR.frag.gz",
+    params:
+        output=subpath(output[0], parent=True),  # directory; the 'CRISPR' file prefix is appended in the command
+        indexed_spacers=subpath(input.indexed_spacers, parent=True),
+        spacers="results/crispridentify/all_spacers.fa",
+    conda:
+        "../envs/kma.yaml"
+    threads: config["kma"]["threads"]
+    log:
+        "log/kma/kma.txt",
+    benchmark:
+        "log/benchmark/kma/kma.txt"
+    shell:
+        """
+grep ">" {params.spacers} | cut -f 2 -d ">" | cut -f 1 -d "-" | sort -u > tmp_file
+find -L {input.genomes} -mindepth 1 -maxdepth 1 -type f -name "*.fa" > all_genomes.txt
+genomes=$(grep -x ".*[0-9]\\.fa" all_genomes.txt | grep -v -f tmp_file)
+
+kma -hmm -i $genomes -o "{params.output}/CRISPR" -t_db "{params.indexed_spacers}/spacers" > {log} 2>&1
+rm tmp_file all_genomes.txt
+        """
+
+
+rule collect_kma:
+    input:
+        "results/kma/output/CRISPR.frag.gz",
+    output:
+        "results/kma/CRISPR_alignment",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/kma/collect_kma.txt",
+    benchmark:
+        "log/benchmark/kma/collect_kma.txt"
+    shell:
+        """
+echo -e "spacer\tgenome" > {output}
+zcat {input} | cut -f 6,7 | cut -f 1 -d " " > tmp_file
+while read -r line; do
+    match=$(echo "$line" | cut -f 2)
+    crispr=$(echo "$line" | cut -f 1 | cut -f 1,6,7,10,11 -d "_")
+    echo -e "$crispr\t$match" >> {output}
+done < tmp_file
+rm tmp_file
+        """
diff --git a/workflow/rules/refine_crispr-cas.smk b/workflow/rules/refine_crispr-cas.smk
new file mode 100644
index 0000000..c0835d1
--- /dev/null
+++ b/workflow/rules/refine_crispr-cas.smk
@@ -0,0 +1,113 @@
+### Refine CRISPR-Cas identification
+
+
+rule crispridentify:
+    input:
+        "results/cctyper/{batch}/subseq",
+    output:
+        spacers="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+        summary="results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+    params:
+        out_dir=subpath(output[0], parent=True),
+        arrays=subpath(input[0], parent=True),
+    conda:
+        "../envs/crispridentify.yaml"
+    threads: config["crispridentify"]["threads"]
+    log:
+        "log/crispridentify/{batch}.txt",
+    benchmark:
+        "log/benchmark/crispridentify/{batch}.txt"
+    shell:
+        """
+cd bin/CRISPRidentify
+
+find ../../{params.arrays}/*/fasta/CRISPR_arrays-with_flanks.fasta -size +0c -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'python CRISPRidentify.py --file {{}}\
+ --result_folder "../../{params.out_dir}/"\
+ --fasta_report True --strand False' > ../../{log} 2>&1
+        """
+
+
+rule merge_crispridentify_batches:
+    input:
+        spacers_crispr=expand(
+            "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_spacer_dataset.fasta",
+            batch=BATCHES,
+        ),
+        summary_crispr=expand(
+            "results/crispridentify/{batch}/CRISPR_arrays-with_flanks/Complete_summary.csv",
+            batch=BATCHES,
+        ),
+    output:
+        spacers_crispr="results/crispridentify/all_spacers.fa",
+        summary_crispr="results/crispridentify/complete_summary.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/merge_crispridentify_batches.txt",
+    benchmark:
+        "log/benchmark/merge_crispridentify_batches.txt"
+    script:
+        "../scripts/merge_crispridentify_batches.sh"
+
+
+rule merge_cctyper_identify:
+    input:
+        identify="results/crispridentify/complete_summary.csv",
+        cctyper=expand("results/cctyper/{batch}/crisprs_all-{batch}.tab", batch=BATCHES),
+    output:
+        table="results/all_CRISPRS_with_identify.tab",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/merge_cctyper_identify.txt",
+    benchmark:
+        "log/benchmark/merge_cctyper_identify.txt"
+    script:
+        "../scripts/merge_cctyper_crispridentify.sh"
+
+
+rule cluster_unique_spacers_crispridentify:
+    input:
+        "results/crispridentify/all_spacers.fa",
+    output:
+        clusters="results/crispridentify/all_spacers-clustered.clstr",
+        spacers="results/crispridentify/all_spacers-clustered",
+        distribution="results/crispridentify/all_spacers-clustered-distribution.tsv",
+    conda:
+        "../envs/cdhit.yaml"
+    threads: 1
+    log:
+        "log/cluster_unique_spacers_identify.txt",
+    benchmark:
+        "log/benchmark/cluster_unique_spacers_identify.txt"
+    shell:
+        """
+cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
+ -i {input} -o {output.spacers} > {log} 2>&1
+
+plot_len1.pl {output.clusters}\
+ 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
+ 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
+ > {output.distribution}
+        """
+
+
+rule create_crispr_cluster_table_identify:
+    input:
+        clstr="results/crispridentify/all_spacers-clustered.clstr",
+        fasta="results/crispridentify/all_spacers.fa",
+    output:
+        "results/all_spacers_table_identify.tsv",
+    conda:
+        "../envs/pyfaidx_pandas.yaml"
+    threads: 1
+    log:
+        "log/create_crispr_cluster_table_identify.txt",
+    benchmark:
+        "log/benchmark/create_crispr_cluster_table_identify.txt"
+    script:
+        "../scripts/make_cluster_table_identify.py"
diff --git a/workflow/rules/screen_crispr-cas.smk b/workflow/rules/screen_crispr-cas.smk
new file mode 100644
index 0000000..7d78f56
--- /dev/null
+++ b/workflow/rules/screen_crispr-cas.smk
@@ -0,0 +1,195 @@
+### Screen for CRISPR-Cas
+## Run CCTyper and parse/merge its output
+
+
+rule crisprcastyper:
+    input:
+        batch="resources/ATB/assemblies/{batch}/",
+    output:
+        "results/cctyper/{batch}/complete",
+    params:
+        out_dir=subpath(output[0], parent=True),  # per-batch directory, one sub-directory per genome
+    conda:
+        "../envs/cctyper.yaml"
+    threads: config["cctyper"]["threads"]
+    log:
+        "log/cctyper/{batch}.txt",
+    benchmark:
+        "log/benchmark/cctyper/{batch}.txt"
+    shell:
+        """
+find -L {input.batch} -mindepth 1 -maxdepth 1 -type f -name "*.fa" -print0 |\
+ parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+ 'rm -rf "{params.out_dir}/{{/.}}" &&\
+ cctyper -t 1 {{}} "{params.out_dir}/{{/.}}"' > {log} 2>&1
+
+touch {output}
+        """
+
+
+rule parse_cctyper:
+    input:
+        "results/cctyper/{batch}/complete",
+    output:
+        "results/cctyper/{batch}/parsed",
+    conda:
+        "../envs/pandas.yaml"
+    threads: config["parse_cctyper"]["threads"]
+    log:
+        "log/parse_cctyper/{batch}.txt",
+    benchmark:
+        "log/benchmark/parse_cctyper/{batch}.txt"
+    shell:
+        """
+find $(dirname {input}) -mindepth 1 -maxdepth 1 -type d -print0 |\
+parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+    python workflow/scripts/cctyper_extender.py -d {{}} > {log} 2>&1
+
+touch {output}
+        """
+
+
+rule extract_sequences:
+    input:
+        flag="results/cctyper/{batch}/parsed",
+        genomes="resources/ATB/assemblies/{batch}",
+    output:
+        "results/cctyper/{batch}/subseq",
+    conda:
+        "../envs/seqkit.yaml"
+    threads: config["extract_sequences"]["threads"]
+    log:
+        "log/extract_sequences/{batch}.txt",
+    benchmark:
+        "log/benchmark/extract_sequences/{batch}.txt"
+    shell:
+        """
+find $(dirname {input.flag}) -mindepth 1 -maxdepth 1 -type d -print0 |\
+parallel -0 --jobs {threads} --retry-failed --halt='now,fail=1'\
+    bash workflow/scripts/extract_crispr-cas_from_fasta.sh {{}} {input.genomes} > {log} 2>&1
+
+touch {output}
+        """
+
+
+rule collect_cctyper:
+    input:
+        cctyper="results/cctyper/{batch}/complete",
+        parser="results/cctyper/{batch}/parsed",
+    output:
+        crispr_cas="results/cctyper/{batch}/CRISPR_Cas-{batch}.tab",
+        crisprs_all="results/cctyper/{batch}/crisprs_all-{batch}.tab",
+        crisprs_near_cas="results/cctyper/{batch}/crisprs_near_cas-{batch}.tab",
+        crisprs_orphan="results/cctyper/{batch}/crisprs_orphan-{batch}.tab",
+        spacers="results/cctyper/{batch}/all_spacers-{batch}.fa",
+        cas_putative="results/cctyper/{batch}/cas_operons_putative-{batch}.tab",
+        cas="results/cctyper/{batch}/cas_operons-{batch}.tab",
+        csv="results/cctyper/{batch}/CRISPR-Cas-{batch}.csv",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/cctyper/collect_{batch}.txt",
+    benchmark:
+        "log/benchmark/cctyper/collect_{batch}.txt"
+    shell:
+        """
+bash workflow/scripts/concatenate_cctyper_output.sh $(dirname {input.cctyper}) > {log} 2>&1
+echo "\n========================" >> {log}
+bash workflow/scripts/concatenate_cctyper_csv.sh $(dirname {input.parser}) >> {log} 2>&1
+
+find $(dirname {input.cctyper}) -mindepth 3 -maxdepth 3 -name "*.fa" -exec cat {{}} + > {output.spacers} 2>> {log}
+        """
+
+
+rule concatenate_all_spacers:
+    input:
+        expand("results/cctyper/{batch}/all_spacers-{batch}.fa", batch=BATCHES),
+    output:
+        "results/cctyper/all_spacers.fa",
+    conda:
+        "../envs/bash.yaml"
+    threads: 1
+    log:
+        "log/concatenate_all_spacers.txt",
+    benchmark:
+        "log/benchmark/concatenate_all_spacers.txt"
+    shell:
+        """
+cat {input} > {output} 2> {log}
+        """
+
+
+rule cluster_all_spacers:
+    input:
+        "results/cctyper/all_spacers.fa",
+    output:
+        clusters=expand(
+            "results/cctyper/all_spacers-clustered-{cutoff}.clstr",
+            cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
+        ),
+        spacers=expand(
+            "results/cctyper/all_spacers-clustered-{cutoff}",
+            cutoff=[1, 0.96, 0.93, 0.9, 0.87, 0.84, 0.81],
+        ),
+        summary="results/cctyper/spacer_cluster_summary.tsv",
+    params:
+        work_dir=subpath(input[0], parent=True),
+        log_dir="log/spacer_clustering",
+    conda:
+        "../envs/cdhit.yaml"
+    threads: 1
+    log:
+        "log/cluster_all_spacers.txt",
+    benchmark:
+        "log/benchmark/cluster_all_spacers.txt"
+    shell:
+        """
+bash workflow/scripts/cluster_all_spacers.sh\
+    {input}\
+    {params.work_dir}\
+    {params.log_dir} > {log} 2>&1
+        """
+
+
+rule cluster_unique_spacers:
+    input:
+        "results/cctyper/all_spacers.fa",
+    output:
+        clusters="results/cctyper/all_spacers-clustered.clstr",
+        spacers="results/cctyper/all_spacers-clustered",
+        distribution="results/cctyper/all_spacers-clustered-distribution.tsv",
+    conda:
+        "../envs/cdhit.yaml"
+    threads: 1
+    log:
+        "log/cluster_unique_spacers.txt",
+    benchmark:
+        "log/benchmark/cluster_unique_spacers.txt"
+    shell:
+        """
+cd-hit-est -c 1 -n 8 -r 1 -g 1 -AS 0 -sf 1 -d 0 -T {threads}\
+ -i {input} -o {output.spacers} > {log} 2>&1
+
+plot_len1.pl {output.clusters}\
+ 1,2-4,5-9,10-19,20-49,50-99,100-499,500-99999\
+ 1-10,11-20,21-25,26-30,31-35,36-40,41-50,51-60,61-70,71-999999\
+ > {output.distribution}
+        """
+
+
+rule create_crispr_cluster_table:
+    input:
+        clstr="results/cctyper/all_spacers-clustered.clstr",
+        fasta="results/cctyper/all_spacers.fa",
+    output:
+        "results/all_spacers_table.tsv",
+    conda:
+        "../envs/pyfaidx_pandas.yaml"
+    threads: 1
+    log:
+        "log/create_crispr_cluster_table.txt",
+    benchmark:
+        "log/benchmark/create_crispr_cluster_table.txt"
+    script:
+        "../scripts/make_cluster_table.py"
diff --git a/bin/cctyper_extender.py b/workflow/scripts/cctyper_extender.py
similarity index 100%
rename from bin/cctyper_extender.py
rename to workflow/scripts/cctyper_extender.py
diff --git a/bin/cluster_all_spacers.sh b/workflow/scripts/cluster_all_spacers.sh
similarity index 100%
rename from bin/cluster_all_spacers.sh
rename to workflow/scripts/cluster_all_spacers.sh
diff --git a/bin/collect_genomad_predictions.R b/workflow/scripts/collect_genomad_predictions.R
similarity index 90%
rename from bin/collect_genomad_predictions.R
rename to workflow/scripts/collect_genomad_predictions.R
index 0cc783e..7356717 100644
--- a/bin/collect_genomad_predictions.R
+++ b/workflow/scripts/collect_genomad_predictions.R
@@ -17,13 +17,13 @@ read_stats <- function(filename, name_position) {
   sample_name <- str_split_1(string = filename, pattern = "/") %>%
     tail(name_position) %>%
     head(1)
-  
+
   df <- read_delim(
     file = filename,
     show_col_types = FALSE
   ) %>%
     mutate(batch = sample_name)
-  
+
   return(df)
 }
 
@@ -34,10 +34,11 @@ genomad_scores <- do.call(
   rename("contig" = "seq_name")
 
 genomad_scores <- genomad_scores %>%
-  mutate(genome = gsub(pattern = "\\.contig[0-9]*",
-                       replacement = "",
-                       x = contig)
-  )
+  mutate(genome = gsub(
+    pattern = "\\.contig[0-9]*",
+    replacement = "",
+    x = contig
+  ))
 
 plasmid_classifications <- do.call(
   rbind,
@@ -78,10 +79,10 @@ genomad_df <- left_join(
       !is.na(plasmid_topology) ~ "plasmid",
       !is.na(virus_topology) ~ "virus",
       TRUE ~ "chromosome"
-      )
+    )
   )
 
 write_csv(
   x = genomad_df,
-  file = snakemake@output[[1]] #here("data", "processed", "genomad_predictions.csv")
-)
\ No newline at end of file
+  file = snakemake@output[[1]] # here("data", "processed", "genomad_predictions.csv")
+)
diff --git a/bin/collect_jaeger_batch.R b/workflow/scripts/collect_jaeger_batch.R
similarity index 61%
rename from bin/collect_jaeger_batch.R
rename to workflow/scripts/collect_jaeger_batch.R
index 167e0aa..ef50d11 100644
--- a/bin/collect_jaeger_batch.R
+++ b/workflow/scripts/collect_jaeger_batch.R
@@ -9,7 +9,7 @@ suppressPackageStartupMessages({
 })
 
 # Read all output files in a batch (one per genome)
-jaeger_files <- Sys.glob(paths = here("data", "tmp", "jaeger", snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
+jaeger_files <- Sys.glob(paths = here(snakemake@params[["batch"]], "*", "*_default_jaeger.tsv"))
 
 # Concatenate them all in one dataframe
 jaeger_df <- do.call(
@@ -20,15 +20,21 @@ jaeger_df <- do.call(
 # Simplify the dataframe by extracting essential info
 jaeger_df_simple <- jaeger_df %>%
   mutate(
-    contig = gsub(pattern = " .*",
-                  replacement = "",
-                  x = contig_id),
-    accession_id = gsub(pattern = ".contig[0-9]*",
-                        replacement = "",
-                        x = contig)
+    contig = gsub(
+      pattern = " .*",
+      replacement = "",
+      x = contig_id
+    ),
+    accession_id = gsub(
+      pattern = ".contig[0-9]*",
+      replacement = "",
+      x = contig
+    )
   ) %>%
   select(accession_id, contig, length, prediction, reliability_score, prophage_contam)
 
 # And write to a CSV file (which can easily be concatenated with a script)
-write_csv(x = jaeger_df_simple,
-          file = snakemake@output[[1]])
\ No newline at end of file
+write_csv(
+  x = jaeger_df_simple,
+  file = snakemake@output[[1]]
+)
diff --git a/bin/collect_jaeger_predictions.sh b/workflow/scripts/collect_jaeger_predictions.sh
similarity index 100%
rename from bin/collect_jaeger_predictions.sh
rename to workflow/scripts/collect_jaeger_predictions.sh
diff --git a/bin/concatenate_cctyper_csv.sh b/workflow/scripts/concatenate_cctyper_csv.sh
similarity index 100%
rename from bin/concatenate_cctyper_csv.sh
rename to workflow/scripts/concatenate_cctyper_csv.sh
diff --git a/bin/concatenate_cctyper_output.sh b/workflow/scripts/concatenate_cctyper_output.sh
similarity index 100%
rename from bin/concatenate_cctyper_output.sh
rename to workflow/scripts/concatenate_cctyper_output.sh
diff --git a/bin/create_spacepharer_table.sh b/workflow/scripts/create_spacepharer_table.sh
similarity index 81%
rename from bin/create_spacepharer_table.sh
rename to workflow/scripts/create_spacepharer_table.sh
index 888b286..ecf3812 100644
--- a/bin/create_spacepharer_table.sh
+++ b/workflow/scripts/create_spacepharer_table.sh
@@ -7,7 +7,7 @@
 echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end\tphage_start\tphage_end\t5_3_PAM\t3_5_PAM\tLength\tGC_content\ttaxonomy\tcompleteness\thost\tlifestyle" > ${snakemake_output[phage]}
 while read line; do
     ID=$(echo $line | cut -f 2)
-    metadata_match=$(grep -w "$ID" ${snakemake_input[meta_phage]}/merged_metadata.tsv | cut -f 2-7)
+    metadata_match=$(grep -w "$ID" "${snakemake_input[phage_meta]}" | cut -f 2-7)
     echo -e "$line\t$metadata_match" >> ${snakemake_output[phage]}
 done < ${snakemake_input[phage]}
 
@@ -16,7 +16,7 @@ echo -e "sample_accession\tphage_accession\tp_best_hit\tspacer_start\tspacer_end
 while read line; do
     ID=$(echo $line | cut -f 2)
     #PLSDB uses a metadata system where there are many different files for differing purposes. taxonomy.csv uses a different ID so the taxonomy ID needs to be collected from nuccore and then matched to taxonomy
-    nuccore_match=$(grep -w "$ID" ${snakemake_input[meta_plasmid]}/nuccore.csv | cut -f 13 -d ",")
-    taxonomy_match=$(grep -w "^$nuccore_match" ${snakemake_input[meta_plasmid]}/taxonomy.csv | cut -f 3 -d ",")
+    nuccore_match=$(grep -w "$ID" "${snakemake_input[plasmid_nuccore]}" | cut -f 13 -d ",")
+    taxonomy_match=$(grep -w "^$nuccore_match" "${snakemake_input[plasmid_taxonomy]}" | cut -f 3 -d ",")
     echo -e "$line\t$taxonomy_match" >> ${snakemake_output[plasmid]}
 done < ${snakemake_input[plasmid]}
diff --git a/workflow/scripts/download_phage_database.sh b/workflow/scripts/download_phage_database.sh
new file mode 100644
index 0000000..f84fd19
--- /dev/null
+++ b/workflow/scripts/download_phage_database.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+## This script downloads and prepares the phage database: Phagescope
+## (https://phagescope.deepomics.org/) for use with SpacePHARER.
+set -euo pipefail
+
+exec > "${snakemake_log[out]}" # send stdout to a log file
+exec 2> "${snakemake_log[err]}" # also send stderr to a log file
+
+. "${snakemake[scriptdir]}/utils.sh"
+
+threads=${snakemake[threads]}
+phage_dir="${snakemake_output[phage_dir]}"
+databases=(${snakemake_params[databases]})
+
+message "Downloading PhageScope databases (archived fasta files)"
+message "This comprises ${#databases[@]} parts, as specified in 'config/parameters.yaml'"
+echo "-----"
+
+for DB in "${databases[@]}"
+do
+    message "Downloading ${DB}"
+    wget -O "${phage_dir}/${DB}.tar.gz"\
+     "https://phageapi.deepomics.org/download/phage/fasta/?datasource=${DB}"
+done
+
+echo "-----"
+message "Extracting databases\n"
+
+# Extracts the .tar.gz archives to subdirectories with separate fasta files.
+
+parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\
+ [ -f "${phage_dir}/${DB}.fasta" ] || [ -d "${phage_dir}/${DB}" ] && \
+ message "${DB} already extracted, skipping..." || \
+ ( message "Extracting ${DB}"; tar -xzf "${phage_dir}/${DB}.tar.gz" -C "${phage_dir}/" )'\
+ ::: "${databases[@]}" ::: ${phage_dir}
+
+echo "-----"
+message "Concatenating PhageScope sequences\n"
+
+# Concatenate the separate fasta files of each configured database into one
+# long file, and then remove the directory with the separate files.
+
+parallel --jobs "${threads}" 'DB={1}; phage_dir={2};\
+ ( message "Concatenating ${DB}"; genomes=$(find "${phage_dir}/${DB}" -type f -name "*.fasta");\
+  > "${phage_dir}/${DB}.fasta";\
+ for files in ${genomes}; do cat ${files} >> "${phage_dir}/${DB}.fasta"; done;
+ rm -r "${phage_dir}/${DB}")' \
+ ::: "${databases[@]}" ::: "${phage_dir}"
+
+echo "-----"
+message "Downloading Phagescope metadata\n"
+
+# Download metadata as tab-separated values (TSV) text file.
+
+for DB in "${databases[@]}"
+do
+    metadata_file="${phage_dir}/${DB}_phage_metadata.tsv"
+    message "Downloading ${DB}"
+    wget -O "${metadata_file}" "https://phageapi.deepomics.org/files/Download/Phage_meta_data/${DB,,}_phage_meta_data.tsv"
+    # With ${DB,,} the string variable is converted to lowercase
+done
+
+echo "-----"
+message "Concatenating PhageScope metadata files\n"
+
+# Make one long file with all metadata from the different databases.
+combined_metadata="${snakemake_output[combined_meta]}"
+
+# Rename the first metadata file into the concatenated file
+mv "${phage_dir}/${databases[0]}_phage_metadata.tsv" ${combined_metadata}
+
+# Then, for all the other metadata files
+for DB in "${databases[@]:1}"
+do
+    metadata_file="${phage_dir}/${DB}_phage_metadata.tsv"
+    # Take all but the first line (header)
+    tail -n +2 "${metadata_file}" >> "${combined_metadata}"
+    # and remove (clean-up, avoid keeping duplicated data)
+    rm ${metadata_file}
+done
+
+message "--- Done! ---"
diff --git a/workflow/scripts/download_plasmid_database.sh b/workflow/scripts/download_plasmid_database.sh
new file mode 100644
index 0000000..440596c
--- /dev/null
+++ b/workflow/scripts/download_plasmid_database.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+## This script downloads and prepares the plasmid database: PLSDB
+## (https://ccb-microbe.cs.uni-saarland.de/plsdb2025/) for use with SpacePHARER.
+set -euo pipefail
+
+exec > "${snakemake_log[out]}" # send all stdout to a log file
+exec 2> "${snakemake_log[err]}" # send stderr to separate log file
+
+. "${snakemake[scriptdir]}/utils.sh"
+
+plasmid_dir=${snakemake_output[plasmid_dir]}
+plasmid_archive="${plasmid_dir}/download_meta.tar.gz"
+
+message "Downloading PLSDB plasmid database"
+echo "-----"
+
+wget -P "${plasmid_dir}" https://ccb-microbe.cs.uni-saarland.de/plsdb2025/download_meta.tar.gz
+
+message "Extracting PLSDB"
+tar -xzf "${plasmid_archive}" -C "${plasmid_dir}"
+
+message "Extracting sequences"
+bzip2 -d "${plasmid_dir}/sequences.fasta.bz2"
+
+message "Adjusting metadata delimiters"
+sed -i -E ':a;s/"([^"]*),([^"]*)"/"\1\2"/g;ta' "${plasmid_dir}/nuccore.csv"
+
+echo -e "-----\nDone!\n-----"
diff --git a/bin/extract_crispr-cas_from_fasta.sh b/workflow/scripts/extract_crispr-cas_from_fasta.sh
similarity index 89%
rename from bin/extract_crispr-cas_from_fasta.sh
rename to workflow/scripts/extract_crispr-cas_from_fasta.sh
index a012e77..177b8c7 100644
--- a/bin/extract_crispr-cas_from_fasta.sh
+++ b/workflow/scripts/extract_crispr-cas_from_fasta.sh
@@ -3,8 +3,8 @@
 cctyper_dir=$1
 
 sample_name=$(basename ${cctyper_dir})
-batch_name=$(basename $(dirname ${cctyper_dir}))
-fasta_file="data/tmp/assemblies/${batch_name}/${sample_name}.fa"
+genome_dir=$2
+fasta_file="${genome_dir}/${sample_name}.fa"
 
 bed_files=( $(find $1 -mindepth 1 -maxdepth 1 -name "*bed") )
 
diff --git a/workflow/scripts/make_cluster_table.py b/workflow/scripts/make_cluster_table.py
new file mode 100644
index 0000000..11b9d84
--- /dev/null
+++ b/workflow/scripts/make_cluster_table.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+################################################################
+# Read CD-HIT's output cluster file, along with the FASTA used #
+# to generate it to create a table of clustered sequences.     #
+# - Aim of the program:                                        #
+#   1) read the cluster file                                   #
+#      a) create a cluster-based dictionary (cluster #,        #
+#         representative)                                      #
+#      b) create a sequence-based dictionary (ID, length,      #
+#         representative or % identity and strand)             #
+#   2) combine the two dictionaries into a dataframe with      #
+#       columns: genome, contig, locus, cluster ID, length,    #
+#       longest_sequence (CD-HIT representative), sequence,    #
+#       identity (%), strand                                   #
+#   3) TODO (not yet implemented): per cluster, also mark the  #
+#       shortest and most common sequence as representatives   #
+#   4) write the dataframe to a tab-separated table file       #
+################################################################
+
+import re
+from pyfaidx import Fasta
+import pandas as pd
+
+cluster_file = snakemake.input["clstr"]
+fasta_file = snakemake.input["fasta"]
+table_file = snakemake.output[0]
+
+
+def read_clstr_file(inputfile: str):
+    """
+    Parse a CD-HIT .clstr file into two column-wise dictionaries:
+     1) clusters: cluster number + representative (the member marked '*'),
+     2) members: genome, contig, locus, length, cluster, and strand/identity
+        relative to the representative ("NA" for the representative itself).
+    Raises ValueError when a member line does not match the expected layout.
+    """
+    cluster_dict = {"Cluster": [], "Longest_sequence": []}
+    locus_dict = {
+        "Genome": [],
+        "Contig": [],
+        "Locus": [],
+        "Full_locus": [],
+        "Length": [],
+        "Cluster": [],
+        "Strand_to_longest": [],
+        "Identity_to_longest": [],
+    }
+
+    cluster_regex = r"(>Cluster *)(\d*)"
+    locus_regex = r"^(\d+)\s+(\d+nt), >(\w+)\.(contig[\d-]+_\d+:\d+)\.\.\. (.*)$"
+
+    with open(inputfile, "r") as infile:
+        for line in infile:
+            line = line.strip()  # Remove surrounding whitespace and the newline
+
+            if line.startswith(">"):
+                # Extract the digits from the cluster ID
+                cluster = re.search(cluster_regex, line).group(2)
+
+            elif len(line) > 1:
+                # Member line: index, length, 'genome.locus', then '*' or identity
+                crispr_info = re.search(locus_regex, line)
+                if crispr_info is None:
+                    raise ValueError(f"Cannot parse line in {inputfile!r}: {line!r}")
+                length = int(crispr_info.group(2).rstrip("nt"))
+                genome = crispr_info.group(3)
+                locus = crispr_info.group(4)
+                full_locus = f"{genome}.{locus}"
+                contig = locus.split("_")[0]
+                extra = crispr_info.group(5)
+
+                # Check the final group for representative ('*') or other
+                if extra == "*":
+                    strand = "NA"
+                    identity = "NA"
+                    cluster_dict["Cluster"].append(cluster)
+                    cluster_dict["Longest_sequence"].append(full_locus)
+                else:
+                    strand_and_identity = extra.split("/")
+                    strand = strand_and_identity[0].replace("at ", "")
+                    identity = strand_and_identity[1]
+
+                locus_dict["Genome"].append(genome)
+                locus_dict["Contig"].append(contig)
+                locus_dict["Locus"].append(locus)
+                locus_dict["Full_locus"].append(full_locus)
+                locus_dict["Length"].append(length)
+                locus_dict["Cluster"].append(cluster)
+                locus_dict["Strand_to_longest"].append(strand)
+                locus_dict["Identity_to_longest"].append(identity)
+
+            # An empty (or one-character) line ends the parsing
+            else:
+                break
+
+        return cluster_dict, locus_dict
+
+
+def generate_sequence_df(fasta: str, ids):
+    """
+    Look up every identifier from 'ids' in the FASTA file and return a
+    dataframe with columns 'Full_locus' and 'Sequence' (merge key: 'Full_locus').
+    """
+    # duplicate_action="first": of repeated record names, only the first is used
+    sequence_dict = Fasta(fasta, duplicate_action="first")
+    # [:] selects the complete record, .seq yields its sequence
+    sequence_list = [sequence_dict[locus][:].seq for locus in ids]
+
+    return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list})
+
+
+def main():
+    """
+    Build the cluster table: parse the .clstr file, attach each cluster's
+    representative and every member's sequence, and write a TSV file.
+    Returns 0, which is used as the process exit status.
+    """
+    cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file)
+    cluster_df = pd.DataFrame(cluster_dict)
+    locus_df = pd.DataFrame(locus_dict)
+
+    # Attach the representative ('Longest_sequence') to every cluster member
+    combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster")
+
+    # Look up sequences; keep one row per ID so that repeated IDs cannot
+    # multiply rows in the merge below (it would be a many-to-many join)
+    sequence_df = generate_sequence_df(
+        fasta=fasta_file, ids=locus_df["Full_locus"]
+    ).drop_duplicates(subset="Full_locus")
+
+    combined_with_sequences = combined_df.merge(
+        sequence_df, how="inner", on="Full_locus"
+    )
+
+    ## Not yet implemented:
+    # Find shortest sequence per cluster
+    # Find most common sequence per cluster
+
+    combined_with_sequences.to_csv(table_file, sep="\t", index=False)
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/workflow/scripts/make_cluster_table_identify.py b/workflow/scripts/make_cluster_table_identify.py
new file mode 100644
index 0000000..237361f
--- /dev/null
+++ b/workflow/scripts/make_cluster_table_identify.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+
+################################################################
+# Read CD-HIT's output cluster file, along with the FASTA used #
+# to generate it to create a table of clustered sequences.     #
+# - Aim of the program:                                        #
+#   1) read the cluster file                                   #
+#      a) create a cluster-based dictionary (cluster #,        #
+#         representative)                                      #
+#      b) create a sequence-based dictionary (ID, length,      #
+#         representative or % identity and strand)             #
+#   2) combine the two dictionaries into a dataframe with      #
+#       columns: genome, contig, locus, cluster ID, length,    #
+#       longest_sequence (CD-HIT representative), sequence,    #
+#       identity (%), strand                                   #
+#   3) TODO (not yet implemented): per cluster, also mark the  #
+#       shortest and most common sequence as representatives   #
+#   4) write the dataframe to a tab-separated table file       #
+################################################################
+
+## This is a slightly adjusted script from make_cluster_table.py as the CRISPRidentify output is different.
+
+import re
+from pyfaidx import Fasta
+import pandas as pd
+
+cluster_file = snakemake.input["clstr"]
+fasta_file = snakemake.input["fasta"]
+table_file = snakemake.output[0]
+
+
+def read_clstr_file(inputfile: str):
+    """
+    Parse a CD-HIT .clstr file into two column-wise dictionaries:
+     1) clusters: cluster number + representative (the member marked '*'),
+     2) members: genome, contig, locus, length, cluster, and strand/identity
+        relative to the representative ("NA" for the representative itself).
+    Raises ValueError when a member line does not match the expected layout.
+    """
+    cluster_dict = {"Cluster": [], "Longest_sequence": []}
+    locus_dict = {
+        "Genome": [],
+        "Contig": [],
+        "Locus": [],
+        "Full_locus": [],
+        "Length": [],
+        "Cluster": [],
+        "Strand_to_longest": [],
+        "Identity_to_longest": [],
+    }
+
+    cluster_regex = r"(>Cluster *)(\d*)"
+    locus_regex = r"^(\d+)\s+(\d+nt), >(\w+).(contig[\d-]+_.+)\.\.\. (.*)$"
+
+    with open(inputfile, "r") as infile:
+        for line in infile:
+            line = line.strip()  # Remove surrounding whitespace and the newline
+
+            if line.startswith(">"):
+                # Extract the digits from the cluster ID
+                cluster = re.search(cluster_regex, line).group(2)
+
+            elif len(line) > 1:
+                # Member line: index, length, genome + locus, then '*' or identity
+                crispr_info = re.search(locus_regex, line)
+                if crispr_info is None:
+                    raise ValueError(f"Cannot parse line in {inputfile!r}: {line!r}")
+                length = int(crispr_info.group(2).rstrip("nt"))
+                genome = crispr_info.group(3)
+                locus = crispr_info.group(4)
+                full_locus = f"{genome}-{locus}"
+                contig = locus.split("_")[0]
+                extra = crispr_info.group(5)
+
+                # Check the final group for representative ('*') or other
+                if extra == "*":
+                    strand = "NA"
+                    identity = "NA"
+                    cluster_dict["Cluster"].append(cluster)
+                    cluster_dict["Longest_sequence"].append(full_locus)
+                else:
+                    strand_and_identity = extra.split("/")
+                    strand = strand_and_identity[0].replace("at ", "")
+                    identity = strand_and_identity[1]
+
+                locus_dict["Genome"].append(genome)
+                locus_dict["Contig"].append(contig)
+                locus_dict["Locus"].append(locus)
+                locus_dict["Full_locus"].append(full_locus)
+                locus_dict["Length"].append(length)
+                locus_dict["Cluster"].append(cluster)
+                locus_dict["Strand_to_longest"].append(strand)
+                locus_dict["Identity_to_longest"].append(identity)
+
+            # An empty (or one-character) line ends the parsing
+            else:
+                break
+
+        return cluster_dict, locus_dict
+
+
+def generate_sequence_df(fasta: str, ids):
+    """
+    Look up every identifier from 'ids' in the FASTA file and return a
+    dataframe with columns 'Full_locus' and 'Sequence' (merge key: 'Full_locus').
+    """
+    # duplicate_action="first": of repeated record names, only the first is used
+    sequence_dict = Fasta(fasta, duplicate_action="first")
+    # [:] selects the complete record, .seq yields its sequence
+    sequence_list = [sequence_dict[locus][:].seq for locus in ids]
+
+    return pd.DataFrame({"Full_locus": ids, "Sequence": sequence_list})
+
+
+def main():
+    """
+    Build the cluster table: parse the .clstr file, attach each cluster's
+    representative and every member's sequence, and write a TSV file.
+    Returns 0, which is used as the process exit status.
+    """
+    cluster_dict, locus_dict = read_clstr_file(inputfile=cluster_file)
+    cluster_df = pd.DataFrame(cluster_dict)
+    locus_df = pd.DataFrame(locus_dict)
+
+    # Attach the representative ('Longest_sequence') to every cluster member
+    combined_df = locus_df.merge(cluster_df, how="inner", on="Cluster")
+
+    # Look up sequences; keep one row per ID so that repeated IDs cannot
+    # multiply rows in the merge below (it would be a many-to-many join)
+    sequence_df = generate_sequence_df(
+        fasta=fasta_file, ids=locus_df["Full_locus"]
+    ).drop_duplicates(subset="Full_locus")
+
+    combined_with_sequences = combined_df.merge(
+        sequence_df, how="inner", on="Full_locus"
+    )
+
+    ## Not yet implemented:
+    # Find shortest sequence per cluster
+    # Find most common sequence per cluster
+
+    combined_with_sequences.to_csv(table_file, sep="\t", index=False)
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/workflow/scripts/merge_cctyper_crispridentify.sh b/workflow/scripts/merge_cctyper_crispridentify.sh
new file mode 100644
index 0000000..e59393d
--- /dev/null
+++ b/workflow/scripts/merge_cctyper_crispridentify.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+exec 2> "${snakemake_log[0]}" # send all stderr to the log file
+
+cctyper=(${snakemake_input[cctyper]}) # unquoted on purpose: word-split the file list into an array
+
+first=True
+for summary in "${cctyper[@]}"
+do
+    # If it is the first, copy the whole file (including header)
+    if [ "${first}" == True ]
+    then
+        cat "${summary}" > tmp_file1
+        first=False
+    # otherwise, only take the 'contents', without header
+    else
+        tail -n +2 "${summary}" >> tmp_file1
+    fi
+    # And write it all in one concatenated temporary file: 'tmp_file1'
+done
+
+# For CRISPRidentify, collect the header as variable
+header=$(head -n 1 "${snakemake_input[identify]}" | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t")
+# and write the contents of the 'complete summary' to another temporary file: 'tmp_file2'
+tail -n +2 "${snakemake_input[identify]}" | cut -f 1,5,6,7,8,9,10,11,14 -d "," | tr "," "\t" > tmp_file2
+
+first=True
+
+# Start reading 'tmp_file1' = CCTyper CRISPRs (IFS= and -r keep each line verbatim, incl. empty trailing fields)
+while IFS= read -r line
+do
+    if [ "${first}" == True ]
+    # For the first line...
+    then
+        first=False
+        # ...concatenate the header with CRISPRidentify's header
+        echo -e "$line\t$header" > "${snakemake_output[table]}"
+    else
+
+    # For all other lines,
+        sample=$(echo -e "${line}" | cut -f 1)
+        start_cc=$(echo -e "${line}" | cut -f 3)
+        start_id=$(expr "${start_cc}" + 1)
+        # see if the sample has a match in CRISPRidentify (tmp_file2)
+        match=$(grep "${sample}_${start_id}" tmp_file2 || true)
+
+        if [ -z "${match}" ]
+        then
+        # If there's *no* match (-z = length 0)
+            echo -e "${line}" >> "${snakemake_output[table]}"
+            # Just copy the line from CCTyper
+        else
+        # but if there is a match, handle every matching CRISPRidentify row (line2) separately
+            while IFS= read -r line2
+            do
+                if [ "${start_cc}" -lt 5000 ];
+                # if the CCTyper start lies below 5000 (the length of the flanking regions),
+                # append the CRISPRidentify row (line2) unchanged; otherwise shift its coordinates
+                then
+                    echo -e "${line}\t${line2}" >> "${snakemake_output[table]}"
+                else
+                    start=$(echo -e "${line2}" | cut -f 2)
+                    start=$(expr "${start}" + "${start_cc}" - 5000)
+                    length=$(echo -e "${line2}" | cut -f 4)
+                    end=$(expr "${length}" + "${start}" - 1)
+                    begin=$(echo -e "${line2}" | cut -f 1)
+                    rest=$(echo -e "${line2}" | cut -f 4-9)
+                    echo -e "${line}\t${begin}\t${start}\t${end}\t${rest}" >> "${snakemake_output[table]}"
+                fi
+            done <<< "${match}"
+        fi
+    fi
+done < tmp_file1
+
+# Remove the temporary files
+rm -f tmp_file1 tmp_file2
diff --git a/workflow/scripts/merge_crispridentify_batches.sh b/workflow/scripts/merge_crispridentify_batches.sh
new file mode 100644
index 0000000..caca261
--- /dev/null
+++ b/workflow/scripts/merge_crispridentify_batches.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+exec 2> "${snakemake_log[0]}" # send all stderr to the log file
+
+spacers_crispr=(${snakemake_input[spacers_crispr]}) # unquoted on purpose: word-split into an array
+summary_crispr=(${snakemake_input[summary_crispr]})
+
+> "${snakemake_output[spacers_crispr]}" # start from an empty file, then append every batch
+for spacers in "${spacers_crispr[@]}"
+do
+    cat "${spacers}" >> "${snakemake_output[spacers_crispr]}"
+done
+
+# Header: first line of the first summary that is not the 'No arrays found' placeholder
+for summary in "${summary_crispr[@]}"
+do
+    header=$(head -n 1 "${summary}")
+    if [ "${header}" != "No arrays found" ]
+    then
+        echo "${header}" > "${snakemake_output[summary_crispr]}"
+        break
+    fi
+done
+
+# Contents: everything after the first line of every summary
+for summary in "${summary_crispr[@]}"
+do
+    tail -n +2 "${summary}" >> "${snakemake_output[summary_crispr]}"
+done