Commands.sh

# Retrieve contigs from the Logan database using the aws cli and a list of SRA accession numbers ("SraAccList.csv").
# The accession file contains 493 476 accession numbers, split across 12 parallel processes handling 41123 each.
---
#!/bin/bash
#set -x

# Download and decompress the contigs for accessions on lines 1..41123 of
# SraAccList.csv, running as a background job.
# The list is streamed once on fd 3 (instead of re-scanning the whole file with
# sed for every accession), which also leaves stdin untouched for aws/zstd.
while IFS= read -r acc <&3
do
        acc=${acc%$'\r'}                # tolerate Windows line endings in the list
        [[ -n "$acc" ]] || continue     # skip blank lines
        if ! aws s3 cp "s3://logan-pub/c/${acc}/${acc}.contigs.fa.zst" . --no-sign-request
        then
                # Nothing was downloaded, so there is nothing to decompress or remove.
                printf 'Download failed for %s, skipping.\n' "$acc" >&2
                continue
        fi
        # Only delete the compressed file once it has been decompressed successfully.
        if zstd -d -- "${acc}.contigs.fa.zst"
        then
                rm -- "${acc}.contigs.fa.zst"
        else
                printf 'Decompression failed for %s, keeping %s.contigs.fa.zst.\n' "$acc" "$acc" >&2
        fi
done 3< <(sed -n '1,41123p;41123q' SraAccList.csv) &

# Download and decompress the contigs for accessions on lines 41124..82246 of
# SraAccList.csv, running as a background job.
# The list is streamed once on fd 3 (instead of re-scanning the whole file with
# sed for every accession), which also leaves stdin untouched for aws/zstd.
while IFS= read -r acc <&3
do
        acc=${acc%$'\r'}                # tolerate Windows line endings in the list
        [[ -n "$acc" ]] || continue     # skip blank lines
        if ! aws s3 cp "s3://logan-pub/c/${acc}/${acc}.contigs.fa.zst" . --no-sign-request
        then
                # Nothing was downloaded, so there is nothing to decompress or remove.
                printf 'Download failed for %s, skipping.\n' "$acc" >&2
                continue
        fi
        # Only delete the compressed file once it has been decompressed successfully.
        if zstd -d -- "${acc}.contigs.fa.zst"
        then
                rm -- "${acc}.contigs.fa.zst"
        else
                printf 'Decompression failed for %s, keeping %s.contigs.fa.zst.\n' "$acc" "$acc" >&2
        fi
done 3< <(sed -n '41124,82246p;82246q' SraAccList.csv) &
---

# Deactivate the base conda environment before running BLAST: according to the
# original note it carries an older BLAST version (installed as a dependency of
# some other package).
conda deactivate

# makeblastdb terminates with the error: terminate called after throwing an instance of 'std::bad_alloc' 
# if the input file is too big (>600 GB), couldn't figure out how to fix it. Doesn't seem to be a
# system resources issue. Split into smaller batches:

#######################################
# Concatenate one batch of *.fa files into a single FASTA file.
# The file list is sorted (byte-wise) so that batches taken by separate calls
# are disjoint and reproducible; find alone gives no ordering guarantee.
# File names are passed NUL-delimited so whitespace or quotes cannot split them.
# Arguments:
#   $1 - 1-based index of the first file in the batch
#   $2 - 1-based index of the last file in the batch (inclusive)
#   $3 - output file
#   $4 - directory to search (default: ./Sequences/)
# Outputs:   writes the concatenated sequences to $3 (empty if the batch is empty)
# Returns:   exit status of the final xargs/cat stage
#######################################
concat_fasta_batch() {
  local first=$1 last=$2 out=$3 dir=${4:-./Sequences/}
  find "$dir" -type f -name '*.fa' -print0 \
    | LC_ALL=C sort -z \
    | sed -z -n "${first},${last}p" \
    | xargs -0 -r cat -- > "$out"
}

# Concatenate the first 10 000 fasta files
concat_fasta_batch 1 10000 comb1.fasta

# Concatenate the fasta files 10 001 to 20 000
concat_fasta_batch 10001 20000 comb2.fasta

# Concatenate the fasta files 20 001 to 30 000
concat_fasta_batch 20001 30000 comb3.fasta

# etc..
# makeblastdb is quite slow at 180 000 sequences / second (on my machine)
# Build the nucleotide database in the background (nohup so it survives a logout).
nohup makeblastdb -in comb1.fasta -dbtype nucl -input_type fasta -out big_db1 -max_file_sz 3GB -logfile makedb_log1.txt &
makedb_pid=$!

# The search must not start before the database is complete, so wait for the
# background makeblastdb job and only search if it succeeded.
# query_sequence must name the query file; the expansion aborts with a message if it is unset.
if wait "$makedb_pid"; then
        tblastn -db big_db1 -num_threads 8 -query "${query_sequence:?set query_sequence to the query file}" -out local_search1.txt
else
        printf 'makeblastdb failed, see makedb_log1.txt; skipping tblastn.\n' >&2
fi

# Make a list of all downloaded fasta sequence files with their full path.
# File names are handed to xargs NUL-delimited so whitespace or quotes in a name
# cannot split it, and -r stops realpath from being run with no operands when
# no file matches.
find ./Sequences/ -type f -name "*.fa" -print0 | xargs -0 -r realpath -- > sequenceList.txt

#######################################
# Make a blast database of all listed files in chunks of N files per database.
# The fasta files of a chunk are deleted only after both the concatenation and
# makeblastdb have succeeded for that chunk.
# Arguments:
#   $1 - file with one fasta path per line (default: sequenceList.txt)
#   $2 - working directory; databases go to $2/blastdbs/comb_<n>
#        (default: /mnt/databases/Logan)
#   $3 - number of files per database (default: 8219)
#   $4 - maximum number of databases to build (default: 56)
# Returns:
#   0 when all chunks were built or the list was exhausted, 1 on the first
#   failure. On failure the combined fasta file is left in place and no source
#   file of the failing chunk is removed.
#######################################
build_blast_chunks() {
	local list=${1:-sequenceList.txt}
	local workdir=${2:-/mnt/databases/Logan}
	local chunk_size=${3:-8219}
	local max_chunks=${4:-56}
	local combined=${workdir}/combined.fasta
	local start=1 stop x
	local -a batch

	if [[ ! -r "$list" ]]; then
		printf 'Cannot read file list: %s\n' "$list" >&2
		return 1
	fi

	for ((x = 1; x <= max_chunks; x++)); do
		stop=$((start + chunk_size - 1))
		# Read this chunk's paths once; sed quits at the last line of the chunk.
		mapfile -t batch < <(sed -n "${start},${stop}p;${stop}q" -- "$list")
		# An empty chunk means the list is exhausted: stop cleanly instead of
		# feeding makeblastdb an empty input.
		((${#batch[@]} > 0)) || break

		# xargs exits non-zero if any cat invocation failed (e.g. missing file);
		# in that case nothing may be deleted.
		if ! printf '%s\0' "${batch[@]}" | xargs -0 cat -- > "$combined"; then
			printf 'Concatenation failed for chunk %s, exiting loop.\n' "$x" >&2
			return 1
		fi
		if ! makeblastdb -in "$combined" -dbtype nucl -input_type fasta -out "${workdir}/blastdbs/comb_${x}" -max_file_sz 3GB; then
			echo "Error encountered, exiting loop." >&2
			return 1
		fi
		printf '%s\0' "${batch[@]}" | xargs -0 rm --
		rm -- "$combined"
		start=$((stop + 1))
	done
}

build_blast_chunks