diff --git a/scripts/mask_low_complexity.sh b/scripts/mask_low_complexity.sh index e35151c..b631cb3 100755 --- a/scripts/mask_low_complexity.sh +++ b/scripts/mask_low_complexity.sh @@ -23,17 +23,26 @@ if ! which $MASKER > /dev/null; then exit 1 fi +function mask_data_chunk () { + # Removes empty records and performs masking, all in pipes + MASKER=$1 + awk -v RS=">" -v FS="\n" -v ORS="" ' { if ($2) print ">"$0 } ' |\ + $MASKER -in - -outfmt fasta |\ + sed -e '/^>/!s/[a-z]/x/g' +} +export -f mask_data_chunk + if [ -d $target ]; then for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do if [ ! -e "$file.masked" ]; then - $MASKER -in $file -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$file.tmp" + cat $file | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk $MASKER > "$file.tmp" mv "$file.tmp" $file touch "$file.masked" fi done elif [ -f $target ]; then if [ ! -e "$target.masked" ]; then - $MASKER -in $target -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp" + cat $target | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk $MASKER > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi