uf-sort

#!/bin/sh
#
#  uf-sort - Sort sequences in an unfasta file
#  Copyright (C) 2016  Marco van Zwetselaar <io@zwets.it>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#  This utility is part of http://io.zwets.it/unfasta

# Function to exit this script with an error message on stderr
#   Arguments: the words of the message (joined with single spaces)
#   Output:    "<script name>: <message>" on standard error
#   Exits:     always, with status 1
err_exit() {
    # printf instead of echo: echo implementations differ in whether they
    # interpret backslash escapes, which could mangle a message that carries
    # arbitrary text such as a file name.
    printf '%s: %s\n' "$(basename "$0")" "$*" >&2
    exit 1
}

# Function to show usage information and exit
#   Arguments: $1 - exit status to use (optional, defaults to 1)
#   Output:    the usage text, on standard error
#   Exits:     always, with the requested status
usage_exit() {
    # "$0" is quoted inside the command substitution so that a script path
    # containing blanks still yields the right program name.
    printf '%s\n' "
Usage: $(basename "$0") [OPTIONS] [FILE ...]

  Sort the sequences in unfasta FILEs in order of decreasing length, and
  write to standard output.  If no FILE is present or when FILE is '-',
  read standard input.  Length ties are broken alphabetically.

  OPTIONS
   -r, --reverse   Reverse the order of the sort
" >&2
    exit "${1:-1}"
}

# Parse options
#
# Consumes the leading arguments that look like options (a '-' followed by at
# least one more character) and leaves the remaining FILE operands in "$@".
# A lone '-' is not an option; it is left in place as a FILE operand.
# REVERSE is set to "-r" when -r/--reverse was given and is unset otherwise.
# A '--' argument ends option parsing, so FILE names may start with '-'.

unset REVERSE

while [ $# -gt 0 ]; do
    case "$1" in
    -r|--reverse) REVERSE="-r" ;;
    -h|--help)    usage_exit 0 ;;
    --)           shift; break ;;   # explicit end of options
    -?*)          usage_exit ;;     # anything else that looks like an option
    *)            break ;;          # first FILE operand (including '-')
    esac
    shift
done

# Do the work

# We need to sort on length, then alphabetically, while keeping each header
# together with its sequence.  The solution is to first turn every sequence
# into a single tab-separated line with the fields: length, sequence data,
# header (where the header really is everything from field 3 to the end).
# Then we sort, and finally we unpack everything again.

# Sort the unfasta FILEs given as arguments (standard input when there are
# none) and write the result to standard output.
#   Globals:   REVERSE (read) - when non-empty the whole order is reversed:
#              shortest sequence first, length ties in reverse alphabetical
#              order.  Otherwise: longest first, ties alphabetically.
#   Arguments: the FILE operands, handed to awk unchanged
# The body is a subshell so that the helper variables stay private.
sort_unfasta() (
    TAB="$(printf '\t')"

    # An ordering option attached to a sort key overrides the global -r for
    # that key, so the direction has to be spelled out on every key.
    if [ -n "${REVERSE:-}" ]; then
        LENGTH_KEY='1,1n'
        TIE_KEY='2r'
    else
        LENGTH_KEY='1,1nr'
        TIE_KEY='2'
    fi

    # Pre-process into single records: length <tab> sequence <tab> header
    awk -v OFS='\t' '
        NR % 2 == 1 { HDR = $0 }
        NR % 2 == 0 { print length(), $0, HDR }
        ' "$@" |
    # Sort on length first, then bytewise on the remainder of the record
    LC_ALL=C sort -t "$TAB" -k "$LENGTH_KEY" -k "$TIE_KEY" - |
    # And unpack again.  The header is everything after the second tab, so
    # tabs inside a header are preserved.
    awk -F '\t' '{
        print substr($0, length($1) + length($2) + 3)
        print $2
    }'
)

sort_unfasta "$@"

# vim: sts=4:sw=4:et:si:ai