# clustalO-profile.yaml

# Mobyle program descriptor for the Clustal-Omega profile/profile alignment
# interface. The keys below identify the service and give its display texts.
!mobyle/program
name: clustalO-profile
title: "Clustal-Omega: Profile alignment"
description: Align 2 profiles (alignments)
inputs: !mobyle/inputparagraph
    children:
    # Data-input paragraph: the two pre-aligned profiles to be aligned together.
    # Both parameters are mandatory; their format expressions emit
    # " --profile1=<value>" and " --profile2=<value>" respectively.
    - !mobyle/inputprogramparagraph
        comment: "the columns in each profile will be kept\n    fixed and the alignment\
            \ of the two profiles will be written\n    out. Use this option to align\
            \ two alignments (profiles) together."
        prompt: Data Input
        name: input
        children:
        # First profile. The prompt names the flag so the two otherwise
        # identical file inputs can be told apart in the form.
        - !mobyle/inputprogramparameter
            prompt: First pre-aligned multiple sequence file (aligned columns will be
                kept fixed) (--profile1)
            format: '" --profile1=" + str( value )'
            simple: true
            mandatory: true
            name: profile1
            command: false
            type: !mobyle/formattedtype
                format_terms: ['EDAM_format:2200', 'EDAM_format:1982', 'EDAM_format:1961',
                    'EDAM_format:1993']
                data_terms: ['EDAM_data:1384']
        # Second profile; accepts the same formats and data type as the first.
        - !mobyle/inputprogramparameter
            prompt: Second pre-aligned multiple sequence file (aligned columns will be
                kept fixed) (--profile2)
            format: '" --profile2=" + str( value )'
            simple: true
            mandatory: true
            name: profile2
            command: false
            type: !mobyle/formattedtype
                format_terms: ['EDAM_format:2200', 'EDAM_format:1982', 'EDAM_format:1961',
                    'EDAM_format:1993']
                data_terms: ['EDAM_data:1384']
    - !mobyle/inputprogramparagraph
        comment: "In order to produce a multiple alignment Clustal-Omega requires\
            \ a\n          guide tree which defines the order in which sequences/profiles\
            \ are\n          aligned. A guide tree in turn is constructed, based on\
            \ a distance\n          matrix. Conventionally, this distance matrix is\
            \ comprised of all the\n          pair-wise distances of the sequences.\
            \ The distance measure\n          Clustal-Omega uses for pair-wise distances\
            \ of un-aligned sequences is\n          the k-tuple measure [4], which\
            \ was also implemented in Clustal 1.83\n          and ClustalW2\n    \
            \      [5,6]. If the sequences inputted via -i are aligned\n         \
            \ Clustal-Omega uses the Kimura-corrected pairwise aligned identities\n\
            \          [7]. The computational effort (time/memory) to calculate and\
            \ store a\n          full distance matrix grows quadratically with the\
            \ number of sequences.\n          Clustal-Omega can improve this scalability\
            \ to N*log(N) by employing a\n          fast clustering algorithm called\
            \ mBed [2]; this option is\n          automatically invoked (default).\
            \ If a full distance matrix evaluation\n          is desired, then the\
            \ --full flag has to\n          be set. The mBed mode\n          calculates\
            \ a reduced set of pair-wise distances. These distances are\n        \
            \  used in a k-means algorithm, that clusters at most 100 sequences. For\n\
            \          each cluster a full distance matrix is calculated. No full\
            \ distance\n          matrix (of all input sequences) is calculated in\
            \ mBed mode. If there\n          are less than 100 sequences in the input,\
            \ then in effect a full\n          distance matrix is calculated in mBed\
            \ mode, however, no distance\n          matrix can be outputted (see below). Clustal-Omega\
            \ uses Muscle's [8] fast UPGMA implementation to construct\n         \
            \ its guide trees from the distance matrix. By default, the distance\n\
            \          matrix is used internally to construct the guide tree and is\
            \ then\n          discarded. By specifying --distmat-out the internal\
            \ distance matrix\n          can be written to file. This is only possible\
            \ in --full mode. The\n          guide trees by default are used internally\
            \ to guide the multiple\n          alignment and are then discarded. By\
            \ specifying the --guidetree-out\n          option these\n          internal\
            \ guide trees can be written out to\n          file. Conversely, the distance\
            \ calculation and/or guide tree building\n          stage can be skipped,\
            \ by reading in a pre-calculated distance matrix\n          and/or pre-calculated\
            \ guide tree. These options are invoked by\n          specifying the --distmat-in\
            \ and/or --guidetree-in flags,\n          respectively. However, distance\
            \ matrix reading is disabled in the\n          current version. By default,\
            \ distance matrix and guide tree files are\n          not over-written,\
            \ if a file with the specified name already\n          exists. In\n  \
            \        this case Clustal-Omega aborts during the command-line processing\n\
            \          stage. In mBed mode a full distance matrix cannot\n       \
            \   be outputted, distance matrix output is only possible in --full mode.\n\
            \          mBed or --full distance mode do not affect the ability to write\
            \ out\n          guide-trees. Guide trees can be iterated to refine the\
            \ alignment (see section\n          ITERATION). Clustal-Omega takes the\
            \ alignment, that was produced\n          initially and constructs a new\
            \ distance matrix from this alignment.\n          The distance measure\
            \ used at this stage is the Kimura distance [7]. By\n          default,\
            \ Clustal-Omega constructs a reduced distance matrix at this\n       \
            \   stage using the mBed algorithm, which will then be used to create\
            \ an\n          improved (iterated) new guide tree. To turn off mBed-like\
            \ clustering\n          at this\n          stage the --full-iter flag\
            \ has to be set. While Kimura\n          distances in general are much\
            \ faster to calculate than k-tuple\n          distances, time and memory\
            \ requirements still scale quadratically with\n          the number of\
            \ sequences and --full-iter clustering should only be\n          considered\
            \ for smaller cases ( << 10,000 sequences).\n          [2] Blackshields G, Sievers\
            \ F, Shi W, Wilm A, Higgins DG. Sequence\n          embedding for fast\
            \ construction of guide trees for multiple\n          sequence alignment.\
            \ Algorithms Mol Biol. 2010 May 14;5:21.\n          [4] Wilbur and Lipman, 1983; PMID\
            \ 6572363\n          [5] Thompson JD, Higgins DG, Gibson TJ. (1994). CLUSTAL W: improving\n\
            \          the sensitivity of progressive multiple sequence alignment\
            \ through\n          sequence weighting, position-specific gap penalties\
            \ and weight\n          matrix choice. Nucleic Acids Res., 22, 4673-4680.\n          [6]\
            \ Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,\n    \
            \      McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson\
            \ JD,\n          Gibson TJ, Higgins DG. (2007). Clustal W and Clustal\
            \ X version\n          2.0. Bioinformatics, 23, 2947-2948.\n          [7] Kimura M\
            \ (1980). \"A simple method for estimating evolutionary\n          rates\
            \ of base substitutions through comparative studies of\n          nucleotide\
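            \ sequences\". Journal of Molecular Evolution 16: 111\u2013120.\n          [8]\
            \ Edgar RC (2004). MUSCLE: multiple sequence alignment with high accuracy\n\
            \          and high throughput. Nucleic Acids Res., 32, 1792-1797."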
        prompt: Clustering
        name: clustering
        children:
        # Optional guide-tree input file. The format expression selects the
        # " --guidetree-in=<value>" string only when a value is present.
        - !mobyle/inputprogramparameter
            prompt: Guide tree input file (--guidetree-in)
            format: '( "" , " --guidetree-in="+str(value))[ value is not None ]'
            name: guidetree_in
            command: false
            type: !mobyle/formattedtype
                format_terms:
                - 'EDAM_format:1910'
                data_terms:
                - 'EDAM_data:0872'
        # Boolean switch (default false). " --full " is selected by the format
        # expression only when the value is set and true.
        - !mobyle/inputprogramparameter
            prompt: >-
                Use full distance matrix for guide-tree calculation (slow; mBed
                is default) (--full)
            format: '( "" , " --full ")[ value is not None and value ]'
            name: full
            command: false
            type: !mobyle/booleantype
                default: false
        # Boolean switch (default false). " --full-iter " is selected by the
        # format expression only when the value is set and true.
        - !mobyle/inputprogramparameter
            prompt: >-
                Use full distance matrix for guide-tree calculation during iteration
                (mBed is default) (--full-iter)
            format: '( "" , " --full-iter ")[ value is not None and value ]'
            name: full_iter
            command: false
            type: !mobyle/booleantype
                default: false
    # Alignment-output paragraph holding a single choice parameter.
    # NOTE(review): the paragraph and its child parameter are both named
    # "output_format" - confirm the consumer does not require unique names.
    - !mobyle/inputprogramparagraph
        prompt: Alignment Output
        name: output_format
        children:
        # Output format selector. The format expression yields
        # " --outfmt=<value>" only when the value is set and differs from vdef.
        - !mobyle/inputprogramparameter
            prompt: alignment output format
            format: '( "" , " --outfmt=" + value )[ value is not None and value != vdef ]'
            name: output_format
            command: false
            type: !mobyle/stringtype
                default: fasta
                options:
                - label: fasta
                  value: fasta
                - label: clustal
                  value: clustal
                - label: msf
                  value: msf
                - label: phylip
                  value: phylip
                - label: stockholm
                  value: stockholm
                - label: vienna
                  value: vienna
    - !mobyle/inputprogramparagraph
        comment: "By default, Clustal-Omega calculates (or reads in) a guide tree\
            \ and\n          performs a multiple alignment in the order specified\
            \ by this guide\n          tree. This alignment is then outputted. Clustal-Omega\
            \ can 'iterate'\n          its guide tree. The hope is that the (Kimura)\
            \ distances, that can be\n          derived from the initial alignment,\
            \ will give rise to a better guide\n          tree, and by extension,\
            \ to a better alignment. A similar rationale applies to HMM-iteration.\
            \ MSAs in general are very\n          'vulnerable' at their early stages.\
            \ Sequences that are aligned at an\n          early stage remain fixed\
            \ for the rest of the MSA. Another way of\n          putting this is:\
            \ 'once a gap, always a gap'. This behaviour can be\n          mitigated\
            \ by HMM iteration. An initial alignment is created and turned\n     \
            \     into a HMM. This HMM can help in a new round of MSA to 'anticipate'\n\
            \          where residues should align. This is using the HMM as an External\n\
            \          Profile and carrying out iterative EPA. In practice, individual\n\
            \          sequences and profiles are aligned to the External HMM, derived\
            \ after\n          the initial alignment. Pseudo-count information is\
            \ then transferred to\n          the (internal) HMM, corresponding to\
            \ the individual\n          sequence/profile. The now somewhat 'softened'\
            \ sequences/profiles are\n          then in turn aligned in the order\
            \ specified by the guide\n          tree. Pseudo-count transfer is reduced\
            \ with the size of the\n          profile. Individual sequences attain\
            \ the greatest\n          pseudo-count\n          transfer, larger profiles\
            \ less so. Pseudo-count transfer to profiles\n          larger than, say,\
            \ 10 is negligible. The effect of HMM iteration is\n          more pronounced\
            \ in larger test sets (that is, with more sequences). Both, HMM- and guide\
            \ tree-iteration come at a cost of increasing the\n          run-time.\
            \ One round of guide tree iteration adds on (roughly) the time\n     \
            \     it took to construct the initial alignment. If, for example, the\n\
            \          initial alignment took 1min, then it will take (roughly) 2min\
            \ to\n          iterate the guide tree once, 3min to iterate the guide\
            \ tree twice, and\n          so on. HMM-iteration is more costly, as each\
            \ round of iteration adds\n          three times the time required for\
            \ the alignment stage. For example, if\n          the initial alignment\
            \ took 1min, then each additional round of HMM\n          iteration will\
            \ add on 3min; so 4 iterations will take 13min\n          (=1min+4*3min).\
            \ The factor of 3 stems from the fact that at every\n          stage both\
            \ intermediate profiles have to be aligned with the\n          background\
            \ HMM, and finally the (softened) HMMs have to be aligned as\n       \
            \   well. All times are quoted for single processors. By default, guide\
            \ tree iteration and HMM-iteration are coupled. This\n          means,\
            \ at each iteration step both, guide tree and HMM, are\n          re-calculated.\
            \ This is invoked by setting the --iter flag. For\n          example,\
            \ if --iter=1, then first an initial alignment is produced\n         \
            \ (without external HMM background information and using k-tuple\n   \
            \       distances to calculate the guide tree). This initial alignment\
            \ is then\n          used to re-calculate a new guide tree (using Kimura\
            \ distances) and to\n          create a HMM. The new\n          guide\
            \ tree and the HMM are then used to produce\n          a new MSA. Iteration\
            \ of guide tree and HMM can be de-coupled. This means that the\n     \
            \     number of guide tree iterations and HMM iterations can be\n    \
            \      different. This can be done by combining the --iter flag with the\n\
            \          --max-guidetree-iterations and/or the --max-hmm-iterations\
            \ flag. The\n          number of guide tree iterations is the minimum\
            \ of --iter and\n          --max-guidetree-iterations, while the number\
            \ of HMM iterations is the\n          minimum of --iter and --max-hmm-iterations.\
            \ If, for example, HMM\n          iteration should be\n          performed\
            \ 5 times but guide tree iteration should\n          be performed only\
            \ 3 times, then one should set --iter=5 and\n          --max-guidetree-iterations=3.\
            \ All three flags can be specified at the\n          same time (however,\
            \ this makes no sense). It is not sufficient just to\n          specify\
            \ --max-guidetree-iterations and --max-hmm-iterations but not\n      \
            \    --iter. If any iteration is desired --iter has to be set."
        prompt: Iteration
        name: iteration
        children:
        # Optional integer. The format expression yields " --iter=<value>"
        # only when a value is present.
        - !mobyle/inputprogramparameter
            prompt: "Number of (combined guide-tree/HMM) iterations (--iter)"
            format: '( "" , " --iter="+str(value) )[ value is not None ]'
            name: iterations
            command: false
            type: !mobyle/integertype {}
        # Optional integer. The format expression yields
        # " --max-guidetree-iterations=<value>" only when a value is present.
        # Prompt wording fixed (missing "of") and aligned with the
        # "guide-tree" spelling used by the other prompts in this file.
        - !mobyle/inputprogramparameter
            prompt: Maximum number of guide-tree iterations (--max-guidetree-iterations)
            format: ( "" , " --max-guidetree-iterations="+str(value) )[ value is not
                None ]
            name: max_guidetree_iterations
            command: false
            type: !mobyle/integertype {}
        # Optional integer. The format expression yields
        # " --max-hmm-iterations=<value>" only when a value is present.
        - !mobyle/inputprogramparameter
            prompt: "Maximum number of HMM iterations (--max-hmm-iterations)"
            format: '( "" , " --max-hmm-iterations="+str(value) )[ value is not None ]'
            name: max_hmm_iterations
            command: false
            type: !mobyle/integertype {}
    - !mobyle/inputprogramparagraph
        prompt: Miscellaneous
        name: miscellaneous
        children:
        # Boolean switch (default false). " --auto " is selected by the format
        # expression only when the value is set and true. The help text below
        # is kept verbatim.
        - !mobyle/inputprogramparameter
            comment: "Users may feel unsure which options are appropriate in certain\n\
                \              situations even though using ClustalO without any special\
                \ options\n              should give you the desired results. The\
                \ --auto flag tries to\n              alleviate this problem and selects\
                \ accuracy/speed flags according to\n              the number of sequences.\
                \ For all cases will use mBed and thereby\n              possibly\
                \ overwrite the --full option. For more than 1,000 sequences\n   \
                \           the iteration is turned off as the effect of iteration\
                \ is more\n              noticeable for 'larger'\n              problems.\
                \ Otherwise iterations are set to 1 if\n              not already\
                \ set to a higher value by the user. Expert users may want\n     \
                \         to avoid this flag and exercise more fine tuned control\
                \ by selecting\n              the appropriate options manually."
            prompt: >-
                Set options automatically (might overwrite some of your options)
                (--auto)
            format: '( "" , " --auto ")[value is not None and value]'
            name: auto
            command: false
            type: !mobyle/booleantype
                default: false
        # Hidden parameter whose format is a constant string:
        # " -v --force --log=clustalO_log". The log file name is the same one
        # declared by the "logfile" output of this descriptor.
        - !mobyle/inputprogramparameter
            format: "\" -v --force --log=clustalO_log\""
            argpos: 100
            name: verbosity
            command: false
            hidden: true
            type: !mobyle/stringtype {}
outputs: !mobyle/outputparagraph
    children:
    # Alignment result, declared with output_type stdout and the file name
    # "clustalO-profile.out".
    - !mobyle/outputprogramparameter
        prompt: Multiple Sequence Alignment
        filenames: "\"clustalO-profile.out\""
        name: alignment_output
        output_type: stdout
        type: !mobyle/formattedtype
            data_terms:
            - 'EDAM_data:1384'
    # Log file output; the file name "clustalO_log" matches the --log value
    # set by the hidden "verbosity" input parameter.
    - !mobyle/outputprogramparameter
        prompt: Clustal omega log file
        filenames: "\"clustalO_log\""
        name: logfile
        type: !mobyle/formattedtype
            data_terms:
            - 'EDAM_data:2048'
    # Standard-error output, exposed under the file name
    # "clustalO-profile.err".
    # NOTE(review): unlike alignment_output, no output_type is declared here -
    # confirm whether one is expected for stderr capture.
    - !mobyle/outputprogramparameter
        prompt: Standard error
        filenames: "\"clustalO-profile.err\""
        name: stderr
        type: !mobyle/formattedtype
            data_terms:
            - 'EDAM_data:2048'
comment: "Use this interface to align two alignments (profiles) together. The columns\
    \ in each profile will be kept\n    fixed and the alignment of the two profiles\
    \ will be written\n    out."
# EDAM classification terms, the command name, and an empty environment mapping.
operations:
- 'EDAM_operation:0512'
topics:
- 'EDAM_topic:0080'
- 'EDAM_topic:0182'
command: clustalo
env: {}