forked from snakemake-workflows/single-cell-rna-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
121 lines (110 loc) · 4.67 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Path to the sheet describing meta-information for each cell.
# The first column has to contain the cell id; all other columns are optional.
# They are added as metadata to the SingleCellExperiment object created in the
# workflow.
cells: cells.tsv
counts:
  # Count table (rows: genes/transcripts/spikes, columns: cells).
  path: counts.tsv
  # Kind of feature identifier used in each row. Must be a term understood by
  # Ensembl BioMart, e.g. ensembl_gene_id or hgnc_symbol.
  feature_ids: ensembl_gene_id
  # BioMart host to use (e.g. www.ensembl.org, useast.ensembl.org, ...).
  biomart: useast.ensembl.org
# Variables to use for batch effect removal and as factors to be ignored in
# variance analysis. This should always include the two cell cycle phases
# (G1, G2M), which are determined automatically during analysis. In addition,
# you may add any column name of the cell sheet (see above).
model:
  # R formula representing known batch effects.
  design: "~ G1 + G2M + detection_rate"
  # Rely on spike-ins for variance model estimation.
  # This is usually not desired because spike-ins are few and noisy.
  # If this is set to false, endogenous genes are used instead, under the
  # assumption that most of them are not differentially expressed.
  use-spikes: false
  # Minimum value of average difference in true (biological) log2 expression
  # between any two cells.
  min-bio-comp: 0.5
  # FDR threshold for selecting highly variable genes/transcripts (HVGs).
  fdr: 0.05
  # Show the expression distribution of the top n HVGs.
  show-n: 20
  # Compute correlation for the top n HVGs.
  top-n: 200
# Expressions below this logcount are considered as dropouts
# (logcounts are log2 transformed, including 1 pseudocount, normalized,
# and batch effect corrected).
# This is currently only used for gene-vs-gene plots (see below).
dropout-threshold: 1
# Comment out to not assign cell types via signatures.
celltype:
  # Table describing markers for assignment of cell types.
  # Columns: name (cell type name), parent (parent cell type name),
  # genes (comma-separated list of gene names/ids, as listed in the
  # count matrix).
  # The parent column is usually empty. If it is not, assignment for that
  # type happens recursively, only on those cells that have been assigned to
  # the parent type.
  markers: resources/markers.tsv
  # Minimum gamma score for an assigned cell type (resembles a posterior) to
  # be considered as correctly assigned. Cells where the certainty of
  # cellassign does not pass this threshold will show as celltype=NA.
  min_gamma: 0.9
  # Genes for which to create expression plots stratified by cell type.
  # This can be used to find the right selection of marker genes for
  # cellassign.
  expression-plot-genes:
    - GBP1
    - MYCN
gene-vs-gene-plots:
  all-malignant:
    # Correlation of the given type to perform (pearson, spearman, ...).
    # Comment out to skip the correlation.
    correlation: spearman
    # Uncomment below to perform a regression with the given formula.
    # regression: "y ~ x"
    # Constrain to cells of the following types (comment out if not needed).
    constrain-celltypes:
      - Malignant
    # NOTE(review): presumably gene x is plotted against each gene listed
    # under y — confirm against the plotting rule.
    pairs:
      x: DDX58
      y:
        - IRF1
        - CCL5
        - CXCL10
# Comment out to not do differential expression analysis.
diffexp:
  # Add one entry per comparison here. The key below can be an arbitrary name.
  a-vs-b:
    # EdgeR design formula.
    # Refer to any colData from SingleCellExperiment here.
    # In addition, you can use celltype and detection_rate
    # (number of expressed genes in cell divided by total
    # number of genes in experiment).
    design: "~ test.condition"
    # Which coefficients of the model should be tested equal to zero.
    # E.g., 2 to test the first coefficient after the implicit intercept
    # (i.e., test.condition in the design above).
    coef: 2
    # False discovery rate to control for.
    fdr: 0.05
    # Optional: constrain to cell types (comment out to use all cell types).
    constrain-celltypes:
      celltypes:
        - Endothelial-cell
      # Optional: constrain cells to those with the given covariate occurring
      # in all cell types (uncomment if needed).
      # This can be used to avoid confounding of an important batch variable.
      # E.g., if you want the differential expression across cell types, and
      # sample is a batch variable to control for, you need to ensure that
      # each sample contains all considered cell types.
      # common: sample
    # Genes to plot.
    genes_of_interest:
      - MITF
      - MYCN
# NOTE(review): presumably the species of the analysed samples; the accepted
# values are not documented in this file — confirm against the workflow.
species: mouse
spike-ins:
  # Regular expression pattern used to detect spike-ins.
  pattern: "^ERCC"
filtering:
  # Remove all genes with a mean count less than the given threshold.
  min-avg-count: 1