#########################################
# Parameters for running hotspot pipeline
#########################################
# MySQL connection information
MYSQL_HOST=karchin-db01
MYSQL_USER=collin
MYSQL_DB=mupit_modbase
# placeholder password; override on the command line
MYSQL_PASSWD=YourPASSWORD
# directory containing output files
OUTPUT_DIR=output/all_pdb_run/
# q-value threshold for significance
Q_VALUE=.01
# number of simulations
NUM_SIM=10000
# sphere radius for residue neighbors
RADIUS=10.0
# annotation input for pdb files
PDB_INFO=data/fully_described_pdb_info.txt
# mutation file from mupit
MUT_FILE=data/mutations.txt
# directory containing PDB_INFO and MUT_FILE
# split into pieces for parallelization
SPLIT_DIR=data/split_pdbs/
# temp data files
pdb_info_init=data/pdb_info.txt
TEMP_DIR=tmp/
# grouping function used by multiple testing correction (-f flag below)
GROUP_FUNC=min
# script that fetches the PDB information file
PDB_SQL_SCRIPT=scripts/sql/get_pdb_info.sql
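# Any parameter above can be overridden at invocation instead of being
# edited in place; the values below are hypothetical:
#
#   make prepareHotspotInput MYSQL_USER=alice MYSQL_PASSWD=secret OUTPUT_DIR=output/my_run/
#
# Command-line assignments take precedence over the defaults in this file.
# All pipeline steps below are phony targets (they name steps, not files):
.PHONY: mapMafToStructure prepMupitAnnotationMaf prepareMutationsTableMaf \
	loadMupitMutations updateMupitMutations prepMutations prepMutationsUpdate \
	prepMutationsNoLoad getPDBInfo getMutations getPDBPath getPDBDescription \
	splitInputFiles prepareHotspotInput annotateStructures runParallelHotspot \
	runNormalHotspot mergeHotspotFiles multipleTestCorrect findHotregionStruct \
	findHotregionGene makeBlackList hotspotToMupitTable loadMupitCluster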
##################################################
# Directories containing mutations and their
# annotations
##################################################
# mutation-count threshold above which a sample is treated as hypermutated
HYPERMUT=500
MUT_DIR=data/mutations
MUT_REGEX='^input.+\.maf' # regex to recognize MAF files
# Directory for merged annotation info
MUPIT_ANNOTATION_DIR=data/annotation/mupit_annotations/
###################################
# Prepare mutations from MAF file
###################################
# Maps mutations to structure
mapMafToStructure:
	mkdir -p ${MUT_DIR}
	python scripts/mupit/map_maf_to_structure.py \
		--data-dir ${MUT_DIR} \
		--match-regex ${MUT_REGEX} \
		--host ${MYSQL_HOST} \
		--db ${MYSQL_DB} \
		--mysql-user ${MYSQL_USER} \
		--mysql-passwd ${MYSQL_PASSWD} \
		--output-dir ${MUT_DIR}
# prepare file relating protein structure to genomic mapping
prepMupitAnnotationMaf:
	mkdir -p ${MUPIT_ANNOTATION_DIR}
	# extract the tumor type from each file name (second dot-delimited field)
	for ttype_file in `ls ${MUT_DIR}/ | egrep ${MUT_REGEX}` ; do \
		echo $$ttype_file ; \
		ttype=`basename $$ttype_file | awk -F"." '{print $$2}'` ; \
		python scripts/maf/convert_maf_to_mupit.py \
			--maf ${MUT_DIR}/$$ttype_file \
			-mh ${MYSQL_HOST} \
			-mdb ${MYSQL_DB} \
			--mysql-user ${MYSQL_USER} \
			--mysql-passwd ${MYSQL_PASSWD} \
			--tumor-type $$ttype \
			--no-stratify \
			-mt ${HYPERMUT} \
			-i data/ \
			--output ${MUPIT_ANNOTATION_DIR}/mupit_mutations_$$ttype ; \
	done
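# Given the default MUT_REGEX, input files are expected to be named like
# input.<ttype>.maf; for example (hypothetical tumor type):
#
#   data/mutations/input.BRCA.maf  ->  ttype=BRCA
#       ->  ${MUPIT_ANNOTATION_DIR}/mupit_mutations_BRCA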
# filter mappings from mupit and create mutations table
prepareMutationsTableMaf:
	mkdir -p ${MUT_DIR}
	python scripts/mupit/filter_hypermutated.py \
		--raw-dir ${MUT_DIR} \
		--match-regex ${MUT_REGEX} \
		--mut-threshold ${HYPERMUT} \
		--sample-col Tumor_Sample_Barcode \
		--data-dir ${MUT_DIR}
	python scripts/mupit/count_mutations.py \
		--data-dir ${MUT_DIR}
	python scripts/mupit/format_mutations_table.py \
		--data-dir ${MUT_DIR}
	python scripts/mupit/merge_mutations_table_data.py ${MUT_DIR}
## Load changes into the MuPIT MySQL database
# load the mutations into the MuPIT MySQL db;
# this drops the table and reloads a completely new
# set of mutations
loadMupitMutations:
	python scripts/mupit/load_mutations_table.py \
		-m ${MUT_DIR}/mysql.mutations.tcga.txt \
		--host ${MYSQL_HOST} \
		--mysql-user ${MYSQL_USER} \
		--mysql-passwd ${MYSQL_PASSWD} \
		--db ${MYSQL_DB}
# update the mutations in the MuPIT MySQL db;
# this only removes mutations for the tumor types present in the
# input set of mutations
updateMupitMutations:
	python scripts/mupit/load_mutations_table.py \
		-m ${MUT_DIR}/mysql.mutations.tcga.txt \
		--update-table \
		--host ${MYSQL_HOST} \
		--mysql-user ${MYSQL_USER} \
		--mysql-passwd ${MYSQL_PASSWD} \
		--db ${MYSQL_DB}
# run all the steps:
# complete reload of the mutations table
prepMutations: mapMafToStructure prepMupitAnnotationMaf prepareMutationsTableMaf loadMupitMutations
# only reload the relevant tumor types
prepMutationsUpdate: mapMafToStructure prepMupitAnnotationMaf prepareMutationsTableMaf updateMupitMutations
# only create a mutation file; don't touch MySQL
prepMutationsNoLoad: mapMafToStructure prepMupitAnnotationMaf prepareMutationsTableMaf
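# Note: the aggregate targets above rely on make's default serial
# execution. The steps have an implied order (map -> annotate -> table
# -> load) that is not encoded as prerequisites between the steps
# themselves, so avoid running these targets with -j (parallel make).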
##################################
# Prepare input files for hot spot
# detection
##################################
# Get info about PDB, chain, and gene names
# important for running on known structures
getPDBInfo:
	mkdir -p data
	mysql -u ${MYSQL_USER} -A -p -h ${MYSQL_HOST} ${MYSQL_DB} < ${PDB_SQL_SCRIPT} > ${pdb_info_init}
# get mutations from mupit database
getMutations:
	mysql -u ${MYSQL_USER} -A -p -h ${MYSQL_HOST} ${MYSQL_DB} < scripts/sql/get_mutations.sql > ${MUT_FILE}
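# Note: with a bare -p, the mysql client prompts for the password
# interactively. To run the two targets above unattended, a hypothetical
# alternative is -p'${MYSQL_PASSWD}' (no space after -p), at the cost of
# the password appearing in the process list.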
# add file path information for pdb files
getPDBPath:
	python scripts/add_path_info.py -p ${pdb_info_init} -o data/pdb_info.path.txt
# get the chain description for the PDB files
getPDBDescription:
	python scripts/chain_description.py -i data/pdb_info.path.txt -o ${PDB_INFO}
# split input files for parallelization
splitInputFiles:
	python scripts/divide_pdb_info.py \
		-f ${PDB_INFO} \
		-m ${MUT_FILE} \
		--split-dir ${SPLIT_DIR}
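# The pieces written to SPLIT_DIR are what runParallelHotspot below hands
# to the cluster (presumably one piece per cluster task).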
# Run all commands for preparing input for hot spot detection code
prepareHotspotInput: getPDBInfo getMutations getPDBPath getPDBDescription splitInputFiles
# same, but skip the MySQL export steps (getPDBInfo, getMutations)
annotateStructures: getPDBPath getPDBDescription splitInputFiles
#####################################
# Run hotspot code
#####################################
# run the 3D hotspot code in parallel on the cluster
runParallelHotspot:
	# create output directories if needed
	mkdir -p ${OUTPUT_DIR}
	mkdir -p ${OUTPUT_DIR}/data/hotspot/full_output
	mkdir -p ${OUTPUT_DIR}/data/hotspot/residues
	mkdir -p ${OUTPUT_DIR}/error
	# run hotspot code in parallel
	qsub -N PDB2HOTSPOT -v PATH=$$PATH scripts/qsub/run_parallel_hotspot.sh ${SPLIT_DIR} ${NUM_SIM} ${RADIUS} ${OUTPUT_DIR}
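# Note: qsub here implies a Sun Grid Engine (or compatible) scheduler;
# -N names the job and -v PATH=$$PATH forwards the caller's PATH into the
# job environment. On a SLURM cluster the rough equivalent would be,
# hypothetically:
#
#   sbatch --job-name=PDB2HOTSPOT --export=PATH scripts/qsub/run_parallel_hotspot.sh ...
#
# (the submission script itself would also need porting).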
# run the hotspot code serially, without the cluster
runNormalHotspot:
	mkdir -p ${OUTPUT_DIR}
	mkdir -p ${OUTPUT_DIR}/data/hotspot/full_output
	mkdir -p ${OUTPUT_DIR}/data/hotspot/residues
	mkdir -p ${OUTPUT_DIR}/error
	# $(notdir ...) keeps only the file name of PDB_INFO (a path),
	# so the error log lands inside error/ rather than a missing subdir
	python hotspot.py --log-level=INFO \
		-m ${MUT_FILE} \
		-a ${PDB_INFO} \
		-t EVERY \
		-n ${NUM_SIM} \
		-r ${RADIUS} \
		-o ${OUTPUT_DIR}/output_merged.txt \
		-e ${OUTPUT_DIR}/error/error_pdb_$(notdir ${PDB_INFO}) \
		--log=stdout
# merge all per-piece hotspot output files together
mergeHotspotFiles:
	rm -f ${OUTPUT_DIR}/output_merged.txt
	# keep the header from the first file only: NR==1 passes the very
	# first line; every later "Structure" header line is filtered out
	cat ${OUTPUT_DIR}/data/hotspot/full_output/output_* | awk -F"\t" 'NR==1 || $$1!="Structure"' > ${OUTPUT_DIR}/output_merged.txt
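# For example, merging two pieces (column names beyond "Structure" are
# whatever hotspot.py emits):
#
#   output_1:  Structure <TAB> ...   + data rows
#   output_2:  Structure <TAB> ...   + data rows
#
# yields a single file with one "Structure" header row on top.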
# Multiple testing correction
#
# NOTE: the annotation results from CRAVAT
# are needed, so please run the commands in the next
# section before doing multiple testing correction.
multipleTestCorrect:
	python multiple_testing_correction.py \
		-i ${OUTPUT_DIR}/output_merged.txt \
		-f ${GROUP_FUNC} \
		-m ${MUPIT_ANNOTATION_DIR} \
		-q ${Q_VALUE} \
		-o ${OUTPUT_DIR}/mtc_output_${GROUP_FUNC}_${Q_VALUE}.txt \
		-s ${OUTPUT_DIR}/significance_level_${Q_VALUE}.txt
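# A typical ordering for this section (hypothetical session; the CRAVAT
# annotations referenced above are assumed to already exist):
#
#   make runParallelHotspot        # or: make runNormalHotspot
#   make mergeHotspotFiles
#   make multipleTestCorrect
#   make findHotregionStruct findHotregionGene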
# find hotspot regions (i.e. collections of residues) in structures
findHotregionStruct:
	python find_hotspot_regions_struct.py \
		-i ${OUTPUT_DIR}/output_merged.txt \
		-a ${MUPIT_ANNOTATION_DIR} \
		-p ${PDB_INFO} \
		-r ${RADIUS} \
		-o ${OUTPUT_DIR}/hotspot_regions_structure_${Q_VALUE}.txt \
		-s ${OUTPUT_DIR}/significance_level_${Q_VALUE}.txt \
		--log=stdout
# find hotspot regions (i.e. collections of residues) per gene
# rather than per single structure
findHotregionGene:
	# -m must match the -o written by multipleTestCorrect above
	python find_hotspot_regions_gene.py \
		-m ${OUTPUT_DIR}/mtc_output_${GROUP_FUNC}_${Q_VALUE}.txt \
		-a ${MUPIT_ANNOTATION_DIR} \
		-p ${PDB_INFO} \
		-r ${RADIUS} \
		-q ${Q_VALUE} \
		-o ${OUTPUT_DIR}/hotspot_regions_gene_${Q_VALUE}.txt \
		--log=stdout
###############################
# Commands to create the files
# to load into the MuPIT MySQL DB
###############################
# find residues with multiple mappings within
# a single protein structure; creates a blacklist
# so mupit can avoid them
makeBlackList:
	mkdir -p ${OUTPUT_DIR}/qc
	python scripts/mupit/make_black_list.py \
		-a ${MUPIT_ANNOTATION_DIR} \
		-o ${OUTPUT_DIR}/qc/structure_level_residue_black_list.txt \
		--structure
# This command should be run after
# the findHotregionStruct command
hotspotToMupitTable:
	python scripts/mupit/make_mupit_cluster_tables.py \
		-r ${OUTPUT_DIR}/hotspot_regions_structure_${Q_VALUE}.txt \
		-reg ${OUTPUT_DIR}/mupit_table_regions_${Q_VALUE}.txt \
		-res ${OUTPUT_DIR}/mupit_table_residues_${Q_VALUE}.txt
	# optional blacklist filter, disabled here; run makeBlackList first,
	# then append this flag to the command above:
	#   -b ${OUTPUT_DIR}/qc/structure_level_residue_black_list.txt
# load the hotspots into MuPIT
loadMupitCluster:
	python scripts/mupit/load_cluster_tables.py \
		-c ${OUTPUT_DIR}/mupit_table_regions_${Q_VALUE}.txt \
		-r ${OUTPUT_DIR}/mupit_table_residues_${Q_VALUE}.txt \
		--mysql-user ${MYSQL_USER} \
		--mysql-passwd ${MYSQL_PASSWD} \
		-u \
		--db=${MYSQL_DB}
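# End-to-end sketch of the whole pipeline (hypothetical session; the
# CRAVAT annotation step referenced earlier happens outside this Makefile):
#
#   make prepMutations              # map MAFs, build and load the mutations table
#   make prepareHotspotInput        # export PDB/mutation info and split for the cluster
#   make runParallelHotspot         # or: make runNormalHotspot
#   make mergeHotspotFiles multipleTestCorrect
#   make findHotregionStruct findHotregionGene
#   make makeBlackList hotspotToMupitTable loadMupitCluster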