-
Notifications
You must be signed in to change notification settings - Fork 18
/
train.py
195 lines (157 loc) · 6.22 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import logging
import os
from pathlib import Path
from time import sleep, time
import hydra
import yaml
from addict import Dict
from comet_ml import ExistingExperiment, Experiment
from omegaconf import OmegaConf
from climategan.trainer import Trainer
from climategan.utils import (
comet_kwargs,
copy_run_files,
env_to_path,
find_existing_training,
flatten_opts,
get_existing_comet_id,
get_git_branch,
get_git_revision_hash,
get_increased_path,
kill_job,
load_opts,
pprint,
)
# Install the default root handler, then only let ERROR (and above) through.
logging.basicConfig()
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.ERROR)

# Hydra configuration file, anchored at this script's resolved directory.
_script_dir = Path(__file__).resolve().parent
hydra_config_path = _script_dir.joinpath("shared", "trainer", "config.yaml")
# Location of this script, resolved once at import time. Hydra may switch the
# working directory before it calls `main`; a relative `__file__` evaluated
# after that point would no longer lead back to the repository.
_TRAIN_SCRIPT = Path(__file__).resolve()
_REPO_ROOT = _TRAIN_SCRIPT.parent


def _init_comet_experiment(opts, args, auto_resumed):
    """
    Create or resume the Comet experiment of this run and log its metadata.

    If ``opts.train.resume`` is set, the id returned by
    ``get_existing_comet_id(opts.output_path)`` is used to continue the previous
    experiment. When no id is found, or when not resuming, a new experiment is
    created and the ``climategan`` folder and this script are logged as assets.
    The note, the merged tags, the flattened opts and the auto-resume summary
    are then logged, and the experiment url is written to the path returned by
    ``get_increased_path(<output_path>/comet_url.txt)``.

    Side effects: ``opts.comet.tags`` is replaced by the merged tag list and
    ``auto_resumed`` receives a "continuing exp id" entry when an existing
    experiment is continued.

    Args:
        opts: options returned by ``load_opts``, with ``output_path`` and
            ``git_branch`` already set
        args: the ``args`` section popped from the Hydra configuration
        auto_resumed (dict): auto-resume information gathered so far

    Returns:
        The ``Experiment`` or ``ExistingExperiment`` that was set up.
    """
    exp = None

    if opts.train.resume:
        # Is resuming: get existing comet exp id
        assert Path(opts.output_path).exists(), "Output_path does not exist"

        comet_previous_id = get_existing_comet_id(opts.output_path)
        # Continue existing experiment
        if comet_previous_id is None:
            print("WARNING could not retreive previous comet id")
            print(f"from {opts.output_path}")
        else:
            print("Continuing previous experiment", comet_previous_id)
            auto_resumed["continuing exp id"] = comet_previous_id
            exp = ExistingExperiment(
                previous_experiment=comet_previous_id, **comet_kwargs
            )
            print("Comet Experiment resumed")

    if exp is None:
        # Create new experiment
        print("Starting new experiment")
        exp = Experiment(project_name="climategan", **comet_kwargs)
        exp.log_asset_folder(
            str(_REPO_ROOT / "climategan"),
            recursive=True,
            log_file_name=True,
        )
        exp.log_asset(str(_TRAIN_SCRIPT))

    # Log note
    if args.note:
        exp.log_parameter("note", args.note)

    # Merge and log tags
    if args.comet_tags or opts.comet.tags:
        tags = {f"branch:{opts.git_branch}"}
        if args.comet_tags:
            tags.update(args.comet_tags)
        if opts.comet.tags:
            tags.update(opts.comet.tags)
        opts.comet.tags = list(tags)
        print("Logging to comet.ml with tags", opts.comet.tags)
        exp.add_tags(opts.comet.tags)

    # Log all opts (after the tags were merged into them)
    exp.log_parameters(flatten_opts(opts))
    if auto_resumed:
        exp.log_text("\n".join(f"{k:20}: {v}" for k, v in auto_resumed.items()))

    # allow some time for comet to get its url
    sleep(1)

    # Save comet exp url
    url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
    with open(url_path, "w") as f:
        # NOTE(review): assumes the url can still be missing at this point;
        # write an empty file rather than abort the run on `write(None)`.
        f.write(exp.url or "")

    return exp


# requires hydra-core==0.11.3 and omegaconf==1.4.1
@hydra.main(config_path=hydra_config_path, strict=False)
def main(opts):
    """
    Prepare the run (opts, output path, Comet experiment) and train the model.

    Opts precedence:
        1. Load file specified in args.default (or shared/trainer/defaults.yaml
           if none is provided)
        2. Update with file specified in args.config (or no update if none is provided)
        3. Update with parsed command-line arguments

        e.g.
        `python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
        loads defaults, overrides with values in large-lr.yaml and sets batch_size to 10

    When train.resume is set on the command line, the opts.yaml found in
    output_path is used instead of args.config (a warning is printed and no
    config file is used if it does not exist).

    Args:
        opts: configuration handed over by the ``hydra.main`` decorator
    """

    # -----------------------------
    # -----  Parse arguments  -----
    # -----------------------------

    hydra_opts = Dict(OmegaConf.to_container(opts))
    # `pop` returns None when the config has no `args` section: fall back to an
    # empty Dict so that the attribute lookups below do not raise.
    args = hydra_opts.pop("args", None) or Dict()
    auto_resumed = {}

    # An unset or empty entry becomes None, the same "no config file" value the
    # resume branch below passes to load_opts.
    config_path = args.config or None

    if hydra_opts.train.resume:
        out_ = str(env_to_path(hydra_opts.output_path))
        config_path = Path(out_) / "opts.yaml"
        if not config_path.exists():
            config_path = None
            print(f"WARNING: could not reuse the opts in {out_}")

    default = args.default or _REPO_ROOT / "shared/trainer/defaults.yaml"

    # -----------------------
    # -----  Load opts  -----
    # -----------------------

    opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
    if args.resume:
        opts.train.resume = True

    opts.jobID = os.environ.get("SLURM_JOBID")
    opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
    opts.output_path = str(env_to_path(opts.output_path))
    print("Config output_path:", opts.output_path)

    # -------------------------------
    # -----  Check output_path  -----
    # -------------------------------

    # Auto-continue if same slurm job ID (=job was requeued)
    if not opts.train.resume and opts.train.auto_resume:
        print("\n\nTrying to auto-resume...")
        existing_path = find_existing_training(opts)
        if existing_path is not None and existing_path.exists():
            auto_resumed["original output_path"] = str(opts.output_path)
            auto_resumed["existing_path"] = str(existing_path)
            opts.train.resume = True
            opts.output_path = str(existing_path)

    # Still not resuming: creating new output path
    if not opts.train.resume:
        opts.output_path = str(get_increased_path(opts.output_path))
        Path(opts.output_path).mkdir(parents=True, exist_ok=True)

    # Copy the opts's sbatch_file to output_path
    copy_run_files(opts)
    # store git hash
    opts.git_hash = get_git_revision_hash()
    opts.git_branch = get_git_branch()

    # ----------------------------------
    # -----  Set Comet Experiment  -----
    # ----------------------------------

    # `exp` stays None when comet logging is disabled with args.no_comet
    exp = None
    if not args.no_comet:
        exp = _init_comet_experiment(opts, args, auto_resumed)

    # Save config file
    opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
    with opts_path.open("w") as f:
        yaml.safe_dump(opts.to_dict(), f)

    pprint("Running model in", opts.output_path)

    # -------------------
    # -----  Train  -----
    # -------------------

    trainer = Trainer(opts, comet_exp=exp, verbose=1)
    trainer.logger.time.start_time = time()
    trainer.setup()
    trainer.train()

    # -----------------------------
    # -----  End of training  -----
    # -----------------------------

    pprint("Done training")
    kill_job(opts.jobID)
if __name__ == "__main__":
    # Called without arguments: `main` is wrapped by `hydra.main`, which is
    # what passes the `opts` parameter in.
    main()