-
Notifications
You must be signed in to change notification settings - Fork 3
/
loss.py
26 lines (22 loc) · 939 Bytes
/
loss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import theano
import theano.tensor as T
def policy_loss(values, a_probs, norm=True, entropy_coeff=.0):
    """Actor (policy-gradient) loss with an optional entropy bonus.

    The expected value under the current policy (``sum_a pi(a) * Q(a)``)
    is used as a baseline; the advantage is the per-action value minus
    that baseline. Gradients are blocked through the advantage so only
    ``a_probs`` receives policy-gradient updates.

    Args:
        values: per-action value estimates; assumed shaped
            (batch, n_actions) — TODO confirm against callers.
        a_probs: action probabilities, same shape as ``values``.
        norm: if True, divide the advantage by ``|baseline| + 1e-8``
            to make it scale-invariant.
        entropy_coeff: weight of the entropy regularization term.

    Returns:
        Symbolic actor loss (negated objective, averaged over the last
        axis).
    """
    # Baseline: policy-weighted expected value per row.
    baseline = T.sum(a_probs * values, axis=1, keepdims=True)
    advantage = values - baseline
    if norm:
        # Scale-normalize; epsilon guards against a zero baseline.
        advantage /= (T.abs_(baseline) + 1e-8)
    # Treat the advantage as a constant w.r.t. gradients.
    advantage = theano.gradient.disconnected_grad(advantage)
    weighted_objective = a_probs * advantage
    # Entropy of the policy; epsilon inside the log avoids log(0).
    entropy = -1. * T.sum(a_probs * T.log(a_probs + 1e-8), axis=1, keepdims=True)
    return -1. * T.mean(weighted_objective + entropy_coeff * entropy, axis=-1)
def value_softmax(values, a_probs, norm=True, norm_coeff=10):
    """Cross-entropy loss pulling the policy toward a softmax over values.

    Values are rescaled (either min-max normalized into [0.5, 1.0] or
    shifted by the row maximum), tempered by ``norm_coeff``, and passed
    through a softmax to form soft targets. The policy ``a_probs`` is
    then scored with categorical cross-entropy against those targets.

    Args:
        values: per-action value estimates; assumed shaped
            (batch, n_actions) — TODO confirm against callers.
        a_probs: action probabilities, same shape as ``values``.
        norm: if True, min-max normalize each row into [0.5, 1.0];
            otherwise subtract the row maximum (standard softmax shift).
        norm_coeff: temperature-like divisor applied before the softmax.

    Returns:
        Symbolic loss: mean categorical cross-entropy between
        ``a_probs`` and the value-derived target distribution.
    """
    row_max = T.max(values, axis=1, keepdims=True)
    if norm:
        row_min = T.min(values, axis=1, keepdims=True)
        # Epsilon keeps the denominator nonzero for constant rows.
        span = row_max - row_min + 1e-8
        scaled = 0.5 + (values - row_min) / 2. / span
    else:
        scaled = values - row_max
    # NOTE(review): Theano's categorical_crossentropy takes
    # (coding_dist, true_dist) — a_probs is scored against the targets.
    targets = T.nnet.softmax(scaled / norm_coeff)
    return T.mean(T.nnet.categorical_crossentropy(a_probs, targets), axis=-1)