<!DOCTYPE html>
<html class="theme-next mist use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
<meta name="google-site-verification" content="zu-9nWphPjrzXV8v514mkHknIz4dNfHlib56-KNAu44">
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="theme-color" content="#222">
<script src="/lib/pace/pace.min.js?v=1.0.2"></script>
<link href="/lib/pace/pace-theme-flash.min.css?v=1.0.2" rel="stylesheet">
<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
<script>
(function(i,s,o,g,r,a,m){i["DaoVoiceObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;a.charset="utf-8";m.parentNode.insertBefore(a,m)})(window,document,"script",('https:' == document.location.protocol ? 'https:' : 'http:') + "//widget.daovoice.io/widget/356f1943.js","daovoice")
daovoice('init', {
app_id: "356f1943"
});
daovoice('update');
</script>
<link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">
<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">
<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32png?v=5.1.4">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16.png?v=5.1.4">
<link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">
<meta name="keywords" content="rl,">
<link rel="alternate" href="/atom.xml" title="Keavnn'Blog" type="application/atom+xml">
<script>
(function(){
if(''){
if (prompt('请输入文章密码','') !== ''){
alert('密码错误!');
history.back();
}
}
})();
</script>
<meta name="description" content="本文介绍了一个“事后诸葛亮”的经验池机制,简称为HER,它可以很好地应用于稀疏奖励和二分奖励的问题中,不需要复杂的奖励函数工程设计。 推荐: 稀疏奖励问题的一种解决方案 通俗易懂">
<meta name="keywords" content="rl">
<meta property="og:type" content="article">
<meta property="og:title" content="Hindsight Experience Replay">
<meta property="og:url" content="http://StepNeverStop.github.io/Hindsight-Experience-Replay.html">
<meta property="og:site_name" content="Keavnn'Blog">
<meta property="og:description" content="本文介绍了一个“事后诸葛亮”的经验池机制,简称为HER,它可以很好地应用于稀疏奖励和二分奖励的问题中,不需要复杂的奖励函数工程设计。 推荐: 稀疏奖励问题的一种解决方案 通俗易懂">
<meta property="og:locale" content="zh-Hans">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/hindsight.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/Her.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/pseudo.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/tasks.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/finalvsfuture.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/singlegoal.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/rewardshape.png">
<meta property="og:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/fourmodel.png">
<meta property="og:updated_time" content="2019-05-30T09:52:24.494Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Hindsight Experience Replay">
<meta name="twitter:description" content="本文介绍了一个“事后诸葛亮”的经验池机制,简称为HER,它可以很好地应用于稀疏奖励和二分奖励的问题中,不需要复杂的奖励函数工程设计。 推荐: 稀疏奖励问题的一种解决方案 通俗易懂">
<meta name="twitter:image" content="http://stepneverstop.github.io/Hindsight-Experience-Replay/hindsight.png">
<script type="text/javascript" id="hexo.configurations">
var NexT = window.NexT || {};
var CONFIG = {
root: '/',
scheme: 'Mist',
version: '5.1.4',
sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":true,"onmobile":true},
fancybox: true,
tabs: true,
motion: {"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
duoshuo: {
userId: '0',
author: '博主'
},
algolia: {
applicationID: '',
apiKey: '',
indexName: '',
hits: {"per_page":10},
labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
}
};
</script>
<link rel="canonical" href="http://StepNeverStop.github.io/Hindsight-Experience-Replay.html">
<title>Hindsight Experience Replay | Keavnn'Blog</title>
</head>
<body itemscope="" itemtype="http://schema.org/WebPage" lang="zh-Hans">
<div class="container sidebar-position-left page-post-detail">
<div class="headband"></div>
<a href="https://github.com/StepNeverStop" class="github-corner" aria-label="View source on GitHub" rel="external nofollow" target="_blank"><svg width="80" height="80" viewbox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"/><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"/><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"/></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
<header id="header" class="header" itemscope="" itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-wrapper">
<div class="site-meta ">
<div class="custom-logo-site-title">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<span class="site-title">Keavnn'Blog</span>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<h1 class="site-subtitle" itemprop="description">If it is to be, it is up to me.</h1>
</div>
<div class="site-nav-toggle">
<button>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
</button>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section">
<i class="menu-item-icon fa fa-fw fa-home"></i> <br>
首页
</a>
</li>
<li class="menu-item menu-item-about">
<a href="/about/" rel="section">
<i class="menu-item-icon fa fa-fw fa-user"></i> <br>
关于
</a>
</li>
<li class="menu-item menu-item-tags">
<a href="/tags/" rel="section">
<i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
标签
</a>
</li>
<li class="menu-item menu-item-categories">
<a href="/categories/" rel="section">
<i class="menu-item-icon fa fa-fw fa-th"></i> <br>
分类
</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section">
<i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
归档
</a>
</li>
<li class="menu-item menu-item-search">
<a href="javascript:;" class="popup-trigger">
<i class="menu-item-icon fa fa-search fa-fw"></i> <br>
搜索
</a>
</li>
</ul>
<div class="site-search">
<div class="popup search-popup local-search-popup">
<div class="local-search-header clearfix">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<span class="popup-btn-close">
<i class="fa fa-times-circle"></i>
</span>
<div class="local-search-input-wrapper">
<input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input">
</div>
</div>
<div id="local-search-result"></div>
</div>
</div>
</nav>
</div>
</header>
<main id="main" class="main">
<div class="main-inner">
<div class="content-wrap">
<div id="content" class="content">
<div id="posts" class="posts-expand">
<article class="post post-type-normal" itemscope="" itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://StepNeverStop.github.io/Hindsight-Experience-Replay.html">
<span hidden itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<meta itemprop="name" content="Keavnn">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/Kicon.jpg">
</span>
<span hidden itemprop="publisher" itemscope="" itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Keavnn'Blog">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">Hindsight Experience Replay</h2>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2019-05-28T18:38:56+08:00">
2019-05-28
</time>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-calendar-check-o"></i>
</span>
<span class="post-meta-item-text">更新于:</span>
<time title="更新于" itemprop="dateModified" datetime="2019-05-30T17:52:24+08:00">
2019-05-30
</time>
</span>
<span class="post-category">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope="" itemtype="http://schema.org/Thing">
<a href="/categories/ReinforcementLearning/" itemprop="url" rel="index">
<span itemprop="name">ReinforcementLearning</span>
</a>
</span>
</span>
<div class="post-wordcount">
<span class="post-meta-item-icon">
<i class="fa fa-file-word-o"></i>
</span>
<span class="post-meta-item-text">字数统计:</span>
<span title="字数统计">
3.4k
</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-clock-o"></i>
</span>
<span class="post-meta-item-text">阅读时长 ≈</span>
<span title="阅读时长">
14
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<p>This post introduces a "hindsight" replay-buffer mechanism, <strong>HER</strong> for short, which works well on <strong>sparse-reward</strong> and <strong>binary-reward</strong> problems without any elaborate reward-function engineering.</p>
<p>Recommended:</p>
<ul>
<li>One solution to the sparse-reward problem</li>
<li>Clear and easy to understand</li>
</ul>
<a id="more"></a>
<h1 id="简介"><a href="#简介" class="headerlink" title="简介"></a>简介</h1><p>论文地址:<a href="https://papers.nips.cc/paper/7090-hindsight-experience-replay.pdf" rel="external nofollow" target="_blank">https://papers.nips.cc/paper/7090-hindsight-experience-replay.pdf</a></p>
<blockquote>
<p>Dealing with sparse rewards is one of the biggest challenges in Reinforcement Learning (RL). </p>
</blockquote>
<p>Dealing with sparse rewards is one of the thorniest problems in reinforcement learning.</p>
<p>The paper proposes a novel technique, Hindsight Experience Replay (HER), that enables sample-efficient learning from sparse and binary rewards and can be combined with <strong>any off-policy</strong> algorithm.</p>
<p><img src="./Hindsight-Experience-Replay/hindsight.png" alt=""></p>
<p>"Hindsight" means looking back after the fact. Given the sequential nature of decision making in RL, it is natural to guess that "after the fact" refers either to the moment after action a has been taken in state s, or to the moment after an episode has ended. Indeed, the paper's modification of the ordinary replay buffer uses exactly this notion.</p>
<blockquote>
<p>HER lets an agent learn from undesired outcomes and tackles the problem of sparse rewards in Reinforcement Learning (RL).——Zhao, R., & Tresp, V. (2018). Energy-Based Hindsight Experience Prioritization. <em>CoRL</em>.</p>
</blockquote>
<p>HER lets the agent learn from outcomes it did not intend to reach, tackling the sparse-reward problem in RL.</p>
<h2 id="二分奖励-binary-reward"><a href="#二分奖励-binary-reward" class="headerlink" title="Binary reward"></a>Binary reward</h2><p>In short, the reward takes one value when the goal is achieved and another value when it is not, for example (a minimal code sketch follows the list):</p>
<ul>
<li>$S_{T}=Goal,\ r=0$</li>
<li>$S \neq Goal,\ r=-1 \ \text{for} \ S \in \mathbb{S}$</li>
</ul>
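<p>A minimal Python sketch of this binary reward (the function name and the array representation are my own, not from the paper):</p>
<pre><code>import numpy as np

def binary_reward(state, goal):
    # r = -[s != g]: 0 once the goal is reached, -1 otherwise
    return 0.0 if np.array_equal(state, goal) else -1.0
</code></pre>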
<h2 id="稀疏奖励-sparse-reward"><a href="#稀疏奖励-sparse-reward" class="headerlink" title="稀疏奖励 sparse reward"></a>稀疏奖励 sparse reward</h2><p>简言之,完成目标的episode太少或者完成目标的步数太长,导致负奖励的样本数过多</p>
<h1 id="文中精要"><a href="#文中精要" class="headerlink" title="文中精要"></a>文中精要</h1><p>在机器人领域,要想使强化学习训练它完美执行某任务,往往需要设计合理的奖励函数,但是设计这样的奖励函数工程师不仅需要懂得强化学习的领域知识,也需要懂得机器人、运动学等领域的知识。而且,有这些知识也未必能设计出很好的奖励函数供智能体进行学习。因此,如果可以从简单的奖励函数(如二分奖励)学习到可完成任务的模型,那就不需要费心设计复杂的奖励函数了。</p>
<p>文中介绍了一个例子来引入HER:</p>
<ul>
<li>Name: bit-flipping environment</li>
<li>State space $\mathcal{S}=\left \{ 0,1 \right \}^{n}$</li>
<li>Action space $\mathcal{A}=\left \{ 0,1,\cdots,n-1 \right \}$</li>
<li>Rules: for every episode, uniformly sample an initial state $s_{0}$ of length $n$ (e.g. $n=5,s_{0}=10101$) and a goal state $s_{g}$; at every step pick an action $a$ from the action space and flip the bit at position $a$ of the current state, e.g. $a=1\Rightarrow s_{1}=11101$; the episode ends when the step budget runs out or the flipped state equals $s_{g}$</li>
<li>Reward function: $r_{g}(s,a)=-\left [ s \neq g \right ]$, i.e. 0 once the goal state is reached and -1 otherwise. This is easy to read off: $s \neq g \Rightarrow true \doteq 1$, $s = g \Rightarrow false \doteq 0$</li>
</ul>
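<p>A minimal sketch of the bit-flipping environment described above (class and method names are my own; only the rules and reward come from the paper):</p>
<pre><code>import numpy as np

class BitFlipEnv:
    """Flip one bit per step; reward is -[s != g]."""

    def __init__(self, n=5):
        self.n = n

    def reset(self):
        # uniformly sample an initial state s_0 and a goal state s_g
        self.state = np.random.randint(0, 2, size=self.n)
        self.goal = np.random.randint(0, 2, size=self.n)
        return self.state.copy(), self.goal.copy()

    def step(self, action):
        # flip the bit at position `action`
        self.state[action] = 1 - self.state[action]
        done = np.array_equal(self.state, self.goal)
        reward = 0.0 if done else -1.0
        return self.state.copy(), reward, done
</code></pre>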
<p><em>Note: unless otherwise stated, $g$ below denotes the goal state $s_{g}$.</em></p>
<blockquote>
<p>Standard RL algorithms are bound to fail in this environment for n > 40 because they will never experience any reward other than -1. Notice that using techniques for improving exploration (e.g. VIME (Houthooft et al., 2016), count-based exploration (Ostrovski et al., 2017) or bootstrapped DQN (Osband et al., 2016)) does not help here because the real problem is not in lack of diversity of states being visited, rather it is simply impractical to explore such a large state space. </p>
</blockquote>
<p>Once the sequence length $n$ exceeds 40, standard RL algorithms cannot learn to solve this problem even with various exploration bonuses, because the real issue is not a lack of exploration: <strong>the state space is simply too large to explore exhaustively</strong>, rewards become extremely sparse, and the algorithm never sees the objective it is supposed to optimize.</p>
<p>To address this, the authors point out two approaches:</p>
<ol>
<li>Use a shaped reward (in short, make the reward a function of some quantities, e.g. $r_{g}(s,a)=-\left \| s-g \right \|^{2}$, the negative squared Euclidean distance between the current state and the goal), which gradually steers the algorithm toward regions of the decision space with higher reward. This can be hard to apply to complex problems, though.</li>
<li>Use HER, the hindsight experience replay mechanism</li>
</ol>
<h2 id="HER"><a href="#HER" class="headerlink" title="HER"></a>HER</h2><blockquote>
<p>The pivotal idea behind our approach is to re-examine this trajectory with a different goal — while this trajectory may not help us learn how to achieve the state g, it definitely tells us something about how to achieve the state $s_{T}$ .</p>
</blockquote>
<p>The core idea of HER: <strong>why must we judge a trajectory only against the goal we originally set? Suppose we want an agent to learn to move to some position; is an episode in which it fails to reach that position simply a failure? For a trajectory $s_{0},s_{1},s_{2}, \cdots ,s_{T}$ with goal $g$, we can flip the perspective: had we set the goal to $g=s_{T}$ before the episode started, the agent would have achieved its goal after all.</strong></p>
<p><img src="./Hindsight-Experience-Replay/Her.png" alt=""></p>
<p>HER applies this idea to augment the replay buffer, turning a sparse-reward problem into a non-sparse one and greatly increasing the amount of goal-achieving experience in the buffer.</p>
<p>Main features of HER:</p>
<ul>
<li>A conventional replay buffer stores the state $s$; HER stores $s||g$, i.e. <code>tf.concat(s,g)</code> (see the small sketch after this list)</li>
<li>The training algorithm likewise takes $s||g$ as input, i.e. the goal state of <strong>each episode</strong> is concatenated onto the current state, and the goal may differ between episodes</li>
<li>HER augments the buffer: besides the actually sampled transition/experience $\left ( s_{t}||g,a_{t},r_{t},s_{t+1}||g \right )$, at the end of each episode it <strong>relabels the goal</strong>, recomputes the corresponding reward (with a binary reward, only transitions where $s=g$ need their reward changed), and stores the "hindsight" experience ("if only that had been the goal!") $\left ( s_{t}||g’,a_{t},r_{t}’,s_{t+1}||g’ \right )$; see the pseudocode. How many hindsight copies are stored, and of which kind, is controlled by the hyperparameter $k$, explained below.</li>
<li>HER is best suited to multi-goal problems, i.e. the goal is not fixed and may differ from episode to episode; see the experiments section</li>
</ul>
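<p>The $s||g$ concatenation is nothing more than joining the two vectors before feeding the networks; a tiny sketch (NumPy here, though <code>tf.concat</code> works the same way):</p>
<pre><code>import numpy as np

def concat_state_goal(state, goal):
    # policy and value networks take the concatenated vector as input
    return np.concatenate([state, goal], axis=-1)

# e.g. two length-5 vectors become one length-10 network input
assert concat_state_goal(np.zeros(5), np.ones(5)).shape == (10,)
</code></pre>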
<p>HER comes in several goal-sampling variants (a sampling sketch follows the list):</p>
<blockquote>
<p>future — replay with k random states which come from the same episode as the transition being replayed and were observed after it,<br>episode — replay with k random states coming from the same episode as the transition being replayed,<br>random — replay with k random states encountered so far in the whole training procedure.</p>
</blockquote>
<ul>
<li>future: in a trajectory $s_{0},s_{1},s_{2},\cdots,s_{T}$, when the transition at $s_{2}$ is replayed, randomly pick $k$ states from $s_{3},\cdots,s_{T}$ as substitute goals $g’$ and, for each, store $\left ( s_{2}||g’,a_{2},r_{2}’,s_{3}||g’ \right )$ in the buffer. <strong>Characteristic: the later part of the same episode</strong></li>
<li>episode: in a trajectory $s_{0},s_{1},s_{2},\cdots,s_{T}$, when the transition at $s_{2}$ is replayed, randomly pick $k$ states from the whole trajectory as substitute goals $g’$ and store $\left ( s_{2}||g’,a_{2},r_{2}’,s_{3}||g’ \right )$ for each. <strong>Characteristic: anywhere in the same episode</strong></li>
<li>random: in a trajectory $s_{0},s_{1},s_{2},\cdots,s_{T}$, when the transition at $s_{2}$ is replayed, randomly pick $k$ states from several trajectories $\tau_{0},\tau_{1},\tau_{2},\cdots$ as substitute goals $g’$ and store $\left ( s_{2}||g’,a_{2},r_{2}’,s_{3}||g’ \right )$ for each. <strong>Characteristic: across multiple episodes</strong></li>
<li>final: in a trajectory $s_{0},s_{1},s_{2},\cdots,s_{T}$, when the transition at $s_{2}$ is replayed, directly set $g’=s_{T}$ and store $\left ( s_{2}||g’,a_{2},r_{2}’,s_{3}||g’ \right )$. <strong>Characteristic: the last state of the episode; if $k$ is set, $k$ identical experiences are stored</strong></li>
</ul>
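<p>A sketch of the four goal-sampling strategies for a recorded episode (the function signature and the episode representation are assumptions, not from the paper):</p>
<pre><code>import random

def sample_goals(episode_states, t, k, strategy, all_episodes=None):
    """Pick k substitute goals g' for the transition taken at time step t."""
    if strategy == 'final':
        return [episode_states[-1]] * k
    if strategy == 'future':
        pool = episode_states[t + 1:]      # later states of the same episode
    elif strategy == 'episode':
        pool = list(episode_states)        # anywhere in the same episode
    elif strategy == 'random':
        pool = [s for ep in all_episodes for s in ep]  # states from any episode so far
    else:
        raise ValueError(strategy)
    if not pool:                           # e.g. 'future' at the last time step
        return []
    return [random.choice(pool) for _ in range(k)]
</code></pre>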
<h2 id="伪代码"><a href="#伪代码" class="headerlink" title="伪代码"></a>伪代码</h2><p><img src="./Hindsight-Experience-Replay/pseudo.png" alt=""></p>
<p>解析:</p>
<ol>
<li>The pseudocode never mentions the hyperparameter $k$; in fact the loop $\textbf{for} \ g’ \in G \ \textbf{do}$ runs $k$ times</li>
<li>The $||$ operator is concatenation; in short, it joins two length-5 vectors into one length-10 vector</li>
<li>$G:=\mathbb{S}(\textbf{current episode})$ corresponds to the four goal-sampling strategies described above: future, episode, random, final.</li>
<li>The reward function $r(s,a,g)=-\left [ f_{g}(s)=0 \right ]$ is the $r_{g}(s,a)=-\left [ s \neq g \right ]$ from before, i.e. 0 on success and -1 otherwise; the concrete reward can be designed for the environment at hand</li>
<li>$a_{t} \leftarrow \pi_{b}(s_{t}||g)$ means the network input is the concatenation of the current state and the goal state</li>
</ol>
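<p>Putting the pieces together, a compact sketch of the replay-buffer filling loop from the pseudocode (all names are mine; the reward recomputation assumes the binary reward above):</p>
<pre><code>import numpy as np

def store_episode(buffer, episode, goal, k, strategy, reward_fn, sample_goals):
    """episode: list of (s, a, r, s_next) tuples collected while pursuing `goal`."""
    states = [tr[0] for tr in episode] + [episode[-1][3]]   # s_0 ... s_T
    for t, (s, a, r, s_next) in enumerate(episode):
        # standard experience, with the original goal concatenated
        buffer.append((np.concatenate([s, goal]), a, r,
                       np.concatenate([s_next, goal])))
        # hindsight experiences with substituted goals g'
        for g_new in sample_goals(states, t, k, strategy):
            r_new = reward_fn(s_next, g_new)   # recompute the reward under g'
            buffer.append((np.concatenate([s, g_new]), a, r_new,
                           np.concatenate([s_next, g_new])))
</code></pre>
<p>A real implementation would also track terminal flags and sample minibatches from <code>buffer</code> for the off-policy learner, but the control flow mirrors the pseudocode above.</p>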
<h2 id="HER的优点"><a href="#HER的优点" class="headerlink" title="HER的优点"></a>HER的优点</h2><ol>
<li>可解决稀疏奖励、二分奖励问题</li>
<li>可适用于所有的Off-Policy算法</li>
<li>提升了数据采样效率</li>
</ol>
<h1 id="实验部分"><a href="#实验部分" class="headerlink" title="实验部分"></a>实验部分</h1><p>文中实验结果:<a href="https://goo.gl/SMrQnI" rel="external nofollow" target="_blank">https://goo.gl/SMrQnI</a></p>
<p>实验部分的完整细节请参考论文原文。</p>
<h2 id="环境"><a href="#环境" class="headerlink" title="环境"></a>环境</h2><ul>
<li>7自由度机械臂</li>
<li>模拟环境:MuJoCo</li>
<li>任务分为3种<ul>
<li>Pushing,推:锁定机械臂的钳子,移动机械臂将物体推到目标点</li>
<li>Sliding,滑动:类似于冰球运动,锁定机械臂的钳子,移动机械臂给与物体一个力,使物体可以在较光滑的桌面上滑动并且达到目标位置</li>
<li>Pick-and-place,摆放:解锁钳子,使用机械臂夹起物体并移动至空中目标点</li>
</ul>
</li>
</ul>
<p><img src="./Hindsight-Experience-Replay/tasks.png" alt=""></p>
<h2 id="算法"><a href="#算法" class="headerlink" title="算法"></a>算法</h2><ul>
<li>DDPG</li>
<li>Adam优化器</li>
<li>多层感知机MLPs</li>
<li>ReLU激活函数</li>
<li>8核并行,更新参数后取平均</li>
<li>A-C网络都是3个隐藏层,每层64个隐节点,Actor输出层用tanh激活函数</li>
<li>经验池大小为$10^{6}$,折扣因子$\gamma=0.98$,学习率$\alpha=0.001$,探索因子$\epsilon = 0.2$</li>
</ul>
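<p>For reference, the reported settings gathered in one place (the dictionary layout is mine; the values are the ones listed above):</p>
<pre><code># DDPG + HER settings reported in the paper
DDPG_HER_CONFIG = dict(
    buffer_size=int(1e6),
    gamma=0.98,
    learning_rate=1e-3,
    epsilon=0.2,                    # random-action probability
    hidden_layers=(64, 64, 64),     # both actor and critic
    activation='relu',
    actor_output_activation='tanh',
    optimizer='adam',
    num_workers=8,                  # parameters averaged after updates
)
</code></pre>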
<blockquote>
<p>With probability 20% we sample (uniformly) a random action from the hypercube of valid actions. </p>
</blockquote>
<p>That is, DDPG uses a random exploration mechanism: with probability 20% the action is sampled uniformly from the hypercube of valid actions.</p>
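<p>A sketch of that exploration rule (the actor call and action bounds are placeholders; only the 20% uniform sampling comes from the paper):</p>
<pre><code>import numpy as np

def select_action(actor, state_goal, action_low, action_high, epsilon=0.2):
    if np.random.rand() > epsilon:
        return actor(state_goal)   # exploit: deterministic policy action
    # explore: sample uniformly from the hypercube of valid actions
    return np.random.uniform(low=action_low, high=action_high)
</code></pre>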
<h2 id="训练结果"><a href="#训练结果" class="headerlink" title="训练结果"></a>训练结果</h2><h3 id="final模式与future模式对比"><a href="#final模式与future模式对比" class="headerlink" title="final模式与future模式对比"></a>final模式与future模式对比</h3><p><img src="./Hindsight-Experience-Replay/finalvsfuture.png" alt=""></p>
<ul>
<li>Red curve: future strategy; blue: final strategy; green: DDPG with <a href="https://arxiv.org/pdf/1703.01310.pdf" rel="external nofollow" target="_blank">count-based</a> exploration; dark-red dashed: vanilla DDPG</li>
<li>From left to right: Pushing, Sliding, Pick-and-place</li>
<li>Hyperparameter $k=4$</li>
<li>In this experiment the goal state varies, i.e. it is a multi-goal setting</li>
</ul>
<p>Analysis:</p>
<ul>
<li>The future strategy works better than final</li>
<li>DDPG with count-based exploration can only make slight progress on the Sliding task</li>
<li>DDPG with HER is fully capable of solving all three tasks</li>
<li>This demonstrates that HER is the crucial ingredient that makes learning from sparse, binary rewards possible</li>
</ul>
<h3 id="单个目标状态的实验"><a href="#单个目标状态的实验" class="headerlink" title="单个目标状态的实验"></a>单个目标状态的实验</h3><p><img src="./Hindsight-Experience-Replay/singlegoal.png" alt=""></p>
<ul>
<li>Blue curve: DDPG with HER; the paper does not say which strategy is used, but <strong>presumably</strong> final, since final is used in all the examples before the experiments section</li>
<li>Green curve: DDPG with count-based exploration; dark-red dashed: vanilla DDPG</li>
<li>In this experiment every episode uses the same goal state $g$</li>
</ul>
<p>Analysis:</p>
<ul>
<li>DDPG+HER performs far better than vanilla DDPG</li>
<li><strong>Compared with the multi-goal experiment, DDPG trains faster when goals vary across episodes</strong>, so in practice it is better to train with multiple goals even if we only care about a single one</li>
</ul>
<h3 id="HER应用于reward-shaping问题中"><a href="#HER应用于reward-shaping问题中" class="headerlink" title="HER应用于reward shaping问题中"></a>HER应用于reward shaping问题中</h3><p>前文已经说过,reward shaping可以简单理解为将奖励函数设置为某些变量的函数,如$r_{g}(s,a)=-\left || s-g \right ||^{2}$,即奖励函数为当前状态与目标状态的欧氏距离的负数</p>
<p><img src="./Hindsight-Experience-Replay/rewardshape.png" alt=""></p>
<ul>
<li>Reward function: $r_{g}(s,a)=-\left \| s-g \right \|^{2}$ (sketched below)</li>
</ul>
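<p>The shaped reward used in this experiment, as a one-line sketch (negative squared Euclidean distance):</p>
<pre><code>import numpy as np

def shaped_reward(state, goal):
    # r_g(s, a) = -||s - g||^2
    diff = np.asarray(state) - np.asarray(goal)
    return -float(np.dot(diff, diff))
</code></pre>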
<p>Analysis:</p>
<ul>
<li><p>No matter which reward-shaping function is used, neither DDPG nor DDPG+HER can solve these tasks</p>
</li>
<li><p>The authors give two reasons:</p>
<ul>
<li><blockquote>
<p>There is a huge discrepancy between what we optimize (i.e. a shaped reward function) and the success condition (i.e.: is the object within some radius from the goal at the end of the episode); </p>
</blockquote>
<p>There is a huge mismatch between what is being optimized (the shaped reward, at every step) and the success condition (whether the object ends up within some radius of the goal at the end of the episode).</p>
</li>
<li><blockquote>
<p>Shaped rewards penalize for inappropriate behaviour (e.g. moving the box in a wrong direction) which may hinder exploration. It can cause the agent to learn not to touch the box at all if it can not manipulate it precisely and we noticed such behaviour in some of our experiments. </p>
</blockquote>
<p>Shaped rewards penalize undesired behaviour and thereby hinder exploration; the agent may even learn not to touch the box at all.</p>
</li>
</ul>
</li>
<li><blockquote>
<p>Our results suggest that domain-agnostic reward shaping does not work well (at least in the simple forms we have tried). Of course for every problem there exists a reward which makes it easy (Ng et al., 1999) but designing such shaped rewards requires a lot of domain knowledge and may in some cases not be much easier than directly scripting the policy. This strengthens our belief that learning from sparse, binary rewards is an important problem. </p>
</blockquote>
<p>The results suggest that domain-agnostic reward shaping does not work well.</p>
</li>
</ul>
<h3 id="四种模式比较"><a href="#四种模式比较" class="headerlink" title="四种模式比较"></a>四种模式比较</h3><p><img src="./Hindsight-Experience-Replay/fourmodel.png" alt=""></p>
<ul>
<li>Red: future; blue: final; green: episode; purple: random; dark-red dashed: vanilla DDPG</li>
<li>The x-axis is the hyperparameter $k$; in the top row of plots the y-axis is the highest score, in the bottom row the average score</li>
</ul>
<p>Analysis:</p>
<ul>
<li><p>Performance: future > final > episode > random > no HER</p>
</li>
<li><p>Stability: final (performs well) = no HER (performs poorly) > future > episode > random</p>
</li>
<li><p>future is the only strategy that can solve the Sliding task; it works best at $k=4$ or $k=8$</p>
</li>
<li><p>Increasing $k$ beyond 8 hurts performance somewhat, mainly because too large a $k$ leaves too small a fraction of original, real experience in the buffer</p>
</li>
<li><blockquote>
<p>It confirms that the most valuable goals for replay are the ones which are going to be achieved in the near future </p>
</blockquote>
<p>This confirms that the most valuable goals for replay are those that will be achieved in the near future.</p>
</li>
</ul>
<p><em>Note: building on the future strategy, the authors also tried a nearest-neighbour variant that sets $g’=s_{t+1}$; in their experiments it performed worse than future.</em></p>
</div>
<div>
<div>
<div style="text-align:center;color: #ccc;font-size:14px;">-------------本文结束<i class="fa fa-heart"></i>感谢您的阅读-------------</div>
</div>
</div>
<div>
<div class="my_post_copyright">
<script src="//cdn.bootcss.com/clipboard.js/1.5.10/clipboard.min.js"></script>
<!-- JS库 sweetalert 可修改路径 -->
<script src="https://cdn.bootcss.com/jquery/2.0.0/jquery.min.js"></script>
<script src="https://unpkg.com/sweetalert/dist/sweetalert.min.js"></script>
<p><span>本文标题:</span><a href="/Hindsight-Experience-Replay.html">Hindsight Experience Replay</a></p>
<p><span>文章作者:</span><a href="/" title="访问 Keavnn 的个人博客">Keavnn</a></p>
<p><span>发布时间:</span>2019年05月28日 - 18:05</p>
<p><span>最后更新:</span>2019年05月30日 - 17:05</p>
<p><span>原始链接:</span><a href="/Hindsight-Experience-Replay.html" title="Hindsight Experience Replay">http://StepNeverStop.github.io/Hindsight-Experience-Replay.html</a>
<span class="copy-path" title="点击复制文章链接"><i class="fa fa-clipboard" data-clipboard-text="http://StepNeverStop.github.io/Hindsight-Experience-Replay.html" aria-label="复制成功!"></i></span>
</p>
<p><span>许可协议:</span><i class="fa fa-creative-commons"></i> <a rel="external nofollow" href="https://creativecommons.org/licenses/by-nc-sa/4.0/" target="_blank" title="Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)">署名-非商业性使用-相同方式共享 4.0 国际</a> 转载请保留原文链接及作者。</p>
</div>
<script>
var clipboard = new Clipboard('.fa-clipboard');
$(".fa-clipboard").click(function(){
clipboard.on('success', function(){
swal({
title: "",
text: '复制成功',
icon: "success",
showConfirmButton: true
});
});
});
</script>
</div>
<div>
<div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
<div>如果您获得了帮助,也可以资助一下小的啦~</div>
<button id="rewardButton" disable="enable" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
<span>打赏啦</span>
</button>
<div id="QR" style="display: none;">
<div id="wechat" style="display: inline-block">
<img id="wechat_qr" src="/images/wechatpay.jpg" alt="Keavnn 微信">
<p>微信</p>
</div>
<div id="alipay" style="display: inline-block">
<img id="alipay_qr" src="/images/alipay.jpg" alt="Keavnn 支付宝">
<p>支付宝</p>
</div>
</div>
</div>
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/rl/" rel="tag"> <i class="fa fa-tag"></i> rl</a>
</div>
<div class="post-nav">
<div class="post-nav-next post-nav-item">
<a href="/Prioritized-Experience-Replay.html" rel="next" title="Prioritized Experience Replay">
<i class="fa fa-chevron-left"></i> Prioritized Experience Replay
</a>
</div>
<span class="post-nav-divider"></span>
<div class="post-nav-prev post-nav-item">
<a href="/energy-based-hindsight-experience-prioritization.html" rel="prev" title="Energy-Based Hindsight Experience Prioritization">
Energy-Based Hindsight Experience Prioritization <i class="fa fa-chevron-right"></i>
</a>
</div>
</div>
</footer>
</div>
</article>
<div class="post-spread">
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<div class="addthis_inline_share_toolbox">
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5cefbfc88c13b0e7" async="async"></script>
</div>
</div>
</div>
</div>
<div class="comments" id="comments">
<div id="lv-container" data-id="city" data-uid="MTAyMC80MTk0NS8xODQ5MQ=="></div>
</div>
</div>
<div class="sidebar-toggle">
<div class="sidebar-toggle-line-wrap">
<span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
</div>
</div>
<aside id="sidebar" class="sidebar">
<div id="sidebar-dimmer"></div>
<div class="sidebar-inner">
<ul class="sidebar-nav motion-element">
<li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
文章目录
</li>
<li class="sidebar-nav-overview" data-target="site-overview-wrap">
站点概览
</li>
</ul>
<section class="site-overview-wrap sidebar-panel">
<div class="site-overview">
<div class="site-author motion-element" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" src="/images/Kicon.jpg" alt="Keavnn">
<p class="site-author-name" itemprop="name">Keavnn</p>
<p class="site-description motion-element" itemprop="description">If it is to be, it is up to me.</p>
</div>
<nav class="site-state motion-element">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">51</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/index.html">
<span class="site-state-item-count">11</span>
<span class="site-state-item-name">分类</span>
</a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/index.html">
<span class="site-state-item-count">26</span>
<span class="site-state-item-name">标签</span>
</a>
</div>
</nav>
<div class="feed-link motion-element">
<a href="/atom.xml" rel="alternate">
<i class="fa fa-rss"></i>
RSS
</a>
</div>
<div class="links-of-author motion-element">
<span class="links-of-author-item">
<a href="https://github.com/StepNeverStop" target="_blank" title="GitHub" rel="external nofollow">
<i class="fa fa-fw fa-github"></i>GitHub</a>
</span>
<span class="links-of-author-item">
<a href="mailto:[email protected]" target="_blank" title="E-Mail" rel="external nofollow">
<i class="fa fa-fw fa-envelope"></i>E-Mail</a>
</span>
</div>
<div class="cc-license motion-element" itemprop="license">
<a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" target="_blank" rel="external nofollow">
<img src="/images/cc-by-nc-sa.svg" alt="Creative Commons">
</a>
</div>
<div class="links-of-blogroll motion-element links-of-blogroll-inline">
<div class="links-of-blogroll-title">
<i class="fa fa-fw fa-link"></i>
推荐阅读
</div>
<ul class="links-of-blogroll-list">
<li class="links-of-blogroll-item">
<a href="https://bluefisher.github.io" title="Fisher Chang" target="_blank" rel="external nofollow">Fisher Chang</a>
</li>
</ul>
</div>
</div>
</section>
<!--noindex-->
<section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
<div class="post-toc">
<div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#简介"><span class="nav-number">1.</span> <span class="nav-text">简介</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#二分奖励-binary-reward"><span class="nav-number">1.1.</span> <span class="nav-text">二分奖励 binary reward</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#稀疏奖励-sparse-reward"><span class="nav-number">1.2.</span> <span class="nav-text">稀疏奖励 sparse reward</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#文中精要"><span class="nav-number">2.</span> <span class="nav-text">文中精要</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#HER"><span class="nav-number">2.1.</span> <span class="nav-text">HER</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#伪代码"><span class="nav-number">2.2.</span> <span class="nav-text">伪代码</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#HER的优点"><span class="nav-number">2.3.</span> <span class="nav-text">HER的优点</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#实验部分"><span class="nav-number">3.</span> <span class="nav-text">实验部分</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#环境"><span class="nav-number">3.1.</span> <span class="nav-text">环境</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#算法"><span class="nav-number">3.2.</span> <span class="nav-text">算法</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#训练结果"><span class="nav-number">3.3.</span> <span class="nav-text">训练结果</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#final模式与future模式对比"><span class="nav-number">3.3.1.</span> <span class="nav-text">final模式与future模式对比</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#单个目标状态的实验"><span class="nav-number">3.3.2.</span> <span class="nav-text">单个目标状态的实验</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#HER应用于reward-shaping问题中"><span class="nav-number">3.3.3.</span> <span class="nav-text">HER应用于reward shaping问题中</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#四种模式比较"><span class="nav-number">3.3.4.</span> <span class="nav-text">四种模式比较</span></a></li></ol></li></ol></li></ol></div>
</div>
</section>
<!--/noindex-->
</div>
</aside>
</div>
</main>
<footer id="footer" class="footer">
<div class="footer-inner">
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<div class="copyright">© <span itemprop="copyrightYear">2020</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">Keavnn</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-area-chart"></i>
</span>
<span class="post-meta-item-text">Site words total count:</span>
<span title="Site words total count">80.3k</span>
</div>