-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
1001 lines (590 loc) · 122 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>今夕何夕</title>
<meta name="viewport" content="width=device-width, initial-scale=1,user-scalable=no">
<meta name="author" content="wendale">
<meta name="description" content="love it, love you,">
<meta property="og:type" content="website">
<meta property="og:title" content="今夕何夕">
<meta property="og:url" content="http://wendale.cn/index.html">
<meta property="og:site_name" content="今夕何夕">
<meta property="og:description" content="love it, love you,">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="今夕何夕">
<meta name="twitter:description" content="love it, love you,">
<link rel="alternative" href="/atom.xml" title="今夕何夕" type="application/atom+xml">
<link rel="icon" href="/img/favicon.ico">
<link rel="apple-touch-icon" href="/img/jacman.jpg">
<link rel="apple-touch-icon-precomposed" href="/img/jacman.jpg">
<link rel="stylesheet" href="/css/style.css" type="text/css">
</head>
<body>
<header>
<div>
<div id="imglogo">
<a href="/"><img src="/img/logo.png" alt="今夕何夕" title="今夕何夕"/></a>
</div>
<div id="textlogo">
<h1 class="site-name"><a href="/" title="今夕何夕">今夕何夕</a></h1>
<h2 class="blog-motto">love it, love life</h2>
</div>
<div class="navbar"><a class="navbutton navmobile" href="#" title="菜单">
</a></div>
<nav class="animated">
<ul>
<ul>
<li><a href="/">Home</a></li>
<li><a href="/archives">Archives</a></li>
<li><a href="/about">About</a></li>
<li>
<form class="search" action="//google.com/search" method="get" accept-charset="utf-8">
<label>Search</label>
<input type="search" id="search" name="q" autocomplete="off" maxlength="20" placeholder="搜索" />
<input type="hidden" name="q" value="site:wendale.cn">
</form>
</li>
</ul>
</nav>
</div>
</header>
<div id="container">
<div id="main">
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2016/04/24/种子/" title="种子" itemprop="url">种子</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2016-04-24T15:06:06.000Z" itemprop="datePublished"> 发表于 2016-04-24</time>
</p>
</header>
<div class="article-content">
<p>暮春<br>种子终于要重新破土而出<br>虽若隐若现<br>却毅力顽强<br>只是,我期盼的<br>并不是参天大树or绚彩夺目<br>是深深扎根于土壤,万千丛林中的一株灌木<br>纵享春夏<br>笑看秋冬</p>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
</div>
<div class="comments-count">
<span></span>
<a href="/2016/04/24/种子/#comments" class="ds-thread-count comments-count-link" data-thread-key="2016/04/24/种子/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/12/05/字符串相关的若干算法/" title="字符串相关的若干算法" itemprop="url">字符串相关的若干算法</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-12-05T13:31:17.000Z" itemprop="datePublished"> 发表于 2015-12-05</time>
</p>
</header>
<div class="article-content">
<h2 id="一-_kmp算法">一. kmp算法</h2><p>kmp算法的核心思想是next数组,了解next数组前先了解最长对称串。对于一个字符串s,我们定义: </p>
<ol>
<li>s的前缀,以s的首字符开头且不包含尾字符的子串集合。 </li>
<li>s的后缀,以s的尾字符结尾且不包含首字符的的子串集合。<br>最长对乘串:s的前缀集合与后缀集合交集中最长串。<br>例如:<br>s=’ababa’<br>s的前缀集合:{a, ab, aba, abab}<br>s的后缀集合: {baba, aba, ba, a}<br>可知s的最长对乘串为aba.<br>定义:<br>next[i]: s中以索引i为结尾的子串最长对乘串的长度。<br>下面求next[i]: <figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">cal_next</span><span class="params">(pattern)</span>:</span></span><br><span class="line"> next = [<span class="number">0</span>]</span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> range(<span class="number">1</span>, len(pattern)):</span><br><span class="line"> matched = next[i-<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">while</span> <span class="keyword">True</span>:</span><br><span class="line"> <span class="keyword">if</span> pattern[matched] == pattern[i]:</span><br><span class="line"> next.append(matched + <span class="number">1</span>)</span><br><span class="line"> <span class="keyword">break</span></span><br><span class="line"> <span class="keyword">if</span> matched == <span class="number">0</span>:</span><br><span class="line"> next.append(<span class="number">0</span>)</span><br><span class="line"> <span class="keyword">break</span></span><br><span class="line"> matched = next[matched]</span><br><span class="line"> <span class="keyword">return</span> next</span><br></pre></td></tr></table></figure>
</li>
</ol>
<p>匹配过程如下:<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">kmp</span><span class="params">(src, pattern)</span></span>:</span><br><span class="line"> <span class="tag">i</span> = <span class="number">0</span></span><br><span class="line"> j = <span class="number">0</span></span><br><span class="line"> match_index = []</span><br><span class="line"> next = <span class="function"><span class="title">cal_next</span><span class="params">(pattern)</span></span></span><br><span class="line"> while <span class="tag">i</span> < <span class="function"><span class="title">len</span><span class="params">(src)</span></span>:</span><br><span class="line"> <span class="keyword">if</span> src[i] == pattern[j]:</span><br><span class="line"> <span class="tag">i</span> += <span class="number">1</span></span><br><span class="line"> j += <span class="number">1</span></span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> <span class="keyword">if</span> j == <span class="number">0</span>:</span><br><span class="line"> <span class="tag">i</span> += <span class="number">1</span></span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> j = next[j-<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">if</span> j == <span class="function"><span class="title">len</span><span class="params">(pattern)</span></span>:</span><br><span class="line"> match_index.<span class="function"><span class="title">append</span><span class="params">(i - len(pattern)</span></span>)</span><br><span class="line"> j = next[j-<span class="number">1</span>]</span><br><span class="line"> return match_index</span><br></pre></td></tr></table></figure></p>
<h2 id="二、求全排列">二、求全排列</h2><p>已知字符s1s2s3…sn,求它们的全排列。<br><figure class="highlight stata"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre></td><td class="code"><pre><span class="line">def next_permutation(permutation):</span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> <span class="keyword">range</span>(len(permutation) - 1, 0, -1):</span><br><span class="line"> <span class="keyword">if</span> permutation[i] > permutation[i-1]:</span><br><span class="line"> min_index = <span class="literal">i</span></span><br><span class="line"> <span class="keyword">for</span> j <span class="keyword">in</span> <span class="keyword">range</span>(i, len(permutation)):</span><br><span class="line"> <span class="keyword">if</span> permutation[i-1] < permutation[j] < permutation[min_index]:</span><br><span class="line"> min_index = <span class="literal">j</span></span><br><span class="line"> permutation[i-1], permutation[min_index] = permutation[min_index], permutation[i-1]</span><br><span class="line"> <span class="keyword">m</span> = <span class="literal">i</span></span><br><span class="line"> <span class="keyword">n</span> = len(permutation) - 1</span><br><span class="line"> <span class="keyword">while</span> <span class="keyword">m</span> < <span class="keyword">n</span>:</span><br><span class="line"> permutation[<span class="keyword">m</span>], permutation[<span class="keyword">n</span>] = permutation[<span class="keyword">n</span>], permutation[<span class="keyword">m</span>]</span><br><span class="line"> <span class="keyword">m</span> += 1</span><br><span class="line"> <span class="keyword">n</span> -= 1</span><br><span class="line"> <span class="keyword">return</span> permutation</span><br><span class="line"> <span class="keyword">return</span> []</span><br></pre></td></tr></table></figure></p>
<h2 id="三、最长回文算法">三、最长回文算法</h2>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/算法/">算法</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/12/05/字符串相关的若干算法/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/12/05/字符串相关的若干算法/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/12/01/优先级矩阵求表达式的值/" title="优先级矩阵求算数表达式" itemprop="url">优先级矩阵求算数表达式</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-12-01T04:21:35.000Z" itemprop="datePublished"> 发表于 2015-12-01</time>
</p>
</header>
<div class="article-content">
<p>算数表达式求值是现代编译器要解决的最基本的问题。所谓表达式,就是包含常数、变量以及数学运算符、逻辑运算符、括号等的字符串。本次为了练习,只考虑包括加减乘除、正负号、括号的表达式。</p>
<hr>
<h2 id="1-_优先级">1. 优先级</h2><p>我们让右括号的优先级最低,左括号最高,其次正负号,其次乘除,最后加减,同一优先级左结合性时左面的高,右结核性右面高,加减乘除均是左结合性,正负号右结合性,左括号右结合性,右括号左结合性。综上可得优先级矩阵:</p>
<figure class="highlight cmake"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br></pre></td><td class="code"><pre><span class="line"></span><br><span class="line"><span class="comment"># row represent left operator</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># col represent right operator</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># PRIORITY_MATRIX[i][j] is True when i has high priority, otherwise, i has low priority</span></span><br><span class="line"></span><br><span class="line">PRIORITY_MATRIX = (</span><br><span class="line"></span><br><span class="line"><span class="comment"># +,-,*,/,+(sign),-(sign),(,)</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#+</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#-</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#*</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#/</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#+(sign)</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#-(sign)</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">False</span>,<span class="keyword">True</span>),<span class="comment">#(</span></span><br><span class="line"></span><br><span class="line">(<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>,<span class="keyword">True</span>)<span class="comment">#)</span></span><br><span class="line"></span><br><span class="line">)</span><br><span class="line"></span><br><span class="line"><span class="comment">#符号编码</span></span><br><span class="line">PLUS_OPERATOR = <span class="number">0</span></span><br><span class="line">MINUS_OPERATOR = <span class="number">1</span></span><br><span class="line">MULTIPLY_OPERATOR = <span class="number">2</span></span><br><span class="line">DIVIDE_OPERATOR = <span class="number">3</span></span><br><span class="line">POSITIVE_SIGN_OPERATOR = <span class="number">4</span></span><br><span class="line">NEGATIVE_SIGN_OPERATOR = <span class="number">5</span></span><br><span class="line">LEFT_BRACKET_OPERATOR = <span class="number">6</span></span><br><span class="line">RIGHT_BRACKET_OPERATOR = <span class="number">7</span></span><br></pre></td></tr></table></figure>
<p>PRIORITY_MATRIX[op1][op2]的含义是:操作符op1在左,op2在右,op1的优先级是否高于op2.</p>
<h2 id="2-_表达式符号解析">2. 表达式符号解析</h2><p>首先要解析表达式,此时需要注意的是正负号的识别,因为加减和正负号分别是同一个字符。那么当面对一个’+’或者’-‘字符的情况下如何判断它到底是哪一种符号呢?<br><strong>如果上一个符号是数字或者右括号,那么就是加减号,否则为正负号.</strong><br>有了上面的规则符号解析就很容易了:<br><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">push_num</span><span class="params">(tokens, current_num)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> current_num:</span><br><span class="line"> <span class="keyword">try</span>:</span><br><span class="line"> tokens.append(Token(<span class="keyword">False</span>, float(<span class="string">''</span>.join(current_num))))</span><br><span class="line"> <span class="keyword">del</span> current_num[:]</span><br><span class="line"> <span class="keyword">except</span> Exception:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'parse num failed'</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">judge_operator</span><span class="params">(tokens, ch)</span>:</span></span><br><span class="line"> last_token = tokens[-<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">if</span> ch == <span class="string">'+'</span>:</span><br><span class="line"> <span class="keyword">if</span> last_token.is_operator <span class="keyword">and</span> last_token.value != RIGHT_BRACKET_OPERATOR:</span><br><span class="line"> op = POSITIVE_SIGN_OPERATOR</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> op = PLUS_OPERATOR</span><br><span class="line"> <span class="keyword">elif</span> ch == <span class="string">'-'</span>:</span><br><span class="line"> <span class="keyword">if</span> last_token.is_operator <span class="keyword">and</span> last_token.value != RIGHT_BRACKET_OPERATOR:</span><br><span class="line"> op = NEGATIVE_SIGN_OPERATOR</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> op = MINUS_OPERATOR</span><br><span class="line"> <span class="keyword">elif</span> ch == <span class="string">'*'</span>:</span><br><span class="line"> op = MULTIPLY_OPERATOR</span><br><span class="line"> <span class="keyword">elif</span> ch == <span class="string">'/'</span>:</span><br><span class="line"> op = DIVIDE_OPERATOR</span><br><span class="line"> <span class="keyword">elif</span> ch == <span class="string">'('</span>:</span><br><span class="line"> op = LEFT_BRACKET_OPERATOR</span><br><span class="line"> <span class="keyword">elif</span> ch == <span class="string">')'</span>:</span><br><span class="line"> op = RIGHT_BRACKET_OPERATOR</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'unknown operator '</span> + ch)</span><br><span class="line"> <span class="keyword">return</span> op</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">parse_tokens</span><span class="params">(expression)</span>:</span></span><br><span class="line"> tokens = [Token(<span class="keyword">True</span>, LEFT_BRACKET_OPERATOR)]</span><br><span class="line"> current_num = []</span><br><span class="line"> <span class="keyword">for</span> ch <span class="keyword">in</span> expression + <span class="string">')'</span>:</span><br><span class="line"> <span class="keyword">if</span> <span class="string">'0'</span> <= ch <= <span class="string">'9'</span> <span class="keyword">or</span> ch == <span class="string">'.'</span>:</span><br><span class="line"> current_num.append(ch)</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> push_num(tokens, current_num)</span><br><span class="line"> <span class="keyword">if</span> ch == <span class="string">' '</span>:</span><br><span class="line"> <span class="keyword">continue</span></span><br><span class="line"> op = judge_operator(tokens, ch)</span><br><span class="line"> tokens.append(Token(<span class="keyword">True</span>, op))</span><br><span class="line"> <span class="keyword">return</span> tokens</span><br></pre></td></tr></table></figure></p>
<p>在解析符号的,为了后续计算不失一般性,在表达式首尾分别加上左右括号。</p>
<h2 id="3-_计算值">3. 计算值</h2><p>这里使用两个栈来分别存放操作数(operand_stack)和操作符(operator_stack),计算过程如下。<br>从前往后遍历符号表:<br> 1)如果是数字,入操作数栈<br> 2)如果是操作符,设为op,重复以下过程:<br> a)取操作符栈顶操作符top.<br> b)如果top优先级小于op,则将op压入操作符栈,结束循环。<br> c)如果top优先级大于op,弹出top,从操作数栈弹出相应数目的操作数,计算结果并压入操作数栈。如果top是左括号,结束循环,否则继续循环。<br><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br></pre></td><td class="code"><pre><span class="line">OP_MAP = {</span><br><span class="line"> PLUS_OPERATOR: <span class="keyword">lambda</span> x, y: x + y,</span><br><span class="line"> MINUS_OPERATOR: <span class="keyword">lambda</span> x, y: x - y,</span><br><span class="line"> MULTIPLY_OPERATOR: <span class="keyword">lambda</span> x, y: x * y,</span><br><span class="line"> DIVIDE_OPERATOR: <span class="keyword">lambda</span> x, y: x / y,</span><br><span class="line"> POSITIVE_SIGN_OPERATOR: <span class="keyword">lambda</span> x: x,</span><br><span class="line"> NEGATIVE_SIGN_OPERATOR: <span class="keyword">lambda</span> x: -x,</span><br><span class="line"> LEFT_BRACKET_OPERATOR: <span class="keyword">lambda</span> x: x</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">do_binary_operator_calculate</span><span class="params">(operand_stack, op)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> len(operand_stack) < <span class="number">2</span>:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'not enough operand for operator %d'</span> % op)</span><br><span class="line"> right = operand_stack.pop()</span><br><span class="line"> left = operand_stack.pop()</span><br><span class="line"> <span class="keyword">if</span> op == DIVIDE_OPERATOR <span class="keyword">and</span> right == <span class="number">0</span>:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'divisor can not be zero'</span>)</span><br><span class="line"> operand_stack.append(OP_MAP[op](left, right))</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">do_unary_operator_calculate</span><span class="params">(operand_stack, op)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> len(operand_stack) < <span class="number">1</span>:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'not enough operand for operator %d'</span> % op)</span><br><span class="line"> operand_stack.append(OP_MAP[op](operand_stack.pop()))</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">on_op</span><span class="params">(operator_stack, operand_stack, op)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> <span class="keyword">not</span> operator_stack:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'operator should be empty'</span>)</span><br><span class="line"> <span class="keyword">if</span> PRIORITY_MATRIX[operator_stack[-<span class="number">1</span>]][op]:</span><br><span class="line"> last_op = operator_stack.pop()</span><br><span class="line"> <span class="keyword">if</span> last_op <span class="keyword">in</span> (PLUS_OPERATOR, MINUS_OPERATOR, MULTIPLY_OPERATOR, DIVIDE_OPERATOR):</span><br><span class="line"> do_binary_operator_calculate(operand_stack, last_op)</span><br><span class="line"> on_op(operator_stack, operand_stack, op)</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> do_unary_operator_calculate(operand_stack, last_op)</span><br><span class="line"> <span class="keyword">if</span> last_op != LEFT_BRACKET_OPERATOR:</span><br><span class="line"> on_op(operator_stack, operand_stack, op)</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> operator_stack.append(op)</span><br><span class="line"> </span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">calculate</span><span class="params">(expression)</span>:</span></span><br><span class="line"> tokens = parse_tokens(expression)</span><br><span class="line"> operator_stack = [tokens[<span class="number">0</span>].value]</span><br><span class="line"> operand_stack = []</span><br><span class="line"> <span class="keyword">for</span> token <span class="keyword">in</span> tokens[<span class="number">1</span>:]:</span><br><span class="line"> <span class="keyword">if</span> token.is_operator:</span><br><span class="line"> on_op(operator_stack, operand_stack, token.value)</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> operand_stack.append(token.value)</span><br><span class="line"> <span class="keyword">if</span> len(operand_stack) != <span class="number">1</span>:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'too many operand left'</span>)</span><br><span class="line"> <span class="keyword">if</span> operator_stack:</span><br><span class="line"> <span class="keyword">raise</span> Exception(<span class="string">'operator left'</span>)</span><br><span class="line"> <span class="keyword">return</span> operand_stack.pop()</span><br></pre></td></tr></table></figure></p>
<p>到这已经完成了算数表达式的计算。需要注意的是异常情况的处理。下面再加上主函数,一个简易计算器就实现了:<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line"> while True:</span><br><span class="line"> text = <span class="function"><span class="title">raw_input</span><span class="params">(<span class="string">'>>>'</span>)</span></span></span><br><span class="line"> <span class="function"><span class="title">print</span><span class="params">(calculate(text)</span></span>)</span><br></pre></td></tr></table></figure></p>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/算法/">算法</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/12/01/优先级矩阵求表达式的值/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/12/01/优先级矩阵求表达式的值/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/11/11/利用netcat快速搭建http服务/" title="利用netcat快速搭建http服务" itemprop="url">利用netcat快速搭建http服务</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-11-11T08:18:13.000Z" itemprop="datePublished"> 发表于 2015-11-11</time>
</p>
</header>
<div class="article-content">
<h2 id="netcat">netcat</h2><p>netcat是linux系统自带的一个网络工具,通过,netcat的应用非常多,比如做代理,端口扫描,聊天服务器等,被誉为linux网络工具中的瑞士军刀,关于各种应用场景的介绍,可以参考下面参考1的链接。本文要分享的是netcat在工作中的一次应用:提供http服务。</p>
<h2 id="需求场景">需求场景</h2><p>场景是这样的,我在和其他部门联调测试,本质上就是对方把数据推过来了,我手动触发处理,然后再把结果推过去。每次联调都要沟通,相当麻烦,觉得应该想办法提高效率。其实联调中我的作用就下面三点:</p>
<ol>
<li>清空历史数据</li>
<li>查看数据是否接受到</li>
<li>调用数据处理程序,并把处理结果同步过去。</li>
</ol>
<p>其实完全可以让对方登陆到我的测试机器,但是总觉得这样不妥,万一我那高雅的代码被人看到了呢~~。所以我希望别人能够在我的机器执行以上三个操作,在不登陆我的机器前提下。想起之前用nc做过代理服务,就试了下提供个http服务:</p>
<h2 id="处理逻辑">处理逻辑</h2><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">#step 1 读取请求信息</span></span><br><span class="line"><span class="built_in">read</span> request</span><br><span class="line"></span><br><span class="line"><span class="keyword">while</span> /bin/<span class="literal">true</span>; <span class="keyword">do</span></span><br><span class="line"> <span class="built_in">read</span> header</span><br><span class="line"> [ <span class="string">"<span class="variable">$header</span>"</span> == $<span class="string">'\r'</span> ] && <span class="built_in">break</span>;</span><br><span class="line"><span class="keyword">done</span></span><br><span class="line"></span><br><span class="line"><span class="comment">#step 2 提取url</span></span><br><span class="line">url=<span class="string">"<span class="variable">${request#GET }</span>"</span></span><br><span class="line">url=<span class="string">"<span class="variable">${url% HTTP/*}</span>"</span></span><br><span class="line"></span><br><span class="line"><span class="comment">#step 3 解析命令,返回结果</span></span><br><span class="line"><span class="built_in">echo</span> <span class="operator">-e</span> <span class="string">"HTTP/1.1 200 OK\r"</span></span><br><span class="line"><span class="built_in">echo</span> <span class="operator">-e</span> <span class="string">"Content-Type: text/plain; charset=utf-8\r"</span></span><br><span class="line"><span class="built_in">echo</span> <span class="operator">-e</span> <span class="string">"\r"</span></span><br><span class="line"><span class="keyword">case</span> <span class="variable">$url</span> <span class="keyword">in</span></span><br><span class="line"><span class="comment">#接口1:清空数据</span></span><br><span class="line"> <span class="string">'/clear'</span>)</span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'开始清除历史数据...'</span></span><br><span class="line"> rm ./out/*.txt</span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'数据已清除!OK'</span></span><br><span class="line"> ;;</span><br><span class="line"><span class="comment">#接口3:调用数据处理程序,并把处理结果同步过去</span></span><br><span class="line"> <span class="string">'/exec'</span>)</span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'执行排名统计并同步数据..'</span></span><br><span class="line"> ./batch.sh</span><br><span class="line"> <span class="keyword">if</span> [[ $? <span class="operator">-eq</span> <span class="number">0</span> ]];<span class="keyword">then</span></span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'执行成功!OK'</span></span><br><span class="line"> <span class="keyword">else</span></span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'执行失败!error'</span></span><br><span class="line"> <span class="keyword">fi</span></span><br><span class="line"> ;;</span><br><span class="line"><span class="comment">#接口2:查看数据</span></span><br><span class="line"> <span class="string">'/watch'</span>)</span><br><span class="line"> i=<span class="number">0</span></span><br><span class="line"> <span class="keyword">while</span> <span class="built_in">read</span> line</span><br><span class="line"> <span class="keyword">do</span></span><br><span class="line"> <span class="keyword">if</span> [[ <span class="variable">$line</span> == <span class="number">4</span>* ]]; <span class="keyword">then</span></span><br><span class="line"> i=$((<span class="variable">$i</span>+<span class="number">1</span>))</span><br><span class="line"> <span class="built_in">echo</span> <span class="variable">$line</span></span><br><span class="line"> <span class="keyword">fi</span></span><br><span class="line"> <span class="keyword">done</span> < ./out/json_result.txt</span><br><span class="line"> <span class="built_in">echo</span> <span class="string">"共<span class="variable">$i</span>行"</span></span><br><span class="line"> ;;</span><br><span class="line"> *)</span><br><span class="line"> <span class="built_in">echo</span> <span class="string">'非法操作'</span></span><br><span class="line"> ;;</span><br><span class="line"><span class="keyword">esac</span></span><br></pre></td></tr></table></figure>
<h2 id="服务驱动">服务驱动</h2><p>nc监听端口,将接受到的数据丢给处理逻辑,处理逻辑处理完再交给nc,此处用双向管道实现循环管道<br><figure class="highlight bash"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">DPIPE=<span class="string">"/tmp/static_server"</span></span><br><span class="line"><span class="built_in">echo</span> <span class="string">"删除双向管道<span class="variable">${DPIPE}</span>(如果存在的话)..."</span></span><br><span class="line">rm <span class="operator">-f</span> <span class="variable">$DPIPE</span></span><br><span class="line"><span class="built_in">echo</span> <span class="string">"创建双向管道【<span class="variable">${DPIPE}</span>】..."</span></span><br><span class="line">mkfifo <span class="variable">$DPIPE</span></span><br><span class="line"><span class="built_in">echo</span> <span class="string">"服务开始..."</span></span><br><span class="line"><span class="keyword">while</span> /bin/<span class="literal">true</span>; <span class="keyword">do</span></span><br><span class="line"> cat <span class="variable">$DPIPE</span> | ./static_server.sh | nc <span class="operator">-l</span> <span class="number">127.0</span>.<span class="number">0.1</span> <span class="number">8080</span> > <span class="variable">$DPIPE</span></span><br><span class="line"><span class="keyword">done</span></span><br></pre></td></tr></table></figure></p>
<p>非常的轻量级,开发这个接口加测试不到一个小时,python3的http.server也能很快实现这样的功能,有机会也试下。</p>
<p><a href="http://www.oschina.net/translate/linux-netcat-command" target="_blank" rel="external">netcat应用场景介绍</a></p>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/linux/">linux</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/11/11/利用netcat快速搭建http服务/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/11/11/利用netcat快速搭建http服务/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/11/05/k近邻法-统计学习方法笔记(二)/" title="k近邻法-统计学习方法笔记(二)" itemprop="url">k近邻法-统计学习方法笔记(二)</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-11-05T10:34:36.000Z" itemprop="datePublished"> 发表于 2015-11-05</time>
</p>
</header>
<div class="article-content">
<h2 id="算法介绍">算法介绍</h2><p>k近邻法(KNN)是一种常用的机器学习方法,可用于分类,也可用于回归。对于分类,简单来讲就是对于待分类点找出与之最邻近的k个,根据这k个的类比以一定规则决定这个点的类别,一般用的是选举的策略。回归的话基本一致,只是最后取值规则修改下,可以采用平均值或者加权平均值。<br>通过上面的简介,可以看出来k近邻法有以下三个基本要素:</p>
<ol>
<li>k值的选择。当k较大时,可以假设当k为样本容量N时,就是求均值,模型过于简单了。如果k太小,极端情况下k取1,那么很容易被噪声干扰,本质上是k太小,导致模型过于复杂,出现了过拟合。</li>
<li>距离的度量。常用的欧式距离,还有曼哈顿距离,更为普遍的是Minkowski距离。</li>
<li>分类决策规则。分类一般是少数服从多数,回归的话取均值或者按距离加权均值,距离越近权值越大。</li>
</ol>
<h2 id="kd树">kd树</h2><p>对于给定样本点,当要查找预测点的k近邻时,可以直接遍历样本,选出最近的k个即可。但是这样每次都要遍历样本,当样本规模较大时,效率太低了。于是提出了KD树(k dimension tree)算法,kd数可以看作是多维度下的二叉排序树。对于一维的情况,就是平时所见的二叉排序树。当多维时,每次选取其中一维进行分割。至于选那一维,可以选方差最大的维度,方差越大说明数据分布越散,越容易分割。kd树生成算法描述如下:<br><strong>算法:构建k-d树(createKDTree)</strong><br><strong>输入:数据点集Data-set和其所在的空间Range</strong><br><strong>输出:Kd,类型为k-d tree</strong></p>
<ol>
<li>If Data-set为空,则返回空的k-d tree</li>
<li>调用节点生成程序:<br> (1)确定split域:对于所有描述子数据(特征矢量),统计它们在每个维上的数据方差。以SURF特征为例,描述子为64维,可计算64个方差。挑选出最大值,对应的维就是split域的值。数据方差大表明沿该坐标轴方向上的数据分散得比较开,在这个方向上进行数据分割有较好的分辨率;<br> (2)确定Node-data域:数据点集Data-set按其第split域的值排序。位于正中间的那个数据点被选为Node-data。此时新的Data-set’ = Data-set\Node-data(除去其中Node-data这一点)。|</li>
<li>dataleft = {d属于Data-set’ && d[split] ≤ Node-data[split]}<br>Left_Range = {Range && dataleft}<br>dataright = {d属于Data-set’ && d[split] > Node-data[split]}<br>Right_Range = {Range && dataright}|</li>
<li>left = 由(dataleft,Left<em>Range)建立的k-d tree,即递归调用createKDTree(dataleft,Left</em><br>Range)。并设置left的parent域为Kd;<br>right = 由(dataright,Right<em>Range)建立的k-d tree,即调用createKDTree(dataleft,Left</em><br>Range)。并设置right的parent域为Kd。</li>
</ol>
<h3 id="kd树结构">kd树结构</h3><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Node</span>:</span></span><br><span class="line"> <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, point, split)</span>:</span></span><br><span class="line"> self.left = <span class="keyword">None</span> <span class="comment">#左子树</span></span><br><span class="line"> self.right = <span class="keyword">None</span> <span class="comment">#右子树</span></span><br><span class="line"> self.point = point <span class="comment">#分割点</span></span><br><span class="line"> self.split = split <span class="comment">#分割维度</span></span><br></pre></td></tr></table></figure>
<h3 id="kd树构造">kd树构造</h3><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">#中位数</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">median</span><span class="params">(lst)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> len(lst) % <span class="number">2</span>:</span><br><span class="line"> <span class="keyword">return</span> np.median(lst)</span><br><span class="line"> <span class="keyword">return</span> np.median(lst[<span class="number">1</span>:])</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="comment">#欧氏距离</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">os_distance</span><span class="params">(v1, v2)</span>:</span></span><br><span class="line"> <span class="keyword">return</span> np.linalg.norm(np.array(v1) - np.array(v2))</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">build_kdtree</span><span class="params">(points, d)</span>:</span></span><br><span class="line"> <span class="keyword">if</span> len(points) == <span class="number">0</span>:</span><br><span class="line"> <span class="keyword">return</span> <span class="keyword">None</span></span><br><span class="line"></span><br><span class="line"> mid = median(np.array(points[:, d]))</span><br><span class="line"> left_points = np.array([m <span class="keyword">for</span> m <span class="keyword">in</span> points <span class="keyword">if</span> m[d] < mid])</span><br><span class="line"> right_points = np.array([m <span class="keyword">for</span> m <span class="keyword">in</span> points <span class="keyword">if</span> m[d] > mid])</span><br><span class="line"> points = np.array([m <span class="keyword">for</span> m <span class="keyword">in</span> points <span class="keyword">if</span> m[d] == mid])</span><br><span class="line"></span><br><span class="line"> n = Node(points[<span class="number">0</span>], d)</span><br><span class="line"> n.left = build_kdtree(np.concatenate((left_points.reshape((len(left_points), len(points[<span class="number">0</span>]))), points[<span class="number">1</span>:])),</span><br><span class="line"> (d+<span class="number">1</span>) % len(points[<span class="number">0</span>]))</span><br><span class="line"> n.right = build_kdtree(right_points, (d+<span class="number">1</span>) % len(points[<span class="number">0</span>]))</span><br><span class="line"> <span class="keyword">return</span> n</span><br></pre></td></tr></table></figure>
<h3 id="kd树查找(最近邻)">kd树查找(最近邻)</h3><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">search_nearest</span><span class="params">(kd_tree, target)</span></span>:</span><br><span class="line"> <span class="keyword">if</span> not kd_tree:</span><br><span class="line"> return None</span><br><span class="line"> search_path = []</span><br><span class="line"> nearest = kd_tree<span class="class">.point</span></span><br><span class="line"></span><br><span class="line"> #构造查找路劲</span><br><span class="line"> while kd_tree:</span><br><span class="line"> search_path.<span class="function"><span class="title">append</span><span class="params">(kd_tree)</span></span></span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">os_distance</span><span class="params">(target, nearest)</span></span> > <span class="function"><span class="title">os_distance</span><span class="params">(target, kd_tree.point)</span></span>:</span><br><span class="line"> nearest = kd_tree<span class="class">.point</span></span><br><span class="line"> <span class="keyword">if</span> target[kd_tree.split] < kd_tree<span class="class">.point</span>[kd_tree.split]:</span><br><span class="line"> kd_tree = kd_tree<span class="class">.left</span></span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> kd_tree = kd_tree<span class="class">.right</span></span><br><span class="line"> #回溯查找</span><br><span class="line"> while search_path:</span><br><span class="line"> tree = search_path.<span class="function"><span class="title">pop</span><span class="params">()</span></span></span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">abs</span><span class="params">(target[tree.split] - tree.point[tree.split])</span></span> < <span class="function"><span class="title">os_distance</span><span class="params">(target, nearest)</span></span>:</span><br><span class="line"> <span class="keyword">if</span> target[tree.split] < tree<span class="class">.point</span>[tree.split]:</span><br><span class="line"> to_add = tree<span class="class">.right</span></span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> to_add = tree<span class="class">.left</span></span><br><span class="line"> <span class="keyword">if</span> to_add:</span><br><span class="line"> search_path.<span class="function"><span class="title">append</span><span class="params">(to_add)</span></span></span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">os_distance</span><span class="params">(target, nearest)</span></span> > <span class="function"><span class="title">os_distance</span><span class="params">(target, tree.point)</span></span>:</span><br><span class="line"> nearest = tree<span class="class">.point</span></span><br><span class="line"> return nearest</span><br></pre></td></tr></table></figure>
<h2 id="总结">总结</h2><p>k近邻法要掌握它的三要素,算法部分是kd树,kd树设计到很多知识,包括kd树的增删改查,kd树的改进。k近邻法在分类领域是一个基本的常用算法,上次做kaggle的手写字体识别,knn准确率最高,达到了0.97左右。</p>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/机器学习/">机器学习</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/11/05/k近邻法-统计学习方法笔记(二)/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/11/05/k近邻法-统计学习方法笔记(二)/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/10/30/几道笔试题/" title="几道笔试题" itemprop="url">几道笔试题</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-10-30T13:00:49.000Z" itemprop="datePublished"> 发表于 2015-10-30</time>
</p>
</header>
<div class="article-content">
<p>最近开始用python了,正好公司过段时间研发人员考试,整几个校招笔试题练手吧。</p>
<h1 id="第一题">第一题</h1><blockquote>
<p>如何在排序数组中,找出给定数字出现次数? 比如:{0,1,2,3,3,3,3,3,3,3,3,4,5,6,7,13,19}</p>
</blockquote>
<p>解法非常多,但是要考虑最坏情况下也要有较好的性能,抓住排好序的属性。下面使用的是稍加修改的二分查找,两次二分分别找到上下界,直接相减即可。<br><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">binFindUp</span><span class="params">(arr, key)</span>:</span></span><br><span class="line"> low = <span class="number">0</span></span><br><span class="line"> high = len(arr) -<span class="number">1</span></span><br><span class="line"> <span class="keyword">while</span>(low < high): </span><br><span class="line"> <span class="keyword">print</span> <span class="string">"%d,%d"</span> % (low, high)</span><br><span class="line"> mid = (low + high) / <span class="number">2</span> </span><br><span class="line"> <span class="keyword">if</span> (arr[mid] <= key):</span><br><span class="line"> low = mid</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> high = mid - <span class="number">1</span> </span><br><span class="line"> <span class="keyword">return</span> low </span><br><span class="line"> </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">binFindDown</span><span class="params">(arr, key)</span>:</span></span><br><span class="line"> low = <span class="number">0</span></span><br><span class="line"> high = len(arr) -<span class="number">1</span></span><br><span class="line"> <span class="keyword">while</span>(low < high): </span><br><span class="line"> <span class="keyword">print</span> <span class="string">"%d,%d"</span> % (low, high)</span><br><span class="line"> mid = (low + high) / <span class="number">2</span> </span><br><span class="line"> <span class="keyword">if</span> (arr[mid] >= key):</span><br><span class="line"> high = mid</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> low = mid + <span class="number">1</span> </span><br><span class="line"> <span class="keyword">return</span> high</span><br></pre></td></tr></table></figure></p>
<h1 id="第二题">第二题</h1><blockquote>
<p>如何计算两个有序整形数组的交集,比如:<br>a=0,1,2,3,4<br>b=1,3,5,7,9</p>
</blockquote>
<p>由于求交集,交集一定是集合之间的运算,所以说a,b是集合,即它们本身不含重复元素,参考归并排序的方法即可。<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">interset</span><span class="params">(a, b)</span></span></span><br><span class="line"> <span class="tag">i</span> = <span class="number">0</span></span><br><span class="line"> j = <span class="number">0</span></span><br><span class="line"> c = []</span><br><span class="line"> while <span class="tag">i</span> < <span class="function"><span class="title">len</span><span class="params">(a)</span></span> and <span class="tag">b</span> < <span class="function"><span class="title">len</span><span class="params">(b)</span></span>:</span><br><span class="line"> <span class="keyword">if</span> <span class="tag">a</span>[i] == <span class="tag">b</span>[j]:</span><br><span class="line"> c.<span class="function"><span class="title">add</span><span class="params">(a[i])</span></span></span><br><span class="line"> i++</span><br><span class="line"> j++</span><br><span class="line"> <span class="keyword">else</span> <span class="keyword">if</span> <span class="tag">a</span>[i] > <span class="tag">b</span>[j]:</span><br><span class="line"> i++</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> j++</span><br><span class="line"> return c</span><br></pre></td></tr></table></figure></p>
<p>下面两题是评估字符串相似度的,在工程上都有广泛的应该用。解决思路基本一致,都用的动态规划的思想:</p>
<h1 id="第三题">第三题</h1><blockquote>
<p>求最长公共子串<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">lcs</span><span class="params">(str1, str2)</span></span>:</span><br><span class="line"> dp = [[<span class="number">0</span> <span class="keyword">for</span> col <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(len(str2)</span></span> + <span class="number">1</span>)] <span class="keyword">for</span> row <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(len(str1)</span></span> + <span class="number">1</span>)]</span><br><span class="line"> <span class="keyword">for</span> <span class="tag">i</span> <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(<span class="number">1</span>, len(str1)</span></span> + <span class="number">1</span>):</span><br><span class="line"> <span class="keyword">for</span> j <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(<span class="number">1</span>, len(str2)</span></span> + <span class="number">1</span>):</span><br><span class="line"> dp[i][j] = <span class="function"><span class="title">max</span><span class="params">(dp[i-<span class="number">1</span>][j], dp[i][j-<span class="number">1</span>], dp[i-<span class="number">1</span>][j-<span class="number">1</span>] + (<span class="number">1</span> if str1[i-<span class="number">1</span>] == str2[j-<span class="number">1</span>] else <span class="number">0</span>)</span></span>) </span><br><span class="line"> return dp[<span class="function"><span class="title">len</span><span class="params">(str1)</span></span>][<span class="function"><span class="title">len</span><span class="params">(str2)</span></span>]</span><br></pre></td></tr></table></figure></p>
</blockquote>
<h1 id="第四题">第四题</h1><blockquote>
<p>字符串编辑距离<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">leven</span><span class="params">(str1, str2)</span></span>:</span><br><span class="line"> dp = [[<span class="number">0</span> <span class="keyword">for</span> col <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(len(str2)</span></span> + <span class="number">1</span>)] <span class="keyword">for</span> row <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(len(str1)</span></span> + <span class="number">1</span>)]</span><br><span class="line"> <span class="keyword">for</span> <span class="tag">i</span> <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(<span class="number">0</span>, len(str1)</span></span> + <span class="number">1</span>):</span><br><span class="line"> <span class="keyword">for</span> j <span class="keyword">in</span> <span class="function"><span class="title">range</span><span class="params">(<span class="number">0</span>, len(str2)</span></span> + <span class="number">1</span>):</span><br><span class="line"> <span class="keyword">if</span> <span class="tag">i</span> == <span class="number">0</span> or j == <span class="number">0</span>:</span><br><span class="line"> dp[i][j] = <span class="tag">i</span> + j</span><br><span class="line"> <span class="keyword">else</span>: </span><br><span class="line"> dp[i][j] = <span class="function"><span class="title">min</span><span class="params">(dp[i-<span class="number">1</span>][j] + <span class="number">1</span>, dp[i][j-<span class="number">1</span>] + <span class="number">1</span>, dp[i-<span class="number">1</span>][j-<span class="number">1</span>] + (<span class="number">0</span> if str1[i-<span class="number">1</span>] == str2[j-<span class="number">1</span>] else <span class="number">1</span>)</span></span>) </span><br><span class="line"> return dp[<span class="function"><span class="title">len</span><span class="params">(str1)</span></span>][<span class="function"><span class="title">len</span><span class="params">(str2)</span></span>]</span><br></pre></td></tr></table></figure></p>
</blockquote>
<h1 id="第五题">第五题</h1><blockquote>
<p>求无序数组的第n小值。如果排序再求,那么是nlgn的时间复杂度,下面是一个n的线性复杂度,可用来求最大值,最小值,中位数等。</p>
</blockquote>
<figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">def <span class="function"><span class="title">npos</span><span class="params">(lst, n)</span></span>:</span><br><span class="line"> index = random.<span class="function"><span class="title">randint</span><span class="params">(<span class="number">0</span>, len(lst)</span></span> - <span class="number">1</span>)</span><br><span class="line"> big = [l <span class="keyword">for</span> l <span class="keyword">in</span> lst <span class="keyword">if</span> l > lst[index]]</span><br><span class="line"> little = [l <span class="keyword">for</span> l <span class="keyword">in</span> lst <span class="keyword">if</span> l < lst[index]]</span><br><span class="line"> equal = [l <span class="keyword">for</span> l <span class="keyword">in</span> lst <span class="keyword">if</span> l == lst[index]]</span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">len</span><span class="params">(little)</span></span> > n - <span class="number">1</span>:</span><br><span class="line"> return <span class="function"><span class="title">npos</span><span class="params">(little, n)</span></span></span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">len</span><span class="params">(little)</span></span> <= n - <span class="number">1</span>:</span><br><span class="line"> <span class="keyword">if</span> <span class="function"><span class="title">len</span><span class="params">(little)</span></span> + <span class="function"><span class="title">len</span><span class="params">(equal)</span></span> >= n:</span><br><span class="line"> return lst[index]</span><br><span class="line"> return <span class="function"><span class="title">npos</span><span class="params">(big, n - len(little)</span></span> - <span class="function"><span class="title">len</span><span class="params">(equal)</span></span>)</span><br></pre></td></tr></table></figure>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/笔试题/">笔试题</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/10/30/几道笔试题/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/10/30/几道笔试题/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/10/30/感知机模型-统计学习方法笔记(一)/" title="感知机模型--统计学习方法笔记(一)" itemprop="url">感知机模型--统计学习方法笔记(一)</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-10-30T12:35:23.000Z" itemprop="datePublished"> 发表于 2015-10-30</time>
</p>
</header>
<div class="article-content">
<p>通过kaggle的练习,了解了几个分类算法,但是对于算法的原理还不理解。经过推荐,了解了李航的《统计学习方法》,感觉不错。书内容不多,全书主要讲了几种常见的分类算法。内容整体上比较容易理解。最近正好在学习python,python的numpy包在矩阵计算方面也很不错,matplotlib提供类丰富的绘图功能。可以通过matplotlib绘制gif图,这样可以把算法很生动地表现出来。争取以后每章做一篇读书笔记,笔记主要是代码、gif图像以及个人的心得。<br>本节对应原书第二章。</p>
<h2 id="原始形式">原始形式</h2><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> copy</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> draw</span><br><span class="line"></span><br><span class="line">HISTORY = []</span><br><span class="line">TRAINS = [[(<span class="number">3</span>, <span class="number">3</span>), <span class="number">1</span>], [(<span class="number">4</span>, <span class="number">3</span>), <span class="number">1</span>], [(<span class="number">1</span>, <span class="number">1</span>), -<span class="number">1</span>]]</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">update</span><span class="params">(w, b, item)</span>:</span></span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> range(len(item[<span class="number">0</span>])):</span><br><span class="line"> w[i] += item[<span class="number">1</span>] * item[<span class="number">0</span>][i]</span><br><span class="line"> b += item[<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">return</span> w, b</span><br><span class="line"> </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">cal</span><span class="params">(item)</span>:</span></span><br><span class="line"> res = <span class="number">0</span></span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> range(len(item[<span class="number">0</span>])):</span><br><span class="line"> res += item[<span class="number">0</span>][i] * w[i]</span><br><span class="line"> res += b</span><br><span class="line"> res *= item[<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">return</span> res</span><br><span class="line"> </span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">iterComp</span><span class="params">(w, b, trains)</span>:</span></span><br><span class="line"> <span class="keyword">global</span> HISTORY </span><br><span class="line"> flag = <span class="keyword">False</span></span><br><span class="line"> <span class="keyword">for</span> item <span class="keyword">in</span> trains:</span><br><span class="line"> <span class="keyword">if</span> cal(item) <= <span class="number">0</span>:</span><br><span class="line"> flag = <span class="keyword">True</span></span><br><span class="line"> (w, b) = update(w, b, item)</span><br><span class="line"> HISTORY.append([copy.copy(w), b])</span><br><span class="line"> <span class="keyword">if</span> <span class="keyword">not</span> flag:</span><br><span class="line"> <span class="keyword">print</span> <span class="string">"RESULT:w:"</span> + str(w) + <span class="string">"b:"</span> + str(b)</span><br><span class="line"> <span class="keyword">return</span> flag, w, b</span><br><span class="line"> </span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">"__main__"</span>:</span><br><span class="line"> w = [<span class="number">0</span>, <span class="number">0</span>]</span><br><span class="line"> b = <span class="number">0</span> </span><br><span class="line"> <span class="keyword">while</span> <span class="keyword">True</span>:</span><br><span class="line"> (flag, w, b) = iterComp(w, b, TRAINS)</span><br><span class="line"> <span class="keyword">if</span> <span class="keyword">not</span> flag:</span><br><span class="line"> <span class="keyword">break</span></span><br><span class="line"> draw.show(TRAINS, HISTORY, <span class="keyword">True</span>, <span class="string">'result/perceptron.gif'</span>)</span><br></pre></td></tr></table></figure>
<p>得到如下结果:<br><img src="/images/perceptron.gif" alt=""></p>
<h2 id="对偶形式">对偶形式</h2><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre></td><td class="code"><pre><span class="line">__author__ = <span class="string">'wendale'</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> draw</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">__cal_gram</span><span class="params">(trains)</span>:</span></span><br><span class="line"> gram = np.empty((len(trains), len(trains)), np.int)</span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> range(len(trains)):</span><br><span class="line"> <span class="keyword">for</span> j <span class="keyword">in</span> range(len(trains)):</span><br><span class="line"> gram[i][j] = np.dot(trains[i][<span class="number">0</span>], trains[j][<span class="number">0</span>])</span><br><span class="line"> <span class="keyword">return</span> gram</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">__check</span><span class="params">(trains, gram, alpha, b)</span>:</span></span><br><span class="line"> y = np.array(trains[:,<span class="number">1</span>])</span><br><span class="line"> <span class="keyword">for</span> i <span class="keyword">in</span> range(len(trains)):</span><br><span class="line"> <span class="keyword">if</span> (np.dot(alpha * y, gram[i]) + b) * y[i] <= <span class="number">0</span>:</span><br><span class="line"> <span class="keyword">return</span> <span class="keyword">False</span>, i</span><br><span class="line"> <span class="keyword">return</span> <span class="keyword">True</span>, -<span class="number">1</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">find</span><span class="params">(trains)</span>:</span></span><br><span class="line"> gram = __cal_gram(trains)</span><br><span class="line"> alpha = np.zeros(len(trains))</span><br><span class="line"> b = <span class="number">0</span></span><br><span class="line"> x = np.array(trains[:,<span class="number">0</span>].tolist()).reshape(<span class="number">3</span>,<span class="number">2</span>)</span><br><span class="line"> y = np.array(trains[:,<span class="number">1</span>])</span><br><span class="line"> history = []</span><br><span class="line"> <span class="keyword">while</span> <span class="keyword">True</span>:</span><br><span class="line"> (flag, i) = __check(trains, gram, alpha, b)</span><br><span class="line"> <span class="keyword">if</span> flag:</span><br><span class="line"> <span class="keyword">break</span></span><br><span class="line"> <span class="comment">#update</span></span><br><span class="line"> alpha[i] += <span class="number">1</span></span><br><span class="line"> b += trains[i][<span class="number">1</span>]</span><br><span class="line"> history.append([np.dot(alpha * y, x).tolist(), b])</span><br><span class="line"> <span class="keyword">return</span> alpha, b, history</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line"> trains = np.array([[[<span class="number">3</span>, <span class="number">3</span>], <span class="number">1</span>], [[<span class="number">4</span>, <span class="number">3</span>], <span class="number">1</span>], [[<span class="number">1</span>, <span class="number">1</span>], -<span class="number">1</span>]])</span><br><span class="line"> (a, b, history) = find(trains)</span><br><span class="line"> draw.show(trains, history, <span class="keyword">True</span>, <span class="string">'result/duality_perceptron.gif'</span>)</span><br></pre></td></tr></table></figure>
<p>得到如下结果:<br><img src="/images/duality_perceptron.gif" alt=""></p>
<h2 id="公共绘图部分代码">公共绘图部分代码</h2><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">from</span> matplotlib <span class="keyword">import</span> pyplot <span class="keyword">as</span> plt</span><br><span class="line"><span class="keyword">from</span> matplotlib <span class="keyword">import</span> animation</span><br><span class="line"><span class="keyword">from</span> functools <span class="keyword">import</span> partial</span><br><span class="line"></span><br><span class="line"><span class="comment"># first set up the figure, the axis, and the plot element we want to animate</span></span><br><span class="line">__FIG = plt.figure()</span><br><span class="line">__AX = plt.axes(xlim=(<span class="number">0</span>, <span class="number">2</span>), ylim=(-<span class="number">2</span>, <span class="number">2</span>))</span><br><span class="line">__LINE, = __AX.plot([], [], <span class="string">'g'</span>, lw = <span class="number">2</span>)</span><br><span class="line">__LABEL = __AX.text([], [], <span class="string">''</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># initialization function: plot the background of each frame</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">__init</span><span class="params">(trains)</span>:</span></span><br><span class="line"> __LINE.set_data([], [])</span><br><span class="line"> x, y, x_, y_ = [], [], [], []</span><br><span class="line"> <span class="keyword">for</span> p <span class="keyword">in</span> trains:</span><br><span class="line"> <span class="keyword">if</span> p[<span class="number">1</span>] > <span class="number">0</span>:</span><br><span class="line"> x.append(p[<span class="number">0</span>][<span class="number">0</span>])</span><br><span class="line"> y.append(p[<span class="number">0</span>][<span class="number">1</span>])</span><br><span class="line"> <span class="keyword">else</span>:</span><br><span class="line"> x_.append(p[<span class="number">0</span>][<span class="number">0</span>])</span><br><span class="line"> y_.append(p[<span class="number">0</span>][<span class="number">1</span>])</span><br><span class="line"></span><br><span class="line"> plt.plot(x, y, <span class="string">'bo'</span>, x_, y_, <span class="string">'rx'</span>)</span><br><span class="line"> plt.axis([-<span class="number">6</span>, <span class="number">6</span>, -<span class="number">6</span>, <span class="number">6</span>])</span><br><span class="line"> plt.grid(<span class="keyword">True</span>)</span><br><span class="line"> plt.xlabel(<span class="string">'x'</span>)</span><br><span class="line"> plt.ylabel(<span class="string">'y'</span>)</span><br><span class="line"> plt.title(<span class="string">'Prceptron Algorithm(wendale.cn)'</span>)</span><br><span class="line"> <span class="keyword">return</span> __LINE, __LABEL</span><br><span class="line"></span><br><span class="line"><span class="comment"># animation function. this is called sequentially</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">__animate</span><span class="params">(history, i)</span>:</span></span><br><span class="line"> <span class="keyword">global</span> __AX, __LINE, __LABEL</span><br><span class="line"></span><br><span class="line"> w = history[i][<span class="number">0</span>]</span><br><span class="line"> b = history[i][<span class="number">1</span>]</span><br><span class="line"> <span class="keyword">if</span> w[<span class="number">1</span>] == <span class="number">0</span>:</span><br><span class="line"> <span class="keyword">return</span> __LINE, __LABEL</span><br><span class="line"> x1 = -<span class="number">7</span></span><br><span class="line"> y1 = -(b + w[<span class="number">0</span>]*x1)/w[<span class="number">1</span>]</span><br><span class="line"> x2 = <span class="number">7</span></span><br><span class="line"> y2 = -(b + w[<span class="number">0</span>]*x2)/w[<span class="number">1</span>]</span><br><span class="line"> __LINE.set_data([x1, x2], [y1, y2])</span><br><span class="line"> x1 = <span class="number">0</span></span><br><span class="line"> y1 = -(b + w[<span class="number">0</span>] * x1) / w[<span class="number">1</span>]</span><br><span class="line"> __LABEL.set_text(history[i])</span><br><span class="line"> __LABEL.set_position([x1, y1])</span><br><span class="line"> <span class="keyword">return</span> __LINE, __LABEL</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">show</span><span class="params">(trains, history, save=False, name=<span class="string">''</span>)</span>:</span></span><br><span class="line"> <span class="comment"># call the animator. blit=true means only re-draw the parts that have changed.</span></span><br><span class="line"> anim = animation.FuncAnimation(__FIG, partial(__animate, history), init_func=partial(__init, trains),</span><br><span class="line"> frames=len(history), interval=<span class="number">1000</span>, repeat=<span class="keyword">True</span>, blit=<span class="keyword">True</span>)</span><br><span class="line"> plt.show()</span><br><span class="line"> <span class="keyword">if</span> save:</span><br><span class="line"> anim.save(name, fps=<span class="number">2</span>, writer=<span class="string">'imagemagick'</span>)</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line"> trains = np.array([[(<span class="number">3</span>, <span class="number">3</span>), <span class="number">1</span>], [(<span class="number">4</span>, <span class="number">3</span>), <span class="number">1</span>], [(<span class="number">1</span>, <span class="number">1</span>), -<span class="number">1</span>]])</span><br><span class="line"> history = [[[<span class="number">1.0</span>, <span class="number">0.0</span>, -<span class="number">0.0</span>], <span class="number">1</span>], [[<span class="number">1.0</span>, <span class="number">0.0</span>, -<span class="number">1.0</span>], <span class="number">0</span>], [[<span class="number">1.0</span>, <span class="number">0.0</span>, -<span class="number">2.0</span>], -<span class="number">1</span>], [[<span class="number">1.0</span>, <span class="number">0.0</span>, -<span class="number">3.0</span>], -<span class="number">2</span>], [[<span class="number">2.0</span>, <span class="number">0.0</span>, -<span class="number">3.0</span>], -<span class="number">1</span>], [[<span class="number">2.0</span>, <span class="number">0.0</span>, -<span class="number">4.0</span>], -<span class="number">2</span>], [[<span class="number">2.0</span>, <span class="number">0.0</span>, -<span class="number">5.0</span>], -<span class="number">3</span>]]</span><br><span class="line"> show(trains, history)</span><br></pre></td></tr></table></figure>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/机器学习/">机器学习</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/10/30/感知机模型-统计学习方法笔记(一)/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/10/30/感知机模型-统计学习方法笔记(一)/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/09/28/kaggle-digit-recognizer之naive-bayes解法/" title="kaggle digit recognizer之naive bayes解法" itemprop="url">kaggle digit recognizer之naive bayes解法</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-09-28T09:58:36.000Z" itemprop="datePublished"> 发表于 2015-09-28</time>
</p>
</header>
<div class="article-content">
<h2 id="naive_bayes">naive bayes</h2><p>naive bayes(朴素贝叶斯)是一种常见的分类算法,原理就是概率论一开始讲的贝叶斯原理。之所以naive,是因为它有个很关键的假设:各个特征之间是无关联的。这是一种理想化的假设,所以如果特征之间关联比较密切,就不适合了。朴素贝叶斯在文本分类应用广泛,如分词,垃圾邮件识别等。至于做手写图片识别,其实是不太合适的,因为图片的像素之间关联还是挺大的,但是作为初步学习该方法,还是尝试了下,结果确实不怎么样,不到0.6的正确率,网上看有用朴素贝叶斯达到0.8的,应该是在哪方面做优化了。关于朴素贝叶斯方法的详细介绍,可以看下面参考1的链接,讲的很详细直观。</p>
<h2 id="实现">实现</h2><p>R的e1071包有naiveBayes方法,但是使用该方法要注意,label一定得是factor类型的,函数文档也说了,只能做离散值的预测,如果是数字它会认为是连续的。所以可以看到代码中对label做了类型转换。关于该函数的参考文档,可以参考2的链接。</p>
<h2 id="代码">代码</h2><figure class="highlight nimrod"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">train <- read.csv(<span class="string">"data/train.csv"</span>, header=<span class="type">TRUE</span>)</span><br><span class="line">test <- read.csv(<span class="string">"data/test.csv"</span>, header=<span class="type">TRUE</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment">#注意要对label做类型转换,使用拉普拉斯平滑处理</span></span><br><span class="line">classifier <- naiveBayes(<span class="keyword">as</span>.factor(label) ~ ., train, laplace = <span class="number">10</span>)</span><br><span class="line"><span class="literal">result</span> <- predict(classifier, test)</span><br><span class="line"></span><br><span class="line"><span class="comment">#这个地方很奇怪,写入文件时result都+1了,所以提前-1,还没找到是什么原因</span></span><br><span class="line"><span class="literal">result</span>.table <- data.frame(<span class="type">ImageId</span>=<span class="number">1</span>:nrow(test), <span class="type">Label</span>=<span class="keyword">as</span>.numeric(<span class="literal">result</span>) - <span class="number">1</span>)</span><br><span class="line">write.csv(<span class="literal">result</span>.table, <span class="string">"result/naive_bayes.csv"</span>, row.names = F)</span><br></pre></td></tr></table></figure>
<h2 id="心得">心得</h2><p>朴素贝叶斯算法算是比较简单了,其实之前用的pca、knn、logistic regression、以及线性回归都是非常直观的算法,可能公式推导有的比较复杂。最近也算是了解了它们,然后使用已有的库做练习。后续再熟悉下svm、神经网络,然后就用R或者其他语言实现一遍,还有它们的mapreduce版本也尝试实现下,然后就是能自己用尽可能简单的语言将这些算法尽可能清晰地描述出来。这么一看,任务还是挺多滴。加油!</p>
<h2 id="参考">参考</h2><ol>
<li><a href="http://www.cnblogs.com/leoo2sk/archive/2010/09/17/naive-bayesian-classifier.html" target="_blank" rel="external">朴素贝叶斯</a></li>
<li><a href="http://www.inside-r.org/packages/cran/e1071/docs/naiveBayes" target="_blank" rel="external">R naiveBayes</a></li>
</ol>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/kaggle/">kaggle</a><a href="/tags/机器学习/">机器学习</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/09/28/kaggle-digit-recognizer之naive-bayes解法/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/09/28/kaggle-digit-recognizer之naive-bayes解法/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/09/24/hive-udf使用/" title="hive udf使用" itemprop="url">hive udf使用</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-09-24T10:21:35.000Z" itemprop="datePublished"> 发表于 2015-09-24</time>
</p>
</header>
<div class="article-content">
<p>hive以udf的方式提供了扩展hive的方法。每个udf只需要继承hive指定的类,通过固定的方法名识别出提供的方法。工作中经常遇到这种场景:计算周累计或者月累计。计算的逻辑是判断今天与昨天是否属于同一个周(月),如果是,结果为昨天结果加上今天的结果,否则结果为今天的结果。用hql如下:<br><figure class="highlight sql"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">CASE WHEN WEEKOFYEAR('${TIME}')= WEEKOFYEAR('${BEFORETIME}') AND F1.LOGIN_YN = 'Y' THEN</span><br><span class="line"> COALESCE(F2.WK_DAY_CNT,0) + 1</span><br><span class="line">WHEN WEEKOFYEAR('${TIME}')= WEEKOFYEAR('${BEFORETIME}') THEN COALESCE(F2.WK_DAY_CNT,0) </span><br><span class="line"> WHEN F1.LOGIN_YN = 'Y' THEN 1 ELSE 0 <span class="operator"><span class="keyword">END</span>)</span></span><br></pre></td></tr></table></figure></p>
<p>现在我想把这段代码用一个udf实现:<br><figure class="highlight scss"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="function">week_acc</span>(<span class="attr_selector">[time]</span>, <span class="attr_selector">[beforeTime]</span>, value, accValue)</span><br></pre></td></tr></table></figure></p>
<hr>
<p>现在开始动手实现:</p>
<ol>
<li><p>新建maven工程,pom文件添加hive的开发工具包以来以及joda依赖:</p>
<figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line"><span class="tag"><<span class="title">dependency</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">groupId</span>></span>org.apache.hive<span class="tag"></<span class="title">groupId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">artifactId</span>></span>hive-pdk<span class="tag"></<span class="title">artifactId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">version</span>></span>0.10.0<span class="tag"></<span class="title">version</span>></span></span><br><span class="line"> <span class="tag"></<span class="title">dependency</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">dependency</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">groupId</span>></span>org.apache.hadoop<span class="tag"></<span class="title">groupId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">artifactId</span>></span>hadoop-core<span class="tag"></<span class="title">artifactId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">version</span>></span>1.2.1<span class="tag"></<span class="title">version</span>></span></span><br><span class="line"> <span class="tag"></<span class="title">dependency</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">dependency</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">groupId</span>></span>joda-time<span class="tag"></<span class="title">groupId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">artifactId</span>></span>joda-time<span class="tag"></<span class="title">artifactId</span>></span></span><br><span class="line"> <span class="tag"><<span class="title">version</span>></span>2.8.2<span class="tag"></<span class="title">version</span>></span></span><br><span class="line"> <span class="tag"></<span class="title">dependency</span>></span></span><br></pre></td></tr></table></figure>
</li>
<li><p>编写自定义的udf类,继承UDF类</p>
<figure class="highlight scala"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line">public <span class="keyword">abstract</span> <span class="class"><span class="keyword">class</span> <span class="title">WeekAccumulator</span> <span class="keyword"><span class="keyword">extends</span></span> <span class="title">UDF</span> {</span></span><br><span class="line"></span><br><span class="line"> public <span class="type">LongWritable</span> evaluate(<span class="type">Text</span> timeStr,</span><br><span class="line"> <span class="type">Text</span> beforeTimeStr,</span><br><span class="line"> <span class="type">LongWritable</span> todayValue,</span><br><span class="line"> <span class="type">LongWritable</span> beforeAccumulator) {</span><br><span class="line"> <span class="keyword">return</span> inSamePeriod(timeStr, beforeTimeStr) ?</span><br><span class="line"> <span class="keyword">new</span> <span class="type">LongWritable</span>(todayValue.get() + beforeAccumulator.get()) : todayValue;</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> public <span class="type">LongWritable</span> evaluate(<span class="type">LongWritable</span> todayValue, <span class="type">LongWritable</span> beforeAccumulator) {</span><br><span class="line"> <span class="type">DateTime</span> dateTime = <span class="keyword">new</span> <span class="type">DateTime</span>();</span><br><span class="line"> <span class="type">Text</span> timeStr = <span class="keyword">new</span> <span class="type">Text</span>(dateTime.toString(<span class="string">"yyyy-MM-dd"</span>));</span><br><span class="line"> <span class="type">Text</span> beforeTimeStr = <span class="keyword">new</span> <span class="type">Text</span>(dateTime.minusDays(<span class="number">1</span>).toString(<span class="string">"yyyy-MM-dd"</span>));</span><br><span class="line"> <span class="keyword">return</span> evaluate(timeStr, beforeTimeStr, todayValue, beforeAccumulator);</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> <span class="keyword">private</span> boolean inSamePeriod(<span class="type">Text</span> timeStr, <span class="type">Text</span> beforeTimeStr) {</span><br><span class="line"> <span class="type">DateTime</span> thisDate = <span class="keyword">new</span> <span class="type">DateTime</span>(timeStr.toString());</span><br><span class="line"> <span class="type">DateTime</span> beforeDate = <span class="keyword">new</span> <span class="type">DateTime</span>(beforeTimeStr.toString());</span><br><span class="line"> <span class="keyword">return</span> thisDate.getWeekOfWeekyear() == beforeDate.getWeekOfWeekyear();</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
</li>
</ol>
<p>3.打包(注意要把依赖的jar包也打进jar包里,否则运行时会找不到依赖的jar包。)</p>
<ol>
<li>将jar包copy到hdfs中</li>
<li>编辑hive conf文件夹下的.hiverc文件(没有手动创建),文件内容:<figure class="highlight sql"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">add jar hdfs://{MASTER}/{udf path}</span><br><span class="line"><span class="operator"><span class="keyword">create</span> <span class="keyword">temporary</span> <span class="keyword">function</span> week_acc <span class="keyword">as</span> <span class="string">'{class}'</span>;</span></span><br></pre></td></tr></table></figure>
</li>
</ol>
<hr>
<p><strong>打开hive cli,输入:</strong><br><code>select week_acc('2015-09-14', '2015-09-13', 1, 10)</code><br><strong>执行成功,那么udf就完成了。</strong></p>
<h2 id="注意事项">注意事项</h2><p>如果hive设置hive.exec.mode.local.auto(默认false)为true的话,上面命令会执行失败,提示jar文件找不到。只能把这个配置改成false了。</p>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/hive/">hive</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/09/24/hive-udf使用/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/09/24/hive-udf使用/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<article class="post-expand post" itemprop="articleBody">
<header class="article-info clearfix">
<h1 itemprop="name">
<a href="/2015/09/23/kaggle-digit-recognizer之logistic-regression解法/" title="kaggle digit recognizer之logistic regression解法" itemprop="url">kaggle digit recognizer之logistic regression解法</a>
</h1>
<p class="article-author">By
<a href="/about" title="wendale" target="_blank" itemprop="author">wendale</a>
<p class="article-time">
<time datetime="2015-09-23T10:16:02.000Z" itemprop="datePublished"> 发表于 2015-09-23</time>
</p>
</header>
<div class="article-content">
<h2 id="logistic_regression">logistic regression</h2><blockquote>
<p><strong>逻辑回归</strong>是一种常用的分类算法(虽然名称带着回归,其实是分类),在广告行业中运用非常广泛,主要用来判断用户是否点击某个广告,以此实现广告的最佳投放效果。逻辑回归一般用来解决二分类的问题,即某件事情只有两种可能,通过训练数据得到预测数据发生某种情况的可能性,一般如果大于0.5我们认为该事件会发生,小于0.5则该事件的互斥事件会发生。</p>
</blockquote>
<h2 id="本题思路">本题思路</h2><p>由于数字有10种可能,所以这是一种多分类的场景。逻辑回归在这种场景下的一般做法是:如果有n种分类,则做n次逻辑回归,每次把是该种类的视为一类,把非该种类的视为另一类,这样可以计算出n个可能性,取可能性最大的作为分类。针对本题我也是这么做的,但是效果非常不理想,正确率仅为0.80(tnn的还运行了好几个小时)。但是作为通过这次练习,大致熟悉了逻辑回归。</p>
<h2 id="代码">代码</h2><figure class="highlight stata"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line">train <- <span class="keyword">read</span>.csv(<span class="string">"data/train.csv"</span>, header=TRUE)</span><br><span class="line"><span class="keyword">test</span> <- <span class="keyword">read</span>.csv(<span class="string">"data/test.csv"</span>, header=TRUE)</span><br><span class="line"></span><br><span class="line">labels <- train[,1]</span><br><span class="line">train <- train[,-1]</span><br><span class="line"></span><br><span class="line">#create formula</span><br><span class="line">xnam <- paste0(<span class="string">"pixel"</span>, c(0:783))</span><br><span class="line">formula <- <span class="keyword">as</span>.formula(paste(<span class="string">"lr.labels ~ "</span>, paste(xnam, <span class="keyword">collapse</span> = <span class="string">"+"</span>)))</span><br><span class="line"></span><br><span class="line">pre <- <span class="keyword">list</span>()</span><br><span class="line"><span class="keyword">for</span> (i <span class="keyword">in</span> 0 : 9) {</span><br><span class="line"> lr.labels <- ifelse(labels == i, 1, 0)</span><br><span class="line"> logic.<span class="keyword">fit</span> <- <span class="keyword">glm</span>(formula = formula, data = cbind(lr.labels, train), family=<span class="literal">binomial</span>(link=<span class="string">"logit"</span>))</span><br><span class="line"> p <- <span class="keyword">predict</span>(logic.<span class="keyword">fit</span>, <span class="keyword">test</span>)</span><br><span class="line"> pre <- cbind(pre, <span class="literal">exp</span>(p) / 1 + <span class="literal">exp</span>(p))</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line">result <- max.col(pre) - 1</span><br></pre></td></tr></table></figure>
<h2 id="说明">说明</h2><p>glm是R广义线性模型函数,可以处理很多种回归,通过family参数控制回归类型,主要有:</p>
<ol>
<li>binomal(link=’logit’) ——响应变量服从二项分布,连接函数为logit,即logistic回归</li>
<li>binomal(link=’probit’) ——响应变量服从二项分布,连接函数为probit</li>
<li>poisson(link=’identity’) ——响应变量服从泊松分布,即泊松回归</li>
</ol>
<p>还有是对R的formula不熟悉,最后参考<a href="http://site.douban.com/182577/widget/notes/10567181/note/318916395/" target="_blank" rel="external">这里</a>算是理解了吧。</p>
<h2 id="参考">参考</h2><ol>
<li><a href="http://blog.csdn.net/pakko/article/details/37878837" target="_blank" rel="external">逻辑回归</a></li>
<li><a href="http://www.cnblogs.com/runner-ljt/p/4574275.html" target="_blank" rel="external">R广义线性模型</a></li>
<li><a href="http://site.douban.com/182577/widget/notes/10567181/note/318916395/" target="_blank" rel="external">R中的formula与Formula</a></li>
</ol>
<p class="article-more-link">
</p>
</div>
<footer class="article-footer clearfix">
<div class="article-catetags">
<div class="article-tags">
<span></span> <a href="/tags/kaggle/">kaggle</a><a href="/tags/机器学习/">机器学习</a>
</div>
</div>
<div class="comments-count">
<span></span>
<a href="/2015/09/23/kaggle-digit-recognizer之logistic-regression解法/#comments" class="ds-thread-count comments-count-link" data-thread-key="2015/09/23/kaggle-digit-recognizer之logistic-regression解法/" data-count-type="comments"> </a>
</div>
</footer>
</article>
<nav id="page-nav" class="clearfix">
<span class="page-number current">1</span><a class="page-number" href="/page/2/">2</a><a class="extend next" rel="next" href="/page/2/">Next<span></span></a>
</nav>
</div>
<div class="openaside"><a class="navbutton" href="#" title="显示侧边栏"></a></div>
<div id="asidepart">
<div class="closeaside"><a class="closebutton" href="#" title="隐藏侧边栏"></a></div>
<aside class="clearfix">
<div class="tagslist">
<p class="asidetitle">标签</p>
<ul class="clearfix">
<li><a href="/tags/机器学习/" title="机器学习">机器学习<sup>5</sup></a></li>
<li><a href="/tags/笔试题/" title="笔试题">笔试题<sup>3</sup></a></li>
<li><a href="/tags/kaggle/" title="kaggle">kaggle<sup>3</sup></a></li>
<li><a href="/tags/算法/" title="算法">算法<sup>2</sup></a></li>
<li><a href="/tags/linux/" title="linux">linux<sup>1</sup></a></li>
<li><a href="/tags/hive/" title="hive">hive<sup>1</sup></a></li>
</ul>
</div>
<div class="linkslist">
<p class="asidetitle">友情链接</p>
<ul>
<li>
<a href="https://coderq.com" target="_blank" title="一个面向程序员交流分享的新一代社区">码农圈</a>
</li>
<li>
<a href="http://wuchong.me" target="_blank" title="Jark's Blog">Jark's Blog</a>
</li>
</ul>
</div>
<div class="weiboshow">
<p class="asidetitle">新浪微博</p>
<iframe width="100%" height="550" class="share_self" frameborder="0" scrolling="no" src="http://widget.weibo.com/weiboshow/index.php?language=&width=0&height=550&fansRow=2&ptype=1&speed=0&skin=1&isTitle=1&noborder=1&isWeibo=1&isFans=1&uid=3283060652&verifier=bb6a83de&dpc=1"></iframe>
</div>
</aside>
</div>
</div>
<footer><div id="footer" >
<div class="line">
<span></span>
<div class="author"></div>
</div>
<section class="info">
<p> Hello ,I'm wendale. <br/>
This is my blog,wish you like it.</p>
</section>
<div class="social-font" class="clearfix">
<a href="http://weibo.com/wendale_8375## e.g. wuchong1014 or 2176287895 for http://weibo.com/2176287895" target="_blank" class="icon-weibo" title="微博"></a>
</div>
<p class="copyright">
Powered by <a href="http://hexo.io" target="_blank" title="hexo">hexo</a> and Theme by <a href="https://github.com/wuchong/jacman" target="_blank" title="Jacman">Jacman</a> © 2016
<a href="/about" target="_blank" title="wendale">wendale</a>
</p>
</div>
</footer>
<script src="/js/jquery-2.0.3.min.js"></script>
<script src="/js/jquery.imagesloaded.min.js"></script>
<script src="/js/gallery.js"></script>
<script type="text/javascript">
$(document).ready(function(){
$('.navbar').click(function(){
$('header nav').toggleClass('shownav');
});
var myWidth = 0;
function getSize(){
if( typeof( window.innerWidth ) == 'number' ) {
myWidth = window.innerWidth;
} else if( document.documentElement && document.documentElement.clientWidth) {
myWidth = document.documentElement.clientWidth;
};
};
var m = $('#main'),
a = $('#asidepart'),
c = $('.closeaside'),
o = $('.openaside');
c.click(function(){
a.addClass('fadeOut').css('display', 'none');
o.css('display', 'block').addClass('fadeIn');
m.addClass('moveMain');
});
o.click(function(){
o.css('display', 'none').removeClass('beforeFadeIn');
a.css('display', 'block').removeClass('fadeOut').addClass('fadeIn');
m.removeClass('moveMain');
});
$(window).scroll(function(){
o.css("top",Math.max(80,260-$(this).scrollTop()));
});
$(window).resize(function(){
getSize();
if (myWidth >= 1024) {
$('header nav').removeClass('shownav');
}else{
m.removeClass('moveMain');
a.css('display', 'block').removeClass('fadeOut');
o.css('display', 'none');
}
});
});
</script>
<script type="text/javascript">
var duoshuoQuery = {short_name:"wendale"};
(function() {
var ds = document.createElement('script');
ds.type = 'text/javascript';ds.async = true;
ds.src = '//static.duoshuo.com/embed.js';
ds.charset = 'UTF-8';
(document.getElementsByTagName('head')[0]
|| document.getElementsByTagName('body')[0]).appendChild(ds);
})();
</script>
<link rel="stylesheet" href="/fancybox/jquery.fancybox.css" media="screen" type="text/css">
<script src="/fancybox/jquery.fancybox.pack.js"></script>
<script type="text/javascript">
$(document).ready(function(){
$('.article-content').each(function(i){
$(this).find('img').each(function(){
if ($(this).parent().hasClass('fancybox')) return;
var alt = this.alt;
if (alt) $(this).after('<span class="caption">' + alt + '</span>');
$(this).wrap('<a href="' + this.src + '" title="' + alt + '" class="fancybox"></a>');
});
$(this).find('.fancybox').each(function(){
$(this).attr('rel', 'article' + i);
});
});
if($.fancybox){
$('.fancybox').fancybox();
}
});
</script>
<!-- Analytics Begin -->
<script type="text/javascript">
var _bdhmProtocol = (("https:" == document.location.protocol) ? " https://" : " http://");
document.write(unescape("%3Cscript src='" + _bdhmProtocol + "hm.baidu.com/h.js%3F96f43d712694a08946a01389510d03bb' type='text/javascript'%3E%3C/script%3E"));
</script>
<!-- Analytics End -->
<!-- Totop Begin -->
<div id="totop">
<a title="返回顶部"><img src="/img/scrollup.png"/></a>
</div>
<script src="/js/totop.js"></script>
<!-- Totop End -->
<!-- MathJax Begin -->
<!-- mathjax config similar to math.stackexchange -->
<!-- MathJax End -->
<!-- Tiny_search Begin -->
<!-- Tiny_search End -->
</body>