-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataCamp.Course_013_Data_Visualization_with_ggplot2_pt2
1796 lines (1237 loc) · 64.8 KB
/
DataCamp.Course_013_Data_Visualization_with_ggplot2_pt2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
######################################################################
######################################################################
######################################################################
# COURSE 013_Data Visualization with ggplot2 (Part 2)
######################################################################
######################################################################
######################################################################
######## Statistics (Module 01-013)
######################################################################
ggplot 2 course
Stats and Geoms
VIDEO
Statistics layer
Two categories of functions
called from within a geom
called independently
stat_bin : counts # of observations in a group
stat_ geom_
stat_bin() geom_histogram()
stat_bin() geom_bar()
stat_bin() geom_freqpoly()
stat_smooth() geom_smooth()
### smoothing, e.g. passing a Savitzky-Golay window over the data
Ex:
# LOESS smooth per species without the confidence ribbon.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  geom_smooth(se = FALSE, span = 0.4)
# The span value chosen here makes the curve less smooth.

# In ggplot2 the method argument can call parametric models such as:
# lm, glm, rlm, gam.
# For larger groups the method is set to gam by default.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  geom_smooth(method = "lm")

# Same linear fits, confidence ribbon removed.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Ask for predictions over the full range of the plot. This needs
# fullrange = TRUE; FALSE is the default and only spans the data.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  geom_smooth(method = "lm", fullrange = TRUE)
#---------------------------------------------------------------------
Smoothing
Welcome to the exercises for the second ggplot2 course!
To practice on the remaining four layers (statistics, coordinates, facets and themes), we'll continue working on several datasets that we already encountered in the first course.
The mtcars dataset contains information for 32 cars from Motor Trends magazine from 1973. This dataset is small, intuitive, and contains a variety of continuous and categorical (both nominal and ordinal) variables.
In the previous course we learned how to effectively use some basic geometries, such as point, bar and line. In the first chapter of this course we'll explore statistics associated with specific geoms, for example, smoothing and lines.
# ggplot2 is expected to be attached before running this section.

# Inspect the structure of the mtcars data frame.
str(mtcars)

# Scatter plot of mpg against wt with a LOESS smooth on top.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth()

# Same scatter plot, smoothed with an ordinary least squares linear model.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

# Linear model again, this time without the confidence-interval ribbon.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Only the fitted line: the point layer is left out.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_smooth(se = FALSE, method = "lm")
#---------------------------------------------------------------------
Grouping variables
We'll continue with the previous exercise by considering the situation of looking at sub-groups in our dataset. For this we'll encounter the invisible group aesthetic.
# ggplot2 is expected to be attached before running this section.

# 1 - Colour by cyl treated as a factor: one linear fit per cylinder group.
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm")

# 2 - Plot 1 plus a second stat_smooth() whose nested aes(group = 1)
#     fits a single model across all observations.
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm") +
  stat_smooth(mapping = aes(group = 1), se = FALSE, method = "lm")
#---------------------------------------------------------------------
Modifying stat_smooth
In the previous exercise we used se = FALSE in stat_smooth() to remove the 95% Confidence Interval. Here we'll consider another argument, span, used in LOESS smoothing, and we'll take a look at a nice scenario of properly mapping different models.
ggplot2 is already loaded and several of the linear models we looked at in the two previous exercises are already given.
# Plot 1: LOESS smooth with an explicit span of 0.7, no ribbon.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(span = 0.7, se = FALSE)

# Plot 2: per-cylinder linear fits plus one black LOESS curve
#         (span 0.7) fitted to all points via aes(group = 1).
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm") +
  stat_smooth(
    mapping = aes(group = 1),
    method = "loess",
    span = 0.7,
    se = FALSE,
    col = "black"
  )

# Plot 3: map col to the string "All" inside aes() of the overall smooth
#         (instead of setting col outside aes()).
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm") +
  stat_smooth(
    mapping = aes(group = 1, col = "All"),
    method = "loess",
    span = 0.7,
    se = FALSE
  )

# Plot 4: same as plot 3 with a manual colour scale.
# brewer.pal() comes from RColorBrewer, which must already be attached.
myColors <- c(brewer.pal(3, "Dark2"), "black")
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = factor(cyl))) +
  geom_point() +
  stat_smooth(span = 0.7, se = FALSE, method = "lm") +
  stat_smooth(
    mapping = aes(group = 1, col = "All"),
    method = "loess",
    span = 0.7,
    se = FALSE
  ) +
  # Legend title "Cylinders"; colours taken from myColors.
  scale_color_manual("Cylinders", values = myColors)
#---------------------------------------------------------------------
Modifying stat_smooth (2)
In this exercise we'll take a look at a more subtle example of defining and using linear models. ggplot2 and the Vocab data frame are already loaded for you.
# Plot 1: jittered, semi-transparent points with one linear-model smooth.
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2) +
  stat_smooth(se = FALSE, method = "lm")

# Plot 2: jittered points only, coloured by year.
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary, col = year)) +
  geom_jitter(alpha = 0.2)

# Plot 3: linear models only, one per year (year used as a factor).
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary, col = factor(year))) +
  stat_smooth(se = FALSE, method = "lm")

# Plot 4: plot 3 with a colour brewer palette.
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary, col = factor(year))) +
  stat_smooth(se = FALSE, method = "lm") +
  scale_color_brewer()

# Plot 5: col maps to year directly while the group aesthetic uses
#         factor(year); alpha and size are set on the smooth layer.
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary,
                     col = year, group = factor(year))) +
  stat_smooth(size = 2, alpha = 0.6, se = FALSE, method = "lm") +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))
#---------------------------------------------------------------------
Quantiles
The previous example used the Vocab dataset and applied linear models describing vocabulary by education for different years. Here we'll continue with that example by using stat_quantile() to apply a quantile regression (method rq).
By default, the 1st, 2nd (i.e. median), and 3rd quartiles are modeled as a response to the predictor variable, in this case education. Specific quantiles can be specified with the quantiles argument.
If you want to specify many quantile and color according to year, then things get too busy. We'll explore ways of dealing with this in the next chapter.
# Quantile regression lines per year via stat_quantile()
# (used here in place of stat_smooth()).
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary,
                     col = year, group = factor(year))) +
  stat_quantile(size = 2, alpha = 0.6) +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))

# Same plot restricted to the 0.5 quantile.
ggplot(data = Vocab,
       mapping = aes(x = education, y = vocabulary,
                     col = year, group = factor(year))) +
  stat_quantile(size = 2, alpha = 0.6, quantiles = 0.5) +
  scale_color_gradientn(colors = brewer.pal(9, "YlOrRd"))
#---------------------------------------------------------------------
Sum
Another useful stat function is stat_sum(). This function calculates the total number of overlapping observations and is another good alternative to overplotting.
# Plot 1: base object p holds the jittered, semi-transparent scatter plot.
p <- ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2)

# Plot 2: add the sum statistic on top of p.
p + stat_sum()

# Plot 3: sum statistic with the size scale range set to 1-10.
p +
  stat_sum() +
  scale_size(range = c(1, 10))
#---------------------------------------------------------------------
VIDEO
Stats outside Geoms
# Jittered points of sepal length for each species.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_point(position = position_jitter(0.2))

# What can we summarise for continuous variables? The mean, the standard
# deviation or the 95% confidence interval (CI). These values can be
# calculated with base R and stored in a new data frame, or a function of
# the Hmisc package can be called from within ggplot2.

# Example with random numbers.
set.seed(123)
xx <- rnorm(100)

mean(xx)
# Mean

mean(xx) + (sd(xx) * c(-1, 1))
# Lower Upper

library(Hmisc)
smean.sdl(xx, mult = 1)
# Mean Lower Upper

# Hmisc vs. ggplot2: only the names of the returned values differ.
# Hmisc
smean.sdl(xx, mult = 1)
# Mean Lower Upper
# ggplot2
mean_sdl(xx, mult = 1)
# y ymin ymax

# Using the summary function inside a plot.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1))
# This uses geom_pointrange() by default.

ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar",
    width = 0.1
  )
# Now it looks more like error bars.

ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar",
    width = 0.1
  )
# Bars with error bars are possible, but not recommended.

### 95% confidence interval
ERR <- qt(0.975, length(xx) - 1) * (sd(xx) / sqrt(length(xx)))
mean(xx) + (ERR * c(-1, 1))

# Hmisc
smean.cl.normal(xx)
# Mean Lower Upper
# ggplot2
mean_cl_normal(xx)
# y ymin ymax

ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.data = mean_cl_normal, width = 0.1)
#### Other stat_ functions
stat_ description
stat_summary() Summarise 'y' values at distinct 'x' value
stat_function() Compute 'y' values from a function of 'x' values
stat_qq() Perform calculations for quantile-quantile plot
ex:
# Normal distribution: overlay a normal density (sample mean and sd of the
# log10 body values) on the density-scaled histogram.
library(MASS)
mam.new <- data.frame(body = log10(mammals$body))
ggplot(mam.new, aes(x = body)) +
  geom_histogram(aes(y = ..density..)) +
  geom_rug() +
  # Extra arguments for `fun` must be passed through `args`; the original
  # `arg` is not matched, so mean and sd never reached dnorm().
  stat_function(fun = dnorm, colour = "red",
                args = list(mean = mean(mam.new$body),
                            sd = sd(mam.new$body)))

# Another way to see whether a sample matches a normal distribution is a
# QQ plot.
# Reference line through the 1st and 3rd quartiles: slope and intercept
# are stored as (constant) columns so geom_abline() can map them.
mam.new$slope <- diff(quantile(mam.new$body, c(0.25, 0.75))) /
  diff(qnorm(c(0.25, 0.75)))
mam.new$int <- quantile(mam.new$body, 0.25) -
  mam.new$slope * qnorm(0.25)
ggplot(mam.new, aes(sample = body)) +
  stat_qq() +
  geom_abline(aes(slope = slope, intercept = int), col = "red")
#---------------------------------------------------------------------
Preparations
Here we'll look at stat_summary() in action. We'll build up various plots one-by-one.
In this exercise we'll consider the preparations. That means we'll make sure the data is in the right format and that all the positions that we might use in our plots are defined. Lastly, we'll set the base layer for our plot. ggplot2 is already loaded, so you can get started straight away!
# Look at the structure of mtcars before changing it.
str(mtcars)

# cyl and am become factors (mtcars is modified in place).
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)

# Position objects reused by the following plots:
# dodge, jitter + dodge, and plain jitter.
posn.d <- position_dodge(width = 0.1)
posn.jd <- position_jitterdodge(dodge.width = 0.2, jitter.width = 0.1)
posn.j <- position_jitter(width = 0.2)

# Base layer: wt by cyl, with colour, fill and group all mapped to am.
wt.cyl.am <- ggplot(
  data = mtcars,
  mapping = aes(x = cyl, y = wt, col = am, fill = am, group = am)
)
#---------------------------------------------------------------------
Plotting variations
Now that the preparation work is done, let's have a look at stat_summary().
ggplot2 is already loaded, as is wt.cyl.am, which is defined as
wt.cyl.am <- ggplot(mtcars, aes(x = cyl, y = wt, col = am, fill = am, group = am))
Also all the position objects of the previous exercise, posn.d, posn.jd and posn.j, are available. For starters, Plot 1 is already coded for you
# Requires wt.cyl.am, posn.d, posn.jd and posn.j from the preparation step.

# Plot 1: jittered and dodged scatter plot with transparent points.
wt.cyl.am +
  geom_point(alpha = 0.6, position = posn.jd)

# Plot 2: mean and SD (mult = 1) added through mean_sdl.
wt.cyl.am +
  geom_point(alpha = 0.6, position = posn.jd) +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = posn.d
  )

# Plot 3: mean and 95% CI added through mean_cl_normal.
wt.cyl.am +
  geom_point(alpha = 0.6, position = posn.jd) +
  stat_summary(fun.data = mean_cl_normal, position = posn.d)

# Plot 4: mean as a point and SD as T-tipped error bars, as two layers.
wt.cyl.am +
  stat_summary(fun.y = mean, geom = "point", position = posn.d) +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar",
    width = 0.1,
    position = posn.d
  )
#---------------------------------------------------------------------
Custom Functions
In the video we saw that the only difference between ggplot2::mean_sdl() and Hmisc::smean.sdl() is the naming convention. In order to use the results of a function directly in ggplot2 we need to ensure that the names of the variables match the aesthetics needed for our respective geoms.
Here we'll create two new functions in order to create the plot shown in the viewer. One function will measure the full range of the dataset and the other will measure the interquartile range.
A play vector, xx, has been created for you. Execute
mean_sdl(xx, mult = 1)
in the R Console and consider the format of the output. You'll have to produce functions which return similar outputs.
# Play vector xx is available
# Function to save range for use in ggplot
# Full range of x, named for use as fun.data in stat_summary().
#
# x: vector accepted by min() and max().
# Returns a one-row data frame with columns ymin (minimum of x) and
# ymax (maximum of x).
gg_range <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  data.frame(ymin = lowest, ymax = highest)
}
gg_range(xx)
# Required output
# ymin ymax
# 1 1 100
# Custom function: median and interquartile range for use in ggplot
# Median and interquartile range of x, named for use as fun.data in
# stat_summary().
#
# x: numeric vector.
# Returns a one-row data frame with columns y (median), ymin (1st
# quartile) and ymax (3rd quartile). The quartiles are the 2nd and 4th
# elements of the default quantile() output, so the row keeps the
# "25%" name.
med_IQR <- function(x) {
  quartiles <- quantile(x)
  data.frame(
    y = median(x),
    ymin = quartiles[2],
    ymax = quartiles[4]
  )
}
med_IQR(xx)
# Required output
# y ymin ymax
# 25% 50.5 25.75 75.25
#---------------------------------------------------------------------
Custom Functions (2)
In the last exercise we created functions that will allow us to plot the so-called five-number summary (the minimum, 1st quartile, median, 3rd quartile, and the maximum). Here, we'll implement that into a unique plot type.
All the functions and objects from the previous exercise are available including the updated mtcars data frame, the position object posn.d, the base layers wt.cyl.am and the functions med_IQR() and gg_range().
The plot you'll end up with at the end of this exercise is shown on the right. When using stat_summary() recall that the fun.data argument requires a properly labelled 3-element long vector, which we saw in the previous exercises. The fun.y argument requires only a 1-element long vector.
# Base layer, identical to the one from the preparation exercise.
wt.cyl.am <- ggplot(
  data = mtcars,
  mapping = aes(x = cyl, y = wt, col = am, fill = am, group = am)
)

# Three summary layers, all dodged with posn.d:
#   1. thick line range from med_IQR (1st to 3rd quartile),
#   2. semi-transparent line range from gg_range (min to max),
#   3. the median drawn as a black "X".
wt.cyl.am +
  stat_summary(
    fun.data = med_IQR,
    geom = "linerange",
    size = 3,
    position = posn.d
  ) +
  stat_summary(
    fun.data = gg_range,
    geom = "linerange",
    size = 3,
    alpha = 0.4,
    position = posn.d
  ) +
  stat_summary(
    fun.y = median,
    geom = "point",
    size = 3,
    shape = "X",
    col = "black",
    position = posn.d
  )
######## Coordinates and Facets (Module 02-013)
######################################################################
VIDEO
Coordinates Layer
- Controls plot dimensions
- coord_
- coord_cartesian()
### Zooming in
- scale_x_continuous(limits = ...)
- xlim()
- coord_cartesian(xlim = ...)
# Scatter plot with a smooth per species, stored for the zooming
# comparisons below.
iris.smooth <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point(alpha = 0.7) +
  geom_smooth()
iris.smooth

# Variant 1: limits set through scale_x_continuous().
iris.smooth +
  scale_x_continuous(limits = c(4.5, 5.5))

# Variant 2: limits set through xlim().
iris.smooth +
  xlim(c(4.5, 5.5))

# Variant 3: limits set through coord_cartesian(xlim = ...).
iris.smooth +
  coord_cartesian(xlim = c(4.5, 5.5))
### Aspect Ratio
- Height-to-width ratio
- Deception!
- Standardization attempts
- Typically 1:1
library(reshape2)
library(zoo)

# Monthly sunspot series as a data frame: the time index becomes `year`
# and the melted values become `value`.
sunspots.m <- data.frame(
  year = index(sunspots.month),
  value = melt(sunspots.month)$value
)

# Line plot with a 1:1 aspect ratio.
ggplot(data = sunspots.m, mapping = aes(x = year, y = value)) +
  geom_line() +
  coord_equal()

# Same line plot with a different fixed aspect ratio (0.055).
ggplot(data = sunspots.m, mapping = aes(x = year, y = value)) +
  geom_line() +
  coord_fixed(0.055)
#---------------------------------------------------------------------
Zooming In
In the video, you saw different ways of using the coordinates layer to zoom in. In this exercise, we'll compare some of the techniques again.
As usual, you'll be working with the mtcars dataset, which is already cleaned up for you (cyl and am are categorical variables). Also p, a ggplot object you coded in the previous chapter, is already available. Execute p in the console to check it out.
# Base plot: hp against wt, coloured by am, with points and a smooth.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = hp, col = am)) +
  geom_point() +
  geom_smooth()

# Zoom by setting scale limits (no expansion around the limits).
p +
  scale_x_continuous(expand = c(0, 0), limits = c(3, 6))

# Zoom with coord_cartesian(), described in the notes as the proper way.
p +
  coord_cartesian(xlim = c(3, 6))
#---------------------------------------------------------------------
Aspect Ratio
We can set the aspect ratio of a plot with coord_fixed() or coord_equal(). Both use ratio = 1 as a default. A 1:1 aspect ratio is most appropriate when two continuous variables are on the same scale, as with the iris dataset.
All variables are measured in centimeters, so it only makes sense that one unit on the plot should be the same physical distance on each axis. This gives a more truthful depiction of the relationship between the two variables since the aspect ratio can change the angle of our smoothing line. This would give an erroneous impression of the data.
Of course the underlying linear models don't change, but our perception can be influenced by the angle drawn.
# Jittered scatter plot of the sepal measurements with one linear fit
# per species (no confidence ribbon).
base.plot <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_jitter() +
  geom_smooth(se = FALSE, method = "lm")

# Default aspect ratio.
base.plot

# Aspect ratio fixed at 1:1.
base.plot +
  coord_equal()
#---------------------------------------------------------------------
Pie Charts
The coord_polar() function converts a planar x-y Cartesian plot to polar coordinates. This can be useful if you are producing pie charts.
We can imagine two forms for pie charts - the typical filled circle, or a colored ring.
As an example, consider the stacked bar chart shown in the viewer. Imagine that we just take the y axis on the left and bend it until it loops back on itself, while expanding the right side as we go along. We'd end up with a pie chart - it's simply a bar chart transformed onto a polar coordinate system.
Typical pie charts omit all of the non-data ink, which we'll learn about in the next chapter. Pie charts are not really better than stacked bar charts, but we'll come back to this point in the fourth chapter on best practices.
The mtcars data frame is available, with cyl converted to a factor for you.
# wide.bar: a single stacked bar at x = 1, filled by cyl.
wide.bar <- ggplot(data = mtcars, mapping = aes(x = 1, fill = cyl)) +
  geom_bar()

# Bending the y axis into polar coordinates turns it into a pie chart.
wide.bar +
  coord_polar(theta = "y")

# thin.bar: the same stacked bar, narrow (width 0.1) on an x axis
# limited to 0.5-1.5.
thin.bar <- ggplot(data = mtcars, mapping = aes(x = 1, fill = cyl)) +
  geom_bar(width = 0.1) +
  scale_x_continuous(limits = c(0.5, 1.5))

# In polar coordinates the thin bar becomes a "ring" type pie chart.
thin.bar +
  coord_polar(theta = "y")
#---------------------------------------------------------------------
VIDEO
Facets Layer
- straightforward yet useful
- Concept of small multiples
# Jittered scatter plot of Width against Length, coloured by Part.
# iris.wide is not created in these notes; presumably a wide-format
# version of iris prepared earlier in the course - confirm before running.
p <- ggplot(iris.wide, aes(x = Length,
                           y = Width,
                           col = Part)) +
  geom_point(position = position_jitter(),
             alpha = 0.7) +
  # The argument is `palette` and the brewer palette is "Set1"; the
  # original `pallette = "Setl"` misspelled both.
  scale_color_brewer(palette = "Set1") +
  coord_fixed()

# One panel per species, laid out as columns.
p + facet_grid(. ~ Species)
#---------------------------------------------------------------------
Facets: the basics
The most straightforward way of using facets is facet_grid(). Here we just need to specify the categorical variable to use on rows and columns using standard R formula notation (rows ~ columns).
Notice that we can also take advantage of ordinal variables by positioning them in the correct order as columns or rows, as is the case with the number of cylinders. Get some hands-on practice in this exercise; ggplot2 is already loaded for you and mtcars is available. The variables cyl and am are factors. However, this is not necessary for facets; ggplot2 will coerce variables to factors in this case.
# Scatter plot of mpg against wt, reused for every facet layout below.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# 1 - Rows by transmission type (am).
p + facet_grid(am ~ .)

# 2 - Columns by number of cylinders (cyl).
p + facet_grid(. ~ cyl)

# 3 - Rows by am and columns by cyl.
p + facet_grid(am ~ cyl)
#---------------------------------------------------------------------
Many variables
Facets are another way of presenting categorical variables. Recall that we saw all the ways of combining variables, both categorical and continuous, in the aesthetics chapter. Sometimes it's possible to overdo it. Here we'll present a plot with 6 variables and see if we can add even more.
Let's begin by using a trick to map two variables onto two color scales - hue and lightness. We combine cyl and am into a single variable cyl_am. To accommodate this we also make a new color palette with alternating red and blue of increasing darkness. This is saved as myCol. If you are not familiar with these steps, execute the code piece-by-piece.
# cyl_am joins cyl and am with "_"; myCol stacks three Blues over three
# Reds as the manual colour values for the combined variable.
mtcars$cyl_am <- paste(mtcars$cyl, mtcars$am, sep = "_")
myCol <- rbind(
  brewer.pal(9, "Blues")[c(3, 6, 8)],
  brewer.pal(9, "Reds")[c(3, 6, 8)]
)

# Colour by cyl_am using the manual colour scale.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = cyl_am)) +
  geom_point() +
  scale_color_manual(values = myCol)

# Add a facet grid: gear on rows, vs on columns.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = cyl_am)) +
  geom_point() +
  scale_color_manual(values = myCol) +
  facet_grid(gear ~ vs)

# Additionally map disp to point size.
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, col = cyl_am, size = disp)) +
  geom_point() +
  scale_color_manual(values = myCol) +
  facet_grid(gear ~ vs)
#---------------------------------------------------------------------
Dropping levels
When you have a categorical variable with many levels which are not all present in each sub-group of another variable, it may be desirable to drop the unused levels. As an example let's return to the mammalian sleep dataset, mamsleep. It is available in your workspace.
The variables of interest here are name, which contains the full popular name of each animal, and vore, the eating behavior. Each animal can only be classified under one eating habit, so if we facet according to vore, we don't need to repeat the full list in each sub-plot.
# Basic scatter plot: one row per animal name, coloured by sleep.
# mamsleep is expected to be available in the workspace.
p <- ggplot(mamsleep, aes(x = time, y = name, col = sleep)) +
  geom_point()

# Execute to display plot
p

# Facet rows according to vore
p +
  facet_grid(vore ~ .)

# Free the y scale and the panel heights so each facet only lists its own
# animals. The argument is spelled out as `scales` instead of relying on
# partial matching of `scale`.
p +
  facet_grid(vore ~ ., scales = "free_y", space = "free_y")
######## Themes (Module 03-013)
######################################################################
Themes Layer
All the non-data ink
Visual elements not part of data
Three types
text element_text()
line element_line()
rectangle element_rect()
All of these are arguments of theme
# element_text()
theme( text= element_text()
title =
plot.title =
legend.text =
legend.title =
axis.title =
axis.title.x =
axis.title.y =
axis.text =
axis.text.x =
axis.text.y =
strip.text =
strip.text.x =
strip.text.y =
)
# element_line()
theme( line = element_line()
axis.ticks =
axis.ticks.x =
axis.ticks.y =
axis.line =
axis.line.x =
axis.line.y =
panel.grid =
panel.grid.major =
panel.grid.minor =
panel.grid.major.x =
panel.grid.major.y =
panel.grid.minor.x =
panel.grid.minor.y =
)
# element_rect()
theme( rect = element_rect()
legend.background =
legend.key =
panel.background =
panel.border =
plot.background =
strip.background =
)
Inheritance
text
title
plot.title
legend.title
axis.title
axis.title.x
axis.title.y
legend.text
axis.text
axis.text.x
axis.text.y
strip.text
strip.text.x
strip.text.y
line
axis.ticks
axis.ticks.x
axis.ticks.y
axis.line
axis.line.x
axis.line.y
panel.grid
panel.grid.major
panel.grid.major.x
panel.grid.major.y
panel.grid.minor
panel.grid.minor.x
panel.grid.minor.y
rect
legend.background
legend.key
panel.background
panel.border
plot.background
strip.background
# element_blank
# We use it to remove elements. Blanking the three root elements (text,
# line, rect) is shown here; the arguments need commas between them to
# parse.
theme(text = element_blank(),
      line = element_blank(),
      rect = element_blank()
)
#---------------------------------------------------------------------
Rectangles
To understand all the arguments for the themes, you'll modify an existing plot over the next series of exercises.
Here you'll focus on the rectangles of the plotting object z that has already been created for you. If you type z in the console, you can check it out. The goal is to turn z into the plot in the viewer. Do this by following the instructions step by step.
# Starting point: z and myPink are provided by the exercise workspace.
z

# Plot 1: plot background filled with myPink.
z +
  theme(
    plot.background = element_rect(fill = myPink)
  )

# Plot 2: plot 1 plus a black border of size 3.
z +
  theme(
    plot.background = element_rect(
      fill = myPink,
      color = "black",
      size = 3
    )
  )

# Reusable theme that blanks every rectangle element.
no_panels <- theme(rect = element_blank())

# Plot 3: no_panels first, then the plot background from plot 2.
z +
  no_panels +
  theme(
    plot.background = element_rect(
      fill = myPink,
      color = "black",
      size = 3
    )
  )
#---------------------------------------------------------------------
Lines
To change the appearance of lines use the element_line() function.
The plot you created in the last exercise, with the fancy pink background, is available as the plotting object z. Your goal is to produce the plot in the viewer - no grid lines, but red axes and tick marks.
For each of the arguments that specify lines, use element_line() to modify attributes. e.g. element_line(color = "red").
Remember, to remove a non-data element, use element_blank().
# Three line-related theme arguments on z: grid lines removed,
# axis lines and tick marks drawn in red.
z +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "red"),
    axis.ticks = element_line(color = "red")
  )
#---------------------------------------------------------------------
Text
Next we can make the text on your plot prettier and easier to spot. You can do this through the element_text() function and by passing the appropriate arguments inside the theme() function.
As before, the plot you've created in the previous exercise is available as z. The plot you should end up with after successfully completing this exercises is shown in the viewer.
# Show the original plot and the provided colour value.
z
myRed

# Three text-related theme arguments on z: larger strip labels in myRed,
# italic axis titles in myRed with hjust = 0, and black axis text.
z +
  theme(
    strip.text = element_text(size = 16, color = myRed),
    axis.title = element_text(color = myRed, hjust = 0, face = "italic"),
    axis.text = element_text(color = "black")
  )
#---------------------------------------------------------------------
Legends
The themes layer also allows you to specify the appearance and location of legends.
The plot you've coded up to now is available as z. It's also displayed in the viewer. Solve the instructions and compare the resulting plots with the plot you started with.
# Legend placed by coordinates.
z + theme(legend.position = c(0.85, 0.85))

# Legend direction set to horizontal.
z + theme(legend.direction = "horizontal")

# Legend placed by name, at the bottom.
z + theme(legend.position = "bottom")

# Legend removed entirely.
z + theme(legend.position = "none")
#--------------------------------------------------------------------
Positions
The different rectangles of your plot have spacing between them. There's spacing between the facets, between the axis labels and the plot rectangle, between the plot rectangle and the entire panel background, etc. Let's experiment!
The last plot you created in the previous exercise, without a legend, is available as z.
# unit() is provided by the grid package.
library(grid)

# 2 cm of horizontal spacing between facets.
z +
  theme(
    panel.spacing.x = unit(2, "cm")
  )

# Same spacing, plus a plot margin of c(1, 2, 1, 1) cm.
z +
  theme(
    panel.spacing.x = unit(2, "cm"),
    plot.margin = unit(c(1, 2, 1, 1), "cm")
  )
#---------------------------------------------------------------------
VIDEO
Recycling Themes
- Many plots
- Consistency in style
- Apply specific theme everywhere
# Base plot z: jittered sepal measurements coloured by species, with
# relabelled legend entries, fixed axis limits without expansion, a title
# and a fixed 1:1 coordinate ratio.
z <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_jitter(alpha = 0.7) +
  scale_color_brewer(
    "Species",
    palette = "Dark2",
    labels = c("Setosa", "Versicolor", "Virginica")
  ) +
  scale_y_continuous("Width (cm)", limits = c(2, 4.5), expand = c(0, 0)) +
  scale_x_continuous("Length (cm)", limits = c(4, 8), expand = c(0, 0)) +
  ggtitle("Sepals") +
  coord_fixed(1)
z

# Theme settings applied inline: backgrounds, legend key and grid blanked;
# axis text and axis lines in black.
z +
  theme(
    panel.background = element_blank(),
    legend.background = element_blank(),
    legend.key = element_blank(),
    panel.grid = element_blank(),
    axis.text = element_text(colour = "black"),
    axis.line = element_line(colour = "black")
  )

# The same settings stored as a reusable theme object.
theme_iris <- theme(
  panel.background = element_blank(),
  legend.background = element_blank(),
  legend.key = element_blank(),
  panel.grid = element_blank(),
  axis.text = element_text(colour = "black"),
  axis.line = element_line(colour = "black")
)
z + theme_iris
### Reuse theme
# iris.wide and iris.tidy are not created in these notes; presumably
# reshaped versions of iris from earlier in the course - confirm before
# running.
m <- ggplot(iris.wide, aes(x = Length, y = Width, col = Part)) +
  geom_point() +
  facet_grid(. ~ Species)
m
m + theme_iris

### Extend theme
# theme_iris is overwritten with a version that also blanks the facet
# strip background.
theme_iris <- theme_iris +
  theme(strip.background = element_blank())
m + theme_iris

### Discrete x-axis
p <- ggplot(iris.tidy, aes(x = Measure, y = Value, col = Part)) +
  # The jitter width is set through position_jitter(0.1). geom_point()
  # has no width parameter, so the stray `width = 0.4` (ignored with a
  # warning) was removed.
  geom_point(position = position_jitter(0.1), alpha = 0.6) +
  scale_y_continuous("Value (cm)", limits = c(0, 8),
                     expand = c(0, 0)) +
  facet_grid(. ~ Species)
p
p + theme_iris

### Derivative theme
# For a discrete x axis: no x axis line or ticks, and x labels rotated
# 45 degrees and right-aligned.
theme_iris_disX <- theme_iris +
  theme(axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))
p + theme_iris_disX
# Built-in theme templates: the custom theme next to theme_classic() on z.
z + theme_iris
z + theme_classic()

# Built-in theme templates on the faceted plot m, with and without the
# strip background.
m + theme_classic()
m +
  theme_classic() +
  theme(strip.background = element_blank())

# A theme from the ggthemes package.
library(ggthemes)
z + theme_tufte()

# Theme update: apply the settings through theme_update() and keep its
# return value in `original` (presumably the previous theme, so it can be
# restored later - confirm against the ggplot2 documentation).
original <- theme_update(
  panel.background = element_blank(),
  legend.background = element_blank(),
  legend.key = element_blank(),
  panel.grid = element_blank(),
  axis.text = element_text(colour = "black"),
  axis.line = element_line(colour = "black"),
  axis.ticks = element_line(colour = "black"),
  strip.background = element_blank()
)