diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000..e7334b120d Binary files /dev/null and b/.DS_Store differ diff --git a/.Rproj.user/1755C669/sources/prop/167CFA38 b/.Rproj.user/1755C669/sources/prop/167CFA38 new file mode 100644 index 0000000000..62bedca53a --- /dev/null +++ b/.Rproj.user/1755C669/sources/prop/167CFA38 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "3,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/1755C669/sources/prop/INDEX b/.Rproj.user/1755C669/sources/prop/INDEX new file mode 100644 index 0000000000..a7584ad287 --- /dev/null +++ b/.Rproj.user/1755C669/sources/prop/INDEX @@ -0,0 +1,3 @@ +~%2FDocuments%2FR%20Directory%2FWebsites%2Fpdsnd_github%2F.gitignore="167CFA38" +~%2FDocuments%2FR%20Directory%2FWebsites%2Fpdsnd_github%2FREADME.md="D3748000" +~%2FDocuments%2FR%20Directory%2FWebsites%2Fpdsnd_github%2Fbike_sharing%2Fbike_sharing.Rmd="90455F94" diff --git a/.Rproj.user/shared/notebooks/patch-chunk-names b/.Rproj.user/shared/notebooks/patch-chunk-names new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.gitignore b/.gitignore index c129b08a55..975716b205 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ .github/** +.Rpserroj.u +.csv +.Rproj.user diff --git a/bike_sharing/.DS_Store b/bike_sharing/.DS_Store new file mode 100644 index 0000000000..5008ddfcf5 Binary files /dev/null and b/bike_sharing/.DS_Store differ diff --git a/bike_sharing/bike_sharing.Rmd b/bike_sharing/bike_sharing.Rmd new file mode 100644 index 0000000000..2e94925b47 --- /dev/null +++ b/bike_sharing/bike_sharing.Rmd @@ -0,0 +1,217 @@ +--- +title: "bike_sharing" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +# Load Libraries + +```{r message=FALSE, warning=FALSE} +library(ggplot2) +library(dplyr) +library(anytime) # for time +library(lubridate) # for time +library(janitor) # to clean column names + +``` + + + 
+# load the data
+
+
+```{r}
+
+# All three city files are read with base-R read.csv().
+new_york <- read.csv("new_york_city.csv")
+washington <- read.csv("washington.csv")
+chicago <- read.csv("chicago.csv")
+
+```
+
+
+Let us observe the data:
+
+```{r}
+glimpse(new_york)
+```
+
+
+The variable names are not consistently styled, so let us use the janitor package to clean all the column names and convert them to snake_case style
+
+# clean names using janitor package
+
+
+```{r}
+tidy_new_york <- clean_names(new_york)
+tidy_washington <- clean_names(washington)
+tidy_chicago <- clean_names(chicago)
+
+```
+
+
+
+```{r}
+
+glimpse(tidy_new_york)
+```
+
+
+# Transform the dataset
+
+```{r}
+
+# Parse the trip timestamps and tag every row with the city it came from.
+# Every dataset is parsed with as_datetime(..., tz = "UTC"): anytime()
+# interprets strings in the session's local time zone by default, which would
+# shift the New York rides relative to the other two cities once combined.
+chi_clean <- tidy_chicago %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    state = "Chicago" # label used to compare between datasets
+  )
+
+
+wash_clean <- tidy_washington %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    state = "Washington" # label used to compare between datasets
+  )
+
+
+
+ny_clean <- tidy_new_york %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    state = "New York" # label used to compare between datasets
+  )
+
+```
+
+# Combine the dataframes
+
+
+```{r}
+df_all_states <- bind_rows(wash_clean, chi_clean, ny_clean)
+
+```
+
+# Question 1: What is the most common month?
+
+
+## Visualization
+
+Getting the month from the dates:
+
+```{r}
+df_all_states$month <- month(df_all_states$start_time, label = TRUE) # extracting month from the dates
+
+```
+
+Observing the new dataframe with the new column
+
+```{r}
+glimpse(df_all_states)
+```
+
+
+Plotting the month:
+
+```{r}
+
+# Rows whose start time could not be parsed have no month; drop them.
+df_all_states <- subset(df_all_states, !is.na(month))
+
+ggplot(df_all_states) +
+  geom_bar(aes(x = month, fill = state), position = "dodge") +
+  ggtitle(" Months")
+
+```
+
+From the plot, the most common month is June
+
+## Summary
+
+
+```{r}
+summary(df_all_states$month)
+
+```
+
+We can see June has the highest number of rides: 37147. July through November have 0 entries. We also have 4 entries in December.
+
+
+```{r}
+
+```
+
+
+
+# Question 2: What is the most common day of week?
+
+## Visualization
+
+
+```{r}
+df_all_states$day_of_week <- wday(df_all_states$start_time, label = TRUE) # deriving the day of the week from the date using wday() from lubridate
+
+```
+
+
+Plotting the graph of the date
+
+```{r}
+
+
+ggplot(df_all_states) +
+  geom_bar(aes(x = day_of_week, fill = state), position = "dodge") +
+  ggtitle(" Days of the week")
+
+```
+
+
+From the visualization above, we can see that Wednesday is the busiest day of the week in Chicago and Washington. However, New York seems to have a small number of rides on all days. It may turn out that people there don't rent bikes as much as in other cities.
+
+
+## Summary
+
+
+```{r}
+summary(df_all_states$day_of_week)
+
+```
+
+Still, `Wed` is the most common day of the week, with a total of `25092` rides.
+
+
+
+```{r}
+summary(chi_clean$start_time)
+
+```
+
+# Question 3: What is the most common hour of day?
+
+
+## Visualization
+
+
+```{r}
+## Create the hour-of-day field.
+# format() uses the time zone stored on start_time, whereas strftime()
+# defaults to tz = "" (the machine's local zone); month() and wday() above
+# already follow the column's own zone, so the hour now agrees with them.
+df_all_states$hour <- format(df_all_states$start_time, format = "%H")
+
+ggplot(df_all_states) +
+  geom_bar(aes(x = hour, fill = state), position = "dodge") +
+  ggtitle("Number of rides for each hour of the day")
+```
+
+
+The most common hour is 8 AM, which is when people are going to work.
+
+
+
+
+# Summary of all the questions
+
+
+```{r}
+# Summarise the three derived time fields in one table. skim() comes from the
+# skimr package, which is not attached in this notebook, so it is called with
+# an explicit namespace. The original trailing pipe was a parse error.
+skimr::skim(df_all_states, start_time, hour, day_of_week)
+
+```
+
diff --git a/bike_sharing/bike_sharing.html b/bike_sharing/bike_sharing.html
new file mode 100644
index 0000000000..5497935a61
--- /dev/null
+++ b/bike_sharing/bike_sharing.html
@@ -0,0 +1,363 @@
+
+
+
+
+
+ + + + + + + + +library(ggplot2)
+library(dplyr)
+library(anytime) # for time
+library(lubridate) # for time
+library(janitor) # to clean column names
+# Load the three city datasets; all of them are read with base-R read.csv().
+new_york <- read.csv("new_york_city.csv")
+washington <- read.csv("washington.csv")
+chicago <- read.csv("chicago.csv")
+Let us observe the data:
+# Preview the columns and types of the raw New York data.
+new_york %>%
+  glimpse()
+## Rows: 54,770
+## Columns: 9
+## $ X <int> 5688089, 4096714, 2173887, 3945638, 6208972, 1285652, 16…
+## $ Start.Time <chr> "2017-06-11 14:55:05", "2017-05-11 15:30:11", "2017-03-2…
+## $ End.Time <chr> "2017-06-11 15:08:21", "2017-05-11 15:41:43", "2017-03-2…
+## $ Trip.Duration <int> 795, 692, 1325, 703, 329, 998, 478, 4038, 5132, 309, 113…
+## $ Start.Station <chr> "Suffolk St & Stanton St", "Lexington Ave & E 63 St", "1…
+## $ End.Station <chr> "W Broadway & Spring St", "1 Ave & E 78 St", "Henry St &…
+## $ User.Type <chr> "Subscriber", "Subscriber", "Subscriber", "Subscriber", …
+## $ Gender <chr> "Male", "Male", "Male", "Female", "Male", "Male", "Male"…
+## $ Birth.Year <dbl> 1998, 1981, 1987, 1986, 1992, 1986, 1982, 1984, NA, 1992…
+The variable names are not consistently styled, so let us use the janitor package to clean all the column names and convert them to snake_case style
+# Standardise the column names of each raw data frame with clean_names().
+tidy_new_york <- new_york %>% clean_names()
+tidy_washington <- washington %>% clean_names()
+tidy_chicago <- chicago %>% clean_names()
+# Confirm the renamed columns on the New York data.
+tidy_new_york %>%
+  glimpse()
+## Rows: 54,770
+## Columns: 9
+## $ x <int> 5688089, 4096714, 2173887, 3945638, 6208972, 1285652, 16…
+## $ start_time <chr> "2017-06-11 14:55:05", "2017-05-11 15:30:11", "2017-03-2…
+## $ end_time <chr> "2017-06-11 15:08:21", "2017-05-11 15:41:43", "2017-03-2…
+## $ trip_duration <int> 795, 692, 1325, 703, 329, 998, 478, 4038, 5132, 309, 113…
+## $ start_station <chr> "Suffolk St & Stanton St", "Lexington Ave & E 63 St", "1…
+## $ end_station <chr> "W Broadway & Spring St", "1 Ave & E 78 St", "Henry St &…
+## $ user_type <chr> "Subscriber", "Subscriber", "Subscriber", "Subscriber", …
+## $ gender <chr> "Male", "Male", "Male", "Female", "Male", "Male", "Male"…
+## $ birth_year <dbl> 1998, 1981, 1987, 1986, 1992, 1986, 1982, 1984, NA, 1992…
+# Parse the trip timestamps and tag every row with the city it came from.
+# Every dataset is parsed with as_datetime(..., tz = "UTC"): anytime()
+# interprets strings in the session's local time zone by default, which would
+# shift the New York rides relative to the other two cities once combined.
+chi_clean <- tidy_chicago %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    state = "Chicago" # label used to compare between datasets
+  )
+
+
+wash_clean <- tidy_washington %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    # Typed NA placeholders matching the chr gender / dbl birth_year columns
+    # of the New York data.
+    gender = NA_character_,
+    birth_year = NA_real_,
+    state = "Washington" # label used to compare between datasets
+  )
+
+
+
+ny_clean <- tidy_new_york %>%
+  mutate(
+    start_time = as_datetime(as.character(start_time), tz = "UTC"),
+    end_time = as_datetime(as.character(end_time), tz = "UTC"),
+    state = "New York" # label used to compare between datasets
+  )
+# Stack the three cleaned city data frames into one table.
+df_all_states <- wash_clean %>%
+  bind_rows(chi_clean, ny_clean)
+Getting the month from the dates:
+# Add a labelled month column derived from each ride's start time.
+df_all_states <- df_all_states %>%
+  mutate(month = month(start_time, label = TRUE))
+Observing the new dataframe with the new column
+# Inspect the combined data frame, including the new month column.
+df_all_states %>%
+  glimpse()
+## Rows: 152,451
+## Columns: 11
+## $ x <int> 1621326, 482740, 1330037, 665458, 1481135, 1148202, 1594…
+## $ start_time <dttm> 2017-06-21 08:36:34, 2017-03-11 10:40:00, 2017-05-30 01…
+## $ end_time <dttm> 2017-06-21 08:44:43, 2017-03-11 10:46:00, 2017-05-30 01…
+## $ trip_duration <dbl> 489.066, 402.549, 637.251, 1827.341, 1549.427, 398.000, …
+## $ start_station <chr> "14th & Belmont St NW", "Yuma St & Tenley Circle NW", "1…
+## $ end_station <chr> "15th & K St NW", "Connecticut Ave & Yuma St NW", "5th &…
+## $ user_type <chr> "Subscriber", "Subscriber", "Subscriber", "Customer", "S…
+## $ gender <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
+## $ birth_year <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
+## $ state <chr> "Washington", "Washington", "Washington", "Washington", …
+## $ month <ord> Jun, Mar, May, Apr, Jun, May, Jun, Jun, Mar, Feb, Apr, A…
+Plotting the month:
+# Keep only rows that have a month, then draw ride counts per month with one
+# dodged bar per state.
+df_all_states <- df_all_states[!is.na(df_all_states$month), ]
+
+ggplot(df_all_states, aes(x = month, fill = state)) +
+  geom_bar(position = "dodge") +
+  labs(title = " Months")
+
+From the plot, the most common month is June
+# Ride counts per month across all cities.
+summary(df_all_states[["month"]])
+## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
+## 15337 18860 19235 30709 31158 37147 0 0 0 0 0 4
+We can see June has the highest number of rides: 37147. July through November have 0 entries. We also have 4 entries in December.
+# Add a labelled day-of-week column derived from each ride's start time,
+# using wday() from lubridate.
+df_all_states <- df_all_states %>%
+  mutate(day_of_week = wday(start_time, label = TRUE))
+Plotting the graph of the date
+# Ride counts per weekday, with one dodged bar per state.
+ggplot(df_all_states, aes(x = day_of_week, fill = state)) +
+  geom_bar(position = "dodge") +
+  labs(title = " Days of the week")
+
+From the visualization above, we can see that Wednesday is the busiest day of the week in Chicago and Washington. However, New York seems to have a small number of rides on all days. It may turn out that people there don't rent bikes as much as in other cities.
+# Ride counts per weekday across all cities.
+summary(df_all_states[["day_of_week"]])
+## Sun Mon Tue Wed Thu Fri Sat
+## 19220 20578 22495 25092 23204 22406 19455
+Still, Wed
is the most common day of the week, with a total of 25092 rides
## Create the hour-of-day field.
+# format() uses the time zone stored on start_time, whereas strftime()
+# defaults to tz = "" (the machine's local zone); month() and wday() already
+# follow the column's own zone, so the hour now agrees with them.
+df_all_states$hour <- format(df_all_states$start_time, format = "%H")
+
+ggplot(df_all_states) +
+  geom_bar(aes(x = hour, fill = state), position = "dodge") +
+  ggtitle("Number of rides for each hour of the day")
+
+The most common hour is 8 AM, which is when people are going to work.
+