-
Notifications
You must be signed in to change notification settings - Fork 2
/
explore_timetable.R
229 lines (190 loc) · 8.63 KB
/
explore_timetable.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
library(tidyverse)
library(purrr)
library(stringr)
library(magrittr)
library(rworldmap)
library(collections)
library(tuple)
library(lubridate)
library(ggmap)
library(svMisc)
library(doParallel)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
setwd("C:/Users/Mactuary/OneDrive - UTS/36103/AT2")
data_dir <- paste(getwd(), "Data", sep = "/")
data_dir
timetable_dir <- paste(data_dir, "Timetable", sep = "/")
filepaths <- grep("*.txt", list.files(timetable_dir), value = TRUE)
filepaths
data_names <- str_sub(filepaths, 1, -5)
data_names
filepaths <- map(filepaths, function(x) paste(timetable_dir, x, sep = "/"))
names(filepaths) <- data_names
filepaths
# Defining columng types
cols_list <- list(agency = cols(agency_id = col_character(),
agency_name = col_character(),
agency_url = col_character(),
agency_timezone = col_character(),
agency_lang = col_character(),
agency_phone = col_double()),
calendar = cols(service_id = col_character(),
monday = col_factor(),
tuesday = col_factor(),
wednesday = col_factor(),
thursday = col_factor(),
friday = col_factor(),
saturday = col_factor(),
sunday = col_factor(),
start_date = col_date(format = "%Y%m%d"),
end_date = col_date(format = "%Y%m%d")),
calendar_dates = cols(service_id = col_character(),
date = col_date(format = "%Y%m%d"),
exception_type = col_factor()),
routes = cols(route_id = col_character(),
agency_id = col_character(),
route_short_name = col_character(),
route_long_name = col_character(),
route_desc = col_factor(),
route_type = col_factor(),
route_color = col_factor(),
route_text_color = col_factor()),
shapes = cols(shape_id = col_character(),
shape_pt_lat = col_double(),
shape_pt_lon = col_double(),
shape_pt_sequence = col_integer(),
shape_dist_traveled = col_double()),
stop_times = cols(trip_id = col_character(),
arrival_time = col_character(),
departure_time = col_character(),
stop_id = col_character(),
stop_sequence = col_integer(),
stop_headsign = col_factor(),
pickup_type = col_factor(),
drop_off_type = col_factor(),
shape_dist_traveled = col_double()),
stops = cols(stop_id = col_character(),
stop_code = col_double(),
stop_name = col_character(),
stop_lat = col_double(),
stop_lon = col_double(),
location_type = col_factor(),
parent_station = col_character(),
wheelchair_boarding = col_factor(),
platform_code = col_factor()),
trips = cols(route_id = col_character(),
service_id = col_character(),
trip_id = col_character(),
shape_id = col_character(),
trip_headsign = col_character(),
direction_id = col_factor(),
block_id = col_character(),
wheelchair_accessible = col_factor())
)
files_df <- tibble(data = data_names,
filepath = filepaths,
column_def = cols_list)
glimpse(files_df)
data <- map2(files_df$filepath, files_df$column_def, ~readr::read_csv(.x, col_types = .y))
names(data) <- data_names
map(data, glimpse)
data$stop_times %<>%
mutate(arrival_time = hms(arrival_time),
departure_time = hms(departure_time))
glimpse(data$stop_times)
# Calculating the level of Public Transport In a Given Area
joined <- data$stops %>%
inner_join(data$stop_times, by = 'stop_id') %>%
inner_join(data$trips, by = 'trip_id') %>%
inner_join(data$routes, by = 'route_id')
head(joined)
glimpse(joined)
count_by_type <- joined %>%
group_by(stop_id, route_type) %>%
summarize(num_service = n()) %>%
select(stop_id,
route_type,
num_service)
glimpse(count_by_type)
head(count_by_type)
# SHAPES; assigning each stop_id to a particular LGA or locality
nsw_locality_polygon_shp_path = paste(data_dir, "Map/nsw_locality_polygon_shp/NSW_LOCALITY_POLYGON_shp.shp", sep = "/")
nsw_lga_polygon_shp_path = paste(data_dir, "Map/nsw_lga_polygon_shp/NSW_LGA_POLYGON_shp.shp", sep = "/")
nsw_locality_polygon_shp <- rgdal::readOGR(nsw_locality_polygon_shp_path)
nsw_lga_polygon_shp <- rgdal::readOGR(nsw_lga_polygon_shp_path)
nsw_lga_polygon_tbl <- as.tbl(nsw_lga_polygon_shp@data)
class(nsw_lga_polygon_tbl)
glimpse(nsw_lga_polygon_tbl)
num_stops <- nrow(data$stops)
num_stops
# Looping over the LGAs
num_lgas <- nrow(nsw_lga_polygon_shp@data)
num_lgas
coords_list = list()
for(lga in 1:num_lgas){
coords = nsw_lga_polygon_shp@polygons[[lga]]@Polygons[[1]]@coords
coords_list[[lga]] = coords
}
nsw_lga_polygon_tbl$polygon = coords_list
glimpse(nsw_lga_polygon_tbl)
# Get the stop_id and coordinates in a data.frame
stop_coords_df <- data$stops %>%
select(stop_id, stop_lon, stop_lat)
head(stop_coords_df)
rownames(stop_coords_df) <- 1:num_stops
# Calculate to which LGA a particular stop_id belongs
in_polygon = list()
Sys.time()
for(lga in 1:num_lgas){
progress(lga)
polygon_coords = nsw_lga_polygon_tbl$polygon[[lga]]
in_polygon[[lga]] = sp::point.in.polygon(stop_coords_df$stop_lon,
stop_coords_df$stop_lat,
polygon_coords[, 1],
polygon_coords[, 2])
}
Sys.time()
in_polygon_df <- data.frame(in_polygon)
# Add the lga number (in the order given in the original data)
names(in_polygon_df) <- 1:num_lgas
# Add the stop_id column
in_polygon_df$stop_id <- data$stops$stop_id
write.csv(in_polygon_df, file = paste(getwd(), "Output/in_polygon.csv", sep = "/"))
# Check that for each stop (row), the sum is 1, i.e. that it only belongs to one LGA
stop_id_total_lga <- in_polygon_df %>%
select(-stop_id) %>%
apply(1, function(x) sum(x != 0))
table(stop_id_total_lga)
# 0: 189 stops don't belong to an LGA
# 1: 43927 stops that are strictly within an LGA
# 2: 77 stops lies that on the relative interios of an edge of a polygon
# 3: 0 stops are the vertex of a polygon
# For each stop_id, get the LGA number, i.e. 1:197.
stop_id_lga_map <- list()
for(stop in 1:num_stops){
index <- which(in_polygon_df[stop, ] == 1)
# Consider only the first lga
if(length(index) == 1)
stop_id_lga_map[[stop]] <- index
else{
stop_id_lga_map[[stop]] <- index[1]
}
}
# Append the stop_id_total_lga to the stop_coord_df
stop_coords_df$total_lga <- stop_id_total_lga
# Append the stop_id_lga_map to the stop_coord_df
stop_coords_df$lga_number <- unlist(stop_id_lga_map)
# Join the three tables:
# 1. stop_coords_df
# 2. nsw_lga_polygon_tbl (the table containing the LGA information), excluding the polygon
stop_coords_df %<>%
mutate(lga_number = as.character(lga_number)) %>%
left_join(rownames_to_column(nsw_lga_polygon_tbl), by = c("lga_number" = "rowname")) %>%
select(-polygon)
# Join stop_coords_df with count_by_type to get the route_type and the
# num_service for each stop_id.
final_stop_count_coords_df <- stop_coords_df %>%
left_join(count_by_type, by = 'stop_id')
write.csv(final_stop_count_coords_df,
file = paste(getwd(), "Output/final_stop_coords_df.csv", sep = "/"))