-
Notifications
You must be signed in to change notification settings - Fork 0
/
arrow_summary.qmd
123 lines (91 loc) · 2.51 KB
/
arrow_summary.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
---
title: "Arrow_summary"
author: "Mike Spencer"
format: gfm
editor: visual
---
## Intro
This document trials different ways of reading and summarising the same data, and records how long each approach takes.
```{r}
library(tidyverse)
library(arrow)
```
## csv
### Single file, readr
```{r}
# Benchmark: summarise every CSV in `data_in` separately with readr,
# spreading the files over `parallel::mclapply()` with `mc.cores = 8`.
start_time <- proc.time()
csv_paths <- list.files("data_in", full.names = TRUE)

# Per-file summary by sex: period, mean income, and the number of rows
# with a negative `cash_min`.
# NOTE(review): presumably each file holds a single `end_of_this_period`
# value, so `unique()` yields one value per group -- confirm.
summarise_csv_readr <- function(path) {
  by_sex <- group_by(read_csv(path), sex)
  summarise(
    by_sex,
    end_of_this_period = unique(end_of_this_period),
    mean_income = mean(income),
    overdraft_users = sum(cash_min < 0)
  )
}

per_file_summaries <- parallel::mclapply(
  csv_paths,
  summarise_csv_readr,
  mc.cores = 8
)

# Left as a bare top-level expression so the combined table is printed.
bind_rows(per_file_summaries)

# Elapsed seconds for this approach (used in the results table).
tock_csv_single_readr <- proc.time()["elapsed"] - start_time["elapsed"]
```
### Single file, arrow
```{r}
# Benchmark: same per-file summary as the readr version, but each CSV is
# read with arrow's `read_csv_arrow()`.
start_time <- proc.time()
csv_paths <- list.files("data_in", full.names = TRUE)

# Read one CSV and summarise it by sex.
# NOTE(review): presumably each file holds a single `end_of_this_period`
# value, so `unique()` yields one value per group -- confirm.
summarise_csv_arrow <- function(path) {
  one_file <- read_csv_arrow(path)
  one_file %>%
    group_by(sex) %>%
    summarise(
      end_of_this_period = unique(end_of_this_period),
      mean_income = mean(income),
      overdraft_users = sum(cash_min < 0)
    )
}

per_file_summaries <- parallel::mclapply(
  csv_paths,
  summarise_csv_arrow,
  mc.cores = 8
)

# Left as a bare top-level expression so the combined table is printed.
bind_rows(per_file_summaries)

# Elapsed seconds for this approach (used in the results table).
tock_csv_single_arrow <- proc.time()["elapsed"] - start_time["elapsed"]
```
### Dataset, arrow
```{r}
# Benchmark: open all CSVs in `data_in` as a single Arrow dataset and
# aggregate by period and sex in one query.
start_time <- proc.time()
csv_paths <- list.files("data_in", full.names = TRUE)
csv_dataset <- open_csv_dataset(csv_paths)

# `collect()` brings the aggregated result back into R; the bare
# top-level expression is what prints the table.
csv_dataset %>%
  group_by(end_of_this_period, sex) %>%
  summarise(
    mean_income = mean(income),
    overdraft_users = sum(cash_min < 0)
  ) %>%
  collect()

# Elapsed seconds for this approach (used in the results table).
tock_csv_dataset_arrow <- proc.time()["elapsed"] - start_time["elapsed"]
```
## Parquet
### Single file
```{r}
# Benchmark: summarise every Parquet file under `data_part_date`
# separately, spreading the files over `parallel::mclapply()` with
# `mc.cores = 8`.
tick <- proc.time()
f <- list.files("data_part_date", recursive = TRUE, full.names = TRUE)

parallel::mclapply(f, mc.cores = 8, function(i) {
  read_parquet(i) %>%
    group_by(sex) %>%
    summarise(
      # The period is taken from the file path, not from the file contents.
      # Match the first YYYY-MM-DD token rather than fixed character
      # positions, so the lookup still works if the data directory or the
      # partition key is renamed or moved.
      end_of_this_period = as.Date(str_extract(i, "\\d{4}-\\d{2}-\\d{2}")),
      mean_income = mean(income),
      overdraft_users = sum(cash_min < 0)
    )
}) %>%
  bind_rows()

# Elapsed seconds for this approach (used in the results table).
tock_parquet_single_arrow <- proc.time()[3] - tick[3]
```
### Dataset
```{r}
# Benchmark: open the `data_part_date` directory as one Arrow dataset and
# aggregate by period and sex in a single query.
start_time <- proc.time()
parquet_dataset <- open_dataset("data_part_date")

by_period_sex <- group_by(parquet_dataset, end_of_this_period, sex)
summary_query <- summarise(
  by_period_sex,
  mean_income = mean(income),
  overdraft_users = sum(cash_min < 0)
)

# `collect()` brings the aggregated result back into R; the bare
# top-level call is what prints the table.
collect(summary_query)

# Elapsed seconds for this approach (used in the results table).
tock_parquet_dataset_arrow <- proc.time()["elapsed"] - start_time["elapsed"]
```
## Results
```{r}
# Collect the elapsed time recorded by each benchmark chunk into one table.
# The labels and the timings are listed in the same order.
method_labels <- c(
  "csv_single_readr",
  "csv_single_arrow",
  "csv_dataset_arrow",
  "parquet_single_arrow",
  "parquet_dataset_arrow"
)
elapsed_seconds <- c(
  tock_csv_single_readr,
  tock_csv_single_arrow,
  tock_csv_dataset_arrow,
  tock_parquet_single_arrow,
  tock_parquet_dataset_arrow
)

timing_table <- tibble(method = method_labels, time_seconds = elapsed_seconds)

# Render as a markdown table in the knitted output.
knitr::kable(timing_table)
```