-
Notifications
You must be signed in to change notification settings - Fork 0
/
intersect.sql
138 lines (115 loc) · 3.29 KB
/
intersect.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/*
Basic parameter of the test:
v_numOfDoc - Number of documents
The main idea of the test is to compare intersection performed by a classical relational design with designs
based on arrays.
*/
SET max_parallel_workers_per_gather = 0;
-----------------
-- Version No.1 - Binding table (classical relational M:N design)
DO
$$
DECLARE
v_idword integer;
v_numOfDoc integer := 10000;
BEGIN
DROP TABLE IF EXISTS Belongs;
DROP TABLE IF EXISTS Doc;
DROP TABLE IF EXISTS Word;
-- Documents
CREATE TABLE Doc
(
id int primary key,
padding varchar(10)
);
-- Words
CREATE TABLE Word
(
id int primary key,
padding varchar(50)
);
-- Binding table
CREATE TABLE Belongs
(
id_doc int references Doc,
id_word int references Word,
primary key (id_doc, id_word)
);
-- We have many documents ...
INSERT
INTO Doc
SELECT id,
RPAD('Value ' || id || ' ', 10, '*') as padding
FROM generate_series(0, v_numOfDoc) id;
-- ... with just few words
INSERT
INTO Word
SELECT id,
RPAD('Value ' || id || ' ', 50, '*') as padding
FROM generate_series(0, 99) id;
-- We randomly insert several words to every document
FOR v_iddoc in 0 .. v_numOfDoc
LOOP
v_idword := random() * 10;
WHILE v_idword < 100
LOOP
INSERT INTO Belongs VALUES (v_iddoc, v_idword);
v_idword := v_idword + (random() * v_idword) + 1;
END LOOP;
END LOOP;
CREATE INDEX idx_Belongs_idword ON Belongs (id_word);
END
$$;
-----------------
-- Version No.2 - ARRAY + GIN
DROP TABLE IF EXISTS DocWord;
CREATE TABLE DocWord
(
id_doc int primary key,
padding varchar(10),
id_words int array
);
-- We copy data from the relational design
INSERT INTO DocWord
SELECT id_doc, padding, array_agg(id_word) id_words
FROM Belongs
JOIN Doc ON Belongs.id_doc = Doc.id
GROUP BY id_doc, padding;
CREATE EXTENSION intarray;
CREATE INDEX idx_DocWord_idb ON DocWord USING gin (id_words gin__int_ops);
-----------------
-- Version No.3 - Roaringbitmap
CREATE EXTENSION roaringbitmap;
DROP TABLE IF EXISTS WordDoc;
CREATE TABLE WordDoc
(
id_word int primary key,
padding varchar(50),
id_docs roaringbitmap
);
-- We copy data from the relational design
INSERT INTO WordDoc
SELECT id_word, padding, rb_build(array_agg(id_doc)) id_docs
FROM Belongs
JOIN Word ON Belongs.id_word = Word.id
GROUP BY id_word, padding;
---- Queries
-- We search for documents containing words 10, 20, 30
-- Versio No.1 - Binding table
SELECT id_doc
FROM Belongs
WHERE id_word in (10, 20, 30)
GROUP BY id_doc
HAVING count(*) = 3;
-- Version No.2 - Array + GIN
SELECT id_doc
FROM DocWord
WHERE '{10, 20, 30}' <@ id_words;
-- Version No.3 - Roaringbitmaps
SELECT unnest(rb_to_array(rb_and_agg(id_docs)))
FROM WordDoc
WHERE WordDoc.id_word in (10, 20, 30);
----- velikost tabulek
SELECT pg_indexes_size('Belongs');
SELECT pg_indexes_size('DocWord');
SELECT pg_indexes_size('WordDoc');