<!DOCTYPE html>
<html>
<head>
<title>Data Integration using Deep Learning</title>
<link rel='stylesheet' href='http://webdatacommons.org/style.css' type='text/css' media='screen'/>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<style>
.tar {
text-align: right;
}
.rtable {
float: right;
padding-left: 10px;
}
.smalltable,
.smalltable TD,
.smalltable TH {
font-size: 9pt;
}
.tab {
overflow: hidden;
border: 1px solid #ccc;
background-color: #eaf3fa;
clear: both;
padding-left: 25px;
width: 350px;
margin-left: auto;
margin-right: auto;
}
.tab button {
background-color: inherit;
float: left;
border: none;
outline: none;
cursor: pointer;
padding: 15px 60px;
transition: 0.3s;
margin-left: auto;
margin-right: auto;
}
.tab button:hover {
background-color: #ddd;
}
.tab button.active {
background-color: #ccc;
}
.tabcontent {
display: none;
padding: 6px 12px;
border-top: none;
animation: fadeEffect 1s;
width: 500px;
margin-left: auto;
margin-right: auto;
}
.show {
display: block;
}
.no-show {
display: none;
}
caption {
caption-side: top;
font-style: italic;
}
.center {
display: block;
margin-left: auto;
margin-right: auto;
}
.center1 {
display: block;
margin-left: auto;
margin-right: auto;
}
.center2 {
float: left;
margin-left: auto;
margin-right: auto;
width: 50%;
}
.center2small {
float: left;
margin-left: auto;
margin-right: auto;
width: 35%;
}
.center3 {
float: left;
margin-left: auto;
margin-right: auto;
width: 33%;
height: auto;
}
.centertable {
margin-left: auto;
margin-right: auto;
}
.picturetable {
border: none;
margin-left: auto;
margin-right: auto;
}
td[scope="mergedcol"] {
text-align: center;
}
hr {
width: 50%;
margin: 20px 0; /* 20px top and bottom, no horizontal margin */
}
.column {
float: left;
width: 50%;
padding: 5px;
}
.row::after {
content: "";
clear: both;
display: table;
}
@keyframes fadeEffect {
from {
opacity: 0;
}
to {
opacity: 1;
}
}
</style>
<script type="text/javascript" src="https://www.google.com/jsapi"></script>
<script type="text/javascript">
google.load('visualization', '1', {
packages: ['bar', 'line', 'corechart']
});
</script>
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
<script type="text/javascript" src="../../jquery.toc.min.js"></script>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-30248817-1']);
_gaq.push(['_trackPageview']);
(function () {
var ga = document.createElement('script');
ga.type = 'text/javascript';
ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(ga, s);
})();
function openExpResult(evt, expName) {
// Declare all variables
var i, tabcontent, tablinks;
// Get all elements with class="tabcontent" and hide them
tabcontent = document.getElementsByClassName("tabcontent");
for (i = 0; i < tabcontent.length; i++) {
tabcontent[i].style.display = "none";
}
// Get all elements with class="tablinks" and remove the class "active"
tablinks = document.getElementsByClassName("tablinks");
for (i = 0; i < tablinks.length; i++) {
tablinks[i].className = tablinks[i].className.replace(" active", "");
}
// Show the current tab, and add an "active" class to the button that opened the tab
document.getElementById(expName).style.display = "block";
evt.currentTarget.className += " active";
}
</script>
<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "Dataset",
"name": "XXXXXXXX",
"description": "XXXXXXXXXXX",
"url": "XXXXXXX",
"keywords": [
"XXXXXX",
"XXXXXX"
],
"creator": [
{
"@type": "Person",
"url": "XXXXXXX",
"name": "XXXXXXXX"
},
{
"@type": "Person",
"url": "XXXXXX",
"name": "XXXXXXX"
}
],
"citation": [
]
}
</script>
</head>
<body>
<div id="logo" style="text-align:right; background-color: white;"> <a
href="http://dws.informatik.uni-mannheim.de"><img src="./visualizations/Website_images/ma-logo.gif"
alt="University of Mannheim - Logo"></a></div>
<div id="header">
<h1 style="font-size: 250%;">Data Integration using Deep Learning</h1>
</div>
<div id="authors">
<a>Christian Bizer</a><br/>
<a>Cheng Chen</a><br/>
<a>Jennifer Hahn</a><br/>
<a>Kim-Carolin Lindner</a><br/>
<a>Ralph Peeters</a><br/>
<a>Jannik Reißfelder</a><br/>
<a>Marvin Rösel</a><br/>
<a>Niklas Sabel</a><br/>
<a>Luisa Theobald</a><br/>
<a>Estelle Weinstock</a><br/>
</div>
<div id="content1">
<p>
This website presents the experiments and results of a six-month student team project on the topic "Data
Integration using Deep Learning" at the School of Business Informatics and Mathematics of the University of Mannheim, supervised by Ralph
Peeters and Christian Bizer (Chair of Information Systems V: Web-based Systems). We investigated the performance
of different frameworks on the tasks of matching entities (entity matching) and schemata
(schema matching) across tables. Both tasks are framed as multi-class classification
problems: deciding whether a row (entity) or a column (schema) belongs to a predefined
cluster or carries a specific label. All of our experiments and code are publicly available in our <a href="https://github.com/NiklasSabel/data_integration_using_deep_learning"> git repository</a>.
<br/>
The website is structured as follows. In Chapter 1, we give a short introduction to the use cases and challenges
addressed by our project. Chapter 2 provides a brief overview of the theoretical framework needed to follow the experiments.
Chapter 3 introduces the algorithms we focused on, followed by the creation and preparation of the task-specific datasets in Chapter 4.
Subsequently, we present our experiments, including our baselines, in Chapter 5 and transition to an error analysis
in Chapter 6. Chapter 7 concludes with a discussion of the results and an outlook on further possibilities opened up by our work.
All of our references can be found in Chapter 8, and our datasets can be downloaded in Chapter 9;
they are available for research purposes only.
<h2>Contents</h2>
<ul>
<li class="toc-h2 toc-active">
<a href="#toc1">1 Introduction</a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc2">2 Theoretical Framework</a>
<ul>
<li><a href="#toc2.1">2.1 Schema Matching</a></li>
<li><a href="#toc2.2">2.2 Entity Matching</a></li>
<li><a href="#toc2.3">2.3 Transformer Models</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc3">3 Algorithms</a>
<ul>
<li><a href="#toc3.1">3.1 TURL</a></li>
<li><a href="#toc3.2">3.2 Contrastive Learning</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc4">4 Datasets and Preprocessing</a>
<ul>
<li><a href="#toc4.1">4.1 Entitiy Matching</a></li>
<li><a href="#toc4.2">4.2 Schema Matching</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc5">5 Experimental Setup</a>
<ul>
<li><a href="#toc5.1">5.1 Baseline Experiments</a></li>
<li><a href="#toc5.2">5.2 TURL Experiments</a></li>
<li><a href="#toc5.3">5.3 Contrastive Learning Experiments</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc6">6 Experimental Results and Error Analyis</a>
<ul>
<li><a href="#toc6.1">6.1 Schema Matching Results</a></li>
<li><a href="#toc6.2">6.2 Entitiy Matching Results</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc7">7 Discussion and Outlook</a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc8">8 References </a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc9">9 Downloads </a>
</li>
</ul>
<span id="toc1"></span>
<h2>1 Introduction</h2>
<!--CONTENT-->
<p>
According to estimates by the International Data Corporation, the amount of data created in 2025 will be
around 180 zettabytes, with a rising tendency. One reason is the ever-increasing connectivity and information flow
enabled by the World Wide Web. The web contains a massive amount of data in all forms: the data can be structured or
unstructured, and many different sources use different data models or different schemata while actually describing
the same real-world entity. Moreover, information can differ in content, syntax or even in
technical characteristics.
Consequently, it becomes quite challenging to use or merge such heterogeneous data in order to compare and work with
it in further applications such as online shopping, to name just one example where data from different sources need
to be compared.
Addressing this problem, the aim of this work is to master the challenges mentioned above and to establish
different methods for both schema and entity matching.
</p>
<span id="toc2"></span>
<h2>2 Theoretical Framework </h2>
<!--CONTENT-->
<p>
This chapter provides an overview of the theoretical underpinnings, frameworks and specific background needed to follow our work. We start by
introducing the two main tasks we are trying to solve: schema matching and entity matching. We also give a brief introduction to transformer models, especially BERT-based implementations,
as they form the basis for the algorithms we use and, due to the limitations of standard algorithms and measures,
have become increasingly popular in recent years.
</p>
<span id="toc2.1"></span>
<h3>2.1 Schema Matching</h3>
<p>
Schema matching describes the task of matching similar, or rather identical, schemata and finding agreement between the applied structures.
Database instances, for example, comprising schemata and respective (table) columns that describe the same attribute, can and often need to be matched.
The main challenges are size, semantic heterogeneity, generic names, esoteric naming conventions and different languages.
Correspondences between the schemata should therefore be detected in an automated or semi-automated manner. Although 1:n and n:1 approaches are possible,
the scope of this project is reduced to 1:1 matching, as defined in the problem statement of the initial project discussion.
</p>
<span id="toc2.2"></span>
<h3>2.2 Entity Matching</h3>
<p>
Entity matching, often also called identity resolution, is a crucial task in data integration and describes the task of finding all records that refer to the same entity,
e.g. when integrating data from different source systems. Unfortunately, entity representations in real-world
environments are in general neither identical nor complete, and they have to be processed at massive scale. One way to address this difficulty is to
compare multiple attributes of different record representations with attribute-specific similarity measures
such as the Levenshtein distance, or with advanced techniques like BERT. Newer approaches include the application of
so-called table transformers, which are discussed in <a href="#toc3">Section 3</a>. Entity matching tries to group entities
with different representations under the assumption that the higher the similarity, the more likely two
entity representations are a match <a href="#toc8">[7]</a>.
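<p>
To make the idea of an attribute-specific similarity measure concrete, the minimal sketch below computes a normalized Levenshtein similarity between two attribute values. It is our illustration of the general technique, not code from the project.
</p>
<pre><code>def levenshtein(a: str, b: str) -> int:
    """Edit distance via dynamic programming over prefixes."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

def similarity(a: str, b: str) -> float:
    """Normalize to [0, 1]; 1.0 means the strings are identical."""
    if not a and not b:
        return 1.0
    return 1 - levenshtein(a, b) / max(len(a), len(b))

# Two representations of the same product score high and could be matched via a threshold.
print(similarity("iphone 11 pro max case", "Iphone 11 Pro Max Case".lower()))
</code></pre>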
<span id="toc2.3"></span>
<h3>2.3 Transformer Models</h3>
<p>
In 2017, Google Brain proposed the transformer model which, based on an encoder-decoder structure and an attention mechanism, showed
impressive improvements over state-of-the-art methods and proved easy to adapt to a wide range of machine learning tasks,
especially in the context of NLP <a href="#toc8">[8]</a>.
As a result, a new language representation model named BERT was introduced in 2019 to pre-train deep bidirectional
representations from unlabeled text. This opened up many possibilities, as a "pre-trained BERT model can be finetuned with
just one additional output layer to create state-of-the-art methods for a wide range of tasks" <a href="#toc8">[2]</a>. In the
context of this work, we used algorithms that are pre-trained on different BERT extensions, in particular TinyBERT and RoBERTa
<a href="#toc8">[9]</a> <a href="#toc8">[10]</a>.
</p>
<span id="toc3"></span>
<h2>3 Algorithms</h2>
<p>
In the following, we present two different algorithms: TURL, from the class of table transformers, and Contrastive Learning, a
more general technique without a focus on tables.
Table transformers are models that not only incorporate data from individual entries
but also include information from their surroundings inside the table. This results in models that take the
whole table representation of a website or a knowledge base as input, instead of only single entries like most other models do.
The Contrastive Learning approach, on the other hand, tries to learn high-level features of a dataset by
exploring the differences and similarities of data points.
</p>
<span id="toc3.1"></span>
<h3>3.1 TURL</h3>
<p>
Table Understanding through Representation Learning (TURL) is a "novel framework that introduces the pretraining/finetuning paradigm to
relational Web tables" <a href="#toc8">[1]</a>. TURL is a TinyBERT-based extension model that was pre-trained on around 600,000 Wikipedia tables such that it
can be applied to different tasks with "minimal task-specific finetuning". The authors show that the model generalizes well and
outperforms existing methods, for example in column type annotation <a href="#toc8">[1]</a>.
The basic idea of TURL is to "learn[s] deep contextualized representations on relational
tables in an unsupervised manner" and to provide a framework that can be finetuned for a wide range of tasks <a href="#toc8">[1]</a>.
More specifically, the TURL architecture, shown in Figure 3.1, consists of three modules: an embedding layer; a structure-aware stacked transformer, as introduced in <a href="#toc2.3">Section 2.3</a>,
that captures textual and relational knowledge with a "visibility matrix" modelling the row-column structure
of the tables; and a projection layer.
<figcaption style="text-align:center">
Figure 3.1: Overview TURL architecture
</figcaption>
<figure >
<img src="./visualizations/Website_images/TURL framework.JPG" style="margin-bottom:2em;" class="center1" width="50%"/>
</figure>
<p>
After the described pretraining procedure, the model can be applied to different proposed finetuning tasks
such as entity linking, column type annotation, relation extraction, row population, cell filling, and schema
augmentation.
</p>
<span id="toc3.2"></span>
<h3>3.2 Contrastive Learning</h3>
<p>
Contrastive Learning has become a promising approach both in information retrieval <a href="#toc8">[6]</a> and in computer vision, outperforming previous methods
in self-supervised and semi-supervised learning <a href="#toc8">[5]</a>. The framework has further been extended to a fully-supervised setting, introducing an alternative
to the usual cross-entropy loss function <a href="#toc8">[4]</a> while achieving state-of-the-art results. In addition, supervised Contrastive Learning has seen recent success
in product matching, a special form of entity matching <a href="#toc8">[3]</a>.
<figcaption style="text-align:center">
Figure 3.2: Supervised Contrastive Learning
</figcaption>
<figure >
<img src="./visualizations/Website_images/contrastive.png" style="margin-bottom:2em;"class="center1" width="30%"/>
</figure>
Figure 3.2 summarizes the overall framework of supervised Contrastive Learning. In general, the whole process is split into two stages: contrastive pretraining
followed by finetuning. The main purpose of contrastive pretraining is to learn hidden representations of the respective clusters in such a way
that instances from the same class end up close together in the embedding space while instances from different classes end up far apart.
<br>
After the pretraining step, the encoder network ideally maps each class to a well-separated cluster in the embedding space. For the final finetuning stage,
the parameters of the encoder network remain frozen and only the linear classification head is trained.
In contrast to other methods, contrastive pretraining makes it much easier to learn decision boundaries for a linear classifier, given the pretrained embedding space.
This makes contrastive pretraining a powerful method for downstream tasks such as multi-class classification.
</p>
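<p>
As an illustration of the first stage, the following PyTorch sketch implements a supervised contrastive loss in the spirit of <a href="#toc8">[4]</a>. It is a minimal sketch under assumed tensor shapes (a batch of embeddings and integer class labels), not the exact loss implementation used in our experiments.
</p>
<pre><code>import torch
import torch.nn.functional as F

def supcon_loss(features, labels, temperature=0.07):
    """features: (N, d) embeddings, labels: (N,) integer class ids."""
    n = features.size(0)
    features = F.normalize(features, dim=1)            # cosine similarities via dot products
    sim = features @ features.T / temperature          # (N, N) similarity matrix
    sim = sim - sim.max(dim=1, keepdim=True).values.detach()  # numerical stability

    self_mask = torch.eye(n, dtype=torch.bool, device=features.device)
    pos_mask = (labels.unsqueeze(0) == labels.unsqueeze(1)).masked_fill(self_mask, False)

    exp_sim = torch.exp(sim).masked_fill(self_mask, 0.0)
    log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))

    # Average log-probability over the positives of each anchor that has at least one positive.
    pos_counts = pos_mask.sum(dim=1)
    valid = pos_counts > 0
    mean_log_prob_pos = (log_prob * pos_mask)[valid].sum(dim=1) / pos_counts[valid]
    return -mean_log_prob_pos.mean()
</code></pre>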
<span id="toc4"></span>
<h2>4 Datasets and Preprocessing </h2>
<!--CONTENT-->
<p>
<!-- Concept Explanation-->
Chapter 4 gives an overview of the generation of our datasets and of the final values contained in them.
Our dataset is based on the Web Data Commons Schema.org Table Corpus <a href="#toc8">[12]</a>, maintained by the Data
and Web Science Research Group at the University of Mannheim.
<br> For both tasks, i.e. entity and schema matching, different selection and filtering methods were applied to attain the final datasets.
In a first step, however, the chosen tables for both tasks were cleaned using a two-step approach in order to extract English-language data only:
we applied a TLD-based method to first filter our data for English internet domain
endings, e.g. ".com" or ".uk", and afterwards ran the fastText language detection algorithm <a href="#toc8">[11]</a> on each single row of
the remaining tables, discarding every row not classified as English.
Further cleaning of the selected tables and the task-specific preprocessing for schema and entity matching are explained in detail in the following.
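<p>
As a minimal sketch, this two-step language filter could be implemented as follows, assuming fastText's publicly available lid.176.bin language identification model and an illustrative list of English domain endings (not the project's exact configuration):
</p>
<pre><code>import fasttext  # pip install fasttext; lid.176.bin is fastText's language identification model

lid_model = fasttext.load_model("lid.176.bin")
ENGLISH_TLDS = (".com", ".uk", ".org", ".net")  # illustrative list

def is_english_domain(url: str) -> bool:
    """Step 1: cheap TLD-based prefilter on the source domain."""
    host = url.split("//")[-1].split("/")[0]
    return host.endswith(ENGLISH_TLDS)

def is_english_row(text: str, threshold: float = 0.5) -> bool:
    """Step 2: per-row language check; discard rows not classified as English."""
    labels, probs = lid_model.predict(text.replace("\n", " "))
    return labels[0] == "__label__en" and probs[0] >= threshold

rows = ["Jasmine Dragon Pearls Green Tea", "Gruener Tee mit Jasmin"]
english_rows = [r for r in rows if is_english_row(r)]
</code></pre>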
<span id="toc4.1"></span>
<h3>4.1 Entity Matching</h3>
Within entity matching we focused on a specific part of the corpus, namely the Product data, our biggest entity type
with 1.66 million tables and 231 million entities in the data segments Top100 and Minimum3; we excluded the tables from the Rest segment,
since those tables were too small for our task. After the initial language cleaning step, we were left with 435,000 tables and 100 million entities.
The Product corpus already provided a clustering of the entities, so no
further annotation was needed. Here, a cluster corresponds to a collection of identical products across different origins, i.e. websites.
Focusing on clusters with at least eight tables, we used the most common brands of several
chosen categories, in particular bikes, cars, clothes, drugstore, electronics
and technology, as keywords to get relevant clusters for our final dataset. To further enhance our data, we
searched for brands with at least 1,000 distinct clusters and established another category called "random". In order
not to make the final matching too easy for the algorithms, we browsed the selected clusters for homogeneous entities
using Doc2Vec <a href="#toc8">[13]</a> and Jaccard similarity, so as to include hard-to-distinguish clusters in our dataset.
Here, we based our selection on the balance between the Jaccard and Doc2Vec scores, manually validating the best
results for each of the defined domains.
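<p>
The following sketch shows one way such scores can be computed with gensim's Doc2Vec together with a token-set Jaccard similarity; the hyperparameters and example strings are illustrative assumptions, not the project's exact setup:
</p>
<pre><code>import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

names = ["Lifeproof Case Iphone 11", "iPhone 11 Pro Max case"]
docs = [TaggedDocument(n.lower().split(), [i]) for i, n in enumerate(names)]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=40)  # illustrative hyperparameters

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def jaccard(a, b):
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa.intersection(sb)) / len(sa.union(sb))

v0, v1 = (model.infer_vector(n.lower().split()) for n in names)
print(cosine(v0, v1), jaccard(names[0], names[1]))  # high scores flag hard non-matches
</code></pre>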
<p>
To get a better understanding of hard-to-distinguish cases, we provide some examples in Table 4.1.
The upper part of the table illustrates three examples of so-called hard non-matching cases. A hard non-match describes two entities which are semantically very close
but belong to different classes. As one can see, some entities differ only in a few characters, yet are assigned to different clusters.
Hard matches, on the other hand, relate to two entities which belong to the same cluster but are not as close in the embedding space as other entities within that class.
This can easily happen when the same entities differ in the character length of their respective name column.
For hard non-matches we provide the cosine similarity between a query entity and its closest match, while for hard matches we show
the cosine metric between a query and its closest match within the same cluster. In both cases, the cosine similarity is computed by comparing the vectorized
entities obtained with Doc2Vec.
</p>
<table style="width:100%" class="centertable">
<caption style="margin-top: 1em">Table 4.1: Illustration of hard non-matches and hard matches .</caption>
<thead>
<tr>
<th>Entity</th>
<th>Most Similar Entity</th>
<th>Cosine Metric</th>
</tr>
</thead>
<tbody>
<tr>
<td>Lifeproof Case Iphone 11</td>
<td>iPhone 11 <b>Pro Max</b> case</td>
<td>0.9776</td>
</tr>
<tr>
<td>Lego Star Wars The Complete Saga <b>DS</b></td>
<td>Lego Star Wars: The Complete Saga - <b>Wii Video</b> Game</td>
<td>0.9367</td>
</tr>
<tr>
<td> <b>10 2010</b> Audi A5 Quattro Fuel Injector 2.0L 4 Cyl Bosch High Pressure</td>
<td> <b>18 2018</b> Audi A5 Quattro Fuel Injector 2.0L 4 Cyl Standard Motor Products</td>
<td>0.9771</td>
</tr>
</tbody>
<thead>
<tr>
<th>Entity</th>
<th>Matching Entity</th>
<th>Cosine Metric</th>
</tr>
</thead>
<tbody>
<tr>
<td>iPhone 11 <b>Pro Max</b> case</td>
<td>For iphone 11 <b>pro</b> x xr xs <b>max</b> cell phone case cover with camera lens protection</td>
<td>0.8514 (below top 30)</td>
</tr>
<tr>
<td>08MP-08FPS 90° Elbow <b>Long</b> Forged</td>
<td>08MP-08FPS 90° Elbow Forged</td>
<td>0.9062 (15th place)</td>
</tr>
<tr>
<td>Jasmine Dragon Pearls Green Tea</td>
<td>Jasmine Dragon Pearl <b>Jasmine</b> Green Tea</td>
<td>0.9501 (5th place)</td>
</tr>
</tbody>
</table>
To increase the performance of the models, a further cleaning step was applied after the selection. This
entailed the removal of special characters and duplicates. More complex cleaning approaches were not used because,
given the large amount of data, they carried a high risk of introducing errors.
With a multilabel stratified shuffle split <a href="#toc8">[15]</a> we distributed the remaining clusters
into ⅜ train, ¼ validation and ⅜ test data. This approach ensured that
tables were distributed evenly across sizes and selected clusters. To avoid skewing the results, we also manually
cleaned the test set, detecting about 10% noise, which we discarded entirely. Thereby, we ended up with
1,410 clusters that were used for training the algorithms. The final set sizes can be seen in Table 4.2.
</p>
<table style="width:30%" class="centertable">
<caption style="margin-top: 1em">Table 4.2: Final product dataset sizes for entity matching.</caption>
<thead>
<tr>
<th></th>
<th>Number of tables</th>
<th>Number of Entities</th>
</tr>
</thead>
<tbody>
<tr>
<td>Train</td>
<td>1,345</td>
<td>11,121</td>
</tr>
<tr>
<td>Validation</td>
<td>885</td>
<td>7,154</td>
</tr>
<tr>
<td>Test</td>
<td>1,331</td>
<td>10,655</td>
</tr>
</tbody>
</table>
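<p>
A sketch of the split itself, using the iterative-stratification package <a href="#toc8">[15]</a>, is shown below. The two-stage call reproduces the ⅜ train, ¼ validation and ⅜ test proportions; the placeholder arrays stand in for our table features and binary cluster-membership matrix.
</p>
<pre><code>import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

X = np.arange(100).reshape(-1, 1)                # placeholder table ids
Y = (np.random.rand(100, 10) > 0.7).astype(int)  # placeholder multilabel targets

# First split off the 3/8 test portion ...
outer = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.375, random_state=42)
rest_idx, test_idx = next(outer.split(X, Y))

# ... then split the remaining 5/8 into 3/8 train and 1/4 validation (0.25 / 0.625 = 0.4).
inner = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_rel, val_rel = next(inner.split(X[rest_idx], Y[rest_idx]))
train_idx, val_idx = rest_idx[train_rel], rest_idx[val_rel]
</code></pre>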
<span id="toc4.2"></span>
<h3>4.2 Schema Matching</h3>
<p>
As a basis for schema matching, it was important to gather a large amount of diverse table data to create a sufficient dataset
with at least 200 different columns that were later used for matching. The chosen columns should be evenly distributed across large, mid-sized and small tables,
and their content should cover different data types and lengths as well as hard cases for distinction.
We started with 668,593 candidate tables. The described removal of non-English tables reduced the dataset to
267,283 tables.
To create a solid database, we chose the 20 largest categories of Schema.org in order to obtain a large number of
tables containing a sufficient amount of data. Due to the vast amount of disorderly data, we initially selected 207 columns so that we could
further reduce the column set in case we detected additional unsuitable columns during the subsequent data preparation and preprocessing.
However, as we wanted to make sure that all tables contain at least three of the selected columns for schema matching, our initial dataset was already reduced to 79,318 tables.
Further, in order to create a useful dataset, we kept only tables with more than ten rows,
less than 50% NAs within the selected columns, and less than 15% NAs within the relevant columns of the entire table.
<br> The resulting 54,190 tables were then divided into
a training and a test set by performing a multi-label stratified shuffle split with three separations (random state = 42), resulting in 44,345 training tables and 9,894 test tables.
We chose a multi-label stratified shuffle split <a href="#toc8">[15]</a> to distribute not only the different categories
but also the selected columns in each category proportionally between the training and test set.
To compare different input and training sizes later on, our training data was further divided into a medium-sized and a small training set. Again,
we performed a multi-label stratified shuffle split to distribute the data, and especially the columns, proportionally, making sure that all selected columns
were represented in all datasets. Hereby, the large training set of 44,345 tables contains all 9,776 tables of the medium-sized training set, and the medium-sized training
set contains all 2,444 tables of the small training set.
<br>
As a means to create a clean and reliable set of test tables, the entire test set of 9,894 tables was manually checked for languages other than English, wrong column
labels, and multiple, missing or odd entries such as symbols. Thereby, about 10% of the tables were removed, more than half of them due to content in other languages,
reducing the test set size to 8,912 tables.
<br>
The distribution of all tables as well as tables within each selected category is presented in the Table 4.3.
</p>
<table class="centertable">
<caption style="margin-top: 1em" >Table 4.3: Final data and column set sizes and category distribution for schema matching.</caption>
<thead>
<tr>
<th></th>
<th>Small Training Set</th>
<th>Medium Training Set</th>
<th>Large Training Set</th>
<th>Test Set</th>
<th># of selected columns<br>within each category</th>
</tr>
</thead>
<tbody>
<tr>
<td>All</td>
<td>2,444</td>
<td>9,776</td>
<td>44,345</td>
<td>8,912</td>
<td>207</td>
</tr>
<tr>
<td>Product</td>
<td>1,033</td>
<td>4,256</td>
<td>19,367</td>
<td>3,839</td>
<td>46</td>
</tr>
<tr>
<td>Music Recording</td>
<td>318</td>
<td>1,102</td>
<td>5,031</td>
<td>1,097</td>
<td>7</td>
</tr>
<tr>
<td>Event</td>
<td>248</td>
<td>1,007</td>
<td>4,563</td>
<td>1,000</td>
<td>11</td>
</tr>
<tr>
<td>Creative Work</td>
<td>221</td>
<td>869</td>
<td>3,925</td>
<td>876</td>
<td>23</td>
</tr>
<tr>
<td>Recipe</td>
<td>195</td>
<td>770</td>
<td>3,522</td>
<td>727</td>
<td>24</td>
</tr>
<tr>
<td>Person</td>
<td>163</td>
<td>690</td>
<td>3,148</td>
<td>545</td>
<td>24</td>
</tr>
<tr>
<td>Local Business</td>
<td>123</td>
<td>490</td>
<td>2,209</td>
<td>381</td>
<td>23</td>
</tr>
<tr>
<td>Place</td>
<td>38</td>
<td>160</td>
<td>728</td>
<td>131</td>
<td>5</td>
</tr>
<tr>
<td>Hotel</td>
<td>38</td>
<td>156</td>
<td>701</td>
<td>117</td>
<td>8</td>
</tr>
<tr>
<td>Book</td>
<td>29</td>
<td>118</td>
<td>537</td>
<td>65</td>
<td>12</td>
</tr>
<tr>
<td>Restaurant</td>
<td>20</td>
<td>79</td>
<td>353</td>
<td>61</td>
<td>12</td>
</tr>
<tr>
<td>Music Album</td>
<td>9</td>
<td>41</td>
<td>189</td>
<td>42</td>
<td>4</td>
</tr>
<tr>
<td>TV Episode</td>
<td>9</td>
<td>38</td>
<td>162</td>
<td>31</td>
<td>3</td>
</tr>
</tbody>
</table>
<br><br>
The respective distribution of the chosen columns for schema matching is also displayed in Table 4.3, with Product, the largest category and source of tables,
contributing the majority of the columns. As mentioned above, the chosen columns were to represent different data types, so the set of 207 columns
includes 150 string, 21 datetime, 18 float, 13 integer and five geolocation columns. During the selection of the dataset and the aforementioned columns,
we made sure to include hard matching cases, i.e. columns that are very hard for the algorithm to distinguish. Table 4.4 illustrates two examples
of such hard matching cases. For example, our dataset comprises five different product GTINs (global trade item numbers) that are almost identical and only differ in length.
Another example of two very similar columns, shown in Table 4.4, comes from the Recipe class, as most recipe headlines read exactly like recipe names.
Hence, we speak of hard matching cases whenever column content can easily be mixed up within as well as between classes and is therefore hard to distinguish.
<br>
<table style="width:100%" class="centertable">
<caption style="margin-top: 1em">Table 4.4: Illustration of hard cases.</caption>
<thead>
<tr>
<th>GTIN Column</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>gtin13Product</td>
<td>7321428469419 0 8032767441293 0 8032766030245...</td>
</tr>
<tr>
<td>gtin14Product</td>
<td>00032054003584 00032054003560 00032054003591 ...</td>
</tr>
</tbody>
<thead>
<tr>
<th>Recipe Description Type</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>headlineRecipe</td>
<td>Lemony Angel Hair Pasta with Crab Turkey Spina...</td>
</tr>
<tr>
<td>nameRecipe</td>
<td>Peach Cake Recipe using Fresh Peaches Canned G.</td>
</tr>
</tbody>
</table>
</p>
<span id="toc5"></span>
<h2>5 Experimental Setup</h2>
<p>
For running the experiments we used the resources of the University of Mannheim (dws-server) as well as the resources of the state of Baden-Württemberg,
the bw-uni-cluster. This gave us access to different
setups for running the experiments efficiently, along with enough storage space for
the different datasets created for the experiments.
</p>
<span id="toc5.1"></span>
<h3>5.1 Baseline Experiments</h3>
<!--CONTENT-->
<p>
For entity matching, we used three different baseline models against which to compare the results of our algorithms.
We included one tree-based model, Random Forest, and two BERT-based models, TinyBERT and RoBERTa, because, as mentioned
in <a href="#toc3.1">Section 3.1</a>, TURL is based on TinyBERT.
All baselines were modelled as multi-class classifiers that were presented with a concatenation of the entity name and,
in the case of the product dataset, a description.
The results are presented in Table 5.1 below.
<table style="width:30%" class="centertable">
<caption style="margin-top: 1em">Table 5.1: F1 scores for different baselines models in entity matching.</caption>
<thead>
<tr>
<th></th>
<th>Random Forest</th>
<th>TinyBERT</th>
<th>RoBERTa</th>
</tr>
</thead>
<tbody>
<tr>
<td>Product</td>
<td>0.8684</td>
<td>0.8329</td>
<td>0.8958</td>
</tr>
</tbody>
</table>
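<p>
For reference, a minimal sketch of such a BERT-based multi-class baseline using the Hugging Face transformers library is shown below; the model choice, placeholder data and hyperparameters are illustrative assumptions rather than our exact training setup.
</p>
<pre><code>import torch
from torch.utils.data import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

class EntityDataset(Dataset):
    """Wraps concatenated 'name + description' strings and integer cluster labels."""
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, i):
        item = {k: torch.tensor(v[i]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(self.labels[i])
        return item

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=1410)

train_ds = EntityDataset(["lifeproof case iphone 11 shockproof cover"], [0], tokenizer)  # placeholder
args = TrainingArguments(output_dir="out", num_train_epochs=25, per_device_train_batch_size=16)
Trainer(model=model, args=args, train_dataset=train_ds).train()
</code></pre>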
<p>
As a baseline for schema matching, both tree-based and BERT-based models were used. As the tree-based model, we applied a Random Forest to
both value-based and metadata-based features.
<br> The value-based datasets were created with TF-IDF, using both a global and a binary approach.
For TF-IDF the data was preprocessed as follows: the concatenated text of each column was lower-cased and tokenized, and
stopwords as well as punctuation were removed.
For the metadata approach, the following features were created:
a binary variable indicating whether the column content was structured within brackets such as "{}";
a variable giving the length of the value;
a variable with the average word length; and
a binary variable indicating whether the column includes dates.
For these metadata features, no preprocessing was performed in order to keep the original structure of the data. The regular TF-IDF approach yielded a micro F1 score of 0.35
and the binary TF-IDF approach a micro F1 score of 0.27, while the metadata approach yielded a micro F1 score of 0.12.
</p>
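<p>
A compact sketch of the value-based baseline is given below; setting binary=True in the vectorizer yields the binary variant. The toy data and hyperparameters are illustrative, not the exact project configuration.
</p>
<pre><code>from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

# Each sample is the concatenated, lower-cased text of one column; the label is the column type.
train_texts = ["7321428469419 8032767441293", "lemony angel hair pasta with crab"]
train_labels = ["gtin13Product", "headlineRecipe"]

pipeline = make_pipeline(
    TfidfVectorizer(lowercase=True, stop_words="english"),  # binary=True for the binary variant
    RandomForestClassifier(n_estimators=100, random_state=42),
)
pipeline.fit(train_texts, train_labels)
pred = pipeline.predict(train_texts)
print(f1_score(train_labels, pred, average="micro"))
</code></pre>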
<p>
Further, we used BERT-based models, namely BERT, RoBERTa, TinyBERT and DistilBERT, as baseline models on the
respective small, medium and large training datasets. As input, the concatenated values of the target columns were used.
To keep as much information as possible, again no further preprocessing was applied. The models were trained for 25 epochs. As can be
seen from the results in Table 5.2, the models performed quite differently on the small and medium datasets. On the
large training dataset, however, all models performed quite well and, except for TinyBERT,
reached a micro F1 score of about 0.80.
<table style="width:35%" class="centertable">
<caption style="margin-top: 1em">Table 5.2: Micro F1 scores for BERT-based baseline models in schema matching.</caption>
<thead>
<tr>
<th></th>
<th>DistilBERT</th>
<th>BERT</th>
<th>TinyBERT</th>
<th>RoBERTa</th>
</tr>
</thead>
<tbody>
<tr>
<td>Small</td>
<td>0.6089</td>
<td>0.7327</td>
<td>0.6982</td>
<td>0.6601</td>
</tr>
<tr>
<td>Medium</td>
<td>0.6166</td>
<td>0.7623</td>
<td>0.7044</td>
<td>0.7569</td>
</tr>
<tr>
<td>Large</td>
<td>0.8019</td>
<td>0.8014</td>
<td>0.7593</td>
<td>0.8030</td>
</tr>
</tbody>
</table>
<p>
As mentioned earlier, TURL is based on TinyBERT, so TinyBERT represents a feasible baseline model and will mainly be used
for comparison and evaluation of the results in the subsequent chapters.
</p>
<!-- include pair-wise trainsizes?-->
<span id="toc5.2"></span>
<h3>5.2 TURL Experiments</h3>
<!--CONTENT-->
<p>
TURL already offers several predefined tasks on which the pre-trained framework can be evaluated <a href="#toc8">[1]</a>.
Of these, column type annotation proved to be the most suitable task for both entity and schema matching. In order to pretrain the model on our selected tables and
entities and to perform column type annotation, the data has to be structured as a nested list in which each table is represented by an inner list containing the
table id, page title and section title as well as further lists of headers, cell content and column types.
The given input representation of tables for the task of column type annotation can be found in the README file of the TURL
<a href="https://github.com/sunlab-osu/TURL"> git repository</a>.
Further information on the pretraining and the respective finetuning task can be found in TURL: Table Understanding through Representation Learning <a href="#toc8">[1]</a>.
For entity matching we experimented with two different approaches. One setting was based on transposing the table, such that
each column was modelled as one entity, since the prebuilt framework aggregates column information. In the second setting
we changed the TURL code itself so that it aggregates over rows instead of columns. For schema matching, the proposed column type annotation task was directly applicable.<br>
Detailed information on the settings and hyperparameters adjusted during experimentation is presented in Tables 5.3 and 5.4.
</p>
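<p>
To make the nested-list input tangible, a single table could be serialized roughly as follows. The field values are invented for illustration, and the exact field order should be checked against the TURL README.
</p>
<pre><code>import json

# One inner list per table: id, page title, section title, headers,
# per-column cell contents, and the column-type labels to predict.
table = [
    "table_0001",                                  # table id
    "Green Tea Products",                          # page title
    "Product Overview",                            # section title
    ["name", "brand", "price"],                    # headers
    [["Jasmine Dragon Pearls", "Sencha"],          # cells of column 0
     ["TeaCo", "TeaCo"],                           # cells of column 1
     ["12.99", "9.49"]],                           # cells of column 2
    [["nameProduct"], ["brandProduct"], ["priceProduct"]],  # column types
]

with open("train_tables.json", "w") as f:
    json.dump([table], f)
</code></pre>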
<br>
<div style="margin-top:2em;" >
<div style=";margin-right: 2.5em">
<div class="tab">
<button class="tablinks" onclick="openExpResult(event, 'schema')">Schema</button>
<button class="tablinks" onclick="openExpResult(event, 'entity')">Entity</button>
</div>
<div id="schema" class="tabcontent" style="display:block;">
<table class="Multi-class" >
<caption style="margin-top: 1em">Table 5.3: Schema TURL Settings</caption>
<tr>
<th>Category</th>
<th colspan="1" style='text-align:center; vertical-align:middle'> Inital Inputs</th>
<th colspan="3" style='text-align:center; vertical-align:middle'> Adjusted/Final Inputs per Train Size</th>
</tr>
<tr>
<th></th>
<th></th>
<th>Small</th>
<th>Medium</th>
<th>Large</th>
</tr>
<tr>
<td style="background-color:#E8E8E8">Training Epochs</td>
<td>10</td>
<td>50</td>
<td>50</td>
<td>50</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Learning Rate</td>
<td>5e-5</td>
<td>5e-5</td>
<td>5e-5</td>
<td>5e-5</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Batch Size</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Accumulation Steps</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Save Steps</td>
<td>5000</td>
<td>50</td>
<td>125</td>
<td>650</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Logging Steps</td>
<td>1500</td>
<td>15</td>
<td>50</td>
<td>200</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Warm up Steps</td>
<td>5000</td>
<td>50</td>
<td>125</td>
<td>650</td>
</tr>
</table>
</div>
<div id="entity" class="tabcontent" style="display: none;">
<table class="pair-wiseSmall">
<caption style="margin-top: 1em">Table 5.4: Entity TURL Settings</caption>
<tr>
<th>Category</th>
<th colspan="1" style='text-align:center; vertical-align:middle'> Inital Inputs</th>
<th colspan="2" style='text-align:center; vertical-align:middle'> Adjusted/Final Inputs per Model Type</th>
</tr>
<tr>
<th></th>
<th></th>
<th>TURL (transposed) </th>
<th>TURL (rewritten)</th>
</tr>