<!DOCTYPE html>
<html>
<head>
<title>Data Integration using Deep Learning</title>
<link rel='stylesheet' href='http://webdatacommons.org/style.css' type='text/css' media='screen'/>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<style>
.tar {
text-align: right;
}
.rtable {
float: right;
padding-left: 10px;
}
.smalltable,
.smalltable TD,
.smalltable TH {
font-size: 9pt;
}
.tab {
overflow: hidden;
border: 1px solid #ccc;
background-color: #eaf3fa;
clear: both;
padding-left: 25px;
width: 350px;
margin-left: auto;
margin-right: auto;
}
.tab button {
background-color: inherit;
float: left;
border: none;
outline: none;
cursor: pointer;
padding: 15px 60px;
transition: 0.3s;
margin-left: auto;
margin-right: auto;
}
.tab button:hover {
background-color: #ddd;
}
.tab button.active {
background-color: #ccc;
}
.tabcontent {
display: none;
padding: 6px 12px;
border-top: none;
animation: fadeEffect 1s;
width: 500px;
margin-left: auto;
margin-right: auto;
}
.show {
display: block;
}
.no-show {
display: none;
}
caption {
caption-side: top;
font-style: italic;
}
.center {
display: block;
margin-left: auto;
margin-right: auto;
}
.center1 {
display: block;
margin-left: auto;
margin-right: auto;
}
.center2 {
float: left;
margin-left: auto;
margin-right: auto;
width: 50%;
}
.center2small {
float: left;
margin-left: auto;
margin-right: auto;
width: 35%;
}
.center3 {
float: left;
margin-left: auto;
margin-right: auto;
width: 33%;
height: auto;
}
.centertable {
margin-left: auto;
margin-right: auto;
}
.picturetable {
border: none;
margin-left: auto;
margin-right: auto;
}
td[scope="mergedcol"] {
text-align: center;
}
hr {
width: 50%;
margin: 20px 0; /* 20px top and bottom, no horizontal margin */
}
.column {
float: left;
width: 50%;
padding: 5px;
}
.row::after {
content: "";
clear: both;
display: table;
}
@keyframes fadeEffect {
from {
opacity: 0;
}
to {
opacity: 1;
}
}
</style>
<script type="text/javascript" src="https://www.google.com/jsapi"></script>
<script type="text/javascript">
google.load('visualization', '1', {
packages: ['bar', 'line', 'corechart']
});
</script>
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
<script type="text/javascript" src="../../jquery.toc.min.js"></script>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-30248817-1']);
_gaq.push(['_trackPageview']);
(function () {
var ga = document.createElement('script');
ga.type = 'text/javascript';
ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(ga, s);
})();
function openExpResult(evt, expName) {
// Declare all variables
var i, tabcontent, tablinks;
// Get all elements with class="tabcontent" and hide them
tabcontent = document.getElementsByClassName("tabcontent");
for (i = 0; i < tabcontent.length; i++) {
tabcontent[i].style.display = "none";
}
// Get all elements with class="tablinks" and remove the class "active"
tablinks = document.getElementsByClassName("tablinks");
for (i = 0; i < tablinks.length; i++) {
tablinks[i].className = tablinks[i].className.replace(" active", "");
}
// Show the current tab, and add an "active" class to the button that opened the tab
document.getElementById(expName).style.display = "block";
evt.currentTarget.className += " active";
}
</script>
<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "Dataset",
"name": "XXXXXXXX",
"description": "XXXXXXXXXXX",
"url": "XXXXXXX",
"keywords": [
"XXXXXX",
"XXXXXX"
],
"creator": [
{
"@type": "Person",
"url": "XXXXXXX",
"name": "XXXXXXXX"
},
{
"@type": "Person",
"url": "XXXXXX",
"name": "XXXXXXX"
}
],
"citation": [
]
}
</script>
</head>
<body>
<div id="logo" style="text-align:right; background-color: white;"> <a
href="http://dws.informatik.uni-mannheim.de"><img src="./visualizations/Website_images/ma-logo.gif"
alt="University of Mannheim - Logo"></a></div>
<div id="header">
<h1 style="font-size: 250%;">Data Integration using Deep Learning</h1>
</div>
<div id="authors">
<a>Christian Bizer</a><br/>
<a>Cheng Chen</a><br/>
<a>Jennifer Hahn</a><br/>
<a>Kim-Carolin Lindner</a><br/>
<a>Ralph Peeters</a><br/>
<a>Jannik Reißfelder</a><br/>
<a>Marvin Rösel</a><br/>
<a>Niklas Sabel</a><br/>
<a>Luisa Theobald</a><br/>
<a>Estelle Weinstock</a><br/>
</div>
<div id="content1">
<p>
This website presents the experiments and results of a six-month student team project on the topic "Data
Integration using Deep Learning" at the School of Business Informatics and Mathematics of the University of Mannheim, supervised by Ralph
Peeters and Christian Bizer (Chair of Information Systems V: Web-based Systems). We investigated the performance
of different frameworks on the tasks of matching entities (entity matching) and schemata
(schema matching) across tables. Both tasks are framed as multi-class classification
problems: deciding whether a row (entity) or a column (schema) belongs to a predefined
cluster or carries a specific label. All of our experiments and code are publicly available in our <a href="https://github.com/NiklasSabel/data_integration_using_deep_learning"> git repository</a>.
<br/>
The website is structured as follows. In Chapter 1, we give a short introduction to the use cases and challenges
addressed by our project. Chapter 2 provides a brief overview of the theoretical framework needed to follow the experiments.
Chapter 3 introduces the algorithms we focused on, followed by the creation and preparation of the task-specific datasets in Chapter 4.
Subsequently, we present our experiments, including our baselines, in Chapter 5 and transition to an error analysis
in Chapter 6. Chapter 7 concludes with a discussion of the results and an outlook on further possibilities opened up by our work.
All of our references can be found in Chapter 8, and our datasets can be downloaded in Chapter 9;
they are available for research purposes only.
<h2>Contents</h2>
<ul>
<li class="toc-h2 toc-active">
<a href="#toc1">1 Introduction</a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc2">2 Theoretical Framework</a>
<ul>
<li><a href="#toc2.1">2.1 Schema Matching</a></li>
<li><a href="#toc2.2">2.2 Entity Matching</a></li>
<li><a href="#toc2.3">2.3 Transformer Models</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc3">3 Algorithms</a>
<ul>
<li><a href="#toc3.1">3.1 TURL</a></li>
<li><a href="#toc3.2">3.2 Contrastive Learning</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc4">4 Datasets and Preprocessing</a>
<ul>
<li><a href="#toc4.1">4.1 Entitiy Matching</a></li>
<li><a href="#toc4.2">4.2 Schema Matching</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc5">5 Experimental Setup</a>
<ul>
<li><a href="#toc5.1">5.1 Baseline Experiments</a></li>
<li><a href="#toc5.2">5.2 TURL Experiments</a></li>
<li><a href="#toc5.3">5.3 Contrastive Learning Experiments</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc6">6 Experimental Results and Error Analyis</a>
<ul>
<li><a href="#toc6.1">6.1 Schema Matching Results</a></li>
<li><a href="#toc6.2">6.2 Entitiy Matching Results</a></li>
</ul>
</li>
<li class="toc-h2 toc-active">
<a href="#toc7">7 Discussion and Outlook</a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc8">8 References </a>
</li>
<li class="toc-h2 toc-active">
<a href="#toc9">9 Downloads </a>
</li>
</ul>
<span id="toc1"></span>
<h2>1 Introduction</h2>
<!--CONTENT-->
<p>
According to estimates by the International Data Corporation, the amount of data created in 2025 will be
around 180 zettabytes, with a rising tendency. One reason is the ever-increasing connectivity and information flow
enabled by the World Wide Web. The web contains a massive amount of data in all forms: the data can be structured or
unstructured, and many different sources use different data models or different schemata while actually describing
the same real-world entity. Moreover, information can differ in content, syntax or even in
technical characteristics.
Consequently, it becomes quite challenging to use or merge such heterogeneous data in order to compare and work with
it in further applications such as online shopping, to name just one example where data from different sources need
to be compared.
Addressing this problem, the aim of this work is to master the challenges mentioned above and to establish
different methods for both schema and entity matching.
</p>
<span id="toc2"></span>
<h2>2 Theoretical Framework </h2>
<!--CONTENT-->
<p>
This chapter provides an overview of the theoretical underpinnings, frameworks and specific background needed to follow our work. We start by
introducing the two main tasks we are trying to solve: schema matching and entity matching. We also give a brief introduction to transformer models, especially BERT-based implementations,
as they form the basis for the algorithms we use and, due to the limitations of standard algorithms and measures,
have become increasingly popular in recent years.
</p>
<span id="toc2.1"></span>
<h3>2.1 Schema Matching</h3>
<p>
Schema matching describes the task of matching similar, or rather identical, schemata and finding agreement between the applied structures.
Database instances, for example, comprising schemata and respective (table) columns that describe the same attribute, can and often need to be matched.
The main challenges are size, semantic heterogeneity, generic names, esoteric naming conventions and different languages.
Correspondences between the schemata should therefore be detected in an automated or semi-automated manner. Although 1:n and n:1 approaches are possible,
the scope of this project is reduced to 1:1 matching, as defined in the problem statement of the initial project discussion.
</p>
<span id="toc2.2"></span>
<h3>2.2 Entity Matching</h3>
<p>
Entity matching, often also called identity resolution, is a crucial task in data integration and describes the task of finding all records that refer to the same entity,
e.g. when integrating data from different source systems. Unfortunately, entity representations in real-world
environments are in general neither identical nor complete, and they have to be processed at massive scale. One way to address this difficulty is to
compare multiple attributes of different record representations with attribute-specific similarity measures
such as the Levenshtein distance, or with advanced techniques like BERT. Newer approaches include the application of
so-called table transformers, which are discussed in <a href="#toc3">Section 3</a>. Entity matching tries to group entities
with different representations under the assumption that the higher the similarity, the more likely two
entity representations are a match <a href="#toc8">[7]</a>.
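<p>
To make the idea of an attribute-specific similarity measure concrete, the minimal sketch below computes a normalized Levenshtein similarity between two attribute values. It is our illustration of the general technique, not code from the project.
</p>
<pre><code>def levenshtein(a: str, b: str) -> int:
    """Edit distance via dynamic programming over prefixes."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

def similarity(a: str, b: str) -> float:
    """Normalize to [0, 1]; 1.0 means the strings are identical."""
    if not a and not b:
        return 1.0
    return 1 - levenshtein(a, b) / max(len(a), len(b))

# Two representations of the same product score high and could be matched via a threshold.
print(similarity("iphone 11 pro max case", "Iphone 11 Pro Max Case".lower()))
</code></pre>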
<span id="toc2.3"></span>
<h3>2.3 Transformer Models</h3>
<p>
In 2017, Google Brain proposed the transformer model which, based on an encoder-decoder structure and an attention mechanism, showed
impressive improvements over state-of-the-art methods and proved easy to adapt to a wide range of machine learning tasks,
especially in the context of NLP <a href="#toc8">[8]</a>.
As a result, a new language representation model named BERT was introduced in 2019 to pre-train deep bidirectional
representations from unlabeled text. This opened up many possibilities, as a "pre-trained BERT model can be finetuned with
just one additional output layer to create state-of-the-art methods for a wide range of tasks" <a href="#toc8">[2]</a>. In the
context of this work, we used algorithms that are pre-trained on different BERT extensions, in particular TinyBERT and RoBERTa
<a href="#toc8">[9]</a> <a href="#toc8">[10]</a>.
</p>
<span id="toc3"></span>
<h2>3 Algorithms</h2>
<p>
In the following, we present two different algorithms: TURL, from the class of table transformers, and Contrastive Learning, a
more general technique without a focus on tables.
Table transformers are models that not only incorporate data from individual entries
but also include information from their surroundings inside the table. This results in models that take the
whole table representation of a website or a knowledge base as input, instead of only single entries like most other models do.
The Contrastive Learning approach, on the other hand, tries to learn high-level features of a dataset by
exploring the differences and similarities of data points.
</p>
<span id="toc3.1"></span>
<h3>3.1 TURL</h3>
<p>
Table Understanding through Representation Learning (TURL) is a "novel framework that introduces the pretraining/finetuning paradigm to
relational Web tables" <a href="#toc8">[1]</a>. TURL is a TinyBERT-based extension model that was pre-trained on around 600,000 Wikipedia tables such that it
can be applied to different tasks with "minimal task-specific finetuning". The authors show that the model generalizes well and
outperforms existing methods, for example in column type annotation <a href="#toc8">[1]</a>.
The basic idea of TURL is to "learn[s] deep contextualized representations on relational
tables in an unsupervised manner" and to provide a framework that can be finetuned for a wide range of tasks <a href="#toc8">[1]</a>.
More specifically, the TURL architecture, shown in Figure 3.1, consists of three modules: an embedding layer; a structure-aware stacked transformer, as introduced in <a href="#toc2.3">Section 2.3</a>,
that captures textual and relational knowledge with a "visibility matrix" modelling the row-column structure
of the tables; and a projection layer.
<figcaption style="text-align:center">
Figure 3.1: Overview TURL architecture
</figcaption>
<figure >
<img src="./visualizations/Website_images/TURL framework.JPG" style="margin-bottom:2em;" class="center1" width="50%"/>
</figure>
<p>
After the described pretraining procedure, the model can be applied to different proposed finetuning tasks
such as entity linking, column type annotation, relation extraction, row population, cell filling, and schema
augmentation.
</p>
<span id="toc3.2"></span>
<h3>3.2 Contrastive Learning</h3>
<p>
Contrastive Learning has become a promising approach both in information retrieval <a href="#toc8">[6]</a> and in computer vision, outperforming previous methods
in self-supervised and semi-supervised learning <a href="#toc8">[5]</a>. The framework has further been extended to a fully-supervised setting, introducing an alternative
to the usual cross-entropy loss function <a href="#toc8">[4]</a> while achieving state-of-the-art results. In addition, supervised Contrastive Learning has seen recent success
in product matching, a special form of entity matching <a href="#toc8">[3]</a>.
<figcaption style="text-align:center">
Figure 3.2: Supervised Contrastive Learning
</figcaption>
<figure >
<img src="./visualizations/Website_images/contrastive.png" style="margin-bottom:2em;"class="center1" width="30%"/>
</figure>
Figure 3.2 summarizes the overall framework of supervised Contrastive Learning. In general, the whole process is split into two stages: contrastive pretraining
followed by finetuning. The main purpose of contrastive pretraining is to learn hidden representations of the respective clusters in such a way
that instances from the same class end up close together in the embedding space while instances from different classes end up far apart.
<br>
After the pretraining step, the encoder network ideally maps each class to a well-separated cluster in the embedding space. For the final finetuning stage,
the parameters of the encoder network remain frozen and only the linear classification head is trained.
In contrast to other methods, contrastive pretraining makes it much easier to learn decision boundaries for a linear classifier, given the pretrained embedding space.
This makes contrastive pretraining a powerful method for downstream tasks such as multi-class classification.
</p>
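<p>
As an illustration of the first stage, the following PyTorch sketch implements a supervised contrastive loss in the spirit of <a href="#toc8">[4]</a>. It is a minimal sketch under assumed tensor shapes (a batch of embeddings and integer class labels), not the exact loss implementation used in our experiments.
</p>
<pre><code>import torch
import torch.nn.functional as F

def supcon_loss(features, labels, temperature=0.07):
    """features: (N, d) embeddings, labels: (N,) integer class ids."""
    n = features.size(0)
    features = F.normalize(features, dim=1)            # cosine similarities via dot products
    sim = features @ features.T / temperature          # (N, N) similarity matrix
    sim = sim - sim.max(dim=1, keepdim=True).values.detach()  # numerical stability

    self_mask = torch.eye(n, dtype=torch.bool, device=features.device)
    pos_mask = (labels.unsqueeze(0) == labels.unsqueeze(1)).masked_fill(self_mask, False)

    exp_sim = torch.exp(sim).masked_fill(self_mask, 0.0)
    log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))

    # Average log-probability over the positives of each anchor that has at least one positive.
    pos_counts = pos_mask.sum(dim=1)
    valid = pos_counts > 0
    mean_log_prob_pos = (log_prob * pos_mask)[valid].sum(dim=1) / pos_counts[valid]
    return -mean_log_prob_pos.mean()
</code></pre>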
<span id="toc4"></span>
<h2>4 Datasets and Preprocessing </h2>
<!--CONTENT-->
<p>
<!-- Concept Explanation-->
Chapter 4 gives an overview of the generation of our datasets and of the final values contained in them.
Our dataset is based on the Web Data Commons Schema.org Table Corpus <a href="#toc8">[12]</a>, maintained by the Data
and Web Science Research Group at the University of Mannheim.
<br> For both tasks, i.e. entity and schema matching, different selection and filtering methods were applied to attain the final datasets.
In a first step, however, the chosen tables for both tasks were cleaned using a two-step approach in order to extract English-language data only:
we applied a TLD-based method to first filter our data for English internet domain
endings, e.g. ".com" or ".uk", and afterwards ran the fastText language detection algorithm <a href="#toc8">[11]</a> on each single row of
the remaining tables, discarding every row not classified as English.
Further cleaning of the selected tables and the task-specific preprocessing for schema and entity matching are explained in detail in the following.
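<p>
As a minimal sketch, this two-step language filter could be implemented as follows, assuming fastText's publicly available lid.176.bin language identification model and an illustrative list of English domain endings (not the project's exact configuration):
</p>
<pre><code>import fasttext  # pip install fasttext; lid.176.bin is fastText's language identification model

lid_model = fasttext.load_model("lid.176.bin")
ENGLISH_TLDS = (".com", ".uk", ".org", ".net")  # illustrative list

def is_english_domain(url: str) -> bool:
    """Step 1: cheap TLD-based prefilter on the source domain."""
    host = url.split("//")[-1].split("/")[0]
    return host.endswith(ENGLISH_TLDS)

def is_english_row(text: str, threshold: float = 0.5) -> bool:
    """Step 2: per-row language check; discard rows not classified as English."""
    labels, probs = lid_model.predict(text.replace("\n", " "))
    return labels[0] == "__label__en" and probs[0] >= threshold

rows = ["Jasmine Dragon Pearls Green Tea", "Gruener Tee mit Jasmin"]
english_rows = [r for r in rows if is_english_row(r)]
</code></pre>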
<span id="toc4.1"></span>
<h3>4.1 Entity Matching</h3>
Within entity matching we focused on a specific part of the corpus, namely the Product data, our biggest entity type
with 1.66 million tables and 231 million entities in the data segments Top100 and Minimum3; we excluded the tables from the Rest segment,
since those tables were too small for our task. After the initial language cleaning step, we were left with 435,000 tables and 100 million entities.
The Product corpus already provided a clustering of the entities, so no
further annotation was needed. Here, a cluster corresponds to a collection of identical products across different origins, i.e. websites.
Focusing on clusters with at least eight tables, we used the most common brands of several
chosen categories, in particular bikes, cars, clothes, drugstore, electronics
and technology, as keywords to get relevant clusters for our final dataset. To further enhance our data, we
searched for brands with at least 1,000 distinct clusters and established another category called "random". In order
not to make the final matching too easy for the algorithms, we browsed the selected clusters for homogeneous entities
using Doc2Vec <a href="#toc8">[13]</a> and Jaccard similarity, so as to include hard-to-distinguish clusters in our dataset.
Here, we based our selection on the balance between the Jaccard and Doc2Vec scores, manually validating the best
results for each of the defined domains.
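<p>
The following sketch shows one way such scores can be computed with gensim's Doc2Vec together with a token-set Jaccard similarity; the hyperparameters and example strings are illustrative assumptions, not the project's exact setup:
</p>
<pre><code>import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

names = ["Lifeproof Case Iphone 11", "iPhone 11 Pro Max case"]
docs = [TaggedDocument(n.lower().split(), [i]) for i, n in enumerate(names)]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=40)  # illustrative hyperparameters

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def jaccard(a, b):
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa.intersection(sb)) / len(sa.union(sb))

v0, v1 = (model.infer_vector(n.lower().split()) for n in names)
print(cosine(v0, v1), jaccard(names[0], names[1]))  # high scores flag hard non-matches
</code></pre>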
<p>
To get a better understanding of hard-to-distinguish cases, we provide some examples in Table 4.1.
The upper part of the table illustrates three examples of so-called hard non-matching cases. A hard non-match describes two entities which are semantically very close
but belong to different classes. As one can see, some entities differ only in a few characters, yet are assigned to different clusters.
Hard matches, on the other hand, relate to two entities which belong to the same cluster but are not as close in the embedding space as other entities within that class.
This can easily happen when the same entities differ in the character length of their respective name column.
For hard non-matches we provide the cosine similarity between a query entity and its closest match, while for hard matches we show
the cosine metric between a query and its closest match within the same cluster. In both cases, the cosine similarity is computed by comparing the vectorized
entities obtained with Doc2Vec.
</p>
<table style="width:100%" class="centertable">
<caption style="margin-top: 1em">Table 4.1: Illustration of hard non-matches and hard matches .</caption>
<thead>
<tr>
<th>Entity</th>
<th>Most Similar Entity</th>
<th>Cosine Metric</th>
</tr>
</thead>
<tbody>
<tr>
<td>Lifeproof Case Iphone 11</td>
<td>iPhone 11 <b>Pro Max</b> case</td>
<td>0.9776</td>
</tr>
<tr>
<td>Lego Star Wars The Complete Saga <b>DS</b></td>
<td>Lego Star Wars: The Complete Saga - <b>Wii Video</b> Game</td>
<td>0.9367</td>
</tr>
<tr>
<td> <b>10 2010</b> Audi A5 Quattro Fuel Injector 2.0L 4 Cyl Bosch High Pressure</td>
<td> <b>18 2018</b> Audi A5 Quattro Fuel Injector 2.0L 4 Cyl Standard Motor Products</td>
<td>0.9771</td>
</tr>
</tbody>
<thead>
<tr>
<th>Entity</th>
<th>Matching Entity</th>
<th>Cosine Metric</th>
</tr>
</thead>
<tbody>
<tr>
<td>iPhone 11 <b>Pro Max</b> case</td>
<td>For iphone 11 <b>pro</b> x xr xs <b>max</b> cell phone case cover with camera lens protection</td>
<td>0.8514 (below top 30)</td>
</tr>
<tr>
<td>08MP-08FPS 90° Elbow <b>Long</b> Forged</td>
<td>08MP-08FPS 90° Elbow Forged</td>
<td>0.9062 (15th place)</td>
</tr>
<tr>
<td>Jasmine Dragon Pearls Green Tea</td>
<td>Jasmine Dragon Pearl <b>Jasmine</b> Green Tea</td>
<td>0.9501 (5th place)</td>
</tr>
</tbody>
</table>
To increase the performance of the models, a further cleaning step was applied after the selection. This
entailed the removal of special characters and duplicates. More complex cleaning approaches were not used because,
given the large amount of data, they carried a high risk of introducing errors.
With a multilabel stratified shuffle split <a href="#toc8">[15]</a> we distributed the remaining clusters
into ⅜ train, ¼ validation and ⅜ test data. This approach ensured that
tables were distributed evenly across sizes and selected clusters. To avoid skewing the results, we also manually
cleaned the test set, detecting about 10% noise, which we discarded entirely. Thereby, we ended up with
1,410 clusters that were used for training the algorithms. The final set sizes can be seen in Table 4.2.
</p>
<table style="width:30%" class="centertable">
<caption style="margin-top: 1em">Table 4.2: Final product dataset sizes for entity matching.</caption>
<thead>
<tr>
<th></th>
<th>Number of tables</th>
<th>Number of Entities</th>
</tr>
</thead>
<tbody>
<tr>
<td>Train</td>
<td>1,345</td>
<td>11,121</td>
</tr>
<tr>
<td>Validation</td>
<td>885</td>
<td>7,154</td>
</tr>
<tr>
<td>Test</td>
<td>1,331</td>
<td>10,655</td>
</tr>
</tbody>
</table>
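<p>
A sketch of the split itself, using the iterative-stratification package <a href="#toc8">[15]</a>, is shown below. The two-stage call reproduces the ⅜ train, ¼ validation and ⅜ test proportions; the placeholder arrays stand in for our table features and binary cluster-membership matrix.
</p>
<pre><code>import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

X = np.arange(100).reshape(-1, 1)                # placeholder table ids
Y = (np.random.rand(100, 10) > 0.7).astype(int)  # placeholder multilabel targets

# First split off the 3/8 test portion ...
outer = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.375, random_state=42)
rest_idx, test_idx = next(outer.split(X, Y))

# ... then split the remaining 5/8 into 3/8 train and 1/4 validation (0.25 / 0.625 = 0.4).
inner = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_rel, val_rel = next(inner.split(X[rest_idx], Y[rest_idx]))
train_idx, val_idx = rest_idx[train_rel], rest_idx[val_rel]
</code></pre>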
<span id="toc4.2"></span>
<h3>4.2 Schema Matching</h3>
<p>
As a basis for schema matching, it was important to gather a large amount of diverse table data to create a sufficient dataset
with at least 200 different columns that were later used for matching. The chosen columns should be evenly distributed across large, mid-sized and small tables,
and their content should cover different data types and lengths as well as hard cases for distinction.
We started with 668,593 candidate tables. The described removal of non-English tables reduced the dataset to
267,283 tables.
To create a solid database, we chose the 20 largest categories of Schema.org in order to obtain a large number of
tables containing a sufficient amount of data. Due to the vast amount of disorderly data, we initially selected 207 columns so that we could
further reduce the column set in case we detected additional unsuitable columns during the subsequent data preparation and preprocessing.
However, as we wanted to make sure that all tables contain at least three of the selected columns for schema matching, our initial dataset was already reduced to 79,318 tables.
Further, in order to create a useful dataset, we kept only tables with more than ten rows,
less than 50% NAs within the selected columns, and less than 15% NAs within the relevant columns of the entire table.
<br> The resulting 54,190 tables were then divided into
a training and a test set by performing a multi-label stratified shuffle split with three separations (random state = 42), resulting in 44,345 training tables and 9,894 test tables.
We chose a multi-label stratified shuffle split <a href="#toc8">[15]</a> to distribute not only the different categories
but also the selected columns in each category proportionally between the training and test set.
To compare different input and training sizes later on, our training data was further divided into a medium-sized and a small training set. Again,
we performed a multi-label stratified shuffle split to distribute the data, and especially the columns, proportionally, making sure that all selected columns
were represented in all datasets. Hereby, the large training set of 44,345 tables contains all 9,776 tables of the medium-sized training set, and the medium-sized training
set contains all 2,444 tables of the small training set.
<br>
As a means to create a clean and reliable set of test tables, the entire test set of 9,894 tables was manually checked for languages other than English, wrong column
labels, and multiple, missing or odd entries such as symbols. Thereby, about 10% of the tables were removed, more than half of them due to content in other languages,
reducing the test set size to 8,912 tables.
<br>
The distribution of all tables as well as tables within each selected category is presented in the Table 4.3.
</p>
<table class="centertable">
<caption style="margin-top: 1em" >Table 4.3: Final data and column set sizes and category distribution for schema matching.</caption>
<thead>
<tr>
<th></th>
<th>Small Training Set</th>
<th>Medium Training Set</th>
<th>Large Training Set</th>
<th>Test Set</th>
<th># of selected columns<br>within each category</th>
</tr>
</thead>
<tbody>
<tr>
<td>All</td>
<td>2,444</td>
<td>9,776</td>
<td>44,345</td>
<td>8,912</td>
<td>207</td>
</tr>
<tr>
<td>Product</td>
<td>1,033</td>
<td>4,256</td>
<td>19,367</td>
<td>3,839</td>
<td>46</td>
</tr>
<tr>
<td>Music Recording</td>
<td>318</td>
<td>1,102</td>
<td>5,031</td>
<td>1,097</td>
<td>7</td>
</tr>
<tr>
<td>Event</td>
<td>248</td>
<td>1,007</td>
<td>4,563</td>
<td>1,000</td>
<td>11</td>
</tr>
<tr>
<td>Creative Work</td>
<td>221</td>
<td>869</td>
<td>3,925</td>
<td>876</td>
<td>23</td>
</tr>
<tr>
<td>Recipe</td>
<td>195</td>
<td>770</td>
<td>3,522</td>
<td>727</td>
<td>24</td>
</tr>
<tr>
<td>Person</td>
<td>163</td>
<td>690</td>
<td>3,148</td>
<td>545</td>
<td>24</td>
</tr>
<tr>
<td>Local Business</td>
<td>123</td>
<td>490</td>
<td>2,209</td>
<td>381</td>
<td>23</td>
</tr>
<tr>
<td>Place</td>
<td>38</td>
<td>160</td>
<td>728</td>
<td>131</td>
<td>5</td>
</tr>
<tr>
<td>Hotel</td>
<td>38</td>
<td>156</td>
<td>701</td>
<td>117</td>
<td>8</td>
</tr>
<tr>
<td>Book</td>
<td>29</td>
<td>118</td>
<td>537</td>
<td>65</td>
<td>12</td>
</tr>
<tr>
<td>Restaurant</td>
<td>20</td>
<td>79</td>
<td>353</td>
<td>61</td>
<td>12</td>
</tr>
<tr>
<td>Music Album</td>
<td>9</td>
<td>41</td>
<td>189</td>
<td>42</td>
<td>4</td>
</tr>
<tr>
<td>TV Episode</td>
<td>9</td>
<td>38</td>
<td>162</td>
<td>31</td>
<td>3</td>
</tr>
</tbody>
</table>
<br><br>
The respective distribution of the chosen columns for schema matching is also displayed in Table 4.3, with Product, the largest category and source of tables,
contributing the majority of the columns. As mentioned above, the chosen columns were to represent different data types, so the set of 207 columns
includes 150 string, 21 datetime, 18 float, 13 integer and five geolocation columns. During the selection of the dataset and the aforementioned columns,
we made sure to include hard matching cases, i.e. columns that are very hard for the algorithm to distinguish. Table 4.4 illustrates two examples
of such hard matching cases. For example, our dataset comprises five different product GTINs (global trade item numbers) that are almost identical and only differ in length.
Another example of two very similar columns, shown in Table 4.4, comes from the Recipe class, as most recipe headlines read exactly like recipe names.
Hence, we speak of hard matching cases whenever column content can easily be mixed up within as well as between classes and is therefore hard to distinguish.
<br>
<table style="width:100%" class="centertable">
<caption style="margin-top: 1em">Table 4.4: Illustration of hard cases.</caption>
<thead>
<tr>
<th>GTIN Column</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>gtin13Product</td>
<td>7321428469419 0 8032767441293 0 8032766030245...</td>
</tr>
<tr>
<td>gtin14Product</td>
<td>00032054003584 00032054003560 00032054003591 ...</td>
</tr>
</tbody>
<thead>
<tr>
<th>Recipe Description Type</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>headlineRecipe</td>
<td>Lemony Angel Hair Pasta with Crab Turkey Spina...</td>
</tr>
<tr>
<td>nameRecipe</td>
<td>Peach Cake Recipe using Fresh Peaches Canned G.</td>
</tr>
</tbody>
</table>
</p>
<span id="toc5"></span>
<h2>5 Experimental Setup</h2>
<p>
For running the experiments we used the resources of the University of Mannheim (dws-server) as well as the resources of the state of Baden-Württemberg,
the bw-uni-cluster. This gave us access to different
setups for running the experiments efficiently, along with enough storage space for
the different datasets created for the experiments.
</p>
<span id="toc5.1"></span>
<h3>5.1 Baseline Experiments</h3>
<!--CONTENT-->
<p>
For entity matching, we used three different baseline models against which to compare the results of our algorithms.
We included one tree-based model, Random Forest, and two BERT-based models, TinyBERT and RoBERTa, because, as mentioned
in <a href="#toc3.1">Section 3.1</a>, TURL is based on TinyBERT.
All baselines were modelled as multi-class classifiers that were presented with a concatenation of the entity name and,
in the case of the product dataset, a description.
The results are presented in Table 5.1 below.
<table style="width:30%" class="centertable">
<caption style="margin-top: 1em">Table 5.1: F1 scores for different baselines models in entity matching.</caption>
<thead>
<tr>
<th></th>
<th>Random Forest</th>
<th>TinyBERT</th>
<th>RoBERTa</th>
</tr>
</thead>
<tbody>
<tr>
<td>Product</td>
<td>0.8684</td>
<td>0.8329</td>
<td>0.8958</td>
</tr>
</tbody>
</table>
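<p>
For reference, a minimal sketch of such a BERT-based multi-class baseline using the Hugging Face transformers library is shown below; the model choice, placeholder data and hyperparameters are illustrative assumptions rather than our exact training setup.
</p>
<pre><code>import torch
from torch.utils.data import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

class EntityDataset(Dataset):
    """Wraps concatenated 'name + description' strings and integer cluster labels."""
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, i):
        item = {k: torch.tensor(v[i]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(self.labels[i])
        return item

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=1410)

train_ds = EntityDataset(["lifeproof case iphone 11 shockproof cover"], [0], tokenizer)  # placeholder
args = TrainingArguments(output_dir="out", num_train_epochs=25, per_device_train_batch_size=16)
Trainer(model=model, args=args, train_dataset=train_ds).train()
</code></pre>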
<p>
As a baseline for schema matching, both tree-based and BERT-based models were used. As the tree-based model, we applied a Random Forest to
both value-based and metadata-based features.
<br> The value-based datasets were created with TF-IDF, using both a global and a binary approach.
For TF-IDF the data was preprocessed as follows: the concatenated text of each column was lower-cased and tokenized, and
stopwords as well as punctuation were removed.
For the metadata approach, the following features were created:
a binary variable indicating whether the column content was structured within brackets such as "{}";
a variable giving the length of the value;
a variable with the average word length; and
a binary variable indicating whether the column includes dates.
For these metadata features, no preprocessing was performed in order to keep the original structure of the data. The regular TF-IDF approach yielded a micro F1 score of 0.35
and the binary TF-IDF approach a micro F1 score of 0.27, while the metadata approach yielded a micro F1 score of 0.12.
</p>
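<p>
A compact sketch of the value-based baseline is given below; setting binary=True in the vectorizer yields the binary variant. The toy data and hyperparameters are illustrative, not the exact project configuration.
</p>
<pre><code>from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

# Each sample is the concatenated, lower-cased text of one column; the label is the column type.
train_texts = ["7321428469419 8032767441293", "lemony angel hair pasta with crab"]
train_labels = ["gtin13Product", "headlineRecipe"]

pipeline = make_pipeline(
    TfidfVectorizer(lowercase=True, stop_words="english"),  # binary=True for the binary variant
    RandomForestClassifier(n_estimators=100, random_state=42),
)
pipeline.fit(train_texts, train_labels)
pred = pipeline.predict(train_texts)
print(f1_score(train_labels, pred, average="micro"))
</code></pre>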
<p>
Further, we used BERT-based models, namely BERT, RoBERTa, TinyBERT and DistilBERT, as baseline models on the
respective small, medium and large training datasets. As input, the concatenated values of the target columns were used.
To keep as much information as possible, again no further preprocessing was applied. The models were trained for 25 epochs. As can be
seen from the results in Table 5.2, the models performed quite differently on the small and medium datasets. On the
large training dataset, however, all models performed quite well and, except for TinyBERT,
reached a micro F1 score of about 0.80.
<table style="width:35%" class="centertable">
<caption style="margin-top: 1em">Table 5.2: Micro F1 scores for BERT-based baseline models in schema matching.</caption>
<thead>
<tr>
<th></th>
<th>DistilBERT</th>
<th>BERT</th>
<th>TinyBERT</th>
<th>RoBERTa</th>
</tr>
</thead>
<tbody>
<tr>
<td>Small</td>
<td>0.6089</td>
<td>0.7327</td>
<td>0.6982</td>
<td>0.6601</td>
</tr>
<tr>
<td>Medium</td>
<td>0.6166</td>
<td>0.7623</td>
<td>0.7044</td>
<td>0.7569</td>
</tr>
<tr>
<td>Large</td>
<td>0.8019</td>
<td>0.8014</td>
<td>0.7593</td>
<td>0.8030</td>
</tr>
</tbody>
</table>
<p>
As mentioned earlier, TURL is based on TinyBERT, so TinyBERT represents a feasible baseline model and will mainly be used
for comparison and evaluation of the results in the subsequent chapters.
</p>
<!-- include pair-wise trainsizes?-->
<span id="toc5.2"></span>
<h3>5.2 TURL Experiments</h3>
<!--CONTENT-->
<p>
TURL already offers several predefined tasks on which the pre-trained framework can be evaluated <a href="#toc8">[1]</a>.
Of these, column type annotation proved to be the most suitable task for both entity and schema matching. In order to pretrain the model on our selected tables and
entities and to perform column type annotation, the data has to be structured as a nested list in which each table is represented by an inner list containing the
table id, page title and section title as well as further lists of headers, cell content and column types.
The given input representation of tables for the task of column type annotation can be found in the README file of the TURL
<a href="https://github.com/sunlab-osu/TURL"> git repository</a>.
Further information on the pretraining and the respective finetuning task can be found in TURL: Table Understanding through Representation Learning <a href="#toc8">[1]</a>.
For entity matching we experimented with two different approaches. One setting was based on transposing the table, such that
each column was modelled as one entity, since the prebuilt framework aggregates column information. In the second setting
we changed the TURL code itself so that it aggregates over rows instead of columns. For schema matching, the proposed column type annotation task was directly applicable.<br>
Detailed information on the settings and hyperparameters adjusted during experimentation is presented in Tables 5.3 and 5.4.
</p>
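<p>
To make the nested-list input tangible, a single table could be serialized roughly as follows. The field values are invented for illustration, and the exact field order should be checked against the TURL README.
</p>
<pre><code>import json

# One inner list per table: id, page title, section title, headers,
# per-column cell contents, and the column-type labels to predict.
table = [
    "table_0001",                                  # table id
    "Green Tea Products",                          # page title
    "Product Overview",                            # section title
    ["name", "brand", "price"],                    # headers
    [["Jasmine Dragon Pearls", "Sencha"],          # cells of column 0
     ["TeaCo", "TeaCo"],                           # cells of column 1
     ["12.99", "9.49"]],                           # cells of column 2
    [["nameProduct"], ["brandProduct"], ["priceProduct"]],  # column types
]

with open("train_tables.json", "w") as f:
    json.dump([table], f)
</code></pre>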
<br>
<div style="margin-top:2em;" >
<div style=";margin-right: 2.5em">
<div class="tab">
<button class="tablinks" onclick="openExpResult(event, 'schema')">Schema</button>
<button class="tablinks" onclick="openExpResult(event, 'entity')">Entity</button>
</div>
<div id="schema" class="tabcontent" style="display:block;">
<table class="Multi-class" >
<caption style="margin-top: 1em">Table 5.3: Schema TURL Settings</caption>
<tr>
<th>Category</th>
<th colspan="1" style='text-align:center; vertical-align:middle'> Inital Inputs</th>
<th colspan="3" style='text-align:center; vertical-align:middle'> Adjusted/Final Inputs per Train Size</th>
</tr>
<tr>
<th></th>
<th></th>
<th>Small</th>
<th>Medium</th>
<th>Large</th>
</tr>
<tr>
<td style="background-color:#E8E8E8">Training Epochs</td>
<td>10</td>
<td>50</td>
<td>50</td>
<td>50</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Learning Rate</td>
<td>5e-5</td>
<td>5e-5</td>
<td>5e-5</td>
<td>5e-5</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Batch Size</td>
<td>20</td>
<td>20</td>
<td>20</td>
<td>20</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Accumulation Steps</td>
<td>2</td>
<td>2</td>
<td>2</td>
<td>2</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Save Steps</td>
<td>5000</td>
<td>50</td>
<td>125</td>
<td>650</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Logging Steps</td>
<td>1500</td>
<td>15</td>
<td>50</td>
<td>200</td>
</tr>
<tr>
<td style="background-color:#E8E8E8">Warm up Steps</td>
<td>5000</td>
<td>50</td>
<td>125</td>
<td>650</td>
</tr>
</table>
</div>
<div id="entity" class="tabcontent" style="display: none;">
<table class="pair-wiseSmall">
<caption style="margin-top: 1em">Table 5.4: Entity TURL Settings</caption>
<tr>
<th>Category</th>
<th colspan="1" style='text-align:center; vertical-align:middle'> Inital Inputs</th>
<th colspan="2" style='text-align:center; vertical-align:middle'> Adjusted/Final Inputs per Model Type</th>
</tr>
<tr>
<th></th>
<th></th>
<th>TURL (transposed) </th>
<th>TURL (rewritten)</th>
</tr>