From 5fb7f9cdcb964b2c346f1dc6d446492dfcc40e86 Mon Sep 17 00:00:00 2001
From: GitHub Action Website Snapshot <>
Date: Mon, 4 Nov 2024 09:25:40 +0000
Subject: [PATCH] Refreshing website content from main repo.

Source commit: https://github.com/OpenLineage/OpenLineage/commit/45e99cb71fe895a242b8bb3a8b3010eed0720ca2
---
 blog/column-lineage/index.mdx                 | 50 ++++++------
 docs/client/java/partials/java_transport.md   |  3 +-
 docs/integrations/airflow/airflow.md          |  2 +-
 .../airflow/default-extractors.md             |  2 +-
 .../airflow/extractors/custom-extractors.md   |  2 +-
 .../airflow/extractors/extractor-testing.md   |  2 +-
 docs/integrations/airflow/job-hierarchy.md    |  2 +-
 docs/integrations/airflow/manual.md           |  2 +-
 docs/integrations/airflow/older.md            |  6 +-
 docs/integrations/airflow/usage.md            |  4 +-
 docs/integrations/flink.md                    | 10 +--
 .../spark/configuration/spark_conf.md         |  2 +-
 docs/spec/naming.md                           | 79 ++++++++++---------
 13 files changed, 88 insertions(+), 78 deletions(-)

diff --git a/blog/column-lineage/index.mdx b/blog/column-lineage/index.mdx
index 6a42880..3905309 100644
--- a/blog/column-lineage/index.mdx
+++ b/blog/column-lineage/index.mdx
@@ -27,31 +27,33 @@ In the process of implementing column-level lineage, Paweł and Julien contribut
 
 An example of a `columnLineage` facet in the outputs array of a lineage event:
 
-    {
-        "namespace": "{namespace of the outputdataset}",
-        "name": "{name of the output dataset}",
-        "facets": {
-            "schema": {
-                "fields": [
-                    { "name": "{first column of the output dataset}", "type": "{its type}"},
-                    { "name": "{second column of the output dataset}", "type": "{its type}"},
-                    ...
-                ]
-            },
-            "columnLineage": {
-                "{first column of the output dataset}": {
-                    "inputFields": [
-                        { "namespace": "{input dataset namespace}", name: "{input dataset name}", "field": "{input dataset column name}"},
-                        ... other inputs
-                    ],
-                    "transformationDescription": "identical",
-                    "transformationType": "IDENTITY"
-                },
-                "{second column of the output dataset}": ...,
-                ...
-            }
-        }
-    }
+```json
+{
+  "namespace": "{namespace of the output dataset}",
+  "name": "{name of the output dataset}",
+  "facets": {
+    "schema": {
+      "fields": [
+        { "name": "{first column of the output dataset}", "type": "{its type}" },
+        { "name": "{second column of the output dataset}", "type": "{its type}" },
+        ...
+      ]
+    },
+    "columnLineage": {
+      "{first column of the output dataset}": {
+        "inputFields": [
+          { "namespace": "{input dataset namespace}", "name": "{input dataset name}", "field": "{input dataset column name}" },
+          ... other inputs
+        ],
+        "transformationDescription": "identical",
+        "transformationType": "IDENTITY"
+      },
+      "{second column of the output dataset}": ...,
+      ...
+    }
+  }
+}
+```
 
 ### How it works
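To make the template concrete, here is one way it could be filled in, as a minimal sketch: a hypothetical `orders_summary` table whose `order_id` column is copied from `orders.id`. All names are invented, and the shape deliberately mirrors the template above rather than any later revision of the facet:

```python
import json

# Hypothetical output dataset built from a single input column.
# Namespace and table names are invented for illustration.
column_lineage_output = {
    "namespace": "postgres://warehouse.example.com:5432",
    "name": "food_delivery.public.orders_summary",
    "facets": {
        "schema": {
            "fields": [
                {"name": "order_id", "type": "INTEGER"},
            ]
        },
        "columnLineage": {
            "order_id": {
                "inputFields": [
                    {
                        "namespace": "postgres://warehouse.example.com:5432",
                        "name": "food_delivery.public.orders",
                        "field": "id",
                    }
                ],
                "transformationDescription": "identical",
                "transformationType": "IDENTITY",
            }
        },
    },
}

print(json.dumps(column_lineage_output, indent=2))
```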
diff --git a/docs/client/java/partials/java_transport.md b/docs/client/java/partials/java_transport.md
index 2c3a264..3e0c5a8 100644
--- a/docs/client/java/partials/java_transport.md
+++ b/docs/client/java/partials/java_transport.md
@@ -101,7 +101,8 @@ spark.openlineage.transport.headers.X-Some-Extra-Header=abc
 spark.openlineage.transport.compression=gzip
 ```
 
-<details><summary>URL parsing within Spark integration</summary>
+<details>
+<summary>URL parsing within Spark integration</summary>
 <p>
 
 You can supply http parameters using values in url, the parsed `spark.openlineage.*` properties are located in url as follows:
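As a hedged sketch of what that parsing means in practice (the host, namespace, and API key are hypothetical, and the authoritative property-by-property mapping lives in the rest of this collapsible section, which this hunk does not show), a single URL such as `http://lineage.example.com:5000/api/v1/namespaces/my_namespace?api_key=abc123` would decompose along these lines:

```
spark.openlineage.transport.type=http
spark.openlineage.transport.url=http://lineage.example.com:5000
spark.openlineage.namespace=my_namespace
spark.openlineage.transport.auth.apiKey=abc123
```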

diff --git a/docs/integrations/airflow/airflow.md b/docs/integrations/airflow/airflow.md
index 254bc79..a7bb2c4 100644
--- a/docs/integrations/airflow/airflow.md
+++ b/docs/integrations/airflow/airflow.md
@@ -4,7 +4,7 @@ title: Apache Airflow
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,

diff --git a/docs/integrations/airflow/default-extractors.md b/docs/integrations/airflow/default-extractors.md
index bbdcced..8cb012b 100644
--- a/docs/integrations/airflow/default-extractors.md
+++ b/docs/integrations/airflow/default-extractors.md
@@ -4,7 +4,7 @@ title: Exposing Lineage in Airflow Operators
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,

diff --git a/docs/integrations/airflow/extractors/custom-extractors.md b/docs/integrations/airflow/extractors/custom-extractors.md
index 8316cb9..9bb1280 100644
--- a/docs/integrations/airflow/extractors/custom-extractors.md
+++ b/docs/integrations/airflow/extractors/custom-extractors.md
@@ -4,7 +4,7 @@ title: Custom Extractors
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,
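Since this file concerns custom extractors, a minimal skeleton may help orient readers of the pre-2.7 integration. This is a sketch assuming the `openlineage-airflow` extractor API; `MyOperator` and the module layout are hypothetical:

```python
from typing import List, Optional

from openlineage.airflow.extractors.base import BaseExtractor, TaskMetadata


class MyOperatorExtractor(BaseExtractor):
    """Extracts lineage from a hypothetical MyOperator."""

    @classmethod
    def get_operator_classnames(cls) -> List[str]:
        # Operator class names this extractor should handle.
        return ["MyOperator"]

    def extract(self) -> Optional[TaskMetadata]:
        # self.operator is the task being examined; return inputs and
        # outputs as OpenLineage Dataset objects where available.
        return TaskMetadata(
            name=f"{self.operator.dag_id}.{self.operator.task_id}",
            inputs=[],   # e.g. [Dataset(namespace=..., name=...)]
            outputs=[],
        )
```

Registration happens through the `OPENLINEAGE_EXTRACTORS` environment variable, e.g. `OPENLINEAGE_EXTRACTORS=mypackage.extractors.MyOperatorExtractor`.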

diff --git a/docs/integrations/airflow/extractors/extractor-testing.md b/docs/integrations/airflow/extractors/extractor-testing.md
index 1e5d7d8..0a891cf 100644
--- a/docs/integrations/airflow/extractors/extractor-testing.md
+++ b/docs/integrations/airflow/extractors/extractor-testing.md
@@ -4,7 +4,7 @@ title: Testing Custom Extractors
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,

diff --git a/docs/integrations/airflow/job-hierarchy.md b/docs/integrations/airflow/job-hierarchy.md
index 90bcf28..c8491ad 100644
--- a/docs/integrations/airflow/job-hierarchy.md
+++ b/docs/integrations/airflow/job-hierarchy.md
@@ -4,7 +4,7 @@ title: Job Hierarchy
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,

diff --git a/docs/integrations/airflow/manual.md b/docs/integrations/airflow/manual.md
index 1916885..b3a40d0 100644
--- a/docs/integrations/airflow/manual.md
+++ b/docs/integrations/airflow/manual.md
@@ -4,7 +4,7 @@ title: Manually Annotated Lineage
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,
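For orientation, manually annotated lineage in this external integration builds on Airflow's own `inlets`/`outlets` annotations. A rough sketch, assuming Airflow's `airflow.lineage.entities` classes and entirely invented table and bucket names:

```python
from airflow.lineage.entities import File, Table
from airflow.operators.bash import BashOperator

# Hypothetical task: lineage declared by hand via inlets/outlets, which
# the integration can pick up for operators it has no extractor for.
copy_orders = BashOperator(
    task_id="copy_orders",
    bash_command="echo hypothetical copy step",
    inlets=[Table(database="raw", cluster="postgres://db.example.com:5432", name="orders")],
    outlets=[File(url="s3://warehouse-bucket/orders/")],
)
```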

diff --git a/docs/integrations/airflow/older.md b/docs/integrations/airflow/older.md
index 3c2a646..2054a5b 100644
--- a/docs/integrations/airflow/older.md
+++ b/docs/integrations/airflow/older.md
@@ -4,7 +4,7 @@ title: Supported Airflow versions
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,
@@ -16,7 +16,7 @@ while the `openlineage-airflow` will primarily be updated for bug fixes.
 
 ##### Airflow 2.7+
 
 This package **should not** be used starting with Airflow 2.7.0 and **can not** be used with Airflow 2.8+.
-It was designed as Airflow's external integration that works mainly for Airflow versions <2.7.
+It was designed as Airflow's external integration that works mainly for Airflow versions \<2.7.
 For Airflow 2.7+ use the native Airflow OpenLineage provider
 [package](https://airflow.apache.org/docs/apache-airflow-providers-openlineage) `apache-airflow-providers-openlineage`.
@@ -44,6 +44,6 @@ openlineage.lineage_backend.OpenLineageBackend
 
 The OpenLineageBackend does not take into account manually configured inlets and outlets.
 
-##### Airflow <2.1
+##### Airflow \<2.1
 
 OpenLineage does not work with versions older than Airflow 2.1.

diff --git a/docs/integrations/airflow/usage.md b/docs/integrations/airflow/usage.md
index bdb5573..4815b21 100644
--- a/docs/integrations/airflow/usage.md
+++ b/docs/integrations/airflow/usage.md
@@ -4,7 +4,7 @@ title: Using the Airflow Integration
 ---
 
 :::caution
-This page is about Airflow's external integration that works mainly for Airflow versions <2.7.
+This page is about Airflow's external integration that works mainly for Airflow versions \<2.7.
 [If you're using Airflow 2.7+, look at native Airflow OpenLineage provider documentation.](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html)
 
 The ongoing development and enhancements will be focused on the `apache-airflow-providers-openlineage` package,
@@ -14,7 +14,7 @@ while the `openlineage-airflow` will primarily be updated for bug fixes. See [al
 
 #### PREREQUISITES
 
 - [Python 3.8](https://www.python.org/downloads)
-- [Airflow >= 2.1,<2.8](https://pypi.org/project/apache-airflow)
+- [Airflow >= 2.1,\<2.8](https://pypi.org/project/apache-airflow)
 
 To use the OpenLineage Airflow integration, you'll need a running [Airflow instance](https://airflow.apache.org/docs/apache-airflow/stable/start.html). You'll also need an OpenLineage-compatible [backend](https://github.com/OpenLineage/OpenLineage#scope).

diff --git a/docs/integrations/flink.md b/docs/integrations/flink.md
index d252d9f..871b94e 100644
--- a/docs/integrations/flink.md
+++ b/docs/integrations/flink.md
@@ -118,11 +118,11 @@ and allows all the configuration features present there to be used. The configur
 
 The following parameters can be specified:
 
-| Parameter | Definition | Example |
-|---|---|---|
-| openlineage.transport.type | The transport type used for event emit, default type is `console` | http |
-| openlineage.facets.disabled | List of facets to disable, enclosed in `[]` (required from 0.21.x) and separated by `;`, default is `[spark_unknown;spark.logicalPlan;]` (currently must contain `;`) | \[some_facet1;some_facet1\] |
-| openlineage.job.owners.<type> | Specifies ownership of the job. Multiple entries with different types are allowed. Config key name and value are used to create job ownership type and name (available since 1.13). | openlineage.job.owners.team="Some Team" |
+| Parameter | Definition | Example |
+|---|---|---|
+| openlineage.transport.type | The transport type used for event emit, default type is `console` | http |
+| openlineage.facets.disabled | List of facets to disable, enclosed in `[]` (required from 0.21.x) and separated by `;`, default is `[spark_unknown;spark.logicalPlan;]` (currently must contain `;`) | \[some_facet1;some_facet1\] |
+| openlineage.job.owners.\<type\> | Specifies ownership of the job. Multiple entries with different types are allowed. Config key name and value are used to create job ownership type and name (available since 1.13). | openlineage.job.owners.team="Some Team" |
 
 ## Transports
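Assembled into a configuration fragment, the Flink parameters above might look like this (endpoint and owner values are invented for illustration; the parameter names come from the table):

```
openlineage.transport.type=http
openlineage.transport.url=http://lineage.example.com:5050
openlineage.facets.disabled=[spark_unknown;spark.logicalPlan;]
openlineage.job.owners.team=Some Team
openlineage.job.owners.maintainer=jane.doe@example.com
```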
diff --git a/docs/integrations/spark/configuration/spark_conf.md b/docs/integrations/spark/configuration/spark_conf.md
index 58ab68e..fc57979 100644
--- a/docs/integrations/spark/configuration/spark_conf.md
+++ b/docs/integrations/spark/configuration/spark_conf.md
@@ -22,5 +22,5 @@ The following parameters can be specified:
 | spark.openlineage.jobName.appendDatasetName | Decides whether output dataset name should be appended to job name. By default `true`. | false |
 | spark.openlineage.jobName.replaceDotWithUnderscore | Replaces dots in job name with underscore. Can be used to mimic legacy behaviour on Databricks platform. By default `false`. | false |
 | spark.openlineage.debugFacet | Determines whether debug facet shall be generated and included within the event. Set `enabled` to turn it on. By default, facet is disabled. | enabled |
-| spark.openlineage.job.owners.<type> | Specifies ownership of the job. Multiple entries with different types are allowed. Config key name and value are used to create job ownership type and name (available since 1.13). | spark.openlineage.job.owners.team="Some Team" |
+| spark.openlineage.job.owners.\<type\> | Specifies ownership of the job. Multiple entries with different types are allowed. Config key name and value are used to create job ownership type and name (available since 1.13). | spark.openlineage.job.owners.team="Some Team" |
 | spark.openlineage.columnLineage.datasetLineageEnabled | Makes the dataset dependencies to be included in their own property `dataset` in the column lineage pattern. If this flag is set to `false`, then the dataset dependencies are merged into `fields` property. The default value is `false`. **It is recommended to set it to `true`** | true |
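As an illustrative `SparkConf` fragment exercising the parameters above (values invented; the listener class and `spark.openlineage.transport.type` come from the wider Spark integration docs rather than this table):

```
spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener
spark.openlineage.transport.type=console
spark.openlineage.jobName.appendDatasetName=false
spark.openlineage.debugFacet=enabled
spark.openlineage.job.owners.team=Some Team
spark.openlineage.columnLineage.datasetLineageEnabled=true
```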
diff --git a/docs/spec/naming.md b/docs/spec/naming.md
index ecd48fc..975495e 100644
--- a/docs/spec/naming.md
+++ b/docs/spec/naming.md
@@ -4,65 +4,72 @@ sidebar_position: 3
 
 # Naming Conventions
 
-Employing a unique naming strategy per resource ensures that the spec is followed uniformly regardless of metadata producer.
+Employing a unique naming strategy per resource ensures that the spec is followed uniformly regardless of metadata
+producer.
 
-Jobs and Datasets have their own namespaces, job namespaces being derived from schedulers and dataset namespaces from datasources.
+Jobs and Datasets have their own namespaces, job namespaces being derived from schedulers and dataset namespaces from
+datasources.
 
 ## Dataset Naming
 
 A dataset, or `table`, is organized according to a producer, namespace, database and (optionally) schema.
 
-| Data Store | Type | Namespace | Name |
-|:---|:---|:---|:---|
-| Athena | Warehouse | awsathena://athena.{region_name}.amazonaws.com | {catalog}.{database}.{table} |
-| AWS Glue | Data catalog | arn:aws:glue:{region}:{account id} | table/{database name}/{table name} |
-| Azure Cosmos DB | Warehouse | azurecosmos://{host}/dbs/{database} | colls/{table} |
-| Azure Data Explorer | Warehouse | azurekusto://{host}.kusto.windows.net | {database}/{table} |
-| Azure Synapse | Warehouse | sqlserver://{host}:{port} | {schema}.{table} |
-| BigQuery | Warehouse | bigquery:// | {project id}.{dataset name}.{table name} |
-| Cassandra | Warehouse | cassandra://{host}:{port} | {keyspace}.{table} |
-| MySQL | Warehouse | mysql://{host}:{port} | {database}.{table} |
-| Oracle | Warehouse | oracle://{host}:{port} | {serviceName}.{schema}.{table} or {sid}.{schema}.{table} |
-| Postgres | Warehouse | postgres://{host}:{port} | {database}.{schema}.{table} |
-| Teradata | Warehouse | teradata://{host}:{port} | {database}.{table} |
-| Redshift | Warehouse | redshift://{cluster_identifier}.{region_name}:{port} | {database}.{schema}.{table} |
-| Snowflake | Warehouse | snowflake://{organization name}-{account name} | {database}.{schema}.{table} |
-| Trino | Warehouse | trino://{host}:{port} | {catalog}.{schema}.{table} |
-| ABFSS (Azure Data Lake Gen2) | Data lake | abfss://{container name}@{service name}.dfs.core.windows.net | {path} |
-| DBFS (Databricks File System) | Distributed file system | dbfs://{workspace name} | {path} |
-| GCS | Blob storage | gs://{bucket name} | {object key} |
-| HDFS | Distributed file system | hdfs://{namenode host}:{namenode port} | {path} |
-| Kafka | distributed event streaming platform | kafka://{bootstrap server host}:{port} | {topic} |
-| Local file system | File system | file | {path} |
-| Remote file system | File system | file://{host} | {path} |
-| S3 | Blob Storage | s3://{bucket name} | {object key} |
-| WASBS (Azure Blob Storage) | Blob Storage | wasbs://{container name}@{service name}.dfs.core.windows.net | {object key} |
+| Data Store | Type | Namespace | Name |
+|:---|:---|:---|:---|
+| Athena | Warehouse | `awsathena://athena.{region_name}.amazonaws.com` | `{catalog}.{database}.{table}` |
+| AWS Glue | Data catalog | `arn:aws:glue:{region}:{account id}` | `table/{database name}/{table name}` |
+| Azure Cosmos DB | Warehouse | `azurecosmos://{host}/dbs/{database}` | `colls/{table}` |
+| Azure Data Explorer | Warehouse | `azurekusto://{host}.kusto.windows.net` | `{database}/{table}` |
+| Azure Synapse | Warehouse | `sqlserver://{host}:{port}` | `{schema}.{table}` |
+| BigQuery | Warehouse | `bigquery://` | `{project id}.{dataset name}.{table name}` |
+| Cassandra | Warehouse | `cassandra://{host}:{port}` | `{keyspace}.{table}` |
+| MySQL | Warehouse | `mysql://{host}:{port}` | `{database}.{table}` |
+| Oracle | Warehouse | `oracle://{host}:{port}` | `{serviceName}.{schema}.{table} or {sid}.{schema}.{table}` |
+| Postgres | Warehouse | `postgres://{host}:{port}` | `{database}.{schema}.{table}` |
+| Teradata | Warehouse | `teradata://{host}:{port}` | `{database}.{table}` |
+| Redshift | Warehouse | `redshift://{cluster_identifier}.{region_name}:{port}` | `{database}.{schema}.{table}` |
+| Snowflake | Warehouse | `snowflake://{organization name}-{account name}` | `{database}.{schema}.{table}` |
+| Trino | Warehouse | `trino://{host}:{port}` | `{catalog}.{schema}.{table}` |
+| ABFSS (Azure Data Lake Gen2) | Data lake | `abfss://{container name}@{service name}.dfs.core.windows.net` | `{path}` |
+| DBFS (Databricks File System) | Distributed file system | `dbfs://{workspace name}` | `{path}` |
+| GCS | Blob storage | `gs://{bucket name}` | `{object key}` |
+| HDFS | Distributed file system | `hdfs://{namenode host}:{namenode port}` | `{path}` |
+| Kafka | distributed event streaming platform | `kafka://{bootstrap server host}:{port}` | `{topic}` |
+| Local file system | File system | `file` | `{path}` |
+| Remote file system | File system | `file://{host}` | `{path}` |
+| S3 | Blob Storage | `s3://{bucket name}` | `{object key}` |
+| WASBS (Azure Blob Storage) | Blob Storage | `wasbs://{container name}@{service name}.dfs.core.windows.net` | `{object key}` |
 
 ## Job Naming
 
-A `Job` is a recurring data transformation with inputs and outputs. Each execution is captured as a `Run` with corresponding metadata.
+A `Job` is a recurring data transformation with inputs and outputs. Each execution is captured as a `Run` with
+corresponding metadata.
 A `Run` event identifies the `Job` it instances by providing the job’s unique identifier.
-The `Job` identifier is composed of a `Namespace` and `Name`. The job namespace is usually set in OpenLineage client config. The job name is unique within its namespace.
+The `Job` identifier is composed of a `Namespace` and `Name`. The job namespace is usually set in OpenLineage client
+config. The job name is unique within its namespace.
 
-| Job type | Name | Example |
-|:---|:---|:---|
-| Airflow task | {dag_id}.{task_id} | orders_etl.count_orders |
-| Spark job | {appName}.{command}.{table} | my_awesome_app.execute_insert_into_hive_table.mydb_mytable |
-| SQL | {schema}.{table} | gx.validate_datasets |
+| Job type | Name | Example |
+|:---|:---|:---|
+| Airflow task | `{dag_id}.{task_id}` | `orders_etl.count_orders` |
+| Spark job | `{appName}.{command}.{table}` | `my_awesome_app.execute_insert_into_hive_table.mydb_mytable` |
+| SQL | `{schema}.{table}` | `gx.validate_datasets` |
 
 ## Run Naming
 
-Runs are named using client-generated UUIDs. The OpenLineage client is responsible for generating them and maintaining them throughout the duration of the runcycle.
+Runs are named using client-generated UUIDs. The OpenLineage client is responsible for generating them and maintaining
+them throughout the duration of the run cycle.
 
 ```python
 from openlineage.client.run import Run
 from openlineage.client.uuid import generate_new_uuid
+
 run = Run(str(generate_new_uuid()))
 ```
 
 ## Why Naming Matters
 
-Naming enables focused insight into data flows, even when datasets and workflows are distributed across an organization. This focus enabled by naming is key to the production of useful lineage.
+Naming enables focused insight into data flows, even when datasets and workflows are distributed across an organization.
+This focus enabled by naming is key to the production of useful lineage.
 
 ![image](./naming-correlations.svg)
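Tying the conventions together, a short sketch using the same client the Run example imports from (`Dataset` and `Job` sit alongside `Run` in `openlineage.client.run`; the hosts and names below are invented):

```python
from openlineage.client.run import Dataset, Job, Run
from openlineage.client.uuid import generate_new_uuid

# Dataset name per the Postgres convention: {database}.{schema}.{table}
orders = Dataset(
    namespace="postgres://db.example.com:5432",
    name="food_delivery.public.orders",
)

# Job name per the Airflow convention: {dag_id}.{task_id}
job = Job(namespace="my_airflow_instance", name="orders_etl.count_orders")

# Run named by a client-generated UUID, as above.
run = Run(runId=str(generate_new_uuid()))
```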