fix: failing tests are fixed

opentargets · Sep 25, 2024 · f703d47 · f703d47
1 parent d9e554b
commit f703d47
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 10 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/gentropy/common/schemas.py b/src/gentropy/common/schemas.py
@@ -159,11 +159,21 @@ def compare_struct_schemas(
 
     The comparison is done recursively, so nested structs are also compared.
 
+    Checking logic:
+    1. Checking for duplicated columns in the observed schema.
+    2. Checking for missing mandatory columns in the observed schema.
+    3. Now that all mandatory columns are known to be present, iterate over the observed schema and compare the types.
+    4. Flagging unexpected columns in the observed schema.
+    5. Flagging columns with non-matching types.
+    6. If a column is a struct -> call compare_struct_schemas
+    7. If a column is an array -> call compare_array_schemas
+    8. Return dictionary with issues.
+
     Args:
         observed_schema (StructType): The observed schema.
         expected_schema (StructType): The expected schema.
-        parent_field_name (str, optional): The parent field name. Defaults to None.
-        schema_issues (defaultdict[str, list[str]], optional): The schema issues. Defaults to None.
+        parent_field_name (str | None): The parent field name. Defaults to None.
+        schema_issues (defaultdict[str, list[str]] | None): The schema issues. Defaults to None.
 
     Returns:
         defaultdict[str, list[str]]: The schema issues.

diff --git a/tests/gentropy/dataset/test_study_index.py b/tests/gentropy/dataset/test_study_index.py
@@ -167,14 +167,14 @@ def _setup(self: TestGeneValidation, spark: SparkSession) -> None:
         """Setup fixture."""
         self.study_index = StudyIndex(
             _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS).withColumn(
-                "qualityControls", f.array()
+                "qualityControls", f.array().cast("array<string>")
             ),
             _schema=StudyIndex.get_schema(),
         )
 
         self.study_index_no_gene = StudyIndex(
             _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS)
-            .withColumn("qualityControls", f.array())
+            .withColumn("qualityControls", f.array().cast("array<string>"))
             .drop("geneId"),
             _schema=StudyIndex.get_schema(),
         )
@@ -231,7 +231,7 @@ def _setup(self: TestUniquenessValidation, spark: SparkSession) -> None:
         """Setup fixture."""
         self.study_index = StudyIndex(
             _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS).withColumn(
-                "qualityControls", f.array()
+                "qualityControls", f.array().cast("array<string>")
             ),
             _schema=StudyIndex.get_schema(),
         )
@@ -279,7 +279,7 @@ def _setup(self: TestStudyTypeValidation, spark: SparkSession) -> None:
         """Setup fixture."""
         self.study_index = StudyIndex(
             _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS).withColumn(
-                "qualityControls", f.array()
+                "qualityControls", f.array().cast("array<string>")
             ),
             _schema=StudyIndex.get_schema(),
         )
@@ -346,8 +346,10 @@ def _setup(self: TestDiseaseValidation, spark: SparkSession) -> None:
             spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS)
             .groupBy("studyId", "studyType", "projectId")
             .agg(f.collect_set("efo").alias("traitFromSourceMappedIds"))
-            .withColumn("qualityControls", f.array())
-            .withColumn("backgroundTraitFromSourceMappedIds", f.array())
+            .withColumn("qualityControls", f.array().cast("array<string>"))
+            .withColumn(
+                "backgroundTraitFromSourceMappedIds", f.array().cast("array<string>")
+            )
         )
         study_df.show()
         # Mock study index:

diff --git a/tests/gentropy/method/test_clump.py b/tests/gentropy/method/test_clump.py
@@ -135,7 +135,9 @@ def test_flagging(self: TestIsLeadLinked) -> None:
         """Test flagging of lead variants."""
         # Create the study locus and clump:
         sl_flagged = StudyLocus(
-            _df=self.df.drop("expected_flag").withColumn("qualityControls", f.array()),
+            _df=self.df.drop("expected_flag").withColumn(
+                "qualityControls", f.array().cast("array<string>")
+            ),
             _schema=StudyLocus.get_schema(),
         ).clump()