diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index e56ef2ecc..c822b592a 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -352,3 +352,30 @@ def flag_duplicates(test_column: Column) -> Column:
             )
             > 1
         )
+
+    @staticmethod
+    def generate_identifier(uniqueness_defining_columns: list[str]) -> Column:
+        """Hashes the provided columns to generate a unique identifier.
+
+        Every column is cast to string and nulls are replaced by the literal "None"
+        before hashing. The values are concatenated in the given order, without a
+        separator, and the result is hashed with MD5.
+
+        Args:
+            uniqueness_defining_columns (list[str]): list of columns defining uniqueness
+
+        Returns:
+            Column: column with a unique identifier
+
+        Raises:
+            ValueError: if no columns are provided
+        """
+        if not uniqueness_defining_columns:
+            raise ValueError("At least one column is required to generate an identifier.")
+        # NOTE: no separator is used, as in the previous studyLocusId implementation, so
+        # ("ab", "c") and ("a", "bc") hash to the same value.
+        hashable_columns = [
+            f.coalesce(f.col(column).cast("string"), f.lit("None"))
+            for column in uniqueness_defining_columns
+        ]
+        return f.md5(f.concat(*hashable_columns))
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 1b3473148..a4d35e7d5 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -447,24 +447,21 @@ def _align_overlapping_tags(
         )
 
     @staticmethod
-    def assign_study_locus_id(
-        study_id_col: Column,
-        variant_id_col: Column,
-        finemapping_col: Column = None,
-    ) -> Column:
-        """Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId.
+    def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column:
+        """Hashes the provided columns to extract a consistent studyLocusId.
+
+        The identifier depends on the order and the exact set of provided columns, so
+        the same list must be used wherever study locus IDs are compared.
 
         Args:
-            study_id_col (Column): column name with a study ID
-            variant_id_col (Column): column name with a variant ID
-            finemapping_col (Column, optional): column with fine mapping methodology
+            uniqueness_defining_columns (list[str]): list of columns defining uniqueness
 
         Returns:
             Column: column with a study locus ID
 
         Examples:
             >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
-            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False)
+            >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False)
             +----------+----------+-----------------+--------------------------------+
             |studyId   |variantId |finemappingMethod|study_locus_id                  |
             +----------+----------+-----------------+--------------------------------+
@@ -473,15 +470,7 @@ def assign_study_locus_id(
             +----------+----------+-----------------+--------------------------------+
 
         """
-        if finemapping_col is None:
-            finemapping_col = f.lit("None")
-        columns = [study_id_col, variant_id_col, finemapping_col]
-        hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None"))
-                            .otherwise(column.cast("string"))
-                            for column in columns]
-        return f.md5(f.concat(*hashable_columns)).alias(
-            "studyLocusId"
-        )
+        return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId")
 
     @classmethod
     def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
diff --git a/src/gentropy/datasource/eqtl_catalogue/finemapping.py b/src/gentropy/datasource/eqtl_catalogue/finemapping.py
index 11ec5bef1..0808b7016 100644
--- a/src/gentropy/datasource/eqtl_catalogue/finemapping.py
+++ b/src/gentropy/datasource/eqtl_catalogue/finemapping.py
@@ -260,7 +260,7 @@ def from_susie_results(
                 .select(
                     *study_locus_cols,
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                        ["studyId", "variantId", "finemappingMethod"]
                     ),
                     StudyLocus.calculate_credible_set_log10bf(
                         f.col("locus.logBF")
diff --git a/src/gentropy/datasource/finngen/finemapping.py b/src/gentropy/datasource/finngen/finemapping.py
index 092a79372..3c83ba8ff 100644
--- a/src/gentropy/datasource/finngen/finemapping.py
+++ b/src/gentropy/datasource/finngen/finemapping.py
@@ -471,7 +471,7 @@ def from_finngen_susie_finemapping(
         ).withColumn(
             "studyLocusId",
             StudyLocus.assign_study_locus_id(
-                f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                ["studyId", "variantId", "finemappingMethod"]
             ),
         )
 
diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py
index 5e84079a1..b34944b11 100644
--- a/src/gentropy/datasource/gwas_catalog/associations.py
+++ b/src/gentropy/datasource/gwas_catalog/associations.py
@@ -1188,7 +1188,7 @@ def update_study_id(
             .drop("subStudyDescription", "updatedStudyId")
         ).withColumn(
             "studyLocusId",
-            StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
+            StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
         )
         return self
 
diff --git a/src/gentropy/datasource/open_targets/l2g_gold_standard.py b/src/gentropy/datasource/open_targets/l2g_gold_standard.py
index 2cfcd62f8..26d5a0253 100644
--- a/src/gentropy/datasource/open_targets/l2g_gold_standard.py
+++ b/src/gentropy/datasource/open_targets/l2g_gold_standard.py
@@ -52,7 +52,7 @@ def parse_positive_curation(
             )
             .withColumn(
                 "studyLocusId",
-                StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
+                StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
             )
             .groupBy("studyLocusId", "studyId", "variantId", "geneId")
             .agg(f.collect_set("source").alias("sources"))
diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
index 6f80d826e..ff8c6c8ff 100644
--- a/src/gentropy/l2g.py
+++ b/src/gentropy/l2g.py
@@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
         study_locus_overlap = StudyLocus(
             _df=self.credible_set.df.join(
                 f.broadcast(
-                    self.gs_curation.select(
-                        StudyLocus.assign_study_locus_id(
-                            f.col("association_info.otg_id"),  # studyId
-                            f.concat_ws(  # variantId
+                    self.gs_curation
+                    .withColumn(
+                        "variantId",
+                        f.concat_ws(
                                 "_",
                                 f.col("sentinel_variant.locus_GRCh38.chromosome"),
                                 f.col("sentinel_variant.locus_GRCh38.position"),
                                 f.col("sentinel_variant.alleles.reference"),
                                 f.col("sentinel_variant.alleles.alternative"),
-                            ),
-                        ).alias("studyLocusId"),
+                        )
+                    )
+                    .select(
+                        StudyLocus.assign_study_locus_id(
+                            ["association_info.otg_id",  # studyId
+                             "variantId"]
+                        ),
                     )
                 ),
                 "studyLocusId",
diff --git a/src/gentropy/method/locus_breaker_clumping.py b/src/gentropy/method/locus_breaker_clumping.py
index 0ca7ae29b..fd7661a22 100644
--- a/src/gentropy/method/locus_breaker_clumping.py
+++ b/src/gentropy/method/locus_breaker_clumping.py
@@ -112,8 +112,8 @@ def locus_breaker(
                     .cast(t.ArrayType(t.StringType()))
                     .alias("qualityControls"),
                     StudyLocus.assign_study_locus_id(
-                        f.col("studyId"), f.col("variantId")
-                    ).alias("studyLocusId"),
+                        ["studyId", "variantId"]
+                    ),
                 )
             ),
             _schema=StudyLocus.get_schema(),
diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py
index 6889aaa26..5fd084efd 100644
--- a/src/gentropy/method/pics.py
+++ b/src/gentropy/method/pics.py
@@ -257,7 +257,7 @@ def finemap(
             .withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                    ["studyId", "variantId", "finemappingMethod"]
                 ),
             )
             .drop("neglog_pvalue")
diff --git a/src/gentropy/method/window_based_clumping.py b/src/gentropy/method/window_based_clumping.py
index 9ef747abf..3ab15d42f 100644
--- a/src/gentropy/method/window_based_clumping.py
+++ b/src/gentropy/method/window_based_clumping.py
@@ -247,7 +247,7 @@ def clump(
             .withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    f.col("studyId"), f.col("variantId")
+                    ["studyId", "variantId"]
                 ),
             )
             # Initialize QC column as array of strings:
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index a80591c60..26c73e20f 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -95,7 +95,7 @@ def __init__(
             .df.withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                    ["studyId", "variantId", "finemappingMethod"]
                 ),
             )
             .collect()[0]
@@ -247,7 +247,7 @@ def susie_inf_to_studylocus(
             .withColumn(
                 "studyLocusId",
                 StudyLocus.assign_study_locus_id(
-                    f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
+                    ["studyId", "variantId", "finemappingMethod"]
                 ),
             )
             .select(