From b525117be9ed75f3bde2b7934145654b4d018f2c Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Wed, 25 Sep 2024 11:41:43 +0100 Subject: [PATCH 1/2] fix: clean unused study_locus step parameter (#786) --- src/gentropy/config.py | 1 - src/gentropy/study_locus_validation.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 3293a882a..c56a9dfb3 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -503,7 +503,6 @@ class StudyLocusValidationStepConfig(StepConfig): valid_study_locus_path: str = MISSING invalid_study_locus_path: str = MISSING invalid_qc_reasons: list[str] = MISSING - gwas_significance: float = WindowBasedClumpingStepConfig.gwas_significance _target_: str = "gentropy.study_locus_validation.StudyLocusValidationStep" diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index 287cd5645..fc69f6855 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -19,7 +19,6 @@ def __init__( session: Session, study_index_path: str, study_locus_path: list[str], - gwas_significance: float, valid_study_locus_path: str, invalid_study_locus_path: str, invalid_qc_reasons: list[str] = [], @@ -30,7 +29,6 @@ def __init__( session (Session): Session object. study_index_path (str): Path to study index file. study_locus_path (list[str]): Path to study locus dataset. - gwas_significance (float): GWAS significance threshold. valid_study_locus_path (str): Path to write the valid records. invalid_study_locus_path (str): Path to write the output file. invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']). 
From 51125c77e5d837a049bdf8dc141f40263b301302 Mon Sep 17 00:00:00 2001 From: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:00:54 +0200 Subject: [PATCH 2/2] fix(vep_parser): use nested schema for insilico predictors (#789) --- src/gentropy/common/spark_helpers.py | 52 ++++++++++++++++++- src/gentropy/datasource/ensembl/vep_parser.py | 23 ++++---- tests/gentropy/data_samples/vep_sample.jsonl | 2 + .../datasource/ensembl/test_vep_variants.py | 37 +++++++++++++ 4 files changed, 102 insertions(+), 12 deletions(-) diff --git a/src/gentropy/common/spark_helpers.py b/src/gentropy/common/spark_helpers.py index 791fb913d..680975ef6 100644 --- a/src/gentropy/common/spark_helpers.py +++ b/src/gentropy/common/spark_helpers.py @@ -382,6 +382,8 @@ def order_array_of_structs_by_two_fields( """Sort array of structs by a field in descending order and by an other field in an ascending order. This function doesn't deal with null values, assumes the sort columns are not nullable. + The sorting function compares the descending_column first, in case when two values from descending_column are equal + it compares the ascending_column. When values in both columns are equal, the rows order is preserved. Args: array_name (str): Column name with array of structs @@ -406,6 +408,20 @@ def order_array_of_structs_by_two_fields( |[{1.0, 45, First}, {1.0, 125, Second}, {0.5, 232, Third}, {0.5, 233, Fourth}]| +-----------------------------------------------------------------------------+ + >>> data = [(1.0, 45, 'First'), (1.0, 45, 'Second'), (0.5, 233, 'Fourth'), (1.0, 125, 'Third'),] + >>> ( + ... spark.createDataFrame(data, ['col1', 'col2', 'ranking']) + ... .groupBy(f.lit('c')) + ... .agg(f.collect_list(f.struct('col1','col2', 'ranking')).alias('list')) + ... .select(order_array_of_structs_by_two_fields('list', 'col1', 'col2').alias('sorted_list')) + ... .show(truncate=False) + ... 
) + +----------------------------------------------------------------------------+ + |sorted_list | + +----------------------------------------------------------------------------+ + |[{1.0, 45, First}, {1.0, 45, Second}, {1.0, 125, Third}, {0.5, 233, Fourth}]| + +----------------------------------------------------------------------------+ + """ return f.expr( f""" @@ -425,6 +441,7 @@ def order_array_of_structs_by_two_fields( when left.{descending_column} > right.{descending_column} then -1 when left.{descending_column} == right.{descending_column} and left.{ascending_column} > right.{ascending_column} then 1 when left.{descending_column} == right.{descending_column} and left.{ascending_column} < right.{ascending_column} then -1 + when left.{ascending_column} == right.{ascending_column} and left.{descending_column} == right.{descending_column} then 0 end) """ ) @@ -525,7 +542,7 @@ def get_value_from_row(row: Row, column: str) -> Any: def enforce_schema( - expected_schema: t.StructType, + expected_schema: t.ArrayType | t.StructType | Column | str, ) -> Callable[..., Any]: """A function to enforce the schema of a function output follows expectation. @@ -541,7 +558,7 @@ def my_function() -> t.StructType: return ... Args: - expected_schema (t.StructType): The expected schema of the output. + expected_schema (t.ArrayType | t.StructType | Column | str): The expected schema of the output. Returns: Callable[..., Any]: A decorator function. @@ -687,3 +704,34 @@ def get_standard_error_from_confidence_interval(lower: Column, upper: Column) -> """ return (upper - lower) / (2 * 1.96) + + +def get_nested_struct_schema(dtype: t.DataType) -> t.StructType: + """Get the bottom StructType from a nested ArrayType type. + + Args: + dtype (t.DataType): The nested data structure. + + Returns: + t.StructType: The nested struct schema. + + Raises: + TypeError: If the input data type is not a nested struct. 
+ + Examples: + >>> get_nested_struct_schema(t.ArrayType(t.StructType([t.StructField('a', t.StringType())]))) + StructType([StructField('a', StringType(), True)]) + + >>> get_nested_struct_schema(t.ArrayType(t.ArrayType(t.StructType([t.StructField("a", t.StringType())])))) + StructType([StructField('a', StringType(), True)]) + """ + if isinstance(dtype, t.StructField): + dtype = dtype.dataType + + match dtype: + case t.StructType(fields=_): + return dtype + case t.ArrayType(elementType=dtype): + return get_nested_struct_schema(dtype) + case _: + raise TypeError("The input data type must be a nested struct.") diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py index 4b70a36e6..d84b58407 100644 --- a/src/gentropy/datasource/ensembl/vep_parser.py +++ b/src/gentropy/datasource/ensembl/vep_parser.py @@ -14,6 +14,7 @@ from gentropy.common.schemas import parse_spark_schema from gentropy.common.spark_helpers import ( enforce_schema, + get_nested_struct_schema, map_column_by_dictionary, order_array_of_structs_by_field, order_array_of_structs_by_two_fields, @@ -26,14 +27,16 @@ class VariantEffectPredictorParser: """Collection of methods to parse VEP output in json format.""" + # NOTE: Due to the fact that the comparison of the xrefs is done on the basis of rsids, + # if the field `colocated_variants` has multiple rsids, extracting the xrefs will result in + # an array of xref structs, rather than the struct itself. 
- # Schema description of the dbXref object: DBXREF_SCHEMA = VariantIndex.get_schema()["dbXrefs"].dataType # Schema description of the in silico predictor object: - IN_SILICO_PREDICTOR_SCHEMA = VariantIndex.get_schema()[ - "inSilicoPredictors" - ].dataType + IN_SILICO_PREDICTOR_SCHEMA = get_nested_struct_schema( + VariantIndex.get_schema()["inSilicoPredictors"] + ) # Schema for the allele frequency column: ALLELE_FREQUENCY_SCHEMA = VariantIndex.get_schema()["alleleFrequencies"].dataType @@ -350,12 +353,12 @@ def _get_max_alpha_missense(transcripts: Column) -> Column: ... .select(VariantEffectPredictorParser._get_max_alpha_missense(f.col('transcripts')).alias('am')) ... .show(truncate=False) ... ) - +------------------------------------------------------+ - |am | - +------------------------------------------------------+ - |[{max alpha missense, assessment 1, 0.4, null, gene1}]| - |[{max alpha missense, null, null, null, gene1}] | - +------------------------------------------------------+ + +----------------------------------------------------+ + |am | + +----------------------------------------------------+ + |{max alpha missense, assessment 1, 0.4, null, gene1}| + |{max alpha missense, null, null, null, gene1} | + +----------------------------------------------------+ """ return f.transform( diff --git a/tests/gentropy/data_samples/vep_sample.jsonl b/tests/gentropy/data_samples/vep_sample.jsonl index 2a3cb05dc..ec8ab7dbe 100644 --- a/tests/gentropy/data_samples/vep_sample.jsonl +++ b/tests/gentropy/data_samples/vep_sample.jsonl @@ -1,2 +1,4 @@ 
{"most_severe_consequence":"missense_variant","input":"17\t29510931\trs2153029597\tT\tC","assembly_name":"GRCh38","transcript_consequences":[{"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000238007","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":1,"canonical":1,"impact":"MODIFIER","tssdistance":498066,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000436028","distance":494419},{"hgvsg":"17:g.29510931T>C","consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000222363","strand":1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9,"variant_allele":"C","tssdistance":122371,"transcript_id":"ENST00000410431","distance":122248,"cadd_raw":5.156509},{"cadd_raw":5.156509,"transcript_id":"ENST00000581240","distance":128696,"tssdistance":128696,"variant_allele":"C","cadd_phred":28.9,"strand":1,"impact":"MODIFIER","canonical":1,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000263370","hgvsg":"17:g.29510931T>C"},{"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000264007","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":-1,"impact":"MODIFIER","canonical":1,"tssdistance":111323,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000582367","distance":110686},{"cadd_raw":5.156509,"distance":49616,"transcript_id":"ENST00000307201","appris":"P1","tssdistance":56106,"uniparc":["UPI00001C1FC9"],"swissprot":["Q6UXT9.120"],"variant_allele":"C","cadd_phred":28.9,"mane_select":"NM_198147.3","impact":"MODIFIER","canonical":1,"strand":-1,"gene_id":"ENSG00000168792","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C"},{"strand":1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000264290","transcript_id":"ENST00000579050","distance":58649,"cadd_raw":5.156509,"variant_allele":"C","tssdistance":58649},{"strand":1,"canonical":1,"impact":"MODIFIER","cadd_phred":28.9,"hgvsg":"17:
g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000263657","transcript_id":"ENST00000577846","distance":250172,"cadd_raw":5.156509,"variant_allele":"C","tssdistance":250172},{"variant_allele":"C","swissprot":["Q6QEF8.143"],"uniparc":["UPI0000DA4C55"],"tssdistance":111981,"transcript_id":"ENST00000388767","distance":103831,"cadd_raw":5.156509,"uniprot_isoform":["Q6QEF8-5"],"hgvsg":"17:g.29510931T>C","consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000167549","strand":-1,"canonical":1,"impact":"MODIFIER","mane_select":"NM_032854.4","cadd_phred":28.9},{"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000263781","strand":-1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9,"variant_allele":"C","tssdistance":489291,"transcript_id":"ENST00000580924","distance":489291,"cadd_raw":5.156509},{"strand":1,"impact":"MODIFIER","canonical":1,"mane_select":"NM_198529.4","cadd_phred":28.9,"uniprot_isoform":["A4FU69-1"],"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000176927","appris":"P1","transcript_id":"ENST00000394835","distance":430703,"cadd_raw":5.156509,"variant_allele":"C","swissprot":["A4FU69.119"],"uniparc":["UPI0000E59EF5"],"tssdistance":430703},{"cdna_start":2399,"tssdistance":120568,"amino_acids":"L/P","swissprot":["Q7L7X3.173"],"transcript_id":"ENST00000261716","appris":"P1","consequence_terms":["missense_variant"],"trembl":["A0A024QZ70.65"],"cds_start":1643,"mane_select":"NM_020791.4","cadd_phred":28.9,"strand":1,"cds_end":1643,"impact":"MODERATE","canonical":1,"uniparc":["UPI000004A033"],"variant_allele":"C","cadd_raw":5.156509,"sift_score":0,"protein_start":548,"cdna_end":2399,"gene_id":"ENSG00000160551","uniprot_isoform":["Q7L7X3-1"],"codons":"cTg/cCg","sift_prediction":"deleterious_low_confidence","hgvsg":"17:g.29510931T>C","alphamissense":{"am_class":"likely_pathogenic","am_pathogenicity":0.9994},"protein_end":548,"polyphen_score":0.
999,"polyphen_prediction":"probably_damaging"},{"uniparc":["UPI0000246D82"],"tssdistance":82201,"variant_allele":"C","swissprot":["Q86YJ7.146"],"cadd_raw":5.156509,"transcript_id":"ENST00000394859","appris":"P1","distance":82201,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000198720","uniprot_isoform":["Q86YJ7-1"],"hgvsg":"17:g.29510931T>C","trembl":["A0A024QZ29.60"],"mane_select":"NM_152345.5","cadd_phred":28.9,"strand":1,"impact":"MODIFIER","canonical":1},{"transcript_id":"ENST00000459235","distance":372075,"cadd_raw":5.156509,"variant_allele":"C","tssdistance":372075,"strand":1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000239129"},{"tssdistance":205754,"variant_allele":"C","cadd_raw":5.156509,"distance":205348,"transcript_id":"ENST00000493028","gene_id":"ENSG00000240531","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"impact":"MODIFIER","canonical":1,"strand":-1},{"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000284162","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":1,"canonical":1,"impact":"MODIFIER","tssdistance":120269,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000580425","distance":120201},{"tssdistance":49616,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000581474","distance":49616,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000264031","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":1,"canonical":1,"impact":"MODIFIER"},{"hgvsg":"17:g.29510931T>C","gene_id":"ENSG00000222858","consequence_terms":["downstream_gene_variant"],"impact":"MODIFIER","canonical":1,"strand":-1,"cadd_phred":28.9,"variant_allele":"C","tssdistance":130771,"distance":130680,"transcript_id":"ENST00000410926","cadd_raw":5.156509},{"tssdistance":461583,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000581995","distance":461583
,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000264435","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":1,"canonical":1,"impact":"MODIFIER"},{"gene_id":"ENSG00000179761","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C","mane_select":"NM_016518.3","cadd_phred":28.9,"canonical":1,"impact":"MODIFIER","strand":1,"tssdistance":467790,"uniparc":["UPI00001410B0"],"swissprot":["Q9P0Z9.165"],"variant_allele":"C","cadd_raw":5.156509,"distance":453715,"appris":"P1","transcript_id":"ENST00000323372"},{"strand":-1,"canonical":1,"impact":"MODIFIER","cadd_phred":28.9,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000253064","transcript_id":"ENST00000517255","distance":86064,"cadd_raw":5.156509,"variant_allele":"C","tssdistance":86064},{"tssdistance":313146,"variant_allele":"C","cadd_raw":5.156509,"distance":313146,"transcript_id":"ENST00000580309","gene_id":"ENSG00000264050","consequence_terms":["upstream_gene_variant"],"hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"canonical":1,"impact":"MODIFIER","strand":-1},{"variant_allele":"C","tssdistance":134916,"transcript_id":"ENST00000582881","distance":133865,"cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000265625","strand":-1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9},{"hgvsg":"17:g.29510931T>C","gene_id":"ENSG00000240074","consequence_terms":["upstream_gene_variant"],"impact":"MODIFIER","canonical":1,"strand":1,"cadd_phred":28.9,"variant_allele":"C","tssdistance":344828,"distance":344828,"transcript_id":"ENST00000478775","cadd_raw":5.156509},{"cadd_phred":28.9,"canonical":1,"impact":"MODIFIER","strand":-1,"gene_id":"ENSG00000264647","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C","cadd_raw":5.156509,"distance":80772,"transcript_id":"ENST00000584986","tssdistance":81310,"variant_allele":"C"},{"variant_allele":"C","tssdistance":65531,"transcri
pt_id":"ENST00000365335","distance":65531,"cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000202205","strand":-1,"canonical":1,"impact":"MODIFIER","cadd_phred":28.9},{"gene_id":"ENSG00000266111","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"impact":"MODIFIER","canonical":1,"strand":1,"tssdistance":158862,"variant_allele":"C","cadd_raw":5.156509,"distance":106826,"transcript_id":"ENST00000584958"},{"variant_allele":"C","tssdistance":265770,"distance":264816,"transcript_id":"ENST00000580031","cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","gene_id":"ENSG00000265713","consequence_terms":["downstream_gene_variant"],"canonical":1,"impact":"MODIFIER","strand":-1,"cadd_phred":28.9},{"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000108255","uniprot_isoform":["P05813-1"],"hgvsg":"17:g.29510931T>C","mane_select":"NM_005208.5","cadd_phred":28.9,"strand":1,"impact":"MODIFIER","canonical":1,"uniparc":["UPI00001283CF"],"tssdistance":264072,"variant_allele":"C","swissprot":["P05813.205"],"cadd_raw":5.156509,"transcript_id":"ENST00000225387","appris":"P1","distance":256437},{"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000264808","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":-1,"canonical":1,"impact":"MODIFIER","tssdistance":120702,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000685798","distance":120702},{"cadd_phred":28.9,"strand":-1,"canonical":1,"impact":"MODIFIER","consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000290082","hgvsg":"17:g.29510931T>C","cadd_raw":5.156509,"transcript_id":"ENST00000702873","distance":56892,"tssdistance":57602,"variant_allele":"C"},{"mane_select":"NM_078471.4","cadd_phred":28.9,"strand":-1,"canonical":1,"impact":"MODIFIER","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000196535","uniprot_isoform":["Q92614-1"],"hgvsg":"17:g.29510931T>C","cadd_raw"
:5.156509,"appris":"P4","transcript_id":"ENST00000527372","distance":330533,"uniparc":["UPI0000167F32"],"tssdistance":330533,"variant_allele":"C","swissprot":["Q92614.216"]},{"cadd_raw":5.156509,"transcript_id":"ENST00000492004","distance":170109,"tssdistance":170109,"variant_allele":"C","cadd_phred":28.9,"strand":-1,"impact":"MODIFIER","canonical":1,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000239256","hgvsg":"17:g.29510931T>C"},{"gene_id":"ENSG00000263709","consequence_terms":["downstream_gene_variant"],"hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"canonical":1,"impact":"MODIFIER","strand":1,"tssdistance":370448,"variant_allele":"C","cadd_raw":5.156509,"distance":355442,"transcript_id":"ENST00000582196"},{"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000252657","hgvsg":"17:g.29510931T>C","cadd_phred":28.9,"strand":1,"impact":"MODIFIER","canonical":1,"tssdistance":266688,"variant_allele":"C","cadd_raw":5.156509,"transcript_id":"ENST00000516848","distance":266688},{"trembl":["F5H527.88"],"mane_select":"NM_001282129.2","cadd_phred":28.9,"strand":-1,"impact":"MODIFIER","canonical":1,"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000141298","hgvsg":"17:g.29510931T>C","cadd_raw":5.156509,"transcript_id":"ENST00000540801","appris":"A1","distance":115007,"uniparc":["UPI0002065A97"],"tssdistance":419297,"variant_allele":"C"},{"tssdistance":216783,"uniparc":["UPI00001B078D"],"swissprot":["Q7Z417.159"],"variant_allele":"C","cadd_raw":5.156509,"distance":216783,"transcript_id":"ENST00000225388","appris":"P1","gene_id":"ENSG00000108256","consequence_terms":["upstream_gene_variant"],"hgvsg":"17:g.29510931T>C","uniprot_isoform":["Q7Z417-1"],"mane_select":"NM_020772.3","cadd_phred":28.9,"impact":"MODIFIER","canonical":1,"strand":-1},{"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000108262","uniprot_isoform":["Q9Y2X7-1"],"hgvsg":"17:g.29510931T>C","mane_select":"NM_014030.4","cadd_phred":28.9,"strand":-1,"i
mpact":"MODIFIER","canonical":1,"uniparc":["UPI000013C867"],"tssdistance":78717,"variant_allele":"C","swissprot":["Q9Y2X7.219"],"cadd_raw":5.156509,"transcript_id":"ENST00000225394","appris":"A1","distance":62544},{"variant_allele":"C","tssdistance":499922,"transcript_id":"ENST00000581964","distance":499922,"cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000263613","strand":-1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9},{"variant_allele":"C","tssdistance":306457,"transcript_id":"ENST00000580812","distance":306457,"cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000178082","strand":-1,"impact":"MODIFIER","canonical":1,"cadd_phred":28.9},{"variant_allele":"C","tssdistance":352471,"distance":352471,"transcript_id":"ENST00000584258","cadd_raw":5.156509,"hgvsg":"17:g.29510931T>C","gene_id":"ENSG00000263477","consequence_terms":["upstream_gene_variant"],"canonical":1,"impact":"MODIFIER","strand":1,"cadd_phred":28.9},{"cadd_raw":5.156509,"transcript_id":"ENST00000301057","appris":"P1","distance":57770,"uniparc":["UPI000003B08D"],"tssdistance":57770,"variant_allele":"C","swissprot":["Q8NBR0.130"],"mane_select":"NM_138349.4","cadd_phred":28.9,"strand":1,"impact":"MODIFIER","canonical":1,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000167543","hgvsg":"17:g.29510931T>C"}],"allele_string":"T/C","seq_region_name":"17","strand":1,"end":29510931,"start":29510931,"colocated_variants":[{"clin_sig":["pathogenic"],"clin_sig_allele":"C:pathogenic","phenotype_or_disease":1,"strand":1,"allele_string":"T/C","start":29510931,"id":"rs2153029597","seq_region_name":"17","pubmed":[33565190],"end":29510931,"var_synonyms":{"ClinVar":["RCV001731168","VCV001300172"],"OMIM":[610266.0003]}}],"id":"rs2153029597"} 
{"strand":1,"seq_region_name":"9","allele_string":"C/T","transcript_consequences":[{"hgvsg":"9:g.82445881C>T","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000228046","strand":-1,"impact":"MODIFIER","canonical":1,"cadd_phred":7.002,"variant_allele":"T","tssdistance":17856,"transcript_id":"ENST00000392516","distance":17856,"cadd_raw":0.6583},{"cadd_phred":7.002,"impact":"MODIFIER","canonical":1,"strand":1,"gene_id":"ENSG00000225085","consequence_terms":["downstream_gene_variant"],"hgvsg":"9:g.82445881C>T","cadd_raw":0.6583,"distance":39642,"transcript_id":"ENST00000436084","tssdistance":40693,"variant_allele":"T"},{"cadd_raw":0.6583,"transcript_id":"ENST00000637606","tssdistance":468267,"variant_allele":"T","cadd_phred":7.002,"canonical":1,"impact":"MODIFIER","strand":1,"gene_id":"ENSG00000290551","consequence_terms":["intron_variant","non_coding_transcript_variant"],"hgvsg":"9:g.82445881C>T"},{"gene_id":"ENSG00000278988","consequence_terms":["upstream_gene_variant"],"hgvsg":"9:g.82445881C>T","cadd_phred":7.002,"canonical":1,"impact":"MODIFIER","strand":1,"tssdistance":97837,"variant_allele":"T","cadd_raw":0.6583,"distance":97837,"transcript_id":"ENST00000623079"},{"swissprot":["Q6ZQQ2.115"],"variant_allele":"T","tssdistance":457109,"uniparc":["UPI00001C10A6"],"distance":450628,"appris":"P1","transcript_id":"ENST00000344803","cadd_raw":0.6583,"hgvsg":"9:g.82445881C>T","gene_id":"ENSG00000214929","consequence_terms":["downstream_gene_variant"],"impact":"MODIFIER","canonical":1,"strand":1,"cadd_phred":7.002,"mane_select":"NM_001001670.3"},{"cadd_phred":7.002,"impact":"MODIFIER","canonical":1,"strand":-1,"gene_id":"ENSG00000230360","consequence_terms":["upstream_gene_variant"],"hgvsg":"9:g.82445881C>T","cadd_raw":0.6583,"distance":357113,"transcript_id":"ENST00000417796","tssdistance":357113,"variant_allele":"T"},{"cadd_raw":0.6583,"transcript_id":"ENST00000422010","distance":5775,"tssdistance":5775,"variant_allele":"T","cadd_phred":7.002,"strand":1,"ca
nonical":1,"impact":"MODIFIER","consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000232749","hgvsg":"9:g.82445881C>T"},{"tssdistance":12976,"variant_allele":"T","cadd_raw":0.6583,"transcript_id":"ENST00000438986","distance":12976,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000228123","hgvsg":"9:g.82445881C>T","cadd_phred":7.002,"strand":-1,"canonical":1,"impact":"MODIFIER"},{"tssdistance":382199,"variant_allele":"T","cadd_raw":0.6583,"transcript_id":"ENST00000434692","distance":382199,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000231649","hgvsg":"9:g.82445881C>T","cadd_phred":7.002,"strand":-1,"canonical":1,"impact":"MODIFIER"},{"tssdistance":298911,"variant_allele":"T","cadd_raw":0.6583,"transcript_id":"ENST00000432491","distance":298186,"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000233309","hgvsg":"9:g.82445881C>T","cadd_phred":7.002,"strand":-1,"canonical":1,"impact":"MODIFIER"},{"canonical":1,"impact":"MODIFIER","strand":1,"cadd_phred":7.002,"hgvsg":"9:g.82445881C>T","gene_id":"ENSG00000235377","consequence_terms":["downstream_gene_variant"],"distance":314249,"transcript_id":"ENST00000445918","cadd_raw":0.6583,"variant_allele":"T","tssdistance":315129},{"variant_allele":"T","tssdistance":17682,"distance":17588,"transcript_id":"ENST00000636401","cadd_raw":0.6583,"hgvsg":"9:g.82445881C>T","gene_id":"ENSG00000228430","consequence_terms":["downstream_gene_variant"],"impact":"MODIFIER","canonical":1,"strand":1,"cadd_phred":7.002},{"cadd_raw":0.6583,"transcript_id":"ENST00000585776","distance":468792,"tssdistance":468792,"variant_allele":"T","cadd_phred":7.002,"strand":-1,"impact":"MODIFIER","canonical":1,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000267559","hgvsg":"9:g.82445881C>T"},{"impact":"MODIFIER","canonical":1,"strand":1,"cadd_phred":7.002,"hgvsg":"9:g.82445881C>T","gene_id":"ENSG00000237770","consequence_terms":["downstream_gene_variant"],"distance":473751,"transcri
pt_id":"ENST00000429999","cadd_raw":0.6583,"variant_allele":"T","tssdistance":479317},{"distance":166671,"transcript_id":"ENST00000661177","cadd_raw":0.6583,"variant_allele":"T","tssdistance":200703,"impact":"MODIFIER","canonical":1,"strand":-1,"cadd_phred":7.002,"hgvsg":"9:g.82445881C>T","gene_id":"ENSG00000286612","consequence_terms":["downstream_gene_variant"]},{"mane_select":"NM_207416.3","cadd_phred":7.002,"impact":"MODIFIER","canonical":1,"strand":1,"gene_id":"ENSG00000186788","consequence_terms":["downstream_gene_variant"],"hgvsg":"9:g.82445881C>T","cadd_raw":0.6583,"distance":495788,"appris":"P1","transcript_id":"ENST00000445385","tssdistance":502381,"uniparc":["UPI000048D678"],"swissprot":["P0C874.81"],"variant_allele":"T"}],"assembly_name":"GRCh38","input":"9\t82445881\t9_82445881_C_T\tC\tT","most_severe_consequence":"intron_variant","id":"9_82445881_C_T","colocated_variants":[{"phenotype_or_disease":1,"strand":1,"allele_string":"C/G/T","frequencies":{"T":{"gnomadg":0.01197,"gnomadg_amr":0.0191,"gnomadg_afr":0.003331,"gnomadg_asj":0.02364,"eas":0,"amr":0.0216,"gnomadg_eas":0,"sas":0,"gnomadg_nfe":0.01704,"gnomadg_fin":0.009992,"gnomadg_mid":0.006329,"afr":0,"gnomadg_oth":0.01772,"gnomadg_ami":0.003289,"af":0.0068,"eur":0.0189,"gnomadg_sas":0.0004142}},"start":82445881,"id":"rs117517710","seq_region_name":"9","pubmed":[31073882],"end":82445881}],"end":82445881,"start":82445881} 
+{"assembly_name":"GRCh38","seq_region_name":"20","id":"rs1555828246","start":10645397,"strand":1,"transcript_consequences":[{"hgvsg":"20:g.10645397C>T","canonical":1,"strand":-1,"distance":295769,"consequence_terms":["upstream_gene_variant"],"cadd_raw":4.925023,"cadd_phred":27.5,"variant_allele":"T","gene_id":"ENSG00000214835","impact":"MODIFIER","transcript_id":"ENST00000446637","tssdistance":295769},{"impact":"MODIFIER","transcript_id":"ENST00000417299","tssdistance":51552,"gene_id":"ENSG00000224961","distance":51552,"consequence_terms":["upstream_gene_variant"],"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"canonical":1,"hgvsg":"20:g.10645397C>T","strand":1},{"consequence_terms":["upstream_gene_variant"],"distance":355907,"variant_allele":"T","cadd_phred":27.5,"cadd_raw":4.925023,"strand":1,"hgvsg":"20:g.10645397C>T","canonical":1,"transcript_id":"ENST00000605338","impact":"MODIFIER","tssdistance":355907,"gene_id":"ENSG00000270777"},{"gene_id":"ENSG00000235036","transcript_id":"ENST00000456064","tssdistance":31168,"impact":"MODIFIER","canonical":1,"hgvsg":"20:g.10645397C>T","strand":-1,"distance":31168,"consequence_terms":["upstream_gene_variant"],"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023},{"hgvsg":"20:g.10645397C>T","canonical":1,"strand":-1,"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"distance":107881,"consequence_terms":["downstream_gene_variant"],"gene_id":"ENSG00000234900","transcript_id":"ENST00000418690","impact":"MODIFIER","tssdistance":119889},{"strand":1,"hgvsg":"20:g.10645397C>T","canonical":1,"cadd_raw":4.925023,"cadd_phred":27.5,"variant_allele":"T","consequence_terms":["downstream_gene_variant"],"distance":444573,"gene_id":"ENSG00000230506","transcript_id":"ENST00000662058","impact":"MODIFIER","tssdistance":472875},{"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"consequence_terms":["upstream_gene_variant"],"distance":211175,"strand":-1,"canonical":1,"hgvsg":"20:g.10645397C>T","uniparc":["UPI
0000D483F7"],"mane_select":"NM_001394149.2","tssdistance":211175,"transcript_id":"ENST00000713549","impact":"MODIFIER","gene_id":"ENSG00000285508"},{"impact":"MODIFIER","transcript_id":"ENST00000666915","tssdistance":472696,"gene_id":"ENSG00000232448","consequence_terms":["downstream_gene_variant"],"distance":458657,"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"strand":1,"canonical":1,"hgvsg":"20:g.10645397C>T"},{"consequence_terms":["downstream_gene_variant"],"distance":310699,"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"strand":1,"canonical":1,"hgvsg":"20:g.10645397C>T","transcript_id":"ENST00000416198","impact":"MODIFIER","tssdistance":310978,"gene_id":"ENSG00000237005"},{"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"distance":229936,"consequence_terms":["downstream_gene_variant"],"hgvsg":"20:g.10645397C>T","canonical":1,"strand":-1,"tssdistance":263882,"transcript_id":"ENST00000448859","impact":"MODIFIER","gene_id":"ENSG00000232900"},{"impact":"MODIFIER","transcript_id":"ENST00000649912","tssdistance":211175,"appris":"P1","gene_id":"ENSG00000285723","trembl":["Q9HB66.115"],"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"distance":211175,"consequence_terms":["upstream_gene_variant"],"canonical":1,"hgvsg":"20:g.10645397C>T","strand":-1,"mane_select":"NM_001394148.2","uniparc":["UPI000006FBAA"]},{"transcript_id":"ENST00000347364","tssdistance":211175,"impact":"MODIFIER","appris":"P1","gene_id":"ENSG00000125863","cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"swissprot":["Q9NPJ1.195"],"consequence_terms":["upstream_gene_variant"],"distance":211175,"strand":-1,"canonical":1,"hgvsg":"20:g.10645397C>T","uniparc":["UPI000012F199"],"mane_select":"NM_170784.3"},{"tssdistance":4298,"transcript_id":"ENST00000615931","impact":"MODIFIER","gene_id":"ENSG00000273745","cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"distance":4239,"consequence_terms":["downstream_gene_variant"],"canonical":1,"hgvsg":"2
0:g.10645397C>T","strand":-1},{"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"consequence_terms":["downstream_gene_variant"],"distance":257591,"strand":1,"hgvsg":"20:g.10645397C>T","canonical":1,"impact":"MODIFIER","transcript_id":"ENST00000441308","tssdistance":259618,"gene_id":"ENSG00000230750"},{"consequence_terms":["downstream_gene_variant"],"distance":17367,"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"swissprot":["Q5VYV7.120"],"uniparc":["UPI00001D8318"],"mane_select":"NM_001009608.3","strand":1,"hgvsg":"20:g.10645397C>T","canonical":1,"appris":"P1","transcript_id":"ENST00000334534","impact":"MODIFIER","tssdistance":210092,"gene_id":"ENSG00000149346"},{"gene_id":"ENSG00000125899","transcript_id":"ENST00000659767","tssdistance":350896,"impact":"MODIFIER","hgvsg":"20:g.10645397C>T","canonical":1,"strand":1,"cadd_raw":4.925023,"cadd_phred":27.5,"variant_allele":"T","distance":350896,"consequence_terms":["upstream_gene_variant"]},{"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"distance":335492,"consequence_terms":["downstream_gene_variant"],"canonical":1,"hgvsg":"20:g.10645397C>T","strand":1,"transcript_id":"ENST00000688853","tssdistance":336975,"impact":"MODIFIER","gene_id":"ENSG00000289505"},{"appris":"P3","impact":"MODIFIER","transcript_id":"ENST00000254976","tssdistance":426567,"gene_id":"ENSG00000132639","uniprot_isoform":["P60880-1"],"distance":337979,"consequence_terms":["downstream_gene_variant"],"swissprot":["P60880.188"],"cadd_raw":4.925023,"cadd_phred":27.5,"variant_allele":"T","mane_select":"NM_130811.4","uniparc":["UPI0000001103"],"hgvsg":"20:g.10645397C>T","canonical":1,"strand":1},{"strand":1,"canonical":1,"hgvsg":"20:g.10645397C>T","consequence_terms":["upstream_gene_variant"],"distance":27531,"cadd_raw":4.925023,"variant_allele":"T","cadd_phred":27.5,"gene_id":"ENSG00000270792","transcript_id":"ENST00000605292","impact":"MODIFIER","tssdistance":27531},{"hgvsg":"20:g.10645397C>T","canonical":1,"strand":-1,"di
stance":276678,"consequence_terms":["upstream_gene_variant"],"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"gene_id":"ENSG00000227906","impact":"MODIFIER","transcript_id":"ENST00000421143","tssdistance":276678},{"distance":243185,"consequence_terms":["downstream_gene_variant"],"variant_allele":"T","cadd_phred":27.5,"cadd_raw":4.925023,"canonical":1,"hgvsg":"20:g.10645397C>T","strand":-1,"tssdistance":243691,"transcript_id":"ENST00000406588","impact":"MODIFIER","gene_id":"ENSG00000217809"},{"uniprot_isoform":["P78504-1"],"cdna_end":2541,"appris":"P1","transcript_id":"ENST00000254958","impact":"MODERATE","tssdistance":28602,"sift_prediction":"deleterious","codons":"tGt/tAt","uniparc":["UPI00000498B5"],"strand":-1,"canonical":1,"swissprot":["P78504.228"],"cds_start":2072,"gene_id":"ENSG00000101384","cds_end":2072,"protein_start":691,"sift_score":0,"mane_select":"NM_000214.3","hgvsg":"20:g.10645397C>T","consequence_terms":["missense_variant"],"protein_end":691,"cadd_raw":4.925023,"cadd_phred":27.5,"variant_allele":"T","cdna_start":2541,"amino_acids":"C/Y"},{"canonical":1,"hgvsg":"20:g.10645397C>T","strand":-1,"cadd_phred":27.5,"variant_allele":"T","cadd_raw":4.925023,"distance":326644,"consequence_terms":["upstream_gene_variant"],"gene_id":"ENSG00000286936","transcript_id":"ENST00000664194","tssdistance":326644,"impact":"MODIFIER"}],"input":"20\t10645397\trs1555828246\tC\tT\t.\t.\t.","allele_string":"C/T","end":10645397,"colocated_variants":[{"end":10645397,"allele_string":"C/T","strand":1,"id":"rs1555828246","seq_region_name":"20","start":10645397},{"end":10645397,"allele_string":"C/T","phenotype_or_disease":1,"clin_sig":["uncertain_significance"],"pubmed":[26076142,21752016],"strand":1,"clin_sig_allele":"T:uncertain_significance","start":10645397,"seq_region_name":"20","var_synonyms":{"ClinVar":["RCV002260566","VCV001693298"]}}],"most_severe_consequence":"missense_variant"} 
+{"input":"20\t10649087\trs863223652\tG\tA\t.\t.\t.","transcript_consequences":[{"strand":1,"canonical":1,"hgvsg":"20:g.10649087G>A","cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"consequence_terms":["upstream_gene_variant"],"distance":352217,"gene_id":"ENSG00000270777","transcript_id":"ENST00000605338","impact":"MODIFIER","tssdistance":352217},{"impact":"MODIFIER","transcript_id":"ENST00000417299","tssdistance":47862,"gene_id":"ENSG00000224961","cadd_raw":9.333171,"cadd_phred":40,"variant_allele":"A","distance":47862,"consequence_terms":["upstream_gene_variant"],"hgvsg":"20:g.10649087G>A","canonical":1,"strand":1},{"gene_id":"ENSG00000214835","impact":"MODIFIER","transcript_id":"ENST00000446637","tssdistance":299459,"strand":-1,"hgvsg":"20:g.10649087G>A","canonical":1,"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"consequence_terms":["upstream_gene_variant"],"distance":299459},{"transcript_id":"ENST00000456064","impact":"MODIFIER","tssdistance":34858,"gene_id":"ENSG00000235036","distance":34858,"consequence_terms":["upstream_gene_variant"],"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"canonical":1,"hgvsg":"20:g.10649087G>A","strand":-1},{"transcript_id":"ENST00000662058","tssdistance":476565,"impact":"MODIFIER","gene_id":"ENSG00000230506","consequence_terms":["downstream_gene_variant"],"distance":448263,"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"strand":1,"canonical":1,"hgvsg":"20:g.10649087G>A"},{"impact":"MODIFIER","transcript_id":"ENST00000418690","tssdistance":116199,"gene_id":"ENSG00000234900","cadd_raw":9.333171,"variant_allele":"A","cadd_phred":40,"distance":104191,"consequence_terms":["downstream_gene_variant"],"hgvsg":"20:g.10649087G>A","canonical":1,"strand":-1},{"gene_id":"ENSG00000232448","transcript_id":"ENST00000666915","tssdistance":476386,"impact":"MODIFIER","hgvsg":"20:g.10649087G>A","canonical":1,"strand":1,"cadd_raw":9.333171,"cadd_phred":40,"variant_allele":"A","distance":462347,"consequence_terms
":["downstream_gene_variant"]},{"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"distance":214865,"consequence_terms":["upstream_gene_variant"],"hgvsg":"20:g.10649087G>A","canonical":1,"strand":-1,"mane_select":"NM_001394149.2","uniparc":["UPI0000D483F7"],"impact":"MODIFIER","transcript_id":"ENST00000713549","tssdistance":214865,"gene_id":"ENSG00000285508"},{"transcript_id":"ENST00000416198","impact":"MODIFIER","tssdistance":314668,"gene_id":"ENSG00000237005","consequence_terms":["downstream_gene_variant"],"distance":314389,"cadd_raw":9.333171,"cadd_phred":40,"variant_allele":"A","strand":1,"canonical":1,"hgvsg":"20:g.10649087G>A"},{"gene_id":"ENSG00000289505","tssdistance":340665,"transcript_id":"ENST00000688853","impact":"MODIFIER","hgvsg":"20:g.10649087G>A","canonical":1,"strand":1,"distance":339182,"consequence_terms":["downstream_gene_variant"],"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171},{"impact":"MODIFIER","transcript_id":"ENST00000659767","tssdistance":347206,"gene_id":"ENSG00000125899","distance":347206,"consequence_terms":["upstream_gene_variant"],"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"hgvsg":"20:g.10649087G>A","canonical":1,"strand":1},{"consequence_terms":["upstream_gene_variant"],"distance":214865,"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"swissprot":["Q9NPJ1.195"],"uniparc":["UPI000012F199"],"mane_select":"NM_170784.3","strand":-1,"hgvsg":"20:g.10649087G>A","canonical":1,"appris":"P1","transcript_id":"ENST00000347364","tssdistance":214865,"impact":"MODIFIER","gene_id":"ENSG00000125863"},{"gene_id":"ENSG00000285723","appris":"P1","tssdistance":214865,"transcript_id":"ENST00000649912","impact":"MODIFIER","mane_select":"NM_001394148.2","uniparc":["UPI000006FBAA"],"hgvsg":"20:g.10649087G>A","canonical":1,"strand":-1,"distance":214865,"consequence_terms":["upstream_gene_variant"],"trembl":["Q9HB66.115"],"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171},{"strand":-1,"hgvsg":"20:g.10649087G>A","
canonical":1,"consequence_terms":["downstream_gene_variant"],"distance":549,"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"gene_id":"ENSG00000273745","impact":"MODIFIER","transcript_id":"ENST00000615931","tssdistance":608},{"consequence_terms":["downstream_gene_variant"],"distance":261281,"cadd_raw":9.333171,"variant_allele":"A","cadd_phred":40,"strand":1,"canonical":1,"hgvsg":"20:g.10649087G>A","transcript_id":"ENST00000441308","tssdistance":263308,"impact":"MODIFIER","gene_id":"ENSG00000230750"},{"strand":-1,"hgvsg":"20:g.10649087G>A","canonical":1,"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"consequence_terms":["downstream_gene_variant"],"distance":226246,"gene_id":"ENSG00000232900","transcript_id":"ENST00000448859","impact":"MODIFIER","tssdistance":260192},{"appris":"P1","transcript_id":"ENST00000334534","tssdistance":213782,"impact":"MODIFIER","gene_id":"ENSG00000149346","consequence_terms":["downstream_gene_variant"],"distance":21057,"cadd_raw":9.333171,"variant_allele":"A","cadd_phred":40,"swissprot":["Q5VYV7.120"],"uniparc":["UPI00001D8318"],"mane_select":"NM_001009608.3","strand":1,"hgvsg":"20:g.10649087G>A","canonical":1},{"consequence_terms":["downstream_gene_variant"],"distance":341669,"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"swissprot":["P60880.188"],"uniparc":["UPI0000001103"],"mane_select":"NM_130811.4","strand":1,"canonical":1,"hgvsg":"20:g.10649087G>A","appris":"P3","transcript_id":"ENST00000254976","tssdistance":430257,"impact":"MODIFIER","gene_id":"ENSG00000132639","uniprot_isoform":["P60880-1"]},{"gene_id":"ENSG00000227906","transcript_id":"ENST00000421143","impact":"MODIFIER","tssdistance":280368,"strand":-1,"canonical":1,"hgvsg":"20:g.10649087G>A","cadd_raw":9.333171,"variant_allele":"A","cadd_phred":40,"consequence_terms":["upstream_gene_variant"],"distance":280368},{"consequence_terms":["downstream_gene_variant"],"distance":239495,"cadd_phred":40,"variant_allele":"A","cadd_raw":9.333171,"strand":-1,"ca
nonical":1,"hgvsg":"20:g.10649087G>A","transcript_id":"ENST00000406588","tssdistance":240001,"impact":"MODIFIER","gene_id":"ENSG00000217809"},{"strand":1,"hgvsg":"20:g.10649087G>A","canonical":1,"consequence_terms":["upstream_gene_variant"],"distance":23841,"variant_allele":"A","cadd_phred":40,"cadd_raw":9.333171,"gene_id":"ENSG00000270792","impact":"MODIFIER","transcript_id":"ENST00000605292","tssdistance":23841},{"gene_id":"ENSG00000286936","transcript_id":"ENST00000664194","impact":"MODIFIER","tssdistance":330334,"strand":-1,"canonical":1,"hgvsg":"20:g.10649087G>A","cadd_raw":9.333171,"cadd_phred":40,"variant_allele":"A","consequence_terms":["upstream_gene_variant"],"distance":330334},{"transcript_id":"ENST00000254958","impact":"HIGH","tssdistance":24912,"lof_info":"PERCENTILE:0.374350560568772,GERP_DIST:2349.53755103406,BP_DIST:2273,DIST_FROM_LAST_EXON:1816,50_BP_RULE:PASS,ANN_ORF:237.018,MAX_ORF:237.018","codons":"Cag/Tag","cdna_end":1838,"appris":"P1","uniprot_isoform":["P78504-1"],"swissprot":["P78504.228"],"lof":"HC","strand":-1,"canonical":1,"uniparc":["UPI00000498B5"],"cds_end":1369,"protein_start":457,"cds_start":1369,"gene_id":"ENSG00000101384","cadd_raw":9.333171,"variant_allele":"A","cadd_phred":40,"cdna_start":1838,"amino_acids":"Q/*","consequence_terms":["stop_gained"],"protein_end":457,"hgvsg":"20:g.10649087G>A","mane_select":"NM_000214.3"}],"strand":1,"colocated_variants":[{"strand":1,"start":10649087,"seq_region_name":"20","id":"rs1555828721","end":10649087,"allele_string":"G/A"},{"id":"rs863223652","var_synonyms":{"ClinVar":["RCV002383649","VCV001770992"]},"seq_region_name":"20","clin_sig_allele":"A:pathogenic","start":10649087,"strand":1,"clin_sig":["pathogenic"],"phenotype_or_disease":1,"allele_string":"G/A","end":10649087}],"most_severe_consequence":"stop_gained","allele_string":"G/A","end":10649087,"id":"rs863223652","assembly_name":"GRCh38","seq_region_name":"20","start":10649087} diff --git 
a/tests/gentropy/datasource/ensembl/test_vep_variants.py b/tests/gentropy/datasource/ensembl/test_vep_variants.py index 97f255cf0..5757fa2f5 100644 --- a/tests/gentropy/datasource/ensembl/test_vep_variants.py +++ b/tests/gentropy/datasource/ensembl/test_vep_variants.py @@ -7,6 +7,7 @@ import pytest from pyspark.sql import DataFrame from pyspark.sql import functions as f +from pyspark.sql import types as t from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser @@ -118,6 +119,21 @@ def test_extract_variant_index_from_vep( assert isinstance( variant_index, VariantIndex ), "VariantIndex object not created." + in_silico_schema = t.ArrayType( + t.StructType( + [ + t.StructField("method", t.StringType(), True), + t.StructField("assessment", t.StringType(), True), + t.StructField("score", t.FloatType(), True), + t.StructField("assessmentFlag", t.StringType(), True), + t.StructField("targetId", t.StringType(), True), + ] + ) + ) + assert ( + variant_index.df.select("inSilicoPredictors").schema.fields[0].dataType + == in_silico_schema + ), "In silico schema is not correct." def test_process(self: TestVEPParser) -> None: """Test process method.""" @@ -144,3 +160,24 @@ def test_variant_count(self: TestVEPParser) -> None: assert ( self.raw_vep_output.count() == self.processed_vep_output.count() ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}." + + def test_collection(self: TestVEPParser) -> None: + """Test if the collection of VEP variantIndex runs without failures.""" + assert ( + len(self.processed_vep_output.collect()) + == self.processed_vep_output.count() + ), "Collection performed incorrectly." 
+ + def test_ensembl_transcripts_no_duplicates(self: TestVEPParser) -> None: + """Test if in single row all ensembl target ids (gene ids) do not have duplicates.""" + targets = ( + self.processed_vep_output.limit(1) + .select(f.explode("transcriptConsequences").alias("t")) + .select("t.targetId") + .collect() + ) + + asserted_targets = [t["targetId"] for t in targets] + assert len(asserted_targets) == len( + set(asserted_targets) + ), "Duplicate ensembl transcripts in a single row."