Skip to content

Commit

Permalink
Merge pull request opentargets#507 from opentargets/ds_gwas_update
Browse files Browse the repository at this point in the history
chore: small updates to accommodate GWAS Catalog for feb release
  • Loading branch information
DSuveges authored Mar 8, 2024
2 parents 1fecffb + a7c919a commit fa938fd
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 71 deletions.
2 changes: 1 addition & 1 deletion src/airflow/dags/gwas_curation_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
):
update_gwas_curation = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="gwas_catalog_curation_update",
step_id="ot_gwas_catalog_study_curation",
task_id="gwas_catalog_curation_update",
other_args=[
f"step.gwas_catalog_study_curation_out=gs://genetics_etl_python_playground/input/v2d/GWAS_Catalog_study_curation_{RUN_DATE}.tsv",
Expand Down
13 changes: 2 additions & 11 deletions src/gentropy/datasource/gwas_catalog/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ def _parse_study_table(
parse_efos(f.col("MAPPED BACKGROUND TRAIT URI")).alias(
"backgroundTraitFromSourceMappedIds"
),
cls.parse_cohorts(f.col("COHORT")).alias("cohorts"),
),
_schema=StudyIndexGWASCatalog.get_schema(),
)
Expand Down Expand Up @@ -548,14 +549,6 @@ def annotate_ancestries(
) # studyId has not been split yet
)

# Parsing cohort information:
cohorts = ancestry_lut.select(
f.col("STUDY ACCESSION").alias("studyId"),
GWASCatalogStudyIndexParser.parse_cohorts(f.col("COHORT(S)")).alias(
"cohorts"
),
).distinct()

# Get a high resolution dataset on experimental stage:
ancestry_stages = (
ancestry.groupBy("studyId")
Expand Down Expand Up @@ -644,9 +637,7 @@ def annotate_ancestries(
).select(
"studyId", "discoverySamples", "ldPopulationStructure", "replicationSamples"
)
self.df = self.df.join(parsed_ancestry_lut, on="studyId", how="left").join(
cohorts, on="studyId", how="left"
)
self.df = self.df.join(parsed_ancestry_lut, on="studyId", how="left")
return self

def annotate_sumstats_info(
Expand Down
6 changes: 3 additions & 3 deletions tests/gentropy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def mock_ld_index(spark: SparkSession) -> LDIndex:
def sample_gwas_catalog_studies(spark: SparkSession) -> DataFrame:
"""Sample GWAS Catalog studies."""
return spark.read.csv(
"tests/gentropy/data_samples/gwas_catalog_studies_sample-r2022-11-29.tsv",
"tests/gentropy/data_samples/gwas_catalog_studies.tsv",
sep="\t",
header=True,
)
Expand All @@ -424,7 +424,7 @@ def sample_gwas_catalog_studies(spark: SparkSession) -> DataFrame:
def sample_gwas_catalog_ancestries_lut(spark: SparkSession) -> DataFrame:
"""Sample GWAS ancestries sample data."""
return spark.read.csv(
"tests/gentropy/data_samples/gwas_catalog_ancestries_sample_v1.0.3-r2022-11-29.tsv",
"tests/gentropy/data_samples/gwas_catalog_ancestries.tsv",
sep="\t",
header=True,
)
Expand All @@ -444,7 +444,7 @@ def sample_gwas_catalog_harmonised_sumstats_list(spark: SparkSession) -> DataFra
def sample_gwas_catalog_associations(spark: SparkSession) -> DataFrame:
"""Sample GWAS raw associations sample data."""
return spark.read.csv(
"tests/gentropy/data_samples/gwas_catalog_associations_sample_e107_r2022-11-29.tsv",
"tests/gentropy/data_samples/gwas_catalog_associations.tsv",
sep="\t",
header=True,
)
Expand Down
20 changes: 20 additions & 0 deletions tests/gentropy/data_samples/gwas_catalog_ancestries.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
STUDY ACCESSION PUBMED ID FIRST AUTHOR DATE INITIAL SAMPLE DESCRIPTION REPLICATION SAMPLE DESCRIPTION STAGE NUMBER OF INDIVIDUALS BROAD ANCESTRAL CATEGORY COUNTRY OF ORIGIN COUNTRY OF RECRUITMENT ADDITIONAL ANCESTRY DESCRIPTION ANCESTRY DESCRIPTOR FOUNDER/GENETICALLY ISOLATED POPULATION NUMBER OF CASES NUMBER OF CONTROLS SAMPLE DESCRIPTION
GCST004795 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S.
GCST004795 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S.
GCST004796 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S.
GCST004796 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S.
GCST004797 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S.
GCST004797 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S.
GCST004794 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S.
GCST004794 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S.
GCST005522 23459209 Faraco J 2013-01-01 1,886 European ancestry cases, 10,421 European ancestry controls NA initial 12307 European NR Canada, U.S., Australia, Austria, France, Germany, Netherlands, Switzerland, Argentina, Israel, Turkey, Czech Republic, Poland, Slovakia, Denmark, Finland, Norway, U.K., Italy, Portugal, Spain
GCST004692 27455348 van Rheenen W 2016-07-25 12,577 European ancestry cases, 23,475 European ancestry controls 2,579 European ancestry cases, 2,767 European ancestry controls initial 36052 European NR U.S., Belgium, France, Germany, Netherlands, Switzerland, Finland, Republic of Ireland, Sweden, U.K., Italy, Portugal, Spain
GCST004692 27455348 van Rheenen W 2016-07-25 12,577 European ancestry cases, 23,475 European ancestry controls 2,579 European ancestry cases, 2,767 European ancestry controls replication 5346 European NR Australia, Belgium, France, Germany, Netherlands, Turkey, Republic of Ireland, Italy
GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 64 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France
GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 431 European NR France
GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 448 European NR France
GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 47 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France
GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 448 European NR France
GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 47 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France
GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 431 European NR France
GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 64 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France

This file was deleted.

Loading

0 comments on commit fa938fd

Please sign in to comment.