Skip to content

Commit

Permalink
Merge branch 'dev' into dc_locus_fm_filter
Browse files Browse the repository at this point in the history
  • Loading branch information
addramir committed Sep 25, 2024
2 parents b3fa079 + 6c4bdf5 commit f27a256
Show file tree
Hide file tree
Showing 76 changed files with 4,364 additions and 1,437 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/artifact.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ env:
REGION: europe-west1
GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
REPOSITORY: gentropy-app
PYTHON_VERSION_DEFAULT: "3.10.8"

jobs:
build-push-artifact:
Expand Down Expand Up @@ -67,3 +68,18 @@ jobs:
tags: "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/custom_ensembl_vep:${{ github.ref_name }}"
context: .
file: "src/vep/Dockerfile"

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION_DEFAULT }}
- name: Install and configure Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

- name: Build and push spark cluster dependencies
run: |
make build
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ci:
skip: [poetry-lock]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.1
rev: v0.6.7
hooks:
- id: ruff
args:
Expand Down Expand Up @@ -58,14 +58,14 @@ repos:
exclude: "CHANGELOG.md"

- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.16.0
rev: v9.18.0
hooks:
- id: commitlint
additional_dependencies: ["@commitlint/[email protected]"]
stages: [commit-msg]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v1.11.1"
rev: "v1.11.2"
hooks:
- id: mypy
args:
Expand Down Expand Up @@ -98,7 +98,7 @@ repos:
- id: beautysh

- repo: https://github.com/jsh9/pydoclint
rev: 0.5.6
rev: 0.5.8
hooks:
- id: pydoclint

Expand Down
32 changes: 15 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
PROJECT_ID ?= open-targets-genetics-dev
REGION ?= europe-west1
APP_NAME ?= $$(cat pyproject.toml| grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
VERSION_NO ?= $$(poetry version --short)
CLEAN_VERSION_NO := $(shell echo "$(VERSION_NO)" | tr -cd '[:alnum:]')
BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/
BUCKET_COMPOSER_DAGS=gs://europe-west1-ot-workflows-fe147745-bucket/dags/
APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
REF ?= $$(git rev-parse --abbrev-ref HEAD)
PACKAGE_VERSION ?= $$(poetry version --short)
CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${APP_NAME}/${REF}

.PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))

Expand Down Expand Up @@ -38,35 +38,33 @@ build-documentation: ## Create local server with documentation
create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependencies for development purposes
@echo "Creating Dataproc Dev Cluster"
@gcloud config set project ${PROJECT_ID}
@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_VERSION_NO}-$(USER)" \
@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
--image-version 2.1 \
--region ${REGION} \
--master-machine-type n1-standard-16 \
--initialization-actions=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/install_dependencies_on_cluster.sh \
--metadata="PACKAGE=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/gentropy-${VERSION_NO}-py3-none-any.whl,CONFIGTAR=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/config.tar.gz" \
--initialization-actions=$(BUCKET_NAME)/install_dependencies_on_cluster.sh \
--metadata="PACKAGE=$(BUCKET_NAME)/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl" \
--secondary-worker-type spot \
--worker-machine-type n1-standard-4 \
--worker-boot-disk-size 500 \
--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
--optional-components=JUPYTER \
--enable-component-gateway \
--max-idle=30m
--max-idle=60m

make update-dev-cluster: build ## Reinstalls the package on the dev-cluster
@echo "Updating Dataproc Dev Cluster"
@gcloud config set project ${PROJECT_ID}
gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_VERSION_NO}" \
gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_PACKAGE_VERSION}" \
--region ${REGION} \
--jars=${BUCKET_NAME}/install_dependencies_on_cluster.sh \
-e='sh chmod 750 $${PWD}/install_dependencies_on_cluster.sh; sh $${PWD}/install_dependencies_on_cluster.sh'

build: clean ## Build Python package with dependencies
@gcloud config set project ${PROJECT_ID}
@echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}"
@echo "Packaging Code and Dependencies for ${APP_NAME}-${PACKAGE_VERSION}"
@poetry build
@tar -czf dist/config.tar.gz config/
@echo "Uploading to Dataproc"
@gsutil cp src/gentropy/cli.py ${BUCKET_NAME}
@gsutil cp ./dist/${APP_NAME}-${VERSION_NO}-py3-none-any.whl ${BUCKET_NAME}
@gsutil cp ./dist/config.tar.gz ${BUCKET_NAME}
@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}
@echo "Uploading to ${BUCKET_NAME}"
@gsutil cp src/${APP_NAME}/cli.py ${BUCKET_NAME}/
@gsutil cp ./dist/${APP_NAME}-${PACKAGE_VERSION}-py3-none-any.whl ${BUCKET_NAME}/
@gsutil cp ./utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}/
16 changes: 16 additions & 0 deletions docs/development/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,19 @@ Some functions on MacOS may throw a java error:
This can be resolved by adding the following line to your `~/.zshrc`:

`export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`

## Creating a development Dataproc cluster (OT users only)

To start a Dataproc cluster in development mode, run:

```
make create-dev-cluster
```

The command above will prepare 3 different resources:

- gentropy package
- cli script
- cluster setup script

Based on the branch ref (for example `dev`), the command then creates a namespaced folder in GCS (`gs://genetics_etl_python_playground/initialisation/gentropy/dev`) containing the three files listed above. These files are then used to create the cluster environment.
9 changes: 9 additions & 0 deletions docs/python_api/datasets/biosample_index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
title: Biosample index
---

::: gentropy.dataset.biosample_index.BiosampleIndex

## Schema

--8<-- "assets/schemas/biosample_index.md"
22 changes: 21 additions & 1 deletion docs/python_api/datasets/l2g_feature.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,27 @@
title: L2G Feature
---

::: gentropy.method.l2g.feature_factory.L2GFeature
## Abstract Class

::: gentropy.dataset.l2g_feature.L2GFeature

## Feature Classes

### Derived from colocalisation

::: gentropy.dataset.l2g_feature.EQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_feature.PQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_feature.SQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_feature.TuQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_feature.EQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_feature.PQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_feature.SQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_feature.TuQtlColocH4MaximumFeature

### Derived from distance

::: gentropy.dataset.l2g_feature.DistanceTssMinimumFeature
::: gentropy.dataset.l2g_feature.DistanceTssMeanFeature

## Schema

Expand Down
7 changes: 6 additions & 1 deletion docs/python_api/datasources/_datasources.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ This section contains information about the data source harmonisation tools avai
2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)

## Linkage desiquilibrium
## Linkage disequilibrium

1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrixes (7 ancestries)

Expand All @@ -37,3 +37,8 @@ This section contains information about the data source harmonisation tools avai
## Gene annotation

1. [Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl)

## Biological samples

1. [Uberon](biosample_ontologies/_uberon.md)
2. [Cell Ontology](biosample_ontologies/_cell_ontology.md)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
title: Cell Ontology
---

The [Cell Ontology](http://www.obofoundry.org/ontology/cl.html) is a structured controlled vocabulary for cell types. It is used to annotate cell types in single-cell RNA-seq data and other omics data.
5 changes: 5 additions & 0 deletions docs/python_api/datasources/biosample_ontologies/_uberon.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
title: Uberon
---

The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology.
4 changes: 2 additions & 2 deletions docs/python_api/methods/l2g/feature_factory.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
title: L2G Feature Factory
---

::: gentropy.method.l2g.feature_factory.ColocalisationFactory
::: gentropy.method.l2g.feature_factory.FeatureFactory

::: gentropy.method.l2g.feature_factory.StudyLocusFactory
::: gentropy.method.l2g.feature_factory.L2GFeatureInputLoader
5 changes: 5 additions & 0 deletions docs/python_api/steps/biosample_index_step.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
title: biosample_index
---

::: gentropy.biosample_index.BiosampleIndexStep
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ google = "^3.0.0"
omegaconf = "^2.3.0"
typing-extensions = "^4.9.0"
scikit-learn = "^1.3.2"
pandas = {extras = ["gcp", "parquet"], version = "^2.2.2"}
pandas = { extras = ["gcp", "parquet"], version = "^2.2.2" }
skops = ">=0.9,<0.11"
google-cloud-secret-manager = "^2.20.0"

Expand Down Expand Up @@ -126,6 +126,7 @@ exclude = ["dist"]
addopts = "-n auto --doctest-modules --cov=src/ --cov-report=xml"
pythonpath = ["."]
testpaths = ["tests/gentropy", "src/gentropy"]
marks = ["step_test"]

# Semi-strict mode for mypy
[tool.mypy]
Expand Down
83 changes: 83 additions & 0 deletions src/gentropy/assets/schemas/biosample_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"type": "struct",
"fields": [
{
"name": "biosampleId",
"type": "string",
"nullable": false,
"metadata": {}
},
{
"name": "biosampleName",
"type": "string",
"nullable": false,
"metadata": {}
},
{
"name": "description",
"type": "string",
"nullable": true,
"metadata": {}
},
{
"name": "xrefs",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
},
{
"name": "synonyms",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
},
{
"name": "parents",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
},
{
"name": "ancestors",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
},
{
"name": "descendants",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
},
{
"name": "children",
"type": {
"type": "array",
"elementType": "string",
"containsNull": true
},
"nullable": true,
"metadata": {}
}
]
}
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/colocalisation.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
"type": "long",
"metadata": {}
},
{
"name": "rightStudyType",
"nullable": false,
"type": "string",
"metadata": {}
},
{
"name": "chromosome",
"nullable": false,
Expand Down
Loading

0 comments on commit f27a256

Please sign in to comment.