Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-6565 Ensure the CellBase DB adaptors are updated to reflect the latest data releases #700

Open
wants to merge 14 commits into
base: release-6.x.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -411,14 +411,14 @@ private void loadVariationData() throws NoSuchMethodException, InterruptedExcept
if (dbSnpFilePath.toFile().exists()) {
if (variationPath.resolve(DBSNP_VERSION_FILENAME).toFile().exists()) {
logger.info("Loading dbSNP file '{}'", dbSnpFilePath);
loadRunner.load(dbSnpFilePath, SNP_COLLECTION_NAME, dataRelease);
loadRunner.load(dbSnpFilePath, SNP_DATA, dataRelease);

// Create index
createIndex(SNP_COLLECTION_NAME);
createIndex(SNP_DATA);

// Update release (collection and sources)
List<Path> sources = Collections.singletonList(variationPath.resolve(DBSNP_VERSION_FILENAME));
dataReleaseManager.update(dataRelease, SNP_COLLECTION_NAME, EtlCommons.VARIATION_DATA, sources);
dataReleaseManager.update(dataRelease, SNP_DATA, EtlCommons.VARIATION_DATA, sources);
} else {
logger.warn("In order to load the dbSNP file you need the version file {} within the folder '{}'", DBSNP_VERSION_FILENAME,
variationPath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public ValidationCommandExecutor(AdminCliOptionsParser.ValidationCommandOptions
}

@Override
public void execute() {
public void execute() throws CellBaseException {
checkFilesExist();

VariantAnnotationCalculator variantAnnotationCalculator;
Expand Down
13 changes: 0 additions & 13 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,6 @@ download:
clinvar:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
<<<<<<< HEAD
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
=======
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz
version: 2024-05
Expand All @@ -122,7 +115,6 @@ download:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz
version: 2024-05
>>>>>>> release-6.2.x
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
clinvarVariationAllele:
Expand All @@ -147,15 +139,10 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
<<<<<<< HEAD
host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
version: "1.0.2 associations_e106_r2022-05-17"
=======
#host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv"
#version: "1.0.2 associations_e106_r2022-05-17"
version: "2024-05-20"
>>>>>>> release-6.2.x
hpo:
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
disgenet:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class EtlCommons {
public static final String HOMO_SAPIENS_NAME ="Homo sapiens";

public static final String GENOME_DATA = "genome";
public static final String GENOME_SEQUENCE_DATA = "genome_sequence";
public static final String GENE_DATA = "gene";
public static final String REFSEQ_DATA = "refseq";
public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association";
Expand Down Expand Up @@ -69,7 +70,7 @@ public class EtlCommons {
public static final String DBSNP_FILE = "GCF_000001405.40.gz";
public static final String DBSNP_NAME = "dbSNP";
public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json";
public static final String SNP_COLLECTION_NAME = "snp";
public static final String SNP_DATA = "snp";

public static final String STRUCTURAL_VARIANTS_DATA = "svs";
public static final String REPEATS_DATA = "repeats";
Expand All @@ -79,6 +80,8 @@ public class EtlCommons {
public static final String DOID_FILE = "doid.obo";
public static final String PFM_DATA = "regulatory_pfm";

public static final String REGULATORY_REGION_DATA = "regulatory_region";

// Build specific data options
public static final String GENOME_INFO_DATA = "genome_info";
public static final String DISGENET_DATA = "disgenet";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,76 +16,25 @@

package org.opencb.cellbase.lib.impl.core;

import org.apache.commons.collections4.CollectionUtils;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataRelease;
import org.opencb.cellbase.lib.impl.core.singleton.DataReleaseSingleton;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.datastore.mongodb.MongoDataStore;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CellBaseDBAdaptor extends MongoDBAdaptor {

protected List<DataRelease> dataReleases;
protected Map<Integer, MongoDBCollection> mongoDBCollectionByRelease;

public static final String DATA_RELEASE_SEPARATOR = "__v";

/**
 * Builds the name of the collection that stores the given data for a release.
 *
 * @param data    data name, used as the prefix of the collection name
 * @param release release number, appended after {@code DATA_RELEASE_SEPARATOR}
 * @return {@code data}, the separator and {@code release} concatenated in that order
 */
public static String buildCollectionName(String data, int release) {
    return data + DATA_RELEASE_SEPARATOR + release;
}

/**
 * Maps release numbers to the MongoDB collection that holds the given data in that release.
 *
 * When no data releases are available (null or empty list), the returned map has a single
 * entry: release 0 mapped to the collection named exactly {@code data}. Otherwise, only the
 * releases whose collection map contains {@code data} contribute an entry.
 *
 * @param data data name to look up in each data release
 * @return a new map from release number to collection
 */
public Map<Integer, MongoDBCollection> buildCollectionByReleaseMap(String data) {
    Map<Integer, MongoDBCollection> collectionsByRelease = new HashMap<>();

    if (CollectionUtils.isEmpty(dataReleases)) {
        // For backward compatibility (i.e., in case data_release collection is missing)
        collectionsByRelease.put(0, mongoDataStore.getCollection(data));
        return collectionsByRelease;
    }

    for (DataRelease release : dataReleases) {
        // Releases that do not include this data are left out of the map
        if (!release.getCollections().containsKey(data)) {
            continue;
        }
        String collectionName = release.getCollections().get(data);
        collectionsByRelease.put(release.getRelease(), mongoDataStore.getCollection(collectionName));
    }

    return collectionsByRelease;
}

/**
 * Returns the collection registered for a data release.
 *
 * @param collectionMap map from release number to collection
 * @param dataRelease   requested release; {@code null} is treated as release 0
 * @return the collection stored under the resolved release number
 * @throws CellBaseException if the map has no entry for the resolved release; the same
 *                           message is logged at error level before throwing
 */
public MongoDBCollection getCollectionByRelease(Map<Integer, MongoDBCollection> collectionMap, Integer dataRelease)
        throws CellBaseException {
    int release = 0;
    if (dataRelease != null) {
        release = dataRelease;
    }

    if (collectionMap.containsKey(release)) {
        return collectionMap.get(release);
    }

    // If the data release is invalid, throw an exception
    String msg = "Data not found in release " + release + ". " + collectionMap;
    logger.error(msg);
    throw new CellBaseException(msg);
}

public CellBaseDBAdaptor(MongoDataStore mongoDataStore) {
super(mongoDataStore);
this.dataReleases = new ReleaseMongoDBAdaptor(mongoDataStore).getAll().getResults();
}

/**
 * Returns a text representation of the form {@code CellBaseDBAdaptor{dataRelease=...}},
 * where the value is the string form of the {@code dataReleases} list.
 *
 * @return the text representation of this adaptor
 */
@Override
public String toString() {
    return "CellBaseDBAdaptor{" + "dataRelease=" + dataReleases + '}';
}

public List<DataRelease> getDataReleases() {
return dataReleases;
public static String buildCollectionName(String data, int release) {
String name = data + DATA_RELEASE_SEPARATOR + release;
return name;
}

public CellBaseDBAdaptor setDataReleases(List<DataRelease> dataReleases) {
this.dataReleases = dataReleases;
return this;
public MongoDBCollection getMongoDBCollection(String data, int release) throws CellBaseException {
return DataReleaseSingleton.getInstance().getMongoDBCollection(mongoDataStore.getDatabaseName(), data, release);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import java.util.function.Consumer;

import static org.opencb.cellbase.core.ParamConstants.DATA_RELEASE_PARAM;
import static org.opencb.cellbase.lib.EtlCommons.CLINICAL_VARIANTS_DATA;

/**
* Created by fjlopez on 06/12/16.
Expand All @@ -65,14 +66,6 @@ public ClinicalMongoDBAdaptor(MongoDataStore mongoDataStore, GenomeManager genom
super(mongoDataStore);

this.genomeManager = genomeManager;

init();
}

/**
 * Logs a debug message and builds the per-release collection map for the
 * "clinical_variants" data, storing it in {@code mongoDBCollectionByRelease}.
 */
private void init() {
logger.debug("ClinicalMongoDBAdaptor: in 'constructor'");

// The result is kept in the mongoDBCollectionByRelease field for later look-ups by release
mongoDBCollectionByRelease = buildCollectionByReleaseMap("clinical_variants");
}

public CellBaseDataResult<Variant> next(Query query, QueryOptions options) {
Expand Down Expand Up @@ -103,16 +96,14 @@ public CellBaseDataResult getIntervalFrequencies(Query query, int intervalSize,
public CellBaseDataResult<Long> count(Query query) throws CellBaseException {
Bson bson = parseQuery(query);

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease,
(Integer) query.getOrDefault(DATA_RELEASE_PARAM, 0));
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getInt(DATA_RELEASE_PARAM));
return new CellBaseDataResult<>(mongoDBCollection.count(bson));
}

public CellBaseDataResult distinct(Query query, String field) throws CellBaseException {
Bson bson = parseQuery(query);

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease,
(Integer) query.getOrDefault(DATA_RELEASE_PARAM, 0));
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getInt(DATA_RELEASE_PARAM));
return new CellBaseDataResult<>(mongoDBCollection.distinct(field, bson));
}

Expand All @@ -128,8 +119,7 @@ public CellBaseDataResult<Variant> get(Query query, QueryOptions options) throws
logger.debug("query: {}", bson.toBsonDocument().toJson());
logger.debug("queryOptions: {}", options.toJson());

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease,
(Integer) query.getOrDefault(DATA_RELEASE_PARAM, 0));
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getInt(DATA_RELEASE_PARAM));
return new CellBaseDataResult<>(mongoDBCollection.find(bson, null, Variant.class, parsedOptions));
}

Expand All @@ -140,8 +130,7 @@ public CellBaseDataResult nativeGet(Query query, QueryOptions options) throws Ce
logger.debug("query: {}", bson.toBsonDocument().toJson());
logger.debug("queryOptions: {}", options.toJson());

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease,
(Integer) query.getOrDefault(DATA_RELEASE_PARAM, 0));
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getInt(DATA_RELEASE_PARAM));
return new CellBaseDataResult<>(mongoDBCollection.find(bson, parsedOptions));
}

Expand All @@ -152,8 +141,7 @@ public Iterator<Variant> iterator(Query query, QueryOptions options) {
public Iterator nativeIterator(Query query, QueryOptions options) throws CellBaseException {
Bson bson = parseQuery(query);

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease,
(Integer) query.getOrDefault(DATA_RELEASE_PARAM, 0));
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getInt(DATA_RELEASE_PARAM));
return mongoDBCollection.nativeQuery().find(bson, options);
}

Expand Down Expand Up @@ -355,7 +343,7 @@ private CellBaseDataResult getClinvarPhenotypeGeneRelations(QueryOptions queryOp
fields.put("associatedGenes", 1);
pipeline.add(new Document("$project", fields));

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, dataRelease);
return executeAggregation2("", pipeline, queryOptions, mongoDBCollection);

}
Expand All @@ -377,7 +365,7 @@ private CellBaseDataResult getGwasPhenotypeGeneRelations(QueryOptions queryOptio
fields.put("associatedGenes", 1);
pipeline.add(new Document("$project", fields));

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, dataRelease);
return executeAggregation2("", pipeline, queryOptions, mongoDBCollection);
}

Expand Down Expand Up @@ -466,7 +454,7 @@ public CellBaseIterator iterator(ClinicalVariantQuery query) throws CellBaseExce
Bson projection = getProjection(query);
GenericDocumentComplexConverter<Variant> converter = new GenericDocumentComplexConverter<>(Variant.class);

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease());
MongoDBCollection mongoDBCollection = getMongoDBCollection(CLINICAL_VARIANTS_DATA, query.getDataRelease());
MongoDBIterator<Variant> iterator = mongoDBCollection.iterator(null, bson, projection, converter, queryOptions);
return new CellBaseMongoDBIterator<>(iterator);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.opencb.cellbase.core.api.query.ProjectionQueryOptions;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.result.CellBaseDataResult;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.MongoDBCollectionConfiguration;
import org.opencb.cellbase.lib.iterator.CellBaseIterator;
import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator;
Expand All @@ -43,13 +44,15 @@
import java.util.*;
import java.util.regex.Pattern;

import static org.opencb.cellbase.lib.EtlCommons.GENE_DATA;

/**
* Created by imedina on 25/11/15.
*/
public class GeneMongoDBAdaptor extends CellBaseDBAdaptor implements CellBaseCoreDBAdaptor<GeneQuery, Gene> {

private static final Set<String> CONSTRAINT_NAMES = new HashSet<>();
private Map<Integer, MongoDBCollection> refseqCollectionByRelease;
// private Map<Integer, MongoDBCollection> refseqCollectionByRelease;

private static final GenericDocumentComplexConverter<Gene> CONVERTER;

Expand All @@ -66,15 +69,15 @@ public class GeneMongoDBAdaptor extends CellBaseDBAdaptor implements CellBaseCor
public GeneMongoDBAdaptor(MongoDataStore mongoDataStore) {
super(mongoDataStore);

this.init();
// this.init();
}

/**
 * Builds the per-release collection maps for the "gene" and "refseq" data and
 * logs a debug message once both are set.
 */
private void init() {
// One map per data name: "gene" goes to mongoDBCollectionByRelease, "refseq" to refseqCollectionByRelease
mongoDBCollectionByRelease = buildCollectionByReleaseMap("gene");
refseqCollectionByRelease = buildCollectionByReleaseMap("refseq");

logger.debug("GeneMongoDBAdaptor initialised");
}
// private void init() {
// mongoDBCollectionByRelease = buildCollectionByReleaseMap("gene");
// refseqCollectionByRelease = buildCollectionByReleaseMap("refseq");
//
// logger.debug("GeneMongoDBAdaptor initialised");
// }

@Override
public CellBaseDataResult<Gene> aggregationStats(GeneQuery query) {
Expand All @@ -97,10 +100,10 @@ public List<CellBaseDataResult<Gene>> info(List<String> ids, ProjectionQueryOpti
orBsonList.add(Filters.eq("name", id));
Bson query = Filters.or(orBsonList);
if (StringUtils.isEmpty(source) || ParamConstants.QueryParams.ENSEMBL.key().equalsIgnoreCase(source)) {
MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, dataRelease);
results.add(new CellBaseDataResult<>(mongoDBCollection.find(query, projection, CONVERTER, new QueryOptions())));
} else {
MongoDBCollection mongoDBCollection = getCollectionByRelease(refseqCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(EtlCommons.REFSEQ_DATA, dataRelease);
results.add(new CellBaseDataResult<>(mongoDBCollection.find(query, projection, CONVERTER, new QueryOptions())));
}
}
Expand All @@ -115,10 +118,10 @@ public CellBaseIterator<Gene> iterator(GeneQuery query) throws CellBaseException
MongoDBIterator<Gene> iterator;
if (query.getSource() != null && !query.getSource().isEmpty() && ParamConstants.QueryParams.REFSEQ.key()
.equalsIgnoreCase(query.getSource().get(0))) {
MongoDBCollection mongoDBCollection = getCollectionByRelease(refseqCollectionByRelease, query.getDataRelease());
MongoDBCollection mongoDBCollection = getMongoDBCollection(EtlCommons.REFSEQ_DATA, query.getDataRelease());
iterator = mongoDBCollection.iterator(null, bson, projection, CONVERTER, queryOptions);
} else {
MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease());
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, query.getDataRelease());
iterator = mongoDBCollection.iterator(null, bson, projection, CONVERTER, queryOptions);
}
return new CellBaseMongoDBIterator<>(iterator);
Expand All @@ -127,15 +130,15 @@ public CellBaseIterator<Gene> iterator(GeneQuery query) throws CellBaseException
@Override
public CellBaseDataResult<String> distinct(GeneQuery geneQuery) throws CellBaseException {
Bson bsonDocument = parseQuery(geneQuery);
MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, geneQuery.getDataRelease());
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, geneQuery.getDataRelease());
return new CellBaseDataResult<>(mongoDBCollection.distinct(geneQuery.getFacet(), bsonDocument, String.class));
}

@Override
public CellBaseDataResult<Gene> groupBy(GeneQuery geneQuery) throws CellBaseException {
Bson bsonQuery = parseQuery(geneQuery);
logger.info("geneQuery: {}", bsonQuery.toBsonDocument().toJson());
MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, geneQuery.getDataRelease());
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, geneQuery.getDataRelease());
return groupBy(bsonQuery, geneQuery, "name", mongoDBCollection);
}

Expand All @@ -157,7 +160,7 @@ public CellBaseDataResult<Gene> startsWith(String id, QueryOptions options, int
projection = Projections.exclude("transcripts", "annotation");
}
}
MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, dataRelease);
return new CellBaseDataResult<>(mongoDBCollection.find(regex, projection, CONVERTER, options));
}

Expand Down Expand Up @@ -355,7 +358,7 @@ public CellBaseDataResult<TranscriptTfbs> getTfbs(String geneId, QueryOptions qu
List<Bson> pipeline = unwindAndMatchTranscripts(query, queryOptions);
GenericDocumentComplexConverter<TranscriptTfbs> converter = new GenericDocumentComplexConverter<>(TranscriptTfbs.class);

MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease);
MongoDBCollection mongoDBCollection = getMongoDBCollection(GENE_DATA, dataRelease);
MongoDBIterator<TranscriptTfbs> iterator = mongoDBCollection.iterator(pipeline, converter, queryOptions);

List<TranscriptTfbs> tfbs = new ArrayList<>();
Expand Down
Loading