diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java index 3772bb63..5a3af911 100644 --- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java +++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java @@ -6,6 +6,9 @@ * Created by jtarraga on 30/01/17. */ public abstract class AnalysisExecutor { + + public static String metadataExtension = ".meta.json"; + protected String datasetName; protected SparkSession sparkSession; diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/adaptors/PlinkAdaptor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/adaptors/PlinkAdaptor.java new file mode 100644 index 00000000..06443d67 --- /dev/null +++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/adaptors/PlinkAdaptor.java @@ -0,0 +1,80 @@ +package org.opencb.hpg.bigdata.analysis.variant.adaptors; + +import org.opencb.hpg.bigdata.analysis.AnalysisExecutor; +import org.opencb.hpg.bigdata.analysis.AnalysisExecutorException; + +import java.util.List; +import java.util.Map; + +/** + * Created by jtarraga on 09/06/17. + */ +public class PlinkAdaptor extends AnalysisExecutor { + private String inFilename; + private String metaFilename; + private String outDirname; + + private int splitSize; + private List plinkParams; + private Map filterOptions; + + public PlinkAdaptor(String inFilename, String metaFilename, String outDirname) { + this.inFilename = inFilename; + this.metaFilename = inFilename + AnalysisExecutor.metadataExtension; + this.outDirname = outDirname; + } + + @Override + public void execute() throws AnalysisExecutorException { + System.out.println("plink params = " + plinkParams); + System.out.println("filter options = " + filterOptions); + } + + public String getInFilename() { + return inFilename; + } + + public void setInFilename(String inFilename) { + this.inFilename = inFilename; + } + + public String getMetaFilename() { + return metaFilename; + } + + public void setMetaFilename(String metaFilename) { + this.metaFilename = metaFilename; + } + + public String getOutDirname() { + return outDirname; + } + + public void setOutDirname(String outDirname) { + this.outDirname = outDirname; + } + + public int getSplitSize() { + return splitSize; + } + + public void setSplitSize(int splitSize) { + this.splitSize = splitSize; + } + + public List getPlinkParams() { + return plinkParams; + } + + public void setPlinkParams(List plinkParams) { + this.plinkParams = plinkParams; + } + + public Map getFilterOptions() { + return filterOptions; + } + + public void setFilterOptions(Map filterOptions) { + this.filterOptions = filterOptions; + } +} diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java index 40e30a5d..8cc3bc23 100644 --- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java +++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java @@ -13,9 +13,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; import static java.nio.file.Paths.get; @@ -212,4 +210,25 @@ public static VariantCommandOptions createVariantCommandOptions( return parser.getVariantCommandOptions(); } + + public static Map getFilterMap(VariantCommandOptions.PlinkVariantCommandOptions options) { + Map mapFilter = new HashMap<>(); + + // regions + if (StringUtils.isNotEmpty(options.regions)) { + mapFilter.put("regions", options.regions); + } + + // types + if (StringUtils.isNotEmpty(options.types)) { + mapFilter.put("types", options.types); + } + + // biotypes + if (StringUtils.isNotEmpty(options.biotypes)) { + mapFilter.put("biotypes", options.biotypes); + } + + return mapFilter; + } } diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java index c679c362..88bc2f66 100644 --- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java +++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java @@ -85,6 +85,7 @@ public LocalCliOptionsParser() { variantSubCommands.addCommand("view", variantCommandOptions.viewVariantCommandOptions); variantSubCommands.addCommand("query", variantCommandOptions.queryVariantCommandOptions); variantSubCommands.addCommand("metadata", variantCommandOptions.metadataVariantCommandOptions); + variantSubCommands.addCommand("plink", variantCommandOptions.plinkVariantCommandOptions); variantSubCommands.addCommand("rvtests", variantCommandOptions.rvtestsVariantCommandOptions); variantSubCommands.addCommand("association", variantCommandOptions.associationVariantCommandOptions); diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java index 4ba334ed..5d708497 100644 --- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java +++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java @@ -45,6 +45,7 @@ import org.opencb.hpg.bigdata.analysis.variant.LinearRegressionAnalysis; import org.opencb.hpg.bigdata.analysis.variant.LogisticRegressionAnalysis; import org.opencb.hpg.bigdata.analysis.variant.RvTestsAdaptor; +import org.opencb.hpg.bigdata.analysis.variant.adaptors.PlinkAdaptor; import org.opencb.hpg.bigdata.app.cli.CommandExecutor; import org.opencb.hpg.bigdata.app.cli.local.CliUtils; import org.opencb.hpg.bigdata.app.cli.local.options.VariantCommandOptions; @@ -103,6 +104,9 @@ public void execute() throws Exception { case "metadata": metadata(); break; + case "plink": + plink(); + break; case "rvtests": rvtests(); break; @@ -863,6 +867,21 @@ public void rvtests() throws Exception { rvtests.run00(variantCommandOptions.rvtestsVariantCommandOptions.datasetId); } + public void plink() throws Exception { + PlinkAdaptor plink = new PlinkAdaptor(variantCommandOptions.plinkVariantCommandOptions.inFilename, + variantCommandOptions.plinkVariantCommandOptions.metaFilename, + variantCommandOptions.plinkVariantCommandOptions.outDirname); + + plink.setPlinkParams(variantCommandOptions.plinkVariantCommandOptions.plinkParams); + plink.setSplitSize(variantCommandOptions.plinkVariantCommandOptions.splitSize); + plink.setFilterOptions(CliUtils.getFilterMap(variantCommandOptions.plinkVariantCommandOptions)); + + plink.execute(); + +//// rvtests.run(variantCommandOptions.rvtestsVariantCommandOptions.datasetId); + //rvtests.run00(variantCommandOptions.rvtestsVariantCommandOptions.datasetId); + } + public void assoc() throws Exception { // check input file File metaFile = new File(variantCommandOptions.associationVariantCommandOptions.input + ".meta.json"); diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java index 80323913..5b52e2dc 100644 --- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java +++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java @@ -7,6 +7,8 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.opencb.hpg.bigdata.app.cli.local.LocalCliOptionsParser; +import java.util.List; + /** * Created by jtarraga on 01/06/17. */ @@ -20,6 +22,7 @@ public class VariantCommandOptions { public ViewVariantCommandOptions viewVariantCommandOptions; public QueryVariantCommandOptions queryVariantCommandOptions; public MetadataVariantCommandOptions metadataVariantCommandOptions; + public PlinkVariantCommandOptions plinkVariantCommandOptions; public RvTestsVariantCommandOptions rvtestsVariantCommandOptions; public AssociationVariantCommandOptions associationVariantCommandOptions; @@ -36,6 +39,7 @@ public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCo this.viewVariantCommandOptions = new ViewVariantCommandOptions(); this.queryVariantCommandOptions = new QueryVariantCommandOptions(); this.metadataVariantCommandOptions = new MetadataVariantCommandOptions(); + this.plinkVariantCommandOptions = new PlinkVariantCommandOptions(); this.rvtestsVariantCommandOptions = new RvTestsVariantCommandOptions(); this.associationVariantCommandOptions = new AssociationVariantCommandOptions(); } @@ -354,6 +358,117 @@ public class RvTestsVariantCommandOptions { public String confFilename; } + @Parameters(commandNames = {"plink"}, commandDescription = "Execute the 'plink' program") + public class PlinkVariantCommandOptions { + + @ParametersDelegate + public LocalCliOptionsParser.CommonCommandOptions commonOptions = commonCommandOptions; + + + @Parameter(names = {"-i", "--input"}, description = "Input file name (in Avro/Parquet file format)", + required = true, arity = 1) + public String inFilename; + + @Parameter(names = {"-m", "--metadata"}, description = "Input metadata file name", required = true, arity = 1) + public String metaFilename; + + @Parameter(names = {"--dataset"}, description = "Target dataset", arity = 1) + public String datasetId = null; + + @Parameter(names = {"--plink-params"}, description = "List of space-separated key=value parameters necessary to" + + " run the plink program", required = true, variableArity = true) + public List plinkParams; + + @Parameter(names = {"--split-size"}, description = "Split size. Range 10000000-100000000", arity = 1) + public int splitSize = 10000000; + + @Parameter(names = {"-o", "--output"}, description = "Output directory name to save the plink results", + required = true, arity = 1) + public String outDirname; + + // filter parameters + @Parameter(names = {"--id"}, description = "Filter ID; comma separated list of IDs, e.g.:" + + " \"rs312411,rs421225\"", arity = 1) + public String ids; + + @Parameter(names = {"--id-file"}, description = "Filter ID that are stored in a file, one ID per line," + + " e.g.: rs312411", arity = 1) + public String idFilename; + + @Parameter(names = {"--type"}, description = "Filter type; comma separated list of IDs, e.g.:" + + " \"INDEL,SNP,SNV\"", arity = 1) + public String types; + + @Parameter(names = {"--s", "--study"}, description = "Filter study; comma separated list of study names", + arity = 1) + public String studies; + + @Parameter(names = {"--biotype"}, description = "Filter biotype; comma separated list of biotype names," + + " e.g.: protein_coding, pseudogene", arity = 1) + public String biotypes; + + @Parameter(names = {"-r", "--region"}, description = "Filter region; comma separated list of regions," + + " e.g.: 1:300000-400000000,15:343453463-8787665654", arity = 1) + public String regions; + + @Parameter(names = {"--region-file"}, description = "Filter regions that are stored in a file, one region" + + " per line, e.g.: 1:6700000-560000000", arity = 1) + public String regionFilename; + + @Parameter(names = {"--maf"}, description = "QuerFilter Minor Allele Frequency of a given study and" + + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name" + + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.4\"", arity = 1) + public String maf; + + @Parameter(names = {"--mgf"}, description = "Filter Minor Genotype Frequency of a given study and" + + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name" + + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.18198\"", arity = 1) + public String mgf; + + @Parameter(names = {"--ct", "--consequence-type"}, description = "Filter Sequence Ontology term names or" + + " accession codes; comma separated (use double quotes if you provide term names), e.g.:" + + " \"transgenic insertion,SO:32234,SO:00124\"", arity = 1) + public String consequenceTypes; + + @Parameter(names = {"--gene"}, description = "Filter gene; comma separated list of gene names, e.g.:" + + " \"BIN3,ZNF517\"", arity = 1) + public String genes; + + @Parameter(names = {"--clinvar"}, description = "Filter clinvar (accession); comma separated list of" + + " accessions", arity = 1) + public String clinvar; + + @Parameter(names = {"--cosmic"}, description = "Filter cosmic (mutation ID); comma separated list of" + + " mutations IDs", arity = 1) + public String cosmic; + + @Parameter(names = {"--conservation"}, description = "Filter conservation scores (phastCons, phylop, gerp);" + + "comma separated list of scores and enclosed with double quotes, e.g.: \"phylop<0.3,phastCons<0.1\"", + arity = 1) + public String conservScores; + + @Parameter(names = {"--ps", "--protein-substitution"}, description = "Filter protein substitution scores" + + " (polyphen, sift); comma separated list of scores and enclosed with double quotes, e.g.:" + + "\"polyphen>0.3,sift>0.6\"", arity = 1) + public String substScores; + + @Parameter(names = {"--pf", "--population-frequency"}, description = "Filter alternate population" + + " frequency of a given study: \"study_name::population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: " + + " \"1000g::CEU<0.4\"", + arity = 1) + public String pf; + + @Parameter(names = {"--pmaf", "--population-maf"}, description = "Filter population minor allele frequency" + + " of a given study: \"study_name:: population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: " + + "\"1000g::PJL<=0.25\"", arity = 1) + public String pmaf; + + @Parameter(names = {"--sample"}, description = "Filter sample names; comma separated list of sample names", + arity = 1) + public String samples; + // end of filter parameters + } + @Parameters(commandNames = {"association"}, commandDescription = "Execute association tests such as chi-square," + " linear and logistic regressions for additive, dominant or recessive genetic models") public class AssociationVariantCommandOptions {