diff --git a/.github/workflows/task.yml b/.github/workflows/task.yml
index ac09089acb..f71e27a41d 100644
--- a/.github/workflows/task.yml
+++ b/.github/workflows/task.yml
@@ -21,5 +21,5 @@ jobs:
uses: opencb/java-common-libs/.github/workflows/deploy-docker-hub-workflow.yml@develop
needs: test
with:
- cli: python3 ./build/cloud/docker/docker-build.py push --images base --tag ${{ github.ref_name }}
+ cli: python3 ./build/cloud/docker/docker-build.py push --images base,builder --tag ${{ github.ref_name }}
secrets: inherit
diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
index 6e1657d1bf..bcb2de9cb8 100644
--- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
+++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
@@ -11,7 +11,7 @@ LABEL org.label-schema.vendor="OpenCB" \
## We need to be root to install dependencies
USER root
RUN apt-get update -y && \
- apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl && \
+ apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libtry-tiny-perl libxml-simple-perl liblog-log4perl-perl libxml-parser-perl libxml-dom-perl && \
mkdir /opt/ensembl && chown cellbase:cellbase /opt/ensembl && \
rm -rf /var/lib/apt/lists/*
@@ -26,6 +26,10 @@ RUN cd /opt/ensembl && \
git clone https://github.com/Ensembl/ensembl-variation.git && \
git clone https://github.com/Ensembl/ensembl-funcgen.git && \
git clone https://github.com/Ensembl/ensembl-compara.git && \
- git clone https://github.com/Ensembl/ensembl-io.git
+ git clone https://github.com/Ensembl/ensembl-io.git && \
+ git clone --branch cvs/release-0_7 https://github.com/biomart/biomart-perl
-ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
+## Give write permissions so the script ensembl_canonical.pl can create sub-folders for cache purposes
+RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/
+
+ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib
diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
index 70865465e9..90f2f8208e 100755
--- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
+++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
@@ -134,16 +134,16 @@ our $ENSEMBL_GENOMES_PORT = "4157";
our $ENSEMBL_GENOMES_USER = "anonymous";
## Vertebrates
-our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
-our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
-our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
-our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
+our $HOMO_SAPIENS_CORE = "homo_sapiens_core_111_38";
+our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_111_38";
+our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_111_38";
+our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38";
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
-our $MUS_MUSCULUS_CORE = "mus_musculus_core_78_38";
-our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_78_38";
-our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_78_38";
+our $MUS_MUSCULUS_CORE = "mus_musculus_core_111_39";
+our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_111_39";
+our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_111_39";
our $RATTUS_NORVEGICUS_CORE = "rattus_norvegicus_core_78_5";
our $RATTUS_NORVEGICUS_VARIATION = "rattus_norvegicus_variation_78_5";
our $RATTUS_NORVEGICUS_FUNCTIONAL = "rattus_norvegicus_funcgen_78_5";
diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl
new file mode 100755
index 0000000000..bed648e2d0
--- /dev/null
+++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl
@@ -0,0 +1,61 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Data::Dumper;
+use JSON;
+use DB_CONFIG;
+
+use BioMart::Initializer;
+use BioMart::Query;
+use BioMart::QueryRunner;
+
+## Dumps gene id, transcript id and canonical flag through a BioMart query
+## into <outdir>/ensembl_canonical.txt (TSV, unique rows only).
+## USAGE: ./ensembl_canonical.pl --species hsapiens --outdir /tmp
+
+## Default values
+my $species = 'hsapiens';
+my $outdir = "./";
+
+## Parsing command line
+GetOptions ('species=s' => \$species, 'outdir=s' => \$outdir);
+
+## BioMart registry file (absolute path, not configurable from the command line)
+my $confFile = "/opt/cellbase/scripts/ensembl-scripts/martURLLocation.xml";
+
+# NB: 'clean' starts a fresh configuration; change it to 'cached' to skip the
+# configuration step on subsequent runs from the same registry
+my $action='clean';
+my $initializer = BioMart::Initializer->new('registryFile'=>$confFile, 'action'=>$action);
+my $registry = $initializer->getRegistry;
+
+my $query = BioMart::Query->new('registry'=>$registry,'virtualSchemaName'=>'default');
+
+$query->setDataset($species."_gene_ensembl");
+
+$query->addAttribute("ensembl_gene_id");
+$query->addAttribute("ensembl_transcript_id");
+$query->addAttribute("transcript_is_canonical");
+
+$query->formatter("TSV");
+
+my $outfile = "$outdir/ensembl_canonical.txt";
+open(my $fh, '>', $outfile) or die "Cannot open $outfile for writing: $!";
+
+# Duplicate the real STDOUT so it can be restored later; copying the *STDOUT
+# glob would only alias the very handle that is about to be redirected
+open(my $original_stdout, '>&', \*STDOUT) or die "Can't save STDOUT: $!";
+open(STDOUT, '>&', $fh) or die "Can't redirect STDOUT: $!";
+
+my $query_runner = BioMart::QueryRunner->new();
+$query_runner->uniqueRowsOnly(1);
+$query_runner->execute($query);
+
+# printResults() writes to STDOUT, which now points to the output file
+$query_runner->printResults();
+
+# Restore STDOUT, then close the file so delayed write errors are reported
+open(STDOUT, '>&', $original_stdout) or die "Can't restore STDOUT: $!";
+close($fh) or die "Failed to close file: $!";
\ No newline at end of file
diff --git a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl
index 5e3aa9c46a..22b6a825b2 100755
--- a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl
+++ b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl
@@ -16,7 +16,9 @@
####################################################################
## Parsing command line options ####################################
####################################################################
-# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ...
+##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp
+
+# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ...
## Parsing command line
GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo,
@@ -50,8 +52,8 @@
if ($phylo eq "" || $phylo eq "vertebrate") {
print ("In vertebrates section\n");
- if ($species eq "Homo sapiens" && $assembly eq "GRCh38") {
- print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
+ if ($species eq "Homo sapiens" || $species eq "Mus musculus") {
+ print ($species." selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
Bio::EnsEMBL::Registry->load_registry_from_db(
-host => $ENSEMBL_HOST,
-user => $ENSEMBL_USER,
diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl
index 50520f1f92..8ecf3d7c8f 100755
--- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl
+++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl
@@ -17,7 +17,9 @@
####################################################################
## Parsing command line options ####################################
####################################################################
-# USAGE: ./genome_info.pl --species "Homo sapiens" --outfile ../../appl_db/ird_v1/hsa ...
+##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --assembly GRCm39 --outfile /tmp
+
+# USAGE: ./genome_info.pl --species "Homo sapiens" --assembly GRCh38 --outfile ../../appl_db/ird_v1/hsa ...
## Parsing command line
GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'o|outfile=s' => \$outfile, 'phylo=s' => \$phylo,
@@ -29,7 +31,6 @@
if ($outfile eq "") {
$outfile = "/ensembl-data/genome_info.json";
- # $outfile = "/ensembl-data/$species.json";
}
####################################################################
@@ -42,17 +43,13 @@
# Bio::EnsEMBL::Registry->load_all("$ENSEMBL_REGISTRY");
if($phylo eq "" || $phylo eq "vertebrate") {
print ("In vertebrates section\n");
- if ($species eq "Homo sapiens" && $assembly eq "GRCh38") {
- print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n");
- Bio::EnsEMBL::Registry->load_registry_from_db(
- -host => $ENSEMBL_HOST,
- -user => $ENSEMBL_USER,
- -port => $ENSEMBL_PORT,
- -verbose => $verbose
- );
- } else {
- print ("Human selected, assembly ".$assembly." no supported\n");
- }
+ print ("Species: ".$species.", assembly ".$assembly.", connecting to: ".$ENSEMBL_HOST.":".$ENSEMBL_PORT."\n");
+ Bio::EnsEMBL::Registry->load_registry_from_db(
+ -host => $ENSEMBL_HOST,
+ -user => $ENSEMBL_USER,
+ -port => $ENSEMBL_PORT,
+ -verbose => $verbose
+ );
} else {
print ("In no-vertebrates section\n");
Bio::EnsEMBL::Registry->load_registry_from_db(
@@ -64,7 +61,6 @@
my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Slice");
my $karyotype_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "KaryotypeBand");
-# my $gene_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Gene");
####################################################################
my %info_stats = ();
@@ -81,12 +77,10 @@
$chromosome{'start'} = int($chrom->start());
$chromosome{'end'} = int($chrom->end());
$chromosome{'size'} = int($chrom->seq_region_length());
-# $chromosome{'numberGenes'} = scalar @{$chrom->get_all_Genes()};
$chromosome{'isCircular'} = $chrom->is_circular();
my @cytobands = ();
foreach my $cyto(@{$karyotype_adaptor->fetch_all_by_chr_name($chrom->seq_region_name)}) {
-# print $cytoband->name."\n";
my %cytoband = ();
$cytoband{'name'} = $cyto->name();
$cytoband{'start'} = int($cyto->start());
@@ -96,7 +90,7 @@
push(@cytobands, \%cytoband);
}
- ## check if any cytoband has been added
+ ## Check if any cytoband has been added
## If not a unique cytoband covering all chromosome is added.
if(@cytobands == 0) {
my %cytoband = ();
@@ -110,7 +104,6 @@
$chromosome{'cytobands'} = \@cytobands;
push(@chromosomes, \%chromosome);
-# push(@chrom_ids, $chrom->seq_region_name);
}
$info_stats{'chromosomes'} = \@chromosomes;
@@ -124,7 +117,6 @@
$supercontig{'start'} = int($supercon->start());
$supercontig{'end'} = int($supercon->end());
$supercontig{'size'} = int($supercon->seq_region_length());
-# $supercontig{'numberGenes'} = scalar @{$supercon->get_all_Genes()};
$supercontig{'isCircular'} = $supercon->is_circular();
## Adding an unique cytoband covering all chromosome is added.
@@ -151,7 +143,7 @@
sub print_parameters {
print "Parameters: ";
- print "species: $species, outfile: $outfile, ";
+ print "species: $species, assembly: $assembly, outfile: $outfile, ";
print "ensembl-registry: $ENSEMBL_REGISTRY, ";
print "ensembl-host: $ENSEMBL_HOST, ensembl-port: $ENSEMBL_PORT, ";
print "ensembl-user: $ENSEMBL_USER, verbose: $verbose, help: $help";
diff --git a/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml
new file mode 100644
index 0000000000..a710368f8f
--- /dev/null
+++ b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java
index 088db087f0..a71663f19f 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java
@@ -66,8 +66,8 @@ public class CommonCommandOptions {
description = "Set the logging level, accepted values are: debug, info, warn, error and fatal")
public String logLevel = "info";
- @Parameter(names = {"-C", "--config"}, arity = 1,
- description = "Path to CellBase configuration.yml file")
+ @Deprecated
+ @Parameter(names = {"-C", "--config"}, arity = 1, hidden = true, description = "Path to CellBase configuration.yml file")
public String conf;
}
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java
index 39018bf170..64dcc05bfb 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java
@@ -35,18 +35,12 @@
import java.nio.file.Path;
import java.nio.file.Paths;
-/**
- * Created by imedina on 03/02/15.
- */
+
public abstract class CommandExecutor {
protected String logLevel;
-// protected boolean verbose;
protected String conf;
- @Deprecated
- protected String configFile;
-
protected String appHome;
protected CellBaseConfiguration configuration;
@@ -55,35 +49,13 @@ public abstract class CommandExecutor {
protected Logger logger;
public CommandExecutor() {
-
}
public CommandExecutor(String logLevel, String conf) {
this.logLevel = logLevel;
this.conf = conf;
- /**
- * System property 'app.home' is set up by cellbase.sh. If by any reason this is null
- * then CELLBASE_HOME environment variable is used instead.
- */
- this.appHome = System.getProperty("app.home", System.getenv("CELLBASE_HOME"));
-
- if (StringUtils.isEmpty(conf)) {
- this.conf = this.appHome + "/conf";
- }
-
- if (logLevel != null && !logLevel.isEmpty()) {
- // We must call to this method
- setLogLevel(logLevel);
- }
- }
-
- public CommandExecutor(String logLevel, boolean verbose, String conf) {
- this.logLevel = logLevel;
-// this.verbose = verbose;
- this.conf = conf;
-
- /**
+ /*
* System property 'app.home' is set up by cellbase.sh. If by any reason this is null
* then CELLBASE_HOME environment variable is used instead.
*/
@@ -124,29 +96,16 @@ public void setLogLevel(String logLevel) {
this.logLevel = logLevel;
}
-// public boolean isVerbose() {
-// return verbose;
-// }
-//
-// public void setVerbose(boolean verbose) {
-// this.verbose = verbose;
-// }
-
- public String getConfigFile() {
- return configFile;
- }
-
- public void setConfigFile(String configFile) {
- this.configFile = configFile;
- }
-
public Logger getLogger() {
return logger;
}
- /*
+ /**
* This method attempts to first data configuration from CLI parameter, if not present then uses
* the configuration from installation directory, if not exists then loads JAR configuration.json or yml.
+ *
+ * @throws URISyntaxException If any URI problem occurs
+ * @throws IOException If any IO problem occurs
*/
public void loadCellBaseConfiguration() throws URISyntaxException, IOException {
Path confPath = Paths.get(this.conf);
@@ -154,11 +113,13 @@ public void loadCellBaseConfiguration() throws URISyntaxException, IOException {
if (Files.exists(confPath.resolve("configuration.json"))) {
logger.debug("Loading configuration from '{}'", confPath.resolve("configuration.json").toAbsolutePath());
- this.configuration = CellBaseConfiguration.load(new FileInputStream(confPath.resolve("configuration.json").toFile()),
- CellBaseConfiguration.ConfigurationFileFormat.JSON);
+ this.configuration = CellBaseConfiguration
+ .load(Files.newInputStream(confPath.resolve("configuration.json").toFile().toPath()),
+ CellBaseConfiguration.ConfigurationFileFormat.JSON);
} else if (Files.exists(Paths.get(this.appHome + "/conf/configuration.yml"))) {
logger.debug("Loading configuration from '{}'", this.appHome + "/conf/configuration.yml");
- this.configuration = CellBaseConfiguration.load(new FileInputStream(new File(this.appHome + "/conf/configuration.yml")));
+ this.configuration = CellBaseConfiguration
+ .load(Files.newInputStream(new File(this.appHome + "/conf/configuration.yml").toPath()));
} else {
InputStream inputStream = CellBaseConfiguration.class.getClassLoader().getResourceAsStream("conf/configuration.json");
String configurationFilePath = "conf/configuration.json";
@@ -198,10 +159,4 @@ public void loadClientConfiguration() throws IOException {
}
}
}
-
- protected void makeDir(Path folderPath) throws IOException {
- if (!Files.exists(folderPath)) {
- Files.createDirectories(folderPath);
- }
- }
}
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
index 4a5f2c085f..ec1d8503de 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
@@ -19,15 +19,12 @@
import com.beust.jcommander.*;
import org.opencb.cellbase.app.cli.CliOptionsParser;
import org.opencb.cellbase.core.api.key.ApiKeyQuota;
-import org.opencb.cellbase.lib.EtlCommons;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-/**
- * Created by imedina on 03/02/15.
- */
+
public class AdminCliOptionsParser extends CliOptionsParser {
private final CommonCommandOptions commonCommandOptions;
@@ -35,13 +32,13 @@ public class AdminCliOptionsParser extends CliOptionsParser {
private DownloadCommandOptions downloadCommandOptions;
private BuildCommandOptions buildCommandOptions;
+ private DataListCommandOptions dataListCommandOptions;
private DataReleaseCommandOptions dataReleaseCommandOptions;
private ApiKeyCommandOptions apiKeyCommandOptions;
private LoadCommandOptions loadCommandOptions;
private ExportCommandOptions exportCommandOptions;
private CustomiseCommandOptions customiseCommandOptions;
private IndexCommandOptions indexCommandOptions;
- private InstallCommandOptions installCommandOptions;
private ServerCommandOptions serverCommandOptions;
private ValidationCommandOptions validationCommandOptions;
@@ -52,25 +49,25 @@ public AdminCliOptionsParser() {
downloadCommandOptions = new DownloadCommandOptions();
buildCommandOptions = new BuildCommandOptions();
+ dataListCommandOptions = new DataListCommandOptions();
dataReleaseCommandOptions = new DataReleaseCommandOptions();
apiKeyCommandOptions = new ApiKeyCommandOptions();
loadCommandOptions = new LoadCommandOptions();
exportCommandOptions = new ExportCommandOptions();
customiseCommandOptions = new CustomiseCommandOptions();
indexCommandOptions = new IndexCommandOptions();
- installCommandOptions = new InstallCommandOptions();
serverCommandOptions = new ServerCommandOptions();
validationCommandOptions = new ValidationCommandOptions();
jCommander.addCommand("download", downloadCommandOptions);
jCommander.addCommand("build", buildCommandOptions);
+ jCommander.addCommand("data-list", dataListCommandOptions);
jCommander.addCommand("data-release", dataReleaseCommandOptions);
jCommander.addCommand("api-key", apiKeyCommandOptions);
jCommander.addCommand("load", loadCommandOptions);
jCommander.addCommand("export", exportCommandOptions);
jCommander.addCommand("customise", customiseCommandOptions);
jCommander.addCommand("index", indexCommandOptions);
- jCommander.addCommand("install", installCommandOptions);
jCommander.addCommand("server", serverCommandOptions);
jCommander.addCommand("validate", validationCommandOptions);
}
@@ -80,7 +77,8 @@ public void parse(String[] args) throws ParameterException {
jCommander.parse(args);
}
- @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml file")
+ @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml"
+ + " file")
public class DownloadCommandOptions {
@ParametersDelegate
@@ -89,16 +87,13 @@ public class DownloadCommandOptions {
@ParametersDelegate
public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions;
- @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download:"
- + EtlCommons.GENOME_DATA + ", " + EtlCommons.GENE_DATA + ", " + EtlCommons.VARIATION_DATA + ", "
- + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA + ", " + EtlCommons.MISSENSE_VARIATION_SCORE_DATA + ", "
- + EtlCommons.REGULATION_DATA + ", " + EtlCommons.PROTEIN_DATA + ", " + EtlCommons.CONSERVATION_DATA + ", "
- + EtlCommons.CLINICAL_VARIANTS_DATA + ", " + EtlCommons.REPEATS_DATA + ", " + EtlCommons.OBO_DATA + ", "
- + EtlCommons.PUBMED_DATA + ", " + EtlCommons.PHARMACOGENOMICS_DATA + "; and 'all' to download everything",
- required = true, arity = 1)
+ @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download, it depends on the species; use the"
+ + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to download"
+ + " everything", required = true, arity = 1)
public String data;
- @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1)
+ @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true,
+ arity = 1)
public String outputDirectory;
}
@@ -108,18 +103,21 @@ public class BuildCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;
- @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, "
- + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, "
- + "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1)
+ @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build, it depends on the species; use the"
+ + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to build"
+ + " everything", required = true, arity = 1)
public String data;
- @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1)
+ @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or"
+ + " 'hsapiens'", arity = 1)
public String species = "Homo sapiens";
- @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml will be used", required = false, arity = 1)
+ @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml"
+ + " will be used", arity = 1)
public String assembly;
- @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1)
+ @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true,
+ arity = 1)
public String outputDirectory;
@Parameter(names = {"--skip-normalize"}, description = "Skip normalization of clinical variants. Normalization"
@@ -137,6 +135,17 @@ public class BuildCommandOptions {
}
+ @Parameters(commandNames = {"data-list"}, commandDescription = "List the data supported by the given species")
+ public class DataListCommandOptions {
+
+ @ParametersDelegate
+ public CommonCommandOptions commonOptions = commonCommandOptions;
+
+ @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens'"
+ + " or 'hsapiens'", arity = 1)
+ public String species = "Homo sapiens";
+ }
+
@Parameters(commandNames = {"data-release"}, commandDescription = "Manage data releases in order to support multiple versions of data")
public class DataReleaseCommandOptions {
@@ -155,11 +164,13 @@ public class DataReleaseCommandOptions {
@Parameter(names = {"--update"}, description = "Data release to be updated by adding CellBase vesions", arity = 1)
public int update;
- @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. This parameter has to be used together to the parameter --update", arity = 1)
+ @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. This parameter has"
+ + " to be used together to the parameter --update", arity = 1)
public String versions;
}
- @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources and set quota")
+ @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources"
+ + " and set quota")
public class ApiKeyCommandOptions {
@ParametersDelegate
@@ -168,9 +179,9 @@ public class ApiKeyCommandOptions {
@Parameter(names = {"--create-api-key"}, description = "Create an API key", arity = 0)
public boolean createApiKey;
- @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to specify the"
- + " licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. e.g.:"
- + " cosmic:31/01/2025,hgmd", arity = 1)
+ @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to"
+ +" specify the licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. e.g.:"
+ + " spliceai:31/01/2025,hgmd", arity = 1)
public String dataSources;
@Parameter(names = {"--expiration"}, description = "Use this parameter in conjunction with --create-api-key to specify the"
@@ -195,9 +206,9 @@ public class LoadCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;
- @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation,"
- + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics."
- + " 'all' loads everything", required = true, arity = 1)
+ @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to load, it depends on the species; use the"
+ + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to load"
+ + " everything", required = true, arity = 1)
public String data;
@Parameter(names = {"-i", "--input"}, required = true, arity = 1,
@@ -242,9 +253,9 @@ public class ExportCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;
- @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, "
- + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' "
- + " loads everything", required = true, arity = 1)
+ @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to export, it depends on the species; use the"
+ + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to export"
+ + " everything", required = true, arity = 1)
public String data;
@Parameter(names = {"--db", "--database"}, description = "Database name, e.g., cellbase_hsapiens_grch38_v5", required = true,
@@ -304,10 +315,9 @@ public class IndexCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;
- @Parameter(names = {"-d", "--data"}, description = "Data model type to be indexed: genome, gene, variation, "
- + "regulation, protein, ontology, clinical_variants, repeats, refseq and missense_variation_functional_score. 'all' "
- + "indexes everything", required = true,
- arity = 1)
+ @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to index, it depends on the species; use the"
+ + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to index"
+ + " everything", required = true, arity = 1)
public String data;
@Parameter(names = {"--db", "--database"}, description = "Database name.", required = true, arity = 1)
@@ -321,16 +331,6 @@ public class IndexCommandOptions {
public boolean validate;
}
- @Parameters(commandNames = {"install"}, commandDescription = "Set up sharding for CellBase")
- public class InstallCommandOptions {
-
- @ParametersDelegate
- public CommonCommandOptions commonOptions = commonCommandOptions;
-
- @ParametersDelegate
- public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions;
- }
-
@Parameters(commandNames = {"server"}, commandDescription = "Manage REST server")
public class ServerCommandOptions {
@@ -353,16 +353,20 @@ public class ValidationCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;
- @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens' or 'hsapiens'", arity = 1)
+ @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens'"
+ + " or 'hsapiens'", arity = 1)
public String species = "Homo sapiens";
- @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json will be used", required = false, arity = 1)
+ @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json"
+ + " will be used", arity = 1)
public String assembly = "GRCh38";
- @Parameter(names = {"--data-release"}, description = "Data release. To use the default data release, please, set this parameter to 0", required = false, arity = 1)
+ @Parameter(names = {"--data-release"}, description = "Data release. To use the default data release, please, set this parameter"
+ + " to 0", arity = 1)
public int dataRelease = 0;
- @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as COSMIC or HGMD", required = false, arity = 1)
+ @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as SpliceAI or"
+ + " HGMD", arity = 1)
public String apiKey;
@Parameter(names = {"-i", "--input-file"}, description = "Full path to VCF", required = true, arity = 1)
@@ -371,8 +375,7 @@ public class ValidationCommandOptions {
@Parameter(names = {"-V", "--vep-file"}, description = "Full path to VEP annotation JSON file", required = true, arity = 1)
public String vepFile;
- @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", required = false,
- arity = 1)
+ @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", arity = 1)
public String outputDirectory = "/tmp";
@Parameter(names = {"-t", "--type"}, description = "Which type to analyse: 'Protein', 'Transcript' or 'Both'", required =
@@ -410,6 +413,10 @@ public BuildCommandOptions getBuildCommandOptions() {
return buildCommandOptions;
}
+ public DataListCommandOptions getDataListCommandOptions() {
+ return dataListCommandOptions;
+ }
+
public DataReleaseCommandOptions getDataReleaseCommandOptions() {
return dataReleaseCommandOptions;
}
@@ -424,8 +431,6 @@ public IndexCommandOptions getIndexCommandOptions() {
return indexCommandOptions;
}
- public InstallCommandOptions getInstallCommandOptions() { return installCommandOptions; }
-
public ServerCommandOptions getServerCommandOptions() { return serverCommandOptions; }
public ValidationCommandOptions getValidationCommandOptions() { return validationCommandOptions; }
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java
index 10c43d637c..d46d32709f 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java
@@ -25,9 +25,7 @@
import java.io.IOException;
import java.net.URISyntaxException;
-/**
- * Created by imedina on 03/02/15.
- */
+
public class AdminMain {
public static void main(String[] args) {
@@ -63,30 +61,30 @@ public static void main(String[] args) {
case "build":
commandExecutor = new BuildCommandExecutor(cliOptionsParser.getBuildCommandOptions());
break;
+ case "load":
+ commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions());
+ break;
+ case "data-list":
+ commandExecutor = new DataListCommandExecutor(cliOptionsParser.getDataListCommandOptions());
+ break;
case "data-release":
commandExecutor = new DataReleaseCommandExecutor(cliOptionsParser.getDataReleaseCommandOptions());
break;
case "api-key":
commandExecutor = new ApiKeyCommandExecutor(cliOptionsParser.getApiKeyCommandOptions());
break;
- case "load":
- commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions());
- break;
case "export":
commandExecutor = new ExportCommandExecutor(cliOptionsParser.getExportCommandOptions());
break;
case "index":
commandExecutor = new IndexCommandExecutor(cliOptionsParser.getIndexCommandOptions());
break;
- case "install":
- commandExecutor = new InstallCommandExecutor(cliOptionsParser.getInstallCommandOptions());
+ case "validate":
+ commandExecutor = new ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions());
break;
case "server":
commandExecutor = new ServerCommandExecutor(cliOptionsParser.getServerCommandOptions());
break;
- case "validate":
- commandExecutor = new ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions());
- break;
default:
break;
}
@@ -98,10 +96,10 @@ public static void main(String[] args) {
commandExecutor.execute();
} catch (IOException | URISyntaxException | CellBaseException e) {
commandExecutor.getLogger().error("Error: " + e.getMessage());
+ e.printStackTrace();
System.exit(1);
}
}
}
}
-
}
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
index 16db1f82bc..542cc3e129 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
@@ -17,11 +17,14 @@
package org.opencb.cellbase.app.cli.admin.executors;
import com.beust.jcommander.ParameterException;
-import org.apache.commons.lang.StringUtils;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectReader;
+import org.apache.commons.lang3.StringUtils;
import org.opencb.cellbase.app.cli.CommandExecutor;
import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser;
import org.opencb.cellbase.core.config.SpeciesConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
+import org.opencb.cellbase.core.models.DataSource;
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
@@ -31,59 +34,74 @@
import org.opencb.cellbase.lib.builders.*;
import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder;
-import java.io.File;
import java.io.IOException;
import java.nio.file.*;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.opencb.cellbase.lib.EtlCommons.*;
+import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE;
+import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE;
+import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME;
+import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME;
+import static org.opencb.cellbase.lib.builders.OntologyBuilder.OBO_OUTPUT_BASENAME;
+import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME;
+import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME;
+import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*;
+import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME;
+import static org.opencb.cellbase.lib.builders.VariationBuilder.VARIATION_CHR_PREFIX;
+import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME;
-/**
- * Created by imedina on 03/02/15.
- */
public class BuildCommandExecutor extends CommandExecutor {
- private AdminCliOptionsParser.BuildCommandOptions buildCommandOptions;
- private Path output;
- private Path buildFolder = null; //