diff --git a/.github/workflows/task.yml b/.github/workflows/task.yml index ac09089acb..f71e27a41d 100644 --- a/.github/workflows/task.yml +++ b/.github/workflows/task.yml @@ -21,5 +21,5 @@ jobs: uses: opencb/java-common-libs/.github/workflows/deploy-docker-hub-workflow.yml@develop needs: test with: - cli: python3 ./build/cloud/docker/docker-build.py push --images base --tag ${{ github.ref_name }} + cli: python3 ./build/cloud/docker/docker-build.py push --images base,builder --tag ${{ github.ref_name }} secrets: inherit diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index 6e1657d1bf..bcb2de9cb8 100644 --- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -11,7 +11,7 @@ LABEL org.label-schema.vendor="OpenCB" \ ## We need to be root to install dependencies USER root RUN apt-get update -y && \ - apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libdbd-mysql-perl libtry-tiny-perl && \ + apt-get install -y git default-mysql-client libjson-perl libdbi-perl libdbd-mysql-perl libtry-tiny-perl libxml-simple-perl liblog-log4perl-perl libxml-parser-perl libxml-dom-perl && \ mkdir /opt/ensembl && chown cellbase:cellbase /opt/ensembl && \ rm -rf /var/lib/apt/lists/* @@ -26,6 +26,10 @@ RUN cd /opt/ensembl && \ git clone https://github.com/Ensembl/ensembl-variation.git && \ git clone https://github.com/Ensembl/ensembl-funcgen.git && \ git clone https://github.com/Ensembl/ensembl-compara.git && \ - git clone https://github.com/Ensembl/ensembl-io.git + git clone https://github.com/Ensembl/ensembl-io.git && \ + git clone --branch cvs/release-0_7 https://github.com/biomart/biomart-perl -ENV 
PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts +## Give writing permissions to allow the script ensembl_canonical.pl to create sub-folder for cache purposes +RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/ + +ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index 70865465e9..90f2f8208e 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -134,16 +134,16 @@ our $ENSEMBL_GENOMES_PORT = "4157"; our $ENSEMBL_GENOMES_USER = "anonymous"; ## Vertebrates -our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38"; -our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38"; -our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38"; -our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38"; +our $HOMO_SAPIENS_CORE = "homo_sapiens_core_111_38"; +our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_111_38"; +our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_111_38"; +our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; -our $MUS_MUSCULUS_CORE = "mus_musculus_core_78_38"; -our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_78_38"; -our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_78_38"; +our $MUS_MUSCULUS_CORE = "mus_musculus_core_111_39"; 
+our $MUS_MUSCULUS_VARIATION = "mus_musculus_variation_111_39"; +our $MUS_MUSCULUS_FUNCTIONAL = "mus_musculus_funcgen_111_39"; our $RATTUS_NORVEGICUS_CORE = "rattus_norvegicus_core_78_5"; our $RATTUS_NORVEGICUS_VARIATION = "rattus_norvegicus_variation_78_5"; our $RATTUS_NORVEGICUS_FUNCTIONAL = "rattus_norvegicus_funcgen_78_5"; diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl new file mode 100755 index 0000000000..bed648e2d0 --- /dev/null +++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl @@ -0,0 +1,61 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; +use Data::Dumper; +use JSON; +use DB_CONFIG; + +use BioMart::Initializer; +use BioMart::Query; +use BioMart::QueryRunner; + +## Default values +my $species = 'hsapiens'; +my $outdir = "./"; + +## Parsing command line +GetOptions ('species=s' => \$species, 'outdir=s' => \$outdir); + + +my $confFile = "/opt/cellbase/scripts/ensembl-scripts/martURLLocation.xml"; + +# NB: change action to 'clean' if you wish to start a fresh configuration +# and to 'cached' if you want to skip configuration step on subsequent runs from the same registry +my $action='clean'; +my $initializer = BioMart::Initializer->new('registryFile'=>$confFile, 'action'=>$action); +my $registry = $initializer->getRegistry; + +my $query = BioMart::Query->new('registry'=>$registry,'virtualSchemaName'=>'default'); + +$query->setDataset($species."_gene_ensembl"); + +$query->addAttribute("ensembl_gene_id"); +$query->addAttribute("ensembl_transcript_id"); +$query->addAttribute("transcript_is_canonical"); + +$query->formatter("TSV"); + +# Open the file for writing +open(my $fh, '>', "$outdir/ensembl_canonical.txt") or die "Cannot open ensembl_canonical.txt file: $!"; + +# Save the original stdout +my $original_stdout = *STDOUT; +open(STDOUT, '>&', $fh) or die "Can't redirect STDOUT: $!"; + +my $query_runner = BioMart::QueryRunner->new(); + +# to 
obtain unique rows only +$query_runner->uniqueRowsOnly(1); +$query_runner->execute($query); +#$query_runner->printHeader(); +#print ENSEMBL_CANONICAL $query_runner->printResults(); +# Call printResults which prints to STDOUT (now redirected to the file) +$query_runner->printResults(); +#$query_runner->printFooter(); + +# Restore the original stdout +open(STDOUT, '>&', $original_stdout) or die "Can't restore STDOUT: $!"; + +# Close the filehandle +close($fh) or die "Failed to close file: $!"; \ No newline at end of file diff --git a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl index 5e3aa9c46a..22b6a825b2 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl @@ -16,7 +16,9 @@ #################################################################### ## Parsing command line options #################################### #################################################################### -# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ... +##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp + +# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ... ## Parsing command line GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo, @@ -50,8 +52,8 @@ if ($phylo eq "" || $phylo eq "vertebrate") { print ("In vertebrates section\n"); - if ($species eq "Homo sapiens" && $assembly eq "GRCh38") { - print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n"); + if ($species eq "Homo sapiens" || $species eq "Mus musculus") { + print ($species." selected, assembly ".$assembly." 
selected, connecting to port ".$ENSEMBL_PORT."\n"); Bio::EnsEMBL::Registry->load_registry_from_db( -host => $ENSEMBL_HOST, -user => $ENSEMBL_USER, diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index 50520f1f92..8ecf3d7c8f 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -17,7 +17,9 @@ #################################################################### ## Parsing command line options #################################### #################################################################### -# USAGE: ./genome_info.pl --species "Homo sapiens" --outfile ../../appl_db/ird_v1/hsa ... +##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/genome_info.pl --species "Mus musculus" --assembly GRCm39 --outfile /tmp + +# USAGE: ./genome_info.pl --species "Homo sapiens" --assembly GRCh38 --outfile ../../appl_db/ird_v1/hsa ... ## Parsing command line GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'o|outfile=s' => \$outfile, 'phylo=s' => \$phylo, @@ -29,7 +31,6 @@ if ($outfile eq "") { $outfile = "/ensembl-data/genome_info.json"; - # $outfile = "/ensembl-data/$species.json"; } #################################################################### @@ -42,17 +43,13 @@ # Bio::EnsEMBL::Registry->load_all("$ENSEMBL_REGISTRY"); if($phylo eq "" || $phylo eq "vertebrate") { print ("In vertebrates section\n"); - if ($species eq "Homo sapiens" && $assembly eq "GRCh38") { - print ("Human selected, assembly ".$assembly." selected, connecting to port ".$ENSEMBL_PORT."\n"); - Bio::EnsEMBL::Registry->load_registry_from_db( - -host => $ENSEMBL_HOST, - -user => $ENSEMBL_USER, - -port => $ENSEMBL_PORT, - -verbose => $verbose - ); - } else { - print ("Human selected, assembly ".$assembly." 
no supported\n"); - } + print ("Species: ".$species.", assembly ".$assembly.", connecting to: ".$ENSEMBL_HOST.":".$ENSEMBL_PORT."\n"); + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => $ENSEMBL_HOST, + -user => $ENSEMBL_USER, + -port => $ENSEMBL_PORT, + -verbose => $verbose + ); } else { print ("In no-vertebrates section\n"); Bio::EnsEMBL::Registry->load_registry_from_db( @@ -64,7 +61,6 @@ my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Slice"); my $karyotype_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "KaryotypeBand"); -# my $gene_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, "core", "Gene"); #################################################################### my %info_stats = (); @@ -81,12 +77,10 @@ $chromosome{'start'} = int($chrom->start()); $chromosome{'end'} = int($chrom->end()); $chromosome{'size'} = int($chrom->seq_region_length()); -# $chromosome{'numberGenes'} = scalar @{$chrom->get_all_Genes()}; $chromosome{'isCircular'} = $chrom->is_circular(); my @cytobands = (); foreach my $cyto(@{$karyotype_adaptor->fetch_all_by_chr_name($chrom->seq_region_name)}) { -# print $cytoband->name."\n"; my %cytoband = (); $cytoband{'name'} = $cyto->name(); $cytoband{'start'} = int($cyto->start()); @@ -96,7 +90,7 @@ push(@cytobands, \%cytoband); } - ## check if any cytoband has been added + ## Check if any cytoband has been added ## If not a unique cytoband covering all chromosome is added. 
if(@cytobands == 0) { my %cytoband = (); @@ -110,7 +104,6 @@ $chromosome{'cytobands'} = \@cytobands; push(@chromosomes, \%chromosome); -# push(@chrom_ids, $chrom->seq_region_name); } $info_stats{'chromosomes'} = \@chromosomes; @@ -124,7 +117,6 @@ $supercontig{'start'} = int($supercon->start()); $supercontig{'end'} = int($supercon->end()); $supercontig{'size'} = int($supercon->seq_region_length()); -# $supercontig{'numberGenes'} = scalar @{$supercon->get_all_Genes()}; $supercontig{'isCircular'} = $supercon->is_circular(); ## Adding an unique cytoband covering all chromosome is added. @@ -151,7 +143,7 @@ sub print_parameters { print "Parameters: "; - print "species: $species, outfile: $outfile, "; + print "species: $species, assembly: $assembly, outfile: $outfile, "; print "ensembl-registry: $ENSEMBL_REGISTRY, "; print "ensembl-host: $ENSEMBL_HOST, ensembl-port: $ENSEMBL_PORT, "; print "ensembl-user: $ENSEMBL_USER, verbose: $verbose, help: $help"; diff --git a/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml new file mode 100644 index 0000000000..a710368f8f --- /dev/null +++ b/cellbase-app/app/scripts/ensembl-scripts/martURLLocation.xml @@ -0,0 +1,19 @@ + + + + + \ No newline at end of file diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java index 088db087f0..a71663f19f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CliOptionsParser.java @@ -66,8 +66,8 @@ public class CommonCommandOptions { description = "Set the logging level, accepted values are: debug, info, warn, error and fatal") public String logLevel = "info"; - @Parameter(names = {"-C", "--config"}, arity = 1, - description = "Path to CellBase configuration.yml file") + @Deprecated + @Parameter(names = {"-C", 
"--config"}, arity = 1, hidden = true, description = "Path to CellBase configuration.yml file") public String conf; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java index 39018bf170..64dcc05bfb 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/CommandExecutor.java @@ -35,18 +35,12 @@ import java.nio.file.Path; import java.nio.file.Paths; -/** - * Created by imedina on 03/02/15. - */ + public abstract class CommandExecutor { protected String logLevel; -// protected boolean verbose; protected String conf; - @Deprecated - protected String configFile; - protected String appHome; protected CellBaseConfiguration configuration; @@ -55,35 +49,13 @@ public abstract class CommandExecutor { protected Logger logger; public CommandExecutor() { - } public CommandExecutor(String logLevel, String conf) { this.logLevel = logLevel; this.conf = conf; - /** - * System property 'app.home' is set up by cellbase.sh. If by any reason this is null - * then CELLBASE_HOME environment variable is used instead. - */ - this.appHome = System.getProperty("app.home", System.getenv("CELLBASE_HOME")); - - if (StringUtils.isEmpty(conf)) { - this.conf = this.appHome + "/conf"; - } - - if (logLevel != null && !logLevel.isEmpty()) { - // We must call to this method - setLogLevel(logLevel); - } - } - - public CommandExecutor(String logLevel, boolean verbose, String conf) { - this.logLevel = logLevel; -// this.verbose = verbose; - this.conf = conf; - - /** + /* * System property 'app.home' is set up by cellbase.sh. If by any reason this is null * then CELLBASE_HOME environment variable is used instead. 
*/ @@ -124,29 +96,16 @@ public void setLogLevel(String logLevel) { this.logLevel = logLevel; } -// public boolean isVerbose() { -// return verbose; -// } -// -// public void setVerbose(boolean verbose) { -// this.verbose = verbose; -// } - - public String getConfigFile() { - return configFile; - } - - public void setConfigFile(String configFile) { - this.configFile = configFile; - } - public Logger getLogger() { return logger; } - /* + /** * This method attempts to first data configuration from CLI parameter, if not present then uses * the configuration from installation directory, if not exists then loads JAR configuration.json or yml. + * + * @throws URISyntaxException If any URI problem occurs + * @throws IOException If any IO problem occurs */ public void loadCellBaseConfiguration() throws URISyntaxException, IOException { Path confPath = Paths.get(this.conf); @@ -154,11 +113,13 @@ public void loadCellBaseConfiguration() throws URISyntaxException, IOException { if (Files.exists(confPath.resolve("configuration.json"))) { logger.debug("Loading configuration from '{}'", confPath.resolve("configuration.json").toAbsolutePath()); - this.configuration = CellBaseConfiguration.load(new FileInputStream(confPath.resolve("configuration.json").toFile()), - CellBaseConfiguration.ConfigurationFileFormat.JSON); + this.configuration = CellBaseConfiguration + .load(Files.newInputStream(confPath.resolve("configuration.json").toFile().toPath()), + CellBaseConfiguration.ConfigurationFileFormat.JSON); } else if (Files.exists(Paths.get(this.appHome + "/conf/configuration.yml"))) { logger.debug("Loading configuration from '{}'", this.appHome + "/conf/configuration.yml"); - this.configuration = CellBaseConfiguration.load(new FileInputStream(new File(this.appHome + "/conf/configuration.yml"))); + this.configuration = CellBaseConfiguration + .load(Files.newInputStream(new File(this.appHome + "/conf/configuration.yml").toPath())); } else { InputStream inputStream = 
CellBaseConfiguration.class.getClassLoader().getResourceAsStream("conf/configuration.json"); String configurationFilePath = "conf/configuration.json"; @@ -198,10 +159,4 @@ public void loadClientConfiguration() throws IOException { } } } - - protected void makeDir(Path folderPath) throws IOException { - if (!Files.exists(folderPath)) { - Files.createDirectories(folderPath); - } - } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 4a5f2c085f..ec1d8503de 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -19,15 +19,12 @@ import com.beust.jcommander.*; import org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; -import org.opencb.cellbase.lib.EtlCommons; import java.util.HashMap; import java.util.List; import java.util.Map; -/** - * Created by imedina on 03/02/15. 
- */ + public class AdminCliOptionsParser extends CliOptionsParser { private final CommonCommandOptions commonCommandOptions; @@ -35,13 +32,13 @@ public class AdminCliOptionsParser extends CliOptionsParser { private DownloadCommandOptions downloadCommandOptions; private BuildCommandOptions buildCommandOptions; + private DataListCommandOptions dataListCommandOptions; private DataReleaseCommandOptions dataReleaseCommandOptions; private ApiKeyCommandOptions apiKeyCommandOptions; private LoadCommandOptions loadCommandOptions; private ExportCommandOptions exportCommandOptions; private CustomiseCommandOptions customiseCommandOptions; private IndexCommandOptions indexCommandOptions; - private InstallCommandOptions installCommandOptions; private ServerCommandOptions serverCommandOptions; private ValidationCommandOptions validationCommandOptions; @@ -52,25 +49,25 @@ public AdminCliOptionsParser() { downloadCommandOptions = new DownloadCommandOptions(); buildCommandOptions = new BuildCommandOptions(); + dataListCommandOptions = new DataListCommandOptions(); dataReleaseCommandOptions = new DataReleaseCommandOptions(); apiKeyCommandOptions = new ApiKeyCommandOptions(); loadCommandOptions = new LoadCommandOptions(); exportCommandOptions = new ExportCommandOptions(); customiseCommandOptions = new CustomiseCommandOptions(); indexCommandOptions = new IndexCommandOptions(); - installCommandOptions = new InstallCommandOptions(); serverCommandOptions = new ServerCommandOptions(); validationCommandOptions = new ValidationCommandOptions(); jCommander.addCommand("download", downloadCommandOptions); jCommander.addCommand("build", buildCommandOptions); + jCommander.addCommand("data-list", dataListCommandOptions); jCommander.addCommand("data-release", dataReleaseCommandOptions); jCommander.addCommand("api-key", apiKeyCommandOptions); jCommander.addCommand("load", loadCommandOptions); jCommander.addCommand("export", exportCommandOptions); jCommander.addCommand("customise", 
customiseCommandOptions); jCommander.addCommand("index", indexCommandOptions); - jCommander.addCommand("install", installCommandOptions); jCommander.addCommand("server", serverCommandOptions); jCommander.addCommand("validate", validationCommandOptions); } @@ -80,7 +77,8 @@ public void parse(String[] args) throws ParameterException { jCommander.parse(args); } - @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml file") + @Parameters(commandNames = {"download"}, commandDescription = "Download all different data sources provided in the configuration.yml" + + " file") public class DownloadCommandOptions { @ParametersDelegate @@ -89,16 +87,13 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download:" - + EtlCommons.GENOME_DATA + ", " + EtlCommons.GENE_DATA + ", " + EtlCommons.VARIATION_DATA + ", " - + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA + ", " + EtlCommons.MISSENSE_VARIATION_SCORE_DATA + ", " - + EtlCommons.REGULATION_DATA + ", " + EtlCommons.PROTEIN_DATA + ", " + EtlCommons.CONSERVATION_DATA + ", " - + EtlCommons.CLINICAL_VARIANTS_DATA + ", " + EtlCommons.REPEATS_DATA + ", " + EtlCommons.OBO_DATA + ", " - + EtlCommons.PUBMED_DATA + ", " + EtlCommons.PHARMACOGENOMICS_DATA + "; and 'all' to download everything", - required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to download" + + " everything", required = true, arity = 1) public String data; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) 
+ @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; } @@ -108,18 +103,21 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " - + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to build" + + " everything", required = true, arity = 1) public String data; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) + @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or" + + " 'hsapiens'", arity = 1) public String species = "Homo sapiens"; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml will be used", required = false, arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml" + + " will be used", arity = 1) public String assembly; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) + @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; 
@Parameter(names = {"--skip-normalize"}, description = "Skip normalization of clinical variants. Normalization" @@ -137,6 +135,17 @@ public class BuildCommandOptions { } + @Parameters(commandNames = {"data-list"}, commandDescription = "List the data supported by the given species") + public class DataListCommandOptions { + + @ParametersDelegate + public CommonCommandOptions commonOptions = commonCommandOptions; + + @Parameter(names = {"-s", "--species"}, description = "Name of the species to list the data, valid formats include 'Homo sapiens'" + + " or 'hsapiens'", arity = 1) + public String species = "Homo sapiens"; + } + @Parameters(commandNames = {"data-release"}, commandDescription = "Manage data releases in order to support multiple versions of data") public class DataReleaseCommandOptions { @@ -155,11 +164,13 @@ public class DataReleaseCommandOptions { @Parameter(names = {"--update"}, description = "Data release to be updated by adding CellBase vesions", arity = 1) public int update; - @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. This parameter has to be used together to the parameter --update", arity = 1) + @Parameter(names = {"--add-versions"}, description = "CellBase versions separated by commas, e.g.: v5.2,v5.3. 
This parameter has" + + " to be used together to the parameter --update", arity = 1) public String versions; } - @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources and set quota") + @Parameters(commandNames = {"api-key"}, commandDescription = "Manage API keys in order to access to restricted/licensed data sources" + + " and set quota") public class ApiKeyCommandOptions { @ParametersDelegate @@ -168,9 +179,9 @@ public class ApiKeyCommandOptions { @Parameter(names = {"--create-api-key"}, description = "Create an API key", arity = 0) public boolean createApiKey; - @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to specify the" - + " licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. e.g.:" - + " cosmic:31/01/2025,hgmd", arity = 1) + @Parameter(names = {"--licensed-data-sources"}, description = "Use this parameter in conjunction with --create-api-key to" + +" specify the licensed data sources separated by commas and optionally the expiration date: source[:dd/mm/yyyy]. e.g.:" + + " spliceai:31/01/2025,hgmd", arity = 1) public String dataSources; @Parameter(names = {"--expiration"}, description = "Use this parameter in conjunction with --create-api-key to specify the" @@ -195,9 +206,9 @@ public class LoadCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation," - + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics." 
- + " 'all' loads everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to load, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to load" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, @@ -242,9 +253,9 @@ public class ExportCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, " - + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' " - + " loads everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to export, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to export" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name, e.g., cellbase_hsapiens_grch38_v5", required = true, @@ -304,10 +315,9 @@ public class IndexCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be indexed: genome, gene, variation, " - + "regulation, protein, ontology, clinical_variants, repeats, refseq and missense_variation_functional_score. 
'all' " - + "indexes everything", required = true, - arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to index, it depends on the species; use the" + + " command 'cellbase-admin.sh data-list' to know the data list available for each species; or use 'all' to index" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name.", required = true, arity = 1) @@ -321,16 +331,6 @@ public class IndexCommandOptions { public boolean validate; } - @Parameters(commandNames = {"install"}, commandDescription = "Set up sharding for CellBase") - public class InstallCommandOptions { - - @ParametersDelegate - public CommonCommandOptions commonOptions = commonCommandOptions; - - @ParametersDelegate - public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - } - @Parameters(commandNames = {"server"}, commandDescription = "Manage REST server") public class ServerCommandOptions { @@ -353,16 +353,20 @@ public class ValidationCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens' or 'hsapiens'", arity = 1) + @Parameter(names = {"-s", "--species"}, description = "Name of the species to be downloaded, valid format include 'Homo sapiens'" + + " or 'hsapiens'", arity = 1) public String species = "Homo sapiens"; - @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json will be used", required = false, arity = 1) + @Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.json" + + " will be used", arity = 1) public String assembly = "GRCh38"; - @Parameter(names = {"--data-release"}, description = "Data release. 
To use the default data release, please, set this parameter to 0", required = false, arity = 1) + @Parameter(names = {"--data-release"}, description = "Data release. To use the default data release, please, set this parameter" + + " to 0", arity = 1) public int dataRelease = 0; - @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as COSMIC or HGMD", required = false, arity = 1) + @Parameter(names = {"--api-key"}, description = "API key to get access to licensed/restricted data sources such as SpliceAI or" + + " HGMD", arity = 1) public String apiKey; @Parameter(names = {"-i", "--input-file"}, description = "Full path to VCF", required = true, arity = 1) @@ -371,8 +375,7 @@ public class ValidationCommandOptions { @Parameter(names = {"-V", "--vep-file"}, description = "Full path to VEP annotation JSON file", required = true, arity = 1) public String vepFile; - @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", required = false, - arity = 1) + @Parameter(names = {"-o", "--output-dir"}, description = "Output directory where the comparison report is saved", arity = 1) public String outputDirectory = "/tmp"; @Parameter(names = {"-t", "--type"}, description = "Which type to analyse: 'Protein', 'Transcript' or 'Both'", required = @@ -410,6 +413,10 @@ public BuildCommandOptions getBuildCommandOptions() { return buildCommandOptions; } + public DataListCommandOptions getDataListCommandOptions() { + return dataListCommandOptions; + } + public DataReleaseCommandOptions getDataReleaseCommandOptions() { return dataReleaseCommandOptions; } @@ -424,8 +431,6 @@ public IndexCommandOptions getIndexCommandOptions() { return indexCommandOptions; } - public InstallCommandOptions getInstallCommandOptions() { return installCommandOptions; } - public ServerCommandOptions getServerCommandOptions() { return serverCommandOptions; } public ValidationCommandOptions 
getValidationCommandOptions() { return validationCommandOptions; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index 10c43d637c..d46d32709f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -25,9 +25,7 @@ import java.io.IOException; import java.net.URISyntaxException; -/** - * Created by imedina on 03/02/15. - */ + public class AdminMain { public static void main(String[] args) { @@ -63,30 +61,30 @@ public static void main(String[] args) { case "build": commandExecutor = new BuildCommandExecutor(cliOptionsParser.getBuildCommandOptions()); break; + case "load": + commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); + break; + case "data-list": + commandExecutor = new DataListCommandExecutor(cliOptionsParser.getDataListCommandOptions()); + break; case "data-release": commandExecutor = new DataReleaseCommandExecutor(cliOptionsParser.getDataReleaseCommandOptions()); break; case "api-key": commandExecutor = new ApiKeyCommandExecutor(cliOptionsParser.getApiKeyCommandOptions()); break; - case "load": - commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); - break; case "export": commandExecutor = new ExportCommandExecutor(cliOptionsParser.getExportCommandOptions()); break; case "index": commandExecutor = new IndexCommandExecutor(cliOptionsParser.getIndexCommandOptions()); break; - case "install": - commandExecutor = new InstallCommandExecutor(cliOptionsParser.getInstallCommandOptions()); + case "validate": + commandExecutor = new ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions()); break; case "server": commandExecutor = new ServerCommandExecutor(cliOptionsParser.getServerCommandOptions()); break; - case "validate": - commandExecutor = new 
ValidationCommandExecutor(cliOptionsParser.getValidationCommandOptions()); - break; default: break; } @@ -98,10 +96,10 @@ public static void main(String[] args) { commandExecutor.execute(); } catch (IOException | URISyntaxException | CellBaseException e) { commandExecutor.getLogger().error("Error: " + e.getMessage()); + e.printStackTrace(); System.exit(1); } } } } - } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 16db1f82bc..542cc3e129 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -17,11 +17,14 @@ package org.opencb.cellbase.app.cli.admin.executors; import com.beust.jcommander.ParameterException; -import org.apache.commons.lang.StringUtils; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -31,59 +34,74 @@ import org.opencb.cellbase.lib.builders.*; import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder; -import java.io.File; import java.io.IOException; import java.nio.file.*; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import static 
org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static org.opencb.cellbase.lib.builders.OntologyBuilder.OBO_OUTPUT_BASENAME; +import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; +import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.VariationBuilder.VARIATION_CHR_PREFIX; +import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; -/** - * Created by imedina on 03/02/15. 
- */ public class BuildCommandExecutor extends CommandExecutor { - private AdminCliOptionsParser.BuildCommandOptions buildCommandOptions; - private Path output; - private Path buildFolder = null; // /_/generated-json - private Path downloadFolder = null; // /_/download + private final AdminCliOptionsParser.BuildCommandOptions buildCommandOptions; + private final Path outputDirectory; + + private Path buildFolder = null; + private Path downloadFolder = null; private boolean normalize = true; - private File ensemblScriptsFolder; + private SpeciesConfiguration speciesConfiguration; + private SpeciesConfiguration.Assembly assembly; + private String ensemblRelease; private boolean flexibleGTFParsing; - private SpeciesConfiguration speciesConfiguration; + +// private SpeciesConfiguration speciesConfiguration; public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); this.buildCommandOptions = buildCommandOptions; - this.output = Paths.get(buildCommandOptions.outputDirectory); + this.outputDirectory = Paths.get(buildCommandOptions.outputDirectory); normalize = !buildCommandOptions.skipNormalize; - this.ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); this.flexibleGTFParsing = buildCommandOptions.flexibleGTFParsing; } - /** * Parse specific 'build' command options. 
+ * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { try { // Output directory need to be created if it doesn't exist - if (!Files.exists(output)) { - Files.createDirectories(output); + if (!Files.exists(outputDirectory)) { + Files.createDirectories(outputDirectory); } - speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, buildCommandOptions.species); + // Get the species + String species = buildCommandOptions.species; + speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - SpeciesConfiguration.Assembly assembly = null; - if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { + + // Get the assembly + if (StringUtils.isNotEmpty(buildCommandOptions.assembly)) { assembly = SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { throw new CellBaseException("Invalid assembly: '" + buildCommandOptions.assembly + "'"); @@ -92,294 +110,383 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } + String ensemblVersion = assembly.getEnsemblVersion(); + ensemblRelease = "release-" + ensemblVersion.split("_")[0]; + String spShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); String spAssembly = assembly.getName().toLowerCase(); - Path spFolder = output.resolve(spShortName + "_" + spAssembly); - // /_/download - downloadFolder = output.resolve(spFolder + "/download"); + Path spFolder = outputDirectory.resolve(spShortName + "_" + spAssembly); + downloadFolder = outputDirectory.resolve(spFolder + "/download"); if (!Files.exists(downloadFolder)) { throw new CellBaseException("Download folder not found '" + spShortName + "_" + spAssembly + "/download'"); } - // /_/generated_json - buildFolder = output.resolve(spFolder + 
"/generated_json"); - if (!buildFolder.toFile().exists()) { - makeDir(buildFolder); + buildFolder = outputDirectory.resolve(spFolder + "/generated_json"); + if (!Files.exists(buildFolder)) { + Files.createDirectories(buildFolder); } - if (buildCommandOptions.data != null) { - String[] buildOptions; - if (buildCommandOptions.data.equals("all")) { - buildOptions = speciesConfiguration.getData().toArray(new String[0]); - } else { - buildOptions = buildCommandOptions.data.split(","); + // Check data sources + List dataList = getDataList(species, speciesConfiguration); + AbstractBuilder parser; + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + parser = buildGenomeSequence(); + break; + case CONSERVATION_DATA: + parser = buildConservation(); + break; + case REPEATS_DATA: + parser = buildRepeats(); + break; + case GENE_DATA: + parser = buildGene(); + break; + case PROTEIN_DATA: + parser = buildProtein(); + break; + case VARIATION_DATA: + parser = buildVariation(); + break; + case REGULATION_DATA: + parser = buildRegulation(); + break; + case VARIATION_FUNCTIONAL_SCORE_DATA: + parser = buildCadd(); + break; + case MISSENSE_VARIATION_SCORE_DATA: + parser = buildRevel(); + break; + case CLINICAL_VARIANT_DATA: + parser = buildClinicalVariants(); + break; + case SPLICE_SCORE_DATA: + parser = buildSplice(); + break; + case ONTOLOGY_DATA: + parser = buildObo(); + break; + case PUBMED_DATA: + parser = buildPubMed(); + break; + case PHARMACOGENOMICS_DATA: + parser = buildPharmacogenomics(); + break; + case PGS_DATA: + parser = buildPolygenicScores(); + break; + default: + throw new IllegalArgumentException("Data parameter '" + data + "' is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",") + + ". 
You can use data parameter 'all' to download everything"); } - for (int i = 0; i < buildOptions.length; i++) { - String buildOption = buildOptions[i]; - - logger.info("Building '{}' data", buildOption); - CellBaseBuilder parser = null; - switch (buildOption) { -// case EtlCommons.GENOME_INFO_DATA: -// buildGenomeInfo(); -// break; - case EtlCommons.GENOME_DATA: - parser = buildGenomeSequence(); - break; - case EtlCommons.GENE_DATA: - parser = buildGene(); - break; - case EtlCommons.REFSEQ_DATA: - parser = buildRefSeq(); - break; - case EtlCommons.VARIATION_DATA: - parser = buildVariation(); - break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: - parser = buildCadd(); - break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - parser = buildRevel(); - break; - case EtlCommons.REGULATION_DATA: - parser = buildRegulation(); - break; - case EtlCommons.PROTEIN_DATA: - parser = buildProtein(); - break; -// case EtlCommons.PPI_DATA: -// parser = getInteractionParser(); -// break; - case EtlCommons.CONSERVATION_DATA: - parser = buildConservation(); - break; - case EtlCommons.CLINICAL_VARIANTS_DATA: - parser = buildClinicalVariants(); - break; - case EtlCommons.REPEATS_DATA: - parser = buildRepeats(); - break; - case EtlCommons.OBO_DATA: - parser = buildObo(); - break; - case EtlCommons.SPLICE_SCORE_DATA: - parser = buildSplice(); - break; - case EtlCommons.PUBMED_DATA: - parser = buildPubMed(); - break; - case EtlCommons.PHARMACOGENOMICS_DATA: - parser = buildPharmacogenomics(); - break; - default: - logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); - break; - } - - if (parser != null) { - try { - parser.parse(); - } catch (Exception e) { - logger.error("Error executing 'build' command " + buildCommandOptions.data + ": " + e.getMessage(), e); - } - parser.disconnect(); - } + if (parser != null) { + parser.parse(); + parser.disconnect(); + logger.info(BUILDING_DONE_LOG_MESSAGE); } } - } catch (ParameterException e) { - logger.error("Error 
parsing build command line parameters: " + e.getMessage(), e); - } catch (IOException | CellBaseException e) { - logger.error(e.getMessage()); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'build': " + e.getMessage(), e); + } catch (Exception e) { + throw new CellBaseException("Error executing command line 'build': " + e.getMessage(), e); } } - private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILE))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILE))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILE))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); - return new RepeatsBuilder(repeatsFilesDir, serializer); - } + private AbstractBuilder buildGenomeSequence() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENOME_DATA)); - private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(EtlCommons.OBO_DATA); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); - return new OntologyBuilder(oboDir, serializer); - } + Path genomeDownloadFolder = downloadFolder.resolve(GENOME_DATA); + Path genomeBuildFolder = buildFolder.resolve(GENOME_DATA); + + if (Files.exists(genomeBuildFolder.resolve(GENOME_JSON_FILENAME)) + && Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME)) + && Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { + logger.warn(DATA_ALREADY_BUILT, getDataName(GENOME_DATA)); + return null; + } - private void copyVersionFiles(List pathList) { - for (Path path : 
pathList) { + // Sanity check + if (!Files.exists(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME))) { + throw new CellBaseException("Genome info file " + GENOME_INFO_FILENAME + " does not exist at " + genomeDownloadFolder); + } + + // Copy files if necessary + if (!Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) { + Path genomeVersionPath = genomeDownloadFolder.resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); + } + + if (!Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME))) { try { - Files.copy(path, downloadFolder.resolve(path.getFileName()), StandardCopyOption.REPLACE_EXISTING); + Files.copy(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME), genomeBuildFolder.resolve(GENOME_INFO_FILENAME)); } catch (IOException e) { - logger.warn("Version file {} not found - skipping", path.toString()); + throw new CellBaseException("Error copying file " + GENOME_INFO_FILENAME, e); } } - } -// private void buildGenomeInfo() { -// /** -// * To get some extra info about the genome such as chromosome length or cytobands -// * we execute the following script. -// */ -// try { -// String outputFileName = downloadFolder.resolve("genome_info.json").toAbsolutePath().toString(); -// List args = new ArrayList<>(); -// args.addAll(Arrays.asList("--species", speciesConfigurathtion.getScientificName(), -// "--assembly", buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, -// "-o", outputFileName, -// "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); -// if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) -// && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { -// args.add("--phylo"); -// args.add("no-vertebrate"); -// } -// -// String geneInfoLogFileName = downloadFolder.resolve("genome_info.log").toAbsolutePath().toString(); -// -// boolean downloadedGenomeInfo; -// downloadedGenomeInfo = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, "./genome_info.pl", args, geneInfoLogFileName); -// -// if (downloadedGenomeInfo) { -// logger.info(outputFileName + " created OK"); -// } else { -// logger.error("Genome info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); -// } -// } catch (IOException | InterruptedException e) { -// e.printStackTrace(); -// } -// } + // Parse file + if (!Files.exists(genomeBuildFolder.resolve(GENOME_JSON_FILENAME))) { + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); - private CellBaseBuilder buildGenomeSequence() { - copyVersionFiles(Collections.singletonList(downloadFolder.resolve("genome/genomeVersion.json"))); - Path fastaFile = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "genome_sequence"); - return new GenomeSequenceFastaBuilder(fastaFile, serializer); + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(genomeBuildFolder, GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); + } + return null; } - private CellBaseBuilder buildGene() throws CellBaseException { - Path geneFolderPath = downloadFolder.resolve("gene"); - copyVersionFiles(Arrays.asList(geneFolderPath.resolve("dgidbVersion.json"), - geneFolderPath.resolve("ensemblCoreVersion.json"), 
geneFolderPath.resolve("uniprotXrefVersion.json"), - geneFolderPath.resolve("geneExpressionAtlasVersion.json"), - geneFolderPath.resolve("hpoVersion.json"), geneFolderPath.resolve("disgenetVersion.json"), - geneFolderPath.resolve("gnomadVersion.json"))); - Path genomeFastaFilePath = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "gene"); - return new GeneBuilder(geneFolderPath, genomeFastaFilePath, speciesConfiguration, flexibleGTFParsing, serializer); + private AbstractBuilder buildGene() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); + + // Sanity check + Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); + Path geneBuildPath = buildFolder.resolve(GENE_DATA); + + List versionFiles = new ArrayList<>(Arrays.asList( + geneDownloadPath.resolve(ENSEMBL_DATA).resolve(getDataVersionFilename(ENSEMBL_DATA)), + geneDownloadPath.resolve(REFSEQ_DATA).resolve(getDataVersionFilename(REFSEQ_DATA)))); + List dataList = GeneBuilder.getCommonDataSources(speciesConfiguration, configuration); + for (String data : dataList) { + Path versionFile; + switch (data) { + case MIRTARBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRTARBASE_DATA).resolve(getDataVersionFilename(data)); + break; + case MIRBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRBASE_DATA).resolve(getDataVersionFilename(data)); + break; + default: + versionFile = downloadFolder.resolve(GERP_DATA).resolve(getDataVersionFilename(data)); + break; + } + versionFiles.add(versionFile); + } + + List filesToCheck = new ArrayList<>(Arrays.asList(geneBuildPath.resolve(ENSEMBL_GENE_OUTPUT_FILENAME), + geneBuildPath.resolve(REFSEQ_GENE_OUTPUT_FILENAME))); + for (Path versionFile : versionFiles) { + filesToCheck.add(geneBuildPath.resolve(versionFile.getFileName())); + } + filesToCheck.addAll(versionFiles); + + if (AbstractBuilder.existFiles(filesToCheck)) { + 
logger.warn(DATA_ALREADY_BUILT, getDataName(ENSEMBL_DATA) + " and " + getDataName(REFSEQ_DATA) + " genes"); + return null; + } + + copyVersionFiles(versionFiles, geneBuildPath); + + return new GeneBuilder(geneDownloadPath, geneBuildPath, speciesConfiguration, flexibleGTFParsing, configuration); } - private CellBaseBuilder buildRefSeq() { - Path refseqFolderPath = downloadFolder.resolve("refseq"); - copyVersionFiles(Arrays.asList(refseqFolderPath.resolve("refSeqVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "refseq"); - return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); + private AbstractBuilder buildRepeats() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + + // Sanity check + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); + Path repeatsBuildPath = buildFolder.resolve(REPEATS_DATA); + List dataList = EtlCommons.getDataList(REPEATS_DATA, configuration, speciesConfiguration); + List filesToCheck = new ArrayList<>(Arrays.asList(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME))); + for (String data : dataList) { + filesToCheck.add(repeatsBuildPath.resolve(getDataVersionFilename(data))); + } + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(REPEATS_DATA)); + return null; + } + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(repeatsDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + for (String data : dataList) { + copyVersionFiles(Collections.singletonList(repeatsDownloadPath.resolve(data).resolve(getDataVersionFilename(data))), + repeatsBuildPath); + } + + // Create serializer and return the repeats builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_DATA); + return new RepeatsBuilder(dataList, repeatsDownloadPath, serializer, configuration); } - private 
CellBaseBuilder buildVariation() throws IOException { - Path downloadVariationPath = downloadFolder.resolve(VARIATION_DATA); - Path buildVariationPath = buildFolder.resolve(VARIATION_DATA); - if (!buildVariationPath.toFile().exists()) { - buildVariationPath.toFile().mkdirs(); + private AbstractBuilder buildObo() throws CellBaseException { + // Sanity check + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); + List filesToCheck = new ArrayList<>(Arrays.asList(oboBuildPath.resolve(OBO_OUTPUT_BASENAME))); + List dataList = new ArrayList<>(Arrays.asList(GO_OBO_DATA)); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + dataList.add(HPO_OBO_DATA); + dataList.add(DOID_OBO_DATA); + dataList.add(MONDO_OBO_DATA); } - CellBaseFileSerializer variationSerializer = new CellBaseJsonFileSerializer(buildVariationPath); + for (String data : dataList) { + filesToCheck.add(oboBuildPath.resolve(data).resolve(getDataVersionFilename(data))); + } - // Currently, only dbSNP data - Files.copy(downloadVariationPath.resolve(DBSNP_VERSION_FILENAME), buildVariationPath.resolve(DBSNP_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); - return new VariationBuilder(downloadVariationPath, variationSerializer, configuration); + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(ONTOLOGY_DATA)); + return null; + } + + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + for (String data : dataList) { + copyVersionFiles(Collections.singletonList(oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data))), + oboBuildPath); + } + + // Create serializer and return the ontology builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(oboBuildPath, OBO_OUTPUT_BASENAME); + return new OntologyBuilder(oboDownloadPath, 
speciesConfiguration, serializer); } - private CellBaseBuilder buildCadd() { - Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score"); - copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json"))); - Path caddFilePath = variationFunctionalScorePath.resolve("whole_genome_SNVs.tsv.gz"); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "cadd"); - return new CaddScoreBuilder(caddFilePath, serializer); + private AbstractBuilder buildCadd() throws CellBaseException { + // Sanity check + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Path caddBuildPath = buildFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + copyVersionFiles(Collections.singletonList(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA))), caddBuildPath); + + // Create the file serializer and the protein builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(caddBuildPath, CADD_DATA); + return new CaddScoreBuilder(caddDownloadPath, serializer); } - private CellBaseBuilder buildRevel() { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - copyVersionFiles(Arrays.asList(missensePredictionScorePath.resolve("revelVersion.json"))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - return new RevelScoreBuilder(missensePredictionScorePath, serializer); + private AbstractBuilder buildRevel() throws CellBaseException { + // Sanity check + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Path revelBuildPath = buildFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + copyVersionFiles(Collections.singletonList(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA))), revelBuildPath); + + // Create the file serializer and 
the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(revelBuildPath, REVEL_DATA); + return new RevelScoreBuilder(revelDownloadPath, serializer); } - private CellBaseBuilder buildRegulation() { - Path regulatoryRegionFilesDir = downloadFolder.resolve("regulation"); - copyVersionFiles(Collections.singletonList(regulatoryRegionFilesDir.resolve("ensemblRegulationVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_region"); - return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); + private AbstractBuilder buildRegulation() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); + + // Sanity check + Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); + Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); + List filesToCheck = Arrays.asList(regulationBuildPath.resolve(REGULATORY_REGION_OUTPUT_FILENAME), + regulationBuildPath.resolve(REGULATORY_PFM_OUTPUT_FILENAME), + regulationBuildPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), + regulationBuildPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(REGULATION_DATA)); + return null; + } + + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_DATA).resolve(getDataVersionFilename( + REGULATORY_BUILD_DATA)), regulationDownloadPath.resolve(MOTIF_FEATURES_DATA).resolve(getDataVersionFilename( + MOTIF_FEATURES_DATA))), regulationBuildPath); + + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); + return new RegulatoryFeatureBuilder(regulationDownloadPath, serializer); } - private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve("protein"); - 
copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), - proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein"); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), - downloadFolder.resolve("protein").resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); + private AbstractBuilder buildProtein() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + + // Sanity check + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); + List filesToCheck = Arrays.asList(proteinBuildPath.resolve(PROTEIN_OUTPUT_FILENAME), + proteinBuildPath.resolve(getDataVersionFilename(INTERPRO_DATA)), + proteinBuildPath.resolve(getDataVersionFilename(INTACT_DATA)), + proteinBuildPath.resolve(getDataVersionFilename(UNIPROT_DATA))); + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(PROTEIN_DATA)); + return null; + } + + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(INTERPRO_DATA).resolve(getDataVersionFilename( + INTERPRO_DATA)), proteinDownloadPath.resolve(INTACT_DATA).resolve(getDataVersionFilename( + INTACT_DATA)), proteinDownloadPath.resolve(UNIPROT_DATA).resolve(getDataVersionFilename( + UNIPROT_DATA))), proteinBuildPath); + + // Create the file serializer and the protein builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); + return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } - private void getProteinFunctionPredictionMatrices(SpeciesConfiguration sp, Path geneFolder) - throws IOException, InterruptedException { - logger.info("Downloading protein function prediction matrices ..."); + private AbstractBuilder buildVariation() throws CellBaseException, IOException { + 
logger.info(BUILDING_LOG_MESSAGE, getDataName(VARIATION_DATA)); - // run protein_function_prediction_matrices.pl - String proteinFunctionProcessLogFile = geneFolder.resolve("protein_function_prediction_matrices.log").toString(); - List args = Arrays.asList("--species", sp.getScientificName(), "--outdir", geneFolder.toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs()); + // Sanity check + Path variationDownloadPath = downloadFolder.resolve(VARIATION_DATA); + Path variationBuildPath = buildFolder.resolve(VARIATION_DATA); - boolean proteinFunctionPredictionMatricesObtaines = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./protein_function_prediction_matrices.pl", - args, - proteinFunctionProcessLogFile); + if (Files.exists(variationBuildPath)) { + List filesToCheck = new ArrayList<>(); + if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { + filesToCheck.add(variationBuildPath.resolve(getDataVersionFilename(VARIATION_DATA))); + } - // check output - if (proteinFunctionPredictionMatricesObtaines) { - logger.info("Protein function prediction matrices created OK"); - } else { - logger.error("Protein function prediction matrices for " + sp.getScientificName() + " cannot be downloaded"); + try (DirectoryStream vcfPaths = Files.newDirectoryStream(variationBuildPath, + entry -> entry.getFileName().toString().startsWith(VARIATION_CHR_PREFIX))) { + if (AbstractBuilder.existFiles(filesToCheck) && vcfPaths.iterator().hasNext()) { + logger.warn(DATA_ALREADY_BUILT, getDataName(VARIATION_DATA)); + return null; + } + } } - } - private CellBaseBuilder getInteractionParser() { - Path proteinFolder = downloadFolder.resolve("protein"); - Path psimiTabFile = proteinFolder.resolve("intact.txt"); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("intactVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein_protein_interaction"); - return new InteractionBuilder(psimiTabFile, 
speciesConfiguration.getScientificName(), serializer); + // Copy version files + if (!speciesConfiguration.getId().equalsIgnoreCase(HSAPIENS)) { + copyVersionFiles(Arrays.asList(variationDownloadPath.resolve(getDataVersionFilename(VARIATION_DATA))), variationBuildPath); + } + + // Create the file serializer and the variation builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(variationBuildPath); + return new VariationBuilder(variationDownloadPath, speciesConfiguration.getScientificName(), serializer, configuration); } - private CellBaseBuilder buildConservation() { - Path conservationFilesDir = downloadFolder.resolve("conservation"); - copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), - conservationFilesDir.resolve("phastConsVersion.json"), - conservationFilesDir.resolve("phyloPVersion.json"))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? + private AbstractBuilder buildConservation() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + + // Sanity check + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); + Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); + List dataList = Arrays.asList(GERP_DATA, PHASTCONS_DATA, PHYLOP_DATA); + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(conservationDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_DATA).resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(PHASTCONS_DATA).resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(PHYLOP_DATA).resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); + int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); - 
return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); + return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } - private CellBaseBuilder buildClinicalVariants() { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); - copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json"))); - copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json"))); - - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, - EtlCommons.CLINICAL_VARIANTS_JSON_FILE.replace(".json.gz", ""), true); - return new ClinicalVariantBuilder(clinicalVariantFolder, normalize, getFastaReferenceGenome(), + private AbstractBuilder buildClinicalVariants() throws CellBaseException { + // Sanity check + Path clinicalDownloadPath = downloadFolder.resolve(CLINICAL_VARIANT_DATA); + Path clinicalBuildPath = buildFolder.resolve(CLINICAL_VARIANT_DATA); + copyVersionFiles(Arrays.asList(clinicalDownloadPath.resolve(getDataVersionFilename(CLINVAR_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(COSMIC_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(HGMD_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(GWAS_DATA))), clinicalBuildPath); + + // Create the file serializer and the clinical variants builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(clinicalBuildPath, CLINICAL_VARIANTS_BASENAME, true); + return new ClinicalVariantBuilder(clinicalDownloadPath, normalize, getFastaReferenceGenome(), buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, - serializer); + configuration, serializer); } private String getDefaultHumanAssembly() { for (SpeciesConfiguration species : configuration.getSpecies().getVertebrates()) { - if (species.getId().equals("hsapiens")) { + if (species.getId().equals(HSAPIENS)) { return species.getAssemblies().get(0).getName(); } } @@ -388,31 +495,26 @@ private String getDefaultHumanAssembly() { + "configuration file. No hsapiens data found within the configuration.json file"); } - private Path getFastaReferenceGenome() { - Path fastaFile = null; - try { - DirectoryStream stream = Files.newDirectoryStream(downloadFolder.resolve("genome"), entry -> { - return entry.toString().endsWith(".fa"); - }); - for (Path entry : stream) { - fastaFile = entry; - } - } catch (IOException e) { - e.printStackTrace(); - } - return fastaFile; + private Path getFastaReferenceGenome() throws CellBaseException { + // Check FASTA and unzip if necessary + String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, + SpeciesUtils.getSpeciesShortname(speciesConfiguration), assembly.getName(), null); + String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); + Path gzFastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); + + return EtlCommons.getFastaPath(gzFastaPath); } - private CellBaseBuilder buildSplice() throws IOException { + private AbstractBuilder buildSplice() throws IOException, CellBaseException { Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_SCORE_DATA); if (!spliceOutputFolder.toFile().exists()) { spliceOutputFolder.toFile().mkdirs(); } - if (spliceInputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME).toFile().exists()) { - Files.copy(spliceInputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME), - 
spliceOutputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME), + if (spliceInputFolder.resolve(getDataVersionFilename(MMSPLICE_DATA)).toFile().exists()) { + Files.copy(spliceInputFolder.resolve(getDataVersionFilename(MMSPLICE_DATA)), + spliceOutputFolder.resolve(EtlCommons.getDataVersionFilename(MMSPLICE_DATA)), StandardCopyOption.REPLACE_EXISTING); } @@ -420,39 +522,130 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException { - Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); - Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); - if (!pubmedOutputFolder.toFile().exists()) { - pubmedOutputFolder.toFile().mkdirs(); - } + private AbstractBuilder buildPubMed() throws CellBaseException { + // Sanity check + Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); + Path pubMedBuildPath = buildFolder.resolve(PUBMED_DATA); + copyVersionFiles(Collections.singletonList(pubMedDownloadPath.resolve(getDataVersionFilename(PUBMED_DATA))), pubMedBuildPath); - logger.info("Copying PubMed version file..."); - if (pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME).toFile().exists()) { - Files.copy(pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - pubmedOutputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); + // Create the file serializer and the PubMed builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubMedBuildPath); + return new PubMedBuilder(pubMedDownloadPath, serializer, configuration); + } + + private AbstractBuilder buildPharmacogenomics() throws CellBaseException { + // Sanity check + Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + 
copyVersionFiles(Collections.singletonList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); + + // Create the file serializer and the PharmGKB builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); + return new PharmGKBBuilder(pharmGkbDownloadPath, serializer); + } + + private AbstractBuilder buildPolygenicScores() throws CellBaseException { + Path pgsDownloadPath = downloadFolder.resolve(EtlCommons.PGS_DATA); + Path pgsBuildPath = buildFolder.resolve(EtlCommons.PGS_DATA); + copyVersionFiles(Collections.singletonList(pgsDownloadPath.resolve(getDataVersionFilename(PGS_CATALOG_DATA))), pgsBuildPath); + + // Create the file serializer and the PGS builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pgsBuildPath, PGS_VARIANT_COLLECTION); + return new PolygenicScoreBuilder(pgsDownloadPath, serializer); + } + + private void checkVersionFiles(List versionPaths) throws CellBaseException { + ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + for (Path versionPath : versionPaths) { + if (!versionPath.toFile().exists()) { + throw new CellBaseException("Version file " + versionPath + " does not exist: this file is mandatory for version control"); + } + try { + DataSource dataSource = dataSourceReader.readValue(versionPath.toFile()); + if (StringUtils.isEmpty(dataSource.getVersion())) { + throw new CellBaseException("Version missing version in file " + versionPath + ": a version must be specified in the" + + " file"); + } + } catch (IOException e) { + throw new CellBaseException("Error parsing the version file " + versionPath, e); + } } + } - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder); - return new PubMedBuilder(pubmedInputFolder, serializer); + private void copyVersionFiles(List versionPaths, Path targetPath) throws CellBaseException { + // Check version files before copying them + 
checkVersionFiles(versionPaths); + copyFiles(versionPaths, targetPath); } - private CellBaseBuilder buildPharmacogenomics() throws IOException { - Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - if (!outFolder.toFile().exists()) { - outFolder.toFile().mkdirs(); + private void copyFiles(List versionPaths, Path targetPath) throws CellBaseException { + if (!Files.exists(targetPath)) { + try { + Files.createDirectories(targetPath); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + targetPath, e); + } } - logger.info("Copying PharmGKB version file..."); - if (inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME).toFile().exists()) { - Files.copy(inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); + for (Path versionPath : versionPaths) { + try { + Files.copy(versionPath, targetPath.resolve(versionPath.getFileName()), StandardCopyOption.REPLACE_EXISTING); + } catch (IOException e) { + throw new CellBaseException("Error copying version file " + versionPath + " to " + targetPath, e); + } + // Sanity check after copying + if (!targetPath.resolve(versionPath.getFileName()).toFile().exists()) { + throw new CellBaseException("Something wrong happened when copying version file " + versionPath + " to " + targetPath); + } } + } - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); - return new PharmGKBBuilder(inFolder, serializer); +//<<<<<<< HEAD +// private List checkDataSources() { +// if (StringUtils.isEmpty(buildCommandOptions.data)) { +// throw new IllegalArgumentException("Missing data parameter. 
Valid values are: " +// + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to download everything"); +// } +// List dataList = Arrays.asList(buildCommandOptions.data.split(",")); +// for (String data : dataList) { +// switch (data) { +// case GENOME_DATA: +// case GENE_DATA: +// case REFSEQ_DATA: +// case VARIATION_FUNCTIONAL_SCORE_DATA: +// case MISSENSE_VARIATION_SCORE_DATA: +// case REGULATION_DATA: +// case PROTEIN_DATA: +// case CONSERVATION_DATA: +// case CLINICAL_VARIANT_DATA: +// case REPEATS_DATA: +// case ONTOLOGY_DATA: +// case SPLICE_SCORE_DATA: +// case PUBMED_DATA: +// case PHARMACOGENOMICS_DATA: +// case PGS_DATA: +// break; +// default: +// throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " +// + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); +//======= + private List getDataList(String species, SpeciesConfiguration speciesConfig) throws CellBaseException { + // No need to check if 'data' exists since it is declared as required in JCommander + List dataList; + if ("all".equalsIgnoreCase(buildCommandOptions.data)) { + // Download all data sources for the species in the configuration.yml file + dataList = speciesConfig.getData(); + } else { + // Check if the data sources requested are valid for the species + dataList = Arrays.asList(buildCommandOptions.data.split(",")); + for (String data : dataList) { + if (!speciesConfig.getData().contains(data)) { + throw new CellBaseException("Data parameter '" + data + "' does not exist or it is not allowed for '" + species + "'. " + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". 
" + + "You can use data parameter 'all' to build everything"); + } + } + } + return dataList; } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java new file mode 100644 index 0000000000..8ec6a5e421 --- /dev/null +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DataListCommandExecutor.java @@ -0,0 +1,56 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.app.cli.admin.executors; + +import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.app.cli.CommandExecutor; +import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.utils.SpeciesUtils; + +import java.util.List; + +public class DataListCommandExecutor extends CommandExecutor { + + private AdminCliOptionsParser.DataListCommandOptions dataListCommandOptions; + + public DataListCommandExecutor(AdminCliOptionsParser.DataListCommandOptions dataListCommandOptions) { + super(dataListCommandOptions.commonOptions.logLevel, dataListCommandOptions.commonOptions.conf); + + this.dataListCommandOptions = dataListCommandOptions; + } + + + /** + * Execute one of the selected actions according to the input parameters. 
+ */ + public void execute() { + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, dataListCommandOptions.species); + if (speciesConfiguration == null) { + System.out.println("Unknown species: " + dataListCommandOptions.species); + System.out.println("Available species:"); + List allSpecies = SpeciesUtils.getAllSpecies(configuration); + for (SpeciesConfiguration species : allSpecies) { + System.out.println("\t- " + species.getScientificName() + " (" + species.getId() + ")"); + } + return; + } + + System.out.println("Species: " + dataListCommandOptions.species); + System.out.println("Available data: " + StringUtils.join(speciesConfiguration.getData(), ",")); + } +} diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index abb0629374..f2c3de6e6a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -16,29 +16,22 @@ package org.opencb.cellbase.app.cli.admin.executors; -import com.beust.jcommander.ParameterException; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.cellbase.lib.download.AbstractDownloadManager; -import org.opencb.cellbase.lib.download.DownloadFile; -import org.opencb.cellbase.lib.download.Downloader; +import 
org.opencb.cellbase.lib.download.*; -import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.*; + -/** - * Created by imedina on 03/02/15. - */ public class DownloadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; @@ -52,88 +45,123 @@ public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions down } /** - * Execute specific 'download' command options. + * Process CellBase command 'download'. + * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { try { + // Get the species and the assembly String species = downloadCommandOptions.speciesAndAssemblyOptions.species; String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; + + // Get the valid list of data sources + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + if (speciesConfiguration == null) { + throw new CellBaseException("Invalid species: '" + downloadCommandOptions.speciesAndAssemblyOptions.species + "'"); + } + List dataList = getDataList(species, speciesConfiguration); + logger.info("Downloading the following data sources: {}", CollectionUtils.isEmpty(dataList) + ? 
Collections.emptyList() + : StringUtils.join(dataList, ",")); + List downloadFiles = new ArrayList<>(); - List dataList = getDataList(species); - Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); + AbstractDownloadManager downloader = null; for (String data : dataList) { switch (data) { - case EtlCommons.GENOME_DATA: - downloadFiles.addAll(downloader.downloadGenome()); + case GENOME_DATA: + downloader = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.GENE_DATA: - downloadFiles.addAll(downloader.downloadGene()); + case CONSERVATION_DATA: + downloader = new ConservationDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.VARIATION_DATA: - downloadFiles.addAll(downloader.downloadVariation()); + case REPEATS_DATA: + downloader = new RepeatsDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: - downloadFiles.addAll(downloader.downloadCaddScores()); + case GENE_DATA: + downloader = new GeneDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - downloadFiles.addAll(downloader.downloadPredictionScores()); + case PROTEIN_DATA: + downloader = new ProteinDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.REGULATION_DATA: - downloadFiles.addAll(downloader.downloadRegulation()); + case REGULATION_DATA: + downloader = new RegulationDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.PROTEIN_DATA: - downloadFiles.addAll(downloader.downloadProtein()); + case VARIATION_DATA: + downloader = new VariationDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.CONSERVATION_DATA: - downloadFiles.addAll(downloader.downloadConservation()); + case VARIATION_FUNCTIONAL_SCORE_DATA: + downloader = 
new CaddDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.CLINICAL_VARIANTS_DATA: - downloadFiles.addAll(downloader.downloadClinicalVariants()); + case MISSENSE_VARIATION_SCORE_DATA: + downloader = new MissenseScoresDownloadManager(species, assembly, outputDirectory, configuration); break; -// case EtlCommons.STRUCTURAL_VARIANTS_DATA: -// downloadFiles.add(downloadManager.downloadStructuralVariants()); -// break; - case EtlCommons.REPEATS_DATA: - downloadFiles.addAll(downloader.downloadRepeats()); + case CLINICAL_VARIANT_DATA: + downloader = new ClinicalDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.OBO_DATA: - downloadFiles.addAll(downloader.downloadOntologies()); + case SPLICE_SCORE_DATA: + downloader = new SpliceScoreDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.PUBMED_DATA: - downloadFiles.addAll(downloader.downloadPubMed()); + case ONTOLOGY_DATA: + downloader = new OntologyDownloadManager(species, assembly, outputDirectory, configuration); break; - case EtlCommons.PHARMACOGENOMICS_DATA: - downloadFiles.addAll(downloader.downloadPharmKGB()); + case PUBMED_DATA: + downloader = new PubMedDownloadManager(species, assembly, outputDirectory, configuration); break; - default: - System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" - + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," - + " regulation, protein, conservation, clinical_variants, ontology, pubmed}"); + case PHARMACOGENOMICS_DATA: + downloader = new PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); + break; + case PGS_DATA: + downloader = new PgsDownloadManager(species, assembly, outputDirectory, configuration); break; + default: + throw new IllegalArgumentException("Data parameter '" + data + "' is not allowed for '" + species + "'. 
" + + "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",") + + ". You can use data parameter 'all' to download everything"); } - } - AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (ParameterException | IOException | CellBaseException | InterruptedException | NoSuchMethodException - | FileFormatException e) { - logger.error("Error in 'download' command line: " + e.getMessage()); - } - } - private List getDataList(String species) throws CellBaseException { - if (StringUtils.isEmpty(downloadCommandOptions.data) || downloadCommandOptions.data.equals("all")) { - return SpeciesUtils.getSpeciesConfiguration(configuration, species).getData(); - } else { - return Arrays.asList(downloadCommandOptions.data.split(",")); + // Call to download method and add the files to the list + downloadFiles.addAll(downloader.download()); + } + if (downloader != null) { + Map params = new HashMap<>(); + params.put("species", species); + params.put("assembly", assembly); + params.put("data", dataList); + params.put("outDir", outputDirectory); + downloader.writeDownloadLogFile(params, downloadFiles); + } else { + logger.warn("Impossible to write log summary: downloader is null"); + } + } catch (InterruptedException e) { + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); + } catch (Exception e) { + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } - @Deprecated - private List getDataList(SpeciesConfiguration sp) { + private List getDataList(String species, SpeciesConfiguration speciesConfig) throws CellBaseException { + // No need to check if 'data' exists since it is declared as required in JCommander List dataList; - if (downloadCommandOptions.data.equals("all")) { - dataList = sp.getData(); + if ("all".equalsIgnoreCase(downloadCommandOptions.data)) { + // Download all data sources for the species in the configuration.yml file + dataList = speciesConfig.getData(); } else { + // Check if the data sources requested are valid for the species dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + Set invalidData = new HashSet<>(); + for (String data : dataList) { + if (!speciesConfig.getData().contains(data)) { + invalidData.add(data); + } + } + if (!CollectionUtils.isEmpty(invalidData)) { + throw new CellBaseException("Data '" + StringUtils.join(invalidData, ",") + "' not supported by species '" + species + "'." + + "Valid values are: " + StringUtils.join(speciesConfig.getData(), ",") + ". 
Our use data parameter 'all' to" + + " download everything"); + } } return dataList; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 72f992f344..4fba479a36 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -85,8 +85,8 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo this.dataToExport = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, + ONTOLOGY_DATA, MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -293,7 +293,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " protein functional predictions"; break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { counter = exportClinicalVariantData(regions); counterMsg = counter + " clinical variants"; break; @@ -309,7 +309,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " repeats"; break; } - case OBO_DATA: { + case ONTOLOGY_DATA: { counter = exportOntologyData(); counterMsg = counter + " ontology items"; break; @@ -424,7 +424,7 @@ private 
String exportPharmacogenomicsData(List genes) private int exportClinicalVariantData(List regions) throws CellBaseException, QueryException, IllegalAccessException, IOException { - String baseFilename = CLINICAL_VARIANTS_DATA + ".full"; + String baseFilename = CLINICAL_VARIANT_DATA + ".full"; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, baseFilename); ClinicalManager clinicalManager = managerFactory.getClinicalManager(species, assembly); ClinicalVariantQuery query = new ClinicalVariantQuery(); @@ -449,7 +449,7 @@ private int exportClinicalVariantData(List regions) throws CellBaseExcep private int exportOntologyData() throws CellBaseException, IOException { int counter = 0; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, OBO_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, ONTOLOGY_DATA); OntologyManager ontologyManager = managerFactory.getOntologyManager(species, assembly); CellBaseIterator iterator = ontologyManager.iterator(new OntologyQuery()); while (iterator.hasNext()) { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java deleted file mode 100644 index 70849eb924..0000000000 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/InstallCommandExecutor.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.app.cli.admin.executors; - -import org.opencb.cellbase.app.cli.CommandExecutor; -import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.install.InstallManager; - -public class InstallCommandExecutor extends CommandExecutor { - - private AdminCliOptionsParser.InstallCommandOptions installCommandOptions; - - public InstallCommandExecutor(AdminCliOptionsParser.InstallCommandOptions installCommandOptions) { - super(installCommandOptions.commonOptions.logLevel, installCommandOptions.commonOptions.conf); - - this.installCommandOptions = installCommandOptions; - } - - public void execute() throws CellBaseException { - try { - logger.info("Starting installation ..."); - InstallManager installManager = new InstallManager(configuration); - installManager.install(installCommandOptions.speciesAndAssemblyOptions.species, - installCommandOptions.speciesAndAssemblyOptions.assembly); - } catch (CellBaseException e) { - logger.error("Error installing:" + e.toString()); - } - } -} diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 97460d5a71..b155d2cfcf 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -20,9 +20,12 @@ import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; +import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import 
org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.indexer.IndexManager; @@ -38,25 +41,33 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.concurrent.ExecutionException; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.DbSnpBuilder.DBSNP_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_JSON_FILENAME; +import static org.opencb.cellbase.lib.builders.OntologyBuilder.OBO_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.PolygenicScoreBuilder.PGS_COMMON_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.PolygenicScoreBuilder.PGS_VARIANT_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.ProteinBuilder.PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; +import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.VariationBuilder.VARIATION_CHR_PREFIX; +import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; /** * Created by imedina on 03/02/15. 
*/ public class LoadCommandExecutor extends CommandExecutor { - private static final String METADATA = "metadata"; private LoadRunner loadRunner; private AdminCliOptionsParser.LoadCommandOptions loadCommandOptions; private Path input; - private String[] loadOptions; + private List dataList; private int dataRelease; private String database; @@ -68,35 +79,45 @@ public class LoadCommandExecutor extends CommandExecutor { private IndexManager indexManager; private DataReleaseManager dataReleaseManager; + private static final String AUTHENTICATION_DATABASE = "authenticationDatabase"; + + private static final String LOADING_FILE_LOG_MESSAGE = "Loading file '{}'"; + private static final String ERROR_LOADING_FILE_LOG_MESSAGE = "Error loading file '{}': {}"; + private static final String ERROR_LOADING_DATA = "Error loading data in collection "; + private static final String LOADING_JSON_IN_COLLECTION_MSG = "Loading JSON file '{}' in collection '{}' for data release '{}' ..."; + public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandOptions) { super(loadCommandOptions.commonOptions.logLevel, loadCommandOptions.commonOptions.conf); this.loadCommandOptions = loadCommandOptions; - - input = Paths.get(loadCommandOptions.input); - if (loadCommandOptions.database != null) { - database = loadCommandOptions.database; - } - if (loadCommandOptions.data.equals("all")) { - loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, - EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, - EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, - EtlCommons.PHARMACOGENOMICS_DATA}; - } else { - loadOptions = loadCommandOptions.data.split(","); - } - if 
(loadCommandOptions.field != null) { - field = loadCommandOptions.field; - } - if (loadCommandOptions.innerFields != null) { - innerFields = loadCommandOptions.innerFields.split(","); - } - if (loadCommandOptions.loader != null) { - loader = loadCommandOptions.loader; - } - createIndexes = !loadCommandOptions.skipIndex; +//<<<<<<< HEAD +// +// input = Paths.get(loadCommandOptions.input); +// if (loadCommandOptions.database != null) { +// database = loadCommandOptions.database; +// } +// if (loadCommandOptions.data.equals("all")) { +// loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, +// EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, +// EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, +// EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, +// EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, +// EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PGS_DATA}; +// } else { +// loadOptions = loadCommandOptions.data.split(","); +// } +// if (loadCommandOptions.field != null) { +// field = loadCommandOptions.field; +// } +// if (loadCommandOptions.innerFields != null) { +// innerFields = loadCommandOptions.innerFields.split(","); +// } +// if (loadCommandOptions.loader != null) { +// loader = loadCommandOptions.loader; +// } +// createIndexes = !loadCommandOptions.skipIndex; +//======= +//>>>>>>> TASK-5564 } /** @@ -105,17 +126,14 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO * @throws CellBaseException CellBase exception */ public void execute() throws CellBaseException { - // Init release manager - dataReleaseManager = new DataReleaseManager(database, configuration); - checkParameters(); logger.info("Loading in data release {}", dataRelease); - if (loadCommandOptions.data != null) { + if 
(CollectionUtils.isNotEmpty(dataList)) { // If 'authenticationDatabase' is not passed by argument then we read it from configuration.json - if (loadCommandOptions.loaderParams.containsKey("authenticationDatabase")) { - configuration.getDatabases().getMongodb().getOptions().put("authenticationDatabase", - loadCommandOptions.loaderParams.get("authenticationDatabase")); + if (loadCommandOptions.loaderParams.containsKey(AUTHENTICATION_DATABASE)) { + configuration.getDatabases().getMongodb().getOptions().put(AUTHENTICATION_DATABASE, + loadCommandOptions.loaderParams.get(AUTHENTICATION_DATABASE)); } loadRunner = new LoadRunner(loader, database, numThreads, dataReleaseManager, configuration); if (createIndexes) { @@ -123,167 +141,73 @@ public void execute() throws CellBaseException { indexManager = new IndexManager(database, indexFile, configuration); } - for (String loadOption : loadOptions) { + for (String data : dataList) { try { - switch (loadOption) { + switch (data) { case EtlCommons.GENOME_DATA: { - // Load data - if (input.resolve("genome_info.json").toFile().exists()) { - loadIfExists(input.resolve("genome_info.json"), "genome_info"); - } else { - loadIfExists(input.resolve("genome_info.json.gz"), "genome_info"); - } - loadIfExists(input.resolve("genome_sequence.json.gz"), "genome_sequence"); - - // Create index - createIndex("genome_info"); - createIndex("genome_sequence"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("genomeVersion.json") - )); - dataReleaseManager.update(dataRelease, "genome_info", EtlCommons.GENOME_DATA, sources); - dataReleaseManager.update(dataRelease, "genome_sequence", null, null); + loadGenome(); break; } case EtlCommons.GENE_DATA: { - // Load data - loadIfExists(input.resolve("gene.json.gz"), "gene"); - - // Create index - createIndex("gene"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - 
input.resolve("dgidbVersion.json"), - input.resolve("ensemblCoreVersion.json"), - input.resolve("uniprotXrefVersion.json"), - input.resolve("geneExpressionAtlasVersion.json"), - input.resolve("hpoVersion.json"), - input.resolve("disgenetVersion.json"), - input.resolve("gnomadVersion.json") - )); - dataReleaseManager.update(dataRelease, "gene", EtlCommons.GENE_DATA, sources); - break; - } - case EtlCommons.REFSEQ_DATA: { - // Load data - loadIfExists(input.resolve("refseq.json.gz"), "refseq"); - - // Create index - createIndex("refseq"); - - // Update release (collection and sources) - List sources = new ArrayList<>( - Collections.singletonList(input.resolve("refseqVersion.json"))); - dataReleaseManager.update(dataRelease, "refseq", EtlCommons.REFSEQ_DATA, sources); + loadGene(); break; } case EtlCommons.VARIATION_DATA: { - // Load data, create index and update release - loadVariationData(); + loadVariation(); break; } case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: { // Load data - loadIfExists(input.resolve("cadd.json.gz"), "variation_functional_score"); + loadIfExists(input.resolve("cadd.json.gz"), VARIATION_FUNCTIONAL_SCORE_DATA); // Create index - createIndex("variation_functional_score"); + createIndex(VARIATION_FUNCTIONAL_SCORE_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("caddVersion.json"))); - dataReleaseManager.update(dataRelease, "variation_functional_score", - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, VARIATION_FUNCTIONAL_SCORE_DATA, sources); break; } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { // Load data - loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), - "missense_variation_functional_score"); + loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), MISSENSE_VARIATION_SCORE_DATA); // Create index - createIndex("missense_variation_functional_score"); + 
createIndex(MISSENSE_VARIATION_SCORE_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Collections.singletonList(input.resolve("revelVersion.json"))); - dataReleaseManager.update(dataRelease, "missense_variation_functional_score", - EtlCommons.MISSENSE_VARIATION_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, MISSENSE_VARIATION_SCORE_DATA, sources); break; } case EtlCommons.CONSERVATION_DATA: { - // Load data, create index and update release loadConservation(); break; } case EtlCommons.REGULATION_DATA: { - // Load data (regulatory region and regulatory PFM)) - loadIfExists(input.resolve("regulatory_region.json.gz"), "regulatory_region"); - loadIfExists(input.resolve("regulatory_pfm.json.gz"), "regulatory_pfm"); - - // Create index - createIndex("regulatory_region"); - createIndex("regulatory_pfm"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Collections.singletonList(input.resolve("ensemblRegulationVersion.json"))); - dataReleaseManager.update(dataRelease, "regulatory_region", EtlCommons.REGULATION_DATA, sources); - dataReleaseManager.update(dataRelease, "regulatory_pfm", null, null); + loadRegulation(); break; } case EtlCommons.PROTEIN_DATA: { - // Load data - loadIfExists(input.resolve("protein.json.gz"), "protein"); - - // Create index - createIndex("protein"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("uniprotVersion.json"), - input.resolve("interproVersion.json") - )); - dataReleaseManager.update(dataRelease, "protein", EtlCommons.PROTEIN_DATA, sources); + loadProtein(); break; } -// case EtlCommons.PPI_DATA: -// loadIfExists(input.resolve("protein_protein_interaction.json.gz"), "protein_protein_interaction"); -// loadIfExists(input.resolve("intactVersion.json"), METADATA); -// createIndex("protein_protein_interaction"); -// break; case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { // Load data, create index 
and update release loadProteinFunctionalPrediction(); break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { // Load data, create index and update release loadClinical(); break; } case EtlCommons.REPEATS_DATA: { - // Load data, create index and update release loadRepeats(); break; } -// case EtlCommons.STRUCTURAL_VARIANTS_DATA: -// loadStructuralVariants(); -// break; - case EtlCommons.OBO_DATA: { - // Load data - loadIfExists(input.resolve("ontology.json.gz"), "ontology"); - - // Create index - createIndex("ontology"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.HPO_VERSION_FILE), - input.resolve(EtlCommons.GO_VERSION_FILE), - input.resolve(EtlCommons.DO_VERSION_FILE) - )); - dataReleaseManager.update(dataRelease, "ontology", EtlCommons.OBO_DATA, sources); + case EtlCommons.ONTOLOGY_DATA: { + loadOntology(); break; } case EtlCommons.SPLICE_SCORE_DATA: { @@ -291,7 +215,7 @@ public void execute() throws CellBaseException { loadSpliceScores(); break; } - case EtlCommons.PUBMED_DATA: { + case PUBMED_DATA: { // Load data, create index and update release loadPubMed(); break; @@ -301,32 +225,27 @@ public void execute() throws CellBaseException { loadPharmacogenomica(); break; } + case EtlCommons.PGS_DATA: { + // Load data, create index and update release + loadPolygenicScores(); + break; + } default: - logger.warn("Not valid 'data'. We should not reach this point"); + logger.warn("Not valid data: {}. 
We should not reach this point", data); break; } } catch (IllegalAccessException | InstantiationException | InvocationTargetException | ExecutionException - | NoSuchMethodException | InterruptedException | ClassNotFoundException | LoaderException | IOException e) { - e.printStackTrace(); + | NoSuchMethodException | ClassNotFoundException | LoaderException | IOException e) { + logger.error(Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(Arrays.toString(e.getStackTrace())); + // Restore interrupted state... + Thread.currentThread().interrupt(); } } } } -// private void loadStructuralVariants() { -// Path path = input.resolve(EtlCommons.STRUCTURAL_VARIANTS_JSON + ".json.gz"); -// if (Files.exists(path)) { -// try { -// logger.debug("Loading '{}' ...", path.toString()); -// loadRunner.load(path, EtlCommons.STRUCTURAL_VARIANTS_DATA); -// loadIfExists(input.resolve(EtlCommons.DGV_VERSION_FILE), "metadata"); -// } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException -// | IllegalAccessException | ExecutionException | IOException | InterruptedException e) { -// logger.error(e.toString()); -// } -// } -// } - private void loadIfExists(Path path, String collection) throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IOException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, LoaderException, CellBaseException { @@ -343,132 +262,132 @@ private void loadIfExists(Path path, String collection) throws NoSuchMethodExcep } private void checkParameters() throws CellBaseException { - if (loadCommandOptions.numThreads > 1) { - numThreads = loadCommandOptions.numThreads; + // Input folder + if (!Files.exists(Paths.get(loadCommandOptions.input))) { + throw new CellBaseException("Input path '" + loadCommandOptions.input + "' does not exist"); + } + if (!Files.isDirectory(Paths.get(loadCommandOptions.input))) { + throw new 
CellBaseException("Input path '" + loadCommandOptions.input + "' is not a directory"); + } + input = Paths.get(loadCommandOptions.input); + + // Database + if (StringUtils.isEmpty(loadCommandOptions.database)) { + throw new CellBaseException("Missing database"); + } + database = loadCommandOptions.database; + + // Data + if (StringUtils.isEmpty(loadCommandOptions.data)) { + throw new CellBaseException("Missing data. Please, specify a list of data separated by commas, or use 'all' to load" + + " everything"); + } + String species = DatabaseNameUtils.getSpeciesFromDatabaseName(database); + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + if (speciesConfiguration == null) { + throw new CellBaseException("Species '" + species + "' not supported (database name '" + database + "')"); + } + if (loadCommandOptions.data.equals("all")) { + dataList = speciesConfiguration.getData(); } else { - numThreads = 1; - logger.warn("Incorrect number of numThreads, it must be a positive value. This has been set to '{}'", numThreads); + dataList = Arrays.asList(loadCommandOptions.data.split(",")); + if (CollectionUtils.isEmpty(dataList)) { + throw new CellBaseException("Missing data. Please, specify a list of data separated by commas, or use 'all' to load" + + " everything"); + } + Set invalidData = new HashSet<>(); + for (String data : dataList) { + if (!speciesConfiguration.getData().contains(data)) { + invalidData.add(data); + } + } + if (!CollectionUtils.isEmpty(invalidData)) { + throw new CellBaseException("Data '" + StringUtils.join(invalidData, ",") + "' not supported by species '" + species + "'"); + } } - if (field != null) { - if (loadCommandOptions.data == null) { - logger.error("--data option cannot be empty.
Please provide a valid value for the --data parameter."); - } else if (!Files.exists(input)) { - logger.error("Input parameter {} does not exist", input); + // Field + if (StringUtils.isNotEmpty(loadCommandOptions.field)) { + field = loadCommandOptions.field; + } + + // Inner fields + if (StringUtils.isNotEmpty(loadCommandOptions.innerFields)) { + innerFields = loadCommandOptions.innerFields.split(","); + } + + // Loader + if (StringUtils.isNotEmpty(loadCommandOptions.loader)) { + loader = loadCommandOptions.loader; + try { + Class.forName(loader); + } catch (ClassNotFoundException e) { + throw new CellBaseException("Loader Java class '" + loader + "' does not exist", e); } - } else if (!Files.exists(input) || !Files.isDirectory(input)) { - logger.error("Input parameter {} does not exist or is not a directory", input); } - try { - Class.forName(loader); - } catch (ClassNotFoundException e) { - logger.error("Loader Java class '{}' does not exist", loader); - e.printStackTrace(); - System.exit(-1); + + // Skip indexes + createIndexes = !loadCommandOptions.skipIndex; + + // Num. threads + if (loadCommandOptions.numThreads > 1) { + numThreads = loadCommandOptions.numThreads; + } else { + numThreads = 1; + logger.warn("Incorrect number of numThreads, it must be a positive value. 
This has been set to '{}'", numThreads); } - // Check data release + // Data release + dataReleaseManager = new DataReleaseManager(database, configuration); dataRelease = getDataReleaseForLoading(dataReleaseManager).getRelease(); } - private void loadVariationData() throws NoSuchMethodException, InterruptedException, ExecutionException, + private void loadVariation() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, LoaderException, CellBaseException { - Path variationPath = input.resolve(VARIATION_DATA); - // First load data if (field == null) { - // Common loading process from CellBase variation data models - DirectoryStream stream = Files.newDirectoryStream(variationPath, - entry -> entry.getFileName().toString().startsWith("variation_chr")); + Path variationPath = input.resolve(VARIATION_DATA); - int numLoadings = 0; - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(variationPath.resolve(entry.getFileName()), "variation", dataRelease); - numLoadings++; + // Loading variant_chrXXX files, if necessary + File[] chrFiles = variationPath.toFile().listFiles((dir, name) -> name.startsWith(VARIATION_CHR_PREFIX)); + if (chrFiles.length > 0) { + // Common loading process from CellBase variation data models + loadData(variationPath, VARIATION_DATA, VARIATION_CHR_PREFIX); } - if (numLoadings > 0) { - // Create index - createIndex("variation"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - variationPath.resolve("ensemblVariationVersion.json") - )); - dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources); - } else { - logger.info("Any variation file 'variation_chr...' 
found within folder '{}'", variationPath); - } + // Loading dbSNP file, if necessary + HashMap collectionMap = new HashMap<>(); + collectionMap.put(SNP_DATA, DBSNP_OUTPUT_FILENAME); + loadData(variationPath.resolve(DBSNP_DATA), collectionMap); } else { // Custom update required e.g. population freqs loading - logger.info("Loading file '{}'", variationPath); - loadRunner.load(variationPath, "variation", dataRelease, field, innerFields); - } - - // Load dbSNP - Path dbSnpFilePath = variationPath.resolve(DBSNP_NAME + ".json.gz"); - if (dbSnpFilePath.toFile().exists()) { - if (variationPath.resolve(DBSNP_VERSION_FILENAME).toFile().exists()) { - logger.info("Loading dbSNP file '{}'", dbSnpFilePath); - loadRunner.load(dbSnpFilePath, SNP_COLLECTION_NAME, dataRelease); - - // Create index - createIndex(SNP_COLLECTION_NAME); - - // Update release (collection and sources) - List sources = Collections.singletonList(variationPath.resolve(DBSNP_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, SNP_COLLECTION_NAME, EtlCommons.VARIATION_DATA, sources); - } else { - logger.warn("In order to load the dbSNP file you need the version file {} within the folder '{}'", DBSNP_VERSION_FILENAME, - variationPath); - } - } else { - logger.warn("Any dbSNP file found within the folder '{}'", variationPath); + logger.info(LOADING_FILE_LOG_MESSAGE, input); + loadRunner.load(input, VARIATION_DATA, dataRelease, field, innerFields); } } - private void loadConservation() throws NoSuchMethodException, InterruptedException, ExecutionException, - InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, - IOException, CellBaseException, LoaderException { - // Load data - DirectoryStream stream = Files.newDirectoryStream(input, - entry -> entry.getFileName().toString().startsWith("conservation_")); - - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "conservation", 
dataRelease); - } - - // Create index - createIndex("conservation"); - - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve("gerpVersion.json"), - input.resolve("phastConsVersion.json"), - input.resolve("phyloPVersion.json") - )); - dataReleaseManager.update(dataRelease, "conservation", EtlCommons.CONSERVATION_DATA, sources); + private void loadConservation() throws IOException, CellBaseException { + loadData(input.resolve(CONSERVATION_DATA), CONSERVATION_DATA, "conservation_"); } private void loadProteinFunctionalPrediction() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { // Load data - DirectoryStream stream = Files.newDirectoryStream(input, - entry -> entry.getFileName().toString().startsWith("prot_func_pred_")); + try (DirectoryStream stream = Files.newDirectoryStream(input, + entry -> entry.getFileName().toString().startsWith("prot_func_pred_"))) { - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "protein_functional_prediction", dataRelease); - } + for (Path entry : stream) { + logger.info(LOADING_FILE_LOG_MESSAGE, entry); + loadRunner.load(input.resolve(entry.getFileName()), PROTEIN_FUNCTIONAL_PREDICTION_DATA, dataRelease); + } - // Create index - createIndex("protein_functional_prediction"); + // Create index + createIndex(PROTEIN_FUNCTIONAL_PREDICTION_DATA); - // Update release (collection and sources) - dataReleaseManager.update(dataRelease, "protein_functional_prediction", null, null); + // Update release (collection and sources) + dataReleaseManager.update(dataRelease, PROTEIN_FUNCTIONAL_PREDICTION_DATA, null); + } } private void loadClinical() throws FileNotFoundException { @@ -477,10 +396,10 @@ private void loadClinical() throws FileNotFoundException 
{ try { // Load data logger.info("Loading '{}' ...", path); - loadRunner.load(path, "clinical_variants", dataRelease); + loadRunner.load(path, CLINICAL_VARIANT_DATA, dataRelease); // Create index - createIndex("clinical_variants"); + createIndex(CLINICAL_VARIANT_DATA); // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( @@ -488,46 +407,63 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANTS_DATA, sources); + dataReleaseManager.update(dataRelease, CLINICAL_VARIANT_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { - logger.error(e.toString()); - } catch (LoaderException e) { - e.printStackTrace(); + | IllegalAccessException | ExecutionException | IOException | LoaderException | CellBaseException e) { + logger.error(Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(Arrays.toString(e.getStackTrace())); + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); } } else { throw new FileNotFoundException("File " + path + " does not exist"); } } - private void loadRepeats() { - Path path = input.resolve(EtlCommons.REPEATS_JSON + ".json.gz"); - if (Files.exists(path)) { - try { - // Load data - logger.debug("Loading '{}' ...", path); - loadRunner.load(path, "repeats", dataRelease); + private void loadGenome() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(GENOME_SEQUENCE_COLLECTION_NAME, GENOME_JSON_FILENAME); + collectionMap.put(GENOME_INFO_DATA, GENOME_INFO_FILENAME); - // Create index - createIndex("repeats"); + loadData(input.resolve(GENOME_DATA), collectionMap); + } - // Update release (collection and sources) - List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.TRF_VERSION_FILE), - input.resolve(EtlCommons.GSD_VERSION_FILE), - input.resolve(EtlCommons.WM_VERSION_FILE) - )); - dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources); - } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { - logger.error(e.toString()); - } catch (LoaderException e) { - e.printStackTrace(); - } - } else { - logger.warn("Repeats file {} not found", path); - logger.warn("No repeats data will be loaded"); - } + private void loadGene() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(GENE_DATA, ENSEMBL_GENE_OUTPUT_FILENAME); + collectionMap.put(REFSEQ_DATA, REFSEQ_GENE_OUTPUT_FILENAME); + + loadData(input.resolve(GENE_DATA), collectionMap); + } + + private void loadRepeats() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(REPEATS_DATA, REPEATS_OUTPUT_FILENAME); + + loadData(input.resolve(REPEATS_DATA), collectionMap); + } + + private void loadOntology() throws 
CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(ONTOLOGY_DATA, OBO_OUTPUT_FILENAME); + + loadData(input.resolve(ONTOLOGY_DATA), collectionMap); + } + + private void loadRegulation() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(REGULATORY_REGION_BASENAME, REGULATORY_REGION_OUTPUT_FILENAME); + collectionMap.put(REGULATORY_PFM_BASENAME, REGULATORY_PFM_OUTPUT_FILENAME); + + loadData(input.resolve(REGULATION_DATA), collectionMap); + } + + private void loadProtein() throws CellBaseException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(PROTEIN_DATA, PROTEIN_OUTPUT_FILENAME); + + loadData(input.resolve(PROTEIN_DATA), collectionMap); } private void loadSpliceScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, @@ -535,57 +471,62 @@ private void loadSpliceScores() throws NoSuchMethodException, InterruptedExcepti // Load data logger.info("Loading splice scores from '{}'", input); // MMSplice scores - loadSpliceScores(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_SUBDIRECTORY)); + loadSpliceScores(input.resolve(SPLICE_SCORE_DATA + "/" + MMSPLICE_DATA)); // SpliceAI scores - loadSpliceScores(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.SPLICEAI_SUBDIRECTORY)); + loadSpliceScores(input.resolve(SPLICE_SCORE_DATA + "/" + SPLICEAI_DATA)); // Create index createIndex("splice_score"); // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_VERSION_FILENAME), - input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.SPLICEAI_VERSION_FILENAME) + input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(MMSPLICE_DATA)), + input.resolve(SPLICE_SCORE_DATA + "/" + getDataVersionFilename(SPLICEAI_DATA)) )); - dataReleaseManager.update(dataRelease, "splice_score", 
EtlCommons.SPLICE_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, SPLICE_SCORE_DATA, sources); } private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionException, InterruptedException, ClassNotFoundException, InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException, LoaderException, CellBaseException { // Get files from folder - DirectoryStream stream = Files.newDirectoryStream(spliceFolder, - entry -> entry.getFileName().toString().startsWith("splice_score_")); + try (DirectoryStream stream = Files.newDirectoryStream(spliceFolder, + entry -> entry.getFileName().toString().startsWith("splice_score_"))) { - // Load from JSON files - for (Path entry : stream) { - logger.info("Loading file '{}'", entry); - loadRunner.load(spliceFolder.resolve(entry.getFileName()), "splice_score", dataRelease); + // Load from JSON files + for (Path entry : stream) { + logger.info(LOADING_FILE_LOG_MESSAGE, entry); + loadRunner.load(spliceFolder.resolve(entry.getFileName()), "splice_score", dataRelease); + } } } private void loadPubMed() throws CellBaseException { - Path pubmedPath = input.resolve(EtlCommons.PUBMED_DATA); + Path pubmedPath = input.resolve(PUBMED_DATA); if (Files.exists(pubmedPath)) { // Load data for (File file : pubmedPath.toFile().listFiles()) { if (file.isFile() && (file.getName().endsWith("gz"))) { - logger.info("Loading file '{}'", file.getName()); + logger.info(LOADING_FILE_LOG_MESSAGE, file.getName()); try { - loadRunner.load(file.toPath(), EtlCommons.PUBMED_DATA, dataRelease); + loadRunner.load(file.toPath(), PUBMED_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | LoaderException e) { - logger.error("Error loading file '{}': {}", file.getName(), e.toString()); + | IllegalAccessException | ExecutionException | IOException | 
LoaderException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, file.getName(), Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, file.getName(), Arrays.toString(e.getStackTrace())); + // Restore interrupted state... + Thread.currentThread().interrupt(); } } } // Create index - createIndex(EtlCommons.PUBMED_DATA); + createIndex(PUBMED_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources); + List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.getDataVersionFilename(PUBMED_DATA))); + dataReleaseManager.update(dataRelease, PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } @@ -601,34 +542,126 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { // Load data Path pharmaJsonPath = pharmaPath.resolve(EtlCommons.PHARMACOGENOMICS_DATA + ".json.gz"); - logger.info("Loading file '{}'", pharmaJsonPath.toFile().getName()); + logger.info(LOADING_FILE_LOG_MESSAGE, pharmaJsonPath.toFile().getName()); try { loadRunner.load(pharmaJsonPath, EtlCommons.PHARMACOGENOMICS_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException - | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException - | LoaderException e) { - logger.error("Error loading file '{}': {}", pharmaJsonPath.toFile().getName(), e.toString()); + | IllegalAccessException | ExecutionException | IOException | CellBaseException | LoaderException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, pharmaJsonPath.toFile().getName(), Arrays.toString(e.getStackTrace())); + } catch (InterruptedException e) { + logger.error(ERROR_LOADING_FILE_LOG_MESSAGE, 
pharmaJsonPath.toFile().getName(), Arrays.toString(e.getStackTrace())); + // Restore interrupted state... + Thread.currentThread().interrupt(); } - // Create index createIndex(EtlCommons.PHARMACOGENOMICS_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); + List sources = Collections.singletonList(pharmaPath.resolve(getDataVersionFilename(PHARMGKB_DATA))); + dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, sources); + } + + private void loadPolygenicScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, + IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { + HashMap collectionMap = new HashMap<>(); + collectionMap.put(PGS_COMMON_COLLECTION, PGS_COMMON_OUTPUT_FILENAME); + collectionMap.put(PGS_VARIANT_COLLECTION, PGS_VARIANT_OUTPUT_FILENAME); + + loadData(input.resolve(PGS_DATA), collectionMap); + } + + private void loadData(Path buildPath, Map collectionMap) throws CellBaseException { + // Load data from the different files into the input collections + for (Map.Entry entry : collectionMap.entrySet()) { + Path jsonPath = buildPath.resolve(entry.getValue()); + loadJsonFile(entry.getKey(), jsonPath); + } + + // Load sources + loadSources(buildPath); + } + + private void loadData(Path buildPath, String collection, String prefix) throws CellBaseException, IOException { + // Load data + try (DirectoryStream stream = Files.newDirectoryStream(buildPath, + entry -> entry.getFileName().toString().startsWith(prefix))) { + + for (Path entry : stream) { + logger.info("Loading JSON file '{}' ...", entry); + try { + loadRunner.load(buildPath.resolve(entry.getFileName()), collection, dataRelease); + logger.info(DONE_MSG); 
+ } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (Exception e) { + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); + } + } + + // Create index + createIndex(collection); + + // Update the data release collection + dataReleaseManager.update(dataRelease, collection, getVersionPaths(buildPath)); + } + } + + private void loadJsonFile(String collection, Path jsonPath) throws CellBaseException { + if (!Files.exists(jsonPath)) { + String collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); + logger.warn("JSON file '{}' not found. No data will be loaded in collection '{}'.", jsonPath, collectionName); + return; + } + + try { + // Load data + logger.info(LOADING_JSON_IN_COLLECTION_MSG, jsonPath.getFileName(), collection, dataRelease); + loadRunner.load(jsonPath, collection, dataRelease); + logger.info(DONE_MSG); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); + } catch (Exception e) { + throw new CellBaseException(ERROR_LOADING_DATA + collection, e); + } + + // Create index + createIndex(collection); + + // Update collection in data release + dataReleaseManager.update(dataRelease, collection); } private void createIndex(String collection) { if (!createIndexes) { return; } - String collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); - logger.info("Loading indexes for '{}' collection ...", collectionName); + + String collectionName = null; try { + collectionName = CellBaseDBAdaptor.buildCollectionName(collection, dataRelease); + logger.info("Creating indexes for collection '{}' ...", collectionName); indexManager.createMongoDBIndexes(Collections.singletonList(collectionName), true); + logger.info(DONE_MSG); } catch (IOException e) { - logger.error("Error creating index: {}", e.getMessage()); + logger.error("Error creating indexes for 
collection '{}': {}", collectionName, Arrays.toString(e.getStackTrace())); + } + } + + private void loadSources(Path path) throws CellBaseException { + // Update data source in data release + dataReleaseManager.updateSources(dataRelease, getVersionPaths(path)); + } + + private List getVersionPaths(Path path) { + List sources = new ArrayList<>(); + for (File file : path.toFile().listFiles()) { + if (file.getName().endsWith(SUFFIX_VERSION_FILENAME)) { + sources.add(file.getAbsoluteFile().toPath()); + } } + return sources; } private DataRelease getDataReleaseForLoading(DataReleaseManager dataReleaseManager) throws CellBaseException { @@ -638,11 +671,9 @@ private DataRelease getDataReleaseForLoading(DataReleaseManager dataReleaseManag throw new CellBaseException("No data releases are available"); } DataRelease lastDataRelease = null; - for (DataRelease dataRelease : dataReleaseResults.getResults()) { - if (lastDataRelease == null) { - lastDataRelease = dataRelease; - } else if (dataRelease.getRelease() > lastDataRelease.getRelease()) { - lastDataRelease = dataRelease; + for (DataRelease dr : dataReleaseResults.getResults()) { + if (lastDataRelease == null || dr.getRelease() > lastDataRelease.getRelease()) { + lastDataRelease = dr; } } if (lastDataRelease == null) { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java index 612e8d6a38..764de7b0df 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ValidationCommandExecutor.java @@ -78,7 +78,7 @@ public void execute() { DataRelease dataRelease = dataReleaseManager.get(validationCommandOptions.dataRelease); variantAnnotationCalculator = new VariantAnnotationCalculator(validationCommandOptions.species, 
validationCommandOptions.assembly, dataRelease, validationCommandOptions.apiKey, - cellBaseManagerFactory); + cellBaseManagerFactory, configuration); } catch (CellBaseException e) { e.printStackTrace(); return; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java index d2285d5550..731a7220ca 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/main/executors/VariantAnnotationCommandExecutor.java @@ -207,7 +207,7 @@ private boolean runAnnotation() throws Exception { DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); DataRelease dataRelease = dataReleaseManager.get(variantAnnotationCommandOptions.dataRelease); VariantAnnotationCalculator variantAnnotationCalculator = new VariantAnnotationCalculator(species, assembly, - dataRelease, variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory); + dataRelease, variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory, configuration); List> annotationByVariantList = variantAnnotationCalculator.getAnnotationByVariantList(variants, serverQueryOptions); @@ -485,7 +485,7 @@ private VariantAnnotator createCellBaseAnnotator() throws CellBaseException { DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); DataRelease dataRelease = dataReleaseManager.get(variantAnnotationCommandOptions.dataRelease); return new CellBaseLocalVariantAnnotator(new VariantAnnotationCalculator(species, assembly, dataRelease, - variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory), serverQueryOptions); + variantAnnotationCommandOptions.apiKey, cellBaseManagerFactory, configuration), serverQueryOptions); } else { try { 
ClientConfiguration clientConfiguration = ClientConfiguration.load(getClass() diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java index 1451fc213b..b987afa29f 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/GeneQuery.java @@ -42,12 +42,11 @@ public class GeneQuery extends AbstractQuery { @QueryParameter(id = "region") private List regions; - @QueryParameter(id = "transcripts.biotype", alias = {ParamConstants.TRANSCRIPT_BIOTYPES_PARAM, - "transcriptsBiotype"}) + @QueryParameter(id = "transcripts.biotype", alias = {ParamConstants.TRANSCRIPT_BIOTYPES_PARAM, "transcriptsBiotype"}) private List transcriptsBiotype; - @QueryParameter(id = "transcripts.xrefs.id", alias = {ParamConstants.TRANSCRIPT_XREFS_PARAM, "xrefs", "transcriptsXrefsId", - "transcripts.xrefs"}) + @QueryParameter(id = "transcripts.xrefs.id", + alias = {ParamConstants.TRANSCRIPT_XREFS_PARAM, "xrefs", "transcriptsXrefsId", "transcripts.xrefs"}) private List transcriptsXrefs; @QueryParameter(id = "transcripts.id", alias = {ParamConstants.TRANSCRIPT_IDS_PARAM, "transcriptsId"}) private List transcriptsId; @@ -61,22 +60,23 @@ public class GeneQuery extends AbstractQuery { private LogicalList transcriptsTfbsId; @QueryParameter(id = "transcripts.tfbs.pfmId", alias = {ParamConstants.TRANSCRIPT_TFBS_PFMIDS_PARAM, "transcriptsTfbsPfmId"}) private LogicalList transcriptsTfbsPfmId; - @QueryParameter(id = "transcripts.tfbs.transcriptionFactors", alias = {ParamConstants.TRANSCRIPT_TRANSCRIPTION_FACTORS_PARAM, - "transcriptsTfbsTranscriptionFactors"}) + @QueryParameter(id = "transcripts.tfbs.transcriptionFactors", + alias = {ParamConstants.TRANSCRIPT_TRANSCRIPTION_FACTORS_PARAM, "transcriptsTfbsTranscriptionFactors"}) private LogicalList transcriptsTfbsTranscriptionFactors; - @QueryParameter(id = 
ParamConstants.ONTOLOGY_PARAM, alias = {"transcripts.annotation.ontologies.id", - "transcripts.annotation.ontologies.name", "transcriptAnnotationOntologiesId"}) + @QueryParameter(id = ParamConstants.ONTOLOGY_PARAM, + alias = {"transcripts.annotation.ontologies.id", "transcripts.annotation.ontologies.name", "transcriptAnnotationOntologiesId"}) private LogicalList transcriptAnnotationOntologiesId; - @QueryParameter(id = ParamConstants.ANNOTATION_DISEASES_PARAM, alias = {"annotation.diseases.id", "annotation.diseases.name"}) + @QueryParameter(id = ParamConstants.ANNOTATION_DISEASES_PARAM, + alias = {"annotation.diseases.id", "annotation.diseases.name", "annotation.diseases.hpo"}) private LogicalList annotationDiseases; - @QueryParameter(id = "annotation.expression.tissue", alias = {ParamConstants.ANNOTATION_EXPRESSION_TISSUE_PARAM, - "annotationExpressionTissue"}) + @QueryParameter(id = "annotation.expression.tissue", + alias = {ParamConstants.ANNOTATION_EXPRESSION_TISSUE_PARAM, "annotationExpressionTissue"}) private LogicalList annotationExpressionTissue; - @QueryParameter(id = "annotation.expression.value", alias = {ParamConstants.ANNOTATION_EXPRESSION_VALUE_PARAM, - "annotationExpressionValue"}) + @QueryParameter(id = "annotation.expression.value", + alias = {ParamConstants.ANNOTATION_EXPRESSION_VALUE_PARAM, "annotationExpressionValue"}) private LogicalList annotationExpressionValue; - @QueryParameter(id = "annotation.drugs.drugName", alias = {ParamConstants.ANNOTATION_DRUGS_NAME_PARAM, "annotation.drugs.name", - "annotationDrugsName"}) + @QueryParameter(id = "annotation.drugs.drugName", + alias = {ParamConstants.ANNOTATION_DRUGS_NAME_PARAM, "annotation.drugs.name", "annotationDrugsName"}) private LogicalList annotationDrugsName; @QueryParameter(id = "constraints", alias = {ParamConstants.ANNOTATION_CONSTRAINTS_PARAM}) private LogicalList annotationConstraints; diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java 
b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java new file mode 100644 index 0000000000..106b01e1fe --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java @@ -0,0 +1,98 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.core.api; + +import org.opencb.cellbase.core.api.query.AbstractQuery; +import org.opencb.cellbase.core.api.query.QueryException; +import org.opencb.cellbase.core.api.query.QueryParameter; + +import java.util.List; +import java.util.Map; + +public class PolygenicScoreQuery extends AbstractQuery { + + @QueryParameter(id = "id") + private List ids; + + @QueryParameter(id = "name") + private List names; + + @QueryParameter(id = "source", allowedValues = {"PGS Catalog"}) + private List sources; + + public PolygenicScoreQuery() { + } + + public PolygenicScoreQuery(Map params) throws QueryException { + super(params); + + objectMapper.readerForUpdating(this); + objectMapper.readerFor(PolygenicScoreQuery.class); + objectWriter = objectMapper.writerFor(PolygenicScoreQuery.class); + } + + @Override + protected void validateQuery() throws QueryException { + // Nothing to to + return; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("PolygenicScoreQuery{"); + sb.append("ids=").append(ids); + sb.append(", names=").append(names); + sb.append(", 
sources=").append(sources); + sb.append(", limit=").append(limit); + sb.append(", skip=").append(skip); + sb.append(", count=").append(count); + sb.append(", sort='").append(sort).append('\''); + sb.append(", order=").append(order); + sb.append(", facet='").append(facet).append('\''); + sb.append(", includes=").append(includes); + sb.append(", excludes=").append(excludes); + sb.append('}'); + return sb.toString(); + } + + public List getIds() { + return ids; + } + + public PolygenicScoreQuery setIds(List ids) { + this.ids = ids; + return this; + } + + public List getNames() { + return names; + } + + public PolygenicScoreQuery setNames(List names) { + this.names = names; + return this; + } + + public List getSources() { + return sources; + } + + public PolygenicScoreQuery setSources(List sources) { + this.sources = sources; + return this; + } +} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java index de470db66d..c30d3d6bea 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/CellBaseConfiguration.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.google.common.base.CaseFormat; -import org.apache.commons.lang.StringUtils; import org.opencb.commons.utils.FileUtils; import org.slf4j.LoggerFactory; @@ -27,7 +26,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Path; -import java.util.*; +import java.util.HashMap; +import java.util.Map; public class CellBaseConfiguration { @@ -135,10 +135,7 @@ private static DatabaseCredentials secureGetMongodb(CellBaseConfiguration config configuration.setDatabases(new Databases()); } if (configuration.getDatabases().getMongodb() == null) { - 
configuration.getDatabases().setMongodb(new MongoDBDatabaseCredentials()); - } - if (configuration.getDatabases().getMongodb().getShards() == null) { - configuration.getDatabases().getMongodb().setShards(Collections.emptyList()); + configuration.getDatabases().setMongodb(new DatabaseCredentials()); } if (configuration.getDatabases().getMongodb().getOptions() == null) { configuration.getDatabases().getMongodb().setOptions(new HashMap<>()); @@ -228,51 +225,6 @@ public void setSpecies(SpeciesProperties species) { this.species = species; } - /** - * get the config for this species. - * @param id shortName for species, e.g. hsapiens - * @return configuration for this species - */ - public SpeciesConfiguration getSpeciesConfig(String id) { - if (StringUtils.isEmpty(id)) { - return null; - } - List allSpecies = getAllSpecies(); - for (SpeciesConfiguration config : allSpecies) { - if (config.getId().equals(id)) { - return config; - } - } - return null; - } - - public List getAllSpecies() { - List allSpecies = new ArrayList<>(); - if (species.getVertebrates() != null && !species.getVertebrates().isEmpty()) { - allSpecies.addAll(species.getVertebrates()); - } - if (species.getMetazoa() != null && !species.getMetazoa().isEmpty()) { - allSpecies.addAll(species.getMetazoa()); - } - if (species.getFungi() != null && !species.getFungi().isEmpty()) { - allSpecies.addAll(species.getFungi()); - } - if (species.getProtist() != null && !species.getProtist().isEmpty()) { - allSpecies.addAll(species.getProtist()); - } - if (species.getPlants() != null && !species.getPlants().isEmpty()) { - allSpecies.addAll(species.getPlants()); - } - if (species.getVirus() != null && !species.getVirus().isEmpty()) { - allSpecies.addAll(species.getVirus()); - } - if (species.getBacteria() != null && !species.getBacteria().isEmpty()) { - allSpecies.addAll(species.getBacteria()); - } - - return allSpecies; - } - public ServerProperties getServer() { return server; } diff --git 
a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java index 304c191d78..ab9c8a6e94 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DatabaseCredentials.java @@ -18,15 +18,13 @@ import java.util.Map; -/** - * Created by imedina on 19/08/16. - */ + public class DatabaseCredentials { - private String host; - private String user; - private String password; - private Map options; + protected String host; + protected String user; + protected String password; + protected Map options; public DatabaseCredentials() { } @@ -40,7 +38,7 @@ public DatabaseCredentials(String host, String user, String password, Map getOptions() { return options; } - public void setOptions(Map options) { + public DatabaseCredentials setOptions(Map options) { this.options = options; + return this; } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java index 4c0cf374c7..905780fcdb 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/Databases.java @@ -16,48 +16,32 @@ package org.opencb.cellbase.core.config; -import java.util.Map; - -/** - * Created by imedina on 16/09/16. 
- */ public class Databases { - private MongoDBDatabaseCredentials mongodb; - private Map neo4j; + private DatabaseCredentials mongodb; public Databases() { } - public Databases(MongoDBDatabaseCredentials mongodb, Map neo4j) { + public Databases(DatabaseCredentials mongodb) { this.mongodb = mongodb; - this.neo4j = neo4j; } @Override public String toString() { final StringBuilder sb = new StringBuilder("Databases{"); sb.append("mongodb=").append(mongodb); - sb.append(", neo4j=").append(neo4j); sb.append('}'); return sb.toString(); } - public MongoDBDatabaseCredentials getMongodb() { + public DatabaseCredentials getMongodb() { return mongodb; } - public Databases setMongodb(MongoDBDatabaseCredentials mongodb) { + public Databases setMongodb(DatabaseCredentials mongodb) { this.mongodb = mongodb; return this; } - public Map getNeo4j() { - return neo4j; - } - - public Databases setNeo4j(Map neo4j) { - this.neo4j = neo4j; - return this; - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 507e85a75f..915dfa086b 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -16,7 +16,7 @@ package org.opencb.cellbase.core.config; -import java.util.List; +import java.util.Map; /** * Created by imedina on 19/08/16. 
@@ -28,16 +28,11 @@ public class DownloadProperties { private URLProperties hgnc; private URLProperties cancerHotspot; private URLProperties refSeq; - private URLProperties refSeqFasta; - private URLProperties refSeqProteinFasta; - private URLProperties refSeqCdna; private URLProperties maneSelect; private URLProperties lrg; - private URLProperties geneUniprotXref; private URLProperties geneExpressionAtlas; private URLProperties mirbase; - private URLProperties mirbaseReadme; private URLProperties targetScan; private URLProperties miRTarBase; private URLProperties uniprot; @@ -45,25 +40,20 @@ public class DownloadProperties { private URLProperties intact; private URLProperties interpro; private URLProperties interproRelNotes; - private URLProperties conservation; + private URLProperties phastCons; + private URLProperties phylop; private URLProperties gerp; private URLProperties clinvar; - private URLProperties clinvarVariation; - private URLProperties clinvarSummary; - private URLProperties clinvarVariationAllele; - private URLProperties clinvarEfoTerms; + private URLProperties cosmic; + private URLProperties hgmd; private URLProperties dbSNP; - private URLProperties iarctp53; - private URLProperties docm; - private URLProperties docmVersion; private URLProperties dgv; private URLProperties simpleRepeats; private URLProperties windowMasker; private URLProperties genomicSuperDups; private URLProperties hpo; - private URLProperties disgenet; - private URLProperties disgenetReadme; private URLProperties dgidb; + private URLProperties cancerGeneCensus; private URLProperties gwasCatalog; private URLProperties dbsnp; private URLProperties cadd; @@ -75,8 +65,11 @@ public class DownloadProperties { private URLProperties mondoObo; private URLProperties goAnnotation; private URLProperties revel; + private URLProperties mmSplice; + private URLProperties spliceAi; private URLProperties pubmed; private URLProperties pharmGKB; + private URLProperties pgsCatalog; public 
EnsemblProperties getEnsembl() { return ensembl; @@ -132,15 +125,6 @@ public DownloadProperties setMirbase(URLProperties mirbase) { return this; } - public URLProperties getMirbaseReadme() { - return mirbaseReadme; - } - - public DownloadProperties setMirbaseReadme(URLProperties mirbaseReadme) { - this.mirbaseReadme = mirbaseReadme; - return this; - } - public URLProperties getTargetScan() { return targetScan; } @@ -204,12 +188,21 @@ public DownloadProperties setInterproRelNotes(URLProperties interproRelNotes) { return this; } - public URLProperties getConservation() { - return conservation; + public URLProperties getPhastCons() { + return phastCons; } - public DownloadProperties setConservation(URLProperties conservation) { - this.conservation = conservation; + public DownloadProperties setPhastCons(URLProperties phastCons) { + this.phastCons = phastCons; + return this; + } + + public URLProperties getPhylop() { + return phylop; + } + + public DownloadProperties setPhylop(URLProperties phylop) { + this.phylop = phylop; return this; } @@ -231,38 +224,21 @@ public DownloadProperties setClinvar(URLProperties clinvar) { return this; } - public URLProperties getClinvarVariation() { - return clinvarVariation; - } - - public DownloadProperties setClinvarVariation(URLProperties clinvarVariation) { - this.clinvarVariation = clinvarVariation; - return this; - } - - public URLProperties getClinvarSummary() { - return clinvarSummary; + public URLProperties getCosmic() { + return cosmic; } - public DownloadProperties setClinvarSummary(URLProperties clinvarSummary) { - this.clinvarSummary = clinvarSummary; + public DownloadProperties setCosmic(URLProperties cosmic) { + this.cosmic = cosmic; return this; } - public URLProperties getClinvarVariationAllele() { - return clinvarVariationAllele; - } - - public void setClinvarVariationAllele(URLProperties clinvarVariationAllele) { - this.clinvarVariationAllele = clinvarVariationAllele; - } - - public URLProperties getClinvarEfoTerms() 
{ - return clinvarEfoTerms; + public URLProperties getHgmd() { + return hgmd; } - public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) { - this.clinvarEfoTerms = clinvarEfoTerms; + public DownloadProperties setHgmd(URLProperties hgmd) { + this.hgmd = hgmd; return this; } @@ -275,30 +251,6 @@ public DownloadProperties setDbSNP(URLProperties dbSNP) { return this; } - public URLProperties getIarctp53() { - return iarctp53; - } - - public void setIarctp53(URLProperties iarctp53) { - this.iarctp53 = iarctp53; - } - - public URLProperties getDocm() { - return docm; - } - - public void setDocm(URLProperties docm) { - this.docm = docm; - } - - public URLProperties getDocmVersion() { - return docmVersion; - } - - public void setDocmVersion(URLProperties docmVersion) { - this.docmVersion = docmVersion; - } - public URLProperties getDgv() { return dgv; } @@ -340,30 +292,21 @@ public DownloadProperties setHpo(URLProperties hpo) { return this; } - public URLProperties getDisgenet() { - return disgenet; - } - - public DownloadProperties setDisgenet(URLProperties disgenet) { - this.disgenet = disgenet; - return this; - } - - public URLProperties getDisgenetReadme() { - return disgenetReadme; + public URLProperties getDgidb() { + return dgidb; } - public DownloadProperties setDisgenetReadme(URLProperties disgenetReadme) { - this.disgenetReadme = disgenetReadme; + public DownloadProperties setDgidb(URLProperties dgidb) { + this.dgidb = dgidb; return this; } - public URLProperties getDgidb() { - return dgidb; + public URLProperties getCancerGeneCensus() { + return cancerGeneCensus; } - public DownloadProperties setDgidb(URLProperties dgidb) { - this.dgidb = dgidb; + public DownloadProperties setCancerGeneCensus(URLProperties cancerGeneCensus) { + this.cancerGeneCensus = cancerGeneCensus; return this; } @@ -457,25 +400,30 @@ public DownloadProperties setRefSeq(URLProperties refSeq) { return this; } - public URLProperties getRefSeqFasta() { - return refSeqFasta; 
+ public URLProperties getRevel() { + return revel; } - public DownloadProperties setRefSeqFasta(URLProperties refSeqFasta) { - this.refSeqFasta = refSeqFasta; + public DownloadProperties setRevel(URLProperties revel) { + this.revel = revel; return this; } - public URLProperties getRefSeqProteinFasta() { - return refSeqProteinFasta; + public URLProperties getMmSplice() { + return mmSplice; } - public URLProperties getRevel() { - return revel; + public DownloadProperties setMmSplice(URLProperties mmSplice) { + this.mmSplice = mmSplice; + return this; } - public DownloadProperties setRevel(URLProperties revel) { - this.revel = revel; + public URLProperties getSpliceAi() { + return spliceAi; + } + + public DownloadProperties setSpliceAi(URLProperties spliceAi) { + this.spliceAi = spliceAi; return this; } @@ -497,17 +445,12 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } - public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { - this.refSeqProteinFasta = refSeqProteinFasta; - return this; + public URLProperties getPgsCatalog() { + return pgsCatalog; } - public URLProperties getRefSeqCdna() { - return refSeqCdna; - } - - public DownloadProperties setRefSeqCdna(URLProperties refSeqCdna) { - this.refSeqCdna = refSeqCdna; + public DownloadProperties setPgsCatalog(URLProperties pgsCatalog) { + this.pgsCatalog = pgsCatalog; return this; } @@ -582,7 +525,7 @@ public static class URLProperties { private String host; private String version; - private List files; + private Map files; public String getHost() { return host; @@ -601,14 +544,13 @@ public URLProperties setVersion(String version) { return this; } - public List getFiles() { + public Map getFiles() { return files; } - public URLProperties setFiles(List files) { + public URLProperties setFiles(Map files) { this.files = files; return this; } - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java 
b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java deleted file mode 100644 index 2582b24115..0000000000 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/MongoDBDatabaseCredentials.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.core.config; - -import java.util.List; -import java.util.Map; - -/** - * Created by imedina on 19/08/16. 
- */ -public class MongoDBDatabaseCredentials extends DatabaseCredentials { - - private List shards; - private String host; - private String user; - private String password; - private Map options; - - public MongoDBDatabaseCredentials() { - } - - public MongoDBDatabaseCredentials(String host, String user, String password, List shards, Map options) { - super(host, user, password, options); - this.shards = shards; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("DatabaseProperties{"); - sb.append("host='").append(host).append('\''); - sb.append(", user='").append(user).append('\''); - sb.append(", password='").append(password).append('\''); - sb.append(", replicaSets='").append(shards).append('\''); - sb.append(", options=").append(options); - sb.append('}'); - return sb.toString(); - } - - public List getShards() { - return shards; - } - - public MongoDBDatabaseCredentials setShards(List shards) { - this.shards = shards; - return this; - } - - public static class ReplicaSet { - private String id; - private String nodes; - - /** - * @return the replicaset name, e.g. rs0 - */ - public String getId() { - return id; - } - - /** - * @param id label for the replicaset, e.g. rs0 - * @return the replicaset of interest - */ - public ReplicaSet setId(String id) { - this.id = id; - return this; - } - - /** - * @return nodes for replica set, e.g. cb-mongo-shard1-1:27017,cb-mongo-shard1-2:27017,cb-mongo-shard1-3:27017 - */ - public String getNodes() { - return nodes; - } - - /** - * @param nodes nodes for replica set, e.g. 
cb-mongo-shard1-1:27017,cb-mongo-shard1-2:27017,cb-mongo-shard1-3:27017 - * @return nodes for this replica set - */ - public ReplicaSet setNodes(String nodes) { - this.nodes = nodes; - return this; - } - } -} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java index 3a3fae4d9f..5c4976675c 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/SpeciesConfiguration.java @@ -18,9 +18,7 @@ import java.util.List; -/** - * Created by imedina on 19/08/16. - */ + public class SpeciesConfiguration { private String id; @@ -28,23 +26,19 @@ public class SpeciesConfiguration { private String commonName; private List assemblies; private List data; - private List shards; public SpeciesConfiguration() { } - public SpeciesConfiguration(String id, String scientificName, String commonName, List assemblies, List data, - List shards) { + public SpeciesConfiguration(String id, String scientificName, String commonName, List assemblies, List data) { this.id = id; this.scientificName = scientificName; this.commonName = commonName; this.assemblies = assemblies; this.data = data; - this.shards = shards; } - @Override public String toString() { final StringBuilder sb = new StringBuilder("Species{"); @@ -53,7 +47,6 @@ public String toString() { sb.append(", commonName='").append(commonName).append('\''); sb.append(", assemblies=").append(assemblies); sb.append(", data=").append(data); - sb.append(", shards=").append(shards); sb.append('}'); return sb.toString(); } @@ -62,40 +55,45 @@ public String getId() { return id; } - public void setId(String id) { + public SpeciesConfiguration setId(String id) { this.id = id; + return this; } public String getScientificName() { return scientificName; } - public void setScientificName(String scientificName) { + 
public SpeciesConfiguration setScientificName(String scientificName) { this.scientificName = scientificName; + return this; } public String getCommonName() { return commonName; } - public void setCommonName(String commonName) { + public SpeciesConfiguration setCommonName(String commonName) { this.commonName = commonName; + return this; } public List getAssemblies() { return assemblies; } - public void setAssemblies(List assemblies) { + public SpeciesConfiguration setAssemblies(List assemblies) { this.assemblies = assemblies; + return this; } public List getData() { return data; } - public void setData(List data) { + public SpeciesConfiguration setData(List data) { this.data = data; + return this; } public static class Assembly { @@ -103,126 +101,51 @@ public static class Assembly { private String ensemblVersion; private String ensemblCollection; // Only for bacteria - public String getName() { - return name; + public Assembly() { } - public void setName(String name) { + public Assembly(String ensemblCollection, String ensemblVersion, String name) { + this.ensemblCollection = ensemblCollection; + this.ensemblVersion = ensemblVersion; this.name = name; } - public String getEnsemblVersion() { - return ensemblVersion; - } - - public void setEnsemblVersion(String ensemblVersion) { - this.ensemblVersion = ensemblVersion; + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("Assembly{"); + sb.append("ensemblCollection='").append(ensemblCollection).append('\''); + sb.append(", name='").append(name).append('\''); + sb.append(", ensemblVersion='").append(ensemblVersion).append('\''); + sb.append('}'); + return sb.toString(); } public String getEnsemblCollection() { return ensemblCollection; } - public void setEnsemblCollection(String ensemblCollection) { + public Assembly setEnsemblCollection(String ensemblCollection) { this.ensemblCollection = ensemblCollection; - } - } - - public List getShards() { - return shards; - } - - public 
SpeciesConfiguration setShards(List shards) { - this.shards = shards; - return this; - } - - public static class ShardConfig { - private String collection; - private List key; - private String rangeKey; - private List zones; - - public String getCollection() { - return collection; - } - - public ShardConfig setCollection(String collection) { - this.collection = collection; - return this; - } - - public List getKey() { - return key; - } - - public ShardConfig setKey(List key) { - this.key = key; - return this; - } - - public String getRangeKey() { - return rangeKey; - } - - public ShardConfig setRangeKey(String rangeKey) { - this.rangeKey = rangeKey; return this; } - public List getZones() { - return zones; + public String getEnsemblVersion() { + return ensemblVersion; } - public ShardConfig setZones(List zones) { - this.zones = zones; + public Assembly setEnsemblVersion(String ensemblVersion) { + this.ensemblVersion = ensemblVersion; return this; } - } - - public static class Zone { - private String name; - private List shardRanges; public String getName() { return name; } - public Zone setName(String name) { + public Assembly setName(String name) { this.name = name; return this; } - - public List getShardRanges() { - return shardRanges; - } - - public Zone setShardRanges(List shardRanges) { - this.shardRanges = shardRanges; - return this; - } } - public static class ShardRange { - private String minimum; - private String maximum; - - public String getMinimum() { - return minimum; - } - - public ShardRange setMinimum(String minimum) { - this.minimum = minimum; - return this; - } - - public String getMaximum() { - return maximum; - } - - public ShardRange setMaximum(String maximum) { - this.maximum = maximum; - return this; - } - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java index a3b54942d5..422a52b0d4 100644 --- 
a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java @@ -22,7 +22,7 @@ public CellBaseException(String msg) { super(msg); } - public CellBaseException(String msg, Exception e) { + public CellBaseException(String msg, Throwable e) { super(msg, e); } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java index 5674839aa8..47a694c5d8 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java @@ -21,14 +21,9 @@ public class DataRelease { private int release; private String date; - /** - * @deprecated it is maintained to back-compatibility with previous CellBase versions to v5.5 - */ - @Deprecated - private boolean active; private List activeByDefaultIn; private Map collections; - private List sources; + private List sources; public DataRelease() { this.activeByDefaultIn = Collections.emptyList(); @@ -37,7 +32,7 @@ public DataRelease() { } public DataRelease(int release, String date, List activeByDefaultIn, Map collections, - List sources) { + List sources) { this.release = release; this.date = date; this.activeByDefaultIn = activeByDefaultIn; @@ -75,15 +70,6 @@ public DataRelease setDate(String date) { return this; } - public boolean isActive() { - return active; - } - - public DataRelease setActive(boolean active) { - this.active = active; - return this; - } - public List getActiveByDefaultIn() { return activeByDefaultIn; } @@ -102,11 +88,11 @@ public DataRelease setCollections(Map collections) { return this; } - public List getSources() { + public List getSources() { return sources; } - public DataRelease setSources(List sources) { + public DataRelease setSources(List sources) { this.sources = sources; return this; } 
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java deleted file mode 100644 index 3a42de9374..0000000000 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.core.models; - -import java.util.List; -import java.util.Objects; - -public class DataReleaseSource { - private String name; - private String version; - private String data; - private String date; - private List url; - - public DataReleaseSource() { - } - - public DataReleaseSource(String name, String version, String data, String date, List url) { - this.name = name; - this.version = version; - this.data = data; - this.date = date; - this.url = url; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("DataReleaseSource{"); - sb.append("name='").append(name).append('\''); - sb.append(", version='").append(version).append('\''); - sb.append(", data='").append(data).append('\''); - sb.append(", date='").append(date).append('\''); - sb.append(", url=").append(url); - sb.append('}'); - return sb.toString(); - } - - public String getName() { - return name; - } - - public DataReleaseSource setName(String name) { - this.name = name; - return this; - } - - 
public String getVersion() { - return version; - } - - public DataReleaseSource setVersion(String version) { - this.version = version; - return this; - } - - public String getData() { - return data; - } - - public DataReleaseSource setData(String data) { - this.data = data; - return this; - } - - public String getDate() { - return date; - } - - public DataReleaseSource setDate(String date) { - this.date = date; - return this; - } - - public List getUrl() { - return url; - } - - public DataReleaseSource setUrl(List url) { - this.url = url; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - DataReleaseSource that = (DataReleaseSource) o; - return Objects.equals(name, that.name) - && Objects.equals(version, that.version) - && Objects.equals(data, that.data) - && Objects.equals(date, that.date) - && Objects.equals(url, that.url); - } - - @Override - public int hashCode() { - return Objects.hash(name, version, data, date, url); - } -} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java new file mode 100644 index 0000000000..acc134cb63 --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java @@ -0,0 +1,110 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.core.models; + +import java.util.ArrayList; +import java.util.List; + +public class DataSource { + + private String id; + private String name; + private String category; + private String version; + private String downloadDate; + private List urls; + + public DataSource() { + this.urls = new ArrayList<>(); + } + + public DataSource(String id, String name, String category, String version, String downloadDate, List urls) { + this.id = id; + this.name = name; + this.category = category; + this.version = version; + this.downloadDate = downloadDate; + this.urls = urls; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("DataSource{"); + sb.append("id='").append(id).append('\''); + sb.append(", name='").append(name).append('\''); + sb.append(", category='").append(category).append('\''); + sb.append(", version='").append(version).append('\''); + sb.append(", downloadDate='").append(downloadDate).append('\''); + sb.append(", urls=").append(urls); + sb.append('}'); + return sb.toString(); + } + + public String getId() { + return id; + } + + public DataSource setId(String id) { + this.id = id; + return this; + } + + public String getName() { + return name; + } + + public DataSource setName(String name) { + this.name = name; + return this; + } + + public String getCategory() { + return category; + } + + public DataSource setCategory(String category) { + this.category = category; + return this; + } + + public String getVersion() { + return version; + } + + public DataSource setVersion(String version) { + this.version = version; + return this; + } + + public String getDownloadDate() { + return downloadDate; + } + + public DataSource setDownloadDate(String downloadDate) { + this.downloadDate = downloadDate; + return this; + } + + public List getUrls() { + return urls; + } + + public DataSource 
setUrls(List urls) { + this.urls = urls; + return this; + } +} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java new file mode 100644 index 0000000000..12954e950f --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/DatabaseNameUtils.java @@ -0,0 +1,72 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.core.utils; + +import org.apache.commons.lang3.StringUtils; + +import java.security.InvalidParameterException; +import java.util.Locale; + + +public final class DatabaseNameUtils { + + public static final String DBNAME_PREFIX = "cellbase"; + public static final String DBNAME_SEPARATOR = "_"; + + private DatabaseNameUtils() { + throw new IllegalStateException("Utility class"); + } + + public static String getDatabaseName(String species, String assembly, String version) { + if (StringUtils.isEmpty(species) || StringUtils.isEmpty(assembly)) { + throw new InvalidParameterException("Both species and assembly are required"); + } + + // Remove special characters + String dbnameAssembly = cleanAssembly(assembly); + + // Process version from the configuration file, in order to suffix the database name + // - Production environment, e.g.: if version is "v5", the suffix added wil be "_v5" + // - Test environment, e.g.: if version is "v5.6" or "v5.6.0-SNAPSHOT", the suffix added will be "_v5_6" + String auxVersion = version.replace(".", DBNAME_SEPARATOR).replace("-", DBNAME_SEPARATOR); + String[] split = auxVersion.split(DBNAME_SEPARATOR); + String dbName = DBNAME_PREFIX + DBNAME_SEPARATOR + species.toLowerCase() + DBNAME_SEPARATOR + dbnameAssembly.toLowerCase() + + DBNAME_SEPARATOR + split[0]; + if (split.length > 1) { + dbName += (DBNAME_SEPARATOR + split[1]); + } + return dbName; + } + + public static String cleanAssembly(String assembly) { + if (StringUtils.isEmpty(assembly)) { + throw new InvalidParameterException("Assembly is empty"); + } + + return assembly.replace("\\.", "") + .replace("-", "") + .replace("_", "").toLowerCase(Locale.ROOT); + } + + public static String getSpeciesFromDatabaseName(String databaseName) { + if (StringUtils.isEmpty(databaseName)) { + throw new InvalidParameterException("Database name is empty"); + } + + return databaseName.split(DBNAME_SEPARATOR)[1].toLowerCase(Locale.ROOT); + } +} diff --git 
a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java index c928f783e4..39c0b7e0f3 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/utils/SpeciesUtils.java @@ -16,12 +16,15 @@ package org.opencb.cellbase.core.utils; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.core.common.Species; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.config.SpeciesProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import java.util.ArrayList; import java.util.List; @@ -39,7 +42,7 @@ public class SpeciesUtils { */ public static Species getSpecies(CellBaseConfiguration configuration, String speciesStr, String assemblyStr) throws CellBaseException { Species species = null; - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (speciesStr.equalsIgnoreCase(sp.getScientificName()) || speciesStr.equalsIgnoreCase(sp.getCommonName()) || speciesStr.equalsIgnoreCase(sp.getId())) { SpeciesConfiguration.Assembly assembly; @@ -82,7 +85,7 @@ public static boolean validateSpeciesAndAssembly(CellBaseConfiguration configura return false; } - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { return getAssembly(sp, assembly) != null; @@ -96,8 +99,9 @@ public static boolean validateSpecies(CellBaseConfiguration configuration, Strin return false; } - for (SpeciesConfiguration 
sp : configuration.getAllSpecies()) { - if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { + if (species.equalsIgnoreCase(sp.getScientificName()) + || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { return true; } @@ -108,7 +112,7 @@ public static boolean validateSpecies(CellBaseConfiguration configuration, Strin public static SpeciesConfiguration getSpeciesConfiguration(CellBaseConfiguration configuration, String species) { SpeciesConfiguration speciesConfiguration = null; - for (SpeciesConfiguration sp : configuration.getAllSpecies()) { + for (SpeciesConfiguration sp : SpeciesUtils.getAllSpecies(configuration)) { if (species.equalsIgnoreCase(sp.getScientificName()) || species.equalsIgnoreCase(sp.getCommonName()) || species.equalsIgnoreCase(sp.getId())) { @@ -119,6 +123,11 @@ public static SpeciesConfiguration getSpeciesConfiguration(CellBaseConfiguration return speciesConfiguration; } + public static boolean hasData(CellBaseConfiguration configuration, String species, String data) { + SpeciesConfiguration speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); + return CollectionUtils.isNotEmpty(speciesConfiguration.getData()) && speciesConfiguration.getData().contains(data); + } + /** * Get the default assembly for species. Is naive and just gets the first one. Order not guaranteed, don't rely on this at all. 
* @@ -134,6 +143,34 @@ public static SpeciesConfiguration.Assembly getDefaultAssembly(SpeciesConfigurat return assemblies.get(0); } + public static List getAllSpecies(CellBaseConfiguration cellBaseConfiguration) { + List allSpecies = new ArrayList<>(); + SpeciesProperties species = cellBaseConfiguration.getSpecies(); + if (species.getVertebrates() != null && !species.getVertebrates().isEmpty()) { + allSpecies.addAll(species.getVertebrates()); + } + if (species.getMetazoa() != null && !species.getMetazoa().isEmpty()) { + allSpecies.addAll(species.getMetazoa()); + } + if (species.getFungi() != null && !species.getFungi().isEmpty()) { + allSpecies.addAll(species.getFungi()); + } + if (species.getProtist() != null && !species.getProtist().isEmpty()) { + allSpecies.addAll(species.getProtist()); + } + if (species.getPlants() != null && !species.getPlants().isEmpty()) { + allSpecies.addAll(species.getPlants()); + } + if (species.getVirus() != null && !species.getVirus().isEmpty()) { + allSpecies.addAll(species.getVirus()); + } + if (species.getBacteria() != null && !species.getBacteria().isEmpty()) { + allSpecies.addAll(species.getBacteria()); + } + + return allSpecies; + } + /** * Get the default assembly for species. Is naive and just gets the first one. Order not guaranteed, don't rely on this at all. 
* diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 409c66ba1e..deff4e0881 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -16,11 +16,6 @@ databases: host: "${CELLBASE.DB.MONGODB.HOST}" user: "${CELLBASE.DB.USER}" password: "${CELLBASE.DB.PASSWORD}" - shards: - - id: "${CELLBASE.DB.MONGODB.REPLICASET.0.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.0}" - - id: "${CELLBASE.DB.MONGODB.REPLICASET.1.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.1}" options: authenticationDatabase: "${CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" authenticationMechanism: "${CELLBASE.DB.MONGODB.AUTHENTICATION_MECHANISM}" @@ -28,23 +23,15 @@ databases: replicaSet: "${CELLBASE.DB.MONGODB.REPLICASET}" connectionsPerHost: 20 sslEnabled: false -# sslInvalidCertificatesAllowed: true -# sslInvalidHostnameAllowed: true + # sslInvalidCertificatesAllowed: true + # sslInvalidHostnameAllowed: true enableSharding: false - neo4j: - hsapiens: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" - mmusculus: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" server: rest: port: "${CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -52,7 +39,23 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: https://ftp.ensembl.org/pub/ + files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + 
PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + DESCRIPTION: "script:gene_extra_info.pl@description.txt" + XREFS: "script:gene_extra_info.pl@xrefs.txt" + CANONICAL: "script:ensembl_canonical.pl@ensembl_canonical.txt" + GENOME_INFO: "script:genome_info.pl@genome_info.json" + MMUSCULUS_VARIATION: "release-put_release_here/variation/vcf/put_species_here/put_species_here.vcf.gz" + MMUSCULUS_STRUCTURAL_VARIATIONS: "release-put_release_here/variation/vcf/put_species_here/put_species_here_structural_variations.vcf.gz" + ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -61,237 +64,342 @@ download: libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub - hgnc: - host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: 2023-11-01 - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" refSeq: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - refSeqFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - refSeqProteinFasta: - host: 
https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - refSeqCdna: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + host: https://ftp.ncbi.nih.gov/refseq/ + version: "2023-10-11" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + MMUSCULUS_GENOMIC_GTF: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.gtf.gz + MMUSCULUS_GENOMIC_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.fna.gz + MMUSCULUS_PROTEIN_FAA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_protein.faa.gz + MMUSCULUS_RNA_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_rna.fna.gz + RNORVEGICUS_GENOMIC_GTF: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.gtf.gz + RNORVEGICUS_GENOMIC_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.fna.gz + RNORVEGICUS_PROTEIN_FAA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_protein.faa.gz + RNORVEGICUS_RNA_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_rna.fna.gz + BTAURUS_GENOMIC_GTF: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.gtf.gz + BTAURUS_GENOMIC_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz + BTAURUS_PROTEIN_FAA: 
B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_protein.faa.gz + BTAURUS_RNA_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_rna.fna.gz maneSelect: -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz - version: "1.1" + host: https://ftp.ncbi.nlm.nih.gov/refseq/ + version: "1.2" + files: + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz lrg: - host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + host: http://ftp.ebi.ac.uk/ version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2024-04-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: files/hotspots_v2.xls + dgidb: + host: https://dgidb.org/ + version: "DGIdb v.5.0.7 (07/06/2024)" + files: + DGIDB: data/latest/interactions.tsv geneUniprotXref: host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ - version: "2023-11-08" + version: "2024-03-27" + files: + UNIPROT_XREF: HUMAN_9606_idmapping_selected.tab.gz + MMUSCULUS_UNIPROT_XREF: MOUSE_10090_idmapping_selected.tab.gz + RNORVEGICUS_UNIPROT_XREF: RAT_10116_idmapping_selected.tab.gz + DRERIO_UNIPROT_XREF: DANRE_7955_idmapping_selected.tab.gz + DMELOANOGASTER_UNIPROT_XREF: DROME_7227_idmapping_selected.tab.gz + SCEREVISIAE_UNIPROT_XREF: YEAST_559292_idmapping_selected.tab.gz + CELEGANS_UNIPROT_XREF: CAEEL_6239_idmapping_selected.tab.gz geneExpressionAtlas: - host: 
ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ + version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations/ + version: "2024-04-26" + files: + HPO: "manual@phenotype_to_genes.txt" + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz + goAnnotation: + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz + MMUSCULUS_GO_ANNOTATION: gene-associations/mgi.gaf.gz + cancerGeneCensus: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/census/ + version: "v99" + files: + CANCER_GENE_CENSUS: "manual@cancer-gene-census.tsv" + pgsCatalog: + host: https://www.pgscatalog.org/ + version: "Dec. 
15, 2023" + files: + PGS_CATALOG: https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv + + ## Regulation mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + host: https://www.mirbase.org/ + version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" + files: + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx + RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx ## Protein Data uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" - uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + host: https://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" - interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + host: https://ftp.ebi.ac.uk/ + version: "2024-03-27" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" + host: https://ftp.ebi.ac.uk/ + version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt ## Conservation Scores - conservation: - host: https://hgdownload.cse.ucsc.edu/goldenPath/ + phastCons: + ## The CellBase 
downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHASTCONS: goldenPath/hg38/phastCons470way/hg38.470way.phastCons/ + MMUSCULUS_PHASTCONS: goldenPath/mm39/phastCons35way/mm39.35way.phastCons/ + phylop: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHYLOP: goldenPath/hg38/phyloP470way/hg38.470way.phyloP/ + MMUSCULUS_PHYLOP: goldenPath/mm39/phyloP35way/mm39.35way.phyloP/ gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ version: "2023-05-17" + files: + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + MMUSCULUS_GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm39.bw + + ## Clinical Variant clinvar: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz - version: "2023-12-01" - clinvarVariation: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: 
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz - clinvarSummary: - host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" - clinvarVariationAllele: - host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" - clinvarEfoTerms: - host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + host: https://ftp.ncbi.nlm.nih.gov/ + version: "2024-02" + files: + FULL_RELEASE: pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz + SUMMARY: pub/clinvar/tab_delimited/variant_summary.txt.gz + ALLELE: pub/clinvar/tab_delimited/variation_allele.txt.gz + EFO_TERMS: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cosmic: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/cosmic/ + version: "v99" + files: + COSMIC: CosmicMutantExport.tsv.gz + hgmd: + ## To be downloaded manually + host: https://www.hgmd.cf.ac.uk/ + version: "2020-03" + files: + HGMD: hgmd.vcf + gwasCatalog: + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' + host: https://ftp.ebi.ac.uk/ + version: "2024-04-22" + files: + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv + DBSNP: All.vcf.gz + dbSNP: host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz version: "156" - iarctp53: - host: http://p53.iarc.fr/ajax/Zipper.ashx - docm: - host: http://docm.info/api/ - docmVersion: - host: http://docm.info + + pharmGKB: + host: https://api.pharmgkb.org/v1/download/file/data/ + version: v1 + files: + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + 
CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip + dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + SIMPLE_REPEATS: goldenPath/hg38/database/simpleRepeat.txt.gz + MMUSCULUS_SIMPLE_REPEATS: goldenPath/mm39/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + WINDOW_MASKER: goldenPath/hg38/database/windowmaskerSdust.txt.gz + MMUSCULUS_WINDOW_MASKER: goldenPath/mm39/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath - gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv - version: "23-12-21" - hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations - host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt - disgenet: - host: https://www.disgenet.org/static/disgenet_ap1/files/downloads + host: http://hgdownload.cse.ucsc.edu/ files: - - all_gene_disease_associations.tsv.gz - - readme.txt - dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" + GENOMIC_SUPER_DUPS: goldenPath/hg38/database/genomicSuperDups.txt.gz + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/ + version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! 
-# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" - reactome: - host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" + host: https://krishna.gs.washington.edu/ + version: "1.7" + files: + CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + + ## OBO Ontologies + ## The version is retrieved from the OBO file hpoObo: - host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + files: + HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + files: + GO: go/go-basic.obo doidObo: - host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + files: + DOID: doid.obo mondoObo: - host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + host: http://purl.obolibrary.org/obo/ + files: + MONDO: mondo.obo + + ## Splice score + mmSplice: + host: http://kipoi.org/models/MMSplice/mtsplice/ + version: 2.0 + spliceAi: + host: https://basespace.illumina.com/s/otSPW8hnhaZR + version: 1.3.1 + + ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 files: - - pubmed22n[1..1114..4].xml.gz - pharmGKB: - host: https://www.pharmgkb.org/downloads - version: v1 - files: - - https://api.pharmgkb.org/v1/download/file/data/genes.zip - - https://api.pharmgkb.org/v1/download/file/data/chemicals.zip - - 
https://api.pharmgkb.org/v1/download/file/data/variants.zip - - https://api.pharmgkb.org/v1/download/file/data/guidelineAnnotations.json.zip - - https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip - - https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip - - https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip - - https://api.pharmgkb.org/v1/download/file/data/drugLabels.zip - - https://api.pharmgkb.org/v1/download/file/data/relationships.zip + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz + reactome: + host: http://www.reactome.org/download/current/biopax.zip + + species: vertebrates: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '111_38' name: GRCh38 - - ensemblVersion: '82_37' - name: GRCh37 +# - ensemblVersion: '82_37' +# name: GRCh37 data: - - clinical_variants + - genome - conservation + - repeats - gene - - genome - - missense_variation_functional_score - - ontology - protein - - refseq - regulation - - repeats - variation - variation_functional_score + - missense_variation_functional_score + - clinical_variant - splice_score - shards: - - collection: "variation" - key: - - chromosome - - start - - end - rangeKey: "chromosome" - zones: - - name: "zone0" - shardRanges: - - minimum: "1" - maximum: "10" - - minimum: "2" - maximum: "20" - - minimum: "3" - maximum: "9" - - name: "zone1" - shardRanges: - - minimum: "10" - maximum: "2" - - minimum: "20" - maximum: "3" - - minimum: "9" - maximum: "Z" + - ontology + - pubmed + - pharmacogenomics + - polygenic_score - id: mmusculus scientificName: Mus musculus assemblies: - - ensemblVersion: '82_38' - name: GRCm38 + - ensemblVersion: '111_39' + name: GRCm39 data: - genome - - genome_info + - conservation + - repeats - gene + - regulation + - protein - variation + - ontology + - id: rnorvegicus + scientificName: Rattus norvegicus + assemblies: + - ensemblVersion: '111_7.2' + name: mRatBN7.2 + data: + - genome + - 
gene - regulation - protein - - conservation +# - variation - id: drerio scientificName: Danio rerio assemblies: - - ensemblVersion: '82_10' - name: GRCz10 + - ensemblVersion: '111_11' + name: GRCz11 + data: + - genome + - gene + - regulation + - protein +# - variation + - id: btaurus + scientificName: Bos taurus + assemblies: + - ensemblVersion: '111_1.3' + name: ARS-UCD1.3 data: - genome - genome_info - gene +# - refseq + - regulation - variation - protein - - id: rnorvegicus - scientificName: Rattus norvegicus + - id: sscrofa + scientificName: Sus scrofa assemblies: - - ensemblVersion: '82_6' - name: Rnor_6.0 + - ensemblVersion: '111_11.1' + name: Sscrofa11.1 data: - genome - genome_info diff --git a/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java b/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java index 75bc8c2104..29546c02ad 100644 --- a/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java +++ b/cellbase-core/src/test/java/org/opencb/cellbase/core/config/CellBaseConfigurationTest.java @@ -41,7 +41,7 @@ public void defaultOutdir() { @Test public void vertebrates() { - Assertions.assertEquals(9, cellBaseConfiguration.getSpecies().getVertebrates().size()); + Assertions.assertEquals(11, cellBaseConfiguration.getSpecies().getVertebrates().size()); } @Test diff --git a/cellbase-core/src/test/resources/configuration.yml b/cellbase-core/src/test/resources/configuration.yml index 64ce73d692..8edc5d2581 100644 --- a/cellbase-core/src/test/resources/configuration.yml +++ b/cellbase-core/src/test/resources/configuration.yml @@ -1,4 +1,5 @@ -version: ${CELLBASE.VERSION} + +version: "${CELLBASE.VERSION}" apiVersion: "${project.version}" wiki: https://github.com/opencb/cellbase/wiki maintenanceFlagFile: "/tmp/maintenance" @@ -8,38 +9,29 @@ logDir: "./logs" # where to output the logs # can be "console" or "file", defaults to console logOutput: "file" +# 
For testing secretKey: "xPacig89igHSieEnveJEi4KCfdEslhmssC3vui1JJQGgDQ0y8v" databases: mongodb: - host: "${CELLBASE.DB.MONGODB.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" - shards: - - id: "${CELLBASE.DB.MONGODB.REPLICASET.0.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.0}" - - id: "${CELLBASE.DB.MONGODB.REPLICASET.1.NAME}" - nodes: "${CELLBASE.DB.MONGODB.REPLICASET.1}" + host: "${JUNIT.CELLBASE.DB.MONGODB.HOST}" + user: "${JUNIT.CELLBASE.DB.USER}" + password: "${JUNIT.CELLBASE.DB.PASSWORD}" options: - authenticationDatabase: "${CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" - readPreference: "${CELLBASE.DB.MONGODB.READPREFERENCE}" - replicaSet: "${CELLBASE.DB.MONGODB.REPLICASET}" + authenticationDatabase: "${JUNIT.CELLBASE.DB.MONGODB.AUTHENTICATIONDATABASE}" + authenticationMechanism: "${JUNIT.CELLBASE.DB.MONGODB.AUTHENTICATION_MECHANISM}" + readPreference: "${JUNIT.CELLBASE.DB.MONGODB.READPREFERENCE}" + replicaSet: "${JUNIT.CELLBASE.DB.MONGODB.REPLICASET}" connectionsPerHost: 20 sslEnabled: false - enableSharding: true - neo4j: - hsapiens: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" - mmusculus: - host: "${CELLBASE.DB.NEO4J.HOST}" - user: "${CELLBASE.DB.USER}" - password: "${CELLBASE.DB.PASSWORD}" + # sslInvalidCertificatesAllowed: true + # sslInvalidHostnameAllowed: true + enableSharding: false server: rest: - port: 9090 + port: 9090 #"${JUNIT.CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -47,7 +39,27 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: https://ftp.ensembl.org/pub/ + files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: 
"release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + # To be generated manually + DESCRIPTION: "manual@description.txt" + # To be generated manually + XREFS: "manual@xrefs.txt" + # To be downloaded manually + HAEM_ONC_TRANSCRIPTS: "manual@EGLH_HaemOnc_transcripts.txt" + # To be downloaded manually + TSO500: "manual@TSO500_transcripts.txt" + # To be downloaded manually + CANONICAL: "manual@ensembl_canonical.txt" + ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -56,165 +68,334 @@ download: libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub + refSeq: + host: https://ftp.ncbi.nih.gov/refseq/ + version: "2023-10-11" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + MMUSCULUS_GENOMIC_GTF: 
M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.gtf.gz + MMUSCULUS_GENOMIC_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_genomic.fna.gz + MMUSCULUS_PROTEIN_FAA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_protein.faa.gz + MMUSCULUS_RNA_FNA: M_musculus/annotation_releases/GCF_000001635.27-RS_2024_02/GCF_000001635.27_GRCm39_rna.fna.gz + RNORVEGICUS_GENOMIC_GTF: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.gtf.gz + RNORVEGICUS_GENOMIC_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_genomic.fna.gz + RNORVEGICUS_PROTEIN_FAA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_protein.faa.gz + RNORVEGICUS_RNA_FNA: R_norvegicus/annotation_releases/GCF_036323735.1-RS_2024_02/GCF_036323735.1_GRCr8_rna.fna.gz + BTAURUS_GENOMIC_GTF: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.gtf.gz + BTAURUS_GENOMIC_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz + BTAURUS_PROTEIN_FAA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_protein.faa.gz + BTAURUS_RNA_FNA: B_taurus/annotation_releases/GCF_002263795.3-RS_2023_09/GCF_002263795.3_ARS-UCD2.0_rna.fna.gz + maneSelect: + host: https://ftp.ncbi.nlm.nih.gov/refseq/ + version: "1.2" + files: + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz + lrg: + host: http://ftp.ebi.ac.uk/ + version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2024-04-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: files/hotspots_v2.xls + 
dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + version: "2024-03-27" + files: + UNIPROT_XREF: HUMAN_9606_idmapping_selected.tab.gz + MMUSCULUS_UNIPROT_XREF: MOUSE_10090_idmapping_selected.tab.gz + RNORVEGICUS_UNIPROT_XREF: RAT_10116_idmapping_selected.tab.gz + DRERIO_UNIPROT_XREF: DANRE_7955_idmapping_selected.tab.gz + DMELOANOGASTER_UNIPROT_XREF: DROME_7227_idmapping_selected.tab.gz + SCEREVISIAE_UNIPROT_XREF: YEAST_559292_idmapping_selected.tab.gz + CELEGANS_UNIPROT_XREF: CAEEL_6239_idmapping_selected.tab.gz geneExpressionAtlas: - host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ + version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations/ + version: "2024-04-26" + files: + HPO: "manual@phenotype_to_genes.txt" + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz + goAnnotation: + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz + MMUSCULUS_GO_ANNOTATION: gene-associations/mgi.gaf.gz + cancerGeneCensus: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/census/ + version: "v99" + files: + CANCER_GENE_CENSUS: "manual@cancer-gene-census.tsv" + + ## Regulation mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + 
host: https://www.mirbase.org/ + version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: http://mirtarbase.cuhk.edu.cn/cache/download/8.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ + version: "9.0" + files: + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx + RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx + + ## Protein Data uniprot: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - uniprotRelNotes: - host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - intact: - host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt + host: https://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz - interproRelNotes: - host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt - conservation: - host: ftp://hgdownload.cse.ucsc.edu/goldenPath/ + host: https://ftp.ebi.ac.uk/ + version: "2024-03-27" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz + intact: + host: https://ftp.ebi.ac.uk/ + version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt + + ## Conservation Scores + phastCons: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHASTCONS: goldenPath/hg38/phastCons470way/hg38.470way.phastCons/ + MMUSCULUS_PHASTCONS: goldenPath/mm39/phastCons35way/mm39.35way.phastCons/ + phylop: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHYLOP: goldenPath/hg38/phyloP470way/hg38.470way.phyloP/ + MMUSCULUS_PHYLOP: goldenPath/mm39/phyloP35way/mm39.35way.phyloP/ gerp: - host: ftp://ftp.ensembl.org/pub/current_compara/conservation_scores/103_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ + version: "2023-05-17" + files: + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + MMUSCULUS_GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.mus_musculus.GRCm39.bw + + ## Clinical Variant clinvar: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2020-02.xml.gz - clinvarSummary: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - clinvarVariationAllele: - host: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - clinvarEfoTerms: - host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv - iarctp53: - host: http://p53.iarc.fr/ajax/Zipper.ashx - docm: - host: http://docm.info/api/ - docmVersion: - host: http://docm.info + host: https://ftp.ncbi.nlm.nih.gov/ + version: "2024-02" + files: + FULL_RELEASE: pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz + SUMMARY: pub/clinvar/tab_delimited/variant_summary.txt.gz + ALLELE: pub/clinvar/tab_delimited/variation_allele.txt.gz + 
EFO_TERMS: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cosmic: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/cosmic/ + version: "v99" + files: + COSMIC: CosmicMutantExport.tsv.gz + hgmd: + ## To be downloaded manually + host: https://www.hgmd.cf.ac.uk/ + version: "2020-03" + files: + HGMD: hgmd.vcf + gwasCatalog: + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' + host: https://ftp.ebi.ac.uk/ + version: "2024-04-22" + files: + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv + DBSNP: All.vcf.gz + + dbSNP: + host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz + version: "156" + + pharmGKB: + host: https://api.pharmgkb.org/v1/download/file/data/ + version: v1 + files: + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip + dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + SIMPLE_REPEATS: goldenPath/hg38/database/simpleRepeat.txt.gz + MMUSCULUS_SIMPLE_REPEATS: goldenPath/mm39/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + WINDOW_MASKER: goldenPath/hg38/database/windowmaskerSdust.txt.gz + MMUSCULUS_WINDOW_MASKER: goldenPath/mm39/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath - gwasCatalog: - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv - hpo: - host: 
http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/util/annotation/phenotype_to_genes.txt - disgenet: - host: https://www.disgenet.org/static/disgenet_ap1/files/downloads + host: http://hgdownload.cse.ucsc.edu/ files: - - all_gene_disease_associations.tsv.gz - - readme.txt - dgidb: - host: http://dgidb.org/data/interactions.tsv + GENOMIC_SUPER_DUPS: goldenPath/hg38/database/genomicSuperDups.txt.gz + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/ + version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: - host: http://krishna.gs.washington.edu/download/CADD/v1.3/whole_genome_SNVs.tsv.gz - reactome: - host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: 2.1.1 + host: https://krishna.gs.washington.edu/ + version: "1.7" + files: + CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + + ## OBO Ontologies + ## The version is retrieved from the OBO file hpoObo: - host: http://purl.obolibrary.org/obo/hp.obo + host: http://purl.obolibrary.org/obo/ + files: + HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/go-basic.obo + host: http://purl.obolibrary.org/obo/ + files: + GO: go/go-basic.obo doidObo: - host: http://purl.obolibrary.org/obo/doid.obo - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - refSeq: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - refSeqFasta: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - refSeqProteinFasta: - host: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - refSeqCdna: - host: 
ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - revel: - host: https://rothsj06.u.hpc.mssm.edu/revel_grch38_all_chromosomes.csv.zip + host: http://purl.obolibrary.org/obo/ + files: + DOID: doid.obo + mondoObo: + host: http://purl.obolibrary.org/obo/ + files: + MONDO: mondo.obo + + ## Splice score + mmSplice: + host: http://kipoi.org/models/MMSplice/mtsplice/ + version: 2.0 + spliceAi: + host: https://basespace.illumina.com/s/otSPW8hnhaZR + version: 1.3.1 + + ## Others + pubmed: + host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 + files: + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz + reactome: + host: http://www.reactome.org/download/current/biopax.zip + + species: vertebrates: - id: hsapiens scientificName: Homo sapiens assemblies: - # - ensemblVersion: '82_37' - # name: GRCh37 - - ensemblVersion: '99_38' + - ensemblVersion: '111_38' name: GRCh38 + # - ensemblVersion: '82_37' + # name: GRCh37 data: - - clinical_variants + - genome - conservation + - repeats - gene - - genome + - regulation + - protein + - clinical_variant - missense_variation_functional_score - ontology - - protein - - refseq - - regulation - - repeats - variation_functional_score - splice_score - shards: - - collection: "variation" - key: - - chromosome - - start - - end - rangeKey: "chromosome" - zones: - - name: "zone0" - shardRanges: - - minimum: "1" - maximum: "10" - - minimum: "2" - maximum: "20" - - minimum: "3" - maximum: "9" - - name: "zone1" - shardRanges: - - minimum: "10" - maximum: "2" - - minimum: "20" - maximum: "3" - - minimum: "9" - maximum: "Z" + - pharmacogenomics - id: mmusculus scientificName: Mus musculus assemblies: - - ensemblVersion: '82_38' - name: GRCm38 + - ensemblVersion: '111_39' + name: GRCm39 data: - genome - - genome_info + - conservation + - repeats - gene + - regulation + - protein - variation + - ontology + - id: rnorvegicus + scientificName: Rattus norvegicus + assemblies: + - 
ensemblVersion: '111_7.2' + name: mRatBN7.2 + data: + - genome + - gene - regulation - protein - - conservation + # - variation - id: drerio scientificName: Danio rerio assemblies: - - ensemblVersion: '82_10' - name: GRCz10 + - ensemblVersion: '111_11' + name: GRCz11 + data: + - genome + - gene + - regulation + - protein + # - variation + - id: btaurus + scientificName: Bos taurus + assemblies: + - ensemblVersion: '111_1.3' + name: ARS-UCD1.3 data: - genome - genome_info - gene + # - refseq + - regulation - variation - protein - - id: rnorvegicus - scientificName: Rattus norvegicus + - id: sscrofa + scientificName: Sus scrofa assemblies: - - ensemblVersion: '82_6' - name: Rnor_6.0 + - ensemblVersion: '111_11.1' + name: Sscrofa11.1 data: - genome - genome_info diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml index 9b4dd7fbe0..cff0e8f8e0 100644 --- a/cellbase-lib/pom.xml +++ b/cellbase-lib/pom.xml @@ -185,6 +185,11 @@ junit-platform-engine test + + org.apache.commons + commons-csv + 1.0 + diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 6330cb71a3..9ad5ac3953 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -16,9 +16,12 @@ package org.opencb.cellbase.lib; -import org.apache.commons.lang.StringUtils; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.config.Configurator; +import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,148 
+29,521 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; /** * Created by fjlopez on 03/06/16. */ -public class EtlCommons { +public final class EtlCommons { - public static final String HOMO_SAPIENS_NAME ="Homo sapiens"; + // Commons + public static final String HOMO_SAPIENS = "Homo sapiens"; + public static final String HSAPIENS = "hsapiens"; + public static final String MUS_MUSCULUS = "Mus musculus"; + public static final String RATTUS_NORVEGICUS = "Rattus norvegicus"; + public static final String BOS_TAURUS = "Bos taurus"; + public static final String DANIO_RERIO = "Danio rerio"; + public static final String GRCH38_NAME = "GRCh38"; + public static final String GRCH37_NAME = "GRCh37"; + public static final String HG38_NAME = "hg38"; + public static final String HG19_NAME = "hg19"; + + public static final String MANUAL_PREFIX = "manual@"; + public static final String SCRIPT_PREFIX = "script:"; + + public static final String SUFFIX_VERSION_FILENAME = "Version.json"; + + public static final String XLSX_EXTENSION = ".xlsx"; + public static final String CSV_EXTENSION = ".csv"; + public static final String TBI_EXTENSION = ".tbi"; + public static final String FAI_EXTENSION = ".fai"; + public static final String GZ_EXTENSION = ".gz"; + public static final String TXT_GZ_EXTENSION = ".txt.gz"; + public static final String TAR_GZ_EXTENSION = ".tar.gz"; + public static final String JSON_GZ_EXTENSION = ".json.gz"; + + public static final String OK_MSG = "Ok."; + public static final String DONE_MSG = "Done."; + public static final String DATA_NOT_SUPPORTED_MSG = "Data '{}' not supported for species '{}'"; + + // Ensembl + public static final String ENSEMBL_DATA = "ensembl"; + public static final String 
PUT_RELEASE_HERE_MARK = "put_release_here"; + public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; + public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; + public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; + // Must match the configuration file + public static final String ENSEMBL_PRIMARY_FA_FILE_ID = "PRIMARY_FA"; + public static final String ENSEMBL_GTF_FILE_ID = "GTF"; + public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; + public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; + public static final String ENSEMBL_REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String ENSEMBL_MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; + public static final String ENSEMBL_DESCRIPTION_FILE_ID = "DESCRIPTION"; + public static final String ENSEMBL_XREFS_FILE_ID = "XREFS"; + public static final String ENSEMBL_CANONICAL_FILE_ID = "CANONICAL"; + public static final String GENOME_INFO_FILE_ID = "GENOME_INFO"; + public static final String VARIATION_FILE_ID = "VARIATION"; + public static final String STRUCTURAL_VARIATIONS_FILE_ID = "STRUCTURAL_VARIATIONS"; + + // Genome public static final String GENOME_DATA = "genome"; + public static final String GENOME_SEQUENCE_COLLECTION_NAME = "genome_sequence"; + public static final String GENOME_INFO_DATA = "genome_info"; + + // Gene public static final String GENE_DATA = "gene"; + public static final String GENE_ANNOTATION_DATA = "gene_annotation"; + public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; + + // RefSeq public static final String REFSEQ_DATA = "refseq"; - public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; + // Must match the configuration file + public static final String 
REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; + public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; + public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; + public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; + + // Gene annotation + public static final String ENSEMBL_CANONICAL_DATA = "ensembl_canonical"; + public static final String GENE_EXTRA_INFO_DATA = "gene_extra_info"; + // - MANE Select + public static final String MANE_SELECT_DATA = "MANE Select"; + // Must match the configuration file + public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; + // - LRG + public static final String LRG_DATA = "lrg"; + // Must match the configuration file + public static final String LRG_FILE_ID = "LRG"; + // - HGNC + public static final String HGNC_DATA = "hgnc"; + // Must match the configuration file + public static final String HGNC_FILE_ID = "HGNC"; + // - Cancer HotSpot + public static final String CANCER_HOTSPOT_DATA = "cancer_hotspot"; + // Must match the configuration file + public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; + // - DGID (drug) + public static final String DGIDB_DATA = "dgidb"; + // Must match the configuration file + public static final String DGIDB_FILE_ID = "DGIDB"; + // - UniProt Xref + public static final String UNIPROT_XREF_DATA = "uniprot_xref"; + // Must match the configuration file + public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF"; + // - Gene Expression Atlas + public static final String GENE_EXPRESSION_ATLAS_DATA = "gene_expression_atlas"; + // Must match the configuration file + public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS"; + // - Gene Disease Annotation + public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; + // - HPO + public static final String HPO_DISEASE_DATA = "hpo_disease"; + // Must match the configuration file + public static final String HPO_FILE_ID = "HPO"; + // - 
DISGENET + public static final String DISGENET_DATA = "disgenet"; + // Must match the configuration file + public static final String DISGENET_FILE_ID = "DISGENET"; + // - gnomAD Constraints + public static final String GNOMAD_CONSTRAINTS_DATA = "gnomad_constraints"; + // Must match the configuration file + public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS"; + // - GO Annotation + public static final String GO_ANNOTATION_DATA = "go_annotation"; + // Must match the configuration file + public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; + // - Cancer Gene Census + public static final String CANCER_GENE_CENSUS_DATA = "cancer_gene_census"; + // Must match the configuration file + public static final String CANCER_GENE_CENSUS_FILE_ID = "CANCER_GENE_CENSUS"; + + // Variation public static final String VARIATION_DATA = "variation"; - public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; - public static final String REGULATION_DATA = "regulation"; - public static final String PROTEIN_DATA = "protein"; - public static final String CONSERVATION_DATA = "conservation"; - public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; - public static final String SPLICE_SCORE_DATA = "splice_score"; + public static final String DBSNP_DATA = "dbsnp"; + public static final String SNP_DATA = "snp"; + + // PGS (polygenic scores) + public static final String PGS_DATA = "polygenic_score"; + public static final String PGS_COMMON_COLLECTION = "common_polygenic_score"; + public static final String PGS_VARIANT_COLLECTION = "variant_polygenic_score"; + // PGS Catalog + public static final String PGS_CATALOG_DATA = "pgs_catalog"; + // Must match the configuration file + public static final String PGS_CATALOG_FILE_ID = "PGS_CATALOG"; + // Pharmacogenomics public static final String PHARMACOGENOMICS_DATA = 
"pharmacogenomics"; - public static final String PHARMGKB_NAME = "PharmGKB"; + // PharmGKB public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; - - public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; - public static final String CLINVAR_VERSION = "2022.11"; - public static final String CLINVAR_DATE = "2022-11"; - public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; - public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; - public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; - public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; - public static final String IARCTP53_FILE = "IARC-TP53.zip"; - public static final String GWAS_FILE = "gwas_catalog.tsv"; - public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; - @Deprecated - public static final String DBSNP_FILE = "GCF_000001405.40.gz"; - public static final String DBSNP_NAME = "dbSNP"; - public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json"; - public static final String SNP_COLLECTION_NAME = "snp"; + // Must match the configuration file + public static final String PHARMGKB_GENES_FILE_ID = "GENES"; + public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; + public static final String PHARMGKB_VARIANTS_FILE_ID = "VARIANTS"; + public static final String PHARMGKB_GUIDELINE_ANNOTATIONS_FILE_ID = "GUIDELINE_ANNOTATIONS"; + public static final String PHARMGKB_VARIANT_ANNOTATIONS_FILE_ID = "VARIANT_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_ANNOTATIONS_FILE_ID = "CLINICAL_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_VARIANTS_FILE_ID = "CLINICAL_VARIANTS"; + public static final String PHARMGKB_DRUG_LABELS_FILE_ID = "DRUG_LABELS"; + public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS"; - public static final 
String STRUCTURAL_VARIANTS_DATA = "svs"; - public static final String REPEATS_DATA = "repeats"; - public static final String OBO_DATA = "ontology"; - public static final String HPO_FILE = "hp.obo"; - public static final String GO_FILE = "go-basic.obo"; - public static final String DOID_FILE = "doid.obo"; - public static final String MONDO_FILE = "mondo.obo"; - public static final String PFM_DATA = "regulatory_pfm"; + // Missense variantion functional score + public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + // Revel + public static final String REVEL_DATA = "revel"; + // Must match the configuration file + public static final String REVEL_FILE_ID = "REVEL"; - // Build specific data options - public static final String GENOME_INFO_DATA = "genome_info"; - public static final String DISGENET_DATA = "disgenet"; - public static final String HPO_DATA = "hpo"; - public static final String CADD_DATA = "cadd"; - public static final String PPI_DATA = "ppi"; - public static final String DRUG_DATA = "drug"; + // Clinical variants data + public static final String CLINICAL_VARIANT_DATA = "clinical_variants"; + public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariants"; + // ClinVar public static final String CLINVAR_DATA = "clinvar"; - public static final String DOCM_DATA = "docm"; + public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; + // Must match the configuration file + public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE"; + public static final String CLINVAR_SUMMARY_FILE_ID = "SUMMARY"; + public static final String CLINVAR_ALLELE_FILE_ID = "ALLELE"; + public static final String CLINVAR_EFO_TERMS_FILE_ID = "EFO_TERMS"; + // COSMIC public static final String COSMIC_DATA = "cosmic"; - public static final String GWAS_DATA = "gwas"; - public static final String IARCTP53_GERMLINE_FILE = "germlineMutationDataIARC TP53 Database, R20.txt"; - public static final String 
IARCTP53_GERMLINE_REFERENCES_FILE = "germlineMutationReferenceIARC TP53 Database, R20.txt"; - public static final String IARCTP53_SOMATIC_FILE = "somaticMutationDataIARC TP53 Database, R20.txt"; - public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt"; + // Must match the configuration file + public static final String COSMIC_FILE_ID = "COSMIC"; + // HGMD public static final String HGMD_DATA = "hgmd"; + // Must match the configuration file + public static final String HGMD_FILE_ID = "HGMD"; + // GWAS + public static final String GWAS_DATA = "gwas"; + // Must match the configuration file + public static final String GWAS_FILE_ID = "GWAS"; + public static final String GWAS_DBSNP_FILE_ID = "DBSNP"; - public static final String PUBMED_DATA = "pubmed"; + // Repeats + public static final String REPEATS_DATA = "repeats"; + // Simple repeats + public static final String TRF_DATA = "trf"; + // Must match the configuration file + public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; + // Genomic super duplications + public static final String GSD_DATA = "gsd"; + // Must match the configuration file + public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; + // Window masker + public static final String WM_DATA = "wm"; + // Must match the configuration file + public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; + + // Ontology + public static final String ONTOLOGY_DATA = "ontology"; + // HPO + public static final String HPO_OBO_DATA = "hpo"; + // Must match the configuration file + public static final String HPO_OBO_FILE_ID = "HPO"; + // GO + public static final String GO_OBO_DATA = "go"; + // Must match the configuration file + public static final String GO_OBO_FILE_ID = "GO"; + // DOID + public static final String DOID_OBO_DATA = "doid"; + // Must match the configuration file + public static final String DOID_OBO_FILE_ID = "DOID"; + // MONDO + public static 
final String MONDO_OBO_DATA = "mondo"; + // Must match the configuration file + public static final String MONDO_OBO_FILE_ID = "MONDO"; + + + public static final String PFM_DATA = "regulatory_pfm"; + + // Variation functional score + public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; + // CADD scores + public static final String CADD_DATA = "cadd"; + public static final String CADD_RAW_DATA = "cadd_raw"; + public static final String CADD_SCALED_DATA = "cadd_scaled"; + // Must match the configuration file + public static final String CADD_FILE_ID = "CADD"; + + // Regulation + public static final String REGULATION_DATA = "regulation"; + // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) + public static final String REGULATORY_BUILD_DATA = "regulatory_build"; + // Motif features (see Ensembl files) + public static final String MOTIF_FEATURES_DATA = "motif_features"; + // miRBase + public static final String MIRBASE_DATA = "mirbase"; + // Must match the configuration file + public static final String MIRBASE_FILE_ID = "MIRBASE"; + // miRTarBase + public static final String MIRTARBASE_DATA = "mirtarbase"; + // Must match the configuration file + public static final String MIRTARBASE_FILE_ID = "MIRTARBASE"; // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; - // Path and file names - public static final String GERP_SUBDIRECTORY = "gerp"; - public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; - public static final String MMSPLICE_VERSION_FILENAME = "mmspliceVersion.json"; - public static final String SPLICEAI_SUBDIRECTORY = "spliceai"; - public static final String SPLICEAI_VERSION_FILENAME = "spliceaiVersion.json"; + // Protein + public static final String PROTEIN_DATA = "protein"; + // UniProt + public static final String UNIPROT_DATA = "uniprot"; + public static final String 
UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks"; + // Must match the configuration file + public static final String UNIPROT_FILE_ID = "UNIPROT"; + // InterPro + public static final String INTERPRO_DATA = "interpro"; + // Must match the configuration file + public static final String INTERPRO_FILE_ID = "INTERPRO"; + // IntAct + public static final String INTACT_DATA = "intact"; + // Must match the configuration file + public static final String INTACT_FILE_ID = "INTACT"; + + // Conservation scores + public static final String CONSERVATION_DATA = "conservation"; + // GERP + public static final String GERP_DATA = "gerp"; + // Must match the configuration file + public static final String GERP_FILE_ID = "GERP"; + // PHASTCONS + public static final String PHASTCONS_DATA = "phastCons"; + // Must match the configuration file + public static final String PHASTCONS_FILE_ID = "PHASTCONS"; + // PHYLOP + public static final String PHYLOP_DATA = "phylop"; + // Must match the configuration file + public static final String PHYLOP_FILE_ID = "PHYLOP"; - // binary bigwig file + // Splice scores + public static final String SPLICE_SCORE_DATA = "splice_score"; + // MMSplice + public static final String MMSPLICE_DATA = "mmsplice"; + // SpliceAI + public static final String SPLICEAI_DATA = "spliceai"; + + /** + * @deprecated (when refactoring downloaders, builders and loaders) + */ + @Deprecated public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw"; - // bigwig file manually transformed to bedGraph file - public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz"; public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz"; public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz"; - public static final String DOCM_FILE = "docm.json.gz"; public static final String DOCM_NAME = "DOCM"; - public static final String 
STRUCTURAL_VARIANTS_FOLDER = "structuralVariants"; - public static final String DGV_FILE = "dgv.txt"; - public static final String DGV_VERSION_FILE = "dgvVersion.json"; - public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants"; - public static final String TRF_FILE = "simpleRepeat.txt.gz"; - public static final String TRF_VERSION_FILE = "simpleRepeat.json"; - public static final String GSD_FILE = "genomicSuperDups.txt.gz"; - public static final String GSD_VERSION_FILE = "genomicSuperDups.json"; - public static final String WM_FILE = "windowMasker.txt.gz"; - public static final String WM_VERSION_FILE = "windowMasker.json"; - public static final String REPEATS_FOLDER = "genome"; - public static final String REPEATS_JSON = "repeats"; - public static final String OBO_JSON = "ontology"; - public static final String HPO_VERSION_FILE = "hpoVersion.json"; - public static final String GO_VERSION_FILE = "goVersion.json"; - public static final String DO_VERSION_FILE = "doVersion.json"; + public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME; + public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME; + public static final String DO_VERSION_FILE = "do" + SUFFIX_VERSION_FILENAME; + public static final String MONDO_VERSION_FILE = "mondo" + SUFFIX_VERSION_FILENAME; + public static final String HGMD_FILE = "hgmd.vcf"; - public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json"; - public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; - public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + // PubMed + public static final String PUBMED_DATA = "pubmed"; + // Must match the configuration file + public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX"; + + // Utilities maps + private static Map dataNamesMap = new HashMap<>(); + private static Map dataCategoriesMap = new HashMap<>(); + private static Map dataVersionFilenamesMap = new 
HashMap<>(); - public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) - throws IOException, InterruptedException { - // This small hack allow to configure the appropriate Logger level from the command line, this is done - // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created. -// org.apache.log4j.Logger rootLogger = LogManager.getRootLogger(); -// ConsoleAppender stderr = (ConsoleAppender) rootLogger.getAppender("stdout"); -// stderr.setThreshold(Level.toLevel("debug")); + private static final Logger LOGGER = LoggerFactory.getLogger(EtlCommons.class); - Configurator.setRootLevel(Level.INFO); + static { - Logger logger = LoggerFactory.getLogger("EtlCommons"); + // Populate data names map + dataNamesMap.put(ENSEMBL_DATA, "Ensembl"); + dataNamesMap.put(REFSEQ_DATA, "RefSeq"); + dataNamesMap.put(GENOME_DATA, "Genome"); + dataNamesMap.put(GENOME_INFO_DATA, "Genome Info"); + dataNamesMap.put(GENE_DATA, "Gene"); + dataNamesMap.put(ENSEMBL_CANONICAL_DATA, "Ensembl canonical"); + dataNamesMap.put(GENE_EXTRA_INFO_DATA, "Gene extra info"); + dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); + dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); + dataNamesMap.put(LRG_DATA, "LRG"); + dataNamesMap.put(HGNC_DATA, "HGNC Gene"); + dataNamesMap.put(CANCER_HOTSPOT_DATA, "Cancer HotSpot"); + dataNamesMap.put(DGIDB_DATA, "DGIdb"); + dataNamesMap.put(UNIPROT_XREF_DATA, "UniProt Xref"); + dataNamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "Gene Expression Atlas"); + dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); + dataNamesMap.put(HPO_DISEASE_DATA, "HPO Disease"); + dataNamesMap.put(DISGENET_DATA, "DisGeNet"); + dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraint"); + dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); + dataNamesMap.put(CANCER_GENE_CENSUS_DATA, "Cancer Gene Census"); + dataNamesMap.put(PROTEIN_DATA, "Protein"); + 
dataNamesMap.put(UNIPROT_DATA, "UniProt"); + dataNamesMap.put(INTERPRO_DATA, "InterPro"); + dataNamesMap.put(INTACT_DATA, "IntAct"); + dataNamesMap.put(CONSERVATION_DATA, "Conservation"); + dataNamesMap.put(GERP_DATA, "GERP++"); + dataNamesMap.put(PHASTCONS_DATA, "PhastCons"); + dataNamesMap.put(PHYLOP_DATA, "PhyloP"); + dataNamesMap.put(REPEATS_DATA, "Repeats"); + dataNamesMap.put(TRF_DATA, "Tandem Repeats Finder"); + dataNamesMap.put(WM_DATA, "Window Masker"); + dataNamesMap.put(GSD_DATA, "Genomic Super Duplications"); + dataNamesMap.put(REGULATION_DATA, "Regulation"); + dataNamesMap.put(REGULATORY_BUILD_DATA, "Regulatory Build"); + dataNamesMap.put(MOTIF_FEATURES_DATA, "Motif Features"); + dataNamesMap.put(MIRBASE_DATA, "miRBase"); + dataNamesMap.put(MIRTARBASE_DATA, "miRTarBase"); + dataNamesMap.put(ONTOLOGY_DATA, "Ontology"); + dataNamesMap.put(HPO_OBO_DATA, "HPO"); + dataNamesMap.put(GO_OBO_DATA, "GO"); + dataNamesMap.put(DOID_OBO_DATA, "DOID"); + dataNamesMap.put(MONDO_OBO_DATA, "Mondo"); + dataNamesMap.put(PUBMED_DATA, "PubMed"); + dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics"); + dataNamesMap.put(PHARMGKB_DATA, "PharmGKB"); + dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Score"); + dataNamesMap.put(CADD_DATA, "CADD"); + dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Score"); + dataNamesMap.put(REVEL_DATA, "Revel"); + dataNamesMap.put(CLINICAL_VARIANT_DATA, "Clinical Variant"); + dataNamesMap.put(CLINVAR_DATA, "ClinVar"); + dataNamesMap.put(COSMIC_DATA, "Cosmic"); + dataNamesMap.put(HGMD_DATA, "HGMD"); + dataNamesMap.put(GWAS_DATA, "GWAS Catalog"); + dataNamesMap.put(SPLICE_SCORE_DATA, "Splice Score"); + dataNamesMap.put(MMSPLICE_DATA, "MMSplice"); + dataNamesMap.put(SPLICEAI_DATA, "SpliceAI"); + dataNamesMap.put(VARIATION_DATA, "Variation"); + dataNamesMap.put(SNP_DATA, "SNP"); + dataNamesMap.put(DBSNP_DATA, "dbSNP"); + dataNamesMap.put(PGS_DATA, "Polygenic Score"); + 
dataNamesMap.put(PGS_CATALOG_DATA, "PGS Catalog"); - ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); + // Populate data categories map + dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); + dataCategoriesMap.put(REFSEQ_DATA, "Gene"); + dataCategoriesMap.put(GENOME_DATA, dataNamesMap.get(ENSEMBL_DATA)); + dataCategoriesMap.put(MANE_SELECT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(LRG_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HGNC_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(CANCER_HOTSPOT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DGIDB_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_XREF_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GENE_EXPRESSION_ATLAS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HPO_DISEASE_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DISGENET_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GNOMAD_CONSTRAINTS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GO_ANNOTATION_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(CANCER_GENE_CENSUS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTERPRO_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTACT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(GERP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHASTCONS_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHYLOP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(TRF_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(WM_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(GSD_DATA, dataNamesMap.get(REPEATS_DATA)); + 
dataCategoriesMap.put(REGULATORY_BUILD_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MOTIF_FEATURES_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRTARBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(HPO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(GO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(DOID_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(MONDO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(PUBMED_DATA, "Publication"); + dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA)); + dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); + dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); + dataCategoriesMap.put(CLINVAR_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(COSMIC_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(HGMD_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(MMSPLICE_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); + dataCategoriesMap.put(SPLICEAI_DATA, dataNamesMap.get(SPLICE_SCORE_DATA)); + dataCategoriesMap.put(VARIATION_DATA, dataNamesMap.get(VARIATION_DATA)); + dataCategoriesMap.put(SNP_DATA, dataNamesMap.get(VARIATION_DATA)); + dataCategoriesMap.put(DBSNP_DATA, dataNamesMap.get(VARIATION_DATA)); + dataCategoriesMap.put(PGS_CATALOG_DATA, dataNamesMap.get(PGS_DATA)); + + // Populate data version filenames Map + dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REFSEQ_DATA, "refSeqCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENOME_DATA, "genome" + SUFFIX_VERSION_FILENAME); + 
dataVersionFilenamesMap.put(MANE_SELECT_DATA, "maneSelect" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(LRG_DATA, "lrg" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGNC_DATA, "hgnc" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CANCER_HOTSPOT_DATA, "cancerHotSpot" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DGIDB_DATA, "dgidb" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_XREF_DATA, "uniProtXref" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_DISEASE_DATA, "hpoDisease" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DISGENET_DATA, "disGeNet" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomadConstraints" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_ANNOTATION_DATA, "goAnnotation" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CANCER_GENE_CENSUS_DATA, "cancerGeneCensus" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_DATA, "uniProt" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTERPRO_DATA, "interPro" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTACT_DATA, "intAct" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GERP_DATA, "gerp" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHASTCONS_DATA, "phastCons" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHYLOP_DATA, "phyloP" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(TRF_DATA, "simpleRepeat" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(WM_DATA, "windowMasker" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GSD_DATA, "genomicSuperDups" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REGULATORY_BUILD_DATA, "regulatoryBuild" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MOTIF_FEATURES_DATA, "motifFeatures" + 
SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRBASE_DATA, "mirBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRTARBASE_DATA, "mirTarBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_OBO_DATA, "hpoObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_OBO_DATA, "goObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DOID_OBO_DATA, "doidObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MONDO_OBO_DATA, "mondoObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGkb" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CLINVAR_DATA, "clinVar" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(COSMIC_DATA, "cosmic" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGMD_DATA, "hgmd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MMSPLICE_DATA, "mmSplice" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(SPLICEAI_DATA, "spliceAi" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(VARIATION_DATA, "variation" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DBSNP_DATA, "dbSnp" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PGS_CATALOG_DATA, "pgsCatalog" + SUFFIX_VERSION_FILENAME); + } - logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); + private EtlCommons() { + throw new IllegalStateException("Utility class"); + } + + public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, Path logFile) + throws IOException, InterruptedException, CellBaseException { + + ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, 
logFile); + + LOGGER.info("Executing command: {}", StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); // Check process output - boolean executedWithoutErrors = true; - int genomeInfoExitValue = process.exitValue(); - if (genomeInfoExitValue != 0) { - logger.warn("Error executing {}, error code: {}. More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); - executedWithoutErrors = false; - } - return executedWithoutErrors; +// if (process.exitValue() != 0) { +// String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue() +// + ". More info in log file: " + logFilePath; +// logger.error(msg); +// throw new CellBaseException(msg); +// } + + return true; } - private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { + private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, Path logFile) { List commandArgs = new ArrayList<>(); commandArgs.add(binPath); commandArgs.addAll(args); @@ -178,13 +554,41 @@ private static ProcessBuilder getProcessBuilder(File workingDirectory, String bi builder.directory(workingDirectory); } builder.redirectErrorStream(true); - if (logFilePath != null) { - builder.redirectOutput(ProcessBuilder.Redirect.appendTo(new File(logFilePath))); + if (logFile != null) { + builder.redirectOutput(ProcessBuilder.Redirect.appendTo(logFile.toFile())); } return builder; } + public static Path getFastaPath(Path gzFastaPath) throws CellBaseException { + // Sanity check + if (!Files.exists(gzFastaPath)) { + throw new CellBaseException("Gzipped FASTA file " + gzFastaPath + " does not exist"); + } + + // Check FASTA and unzip if necessary + Path fastaPath = gzFastaPath.getParent().resolve(gzFastaPath.getFileName().toString().replace(GZ_EXTENSION, "")); + if (!fastaPath.toFile().exists()) { + // Gunzip + LOGGER.info("Gunzip file {}", gzFastaPath); + 
try { + List params = Arrays.asList("--keep", gzFastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "gunzip", params, null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } + } + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; + } + public static boolean isMissing(String string) { return !((string != null) && !string.isEmpty() && !string.replace(" ", "") @@ -209,7 +613,184 @@ public static Long countFileLines(Path filePath) throws IOException { } return nLines; } + } + + public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species, + String assembly, String chromosome) throws CellBaseException { + if (!props.getUrl().getFiles().containsKey(fileId)) { + throw new CellBaseException(getMissingFileIdMessage(fileId)); + } + String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId); + // Change release, species, assembly, chromosome if necessary + if (StringUtils.isNotEmpty(ensemblRelease)) { + url = url.replace(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]); + } + if (StringUtils.isNotEmpty(species)) { + url = url.replace(PUT_SPECIES_HERE_MARK, species); + url = url.replace(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); + } + return url; + } + + public static String getUrl(DownloadProperties.URLProperties props, String fileId) throws 
CellBaseException { + return getUrl(props, fileId, null, null, null); + } + + public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome) + throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException(getMissingFileIdMessage(fileId)); + } + String url; + String filesValue = props.getFiles().get(fileId); + if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { + url = filesValue; + } else { + url = props.getHost() + filesValue; + } + if (StringUtils.isNotEmpty(species)) { + url = url.replace(PUT_SPECIES_HERE_MARK, species); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); + } + return url; + } + + public static String getFilename(String prefix, String chromosome) { + return prefix + "_" + chromosome; + } + + public static boolean isExecutableAvailable(String executable) throws IOException, InterruptedException { + ProcessBuilder processBuilder = new ProcessBuilder("which", executable); + Process process = processBuilder.start(); + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) { + String line; + StringBuilder output = new StringBuilder(); + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + } + + int exitCode = process.waitFor(); + + // if exitCode is 0 then the executable is installed at + output.toString().trim()), + // otherwise, it's not + return (exitCode == 0); + } + + public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException(getMissingFileIdMessage(fileId)); + } + return 
getFilenameFromUrl(props.getFiles().get(fileId)); + } + + public static String getFilenameFromUrl(String url) { + return Paths.get(url).getFileName().toString(); + } + + public static void checkDirectory(Path path, String name) throws CellBaseException { + if (path == null) { + throw new CellBaseException(name + " directory is null"); + } + if (!Files.exists(path)) { + throw new CellBaseException(name + " directory " + path + " does not exist"); + } + if (!Files.isDirectory(path)) { + throw new CellBaseException(name + " directory " + path + " is not a directory"); + } + } + + private static String getMissingFileIdMessage(String fileId) { + return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file"; + } + + public static String getDataName(String data) throws CellBaseException { + if (!dataNamesMap.containsKey(data)) { + throw new CellBaseException("Name not found for data '" + data + "'"); + } + return dataNamesMap.get(data); + } + + public static String getDataCategory(String data) throws CellBaseException { + if (!dataCategoriesMap.containsKey(data)) { + throw new CellBaseException("Category not found for data '" + data + "'"); + } + return dataCategoriesMap.get(data); + } + + public static String getDataVersionFilename(String data) throws CellBaseException { + if (!dataVersionFilenamesMap.containsKey(data)) { + throw new CellBaseException("Version filename not found for data '" + data + "'"); + } + return dataVersionFilenamesMap.get(data); + } + + public static List getUrls(List downloadFiles) { + return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()); + } + + public static String getManualUrl(DownloadProperties.URLProperties props, String fileId) { + return getManualUrl(props.getHost(), props.getFiles().get(fileId)); + } + + public static String getManualUrl(String host, String file) { + if (file.startsWith(MANUAL_PREFIX)) { + return MANUAL_PREFIX + host + 
file.replace(MANUAL_PREFIX, ""); + } + return null; + } + + public static List getDataList(String data, CellBaseConfiguration configuration, SpeciesConfiguration speciesConfiguration) + throws CellBaseException { + switch (data) { + case REPEATS_DATA: { + return getRepeatsDataList(configuration, speciesConfiguration); + } + default: { + throw new CellBaseException("Unknown data " + data); + } + } + } + + private static List getRepeatsDataList(CellBaseConfiguration configuration, SpeciesConfiguration speciesConfiguration) { + List dataList = new ArrayList<>(); + String speciesId = speciesConfiguration.getId().toUpperCase(Locale.ROOT); + if (speciesId.equalsIgnoreCase(HSAPIENS)) { + return Arrays.asList(TRF_DATA, WM_DATA, GSD_DATA); + } + + if (isDataSupported(configuration.getDownload().getSimpleRepeats(), speciesId)) { + dataList.add(TRF_DATA); + } + if (isDataSupported(configuration.getDownload().getWindowMasker(), speciesId)) { + dataList.add(WM_DATA); + } + if (isDataSupported(configuration.getDownload().getGenomicSuperDups(), speciesId)) { + dataList.add(GSD_DATA); + } + return dataList; + } + + public static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { + for (String key : props.getFiles().keySet()) { + if (key.startsWith(prefix)) { + return true; + } + } + return false; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java new file mode 100644 index 0000000000..550197c762 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -0,0 +1,212 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; + + +public abstract class AbstractBuilder { + + protected CellBaseSerializer serializer; + protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + + protected boolean checked; + + protected Logger logger; + + public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; + public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!"; + + public static final String BUILDING_LOG_MESSAGE = "Building {} data ..."; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building done."; + + public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; + public static final String 
CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done."; + + public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; + + public static final String SKIPPING_INDEX_DATA_LOG_MESSAGE = "Skipping index for data '{}': it is not supported for species '{}'."; + public static final String DATA_ALREADY_BUILT = "'{}' data has already been built."; + + protected AbstractBuilder(CellBaseSerializer serializer) { + logger = LoggerFactory.getLogger(this.getClass()); + + this.serializer = serializer; + this.checked = false; + } + + public abstract void parse() throws Exception; + + public void disconnect() { + if (serializer != null) { + try { + serializer.close(); + } catch (Exception e) { + logger.error("Error closing serializer. Stack trace: {}", e.getStackTrace()); + } + } + } + + protected static String getConfigurationFileIdPrefix(String scientificSpecies) { + String prefix = ""; + if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { + char c = scientificSpecies.charAt(0); + prefix = (c + scientificSpecies.split(" ")[1] + "_").toUpperCase(); + } + return prefix; + } + + protected File checkFile(DownloadProperties.URLProperties props, String fileId, Path targetPath, String name) throws CellBaseException { + logger.info("Checking file {} (file ID {} in config.) ...", name, fileId); + String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); + if (filename.contains(MANUAL_PREFIX)) { + filename = filename.replace(MANUAL_PREFIX, ""); + } else if (filename.contains(SCRIPT_PREFIX)) { + filename = filename.split("@")[1]; + } + Path filePath = targetPath.resolve(filename); + if (!Files.exists(filePath)) { + if (filename.contains(PUT_CAPITAL_SPECIES_HERE_MARK)) { + // Check + filename = filename.replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK + "." 
+ PUT_RELEASE_HERE_MARK, "") + .replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK, ""); + boolean found = false; + for (File file : targetPath.toFile().listFiles()) { + if (file.getName().endsWith(filename)) { + filePath = file.toPath(); + found = true; + } + } + if (!found) { + throw new CellBaseException("Expected " + name + " file (configuration file ID = " + fileId + ") does not exist at " + + targetPath); + } + } else { + throw new CellBaseException("Expected " + name + " file: " + filename + " does not exist at " + targetPath); + } + } + logger.info("Ok."); + return filePath.toFile(); + } + + protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException { + logger.info("Checking file {} (file ID {} in config.) ...", getDataName(data), fileId); + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'"); + } + if (!Files.exists(targetPath)) { + throw new CellBaseException("Folder does not exist " + targetPath); + } + + String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); + Path filePath = targetPath.resolve(filename); + if (!Files.exists(filePath)) { + throw new CellBaseException(getDataName(data) + " file " + filePath + " does not exist"); + } + logger.info("Ok."); + return filePath.toFile(); + } + + protected List checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { + return checkFiles(getDataName(data), data, downloadPath, expectedFiles); + } + + protected List checkFiles(String label, String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { + List files = checkFiles(dataSourceReader.readValue(downloadPath.resolve(getDataVersionFilename(data)).toFile()), + downloadPath, label); + if (files.size() != expectedFiles) { + throw new 
CellBaseException(expectedFiles + " " + label + " files are expected at " + downloadPath + ", but currently there" + + " are " + files.size() + " files"); + } + return files; + } + + protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { + logger.info("Checking {} folder and files ...", name); + if (!targetPath.toFile().exists()) { + throw new CellBaseException(name + " folder does not exist " + targetPath); + } + + List files = new ArrayList<>(); + + List filenames = dataSource.getUrls().stream().map(u -> Paths.get(u).getFileName().toString()).collect(Collectors.toList()); + for (String filename : filenames) { + File file = targetPath.resolve(filename).toFile(); + if (!file.exists()) { + throw new CellBaseException("File " + file + " does not exits"); + } else { + files.add(file); + } + } + logger.info("Ok."); + return files; + } + + protected Path getIndexFastaReferenceGenome(Path fastaPath) throws CellBaseException { + Path indexFastaPath = Paths.get(fastaPath + FAI_EXTENSION); + if (!Files.exists(indexFastaPath)) { + // Index FASTA file + logger.info("Indexing FASTA file {} ...", fastaPath); + String errorMsg = "Error executing 'samtools faidx' for FASTA file "; + try { + List params = Arrays.asList("faidx", fastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "samtools", params, null); + } catch (IOException e) { + throw new CellBaseException(errorMsg + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException(errorMsg + fastaPath, e); + } + if (!Files.exists(indexFastaPath)) { + throw new CellBaseException("It could not index the FASTA file " + fastaPath + ". 
Please, try to do it manually!"); + } + } + return indexFastaPath; + } + + public static boolean existFiles(List paths) { + for (Path path : paths) { + if (!Files.exists(path)) { + return false; + } + } + return true; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java index b96985c399..7dd8b6a5bd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddAllAnnotationBuilder.java @@ -29,7 +29,7 @@ * @since October 08, 2014 */ @Deprecated -public class CaddAllAnnotationBuilder extends CellBaseBuilder { +public class CaddAllAnnotationBuilder extends AbstractBuilder { private final Path caddFilePath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index f4c6c861fd..64e4dda059 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -17,32 +17,33 @@ package org.opencb.cellbase.lib.builders; import org.opencb.biodata.models.core.GenomicScoreRegion; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 06/11/15. 
*/ -public class CaddScoreBuilder extends CellBaseBuilder { +public class CaddScoreBuilder extends AbstractBuilder { - private Path caddFilePath; + private Path caddDownloadPath; private static final int CHUNK_SIZE = 1000; private static final int DECIMAL_RESOLUTION = 100; - public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { + public CaddScoreBuilder(Path caddDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.caddFilePath = caddFilePath; - - logger = LoggerFactory.getLogger(CaddScoreBuilder.class); + this.caddDownloadPath = caddDownloadPath; } /* Example: @@ -57,14 +58,25 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { */ @Override public void parse() throws Exception { - FileUtils.checkPath(caddFilePath); + String dataName = getDataName(CADD_DATA); + String dataCategory = getDataCategory(CADD_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(caddDownloadPath, dataName); + + // Check ontology files + List caddFiles = checkFiles(dataSourceReader.readValue(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA)).toFile()), + caddDownloadPath, dataName); + if (caddFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + caddFiles.size() + " files"); + } - BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFilePath); List rawValues = new ArrayList<>(CHUNK_SIZE); List scaledValues = new ArrayList<>(CHUNK_SIZE); int start = 1; -// int end = 1999; int end = CHUNK_SIZE - 1; String line; String[] fields = new String[0]; @@ -72,8 +84,8 @@ public void parse() throws Exception { int lineCount = 0; int counter = 1; int serializedChunks = 0; - int previousPosition = 0; - int newPosition = 0; + int prevPos = 0; + int newPos = 0; String chromosome = null; String[] nucleotides = new String[]{"A", "C", "G", "T"}; @@ -81,127 +93,102 @@ public void parse() throws 
Exception { long scaledLongValue = 0; Map rawScoreValuesMap = new HashMap<>(); Map scaledScoreValuesMap = new HashMap<>(); - while ((line = bufferedReader.readLine()) != null) { - if (!line.startsWith("#")) { - fields = line.split("\t"); - newPosition = Integer.parseInt(fields[1]); -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.debug("line {} reached", line); -// logger.debug("Associated chunk count {}", serializedChunks); -// logger.debug("start {}", start); -// logger.debug("end {}", end); -// logger.debug("chunk size {}", CHUNK_SIZE); -// } - // this only happens the first time, when we start reading the file - if (chromosome == null) { - logger.info("Parsing chr {} ", fields[0]); - chromosome = fields[0]; - - start = newPosition; - previousPosition = newPosition; - end = start + CHUNK_SIZE - 2; - } - if (!chromosome.equals(fields[0])) { - logger.info("Parsing chr {} ", fields[0]); - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - chromosome = fields[0]; - start = newPosition; -// end = CHUNK_SIZE - 1; - end = start + CHUNK_SIZE - 2; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); -// rawLongValue = 0; -// lineCount = 0; -// rawScoreValuesMap.clear(); -// scaledScoreValuesMap.clear(); - // The series of cadd scores is not continuous through the whole chromosome - } else if (end < newPosition || (newPosition - previousPosition) > 1) { - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion - = new 
GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion - = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - start = newPosition; -// start = end + 1; -// end += CHUNK_SIZE; - end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); - } + logger.info(PARSING_LOG_MESSAGE, caddFiles.get(0)); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) { + while ((line = bufferedReader.readLine()) != null) { + if (!line.startsWith("#")) { + fields = line.split("\t"); + newPos = Integer.parseInt(fields[1]); + String message = "chrom. " + fields[0]; + // This only happens the first time, when we start reading the file + if (chromosome == null) { + logger.info(PARSING_LOG_MESSAGE, message); + chromosome = fields[0]; + + start = newPos; + prevPos = newPos; + end = start + CHUNK_SIZE - 2; + } - rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); - scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); - - if (++lineCount == 3) { -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.info("offset: {}", rawValues.size()); -// } - - for (String nucleotide : nucleotides) { - // raw CADD score values can be negative, we add 10 to make positive - float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; - v = (short) (a * DECIMAL_RESOLUTION); - rawLongValue = (rawLongValue << 16) | v; - - // scaled CADD scores are always positive - a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f); - v = (short) (a * DECIMAL_RESOLUTION); - scaledLongValue = (scaledLongValue << 16) | v; + if (!chromosome.equals(fields[0])) { 
+ logger.info(PARSING_LOG_MESSAGE, message); + + // Both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + chromosome = fields[0]; + start = newPos; + end = start + CHUNK_SIZE - 2; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); + // The series of cadd scores is not continuous through the whole chromosome + } else if (end < newPos || (newPos - prevPos) > 1) { + // Both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + start = newPos; + end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); } -// if (rawLongValue < 0 || scaledLongValue < 0) { -// logger.error("raw/scaled Long Values cannot be 0"); -// logger.error("Last read line {}", line); -// System.exit(1); -// } - rawValues.add(rawLongValue); - scaledValues.add(scaledLongValue); - - counter++; - rawLongValue = 0; - lineCount = 0; - rawScoreValuesMap.clear(); - scaledScoreValuesMap.clear(); + rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); + scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); + + if (++lineCount == 3) { + for (String nucleotide : nucleotides) { + // Raw CADD score values can be negative, we add 10 to make positive + float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; + v = (short) (a * DECIMAL_RESOLUTION); + rawLongValue = (rawLongValue 
<< 16) | v; + + // Scaled CADD scores are always positive + a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f); + v = (short) (a * DECIMAL_RESOLUTION); + scaledLongValue = (scaledLongValue << 16) | v; + } + + rawValues.add(rawLongValue); + scaledValues.add(scaledLongValue); + + counter++; + rawLongValue = 0; + lineCount = 0; + rawScoreValuesMap.clear(); + scaledScoreValuesMap.clear(); + } + prevPos = newPos; } - previousPosition = newPosition; } - } - // Last chunks can be incomplete for both raw and scaled are serialized -// GenomicScoreRegion genomicScoreRegion = -// new GenomicScoreRegion<>(fields[0], start, start + rawValues.size() - 1, "cadd_raw", rawValues); - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); + // Last chunks can be incomplete for both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_RAW_DATA, rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); -// genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, start + scaledValues.size() - 1, "cadd_scaled", scaledValues); - genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); + serializer.close(); + } + logger.info(PARSING_DONE_LOG_MESSAGE, caddFiles.get(0)); - serializer.close(); - bufferedReader.close(); - logger.info("Parsing finished."); + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java deleted file mode 100644 index 79e5b7e58b..0000000000 
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Created by imedina on 30/08/14. - */ -public abstract class CellBaseBuilder { - - protected CellBaseSerializer serializer; - - protected Logger logger; - - public CellBaseBuilder(CellBaseSerializer serializer) { - logger = LoggerFactory.getLogger(this.getClass()); - - this.serializer = serializer; - //this.serializer.open(); - } - - public abstract void parse() throws Exception; - - public void disconnect() { - try { - serializer.close(); - } catch (Exception e) { - logger.error("Disconnecting serializer: " + e.getMessage()); - } - } - -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 9247b78faa..aadcdb6caf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -18,23 +18,24 @@ import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; +import 
org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; import org.opencb.commons.utils.FileUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; -import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; -public class ConservationBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class ConservationBuilder extends AbstractBuilder { - private Logger logger; private Path conservedRegionPath; private int chunkSize; @@ -50,326 +51,289 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile fileSerializer = serializer; this.conservedRegionPath = conservedRegionPath; this.chunkSize = chunkSize; - logger = LoggerFactory.getLogger(ConservationBuilder.class); outputFileNames = new HashMap<>(); } @Override public void parse() throws IOException, CellBaseException { - System.out.println("conservedRegionPath = " + conservedRegionPath.toString()); if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { - throw new IOException("Conservation directory does not exist, is not a directory or cannot be read"); + throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + + " be read"); } - /* - * GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse - * this file correctly, so we transform the file into a bedGraph format which is human readable. 
- */ - Path gerpFolderPath = conservedRegionPath.resolve(EtlCommons.GERP_SUBDIRECTORY); - if (gerpFolderPath.toFile().exists()) { - logger.debug("Parsing GERP data ..."); - gerpParser(gerpFolderPath); - } else { - logger.debug("GERP data not found: " + gerpFolderPath.toString()); + // Check GERP folder and files + Path gerpPath = conservedRegionPath.resolve(GERP_DATA); + DataSource dataSource = dataSourceReader.readValue(gerpPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); + + // Check PhastCons folder and files + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); + dataSource = dataSourceReader.readValue(phastConsPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); + + // Check PhyloP folder and files + Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); + dataSource = dataSourceReader.readValue(phylopPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); + + // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse + // this file correctly, so we transform the file into a bedGraph format which is human-readable. 
+ if (gerpFiles.size() != 1) { + throw new CellBaseException("Only one " + getDataName(GERP_DATA) + " file is expected, but currently there are " + + gerpFiles.size() + " files"); + } + File bigwigFile = gerpFiles.get(0); + File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); + String exec = "bigWigToBedGraph"; + if (!bedgraphFile.exists()) { + try { + if (isExecutableAvailable(exec)) { + EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); + } else { + throw new CellBaseException(exec + " not found in your system, install it to build " + getDataName(GERP_DATA) + + ". It is available at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + } + } catch (IOException e) { + throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("" + e.getMessage(), e); + } + if (!bedgraphFile.exists()) { + throw new CellBaseException("Something happened when executing " + exec + " in BIGWIG file " + bigwigFile + "; the BED" + + " graph file was not generated. Please, check " + exec); + } } + gerpParser(bedgraphFile.toPath()); - /* - * UCSC phastCons and phylop are stored in the same format. They are processed together. - */ + // UCSC phastCons and phylop are stored in the same format. They are processed together. 
Map files = new HashMap<>(); String chromosome; Set chromosomes = new HashSet<>(); - // Reading all files in phastCons folder - DirectoryStream directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phastCons"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhastCons filenames + for (File file : phastConsFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phastCons", path); + files.put(chromosome + PHASTCONS_DATA, file.toPath()); } - // Reading all files in phylop folder - directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phylop"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhyloP filenames + for (File file : phylopFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phylop", path); + files.put(chromosome + PHYLOP_DATA, file.toPath()); } - /* - * Now we can iterate over all the chromosomes found and process the files - */ - logger.debug("Chromosomes found '{}'", chromosomes.toString()); + // Now we can iterate over all the chromosomes found and process the files + logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phastCons")); - processWigFixFile(files.get(chr + "phastCons"), "phastCons"); + logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_DATA); - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phylop")); - processWigFixFile(files.get(chr + "phylop"), "phylop"); + logger.debug("Processing chromosome '{}', file '{}'", chr, 
files.get(chr + PHYLOP_DATA)); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } } - private void gerpParser(Path gerpFolderPath) throws IOException, CellBaseException { - Path gerpProcessFilePath = gerpFolderPath.resolve(EtlCommons.GERP_PROCESSED_FILE); - logger.info("parsing {}", gerpProcessFilePath); - BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath); - - String line; - int startOfBatch = 0; - int previousEndValue = 0; - String chromosome = null; - String previousChromosomeValue = null; - - List conservationScores = new ArrayList<>(chunkSize); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - - // file is wrong. throw an exception instead? - if (fields.length != 4) { - logger.error("skipping invalid line: " + line.length()); - continue; - } + private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, gerpProcessFilePath); - chromosome = fields[0]; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath)) { + String line; + int startOfBatch = 0; + int previousEndValue = 0; + String chromosome = null; + String previousChromosomeValue = null; - // new chromosome, store batch - if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { - storeScores(startOfBatch, previousChromosomeValue, conservationScores); + List conservationScores = new ArrayList<>(chunkSize); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); - // reset values for current batch - startOfBatch = 0; - } + // Checking line + if (fields.length != 4) { + throw new CellBaseException("Invalid " + getDataName(GERP_DATA) + " line (expecting 4 columns): " + fields.length + + " items: " + line); + } - // reset chromosome for next entry - previousChromosomeValue = chromosome; + chromosome = fields[0]; - // file is american! 
starts at zero, add one - int start = Integer.parseInt(fields[1]) + 1; - // inclusive - int end = Integer.parseInt(fields[2]) + 1; + // New chromosome, store batch + if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { + storeScores(startOfBatch, previousChromosomeValue, conservationScores); - // start coordinate for this batch of 2,000 - if (startOfBatch == 0) { - startOfBatch = start; - previousEndValue = 0; - } + // Reset values for current batch + startOfBatch = 0; + } - // if there is a gap between the last entry and this one. - if (previousEndValue != 0 && (start - previousEndValue) != 0) { - // gap is too big! store what we already have before processing more - if (start - previousEndValue >= chunkSize) { - // we have a full batch, store - storeScores(startOfBatch, chromosome, conservationScores); + // Reset chromosome for next entry + previousChromosomeValue = chromosome; + + // File is american! starts at zero, add one + int start = Integer.parseInt(fields[1]) + 1; + // Inclusive + int end = Integer.parseInt(fields[2]) + 1; - // reset batch to start at this record + // Start coordinate for this batch of 2,000 + if (startOfBatch == 0) { startOfBatch = start; - } else { - // fill in the gap with zeroes - // don't overfill the batch - while (previousEndValue < start && conservationScores.size() < chunkSize) { - conservationScores.add((float) 0); - previousEndValue++; + previousEndValue = 0; + } + + // If there is a gap between the last entry and this one + if (previousEndValue != 0 && (start - previousEndValue) != 0) { + // Gap is too big!
store what we already have before processing more + if (start - previousEndValue >= chunkSize) { + // We have a full batch, store + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset batch to start at this record + startOfBatch = start; + } else { + // Fill in the gap with zeroes, don't overfill the batch + while (previousEndValue < start && conservationScores.size() < chunkSize) { + conservationScores.add((float) 0); + previousEndValue++; + } + + // We have a full batch, store + if (conservationScores.size() == chunkSize) { + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset: start a new batch + startOfBatch = start; + } } + } + + // Reset value + previousEndValue = end; + + // Score for these coordinates + String score = fields[3]; - // we have a full batch, store + // Add the score for each coordinate included in the range start-end + while (start < end) { + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. start a new batch + // Reset: start a new batch startOfBatch = start; } - } - } - // reset value - previousEndValue = end; + // Add score to batch + conservationScores.add(Float.valueOf(score)); - // score for these coordinates - String score = fields[3]; + // Increment coordinate + start++; + } - // add the score for each coordinate included in the range start-end - while (start < end) { - // we have a full batch, store + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. 
start a new batch - startOfBatch = start; + // Reset: start a new batch + startOfBatch = 0; } - - // add score to batch - conservationScores.add(Float.valueOf(score)); - - // increment coordinate - start++; } - - // we have a full batch, store - if (conservationScores.size() == chunkSize) { + // We need to serialize the last chunk that might be incomplete + if (!conservationScores.isEmpty()) { storeScores(startOfBatch, chromosome, conservationScores); - - // reset, start a new batch - startOfBatch = 0; } } - // we need to serialize the last chunk that might be incomplete - if (!conservationScores.isEmpty()) { - storeScores(startOfBatch, chromosome, conservationScores); - } - bufferedReader.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gerpProcessFilePath); } private void storeScores(int startOfBatch, String chromosome, List conservationScores) throws CellBaseException { - // if this is a small batch, fill in the missing coordinates with 0 + // If this is a small batch, fill in the missing coordinates with 0 while (conservationScores.size() < chunkSize) { conservationScores.add((float) 0); } if (conservationScores.size() != chunkSize) { - throw new CellBaseException("invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); + throw new CellBaseException("Invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); } - GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion(chromosome, startOfBatch, - startOfBatch + conservationScores.size() - 1, "gerp", conservationScores); + GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch, + startOfBatch + conservationScores.size() - 1, GERP_DATA, conservationScores); fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome)); - // reset + // Reset conservationScores.clear(); } -// @Deprecated -// private void gerpParser(Path gerpFolderPath) throws IOException, 
InterruptedException { -// logger.info("Uncompressing {}", gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// List tarArgs = Arrays.asList("-xvzf", gerpFolderPath.resolve(EtlCommons.GERP_FILE).toString(), -// "--overwrite", "-C", gerpFolderPath.toString()); -// EtlCommons.runCommandLineProcess(null, "tar", tarArgs, null); -// -// DirectoryStream pathDirectoryStream = Files.newDirectoryStream(gerpFolderPath, "*.rates"); -// boolean filesFound = false; -// for (Path path : pathDirectoryStream) { -// filesFound = true; -// logger.info("Processing file '{}'", path.getFileName().toString()); -// String[] chromosome = path.getFileName().toString().replaceFirst("chr", "").split("\\."); -// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(String.valueOf(path)))); -// String line; -// int start = 1; -// int end = 1999; -// int counter = 1; -// String[] fields; -// List val = new ArrayList<>(chunkSize); -// while ((line = bufferedReader.readLine()) != null) { -// fields = line.split("\t"); -// val.add(Float.valueOf(fields[1])); -// counter++; -// if (counter == chunkSize) { -//// ConservationScoreRegion conservationScoreRegion = new ConservationScoreRegion(chromosome[0], start, end, "gerp", -// val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, end, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome[0])); -// -// start = end + 1; -// end += chunkSize; -// -// counter = 0; -// val.clear(); -// } -// } -// -// // we need to serialize the last chunk that might be incomplete -//// ConservationScoreRegion conservationScoreRegion = -//// new ConservationScoreRegion(chromosome[0], start, start + val.size() - 1, "gerp", val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, start + val.size() - 1, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, 
getOutputFileName(chromosome[0])); -// -// bufferedReader.close(); -// } -// -// if (!filesFound) { -// logger.warn("No GERP++ files were found. Please check that the original file {} is there, that it was" -// + " properly decompressed and that the *.rates files are present", -// gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// } -// } - - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { - BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath); - - String line; - String chromosome = ""; -// int start = 0, end = 0; - int start = 0; - float value; - Map attributes = new HashMap<>(); -// ConservedRegion conservedRegion = null; - List values = new ArrayList<>(); -// ConservationScoreRegion conservedRegion = null; - GenomicScoreRegion conservedRegion = null; - - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("fixedStep")) { - //new group, save last - if (conservedRegion != null) { -// conservedRegion.setEnd(end); -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - } + private void processWigFixFile(Path inGzPath, String conservationSource) { + logger.info(PARSING_LOG_MESSAGE, inGzPath); + String line = null; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { + String chromosome = ""; + int start = 0; + float value; + Map attributes = new HashMap<>(); + List values = new ArrayList<>(); + GenomicScoreRegion conservedRegion = null; + + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("fixedStep")) { + // New group, save last + if (conservedRegion != null) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, + conservationSource, values); + 
fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } -// offset = 0; - attributes.clear(); - String[] attrFields = line.split(" "); - String[] attrKeyValue; - for (String attrField : attrFields) { - if (!attrField.equalsIgnoreCase("fixedStep")) { - attrKeyValue = attrField.split("="); - attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + attributes.clear(); + String[] attrFields = line.split(" "); + String[] attrKeyValue; + for (String attrField : attrFields) { + if (!attrField.equalsIgnoreCase("fixedStep")) { + attrKeyValue = attrField.split("="); + attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + } } - } - chromosome = formatChromosome(attributes); - start = Integer.parseInt(attributes.get("start")); -// end = Integer.parseInt(attributes.get("start")); - - values = new ArrayList<>(2000); - } else { - int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; -// end++; - int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - // This is the endChunk if current read score is - // appended to the array (otherwise it would be - // start + values.size() - 1). If this endChunk is - // different from the startChunk means that current - // conserved region must be dumped and current - // score must be associated to next chunk. Main - // difference to what there was before is that if - // the fixedStep starts on the last position of a - // chunk e.g. 
1999, the chunk must be created with - // just that score - the chunk was left empty with - // the old code - if (startChunk != endChunk) { -// conservedRegion = new ConservationScoreRegion(chromosome, start, end - 1, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - start = start + values.size(); - values.clear(); - } + chromosome = formatChromosome(attributes); + start = Integer.parseInt(attributes.get("start")); + + values = new ArrayList<>(2000); + } else { + int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + // This is the endChunk if current read score is appended to the array (otherwise it would be start + values.size() + // - 1). If this endChunk is different from the startChunk means that current conserved region must be dumped and + // current score must be associated to next chunk. Main difference to what there was before is that if the fixedStep + // starts on the last position of a chunk e.g. 1999, the chunk must be created with just that score - the chunk was + // left empty with the old code + if (startChunk != endChunk) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, + values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + start = start + values.size(); + values.clear(); + } - value = Float.parseFloat(line.trim()); - values.add(value); + try { + value = Float.parseFloat(line.trim()); + } catch (NumberFormatException e) { + value = 0; + logger.warn("Invalid value: {}. 
Stack trace: {}", line, e.getStackTrace()); + } + values.add(value); + } } + + // Write last + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } catch (Exception e) { + logger.error("ERROR parsing {}. Line: {}. Stack trace: {}", inGzPath, line, e.getStackTrace()); } - //write last -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, - values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - bufferedReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } private String getOutputFileName(String chromosome) { @@ -377,15 +341,23 @@ private String getOutputFileName(String chromosome) { if (chromosome.equals("M")) { chromosome = "MT"; } - String outputFileName = outputFileNames.get(chromosome); - if (outputFileName == null) { - outputFileName = "conservation_" + chromosome; + + String outputFileName; + if (outputFileNames.containsKey(chromosome)) { + outputFileName = outputFileNames.get(chromosome); + } else { + outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); } return outputFileName; } - // phylop and phastcons list the chromosome as M instead of the standard MT. replace. + /** + * Remove chr from the chromosome name; and phylop and phastcons list the chromosome as M instead of the standard MT, replace it. 
+ * + * @param attributes Attributes map with the chromosome name + * @return The new chromosome name + */ private String formatChromosome(Map attributes) { String chromosome = attributes.get("chrom").replace("chr", ""); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java index 4f128562e6..488b06e724 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java @@ -30,17 +30,20 @@ import java.nio.file.Paths; import java.util.*; -import static org.opencb.cellbase.lib.EtlCommons.DBSNP_NAME; +import static org.opencb.cellbase.lib.EtlCommons.DBSNP_DATA; /** * Created by imedina on 06/11/15. */ -public class DbSnpBuilder extends CellBaseBuilder { +public class DbSnpBuilder extends AbstractBuilder { - private final Path sourceVariationPath; + private final Path downloadPath; private final DownloadProperties.URLProperties dbSnpUrlProperties; private static final Map CHROMOSOME_MAPPING; + public static final String DBSNP_OUTPUT_BASENAME = "dbsnp"; + public static final String DBSNP_OUTPUT_FILENAME = DBSNP_OUTPUT_BASENAME + ".json.gz"; + static { CHROMOSOME_MAPPING = new HashMap<>(); CHROMOSOME_MAPPING.put("NC_000001", "1"); @@ -69,9 +72,9 @@ public class DbSnpBuilder extends CellBaseBuilder { CHROMOSOME_MAPPING.put("NC_000024", "Y"); } - public DbSnpBuilder(Path sourceVariationPath, DownloadProperties.URLProperties dbSnpUrlProperties, CellBaseSerializer serializer) { + public DbSnpBuilder(Path downloadPath, DownloadProperties.URLProperties dbSnpUrlProperties, CellBaseSerializer serializer) { super(serializer); - this.sourceVariationPath = sourceVariationPath; + this.downloadPath = downloadPath; this.dbSnpUrlProperties = dbSnpUrlProperties; logger = LoggerFactory.getLogger(DbSnpBuilder.class); @@ -99,7 +102,7 @@ public DbSnpBuilder(Path 
sourceVariationPath, DownloadProperties.URLProperties d */ @Override public void parse() throws Exception { - Path dbSnpFilePath = sourceVariationPath.resolve(Paths.get(dbSnpUrlProperties.getHost()).getFileName()); + Path dbSnpFilePath = downloadPath.resolve(Paths.get(dbSnpUrlProperties.getHost()).getFileName()); FileUtils.checkPath(dbSnpFilePath); CellBaseFileSerializer fileSerializer = (CellBaseFileSerializer) serializer; @@ -120,6 +123,7 @@ public void parse() throws Exception { SnpAnnotation snpAnnotation; + logger.info(PARSING_LOG_MESSAGE, dbSnpFilePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(dbSnpFilePath)) { while ((line = bufferedReader.readLine()) != null) { if (!line.startsWith("#")) { @@ -209,11 +213,11 @@ public void parse() throws Exception { snpAnnotation.setFlags(flags); snpAnnotation.setAdditionalAttributes(additionalAttributes); - Snp snp = new Snp(id, chromosome, position, ref, Arrays.asList(alt), type, DBSNP_NAME, version, snpAnnotation); - fileSerializer.serialize(snp, DBSNP_NAME); + Snp snp = new Snp(id, chromosome, position, ref, Arrays.asList(alt), type, DBSNP_DATA, version, snpAnnotation); + fileSerializer.serialize(snp, DBSNP_DATA); } } } - logger.info("Parsing finished."); + logger.info(PARSING_DONE_LOG_MESSAGE); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java new file mode 100644 index 0000000000..32d779e7ce --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -0,0 +1,1004 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import htsjdk.tribble.readers.TabixReader; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.feature.gff.Gff2; +import org.opencb.biodata.formats.feature.gtf.Gtf; +import org.opencb.biodata.formats.feature.gtf.io.GtfReader; +import org.opencb.biodata.formats.io.FileFormatException; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.tools.sequence.FastaIndex; +import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class EnsemblGeneBuilder extends AbstractBuilder { + + private Path downloadPath; + private SpeciesConfiguration speciesConfiguration; + private boolean flexibleGTFParsing; + private CellBaseConfiguration configuration; + + private final Map transcriptDict; + private final Map exonDict; + + private Path gtfFile = null; + private Path proteinFastaFile = null; + private Path cDnaFastaFile = null; + private Path geneDescriptionFile = 
null; + private Path xrefsFile = null; + private Path hgncFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path uniprotIdMappingFile = null; + private Path tfbsFile = null; + private Path tabixFile = null; + private Path geneExpressionFile = null; + private Path geneDrugFile = null; + private Path hpoFile = null; + private Path genomeSequenceFilePath = null; + private Path gnomadFile = null; + private Path geneOntologyAnnotationFile = null; + private Path miRBaseFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile = null; + private Path cancerHostpotFile = null; + private Path ensemblCanonicalFile = null; + + // source for genes is either ensembl or refseq + private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); + + private int geneCounter; + private ArrayList geneList; + private String geneName; + private int transcriptCounter; + private ArrayList transcriptList; + private String transcriptName; + private int exonCounter; + private String feature; + private Gtf nextGtfToReturn; + + private boolean isHSapiens = false; + + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; + public static final String ENSEMBL_GENE_OUTPUT_FILENAME = ENSEMBL_GENE_BASENAME + ".json.gz"; + + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseConfiguration configuration, CellBaseSerializer serializer) { + super(serializer); + + this.downloadPath = downloadPath; + this.speciesConfiguration = speciesConfiguration; + this.flexibleGTFParsing = flexibleGTFParsing; + this.configuration = configuration; + + transcriptDict = new HashMap<>(250000); + exonDict = new HashMap<>(8000000); + + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + isHSapiens = true; + } + } + + public void check() throws Exception { + if (checked) { + return; + } + + String ensemblGeneLabel = getDataName(ENSEMBL_DATA) + " " + 
getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); + + // Sanity check + checkDirectory(downloadPath, ensemblGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check Ensembl files + DownloadProperties.URLProperties props = configuration.getDownload().getEnsembl().getUrl(); + gtfFile = checkFile(props, ENSEMBL_GTF_FILE_ID, downloadPath, "Ensembl GTF").toPath(); + proteinFastaFile = checkFile(props, ENSEMBL_PEP_FA_FILE_ID, downloadPath, "Ensembl Protein Fasta").toPath(); + cDnaFastaFile = checkFile(props, ENSEMBL_CDNA_FA_FILE_ID, downloadPath, "Ensembl CDNA Fasta").toPath(); + + // Commons + geneDescriptionFile = checkFile(props, ENSEMBL_DESCRIPTION_FILE_ID, downloadPath.getParent(), "Ensembl Description").toPath(); + xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath(); + ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); + + // Check common files + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MANE_SELECT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(LRG_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || 
isDataSupported(configuration.getDownload().getHgnc(), prefixId)) { + hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HGNC_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(DGIDB_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneUniprotXref(), prefixId)) { + uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneExpressionAtlas(), prefixId)) { + geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { + hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HPO_DISEASE_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || 
isDataSupported(configuration.getDownload().getGnomadConstraints(), prefixId)) { + gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGoAnnotation(), prefixId)) { + geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_GENE_CENSUS_DATA), speciesConfiguration.getScientificName()); + } + + // Check regulation files + // Motif features + List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MOTIF_FEATURES_DATA), 2); + if (files.get(0).getName().endsWith("tbi")) { + tabixFile = files.get(0).toPath(); + tfbsFile = files.get(1).toPath(); + } else { + tabixFile = files.get(1).toPath(); + tfbsFile = files.get(0).toPath(); + } + + // mirbase + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRTARBASE_DATA), speciesConfiguration.getScientificName()); + } + + // mirtarbase + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, 
downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRTARBASE_DATA), speciesConfiguration.getScientificName()); + } + + // Check genome FASTA file + Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA); + String genomeGzFilename = Paths.get(((DataSource) dataSourceReader.readValue(genomeDownloadPath + .resolve(getDataVersionFilename(GENOME_DATA)).toFile())).getUrls().get(0)).getFileName().toString(); + genomeSequenceFilePath = getFastaPath(genomeDownloadPath.resolve(genomeGzFilename)); + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); + checked = true; + } + + public void parse() throws Exception { + check(); + + Gene gene = null; + Transcript transcript; + Exon exon = null; + int cdna = 1; + int cds = 1; + + EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); + try { + // process files and put values in rocksdb + indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, proteinFastaFile, + cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, geneDrugFile, hpoFile, gnomadFile, + geneOntologyAnnotationFile, miRBaseFile, miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); + + TabixReader tabixReader = null; + if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { + logger.error("Tfbs or tabix file not found. 
Download them and try again."); + } else { + tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); + } + + // Preparing the fasta file for fast accessing + FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); + + // Empty transcript and exon dictionaries + transcriptDict.clear(); + exonDict.clear(); + + logger.info(PARSING_LOG_MESSAGE, gtfFile); + GtfReader gtfReader = new GtfReader(gtfFile); + + // Gene->Transcript->Feature->GTF line + Map>> gtfMap = null; + if (flexibleGTFParsing) { + gtfMap = loadGTFMap(gtfReader); + initializePointers(gtfMap); + } + + Gtf gtf; + while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { + + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + String geneId = gtf.getAttributes().get("gene_id"); + String transcriptId = gtf.getAttributes().get("transcript_id"); + String geneName = gtf.getAttributes().get("gene_name"); + if (newGene(gene, geneId)) { + // If new geneId is different from the current then we must serialize before data new gene + if (gene != null) { + serializer.serialize(gene); + } + + GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), + indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), + indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); + + gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), + gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), + new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); + } + + // Check if Transcript exist in the Gene Set of transcripts + if (!transcriptDict.containsKey(transcriptId)) { + 
transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); + } else { + transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); + } + + // At this point gene and transcript objects are set up + // Update gene and transcript genomic coordinates, start must be the + // lower, and end the higher + updateTranscriptAndGeneCoords(transcript, gene, gtf); + + String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; + if (gtf.getFeature().equalsIgnoreCase("exon")) { + // Obtaining the exon sequence + String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); + String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); + + exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf + .getAttributes().get("exon_number")), exonSequence); + transcript.getExons().add(exon); + + exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); + if (gtf.getAttributes().get("exon_number").equals("1")) { + cdna = 1; + cds = 1; + } else { + // with every exon we update cDNA length with the previous exon length + cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() + - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; + } + } else { + exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); + if (gtf.getFeature().equalsIgnoreCase("CDS")) { + // Protein ID is only present in CDS lines + String proteinId = gtf.getAttributes().get("protein_id") != null + ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") + : ""; + transcript.setProteinId(proteinId); + transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); + + if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + + // cDNA coordinates + exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + } + // strand - + } else { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + // cDNA coordinates + // cdnaCodingStart points to the same base position than genomicCodingEnd + exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + // cdnaCodingStart points to the same base position than genomicCodingEnd + transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + } + } + + } +// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { +// // nothing to do +// System.out.println("Empty block, this should be redesigned"); +// } + if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { + // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, + // no need to set it at the beginning of next feature + if (exon.getStrand().equals("+")) { + updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingEnd(gtf.getEnd()); + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + transcript.setCdsLength(cds - 1); + + } else { + updateNegativeExonCodingData(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + transcript.setCdsLength(cds - 1); + } + } + } + } + + // last gene must be 
serialized + serializer.serialize(gene); + + // Close + gtfReader.close(); + serializer.close(); + fastaIndex.close(); + indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); + } catch (Exception e) { + indexer.close(); + throw e; + } + } + + private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) + throws IOException, RocksDBException { + Map gtfAttributes = gtf.getAttributes(); + + // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. + String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); + String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; + String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); + List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); + + List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); + TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); + + Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", + 0, 0, 0, 0, 0, + indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", + gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, + new HashSet<>(), transcriptAnnotation); + + // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL + // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure + // that the xrefs array contains all ids present in the GTF file + addGtfXrefs(transcript, gene, gtfAttributes); + + // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID + String hgncId = indexer.getHgncId(gene.getName()); + if (StringUtils.isNotEmpty(hgncId)) { + transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); + } + + // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE + for (String suffix: Arrays.asList("refseq", "refseq_protein")) { + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); + if (StringUtils.isNotEmpty(maneRefSeq)) { + transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, + "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); + } + } + + // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG + String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(lrgRefSeq)) { + transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); + } + + // Add Flags + // 1. GTF tags + String tags = gtf.getAttributes().get("tag"); + if (StringUtils.isNotEmpty(tags)) { + transcript.getFlags().addAll(Arrays.asList(tags.split(","))); + } + + // 2. TSL + String supportLevel = gtfAttributes.get("transcript_support_level"); + if (StringUtils.isNotEmpty(supportLevel)) { + // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" + String truncatedSupportLevel = supportLevel.split(" ")[0]; + transcript.getFlags().add("TSL:" + truncatedSupportLevel); + } + + // 3. MANE Flag + String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); + if (StringUtils.isNotEmpty(maneFlag)) { + transcript.getFlags().add(maneFlag); + } + + // 4. 
LRG Flag + String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); + if (StringUtils.isNotEmpty(lrg)) { + transcript.getFlags().add("LRG"); + } else { + for (Xref xref : transcript.getXrefs()) { + if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { + transcript.getFlags().add("LRG"); + } + } + } + + // 5. Ensembl Canonical + String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); + if (StringUtils.isNotEmpty(canonicalFlag)) { + transcript.getFlags().add(canonicalFlag); + } + + // 6. TSO500 and EGLH HaemOnc +// String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); +// if (StringUtils.isNotEmpty(maneRefSeq)) { +// String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); +// if (StringUtils.isNotEmpty(tso500Flag)) { +// transcript.getFlags().add(tso500Flag); +// } +// +// String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); +// if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { +// transcript.getFlags().add(eglhHaemOncFlag); +// } +// } + + gene.getTranscripts().add(transcript); + + // Do not change order!! size()-1 is the index of the transcript ID + transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); + return transcript; + } + + private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) + throws IOException, RocksDBException { + if (xrefs == null || indexer == null) { + return null; + } + List annotations = new ArrayList<>(); + for (Xref xref : xrefs) { + if (xref.getDbName().equals("uniprotkb_acc")) { + String key = xref.getId(); + if (key != null && indexer.getOntologyAnnotations(key) != null) { + annotations.addAll(indexer.getOntologyAnnotations(key)); + } + } + } + return annotations; + } + + private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. 
+ exon.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingEnd() == 0) { + exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. + exon.setGenomicCodingEnd(gtf.getEnd()); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingStart() == 0) { + exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { + if (transcript.getXrefs() == null) { + transcript.setXrefs(new ArrayList<>()); + } + + transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); + transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); + + // Some non-coding genes do not have Gene names + if (StringUtils.isNotEmpty(gene.getName())) 
{ + transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); + transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); + } + + if (gtfAttributes.get("ccds_id") != null) { + transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); + } + } + + private void initializePointers(Map>> gtfMap) { + geneCounter = 0; + geneList = new ArrayList<>(gtfMap.keySet()); + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + + private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { + // Flexible parsing is deactivated, return next line + if (gtfMap == null) { + return gtfReader.read(); + // Flexible parsing activated, carefully select next line to return + } else { + // No more genes/features to return + if (nextGtfToReturn == null) { + return null; + } + Gtf gtfToReturn = nextGtfToReturn; + if (feature.equals("exon")) { +// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { + nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) + .get(transcriptName).get("exon")).get(exonCounter)).getStart(), + ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), + (List) gtfMap.get(geneName).get(transcriptName).get("cds")); + if (nextGtfToReturn != null) { + feature = "cds"; + return gtfToReturn; + } + } + // if no cds was found for this exon, get next exon + getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("cds") || feature.equals("stop_codon")) { + 
getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("start_codon")) { + feature = "stop_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); + return gtfToReturn; + } + // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon + throw new FileFormatException("Execution cannot reach this point"); + } + } + + private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { + for (Object cdsObject : cdsList) { + int cdsStart = ((Gtf) cdsObject).getStart(); + int cdsEnd = ((Gtf) cdsObject).getEnd(); + if (cdsStart <= exonEnd && cdsEnd >= exonStart) { + return (Gtf) cdsObject; + } + } + return null; + } + + private void getFeatureFollowsExon(Map>> gtfMap) { + exonCounter++; + if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() + || feature.equals("stop_codon")) { + // If last returned feature was a stop_codon or no start_codon is provided for this transcript, + // next transcript must be selected + if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { + feature = "start_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); + } else { + transcriptCounter++; + // No more transcripts in this gene, check if there are more genes + if (transcriptCounter == gtfMap.get(geneName).size()) { + geneCounter++; + // No more genes available, end parsing + if (geneCounter == gtfMap.size()) { + nextGtfToReturn = null; + feature = null; + // Still more genes to parse, select next one + } else { + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + } + } + // Check if a new gene was selected - null would indicate there're no more genes + if (nextGtfToReturn != null) { + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = 
"exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + } else { + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + + private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { + Map>> gtfMap = new HashMap<>(); + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene + String geneId = gtf.getAttributes().get("gene_id"); + // Transcript -> feature -> GTF line + Map> gtfMapGeneEntry; + if (gtfMap.containsKey(geneId)) { + gtfMapGeneEntry = gtfMap.get(geneId); + } else { + gtfMapGeneEntry = new HashMap(); + gtfMap.put(geneId, gtfMapGeneEntry); + } + + // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene + String transcriptId = gtf.getAttributes().get("transcript_id"); + Map gtfMapTranscriptEntry; + if (gtfMapGeneEntry.containsKey(transcriptId)) { + gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); + } else { + gtfMapTranscriptEntry = new HashMap(); + gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); + } + + addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); + + } + + // Exon number is mandatory for the parser to be able to properly generate the gene data model + if (!exonNumberPresent(gtfMap)) { + setExonNumber(gtfMap); + } + + return gtfMap; + } + + private boolean exonNumberPresent(Map>> gtfMap) { + Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); + return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) + .getAttributes().containsKey("exon_number"); + } + + private void setExonNumber(Map>> gtfMap) { + for 
(String gene : gtfMap.keySet()) { + for (String transcript : gtfMap.get(gene).keySet()) { + List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); + Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); + if (exonList.get(0).getStrand().equals("+")) { + int exonNumber = 1; + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber++; + } + } else { + int exonNumber = exonList.size(); + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber--; + } + } + } + } + } + + private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { + // Add exon/cds GTF line to the corresponding gene entry in the map + String featureType = gtf.getFeature().toLowerCase(); + if (featureType.equals("exon") || featureType.equals("cds")) { + List gtfList; + // Check if there were exons already stored + if (gtfMapTranscriptEntry.containsKey(featureType)) { + gtfList = (List) gtfMapTranscriptEntry.get(featureType); + } else { + gtfList = new ArrayList<>(); + gtfMapTranscriptEntry.put(featureType, gtfList); + } + gtfList.add(gtf); + // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" + // keys are already there + } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { + gtfMapTranscriptEntry.put(featureType, gtf); + } + } + + private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { + if (tabixReader == null) { + return null; + } + List transcriptTfbses = null; + + int transcriptStart = transcript.getStart(); + int transcriptEnd = transcript.getEnd(); + + + String line; + TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, transcriptEnd); + while ((line = iter.next()) != null) { + String[] elements = line.split("\t"); + + String sequenceName = elements[0]; + 
String source = elements[1]; + String feature = elements[2]; + int start = Integer.parseInt(elements[3]); + int end = Integer.parseInt(elements[4]); + String score = elements[5]; + String strand = elements[6]; + String frame = elements[7]; + String attribute = elements[8]; + + if (strand.equals(transcript.getStrand())) { + continue; + } + + if (transcript.getStrand().equals("+")) { + if (start > transcript.getStart() + 500) { + break; + } else if (end > transcript.getStart() - 2500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } else { + // transcript in negative strand + if (start > transcript.getEnd() + 2500) { + break; + } else if (start > transcript.getEnd() - 500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } + } + + return transcriptTfbses; + } + + protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, + List transcriptTfbses) { + if (transcriptTfbses == null) { + transcriptTfbses = new ArrayList<>(); + } + + // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; + // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB + String[] attributes = tfbs.getAttribute().split(";"); + + String id = null; + String pfmId = null; + List transciptionFactors = null; + + for (String attributePair : attributes) { + String[] attributePairArray = attributePair.split("="); + switch(attributePairArray[0]) { + case "binding_matrix_stable_id": + pfmId = attributePairArray[1]; + break; + case "stable_id": + id = attributePairArray[1]; + break; + case "transcription_factor_complex": + transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); + break; 
+ default: + break; + } + } + + transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), + tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), + Float.parseFloat(tfbs.getScore()))); + return transcriptTfbses; + } + + private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { + Integer relativeStart; + if (transcript.getStrand().equals("+")) { + if (tfbs.getStart() < transcript.getStart()) { + relativeStart = tfbs.getStart() - transcript.getStart(); + } else { + relativeStart = tfbs.getStart() - transcript.getStart() + 1; + } + } else { + // negative strand transcript + if (tfbs.getEnd() > transcript.getEnd()) { + relativeStart = transcript.getEnd() - tfbs.getEnd(); + } else { + relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; + } + } + return relativeStart; + } + + private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { + Integer relativeEnd; + if (transcript.getStrand().equals("+")) { + if (tfbs.getEnd() < transcript.getStart()) { + relativeEnd = tfbs.getEnd() - transcript.getStart(); + } else { + relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; + } + } else { + if (tfbs.getStart() > transcript.getEnd()) { + relativeEnd = transcript.getEnd() - tfbs.getStart(); + } else { + relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; + } + } + return relativeEnd; + } + + + + private boolean newGene(Gene previousGene, String newGeneId) { + return previousGene == null || !newGeneId.equals(previousGene.getId()); + } + + private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { + if (transcript.getStart() > gtf.getStart()) { + transcript.setStart(gtf.getStart()); + } + if (transcript.getEnd() < gtf.getEnd()) { + transcript.setEnd(gtf.getEnd()); + } + if (gene.getStart() > gtf.getStart()) { + gene.setStart(gtf.getStart()); + } + if (gene.getEnd() < gtf.getEnd()) { + 
gene.setEnd(gtf.getEnd()); + } + } + + private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { + gtfFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { + proteinFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { + cDnaFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index fb67c19b8b..4841f5ffe2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -16,27 +16,44 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.commons.lang3.StringUtils; -import org.apache.poi.hssf.usermodel.HSSFSheet; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParser; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParserCallback; import org.opencb.biodata.formats.gaf.GafParser; import org.opencb.biodata.formats.io.FileFormatException; +import 
org.opencb.biodata.models.core.FeatureOntologyTermAnnotation; +import org.opencb.biodata.models.core.MiRnaGene; +import org.opencb.biodata.models.core.MirnaTarget; import org.opencb.biodata.models.core.Xref; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.models.variant.avro.*; +import org.opencb.biodata.models.variant.avro.Constraint; +import org.opencb.biodata.models.variant.avro.Expression; +import org.opencb.biodata.models.variant.avro.ExpressionCall; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; +import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.zip.GZIPInputStream; -public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; + +public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer { private static final String DESCRIPTION_SUFFIX = "_description"; private static final String XREF_SUFFIX = "_xref"; @@ -54,32 +71,30 @@ public EnsemblGeneBuilderIndexer(Path geneDirectoryPath) { public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, Path lrgFile, Path uniprotIdMappingFile, Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, - Path disgenetFile, Path gnomadFile, Path 
geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, - Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile, Path tso500File, Path eglhHaemOncFile) - throws IOException, RocksDBException, FileFormatException { + Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, + Path cancerHostpotFile, Path canonicalFile) + throws IOException, RocksDBException, FileFormatException, CellBaseException { indexDescriptions(geneDescriptionFile); indexXrefs(xrefsFile, uniprotIdMappingFile); indexHgncIdMapping(hgncFile); - indexManeMapping(maneFile, "ensembl"); - indexLrgMapping(lrgFile, "ensembl"); + indexManeMapping(maneFile, ENSEMBL_DATA); + indexLrgMapping(lrgFile, ENSEMBL_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexExpression(species, geneExpressionFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFile, disgenetFile); + indexDiseases(hpoFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); - indexMiRBase(miRBaseFile); + indexMiRBase(species, miRBaseFile); indexMiRTarBase(miRTarBaseFile); indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); indexCanonical(canonicalFile); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { - logger.info("Loading gene description data..."); + logger.info(PARSING_LOG_MESSAGE, geneDescriptionFile); String[] fields; if (geneDescriptionFile != null && Files.exists(geneDescriptionFile) && Files.size(geneDescriptionFile) > 0) { List lines = Files.readAllLines(geneDescriptionFile, StandardCharsets.ISO_8859_1); @@ -91,6 +106,7 @@ private void indexDescriptions(Path geneDescriptionFile) throws IOException, Roc logger.warn("Gene description file " + geneDescriptionFile + " not found"); logger.warn("Gene description data not loaded"); } + 
logger.info(PARSING_DONE_LOG_MESSAGE); } public String getDescription(String id) throws RocksDBException { @@ -103,7 +119,7 @@ public String getDescription(String id) throws RocksDBException { } private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOException, RocksDBException { - logger.info("Loading xref data..."); + logger.info(PARSING_LOG_MESSAGE, xrefsFile); String[] fields; if (xrefsFile != null && Files.exists(xrefsFile) && Files.size(xrefsFile) > 0) { List lines = Files.readAllLines(xrefsFile, StandardCharsets.ISO_8859_1); @@ -165,6 +181,7 @@ private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOExce logger.warn("Uniprot if mapping file " + uniprotIdMappingFile + " not found"); logger.warn("Protein mapping into xref data not loaded"); } + logger.info(PARSING_DONE_LOG_MESSAGE); } public List getXrefs(String id) throws RocksDBException, IOException { @@ -183,6 +200,10 @@ public List getXrefs(String id) throws RocksDBException, IOException { } private void indexExpression(String species, Path geneExpressionFile) throws IOException, RocksDBException { + if (geneExpressionFile == null) { + return; + } + Map> geneExpressionMap = new HashMap<>(); if (geneExpressionFile != null && Files.exists(geneExpressionFile) && Files.size(geneExpressionFile) > 0 && species != null) { @@ -233,131 +254,12 @@ public List getExpression(String id) throws RocksDBException, IOExce return rocksDbManager.getExpression(rocksdb, key); } - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = 
line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - String line; - - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - 
String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - } - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Collections.singletonList(numberOfSNPs), Collections.singletonList(source), - "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - } - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); + private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { + if (gnomadFile == null) { + return; } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { - if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { + if (Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", 
gnomadFile); InputStream inputStream = Files.newInputStream(gnomadFile); BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(inputStream))); @@ -384,7 +286,7 @@ private void indexConstraints(Path gnomadFile) throws IOException, RocksDBExcept rocksDbManager.update(rocksdb, transcriptIdentifier + CONSTRAINT_SUFFIX, constraints); if ("TRUE".equalsIgnoreCase(canonical)) { - rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); + rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); } } br.close(); @@ -413,6 +315,10 @@ private void addConstraint(List constraints, String name, String val } private void indexOntologyAnnotations(Path goaFile) throws IOException, RocksDBException { + if (goaFile == null) { + return; + } + Map> annotations = new HashMap<>(); if (goaFile != null && Files.exists(goaFile) && Files.size(goaFile) > 0) { logger.info("Loading GO annotation from '{}'", goaFile); @@ -432,66 +338,17 @@ public List getOntologyAnnotations(String id) thr return rocksDbManager.getOntologyAnnotations(rocksdb, key); } - private void indexMiRBase(Path miRBaseFile) throws IOException, RocksDBException { - if (miRBaseFile != null && Files.exists(miRBaseFile) && Files.size(miRBaseFile) > 0) { - logger.info("Loading mirna from '{}'", miRBaseFile); - FileInputStream fileInputStream = new FileInputStream(miRBaseFile.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - org.apache.poi.ss.usermodel.Cell cell = cellIterator.next(); - String miRBaseAccession = cell.getStringCellValue(); - - cell = cellIterator.next(); - String miRBaseID = cell.getStringCellValue(); - - cell = cellIterator.next(); - String status = cell.getStringCellValue(); - - cell = cellIterator.next(); - String 
sequence = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Accession = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Sequence = cell.getStringCellValue(); - - String mature2Accession = ""; - String mature2Id = ""; - String mature2Sequence = ""; - if (cellIterator.hasNext()) { - cell = cellIterator.next(); - mature2Accession = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Sequence = cell.getStringCellValue(); - } + private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + if (miRBaseFile == null) { + return; + } - MiRnaGene miRNAGene = new MiRnaGene(miRBaseAccession, miRBaseID, status, sequence, new ArrayList<>()); - int cdnaStart = sequence.indexOf(mature1Sequence); - int cdnaEnd = cdnaStart + mature1Sequence.length(); - miRNAGene.addMiRNAMature(mature1Accession, mature1Id, mature1Sequence, cdnaStart, cdnaEnd); + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); - cdnaStart = sequence.indexOf(mature2Sequence); - cdnaEnd = cdnaStart + mature2Sequence.length(); - miRNAGene.addMiRNAMature(mature2Accession, mature2Id, mature2Sequence, cdnaStart, cdnaEnd); + MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); + MirBaseParser.parse(miRBaseFile, species, callback); - rocksDbManager.update(rocksdb, miRBaseID + MIRBASE_SUFFIX, miRNAGene); - } - } else { - logger.error("mirna file not found"); - } + logger.info(PARSING_DONE_LOG_MESSAGE, miRBaseFile); } public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOException { @@ -509,117 +366,11 @@ public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOEx return null; } - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) 
&& Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList<>(); - Map> geneToMirna = new HashMap<>(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - - Iterator cellIterator = currentRow.iterator(); - Cell cell = cellIterator.next(); - - // Iterate columns - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // Skip species - cellIterator.next(); - - // Read target gene - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // Skip entrez gene - cellIterator.next(); - // Skip species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList<>(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmed - cell = cellIterator.next(); - String pubmed; - // seems to vary, so check both - 
if (cell.getCellType().equals(CellType.NUMERIC)) { -// pubmed = String.valueOf(cell.getNumericCellValue()); - pubmed = Integer.toString(Double.valueOf(cell.getNumericCellValue()).intValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } - } - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { String key = geneName + MIRTARBASE_SUFFIX; return rocksDbManager.getMirnaTargets(rocksdb, key); } - private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - protected void indexCanonical(Path canonocalFile) throws IOException, RocksDBException { // Gene Transcript Canonical // ENSG00000210049.1 ENST00000387314.1 1 @@ -652,4 +403,30 @@ public String getCanonical(String transcriptId) throws RocksDBException, IOExcep } return new String(bytes); } + + // Implementation of the MirBaseParserCallback function + public class MirBaseCallback implements MirBaseParserCallback { + + private RocksDB rocksDB; + private RocksDbManager rocksDbManager; + private Logger logger; + + public MirBaseCallback(RocksDB rocksDB, RocksDbManager rocksDbManager) { + this.rocksDB = rocksDB; + this.rocksDbManager = rocksDbManager; + this.logger = LoggerFactory.getLogger(this.getClass()); + } + + @Override + public boolean processMiRnaGene(MiRnaGene miRnaGene) { + try { + rocksDbManager.update(rocksdb, miRnaGene.getId() 
+ MIRBASE_SUFFIX, miRnaGene); + } catch (JsonProcessingException | RocksDBException e) { + logger.warn("Something wrong happened when processing miRNA gene {}: {}", miRnaGene.getId(), + StringUtils.join(e.getStackTrace(), "\t")); + return false; + } + return true; + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index cd0863a259..785b296982 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -16,904 +16,105 @@ package org.opencb.cellbase.lib.builders; -import htsjdk.tribble.readers.TabixReader; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.feature.gff.Gff2; -import org.opencb.biodata.formats.feature.gtf.Gtf; -import org.opencb.biodata.formats.feature.gtf.io.GtfReader; -import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.tools.sequence.FastaIndex; -import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.rocksdb.RocksDBException; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.List; -public class GeneBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_BASENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_BASENAME; +import static 
org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; - private Map transcriptDict; - private Map exonDict; +public class GeneBuilder extends AbstractBuilder { - private Path gtfFile; - private Path proteinFastaFile; - private Path cDnaFastaFile; - private Path geneDescriptionFile; - private Path xrefsFile; - private Path hgncFile; - private Path maneFile; - private Path lrgFile; - private Path uniprotIdMappingFile; - private Path tfbsFile; - private Path tabixFile; - private Path geneExpressionFile; - private Path geneDrugFile; - private Path hpoFile; - private Path disgenetFile; - private Path genomeSequenceFilePath; - private Path gnomadFile; - private Path geneOntologyAnnotationFile; - private Path miRBaseFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHostpotFile; - private Path ensemblCanonicalFile; - private Path tso500File; - private Path eglhHaemOncFile; - private boolean flexibleGTFParsing; + private Path downloadPath; + private EnsemblGeneBuilder ensemblGeneBuilder; + private RefSeqGeneBuilder refSeqGeneBuilder; - // source for genes is either ensembl or refseq - private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); - private SpeciesConfiguration speciesConfiguration; + public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseConfiguration configuration) { + super(null); - private int geneCounter; - private ArrayList geneList; - private String geneName; - private int transcriptCounter; - private ArrayList transcriptList; - private String transcriptName; - private int exonCounter; - private String feature; - private Gtf nextGtfToReturn; + this.downloadPath = downloadPath; - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - CellBaseSerializer serializer) throws CellBaseException { - this(geneDirectoryPath, 
genomeSequenceFastaFile, speciesConfiguration, false, serializer); - } - - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { - this(null, geneDirectoryPath.resolve("description.txt"), - geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), - geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), - geneDirectoryPath.resolve("idmapping_selected.tab.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz.tbi"), - geneDirectoryPath.resolve("allgenes_updown_in_organism_part.tab.gz"), - geneDirectoryPath.resolve("dgidb.tsv"), - geneDirectoryPath.resolve("phenotype_to_genes.txt"), - geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"), - geneDirectoryPath.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz"), - geneDirectoryPath.resolve("goa_human.gaf.gz"), - geneDirectoryPath.getParent().resolve("regulation/miRNA.xls"), - geneDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"), - geneDirectoryPath.resolve("cancer-gene-census.tsv"), - geneDirectoryPath.resolve("hotspots_v2.xls"), - geneDirectoryPath.resolve("ensembl_canonical.txt"), - geneDirectoryPath.resolve("TSO500_transcripts.txt"), - geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"), - genomeSequenceFastaFile, - speciesConfiguration, flexibleGTFParsing, serializer); + // Create Ensembl gene builder + CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath, ENSEMBL_GENE_BASENAME); + this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, + configuration, ensemblGeneSerializer); - 
getGtfFileFromGeneDirectoryPath(geneDirectoryPath); - getProteinFastaFileFromGeneDirectoryPath(geneDirectoryPath); - getCDnaFastaFileFromGeneDirectoryPath(geneDirectoryPath); + // Create RefSeq gene builder + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath, REFSEQ_GENE_BASENAME); + this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, configuration, + refSeqGeneSerializer); } - public GeneBuilder(Path gtfFile, Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, - Path lrgFile, Path uniprotIdMappingFile, Path tfbsFile, Path tabixFile, Path geneExpressionFile, - Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, - Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneCensusFile, - Path cancerHostpotFile, Path ensemblCanonicalFile, Path tso500File, Path eglhHaemOncFile, - Path genomeSequenceFilePath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - CellBaseSerializer serializer) { - super(serializer); - - this.gtfFile = gtfFile; - this.geneDescriptionFile = geneDescriptionFile; - this.xrefsFile = xrefsFile; - this.hgncFile = hgncFile; - this.maneFile = maneFile; - this.lrgFile = lrgFile; - this.uniprotIdMappingFile = uniprotIdMappingFile; - this.tfbsFile = tfbsFile; - this.tabixFile = tabixFile; - this.geneExpressionFile = geneExpressionFile; - this.geneDrugFile = geneDrugFile; - this.hpoFile = hpoFile; - this.disgenetFile = disgenetFile; - this.gnomadFile = gnomadFile; - this.geneOntologyAnnotationFile = geneOntologyAnnotationFile; - this.miRBaseFile = miRBaseFile; - this.miRTarBaseFile = miRTarBaseFile; - this.cancerGeneCensusFile = cancerGeneCensusFile; - this.cancerHostpotFile = cancerHostpotFile; - this.ensemblCanonicalFile = ensemblCanonicalFile; - this.tso500File = tso500File; - this.eglhHaemOncFile = eglhHaemOncFile; - this.genomeSequenceFilePath = genomeSequenceFilePath; - 
this.speciesConfiguration = speciesConfiguration; - this.flexibleGTFParsing = flexibleGTFParsing; + public void check() throws Exception { + // Check Ensembl requirements + ensemblGeneBuilder.check(); - transcriptDict = new HashMap<>(250000); - exonDict = new HashMap<>(8000000); + // Check RefSeq requirements + refSeqGeneBuilder.check(); } + @Override public void parse() throws Exception { - Gene gene = null; - Transcript transcript; - Exon exon = null; - int cdna = 1; - int cds = 1; - EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(gtfFile.getParent()); - - try { - // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, - tso500File, eglhHaemOncFile); - - TabixReader tabixReader = null; - if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { - logger.error("Tfbs or tabix file not found. 
Download them and try again."); - } else { - tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); - } - - // Preparing the fasta file for fast accessing -// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); - FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); - - // Empty transcript and exon dictionaries - transcriptDict.clear(); - exonDict.clear(); - logger.info("Parsing gtf..."); - GtfReader gtfReader = new GtfReader(gtfFile); - - // Gene->Transcript->Feature->GTF line - Map>> gtfMap = null; - if (flexibleGTFParsing) { - gtfMap = loadGTFMap(gtfReader); - initializePointers(gtfMap); - } - - Gtf gtf; - while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { - - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - String geneId = gtf.getAttributes().get("gene_id"); - String transcriptId = gtf.getAttributes().get("transcript_id"); - String geneName = gtf.getAttributes().get("gene_name"); - if (newGene(gene, geneId)) { - // If new geneId is different from the current then we must serialize before data new gene - if (gene != null) { - serializer.serialize(gene); - } - - GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), - indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), - indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); - - gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), - gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), - new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); - } - - // Check if Transcript exist in the 
Gene Set of transcripts - if (!transcriptDict.containsKey(transcriptId)) { - transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); - } else { - transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); - } - - // At this point gene and transcript objects are set up - // Update gene and transcript genomic coordinates, start must be the - // lower, and end the higher - updateTranscriptAndGeneCoords(transcript, gene, gtf); - - String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; - if (gtf.getFeature().equalsIgnoreCase("exon")) { - // Obtaining the exon sequence - String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); - String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); - - exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf - .getAttributes().get("exon_number")), exonSequence); - transcript.getExons().add(exon); - - exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); - if (gtf.getAttributes().get("exon_number").equals("1")) { - cdna = 1; - cds = 1; - } else { - // with every exon we update cDNA length with the previous exon length - cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() - - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; - } - } else { - exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); - if (gtf.getFeature().equalsIgnoreCase("CDS")) { - // Protein ID is only present in CDS lines - String proteinId = gtf.getAttributes().get("protein_id") != null - ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") - : ""; - transcript.setProteinId(proteinId); - transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); - - if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - - // cDNA coordinates - exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - } - // strand - - } else { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - // cDNA coordinates - // cdnaCodingStart points to the same base position than genomicCodingEnd - exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - // cdnaCodingStart points to the same base position than genomicCodingEnd - transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - } - } - - } -// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { -// // nothing to do -// System.out.println("Empty block, this should be redesigned"); -// } - if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { - // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, - // no need to set it at the beginning of next feature - if (exon.getStrand().equals("+")) { - updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingEnd(gtf.getEnd()); - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - transcript.setCdsLength(cds - 1); - - } else { - updateNegativeExonCodingData(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - transcript.setCdsLength(cds - 1); - } - } - } - } - - // last gene must be 
serialized - serializer.serialize(gene); - - // cleaning - gtfReader.close(); - serializer.close(); - fastaIndex.close(); - indexer.close(); - } catch (Exception e) { - indexer.close(); - throw e; - } - } - - private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) - throws IOException, RocksDBException { - Map gtfAttributes = gtf.getAttributes(); + // Check folders and files before building + check(); - // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. - String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); - String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; - String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); - List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); + // Build Ensembl genes + ensemblGeneBuilder.parse(); - List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); - TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); - - Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", - 0, 0, 0, 0, 0, - indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", - gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, - new HashSet<>(), transcriptAnnotation); - - // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL - // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure - // that the xrefs array contains all ids present in the GTF file - addGtfXrefs(transcript, gene, gtfAttributes); - - // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID - String hgncId = indexer.getHgncId(gene.getName()); - if (StringUtils.isNotEmpty(hgncId)) { - transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); - } - - // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("refseq", "refseq_protein")) { - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); - if (StringUtils.isNotEmpty(maneRefSeq)) { - transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, - "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); - } - } - - // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(lrgRefSeq)) { - transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); - } - - // Add Flags - // 1. GTF tags - String tags = gtf.getAttributes().get("tag"); - if (StringUtils.isNotEmpty(tags)) { - transcript.getFlags().addAll(Arrays.asList(tags.split(","))); - } - // 2. TSL - String supportLevel = gtfAttributes.get("transcript_support_level"); - if (StringUtils.isNotEmpty(supportLevel)) { - // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" - String truncatedSupportLevel = supportLevel.split(" ")[0]; - transcript.getFlags().add("TSL:" + truncatedSupportLevel); - } - // 3. MANE Flag - String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); - if (StringUtils.isNotEmpty(maneFlag)) { - transcript.getFlags().add(maneFlag); - } - // 4. 
LRG Flag - String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); - if (StringUtils.isNotEmpty(lrg)) { - transcript.getFlags().add("LRG"); + // Build RefSeq genes + if (!Files.exists(downloadPath.resolve(REFSEQ_DATA).resolve(REFSEQ_GENE_OUTPUT_FILENAME))) { + refSeqGeneBuilder.parse(); } else { - for (Xref xref : transcript.getXrefs()) { - if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { - transcript.getFlags().add("LRG"); - } - } - } - // 5. Ensembl Canonical - String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); - if (StringUtils.isNotEmpty(canonicalFlag)) { - transcript.getFlags().add(canonicalFlag); + logger.info(DATA_ALREADY_BUILT, getDataName(REFSEQ_DATA) + " gene"); } - // 6. TSO500 and EGLH HaemOnc - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(maneRefSeq)) { - String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } - } - - gene.getTranscripts().add(transcript); - - // Do not change order!! 
size()-1 is the index of the transcript ID - transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); - return transcript; - } - private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) - throws IOException, RocksDBException { - if (xrefs == null || indexer == null) { - return null; - } - List annotations = new ArrayList<>(); - for (Xref xref : xrefs) { - if (xref.getDbName().equals("uniprotkb_acc")) { - String key = xref.getId(); - if (key != null && indexer.getOntologyAnnotations(key) != null) { - annotations.addAll(indexer.getOntologyAnnotations(key)); - } - } - } - return annotations; + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } - private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. - exon.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + public static List getCommonDataSources(SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration) { + List dataList = new ArrayList<>(); - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingEnd() == 0) { - exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); + boolean isHSapiens = false; + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + isHSapiens = true; } - } - private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon 
length. - exon.setGenomicCodingEnd(gtf.getEnd()); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingStart() == 0) { - exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { + dataList.add(MANE_SELECT_DATA); } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { + dataList.add(LRG_DATA); } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_HOTSPOT_DATA); } - } - - private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { - if (transcript.getXrefs() == null) { - transcript.setXrefs(new ArrayList<>()); + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { + dataList.add(DGIDB_DATA); } - - transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); - transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); - - // Some non-coding genes do not have Gene names - if (StringUtils.isNotEmpty(gene.getName())) { - transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); - transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { + 
dataList.add(HPO_DISEASE_DATA); } - - if (gtfAttributes.get("ccds_id") != null) { - transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_GENE_CENSUS_DATA); } - } - - private void initializePointers(Map>> gtfMap) { - geneCounter = 0; - geneList = new ArrayList<>(gtfMap.keySet()); - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - transcriptName = transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - - private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { - // Flexible parsing is deactivated, return next line - if (gtfMap == null) { - return gtfReader.read(); - // Flexible parsing activated, carefully select next line to return - } else { - // No more genes/features to return - if (nextGtfToReturn == null) { - return null; - } - Gtf gtfToReturn = nextGtfToReturn; - if (feature.equals("exon")) { -// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { - nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) - .get(transcriptName).get("exon")).get(exonCounter)).getStart(), - ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), - (List) gtfMap.get(geneName).get(transcriptName).get("cds")); - if (nextGtfToReturn != null) { - feature = "cds"; - return gtfToReturn; - } - } - // if no cds was found for this exon, get next exon - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if (feature.equals("cds") || feature.equals("stop_codon")) { - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - 
if (feature.equals("start_codon")) { - feature = "stop_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); - return gtfToReturn; - } - // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon - throw new FileFormatException("Execution cannot reach this point"); + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + dataList.add(MIRTARBASE_DATA); } - } - - private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { - for (Object cdsObject : cdsList) { - int cdsStart = ((Gtf) cdsObject).getStart(); - int cdsEnd = ((Gtf) cdsObject).getEnd(); - if (cdsStart <= exonEnd && cdsEnd >= exonStart) { - return (Gtf) cdsObject; - } - } - return null; - } - - private void getFeatureFollowsExon(Map>> gtfMap) { - exonCounter++; - if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() - || feature.equals("stop_codon")) { - // If last returned feature was a stop_codon or no start_codon is provided for this transcript, - // next transcript must be selected - if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { - feature = "start_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); - } else { - transcriptCounter++; - // No more transcripts in this gene, check if there are more genes - if (transcriptCounter == gtfMap.get(geneName).size()) { - geneCounter++; - // No more genes available, end parsing - if (geneCounter == gtfMap.size()) { - nextGtfToReturn = null; - feature = null; - // Still more genes to parse, select next one - } else { - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - } - } - // Check if a new gene was selected - null would indicate there're no more genes - if (nextGtfToReturn != null) { - transcriptName = 
transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - } else { - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - - private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { - Map>> gtfMap = new HashMap<>(); - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene - String geneId = gtf.getAttributes().get("gene_id"); - // Transcript -> feature -> GTF line - Map> gtfMapGeneEntry; - if (gtfMap.containsKey(geneId)) { - gtfMapGeneEntry = gtfMap.get(geneId); - } else { - gtfMapGeneEntry = new HashMap(); - gtfMap.put(geneId, gtfMapGeneEntry); - } - - // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene - String transcriptId = gtf.getAttributes().get("transcript_id"); - Map gtfMapTranscriptEntry; - if (gtfMapGeneEntry.containsKey(transcriptId)) { - gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); - } else { - gtfMapTranscriptEntry = new HashMap(); - gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); - } - - addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); - - } - - // Exon number is mandatory for the parser to be able to properly generate the gene data model - if (!exonNumberPresent(gtfMap)) { - setExonNumber(gtfMap); - } - - return gtfMap; - } - - private boolean exonNumberPresent(Map>> gtfMap) { - Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); - return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) - 
.getAttributes().containsKey("exon_number"); - } - - private void setExonNumber(Map>> gtfMap) { - for (String gene : gtfMap.keySet()) { - for (String transcript : gtfMap.get(gene).keySet()) { - List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); - Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); - if (exonList.get(0).getStrand().equals("+")) { - int exonNumber = 1; - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber++; - } - } else { - int exonNumber = exonList.size(); - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber--; - } - } - } - } - } - - private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { - // Add exon/cds GTF line to the corresponding gene entry in the map - String featureType = gtf.getFeature().toLowerCase(); - if (featureType.equals("exon") || featureType.equals("cds")) { - List gtfList; - // Check if there were exons already stored - if (gtfMapTranscriptEntry.containsKey(featureType)) { - gtfList = (List) gtfMapTranscriptEntry.get(featureType); - } else { - gtfList = new ArrayList<>(); - gtfMapTranscriptEntry.put(featureType, gtfList); - } - gtfList.add(gtf); - // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" - // keys are already there - } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { - gtfMapTranscriptEntry.put(featureType, gtf); - } - } - - private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { - if (tabixReader == null) { - return null; - } - List transcriptTfbses = null; - - int transcriptStart = transcript.getStart(); - int transcriptEnd = transcript.getEnd(); - - - String line; - TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, transcriptEnd); - while ((line = 
iter.next()) != null) { - String[] elements = line.split("\t"); - - String sequenceName = elements[0]; - String source = elements[1]; - String feature = elements[2]; - int start = Integer.parseInt(elements[3]); - int end = Integer.parseInt(elements[4]); - String score = elements[5]; - String strand = elements[6]; - String frame = elements[7]; - String attribute = elements[8]; - - if (strand.equals(transcript.getStrand())) { - continue; - } - - if (transcript.getStrand().equals("+")) { - if (start > transcript.getStart() + 500) { - break; - } else if (end > transcript.getStart() - 2500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } else { - // transcript in negative strand - if (start > transcript.getEnd() + 2500) { - break; - } else if (start > transcript.getEnd() - 500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } - } - - return transcriptTfbses; - } - - protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, - List transcriptTfbses) { - if (transcriptTfbses == null) { - transcriptTfbses = new ArrayList<>(); - } - - // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; - // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB - String[] attributes = tfbs.getAttribute().split(";"); - - String id = null; - String pfmId = null; - List transciptionFactors = null; - - for (String attributePair : attributes) { - String[] attributePairArray = attributePair.split("="); - switch(attributePairArray[0]) { - case "binding_matrix_stable_id": - pfmId = attributePairArray[1]; - break; - case "stable_id": - id = attributePairArray[1]; - break; - case 
"transcription_factor_complex": - transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); - break; - default: - break; - } - } - - transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), - tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), - Float.parseFloat(tfbs.getScore()))); - return transcriptTfbses; - } - - private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { - Integer relativeStart; - if (transcript.getStrand().equals("+")) { - if (tfbs.getStart() < transcript.getStart()) { - relativeStart = tfbs.getStart() - transcript.getStart(); - } else { - relativeStart = tfbs.getStart() - transcript.getStart() + 1; - } - } else { - // negative strand transcript - if (tfbs.getEnd() > transcript.getEnd()) { - relativeStart = transcript.getEnd() - tfbs.getEnd(); - } else { - relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; - } - } - return relativeStart; - } - - private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { - Integer relativeEnd; - if (transcript.getStrand().equals("+")) { - if (tfbs.getEnd() < transcript.getStart()) { - relativeEnd = tfbs.getEnd() - transcript.getStart(); - } else { - relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; - } - } else { - if (tfbs.getStart() > transcript.getEnd()) { - relativeEnd = transcript.getEnd() - tfbs.getStart(); - } else { - relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; - } - } - return relativeEnd; - } - - - - private boolean newGene(Gene previousGene, String newGeneId) { - return previousGene == null || !newGeneId.equals(previousGene.getId()); - } - - private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { - if (transcript.getStart() > gtf.getStart()) { - transcript.setStart(gtf.getStart()); - } - if (transcript.getEnd() < gtf.getEnd()) { - transcript.setEnd(gtf.getEnd()); - } - 
if (gene.getStart() > gtf.getStart()) { - gene.setStart(gtf.getStart()); - } - if (gene.getEnd() < gtf.getEnd()) { - gene.setEnd(gtf.getEnd()); + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + dataList.add(MIRBASE_DATA); } - } - - private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } - private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { - proteinFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } - - private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { - cDnaFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } + return dataList; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 285236ba60..8db1ab315f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -27,6 +27,9 @@ import org.opencb.biodata.models.core.CancerHotspot; import org.opencb.biodata.models.core.CancerHotspotVariant; import org.opencb.biodata.models.core.GeneCancerAssociation; +import org.opencb.biodata.models.core.MirnaTarget; +import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; +import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; import org.opencb.commons.utils.FileUtils; import 
org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -37,31 +40,35 @@ import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HPO_DISEASE_DATA; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; + public class GeneBuilderIndexer { + public static final String ROCKSDB_FOLDER = "rocksdb.idx"; + protected RocksDB rocksdb; protected RocksDbManager rocksDbManager; protected Logger logger; protected String dbLocation; protected Options dbOption; - protected final String HGNC_ID_SUFFIX = "_hgncid"; - protected final String MANE_SUFFIX = "_mane"; - protected final String LRG_SUFFIX = "_lrg"; - protected final String CANCER_GENE_CENSUS_SUFFIX = "_cgc"; - protected final String CANCER_HOTSPOT_SUFFIX = "_chs"; - protected final String PROTEIN_SEQUENCE_SUFFIX = "_protein_fasta"; - protected final String CDNA_SEQUENCE_SUFFIX = "_cdna_fasta"; - protected final String DRUGS_SUFFIX = "_drug"; - protected final String DISEASE_SUFFIX = "_disease"; - protected final String MIRTARBASE_SUFFIX = "_mirtarbase"; - protected final String TSO500_SUFFIX = "_tso500"; - protected final String EGLH_HAEMONC_SUFFIX = "_eglh_haemonc"; + protected static final String HGNC_ID_SUFFIX = "_hgncid"; + protected static final String MANE_SUFFIX = "_mane"; + protected static final String LRG_SUFFIX = "_lrg"; + protected static final String CANCER_GENE_CENSUS_SUFFIX = "_cgc"; + protected static final String CANCER_HOTSPOT_SUFFIX = "_chs"; + protected static final String PROTEIN_SEQUENCE_SUFFIX = "_protein_fasta"; + protected static final String CDNA_SEQUENCE_SUFFIX = "_cdna_fasta"; + protected static final String DRUGS_SUFFIX = 
"_drug"; + protected static final String DISEASE_SUFFIX = "_disease"; + protected static final String MIRTARBASE_SUFFIX = "_mirtarbase"; public GeneBuilderIndexer(Path genePath) { this.init(genePath); @@ -69,7 +76,7 @@ public GeneBuilderIndexer(Path genePath) { private void init(Path genePath) { rocksDbManager = new RocksDbManager(); - dbLocation = genePath.resolve("integration.idx").toString(); + dbLocation = genePath.resolve(ROCKSDB_FOLDER).toString(); rocksdb = rocksDbManager.getDBConnection(dbLocation); dbOption = new Options().setCreateIfMissing(true); @@ -77,18 +84,14 @@ private void init(Path genePath) { } protected void indexCdnaSequences(Path cDnaFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading RefSeq's cDNA sequences..."); - FileUtils.checkPath(cDnaFastaFile); - if (Files.size(cDnaFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(cDnaFastaFile); + logger.info(PARSING_LOG_MESSAGE, cDnaFastaFile); + try (FastaReader fastaReader = new FastaReader(cDnaFastaFile)) { Fasta fasta; while ((fasta = fastaReader.read()) != null) { rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); } - fastaReader.close(); - } else { - logger.warn("RefSeq's cDNA sequences not loaded"); } + logger.info(PARSING_DONE_LOG_MESSAGE, cDnaFastaFile); } public String getCdnaFasta(String id) throws RocksDBException { @@ -96,18 +99,18 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading ENSEMBL's protein sequences..."); - FileUtils.checkPath(proteinFastaFile); - if (Files.size(proteinFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(proteinFastaFile); + if (proteinFastaFile == null) { + return; + } + + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); + try (FastaReader fastaReader = new FastaReader(proteinFastaFile)) { Fasta fasta; 
while ((fasta = fastaReader.read()) != null) { rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); } - fastaReader.close(); - } else { - logger.warn("ENSEMBL's protein sequences not loaded"); } + logger.info(PARSING_DONE_LOG_MESSAGE, proteinFastaFile); } protected String getProteinFasta(String id) throws RocksDBException { @@ -115,22 +118,22 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { - // #hgnc_id symbol name locus_group locus_type status location location_sortable ... - logger.info("Indexing HGNC ID mapping data ..."); - - // We only need the first two columns: hgnc_id -> symbol - if (hgncMappingFile != null && Files.exists(hgncMappingFile) && Files.size(hgncMappingFile) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); - line = bufferedReader.readLine(); - } + if (hgncMappingFile == null) { + return; + } + + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { + String line = bufferedReader.readLine(); + // We only need the first two columns: hgnc_id -> symbol + // #hgnc_id symbol name locus_group locus_type status location location_sortable ... 
+ while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); + line = bufferedReader.readLine(); } - } else { - logger.warn("HGNC ID mapping file " + hgncMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, hgncMappingFile); } public String getHgncId(String id) throws RocksDBException { @@ -138,29 +141,29 @@ public String getHgncId(String id) throws RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + if (maneMappingFile == null) { + return; + } + + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); + int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 7 : 5; + // #NCBI_GeneID Ensembl_Gene HGNC_ID symbol name RefSeq_nuc RefSeq_prot Ensembl_nuc Ensembl_prot // MANE_status GRCh38_chr chr_start chr_end chr_strand - logger.info("Indexing MANE mapping data ..."); - - if (maneMappingFile != null && Files.exists(maneMappingFile) && Files.size(maneMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
7 : 5; -// BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); - line = bufferedReader.readLine(); - } + line = bufferedReader.readLine(); } - } else { - logger.warn("MANE mapping file " + maneMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, maneMappingFile); } public String getMane(String id, String field) throws RocksDBException { @@ -168,30 +171,31 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + if (lrgMappingFile == null) { + return; + } + + 
logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); + // # Last modified: 30-03-2021@22:00:06 // # LRG HGNC_SYMBOL REFSEQ_GENOMIC LRG_TRANSCRIPT REFSEQ_TRANSCRIPT ENSEMBL_TRANSCRIPT CCDS // LRG_1 COL1A1 NG_007400.1 t1 NM_000088.3 ENST00000225964.10 CCDS11561.1 - logger.info("Indexing LRG mapping data ..."); - - if (lrgMappingFile != null && Files.exists(lrgMappingFile) && Files.size(lrgMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - String id = fields[idColumn]; - if (StringUtils.isNotEmpty(id) && !id.equals("-")) { - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); - } + int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
5 : 4; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + String id = fields[idColumn]; + if (StringUtils.isNotEmpty(id) && !id.equals("-")) { + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("LRG mapping file " + lrgMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, lrgMappingFile); } public String getLrg(String id, String field) throws RocksDBException { @@ -199,6 +203,12 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + if (cgcFile == null) { + return; + } + + logger.info(PARSING_LOG_MESSAGE, cgcFile); + Map tissuesMap = new HashMap<>(); tissuesMap.put("E", "epithelial"); tissuesMap.put("L", "leukaemia/lymphoma"); @@ -224,22 +234,20 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx mutationTypesMap.put("Mis", "missense"); mutationTypesMap.put("PromoterMis", "missense"); - logger.info("Indexing CANCER GENE CENSUS data ..."); - if (cgcFile != null && Files.exists(cgcFile) && Files.size(cgcFile) > 0) { + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile)) { // Skip the first header line - BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile); - bufferedReader.readLine(); + String line = bufferedReader.readLine(); GeneCancerAssociation cancerGeneAssociation; - String line; + while ((line = bufferedReader.readLine()) != null) { String[] fields = line.split("\t", -1); // Find Ensembl Gene Id in the last comma-separated column List synonyms = 
StringUtils.isNotEmpty(fields[19]) ? Arrays.stream(fields[19] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replace("\"", "") + .replace(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -254,54 +262,55 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx boolean somatic = StringUtils.isNotEmpty(fields[7]) && fields[7].equalsIgnoreCase("yes"); boolean germline = StringUtils.isNotEmpty(fields[8]) && fields[8].equalsIgnoreCase("yes"); List somaticTumourTypes = StringUtils.isNotEmpty(fields[9]) - ? Arrays.asList(fields[9].replaceAll("\"", "").split(", ")) + ? Arrays.asList(fields[9].replace("\"", "").split(", ")) : new ArrayList<>(); List germlineTumourTypes = StringUtils.isNotEmpty(fields[10]) - ? Arrays.asList(fields[10].replaceAll("\"", "").split(", ")) + ? Arrays.asList(fields[10].replace("\"", "").split(", ")) : Collections.emptyList(); List syndromes = StringUtils.isNotEmpty(fields[11]) - ? Arrays.asList(fields[11].replaceAll("\"", "").split("; ")) + ? Arrays.asList(fields[11].replace("\"", "").split("; ")) : Collections.emptyList(); List tissues = StringUtils.isNotEmpty(fields[12]) ? Arrays.stream(fields[12] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replace("\"", "") + .replace(" ", "") + .split(",")) .map(tissuesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); - List modeOfInheritance = StringUtils.isNotEmpty(fields[13]) - ? fields[13].equalsIgnoreCase("Dom/Rec") + List modeOfInheritance = Collections.emptyList(); + if (StringUtils.isNotEmpty(fields[13])) { + modeOfInheritance = fields[13].equalsIgnoreCase("Dom/Rec") ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) - : Collections.singletonList(moiMap.get(fields[13])) - : Collections.emptyList(); + : Collections.singletonList(moiMap.get(fields[13])); + } List roleInCancer = StringUtils.isNotEmpty(fields[14]) ? 
Arrays.stream(fields[14] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replace("\"", "") + .replace(" ", "") + .split(",")) .map(roleInCancerMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List mutationTypes = StringUtils.isNotEmpty(fields[15]) ? Arrays.stream(fields[15] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replace("\"", "") + .replace(" ", "") + .split(",")) .map(mutationTypesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List translocationPartners = StringUtils.isNotEmpty(fields[16]) ? Arrays.stream(fields[16] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replace("\"", "") + .replace(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); List otherSyndromes = StringUtils.isNotEmpty(fields[18]) ? Arrays.stream(fields[18] - .replaceAll("\"", "") - .split("; ")) + .replace("\"", "") + .split("; ")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -312,10 +321,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx rocksDbManager.update(rocksdb, fields[0] + CANCER_GENE_CENSUS_SUFFIX, cancerGeneAssociation); } } - bufferedReader.close(); - } else { - logger.warn("CANCER GENE CENSUS file " + cgcFile + " not found"); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cgcFile); } public List getCancerGeneCensus(String geneName) throws RocksDBException, IOException { @@ -324,97 +332,106 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + if (cancerHotspot == null) { + return; + } + + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); + // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key Map> visited = new HashMap<>(); - FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - 
HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - iterator.next(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - String geneName = currentRow.getCell(0).toString(); - - if (currentRow.getCell(1).toString().contains("splice")) { - continue; - } - int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - - CancerHotspot ch = null; - // Check if ch object already exist - if (visited.containsKey(geneName)) { - for (CancerHotspot hotspot : visited.get(geneName)) { - if (hotspot.getAminoacidPosition() == aminoAcidPosition) { - ch = hotspot; - break; - } - } - } - // If not exist we create new ch - if (ch == null) { - ch = new CancerHotspot(); - ch.setScores(new HashMap<>()); - ch.setCancerTypeCount(new HashMap<>()); - ch.setOrganCount(new HashMap<>()); - ch.setVariants(new ArrayList<>()); - - // Parse new row - ch.setGeneName(geneName); - ch.setAminoacidPosition(aminoAcidPosition); - ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); - ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); - - String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); - for (String cancerCount : cancerCountSplit) { - String[] split = cancerCount.split(":"); - ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + try (FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile())) { + HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); + HSSFSheet sheet = workbook.getSheetAt(0); + Iterator iterator = sheet.iterator(); + iterator.next(); + while (iterator.hasNext()) { + Row currentRow = iterator.next(); + String geneName = currentRow.getCell(0).toString(); + + if (currentRow.getCell(1).toString().contains("splice")) { + continue; } + int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); - for (String 
organCount : organCountSplit) { - String[] split = organCount.split(":"); - ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + CancerHotspot ch = null; + // Check if ch object already exist + if (visited.containsKey(geneName)) { + for (CancerHotspot hotspot : visited.get(geneName)) { + if (hotspot.getAminoacidPosition() == aminoAcidPosition) { + ch = hotspot; + break; + } + } } - ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); - ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); - ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); - ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); - ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); - ch.setAminoacidReference(currentRow.getCell(35).toString()); - ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); - ch.setCancerType(currentRow.getCell(37).toString()); + // If not exist we create new ch + if (ch == null) { + ch = new CancerHotspot(); + ch.setScores(new HashMap<>()); + ch.setCancerTypeCount(new HashMap<>()); + ch.setOrganCount(new HashMap<>()); + ch.setVariants(new ArrayList<>()); + + // Parse new row + ch.setGeneName(geneName); + ch.setAminoacidPosition(aminoAcidPosition); + ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); + ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); + + String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); + for (String cancerCount : cancerCountSplit) { + String[] split = cancerCount.split(":"); + ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + } + + String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); + for (String organCount : organCountSplit) { + String[] split = organCount.split(":"); + ch.getOrganCount().put(split[0], 
Integer.parseInt(split[2])); + } - if (visited.containsKey(geneName)) { - // Gene exists but no this aminoacid position - visited.get(geneName).add(ch); - } else { - // New gene found - visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); + ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); + ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); + ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); + ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); + ch.setAminoacidReference(currentRow.getCell(35).toString()); + ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); + ch.setCancerType(currentRow.getCell(37).toString()); + + if (visited.containsKey(geneName)) { + // Gene exists but no this aminoacid position + visited.get(geneName).add(ch); + } else { + // New gene found + visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + } } - } - // Add cancer hotspot variant information - CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); - cancerHotspotVariant.setSampleCount(new HashMap<>()); + // Add cancer hotspot variant information + CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); + cancerHotspotVariant.setSampleCount(new HashMap<>()); - String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); - cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); - cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); + String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); + cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); + cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); - String[] sampleSplit = 
currentRow.getCell(38).toString().split("\\|"); - for (String sampleCount : sampleSplit) { - String[] sampleCountSplit = sampleCount.split(":"); - cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); + for (String sampleCount : sampleSplit) { + String[] sampleCountSplit = sampleCount.split(":"); + cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + } + ch.getVariants().add(cancerHotspotVariant); } - ch.getVariants().add(cancerHotspotVariant); } - fileInputStream.close(); - for (String geneName : visited.keySet()) { - rocksDbManager.update(rocksdb, geneName + CANCER_HOTSPOT_SUFFIX, visited.get(geneName)); + for (Map.Entry> entry : visited.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + CANCER_HOTSPOT_SUFFIX, entry.getValue()); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cancerHotspot); } public List getCancerHotspot(String geneName) throws RocksDBException, IOException { @@ -422,92 +439,158 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } + private String getIndexEntry(String id, String suffix) throws RocksDBException { + return getIndexEntry(id, suffix, ""); + } - protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { - // Gene Ref Seq - // FAS NM_000043 - // AR NM_000044 - logger.info("Indexing TSO500 data ..."); - - if (tso500Path != null && Files.exists(tso500Path) && Files.size(tso500Path) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); - } - } - line = bufferedReader.readLine(); 
- } - } - } else { - logger.warn("Ensembl TSO500 mapping file " + tso500Path + " not found"); + private String getIndexEntry(String id, String suffix, String field) throws RocksDBException { + String key = id + suffix; + if (StringUtils.isNotEmpty(field)) { + key += "_" + field; + } + byte[] value = rocksdb.get(key.getBytes()); + if (value != null) { + return new String(value); } + return null; } - public String getTSO500(String transcriptId) throws RocksDBException { - String key = transcriptId + TSO500_SUFFIX; - byte[] bytes = rocksdb.get(key.getBytes()); - if (bytes == null) { - return null; - } - return new String(bytes); + protected void close() throws IOException { + rocksDbManager.closeIndex(rocksdb, dbOption, dbLocation); } + protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + if (geneDrugFile == null) { + return; + } - protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { - // Gene Ref Seq - // GNB1 NM_002074.4 - // CSF3R NM_000760.3 - logger.info("Indexing EGLH HaemOnc data ..."); - - if (eglhHaemOncPath != null && Files.exists(eglhHaemOncPath) && Files.size(eglhHaemOncPath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); - } - } - line = bufferedReader.readLine(); + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); + + String currentGene = ""; + List drugs = new ArrayList<>(); + + try (BufferedReader br = FileUtils.newBufferedReader(geneDrugFile)) { + // Skip header + String line = br.readLine(); + + while ((line = br.readLine()) != null) { + String[] parts = line.split("\t"); + String geneName = parts[0]; + if (currentGene.equals("")) { + currentGene = 
geneName; + } else if (!currentGene.equals(geneName)) { + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + drugs = new ArrayList<>(); + currentGene = geneName; + } + + String source = null; + if (parts.length >= 4) { + source = parts[3]; + } + + String interactionType = null; + if (parts.length >= 5) { + interactionType = parts[4]; } + + String drugName = null; + if (parts.length >= 8) { + // if drug name column is empty, use drug claim name instead + drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; + } + if (StringUtils.isEmpty(drugName)) { + // no drug name + continue; + } + + String chemblId = null; + if (parts.length >= 9) { + chemblId = parts[8]; + } + + List publications = new ArrayList<>(); + if (parts.length >= 10 && parts[9] != null) { + publications = Arrays.asList(parts[9].split(",")); + } + + GeneDrugInteraction drug = new GeneDrugInteraction( + geneName, drugName, source, null, null, interactionType, chemblId, publications); + drugs.add(drug); } - } else { - logger.warn("Ensembl EGLH HaemOnc mapping file " + eglhHaemOncPath + " not found"); } + // update last gene + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + + logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); } - public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { - String key = transcriptId + EGLH_HAEMONC_SUFFIX; - byte[] bytes = rocksdb.get(key.getBytes()); - if (bytes == null) { - return null; + protected void indexDiseases(Path hpoFilePath) throws IOException, RocksDBException { + if (hpoFilePath == null) { + return; } - return new String(bytes); - } - private String getIndexEntry(String id, String suffix) throws RocksDBException { - return getIndexEntry(id, suffix, ""); + Map> geneDiseaseAssociationMap = new HashMap<>(50000); + + String line; + + // HPO + logger.info(PARSING_LOG_MESSAGE, hpoFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { + // Skip first header line 
+ line = bufferedReader.readLine(); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); + String omimId = fields[6]; + String geneSymbol = fields[3]; + String hpoId = fields[0]; + String diseaseName = fields[1]; + GeneTraitAssociation disease = + new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DISEASE_DATA); + addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE); + + for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); + } } - private String getIndexEntry(String id, String suffix, String field) throws RocksDBException { - String key = id + suffix; - if (StringUtils.isNotEmpty(field)) { - key += "_" + field; + protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { + if (miRTarBaseFile == null) { + return; } - byte[] value = rocksdb.get(key.getBytes()); - if (value != null) { - return new String(value); + + MiRTarBaseIndexer miRTarBaseIndexer = new MiRTarBaseIndexer(); + Map> result = miRTarBaseIndexer.index(miRTarBaseFile); + for (Map.Entry> entry : result.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); } - return null; } - protected void close() throws IOException { - rocksDbManager.closeIndex(rocksdb, dbOption, dbLocation); + public static void addValueToMapElement(Map> map, String key, T value) { + if (map.containsKey(key)) { + map.get(key).add(value); + } else { + List valueList = new ArrayList<>(); + valueList.add(value); + map.put(key, valueList); + } + } + + protected List getDrugs(String id) throws RocksDBException, IOException { + String key = id + DRUGS_SUFFIX; + return rocksDbManager.getDrugs(rocksdb, key); } + protected List getDiseases(String id) throws RocksDBException, IOException { + String key = id + 
DISEASE_SUFFIX; + return rocksDbManager.getDiseases(rocksdb, key); + } + + protected List getMirnaTargets(String geneName) throws RocksDBException, IOException { + String key = geneName + MIRTARBASE_SUFFIX; + return rocksDbManager.getMirnaTargets(rocksdb, key); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java deleted file mode 100644 index 16dbbc9a3c..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderUtils.java +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Created by imedina on 12/11/15. - */ -@Deprecated -public class GeneBuilderUtils { - - private static Logger logger = LoggerFactory.getLogger(GeneBuilderUtils.class); - -// @Deprecated -// public static Map> getTfbsMap(Path tfbsFile) throws IOException, NoSuchMethodException, FileFormatException { -// Map> tfbsMap = new HashMap<>(); -// if (tfbsFile != null && Files.exists(tfbsFile) && !Files.isDirectory(tfbsFile) && Files.size(tfbsFile) > 0) { -// Gff2Reader motifsFeatureReader = new Gff2Reader(tfbsFile); -// Gff2 tfbsMotifFeature; -// while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { -// // we only want high quality data. 
See issue 466 -// if (!tfbsMotifFeature.getAttribute().contains("experimental_evidence")) { -// continue; -// } -// String chromosome = tfbsMotifFeature.getSequenceName().replaceFirst("chr", ""); -// SortedSet chromosomeTfbsSet = tfbsMap.get(chromosome); -// if (chromosomeTfbsSet == null) { -// chromosomeTfbsSet = new TreeSet<>((Comparator) (feature1, feature2) -> { -// // TODO: maybe this should be in TranscriptTfbs class, and equals method should be overriden too -// if (feature1.getStart() != feature2.getStart()) { -// return feature1.getStart() - feature2.getStart(); -// } else { -// return feature1.getAttribute().compareTo(feature2.getAttribute()); -// } -// }); -// tfbsMap.put(chromosome, chromosomeTfbsSet); -// } -// chromosomeTfbsSet.add(tfbsMotifFeature); -// } -// motifsFeatureReader.close(); -// } -// return tfbsMap; -// } - -// public static Map> getXrefMap(Path xrefsFile, Path uniprotIdMappingFile) throws IOException { -// Map> xrefMap = new HashMap<>(); -// logger.info("Loading xref data..."); -// String[] fields; -// if (xrefsFile != null && Files.exists(xrefsFile) && Files.size(xrefsFile) > 0) { -// List lines = Files.readAllLines(xrefsFile, Charset.forName("ISO-8859-1")); -// for (String line : lines) { -// fields = line.split("\t", -1); -// if (fields.length >= 4) { -// if (!xrefMap.containsKey(fields[0])) { -// xrefMap.put(fields[0], new ArrayList<>()); -// } -// xrefMap.get(fields[0]).add(new Xref(fields[1], fields[2], fields[3])); -// } -// } -// } else { -// logger.warn("Xrefs file " + xrefsFile + " not found"); -// logger.warn("Xref data not loaded"); -// } -// -// logger.info("Loading protein mapping into xref data..."); -// if (uniprotIdMappingFile != null && Files.exists(uniprotIdMappingFile) && Files.size(uniprotIdMappingFile) > 0) { -// BufferedReader br = FileUtils.newBufferedReader(uniprotIdMappingFile); -// String line; -// while ((line = br.readLine()) != null) { -// fields = line.split("\t", -1); -// if (fields.length >= 19 && 
fields[19].startsWith("ENST")) { -// String[] transcripts = fields[19].split("; "); -// for (String transcript : transcripts) { -// if (!xrefMap.containsKey(transcript)) { -// xrefMap.put(transcript, new ArrayList()); -// } -// xrefMap.get(transcript).add(new Xref(fields[0], "uniprotkb_acc", "UniProtKB ACC")); -// xrefMap.get(transcript).add(new Xref(fields[1], "uniprotkb_id", "UniProtKB ID")); -// } -// } -// } -// br.close(); -// } else { -// logger.warn("Uniprot if mapping file " + uniprotIdMappingFile + " not found"); -// logger.warn("Protein mapping into xref data not loaded"); -// } -// -// return xrefMap; -// } - -// public static Map> getGeneDrugMap(Path geneDrugFile) throws IOException { -// Map> geneDrugMap = new HashMap<>(); -// if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { -// logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); -// BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); -// -// // Skip header -// br.readLine(); -// -// int lineCounter = 1; -// String line; -// while ((line = br.readLine()) != null) { -// String[] parts = line.split("\t"); -// String geneName = parts[0]; -// -// String source = null; -// if (parts.length >= 4) { -// source = parts[3]; -// } -// -// String interactionType = null; -// if (parts.length >= 5) { -// interactionType = parts[4]; -// } -// -// String drugName = null; -// if (parts.length >= 8) { -// // if drug name column is empty, use drug claim name instead -// drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; -// } -// if (StringUtils.isEmpty(drugName)) { -// // no drug name -// continue; -// } -// -// String chemblId = null; -// if (parts.length >= 9) { -// chemblId = parts[8]; -// } -// -// List publications = new ArrayList<>(); -// if (parts.length >= 10 && parts[9] != null) { -// publications = Arrays.asList(parts[9].split(",")); -// } -// -// //addValueToMapElement(geneDrugMap, geneName, new GeneDrugInteraction(geneName, drugName, source, null, interactionType)); -// // TODO update model to add new attributes -// addValueToMapElement(geneDrugMap, geneName, new GeneDrugInteraction(geneName, drugName, source, null, null, -// interactionType, chemblId, publications)); -// lineCounter++; -// } -// -// br.close(); -// } else { -// logger.warn("Gene drug file " + geneDrugFile + " not found"); -// logger.warn("Ignoring " + geneDrugFile); -// } -// -// return geneDrugMap; -// } - - -// -// public static Map> getGeneDiseaseAssociationMap(Path hpoFilePath, Path disgenetFilePath) -// throws IOException { -// Map> geneDiseaseAssociationMap = new HashMap<>(50000); -// -// String line; -// if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { -// BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); -// // skip first header line -// bufferedReader.readLine(); -// while ((line = bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String omimId = fields[6]; -// String geneSymbol = fields[3]; -// String hpoId = fields[0]; -// String diseaseName = fields[1]; -// GeneTraitAssociation disease = -// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); -// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); -// } -// bufferedReader.close(); -// } -// -// if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { -// BufferedReader bufferedReader = 
FileUtils.newBufferedReader(disgenetFilePath); -// // skip first header line -// bufferedReader.readLine(); -// while ((line = bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String diseaseId = fields[4]; -// String diseaseName = fields[5]; -// String score = fields[9]; -// String numberOfPubmeds = fields[13].trim(); -// String numberOfSNPs = fields[14]; -// String source = fields[15]; -// GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), -// Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); -// addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); -// } -// bufferedReader.close(); -// } -// -// return geneDiseaseAssociationMap; -// } -// -// /** -// * For a gnomad file, parse and return a map of transcript to constraints. -// * -// * @param gnomadFile gene annotation file path -// * @return map of transcript to constraints -// * @throws IOException if goa file can't be read -// */ -// public static Map> getConstraints(Path gnomadFile) throws IOException { -// Map> transcriptConstraints = new HashMap<>(); -// -// if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { -// logger.info("Loading OE scores from '{}'", gnomadFile); -//// BufferedReader br = FileUtils.newBufferedReader(gnomadFile); -// InputStream inputStream = Files.newInputStream(gnomadFile); -// BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(inputStream))); -// // Skip header. 
-// br.readLine(); -// String line; -// while ((line = br.readLine()) != null) { -// String[] parts = line.split("\t"); -// String transcriptIdentifier = parts[1]; -// String canonical = parts[2]; -// String oeMis = parts[5]; -// String oeSyn = parts[14]; -// String oeLof = parts[24]; -// String exacPLI = parts[70]; -// String exacLof = parts[73]; -// String geneIdentifier = parts[64]; -// -// List constraints = new ArrayList<>(); -// addConstraint(constraints, "oe_mis", oeMis); -// addConstraint(constraints, "oe_syn", oeSyn); -// addConstraint(constraints, "oe_lof", oeLof); -// addConstraint(constraints, "exac_pLI", exacPLI); -// addConstraint(constraints, "exac_oe_lof", exacLof); -// transcriptConstraints.put(transcriptIdentifier, constraints); -// -// if ("TRUE".equalsIgnoreCase(canonical)) { -// transcriptConstraints.put(geneIdentifier, constraints); -// } -// } -// br.close(); -// } -// return transcriptConstraints; -// } -// -// private static void addConstraint(List constraints, String name, String value) { -// Constraint constraint = new Constraint(); -// constraint.setMethod("pLoF"); -// constraint.setSource("gnomAD"); -// constraint.setName(name); -// try { -// constraint.setValue(Double.parseDouble(value)); -// } catch (NumberFormatException e) { -// // invalid number (e.g. NA), discard. -// return; -// } -// constraints.add(constraint); -// } -// -// /** -// * For a gene annotation file, parse and return a map of proteins to ontology annotation objects. -// * -// * @param goaFile gene annotation file path -// * @return map of proteins to ontology annotation objects. 
-// * @throws IOException if goa file can't be read -// */ -// public static Map> getOntologyAnnotations(Path goaFile) throws IOException { -// Map> annotations = new HashMap<>(); -// if (goaFile != null && Files.exists(goaFile) && Files.size(goaFile) > 0) { -// logger.info("Loading GO annotation from '{}'", goaFile); -// BufferedReader br = FileUtils.newBufferedReader(goaFile); -// GafParser parser = new GafParser(); -// annotations = parser.parseGaf(br); -// } -// return annotations; -// } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java index 7428cd5fbf..e5f6449051 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneExpressionAtlasBuilder.java @@ -31,7 +31,7 @@ /** * Created by antonior on 10/16/14. */ -public class GeneExpressionAtlasBuilder extends CellBaseBuilder { +public class GeneExpressionAtlasBuilder extends AbstractBuilder { private Path geneAtlasDirectoryPath; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java index 5bb232f5d2..521c5f3a71 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GenomeSequenceFastaBuilder.java @@ -16,7 +16,9 @@ package org.opencb.cellbase.lib.builders; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.GenomeSequenceChunk; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; @@ -24,11 +26,13 @@ import java.io.IOException; import 
java.nio.file.Path; -public class GenomeSequenceFastaBuilder extends CellBaseBuilder { +public class GenomeSequenceFastaBuilder extends AbstractBuilder { private Path genomeReferenceFastaFile; private static final int CHUNK_SIZE = 2000; + public static final String GENOME_JSON_BASENAME = "genome"; + public static final String GENOME_JSON_FILENAME = GENOME_JSON_BASENAME + ".json.gz"; public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSerializer serializer) { super(serializer); @@ -36,9 +40,10 @@ public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSeriali } @Override - public void parse() { + public void parse() throws CellBaseException { + logger.info(PARSING_LOG_MESSAGE, genomeReferenceFastaFile); - try { + try (BufferedReader br = FileUtils.newBufferedReader(genomeReferenceFastaFile)) { String sequenceName = null; String sequenceType = ""; String sequenceAssembly = null; @@ -46,8 +51,7 @@ public void parse() { StringBuilder sequenceStringBuilder = new StringBuilder(); // Preparing input and output files - BufferedReader br; - br = FileUtils.newBufferedReader(genomeReferenceFastaFile); + while ((line = br.readLine()) != null) { @@ -55,11 +59,9 @@ public void parse() { sequenceStringBuilder.append(line); } else { // new chromosome, save data - if (sequenceStringBuilder.length() > 0) { - if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { - System.out.println(sequenceName); - serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); - } + if (sequenceStringBuilder.length() > 0 && StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH") + && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { + serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); } // initialize data structures @@ -75,18 +77,17 @@ public void parse() { } } // Last 
chromosome must be processed - if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) { + if (StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") + && !sequenceName.contains("contig")) { serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); } - - br.close(); } catch (IOException e) { e.printStackTrace(); } + logger.info(PARSING_DONE_LOG_MESSAGE); } - private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence) - throws IOException { + private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence) { int chunk = 0; int start = 1; int end = CHUNK_SIZE - 1; @@ -100,11 +101,10 @@ private void serializeGenomeSequence(String chromosome, String sequenceType, Str genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + 0 + "_" + chunkIdSuffix, start, sequence.length() - 1, sequenceType, sequenceAssembly, chunkSequence); serializer.serialize(genomeSequenceChunk); - start += CHUNK_SIZE - 1; } else { while (start < sequence.length()) { if (chunk % 10000 == 0) { - System.out.println("Chr:" + chromosome + " chunkId:" + chunk); + logger.info("Chr: {}, chunkId: {}", chromosome, chunk); } // First chunk of the chromosome if (start == 1) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java index 5fcc68c206..5bc18dba17 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/InteractionBuilder.java @@ -33,7 +33,7 @@ * Time: 4:43 PM * To change this template use File | Settings | File Templates. 
*/ -public class InteractionBuilder extends CellBaseBuilder { +public class InteractionBuilder extends AbstractBuilder { private final String species; private final Path psimiTabFile; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java new file mode 100644 index 0000000000..01701362eb --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java @@ -0,0 +1,148 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import org.apache.commons.lang3.StringUtils; +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.models.core.MirnaTarget; +import org.opencb.biodata.models.core.TargetGene; +import org.opencb.commons.utils.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.math.BigDecimal; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.opencb.cellbase.lib.EtlCommons.MIRTARBASE_DATA; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; + +public class MiRTarBaseIndexer { + + protected Logger logger; + + public MiRTarBaseIndexer() { + logger = LoggerFactory.getLogger(this.getClass()); + } + + public Map> index(Path miRTarBaseFile) throws IOException { + FileUtils.checkFile(miRTarBaseFile); + + logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); + + Map> geneToMirna = new HashMap<>(); + + try (InputStream fis = new FileInputStream(miRTarBaseFile.toFile()); + Workbook workbook = new XSSFWorkbook(fis)) { + + // Get the first sheet + Sheet sheet = workbook.getSheetAt(0); + + String currentMiRTarBaseId = null; + String currentMiRNA = null; + String currentGene = null; + List targetGenes = new ArrayList<>(); + + for (int rowNum = sheet.getFirstRowNum() + 1; rowNum <= sheet.getLastRowNum(); rowNum++) { + Row row = sheet.getRow(rowNum); + + // Sanity check + if (row.getPhysicalNumberOfCells() != 9) { + logger.warn("Error parsing line {}: invalid number of columns {} (expected 9 columns). 
", + rowNum + 1, row.getPhysicalNumberOfCells()); + continue; + } + + if (row.getCell(0).getCellType() != CellType.STRING || row.getCell(0).getStringCellValue() == null + || row.getCell(1).getCellType() != CellType.STRING || row.getCell(1).getStringCellValue() == null + || row.getCell(3).getCellType() != CellType.STRING || row.getCell(3).getStringCellValue() == null) { + logger.warn("Error parsing line {}: mandatory fields(miRTarBase ID, miRNA, Target Gene) are empty or wrong cell type.", + rowNum + 1); + continue; + } + + // #0: miRTarBase ID + Cell cell = row.getCell(0); + String miRTarBaseId = cell.getStringCellValue(); + if (currentMiRTarBaseId == null) { + currentMiRTarBaseId = miRTarBaseId; + } + + // #1: miRNA + cell = row.getCell(1); + String miRNA = cell.getStringCellValue(); + if (currentMiRNA == null) { + currentMiRNA = miRNA; + } + + // #2: Species (miRNA) + + // #3: Target Gene + cell = row.getCell(3); + String geneName = cell.getStringCellValue(); + if (currentGene == null) { + currentGene = geneName; + } + + // #4: Target Gene (Entrez ID) + // #5: Species (Target Gene) + + if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { + // new entry, store current one + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + targetGenes = new ArrayList<>(); + currentGene = geneName; + currentMiRTarBaseId = miRTarBaseId; + currentMiRNA = miRNA; + } + + // #6: Experiments + cell = row.getCell(6); + String experiment = (cell.getCellType() == CellType.STRING ? cell.getStringCellValue() : null); + + // #7: Support Type + cell = row.getCell(7); + String supportType = (cell.getCellType() == CellType.STRING ?
cell.getStringCellValue() : null); + + // #8: pubmed + cell = row.getCell(8); + String pubmed = new BigDecimal(cell.getNumericCellValue()).toString(); + + if (StringUtils.isNotEmpty(experiment) || StringUtils.isNotEmpty(supportType) || StringUtils.isNotEmpty(pubmed)) { + targetGenes.add(new TargetGene(experiment, supportType, pubmed)); + } + } + + // parse last entry + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + + } + logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); + + return geneToMirna; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..9273c451f5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -19,60 +19,84 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; import java.nio.file.Path; +import java.util.Collections; import java.util.List; -public class OntologyBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private Path hpoFile; - private Path goFile; - private Path doidFile; - private Path mondoFile; +public class OntologyBuilder extends AbstractBuilder { - public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { + private Path oboDownloadPath; + private 
SpeciesConfiguration speciesConfiguration; + + public static final String OBO_OUTPUT_BASENAME = "ontology"; + public static final String OBO_OUTPUT_FILENAME = OBO_OUTPUT_BASENAME + ".json.gz"; + + public OntologyBuilder(Path oboDownloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { super(serializer); - hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); - goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); - doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + + this.oboDownloadPath = oboDownloadPath; + this.speciesConfiguration = speciesConfiguration; } @Override public void parse() throws Exception { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFile); - OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, "Human Phenotype Ontology"); - for (OntologyTerm term : terms) { - term.setSource("HP"); - serializer.serialize(term); - } + // Sanity check + checkDirectory(oboDownloadPath, getDataName(ONTOLOGY_DATA)); - bufferedReader = FileUtils.newBufferedReader(goFile); - terms = parser.parseOBO(bufferedReader, "Gene Ontology"); - for (OntologyTerm term : terms) { - term.setSource("GO"); - serializer.serialize(term); + // Check ontology files + List hpoFiles = Collections.emptyList(); + List doidFiles = Collections.emptyList(); + List mondoFiles = Collections.emptyList(); + if (speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + hpoFiles = checkOboFiles(HPO_OBO_DATA); + doidFiles = checkOboFiles(DOID_OBO_DATA); + mondoFiles = checkOboFiles(MONDO_OBO_DATA); } + List goFiles = checkOboFiles(GO_OBO_DATA); - bufferedReader = FileUtils.newBufferedReader(doidFile); - terms = parser.parseOBO(bufferedReader, "Human Disease Ontology"); - for (OntologyTerm term : terms) { - term.setSource("DOID"); - serializer.serialize(term); + // Parse OBO files and build + if 
(speciesConfiguration.getScientificName().equalsIgnoreCase(HOMO_SAPIENS)) { + parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + parseOboFile(doidFiles.get(0), DOID_OBO_DATA); + parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); } + parseOboFile(goFiles.get(0), GO_OBO_DATA); - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); + // Close serializer + serializer.close(); + } + + private void parseOboFile(File oboFile, String data) throws IOException { + logger.info(PARSING_LOG_MESSAGE, oboFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { + OboParser parser = new OboParser(); + List terms = parser.parseOBO(bufferedReader, data); + for (OntologyTerm term : terms) { + serializer.serialize(term); + } } + logger.info(PARSING_DONE_LOG_MESSAGE, oboFile); + } - serializer.close(); + private List checkOboFiles(String data) throws IOException, CellBaseException { + Path versionFilePath = oboDownloadPath.resolve(data).resolve(getDataVersionFilename(data)); + String name = getDataName(data); + + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath.resolve(data), + getDataName(ONTOLOGY_DATA) + "/" + name); + if (files.size() != 1) { + throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); + } + return files; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 1f7a4836ca..1cfd85ae07 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -23,22 +23,25 @@ import org.opencb.biodata.models.core.Xref; import 
org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import static org.opencb.cellbase.lib.EtlCommons.*; -public class PharmGKBBuilder extends CellBaseBuilder { +public class PharmGKBBuilder extends AbstractBuilder { - private final Path inputDir; - private final Path pharmGKBDir; + private final Path pharmGkbDownloadPath; private static final String CHEMICALS_BASENAME = "chemicals"; private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv"; @@ -88,21 +91,25 @@ public class PharmGKBBuilder extends CellBaseBuilder { private static final String PHARMGKB_LAST_UPDATE_DATE_KEY = "PHARMGKB_LAST_UPDATE_DATE"; private static final String PHARMGKB_IS_VIP_KEY = "PHARMGKB_IS_VIP"; - public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) { + public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializer) { super(serializer); - - this.inputDir = inputDir; - this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA); + this.pharmGkbDownloadPath = parmGkbDownloadPath; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkDirectory(inputDir); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); + + // Sanity check + checkDirectory(pharmGkbDownloadPath, getDataName(PHARMGKB_DATA)); - // PharmGKB - FileUtils.checkDirectory(pharmGKBDir); - logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME); + // Check PharmGKB files + DataSource dataSource = 
dataSourceReader.readValue(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA)).toFile()); + List pharmGkbFiles = checkFiles(dataSource, pharmGkbDownloadPath, getDataCategory(PHARMGKB_DATA) + "/" + + getDataName(PHARMGKB_DATA)); + + // Unzip downloaded file + unzipDownloadedFiles(pharmGkbFiles); // Parse chemical file Map chemicalsMap = parseChemicalFile(); @@ -113,8 +120,6 @@ public void parse() throws Exception { // Parse gene file parseGeneFile(chemicalsMap); - logger.info("Parsing {} files finished.", PHARMGKB_NAME); - // Generation the pharmacogenomics JSON file logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir()); int counter = 0; @@ -125,11 +130,14 @@ public void parse() throws Exception { } } serializer.close(); - logger.info("Writing {} JSON file done!", PHARMACOGENOMICS_DATA); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); } private Map parseChemicalFile() throws IOException { - Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + Path chemicalsFile = serializer.getOutdir().resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, chemicalsFile); + Map chemicalsMap = new HashMap<>(); try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) { // Skip first line, i.e. 
the header line @@ -146,7 +154,7 @@ private Map parseChemicalFile() throws IOException { // Label Has Dosing Info Has Rx Annotation RxNorm Identifiers ATC Identifiers PubChem Compound Identifiers PharmaChemical pharmaChemical = new PharmaChemical() .setId(fields[0]) - .setSource(PHARMGKB_NAME) + .setSource(PHARMGKB_DATA) .setName(fields[1]) .setSmiles(fields[7]) .setInChI(fields[8]); @@ -177,6 +185,7 @@ private Map parseChemicalFile() throws IOException { } logger.info("Number of Chemical items read {}", chemicalsMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, chemicalsFile); return chemicalsMap; } @@ -192,8 +201,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM Map> variantMap = parseVariantFile(); // clinical_annotations.tsv - try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME) - .resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) { + Path clinAnnotPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, clinAnnotPath); + try (BufferedReader br = FileUtils.newBufferedReader(clinAnnotPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); while ((line = br.readLine()) != null) { @@ -278,6 +288,7 @@ private void parseClinicalAnnotationFiles(Map chemicalsM } } } + logger.info(PARSING_DONE_LOG_MESSAGE, clinAnnotPath); // Update the clinical annotation map by parsing the clinical annotation evidences parseClinicalAnnotationEvidenceFile(variantAnnotationMap); @@ -300,7 +311,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM private Map> parseVariantFile() throws IOException { Map> variantMap = new HashMap<>(); // Parse the variant file (i.e., variants.tsv) - Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + Path varPath = serializer.getOutdir().resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varPath); + try (BufferedReader br = FileUtils.newBufferedReader(varPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -367,6 +380,7 @@ private Map> parseVariantFile() throws IOException { } logger.info("Number of variants = {}", variantMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, varPath); return variantMap; } @@ -385,7 +399,8 @@ private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException { // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) - Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + Path allelesPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, allelesPath); try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -502,12 +522,14 @@ private void parseClinicalAnnotationAlleleFile(Map variantAssociationMap) throws IOException { // For CellBase, variant association corresponds to PharmGKB variant annotation // Parse the variant annotation file (i.e., var_drug_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -562,6 +584,7 @@ private void parseVariantAnnotationFile(Map va } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private Map parseGuidelineAnnotationFiles() throws IOException { @@ -571,7 +594,7 @@ private Map parseGuidelineAnnotationFiles() t ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class); // Parse the guideline annotations JSON files - Path guidelinesPath = pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME); + Path guidelinesPath = serializer.getOutdir().resolve(GUIDELINE_ANNOTATIONS_BASENAME); FileUtils.checkDirectory(guidelinesPath); for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) { if (file.getName().endsWith("json")) { @@ -593,7 +616,8 @@ private Map parseGuidelineAnnotationFiles() t private Map parseDrugLabelAnnotationFile() throws IOException { Map drugLabelAnnotationMap = new HashMap<>(); // Parse the drug labels annotations file (i.e., drugLabels.tsv) - Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + Path drugLabelPath = serializer.getOutdir().resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, drugLabelPath); try 
(BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -631,12 +655,15 @@ private Map parseDrugLabelAnnotationFile() th } logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, drugLabelPath); return drugLabelAnnotationMap; } private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_pheno_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); + int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -691,11 +718,13 @@ private void parsePhenotypeAnnotationFile(Map } } logger.info("Number of phenotype annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_fa_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. 
the header line @@ -751,12 +780,14 @@ private void parseFunctionalAnnotationFile(Map } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseStudyParameterFile(Map variantAssociationMap) throws IOException { Map> studyParametersMap = new HashMap<>(); // Parse the study parameters file (i.e., study_parameters.tsv) - Path studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + Path studyParamsPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, studyParamsPath); try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -807,6 +838,7 @@ private void parseStudyParameterFile(Map varia } } logger.info("Number of study parameters lines = {}", studyParametersMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, studyParamsPath); for (Map.Entry> entry : studyParametersMap.entrySet()) { if (variantAssociationMap.containsKey(entry.getKey())) { @@ -861,7 +893,8 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx // Parse the genes file (i.e., genes.tsv) Map geneAnnotationMapByPgkbGeneId = new HashMap<>(); - Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + Path genesPath = serializer.getOutdir().resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, genesPath); try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -940,13 +973,15 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx } logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, genesPath); } private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName, Map geneAnnotationMapByPgkbGeneId) throws IOException { int counter = 0; // Parse the genes file (i.e., relationships.tsv) - Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + Path relationshipsPath = serializer.getOutdir().resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, relationshipsPath); try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -986,6 +1021,7 @@ private void parseChemicalGeneRelationships(Map> pgkbGeneIdM } } logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter); + logger.info(PARSING_DONE_LOG_MESSAGE, relationshipsPath); } private List stringFieldToList(String field) { @@ -1011,6 +1047,29 @@ private boolean isHaplotype(String value) { } private List getHaplotypeList(String value) { - return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList()); + return Arrays.stream(value.split(",")).map(String::trim).collect(Collectors.toList()); + } + + private void unzipDownloadedFiles(List pharmGkbFiles) throws CellBaseException { + // Unzip + for (File pharmGgkFile : pharmGkbFiles) { + logger.info("Unzip file: {}", pharmGgkFile); + try { + String outPath = serializer.getOutdir().resolve(pharmGgkFile.getName().split("\\.")[0]).toString(); + List params = Arrays.asList("-d", outPath, "-o", pharmGgkFile.toString()); + EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log")); + } catch (CellBaseException e) { + if 
(pharmGgkFile.getName().contains(GUIDELINE_ANNOTATIONS_BASENAME)) { + // It fails because of long filenames, so it does not raise any exception + logger.warn(e.getMessage()); + } + } catch (IOException e) { + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } + } } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java new file mode 100644 index 0000000000..1e47d58725 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -0,0 +1,721 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; +import org.opencb.biodata.models.core.pgs.PgsCohort; +import org.opencb.biodata.models.core.pgs.PolygenicScore; +import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.variant.avro.OntologyTermAnnotation; +import org.opencb.biodata.models.variant.avro.PubmedReference; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.commons.utils.FileUtils; +import org.rocksdb.*; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class PolygenicScoreBuilder extends AbstractBuilder { + + private Path downloadPath; + private Path integrationPath; + private DataSource dataSource; + + private Set pgsIdSet; + private Object[] varRDBConn; + private Object[] varPgsRDBConn; + private int varBatchCounter = 0; + private int varPgsBatchCounter = 0; + private WriteBatch varBatch; + private WriteBatch varPgsBatch; + + private long duplicatedKeys = 0; + + private static ObjectMapper mapper; + private static ObjectReader varPgsReader; + private static ObjectWriter jsonObjectWriter; + + private static final int MAX_BATCH_SIZE = 100; + + private static final String RSID_COL = "rsID"; + private static final String CHR_NAME_COL = "chr_name"; + 
private static final String EFFECT_ALLELE_COL = "effect_allele"; + private static final String OTHER_ALLELE_COL = "other_allele"; + private static final String EFFECT_WEIGHT_COL = "effect_weight"; + private static final String ALLELEFREQUENCY_EFFECT_COL = "allelefrequency_effect"; + private static final String ODDS_RATIO_COL = "OR"; + private static final String HAZARD_RATIO_COL = "HR"; + private static final String LOCUS_NAME_COL = "locus_name"; + private static final String IS_HAPLOTYPE_COL = "is_haplotype"; + private static final String IS_DIPLOTYPE_COL = "is_diplotype"; + private static final String IMPUTATION_METHOD_COL = "imputation_method"; + private static final String VARIANT_DESCRIPTION_COL = "variant_description"; + private static final String INCLUSION_CRITERIA_COL = "inclusion_criteria"; + private static final String IS_INTERACTION_COL = "is_interaction"; + private static final String IS_DOMINANT_COL = "is_dominant"; + private static final String IS_RECESSIVE_COL = "is_recessive"; + private static final String DOSAGE_0_WEIGHT_COL = "dosage_0_weight"; + private static final String DOSAGE_1_WEIGHT_COL = "dosage_1_weight"; + private static final String DOSAGE_2_WEIGHT_COL = "dosage_2_weight"; + private static final String HM_RSID_COL = "hm_rsID"; + private static final String HM_CHR_COL = "hm_chr"; + private static final String HM_POS_COL = "hm_pos"; + private static final String HM_INFEROTHERALLELE_COL = "hm_inferOtherAllele"; + + public static final String SAMPLE_SET_KEY = "Sample Set"; + public static final String ODDS_RATIO_KEY = "Odds ratio"; + public static final String HAZARD_RATIO_KEY = "Hazard ratio"; + public static final String BETA_KEY = "Beta"; + public static final String AUROC_KEY = "AUROC"; // Area Under the Receiver-Operating Characteristic Curve (AUROC) + public static final String CINDEX_KEY = "C-index"; // Concordance Statistic (C-index) + public static final String OTHER_KEY = "Other metric"; + private static final String 
EFFECT_WEIGHT_KEY = "Effect weight"; + private static final String ALLELE_FREQUENCY_EFFECT_KEY = "Allele frequency effect"; + private static final String LOCUS_NAME_KEY = "Locus name"; + private static final String IS_HAPLOTYPE_KEY = "Haplotype"; + private static final String IS_DIPLOTYPE_KEY = "Diplotype"; + private static final String IMPUTATION_METHOD_KEY = "Imputation method"; + private static final String VARIANT_DESCRIPTION_KEY = "Variant description"; + private static final String INCLUSION_CRITERIA_KEY = "Score inclusion criteria"; + private static final String IS_INTERACTION_KEY = "Interaction"; + private static final String IS_DOMINANT_KEY = "Dominant inheritance model"; + private static final String IS_RECESSIVE_KEY = "Recessive inheritance model"; + private static final String DOSAGE_0_WEIGHT_KEY = "Effect weight with 0 copy of the effect allele"; + private static final String DOSAGE_1_WEIGHT_KEY = "Effect weight with 1 copy of the effect allele"; + private static final String DOSAGE_2_WEIGHT_KEY = "Effect weight with 2 copies of the effect allele"; + + private static final String PARSING_FILE = "Parsing file "; + + public static final String PGS_COMMON_OUTPUT_FILENAME = PGS_COMMON_COLLECTION + JSON_GZ_EXTENSION; + public static final String PGS_VARIANT_OUTPUT_FILENAME = PGS_VARIANT_COLLECTION + JSON_GZ_EXTENSION; + + private static final Set VALID_CHROMOSOMES = new HashSet<>(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT", "M")); + + private static final byte[] ONE = "1".getBytes(); + + static { + mapper = new ObjectMapper(); + mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + varPgsReader = mapper.readerFor(VariantPolygenicScore.class); + jsonObjectWriter = mapper.writer(); + } + + public PolygenicScoreBuilder(Path downloadPath, CellBaseFileSerializer serializer) { + super(serializer); + + this.downloadPath = downloadPath; + 
logger = LoggerFactory.getLogger(PolygenicScoreBuilder.class); + } + + public void check() throws CellBaseException, IOException { + if (checked) { + return; + } + + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, getDataName(PGS_DATA)); + + // Sanity check + checkDirectory(downloadPath, getDataName(PGS_DATA)); + integrationPath = serializer.getOutdir().resolve("integration"); + Files.createDirectories(integrationPath); + if (!Files.exists(integrationPath)) { + throw new CellBaseException("Could not create the folder " + integrationPath); + } + // Prepare RocksDB for variant IDs + this.varRDBConn = getDBConnection(integrationPath.resolve("rdb-var.idx").toString(), true); + this.varBatch = new WriteBatch(); + // Prepare RocksDB for PGS/variants + this.varPgsRDBConn = getDBConnection(integrationPath.resolve("rdb-var-pgs.idx").toString(), true); + this.varPgsBatch = new WriteBatch(); + // PGS set + this.pgsIdSet = new HashSet<>(); + + // Check downloaded files + this.dataSource = dataSourceReader.readValue(downloadPath.resolve(getDataVersionFilename(PGS_CATALOG_DATA)).toFile()); + checkFiles(dataSource, downloadPath, getDataName(PGS_CATALOG_DATA)); + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, getDataName(PGS_DATA)); + checked = true; + } + + @Override + public void parse() throws Exception { + check(); + + logger.info(BUILDING_LOG_MESSAGE, getDataName(PGS_DATA)); + + int numFiles; + int counter; + String endsWith; + File[] files = downloadPath.toFile().listFiles(); + + // First, process metadata files + try (BufferedWriter bw = FileUtils.newBufferedWriter(serializer.getOutdir().resolve(PGS_COMMON_COLLECTION + JSON_GZ_EXTENSION))) { + counter = 0; + endsWith = "_metadata" + TAR_GZ_EXTENSION; + numFiles = getNumFiles(files, endsWith); + for (File file : files) { + if (file.isFile() && file.getName().endsWith(endsWith)) { + // E.g.: PGS004905_metadata.tar.gz: it contains a set of files about metadata + logger.info(PARSING_LOG_MESSAGE, file.getName()); + 
processPgsMetadataFile(file, bw); + logger.info(PARSING_DONE_LOG_MESSAGE, file.getName()); + logger.info("Progress: {} of {} meta files", ++counter, numFiles); + } + } + } + + // Second, process variant files + counter = 0; + endsWith = TXT_GZ_EXTENSION; + numFiles = getNumFiles(files, endsWith); + for (File file : files) { + if (file.isFile() && file.getName().endsWith(endsWith)) { + // E.g.: PGS004905_hmPOS_GRCh38.txt.gz: it contains the variants + logger.info(PARSING_LOG_MESSAGE, file.getName()); + + String pgsId = null; + Map columnPos = new HashMap<>(); + + try (BufferedReader br = FileUtils.newBufferedReader(file.toPath())) { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + if (line.startsWith("#pgs_id=")) { + pgsId = line.split("=")[1].trim(); + // Sanity check + if (!file.getName().startsWith(pgsId)) { + throw new CellBaseException(PARSING_FILE + file.getName() + ": pgs_id mismatch"); + } + // Add PGS ID to the set + pgsIdSet.add(pgsId); + } + } else if (line.startsWith(RSID_COL) || line.startsWith(CHR_NAME_COL)) { + String[] fields = line.split("\t"); + for (int i = 0; i < fields.length; i++) { + columnPos.put(fields[i], i); + } + } else { + // Sanity check + if (pgsId == null) { + throw new CellBaseException(PARSING_FILE + file.getName() + ": pgs_id is null"); + } + saveVariantPolygenicScore(line, columnPos, pgsId); + } + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, file.getName()); + logger.info("Progress: {} of {} variant files", ++counter, numFiles); + } + } + + RocksDB rdb; + // Write remaining variant ID batch + if (varBatchCounter > 0) { + rdb = (RocksDB) varRDBConn[0]; + rdb.write(new WriteOptions(), varBatch); + varBatch.clear(); + } + // Write remaining PGS/variant batch + if (varPgsBatchCounter > 0) { + rdb = (RocksDB) varPgsRDBConn[0]; + rdb.write(new WriteOptions(), varPgsBatch); + varPgsBatch.clear(); + } + + // Serialize/write the saved variant polygenic scores in the RocksDB + serializeRDB(); + 
serializer.close(); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PGS_DATA)); + } + + private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws CellBaseException { + String suffix; + String pgsId = metadataFile.getName().split("_")[0]; + + Path tmp = serializer.getOutdir().resolve("tmp"); + if (!tmp.toFile().exists()) { + tmp.toFile().mkdirs(); + } + + String command = "tar -xzf " + metadataFile.getAbsolutePath() + " -C " + tmp.toAbsolutePath(); + try { + logger.info("Executing: {}", command); + Process process = Runtime.getRuntime().exec(command); + process.waitFor(); + } catch (IOException e) { + throw new CellBaseException("Exception raised when executing: " + command, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Exception raised when executing: " + command, e); + } + + // Create PGS object, with the common fields + String filename; + CommonPolygenicScore pgs = new CommonPolygenicScore(); + pgs.setId(pgsId); + pgs.setSource(PGS_CATALOG_DATA); + pgs.setVersion(dataSource.getVersion()); + + String line; + + // PGSxxxxx_metadata_publications.csv + suffix = "_metadata_publications.csv"; + filename = pgsId + suffix; + try (BufferedReader br = FileUtils.newBufferedReader(tmp.resolve(filename))) { + // Skip first line + br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 5 6 + // PGS Publication/Study (PGP) ID First Author Title Journal Name Publication Date Release Date Authors + // 7 8 + // digital object identifier (doi) PubMed ID (PMID) + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getPubmedRefs().add(new PubmedReference(strings.get(8), strings.get(2), strings.get(3), strings.get(4), null)); + } + } catch (IOException e) { + throw new CellBaseException(PARSING_FILE + filename, e); + } + + 
// PGSxxxxx_metadata_efo_traits.csv + suffix = "_metadata_efo_traits.csv"; + filename = pgsId + suffix; + try (BufferedReader br = FileUtils.newBufferedReader(tmp.resolve(filename))) { + // Skip first line + br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 + // Ontology Trait ID Ontology Trait Label Ontology Trait Description Ontology URL + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getTraits().add(new OntologyTermAnnotation(strings.get(0), strings.get(1), strings.get(2), "EFO", strings.get(3), + new HashMap<>())); + } + } catch (IOException e) { + throw new CellBaseException(PARSING_FILE + filename, e); + } + + // PGSxxxxx_metadata_scores.csv + suffix = "_metadata_scores.csv"; + filename = pgsId + suffix; + try (BufferedReader br = FileUtils.newBufferedReader(tmp.resolve(filename))) { + // Skip first line + br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // Polygenic Score (PGS) ID PGS Name Reported Trait Mapped Trait(s) (EFO label) Mapped Trait(s) (EFO ID) + // 5 6 7 8 + // PGS Development Method PGS Development Details/Relevant Parameters Original Genome Build Number of Variants + // 9 10 11 12 13 + // Number of Interaction Terms Type of Variant Weight PGS Publication (PGP) ID Publication (PMID) Publication (doi) + // 14 15 + // Score and results match the original publication Ancestry Distribution (%) - Source of Variant Associations (GWAS) + // 16 17 18 + // Ancestry Distribution (%) - Score Development/Training Ancestry Distribution (%) - PGS Evaluation FTP link + // 19 20 + // Release Date License/Terms of Use + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + // Sanity check + if (!pgsId.equals(strings.get(0))) { + throw new CellBaseException(PARSING_FILE + 
filename + ": mismatch PGS ID"); + } + if (StringUtils.isNotEmpty(pgs.getName())) { + throw new CellBaseException("More than one PGS in file " + filename); + } + pgs.setName(strings.get(1)); + } + } catch (IOException e) { + throw new CellBaseException(PARSING_FILE + filename, e); + } + + // TODO: PGSxxxxx_metadata_score_development_samples.csv + // 0 1 2 3 4 + // Polygenic Score (PGS) ID Stage of PGS Development Number of Individuals Number of Cases Number of Controls + // 5 6 7 8 + // Percent of Participants Who are Male Sample Age Broad Ancestry Category "Ancestry (e.g. French, Chinese)" + // 9 10 11 12 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 13 14 15 16 17 + // GWAS Catalog Study ID (GCST...) Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_performance_metrics.csv + suffix = "_metadata_performance_metrics.csv"; + filename = pgsId + suffix; + try (BufferedReader br = FileUtils.newBufferedReader(tmp.resolve(filename))) { + // Skip first line + br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // PGS Performance Metric (PPM) ID Evaluated Score PGS Sample Set (PSS) PGS Publication (PGP) ID Reported Trait + // 5 6 7 8 + // Covariates Included in the Model PGS Performance: Other Relevant Information Publication (PMID) Publication (doi) + // 9 10 11 12 + // Hazard Ratio (HR) Odds Ratio (OR) Beta Area Under the Receiver-Operating Characteristic Curve (AUROC) + // 13 14 + // Concordance Statistic (C-index) Other Metric(s) + + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + + // Sanity check + if (!pgsId.equals(strings.get(1))) { + continue; + } + + Map values = new HashMap<>(); + if (StringUtils.isNotEmpty(strings.get(2))) { + values.put(SAMPLE_SET_KEY, strings.get(2)); + } + if 
(StringUtils.isNotEmpty(strings.get(9))) { + values.put(HAZARD_RATIO_KEY, strings.get(9)); + } + if (StringUtils.isNotEmpty(strings.get(10))) { + values.put(ODDS_RATIO_KEY, strings.get(10)); + } + if (StringUtils.isNotEmpty(strings.get(11))) { + values.put(BETA_KEY, strings.get(11)); + } + if (StringUtils.isNotEmpty(strings.get(12))) { + values.put(AUROC_KEY, strings.get(12)); + } + if (StringUtils.isNotEmpty(strings.get(13))) { + values.put(CINDEX_KEY, strings.get(13)); + } + if (StringUtils.isNotEmpty(strings.get(14))) { + values.put(OTHER_KEY, strings.get(14)); + } + pgs.getValues().add(values); + } + } catch (IOException e) { + throw new CellBaseException(PARSING_FILE + filename, e); + } + + // TODO: PGSxxxxx_metadata_evaluation_sample_sets.csv + // 0 1 2 3 4 + // PGS Sample Set (PSS) Polygenic Score (PGS) ID Number of Individuals Number of Cases Number of Controls + // 5 6 7 + // Percent of Participants Who are Male Sample Age,Broad Ancestry Category "Ancestry (e.g.French, Chinese)" + // 8 9 10 11 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 12 13 14 15 16 + // GWAS Catalog Study ID (GCST...) 
Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_cohorts.csv + suffix = "_metadata_cohorts.csv"; + filename = pgsId + suffix; + try (BufferedReader br = FileUtils.newBufferedReader(tmp.resolve(filename))) { + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 + // Cohort ID Cohort Name Previous/other/additional names + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getCohorts().add(new PgsCohort(strings.get(0), strings.get(1), strings.get(2))); + } + } catch (IOException e) { + throw new CellBaseException(PARSING_FILE + filename, e); + } + + // Create PGS object, with the common fields + try { + bw.write(jsonObjectWriter.writeValueAsString(pgs)); + bw.write("\n"); + } catch (IOException e) { + throw new CellBaseException("Writing CommonPolygenicScore data model", e); + } + + // Clean tmp folder + for (File tmpFile : tmp.toFile().listFiles()) { + try { + Files.delete(tmpFile.toPath()); + } catch (IOException e) { + logger.warn("Return false when deleting file: " + tmpFile, e); + } + } + } + + private void saveVariantPolygenicScore(String line, Map columnPos, String pgsId) throws RocksDBException, IOException { + String chrom; + int position; + String effectAllele; + String otherAllele; + + String[] field = line.split("\t", -1); + + if (columnPos.containsKey(HM_CHR_COL)) { + chrom = field[columnPos.get(HM_CHR_COL)]; + if (!VALID_CHROMOSOMES.contains(chrom)) { + // Only chromosomes are processed; no contigs, e.g.: 8_KI270821v1_alt, 11_KI270927v1_alt, 12_GL877875v1_alt,... 
+ return; + } + } else { +// logger.warn("Missing field '{}', skipping line: {}", HM_CHR_COL, line); + return; + } + if (columnPos.containsKey(HM_POS_COL)) { + try { + position = Integer.parseInt(field[columnPos.get(HM_POS_COL)]); + } catch (NumberFormatException e) { +// logger.warn("Invalid field '{}' (value = {}), skipping line: {}", HM_POS_COL, field[columnPos.get(HM_POS_COL)], line); + return; + } + } else { +// logger.warn("Missing field '{}', skipping line: {}", HM_POS_COL, line); + return; + } + if (columnPos.containsKey(EFFECT_ALLELE_COL)) { + effectAllele = field[columnPos.get(EFFECT_ALLELE_COL)]; + } else { +// logger.warn("Missing field '{}', skipping line: {}", EFFECT_ALLELE_COL, line); + return; + } + if (columnPos.containsKey(HM_INFEROTHERALLELE_COL) && StringUtils.isNotEmpty(field[columnPos.get(HM_INFEROTHERALLELE_COL)])) { + otherAllele = field[columnPos.get(HM_INFEROTHERALLELE_COL)]; + } else if (columnPos.containsKey(OTHER_ALLELE_COL)) { + otherAllele = field[columnPos.get(OTHER_ALLELE_COL)]; + } else { +// logger.warn("Missing fields '{}' and '{}' (at least one is mandatory), skipping line: {}", HM_INFEROTHERALLELE_COL, +// OTHER_ALLELE_COL, line); + return; + } + + // Create polygenic score + Map values = new HashMap<>(); + if (columnPos.containsKey(EFFECT_WEIGHT_COL)) { + values.put(EFFECT_WEIGHT_KEY, field[columnPos.get(EFFECT_WEIGHT_COL)]); + } + if (columnPos.containsKey(ALLELEFREQUENCY_EFFECT_COL)) { + values.put(ALLELE_FREQUENCY_EFFECT_KEY, field[columnPos.get(ALLELEFREQUENCY_EFFECT_COL)]); + } + if (columnPos.containsKey(ODDS_RATIO_COL)) { + values.put(ODDS_RATIO_KEY, field[columnPos.get(ODDS_RATIO_COL)]); + } + if (columnPos.containsKey(HAZARD_RATIO_COL)) { + values.put(HAZARD_RATIO_KEY, field[columnPos.get(HAZARD_RATIO_COL)]); + } + if (columnPos.containsKey(LOCUS_NAME_COL)) { + values.put(LOCUS_NAME_KEY, field[columnPos.get(LOCUS_NAME_COL)]); + } + if (columnPos.containsKey(IS_HAPLOTYPE_COL)) { + values.put(IS_HAPLOTYPE_KEY, 
field[columnPos.get(IS_HAPLOTYPE_COL)]); + } + if (columnPos.containsKey(IS_DIPLOTYPE_COL)) { + values.put(IS_DIPLOTYPE_KEY, field[columnPos.get(IS_DIPLOTYPE_COL)]); + } + if (columnPos.containsKey(IMPUTATION_METHOD_COL)) { + values.put(IMPUTATION_METHOD_KEY, field[columnPos.get(IMPUTATION_METHOD_COL)]); + } + if (columnPos.containsKey(VARIANT_DESCRIPTION_COL)) { + values.put(VARIANT_DESCRIPTION_KEY, field[columnPos.get(VARIANT_DESCRIPTION_COL)]); + } + if (columnPos.containsKey(INCLUSION_CRITERIA_COL)) { + values.put(INCLUSION_CRITERIA_KEY, field[columnPos.get(INCLUSION_CRITERIA_COL)]); + } + if (columnPos.containsKey(IS_INTERACTION_COL)) { + values.put(IS_INTERACTION_KEY, field[columnPos.get(IS_INTERACTION_COL)]); + } + if (columnPos.containsKey(IS_DOMINANT_COL)) { + values.put(IS_DOMINANT_KEY, field[columnPos.get(IS_DOMINANT_COL)]); + } + if (columnPos.containsKey(IS_RECESSIVE_COL)) { + values.put(IS_RECESSIVE_KEY, field[columnPos.get(IS_RECESSIVE_COL)]); + } + if (columnPos.containsKey(DOSAGE_0_WEIGHT_COL)) { + values.put(DOSAGE_0_WEIGHT_KEY, field[columnPos.get(DOSAGE_0_WEIGHT_COL)]); + } + if (columnPos.containsKey(DOSAGE_1_WEIGHT_COL)) { + values.put(DOSAGE_1_WEIGHT_KEY, field[columnPos.get(DOSAGE_1_WEIGHT_COL)]); + } + if (columnPos.containsKey(DOSAGE_2_WEIGHT_COL)) { + values.put(DOSAGE_2_WEIGHT_KEY, field[columnPos.get(DOSAGE_2_WEIGHT_COL)]); + } + + // Creating and/or updating variant polygenic score + + // First, we store the variant + RocksDB rdb = (RocksDB) varRDBConn[0]; + String key = chrom + ":" + position + ":" + otherAllele + ":" + effectAllele; + byte[] dbContent = rdb.get(key.getBytes()); + if (dbContent == null) { + // Add data to batch + varBatch.put(key.getBytes(), ONE); + varBatchCounter++; + if (varBatchCounter >= MAX_BATCH_SIZE) { + // Write the batch to the database + rdb.write(new WriteOptions(), varBatch); + // Reset batch + varBatch.clear(); + varBatchCounter = 0; + } + } + + // Second, we store the polygenic scores + rdb = (RocksDB) 
varPgsRDBConn[0]; + key = chrom + ":" + position + ":" + otherAllele + ":" + effectAllele + ":" + pgsId; + dbContent = rdb.get(key.getBytes()); + if (dbContent != null) { + duplicatedKeys++; + logger.warn("Warning: the indexing PGS key {}: it should be unique", key); + } else { + VariantPolygenicScore varPgs = new VariantPolygenicScore(chrom, position, otherAllele, effectAllele, + Collections.singletonList(new PolygenicScore(pgsId, values))); + // Add data to batch + byte[] rdbKey = key.getBytes(); + byte[] rdbValue = jsonObjectWriter.writeValueAsBytes(varPgs); + varPgsBatch.put(rdbKey, rdbValue); + varPgsBatchCounter++; + if (varPgsBatchCounter >= MAX_BATCH_SIZE) { + // Write the batch to the database + rdb.write(new WriteOptions(), varPgsBatch); + // Reset batch + varPgsBatch.clear(); + varPgsBatchCounter = 0; + } + } + } + + private void serializeRDB() throws IOException, RocksDBException { + long counter = 0; + + RocksDB varRDB = (RocksDB) varRDBConn[0]; + RocksDB varPgsRDB = (RocksDB) varPgsRDBConn[0]; + + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + logger.info("Writing variants ..."); + RocksIterator rocksIterator = varRDB.newIterator(); + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + String varKey = new String(rocksIterator.key()); + VariantPolygenicScore varPgs = null; + for (String pgsId : pgsIdSet) { + String varPgsKey = varKey + ":" + pgsId; + byte[] dbContent = varPgsRDB.get(varPgsKey.getBytes()); + if (dbContent != null) { + VariantPolygenicScore newVarPgs = varPgsReader.readValue(dbContent); + if (varPgs == null) { + varPgs = newVarPgs; + } else { + varPgs.getPolygenicScores().addAll(newVarPgs.getPolygenicScores()); + } + } + } + if (varPgs != null) { + serializer.serialize(varPgs); + } + if (++counter % 500000 == 0) { + logger.info("Writing {} variants...", counter); + } + } + logger.info("Writing done."); + 
logger.info("Num. duplicated keys (PGS/Variant) = {}", duplicatedKeys); + + // Close RocksDB + closeIndex((RocksDB) varRDBConn[0], (Options) varRDBConn[1], (String) varRDBConn[2]); + closeIndex((RocksDB) varPgsRDBConn[0], (Options) varPgsRDBConn[1], (String) varPgsRDBConn[2]); + } + + private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { + if (rdb != null) { + rdb.close(); + } + if (dbOption != null) { + dbOption.dispose(); + } + if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { + org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); + } + } + + private Object[] getDBConnection(String dbLocation, boolean forceCreate) { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. + RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. + BlockBasedTableConfig tableConfig = new BlockBasedTableConfig(); + tableConfig.setBlockCacheSize(4 * 1024 * 1024 * 1024L); // 4 GB block cache + + Options options = new Options() + .setCreateIfMissing(true) + .setWriteBufferSize(1L * 256 * 1024 * 1024) // 256 MB + .setMaxWriteBufferNumber(4) + .setMinWriteBufferNumberToMerge(2) + .setIncreaseParallelism(4) + .setMaxBackgroundCompactions(4) + .setMaxBackgroundFlushes(2) + .setLevelCompactionDynamicLevelBytes(true) + .setTargetFileSizeBase(1L * 64 * 1024 * 1024) // 64 MB + .setMaxBytesForLevelBase(1L * 512 * 1024 * 1024) // 512 MB + .setTableFormatConfig(tableConfig) + .setCompressionType(CompressionType.LZ4_COMPRESSION); + + RocksDB db = null; + try { + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + // do something + } catch (RocksDBException e) { + // do some error handling + e.printStackTrace(); + System.exit(1); + } + 
+ return new Object[]{db, options, dbLocation, indexingNeeded}; + } + + private int getNumFiles(File[] files, String endsWith) { + int numFiles = 0; + for (File file : files) { + if (file.isFile() && file.getName().endsWith(endsWith)) { + ++numFiles; + } + } + return numFiles; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 0369a0e6aa..186a0218b2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -21,67 +21,85 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.xml.bind.JAXBException; import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.math.BigInteger; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; -public class ProteinBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private Path uniprotFilesDir; - private Path interproFilePath; - private String species; +public class ProteinBuilder extends AbstractBuilder { - private Map proteinMap; + private Path proteinPath; + private String species; - 
protected Logger logger = LoggerFactory.getLogger(this.getClass()); + public static final String PROTEIN_OUTPUT_FILENAME = PROTEIN_DATA + ".json.gz"; - public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { - this(uniprotFilesDir, null, species, serializer); - } - - public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String species, CellBaseSerializer serializer) { + public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); - this.uniprotFilesDir = uniprotFilesDir; - this.interproFilePath = interproFilePath; + this.proteinPath = proteinPath; this.species = species; } @Override - public void parse() throws IOException { + public void parse() throws CellBaseException, IOException { + // Sanity check + checkDirectory(proteinPath, getDataName(PROTEIN_DATA)); - if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new IOException("File '" + uniprotFilesDir + "' not valid"); + // Check UniProt file + DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(UNIPROT_DATA).resolve(getDataVersionFilename(UNIPROT_DATA)) + .toFile()); + List uniProtFiles = checkFiles(dataSource, proteinPath.resolve(UNIPROT_DATA), getDataCategory(UNIPROT_DATA) + "/" + + getDataName(UNIPROT_DATA)); + if (uniProtFiles.size() != 1) { + throw new CellBaseException(getMismatchNumFilesErrorMessage(getDataName(UNIPROT_DATA), uniProtFiles.size())); } - RocksDB rocksDb = getDBConnection(); + // Check InterPro file + dataSource = dataSourceReader.readValue(proteinPath.resolve(INTERPRO_DATA).resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); + List interProFiles = checkFiles(dataSource, proteinPath.resolve(INTERPRO_DATA), getDataCategory(INTERPRO_DATA) + "/" + + getDataName(INTERPRO_DATA)); + if (interProFiles.size() != 1) { + throw new CellBaseException(getMismatchNumFilesErrorMessage(getDataName(INTERPRO_DATA), interProFiles.size())); + } + + // Prepare UniProt 
data by splitting data in chunks + Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + logger.info("Split {} file {} into chunks at {}", getDataName(UNIPROT_DATA), uniProtFiles.get(0).getName(), uniProtChunksPath); + Files.createDirectories(uniProtChunksPath); + splitUniprot(proteinPath.resolve(UNIPROT_DATA).resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); + + // Prepare RocksDB + RocksDB rocksDb = getDBConnection(uniProtChunksPath); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); - proteinMap = new HashMap<>(30000); -// UniProtParser up = new UniProtParser(); + Map proteinMap = new HashMap<>(30000); + + // Parsing files try { - File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + File[] files = uniProtChunksPath.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + for (File file : files) { + logger.info(PARSING_LOG_MESSAGE, file); Uniprot uniprot = (Uniprot) UniProtParser.loadXMLInfo(file.toString(), UniProtParser.UNIPROT_CONTEXT); for (Entry entry : uniprot.getEntry()) { @@ -89,32 +107,41 @@ public void parse() throws IOException { for (OrganismNameType organismNameType : entry.getOrganism().getName()) { entryOrganism = organismNameType.getValue(); if (entryOrganism.equals(species)) { -// proteinMap.put(entry.getAccession().get(0), entry); + proteinMap.put(entry.getAccession().get(0), entry); + + // Update RocksDB rocksDb.put(entry.getAccession().get(0).getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); } } } + logger.info(PARSING_DONE_LOG_MESSAGE); + } + logger.info("Number of proteins stored in map: '{}'", proteinMap.size()); + if (proteinMap.size() > 10) { + logger.info("First 10 protein IDs in map: {}", proteinMap.keySet().stream().collect(Collectors.toList()).subList(0, 10)); } - 
logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - if (interproFilePath != null && Files.exists(interproFilePath)) { - BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath); - Set hashSet = new HashSet<>(proteinMap.keySet()); - Set visited = new HashSet<>(30000); + logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); + String interproName = getDataName(INTERPRO_DATA); + int numLine = 0; + int numInterProLinesProcessed = 0; + int numUniqueProteinsProcessed = 0; + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { + + Set hashSet = proteinMap.keySet(); + Set visited = new HashSet<>(proteinMap.size()); - int numInterProLinesProcessed = 0; - int numUniqueProteinsProcessed = 0; String[] fields; String line; boolean iprAdded; while ((line = interproBuffereReader.readLine()) != null) { + numLine++; fields = line.split("\t"); if (hashSet.contains(fields[0])) { iprAdded = false; BigInteger start = BigInteger.valueOf(Integer.parseInt(fields[4])); BigInteger end = BigInteger.valueOf(Integer.parseInt(fields[5])); -// for (FeatureType featureType : proteinMap.get(fields[0]).getFeature()) { byte[] bytes = rocksDb.get(fields[0].getBytes()); Entry entry = mapper.readValue(bytes, Entry.class); for (FeatureType featureType : entry.getFeature()) { @@ -123,8 +150,6 @@ public void parse() throws IOException { && featureType.getLocation().getEnd().getPosition() != null && featureType.getLocation().getBegin().getPosition().equals(start) && featureType.getLocation().getEnd().getPosition().equals(end)) { - featureType.setId(fields[1]); - featureType.setRef(fields[3]); iprAdded = true; break; } @@ -145,10 +170,17 @@ public void parse() throws IOException { locationType.setEnd(positionType2); featureType.setLocation(locationType); -// proteinMap.get(fields[0]).getFeature().add(featureType); bytes = rocksDb.get(fields[0].getBytes()); entry = mapper.readValue(bytes, Entry.class); 
entry.getFeature().add(featureType); + + if (fields[0].equalsIgnoreCase(entry.getAccession().get(0))) { + // Update RocksDB + rocksDb.put(fields[0].getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); + } else { + logger.info("Something wrong happen: interpro fields[0] = {} vs entry.getAccession().get(0) = {}", + fields[0], entry.getAccession().get(0)); + } } if (!visited.contains(fields[0])) { @@ -158,11 +190,16 @@ public void parse() throws IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} InterPro lines processed. {} unique proteins processed", - numInterProLinesProcessed, numUniqueProteinsProcessed); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); } } - interproBuffereReader.close(); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); + + logger.info(PARSING_DONE_LOG_MESSAGE); + } catch (IOException e) { + logger.error("Error parsing {} file: {}. Num. line = {}. Error stack trace = {}", interproName, interProFiles.get(0), + numLine, Arrays.toString(e.getStackTrace())); + printInfoLogs(numInterProLinesProcessed, numUniqueProteinsProcessed, interproName); } // Serialize and save results @@ -173,24 +210,80 @@ public void parse() throws IOException { } rocksDb.close(); - } catch (JAXBException | RocksDBException e) { - e.printStackTrace(); + } catch (JAXBException | RocksDBException | IOException e) { + throw new CellBaseException("Error parsing " + getDataName(PROTEIN_DATA) + " files", e); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); } - private RocksDB getDBConnection() { - // a static method that loads the RocksDB C++ library. + private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { + // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); - // the Options class contains a set of configurable DB options - // that determines the behavior of a database. 
+ // The Options class contains a set of configurable DB options that determines the behavior of a database Options options = new Options().setCreateIfMissing(true); try { - return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString()); + return RocksDB.open(options, uniProtChunksPath.resolve("integration.idx").toString()); } catch (RocksDBException e) { - // do some error handling - e.printStackTrace(); - System.exit(1); + throw new CellBaseException("Error preparing RocksDB", e); + } + } + + private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + pw.print(""); + pw.close(); + } finally { + if (pw != null) { + pw.close(); + } } - return null; } + + private String getMismatchNumFilesErrorMessage(String dataName, int numFiles) { + return "Only one " + dataName + " file is expected, but currently there are " + numFiles + " files"; + } + + private void printInfoLogs(int numInterProLinesProcessed, int numUniqueProteinsProcessed, String dataName) { + logger.info("{}: {} lines processed", dataName, numInterProLinesProcessed); + logger.info("{}: {} unique proteins processed", dataName, numUniqueProteinsProcessed); + } + } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java index 8aba7c9dda..5443b9aea9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java @@ -16,63 +16,71 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.pubmed.PubMedParser; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticle; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticleSet; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.download.PubMedDownloadManager; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; -import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -public class PubMedBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; +import static org.opencb.cellbase.lib.EtlCommons.getDataName; - private Path pubmedDir; - private CellBaseFileSerializer fileSerializer; +public class PubMedBuilder extends AbstractBuilder { - public PubMedBuilder(Path pubmedDir, CellBaseFileSerializer serializer) { - super(serializer); - - this.fileSerializer = serializer; - this.pubmedDir = pubmedDir; + private Path pubMedDownloadPath; + private CellBaseConfiguration configuration; - logger = LoggerFactory.getLogger(PubMedBuilder.class); + public PubMedBuilder(Path pubMedDownloadPath, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { + super(serializer); + this.pubMedDownloadPath = pubMedDownloadPath; + this.configuration = configuration; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkPath(pubmedDir); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PUBMED_DATA)); - logger.info("Parsing PubMed files..."); + // Check input folder + FileUtils.checkPath(pubMedDownloadPath); - for 
(File file : pubmedDir.toFile().listFiles()) { - if (file.isFile() && (file.getName().endsWith("gz") || file.getName().endsWith("xml"))) { - String name = file.getName().split("\\.")[0]; + // Check PubMed files before parsing them + List pubMedFilenames = PubMedDownloadManager.getPubMedFilenames(configuration.getDownload().getPubmed()); + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + if (!Files.exists(pubMedPath)) { + throw new CellBaseException("Expected PubMed file " + pubMedFilename + ", but it was not found at " + pubMedDownloadPath); + } + } + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + String basename = pubMedFilename.split("\\.")[0]; - ObjectWriter objectWriter = new ObjectMapper().writerFor(PubmedArticle.class); - PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(file.getAbsolutePath()); + PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(pubMedPath.toAbsolutePath().toString()); - List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); - logger.info("Parsing PubMed file {} of {} articles ...", file.getName(), objects.size()); - int counter = 0; - for (Object object : objects) { - PubmedArticle pubmedArticle = (PubmedArticle) object; - fileSerializer.serialize(pubmedArticle, name); - if (++counter % 2000 == 0) { - logger.info("\t\t" + counter + " articles"); - } + List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); + logger.info(PARSING_LOG_MESSAGE, pubMedPath); + int counter = 0; + for (Object object : objects) { + PubmedArticle pubmedArticle = (PubmedArticle) object; + ((CellBaseFileSerializer) serializer).serialize(pubmedArticle, basename); + if (++counter % 2000 == 0) { + logger.info("{} articles", counter); } - fileSerializer.close(); - logger.info("\t\tDone: " + counter + " articles."); } + serializer.close(); + + String 
logMsg = pubMedPath + " (" + counter + " articles)"; + logger.info(PARSING_DONE_LOG_MESSAGE, logMsg); } - logger.info("Parsing PubMed files finished."); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 48b0cd1d0d..9ddb4e6a7c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -22,28 +22,43 @@ import org.opencb.biodata.models.core.*; import org.opencb.biodata.tools.sequence.FastaIndex; import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; -public class RefSeqGeneBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class RefSeqGeneBuilder extends AbstractBuilder { + + private Path downloadPath; + private CellBaseConfiguration configuration; private Map transcriptDict; private Map exonDict; - private Path gtfFile; - private Path fastaFile; - private Path proteinFastaFile, cdnaFastaFile; - private Path maneFile, lrgFile, disgenetFile, hpoFile, geneDrugFile, miRTarBaseFile; - private Path cancerGeneCensus, cancerHotspot; - private Path tso500File, eglhHaemOncFile; + private Path gtfFile = null; + private Path fastaFile = null; + private Path proteinFastaFile = null; + private Path 
cdnaFastaFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path hpoFile = null; + private Path geneDrugFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile = null; + private Path cancerHotspot = null; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); - private final String status = "KNOWN"; + private static final String KNOWN_STATUS = "KNOWN"; private static final String SOURCE = ParamConstants.QueryParams.REFSEQ.key(); private Gene gene = null; private Transcript transcript = null; @@ -52,134 +67,173 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { // sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; + private boolean isHSapiens = false; + + private static final String ENSEMBL = "ensembl"; + private static final String TRANSCRIPT_ID = "transcript_id"; + private static final String EXON_NUMBER = "exon_number"; - public RefSeqGeneBuilder(Path refSeqDirectoryPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; + public static final String REFSEQ_GENE_OUTPUT_FILENAME = REFSEQ_GENE_BASENAME + ".json.gz"; + + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration, + CellBaseSerializer serializer) { super(serializer); + this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; - - getGtfFileFromDirectoryPath(refSeqDirectoryPath); - getFastaFileFromDirectoryPath(refSeqDirectoryPath); - getProteinFastaFileFromDirectoryPath(refSeqDirectoryPath); - getCdnaFastaFileFromDirectoryPath(refSeqDirectoryPath); - setAnnotationFiles(refSeqDirectoryPath); + this.configuration = configuration; transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); - } - 
private void setAnnotationFiles(Path refSeqDirectoryPath) { - Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); - maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); - lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); - geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); - disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); - hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); - cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); - cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); - tso500File = geneDirectoryPath.resolve("TSO500_transcripts.txt"); - eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); - miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS)) { + isHSapiens = true; + } } - private void getGtfFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = refSeqDirectoryPath.resolve(fileName); - break; - } + public void check() throws Exception { + if (checked) { + return; } - } - private void getFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("genomic.fna") || fileName.endsWith("genomic.fna.gz")) { - fastaFile = refSeqDirectoryPath.resolve(fileName); - break; + String refSeqGeneLabel = getDataName(REFSEQ_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); + + // Sanity check + checkDirectory(downloadPath, refSeqGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + 
serializer.getOutdir(), e); } } - } - private void getProteinFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".faa") || fileName.endsWith(".faa.gz")) { - proteinFastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } + // Check RefSeq files + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + DownloadProperties.URLProperties props = configuration.getDownload().getRefSeq(); + gtfFile = checkFile(props, prefixId + REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath(); + proteinFastaFile = checkFile(props, prefixId + REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath(); + cdnaFastaFile = checkFile(props, prefixId + REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath(); + + // Check genome FASTA file + String genomeGzFilename = Paths.get(props.getFiles().get(prefixId + REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); + Path fastaGzFile = downloadPath.resolve(genomeGzFilename); + fastaFile = EtlCommons.getFastaPath(fastaGzFile); + + // Check common files + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MANE_SELECT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, LRG_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, 
CANCER_HOTSPOT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, DGIDB_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { + hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, HPO_DISEASE_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, CANCER_GENE_CENSUS_DATA, speciesConfiguration.getScientificName()); } - } - private void getCdnaFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("cdna.fna") || fileName.endsWith("cdna.fna.gz")) { - cdnaFastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } + // Check regulation files + // mirtarbase + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MIRTARBASE_DATA, speciesConfiguration.getScientificName()); } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); + checked = true; } public void parse() throws Exception { + check(); + // Preparing the fasta file for fast accessing FastaIndex fastaIndex = null; if (fastaFile != null) { fastaIndex = new FastaIndex(fastaFile); } - // index 
protein sequences for later + // Index protein sequences for later + logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); - indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, - cancerGeneCensus, cancerHotspot, tso500File, eglhHaemOncFile); - - logger.info("Parsing RefSeq gtf..."); - GtfReader gtfReader = new GtfReader(gtfFile); - - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - String chromosome = getSequenceName(gtf.getSequenceName()); - switch (gtf.getFeature()) { - case "gene": - // we've finished the previous transcript, store xrefs - addXrefs(transcript, geneDbxrefs, exonDbxrefs); - parseGene(gtf, chromosome, indexer); - break; - case "transcript": - break; - case "exon": - parseExon(gtf, chromosome, fastaIndex, indexer); - break; - case "CDS": - parseCDS(gtf, indexer); - break; - case "start_codon": - seenStopCodon = false; - break; - case "stop_codon": - if (!seenStopCodon) { - parseStopCodon(gtf); - seenStopCodon = true; - } - break; - default: - throw new RuntimeException("Unexpected feature type: " + gtf.getFeature()); + indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, miRTarBaseFile, cancerGeneCensusFile, + cancerHotspot); + logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); + + logger.info(PARSING_LOG_MESSAGE, gtfFile); + try (GtfReader gtfReader = new GtfReader(gtfFile)) { + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + String chromosome = getSequenceName(gtf.getSequenceName()); + switch (gtf.getFeature()) { + case "gene": + // we've finished the previous transcript, store xrefs + addXrefs(transcript, geneDbxrefs, exonDbxrefs); + parseGene(gtf, chromosome, indexer); + break; + case "transcript": + break; + case "exon": + parseExon(gtf, chromosome, fastaIndex, indexer); + break; + case "CDS": + parseCDS(gtf, 
indexer); + break; + case "start_codon": + seenStopCodon = false; + break; + case "stop_codon": + if (!seenStopCodon) { + parseStopCodon(gtf); + seenStopCodon = true; + } + break; + default: + throw new CellBaseException("Error parsing: unexpected feature type: " + gtf.getFeature()); + } } } - // add xrefs to last transcript + // Add xrefs to last transcript addXrefs(transcript, geneDbxrefs, exonDbxrefs); - // last gene must be serialized + // Last gene must be serialized store(); - // cleaning - gtfReader.close(); + // Close serializer.close(); if (fastaIndex != null) { fastaIndex.close(); } indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); } - // store right before parsing the previous gene, or the very last gene. + // Store right before parsing the previous gene, or the very last gene. private void store() { serializer.serialize(gene); reset(); @@ -200,7 +254,6 @@ private void addXrefs(Transcript transcript, Set geneDbxrefs, Set ex return; } exonDbxrefs.addAll(geneDbxrefs); -// transcript.setXrefs(new ArrayList<>(exonDbxrefs)); transcript.getXrefs().addAll(exonDbxrefs); transcript.getXrefs().add(new Xref(transcript.getName(), "hgnc_symbol", "HGNC Symbol")); @@ -235,12 +288,13 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde null, indexer.getMirnaTargets(geneName), indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); gene = new Gene(geneId, geneName, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), "1", geneBiotype, - status, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); + KNOWN_STATUS, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); geneDbxrefs = parseXrefs(gtf); } - private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String transcriptId = gtf.getAttributes().get("transcript_id"); + private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, 
RefSeqGeneBuilderIndexer indexer) throws RocksDBException, + CellBaseException { + String transcriptId = gtf.getAttributes().get(TRANSCRIPT_ID); // new transcript if (!transcriptDict.containsKey(transcriptId)) { @@ -264,7 +318,7 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq if (fastaIndex != null) { exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); } - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); // RefSeq does not provide Exon IDs, we are using transcript ID and exon numbers String exonId = transcriptId + "_" + exonNumber; @@ -286,14 +340,14 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq } } - private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String exonNumber = gtf.getAttributes().get("exon_number"); + private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException, CellBaseException { + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // this CDS doesn't know which exon it belongs to. 
skip return; } - transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -419,12 +473,12 @@ private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBE } private void parseStopCodon(Gtf gtf) { - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // some codons don't have an exon number, discard return; } - Transcript transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -525,14 +579,14 @@ private void parseStopCodon(Gtf gtf) { } } - private Set parseXrefs(Gtf gtf) { + private Set parseXrefs(Gtf gtf) throws CellBaseException { Set xrefSet = new HashSet<>(); String xrefs = gtf.getAttributes().get("db_xref"); if (StringUtils.isNotEmpty(xrefs)) { for (String xrefString : xrefs.split(",")) { String[] dbxrefParts = xrefString.split(":", 2); if (dbxrefParts.length != 2) { - throw new RuntimeException("Bad xref, expected colon: " + xrefString); + throw new CellBaseException("Error parsing Xrefs: bad xref, expected colon: " + xrefString); } String dbName = dbxrefParts[0].toLowerCase(); String id = dbxrefParts[1]; @@ -541,7 +595,7 @@ private Set parseXrefs(Gtf gtf) { dbName = "hgnc_id"; dbDisplayName = "HGNC ID"; } - if ("ensembl".equalsIgnoreCase(dbName)) { + if (ENSEMBL.equalsIgnoreCase(dbName)) { if (id.startsWith("ENST")) { dbName = "ensembl_transcript"; dbDisplayName = "Ensembl transcript"; @@ -562,18 +616,17 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId Map gtfAttributes = gtf.getAttributes(); String name = gene.getName(); -// String 
biotype = gtfAttributes.get("gbkey"); String biotype = gtfAttributes.get("transcript_biotype"); if ("mRNA".equals(biotype)) { biotype = "protein_coding"; } - transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, status, + transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, KNOWN_STATUS, 0, 0, 0, 0, 0, indexer.getCdnaFasta(transcriptId), "", "", "", version, SOURCE, new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("ensembl", "ensembl_protein")) { + for (String suffix: Arrays.asList(ENSEMBL, "ensembl_protein")) { String maneRefSeq = indexer.getMane(transcriptId, suffix); if (StringUtils.isNotEmpty(maneRefSeq)) { transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, @@ -582,7 +635,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptId, "ensembl"); + String lrgRefSeq = indexer.getLrg(transcriptId, ENSEMBL); if (StringUtils.isNotEmpty(lrgRefSeq)) { transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_ensembl", "LRG Ensembl")); } @@ -598,15 +651,6 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if (StringUtils.isNotEmpty(lrg)) { transcript.getFlags().add("LRG"); } - // 3. 
TSO500 and EGLH HaemOnc - String tso500Flag = indexer.getTSO500(transcriptId.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(transcriptId.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } gene.getTranscripts().add(transcript); @@ -615,7 +659,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } private String getGeneId(Gtf gtf) throws CellBaseException { - // db_xref "GeneID:100287102"; + // Splitting the db_xref, e.g.: "GeneID:100287102" String xrefString = gtf.getAttributes().get("db_xref"); String[] xrefs = xrefString.split(","); for (String xref : xrefs) { @@ -628,11 +672,11 @@ private String getGeneId(Gtf gtf) throws CellBaseException { throw new CellBaseException("Didn't find geneId for db_xref:" + xrefString); } - private String getSequenceName(String fullSequenceName) { + private String getSequenceName(String fullSequenceName) throws CellBaseException { String[] sequenceNameParts = fullSequenceName.split("\\."); if (sequenceNameParts.length != 2) { - throw new RuntimeException("bad chromosome: " + fullSequenceName); + throw new CellBaseException("Invalid sequence name: bad chromosome: " + fullSequenceName); } // just get the first part, e.g. 
NC_000024.11 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 45520161f5..6a4fe69fc9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -16,275 +16,32 @@ package org.opencb.cellbase.lib.builders; -import org.apache.commons.lang.StringUtils; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.MirnaTarget; -import org.opencb.biodata.models.core.TargetGene; -import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; -import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; -import org.opencb.commons.utils.FileUtils; +import org.opencb.cellbase.core.exception.CellBaseException; import org.rocksdb.RocksDBException; -import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.REFSEQ_DATA; + +public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer { public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { super(refSeqDirectoryPath); } public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, - Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File, - Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException { - indexManeMapping(maneFile, "refseq"); - indexLrgMapping(lrgFile, "refseq"); + Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) + 
throws IOException, RocksDBException, FileFormatException, CellBaseException { + indexManeMapping(maneFile, REFSEQ_DATA); + indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFilePath, disgenetFile); + indexDiseases(hpoFilePath); indexMiRTarBase(miRTarBaseFile); indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); - } - - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - - String line; - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - bufferedReader.close(); - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath); - // skip first header line - 
bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - bufferedReader.close(); - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList(); - Map> geneToMirna = new HashMap(); - while (iterator.hasNext()) { - - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - Cell cell = cellIterator.next(); - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); 
- String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // species - cellIterator.next(); - - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // entrez - cellIterator.next(); - // species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmeds - cell = cellIterator.next(); - String pubmed = null; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { - pubmed = String.valueOf(cell.getNumericCellValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } } - - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { - String key = geneName + MIRTARBASE_SUFFIX; - return rocksDbManager.getMirnaTargets(rocksdb, key); - } - - private static void addValueToMapElement(Map> map, String key, T value) { - if 
(map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 03fc3a1cd6..280fc631bb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -16,60 +16,160 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.feature.gff.Gff2; import org.opencb.biodata.formats.feature.gff.io.Gff2Reader; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; +import org.opencb.biodata.models.core.RegulatoryPfm; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; +import java.io.File; import java.io.IOException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -public class RegulatoryFeatureBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private final Path gffFile; - protected Set regulatoryFeatureSet; +public class RegulatoryFeatureBuilder extends AbstractBuilder { - public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { 
+ private Path regulationPath; + private Set regulatoryFeatureSet; + + public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; + public static final String REGULATORY_REGION_OUTPUT_FILENAME = REGULATORY_REGION_BASENAME + ".json.gz"; + public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; + public static final String REGULATORY_PFM_OUTPUT_FILENAME = REGULATORY_PFM_BASENAME + ".json.gz"; + + public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) { super(serializer); - gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + this.regulationPath = regulationPath; } @Override public void parse() throws Exception { - logger.info("Parsing regulatory features..."); - if (Files.exists(gffFile)) { - parseGffFile(gffFile); - } else { - logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); - logger.warn("Skipping regulatory features GFF file parsing. Regulatory feature data models will not be built."); + // Sanity check + checkDirectory(regulationPath, getDataName(REGULATION_DATA)); + + DataSource dataSource; + List regulatoryFiles; + List motifFeaturesFiles; + + // Check build regulatory files + dataSource = dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_DATA) + .resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); + regulatoryFiles = checkFiles(dataSource, regulationPath.resolve(REGULATORY_BUILD_DATA), getDataCategory(REGULATORY_BUILD_DATA) + "/" + + getDataName(REGULATORY_BUILD_DATA)); + if (regulatoryFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are " + + regulatoryFiles.size() + " files"); } + + // Check motif features files + dataSource = dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_DATA) + .resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); + motifFeaturesFiles = checkFiles(dataSource, 
regulationPath.resolve(MOTIF_FEATURES_DATA), getDataCategory(MOTIF_FEATURES_DATA) + "/" + + getDataName(MOTIF_FEATURES_DATA)); + if (motifFeaturesFiles.size() != 2) { + throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are " + + motifFeaturesFiles.size() + " files"); + } + + // Downloading and building pfm matrices + File motifFile = motifFeaturesFiles.get(0).getName().endsWith("tbi") ? motifFeaturesFiles.get(1) : motifFeaturesFiles.get(0); + loadPfmMatrices(motifFile.toPath(), serializer.getOutdir()); + + // Parse regulatory build features + parseGffFile(regulatoryFiles.get(0).toPath()); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { + logger.info(PARSING_LOG_MESSAGE, regulatoryFeatureFile); + + // Create and populate regulatory feature set regulatoryFeatureSet = new HashSet<>(); - if (regulatoryFeatureFile != null && Files.exists(regulatoryFeatureFile) && !Files.isDirectory(regulatoryFeatureFile) - && Files.size(regulatoryFeatureFile) > 0) { - Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile); + try (Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile)) { Gff2 feature; while ((feature = regulatoryFeatureReader.read()) != null) { regulatoryFeatureSet.add(feature); } - regulatoryFeatureReader.close(); } - int i = 0; // Serialize and save results for (Gff2 feature : regulatoryFeatureSet) { - // ID=TF_binding_site:ENSR00000243312; + // In order to get the ID we split the attribute format: ID=TF_binding_site:ENSR00000243312; .... 
String id = feature.getAttribute().split(";")[0].split(":")[1]; RegulatoryFeature regulatoryFeature = new RegulatoryFeature(id, feature.getSequenceName(), feature.getFeature(), feature.getStart(), feature.getEnd()); serializer.serialize(regulatoryFeature); } serializer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE); + } + + private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException, + InterruptedException { + Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_OUTPUT_FILENAME); + logger.info("Downloading and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile); + if (Files.exists(regulatoryPfmPath)) { + logger.info("{} is already built", regulatoryPfmPath); + return; + } + + Set motifIds = new HashSet<>(); + logger.info(PARSING_LOG_MESSAGE, motifGffFile); + try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { + Gff2 tfbsMotifFeature; + Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); + while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { + String pfmId = getMatrixId(filePattern, tfbsMotifFeature); + if (StringUtils.isNotEmpty(pfmId)) { + motifIds.add(pfmId); + } + } + } + logger.info(PARSING_DONE_LOG_MESSAGE); + + ObjectMapper mapper = new ObjectMapper(); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true); + if (logger.isInfoEnabled()) { + logger.info("Looking up {} PFMs", motifIds.size()); + } + for (String pfmId : motifIds) { + String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId + + "?unit=frequencies;content-type=application/json"; + URL url = new URL(urlString); + RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); + serializer.serialize(regulatoryPfm); + // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits + TimeUnit.MILLISECONDS.sleep(250); + } + serializer.close(); + + logger.info("Downloading and 
building PFM matrices at {} done.", regulatoryPfmPath); + } + + private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { + Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); + if (matcher.find()) { + return matcher.group(0); + } + return null; + } + + public Set getRegulatoryFeatureSet() { + return regulatoryFeatureSet; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java deleted file mode 100644 index 3727ac4a69..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.opencb.biodata.models.core.RegulatoryFeature; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.*; -import java.util.*; - -/** - * User: fsalavert. 
- * Date: 4/10/13 - * Time: 10:14 AM - */ -@Deprecated -public class RegulatoryRegionBuilder extends CellBaseBuilder { - - private static final int CHUNK_SIZE = 2000; - private static final String REGULATORY_FEATURES = "regulatory_features"; - @Deprecated - private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features"; - private static final String MOTIF_FEATURES = "motif_features"; - private static final String FEATURE_TYPE = "feature_type"; - private static final String ID = "id"; - private static final String BINDING_MATRIX = "binding_matrix"; - private static final String MOTIF_FEATURE_TYPE = "motif_feature_type"; - private Path regulatoryRegionPath; - - public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) { - super(serializer); - - this.regulatoryRegionPath = regulatoryRegionFilesDir; - - } - - public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath) - throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group"); - List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT"); - - // Path regulatoryRegionPath = regulationDir.toPath(); - - Path filePath; - - filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes); - - filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - // TODO: REMOVE - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED - filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz"); 
- createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes); - // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED - - - -// GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame"); -// GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT"); - filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz"); - if (Files.exists(filePath)) { - createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes); - } - - } - - @Override - public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) { - throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read"); - } - - // Create the SQLite databases - createSQLiteRegulatoryFiles(regulatoryRegionPath); - - String chunkIdSuffix = CHUNK_SIZE / 1000 + "k"; - - Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db"); - Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db"); - Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db"); - Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db"); - Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db"); - Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db"); - - List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath, - deprecatedMotifFilePath, deprecatedRegulatoryFilePath); - List tableNames = 
Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features", - DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell"); - - if (Files.exists(mirnaFilePath)) { - filePaths.add(mirnaFilePath); - tableNames.add("mirna_uniq"); - } - - // Fetching and joining all chromosomes found in the different databases - Set setChr = new HashSet<>(); - setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES)); - setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features")); - setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell")); - if (Files.exists(mirnaFilePath)) { - setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq")); - } - - List chromosomes = new ArrayList<>(setChr); - List regulatoryFeatures; - HashSet chunksHash; - for (String chromosome : chromosomes) { - for (int i = 0; i < tableNames.size(); i++) { - chunksHash = new HashSet<>(); - regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome); - for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) { - int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE); - int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE); - - List chunkIds = new ArrayList<>(); - String chunkId; - for (int j = firstChunkId; j <= lastChunkId; j++) { - chunkId = chromosome + "_" + j + "_" + chunkIdSuffix; - chunkIds.add(chunkId); - //count chunks - if (!chunksHash.contains(j)) { - chunksHash.add(j); - } - } -// regulatoryFeature.setChunkIds(chunkIds); - - // remove 'chr' prefix -// if (genericFeature.getChromosome() != null) { -// genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", "")); -// } - serializer.serialize(regulatoryFeature); - } - } - } - } - - - public void 
createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes) - throws ClassNotFoundException, IOException, SQLException { - int limitRows = 100000; - int batchCount = 0; - - if (!Files.exists(filePath) || Files.size(filePath) == 0) { - return; - } - - Path dbPath = Paths.get(filePath.toString() + ".db"); - if (Files.exists(dbPath) && Files.size(dbPath) > 0) { - return; - } - - BufferedReader br = FileUtils.newBufferedReader(filePath); - - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion - - //Create table query - Statement createTables = conn.createStatement(); - - StringBuilder sbQuery = new StringBuilder(); - sbQuery.append("CREATE TABLE if not exists " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { //columnNames and columnTypes must have the same size - sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ","); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - - System.out.println(sbQuery.toString()); - createTables.executeUpdate(sbQuery.toString()); - - //Prepare insert query - sbQuery = new StringBuilder(); - sbQuery.append("INSERT INTO " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { - sbQuery.append("'" + columnNames.get(i) + "',"); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(") values ("); - sbQuery.append(repeat("?,", columnNames.size())); - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - System.out.println(sbQuery.toString()); - - PreparedStatement ps = conn.prepareStatement(sbQuery.toString()); - - //Read file - String line = null; - while ((line = br.readLine()) != null) { - - insertByType(ps, getFields(line, tableName), columnTypes); - ps.addBatch(); - batchCount++; - - //commit batch - if (batchCount % limitRows == 
0 && batchCount != 0) { - ps.executeBatch(); - conn.commit(); - } - - } - br.close(); - - //Execute last Batch - ps.executeBatch(); - conn.commit(); - - //Create index - System.out.println("creating indices..."); - createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")"); - System.out.println("indices created."); - - conn.commit(); - conn.close(); - } - - public List getChromosomesList(Path dbPath, String tableName) throws IOException { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - List chromosomes = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName); -// ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'"); - - while (rs.next()) { - chromosomes.add(rs.getString(1)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return chromosomes; - } - - public List queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - Connection conn; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'"); -// ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'"); - while (rs.next()) { - 
regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - public static List queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) { - Connection conn = null; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start); - - while (rs.next()) { - regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException { - RegulatoryFeature regulatoryFeature = null; - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - regulatoryFeature = getRegulatoryFeature(rs); - break; - case MOTIF_FEATURES: - regulatoryFeature = getMotifFeature(rs); - break; - case "annotated_features": - regulatoryFeature = getAnnotatedFeature(rs); - break; - case "regulatory_features_multicell": - regulatoryFeature = getDeprecatedRegulatoryFeature(rs); - break; - case DEPRECATED_MOTIF_FEATURES: - regulatoryFeature = getDeprecatedMotifFeature(rs); - break; - case "mirna_uniq": - regulatoryFeature = getMirnaFeature(rs); - break; - default: - break; - } - return regulatoryFeature; - } - - private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - 
regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - // Seems weird that the motif_feature_type property is used to fill the Name field. However, this is how the - // it was being done from the previous ENSEMBL files - regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE)); - - regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setId(groupFields.get(ID)); - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_")); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - 
regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setName(groupFields.get("name")); - regulatoryFeature.setAlias(groupFields.get("alias")); - regulatoryFeature.setFeatureClass(groupFields.get("class")); - regulatoryFeature.getCellTypes().add(groupFields.get("cell_type")); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - regulatoryFeature.setFrame(rs.getString(9)); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3) + "_motif"); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - String[] split = groupFields.get("name").split(":"); - regulatoryFeature.setName(split[0]); - regulatoryFeature.setMatrix(split[1]); - - return regulatoryFeature; - } - - private static RegulatoryFeature 
getMirnaFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setFeatureClass("microRNA"); - regulatoryFeature.setName(groupFields.get("name")); - - return regulatoryFeature; - } - - private static Map getGroupFields(String group) { - //process group column - Map groupFields = new HashMap<>(); - String[] attributeFields = group.split(";"); - String[] attributeKeyValue; - for (String attributeField : attributeFields) { - attributeKeyValue = attributeField.trim().split("="); - groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]); - } - return groupFields; - } - - - public static List getFields(String line, String tableName) { - List fields = new ArrayList<>(); - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - fields = getRegulatoryFeaturesFields(line); - break; - case MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "annotated_features": - fields = getAnnotatedFeaturesFields(line); - break; - case "regulatory_features_multicell": - fields = getRegulatoryFeaturesFields(line); - break; - case DEPRECATED_MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "mirna_uniq": - fields = getMirnaFeaturesFields(line); - break; - default: - break; - } - return fields; - } - - @Deprecated - public static List getAnnotatedFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - 
return Arrays.asList(fields); - } - - public static List getRegulatoryFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMotifFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMirnaFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static void insertByType(PreparedStatement ps, List fields, List types) throws SQLException { - //Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html - String raw; - String type; - if (types.size() == fields.size()) { - for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size - int sqliteIndex = i + 1; - raw = fields.get(i); - type = types.get(i); - - switch (type) { - case "INTEGER": - case "INT": - ps.setInt(sqliteIndex, Integer.parseInt(raw)); - break; - case "REAL": - ps.setFloat(sqliteIndex, Float.parseFloat(raw)); - break; - case "TEXT": - ps.setString(sqliteIndex, raw); - break; - default: - ps.setString(sqliteIndex, raw); - break; - } - } - } - - } - - public String repeat(String s, int n) { - if (s == null) { - return null; - } - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < n; i++) { - sb.append(s); - } - return sb.toString(); - } - - private int getChunkId(int position, int chunksize) { - if (chunksize <= 0) { - return position / CHUNK_SIZE; - } else { - return position / chunksize; - } - } - - private int getChunkStart(int id, int chunksize) { - if (chunksize <= 0) { - return (id == 0) ? 1 : id * CHUNK_SIZE; - } else { - return (id == 0) ? 
1 : id * chunksize; - } - } - - private int getChunkEnd(int id, int chunksize) { - if (chunksize <= 0) { - return (id * CHUNK_SIZE) + CHUNK_SIZE - 1; - } else { - return (id * chunksize) + chunksize - 1; - } - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index d37765e0b6..5add326db7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -18,8 +18,10 @@ import org.opencb.biodata.models.core.Region; import org.opencb.biodata.models.variant.avro.Repeat; -import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.ProgressLogger; import org.opencb.commons.utils.FileUtils; @@ -27,62 +29,100 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by fjlopez on 05/05/17. 
*/ -public class RepeatsBuilder extends CellBaseBuilder { - private static final String TRF = "trf"; - private static final String GSD = "genomicSuperDup"; - private static final String WM = "windowMasker"; +public class RepeatsBuilder extends AbstractBuilder { + + private CellBaseConfiguration configuration; + + private List dataList; private final Path filesDir; - public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer) { + public static final String REPEATS_OUTPUT_BASENAME = "repeats"; + public static final String REPEATS_OUTPUT_FILENAME = REPEATS_OUTPUT_BASENAME + ".json.gz"; + + public RepeatsBuilder(List dataList, Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); + this.dataList = dataList; this.filesDir = filesDir; + this.configuration = configuration; } @Override public void parse() throws Exception { + // Sanity check + checkDirectory(filesDir, getDataName(REPEATS_DATA)); + + // Check Simple Repeats (TRF) filename + String trfFilename = null; + if (dataList.contains(TRF_DATA)) { + trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(trfFilename))) { + throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); + } + } - logger.info("Parsing repeats..."); - if (Files.exists(filesDir.resolve(EtlCommons.TRF_FILE))) { - parseTrfFile(filesDir.resolve(EtlCommons.TRF_FILE)); - } else { - logger.warn("No TRF file found {}", EtlCommons.TRF_FILE); - logger.warn("Skipping TRF file parsing. 
TRF data models will not be built."); + // Check Genomic Super Duplications (GSD) file + String gsdFilename = null; + if (dataList.contains(GSD_DATA)) { + gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) + .getFileName().toString(); + if (!Files.exists(filesDir.resolve(gsdFilename))) { + throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); + } } - if (Files.exists(filesDir.resolve(EtlCommons.GSD_FILE))) { - parseGsdFile(filesDir.resolve(EtlCommons.GSD_FILE)); - } else { - logger.warn("No Genomic Super Duplications file found {}", EtlCommons.GSD_FILE); - logger.warn("Skipping Genomic Super Duplications file parsing. " - + "Genomic Super Duplications data models will not be built."); + // Check Window Masker (WM) file + String wmFilename = null; + if (dataList.contains(WM_DATA)) { + wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(wmFilename))) { + throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); + } } - if (Files.exists(filesDir.resolve(EtlCommons.WM_FILE))) { - parseWmFile(filesDir.resolve(EtlCommons.WM_FILE)); - } else { - logger.warn("No WindowMasker file found {}", EtlCommons.WM_FILE); - logger.warn("Skipping WindowMasker file parsing. 
WindowMasker data models will not be built."); + // Parse TRF file + if (dataList.contains(TRF_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(TRF_DATA)); + parseTrfFile(filesDir.resolve(trfFilename)); + } + + // Parse GSD file + if (dataList.contains(GSD_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(GSD_DATA)); + parseGsdFile(filesDir.resolve(gsdFilename)); + } + + // Parse WM file + if (dataList.contains(WM_DATA)) { + logger.info(PARSING_LOG_MESSAGE, getDataName(WM_DATA)); + parseWmFile(filesDir.resolve(wmFilename)); } - logger.info("Done."); } - private void parseTrfFile(Path filePath) throws IOException { + private void parseTrfFile(Path filePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed TRF lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(TRF_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); line = bufferedReader.readLine(); progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseTrfLine(String line) { @@ -90,21 +130,23 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_DATA); } - private void parseGsdFile(Path filePath) throws IOException { + private void parseGsdFile(Path filePath) throws IOException, CellBaseException { + 
logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed GSD lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(GSD_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); line = bufferedReader.readLine(); progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseGSDLine(String line) { @@ -112,28 +154,39 @@ private Repeat parseGSDLine(String line) { return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD); + null, GSD_DATA); } - private void parseWmFile(Path filePath) throws IOException { + private void parseWmFile(Path filePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, filePath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed WM lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(WM_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); line = bufferedReader.readLine(); progressLogger.increment(1); } } + logger.info(PARSING_DONE_LOG_MESSAGE); } private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, 
WM); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_DATA); + } + + private String getMessageMissingFile(String data, String filename, Path folder) throws CellBaseException { + return getDataName(data) + " file " + filename + " does not exist at " + folder; + } + + private String getMessageParsedLines(String data) throws CellBaseException { + return "Parsed " + getDataName(data) + " lines:"; } } + diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 2ccf0cb2a1..4f0dac0a81 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -19,8 +19,8 @@ import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.Path; @@ -30,75 +30,95 @@ import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; -public class RevelScoreBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private Path revelFilePath = null; - private static final String SOURCE = "revel"; +public class RevelScoreBuilder extends AbstractBuilder { - public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { - super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel-v1.3_all_chromosomes.zip"); - logger = LoggerFactory.getLogger(ConservationBuilder.class); + private Path revelDownloadPath = null; + public RevelScoreBuilder(Path revelDownloadPath, CellBaseSerializer serializer) { + super(serializer); + this.revelDownloadPath = 
revelDownloadPath; } @Override - public void parse() throws IOException { - logger.error("processing Revel file at " + revelFilePath.toAbsolutePath()); - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFilePath))); + public void parse() throws IOException, CellBaseException { + String dataName = getDataName(REVEL_DATA); + String dataCategory = getDataCategory(REVEL_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(revelDownloadPath, dataName); + + // Check Revel data files + List revelFiles = checkFiles(dataSourceReader.readValue(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA)) + .toFile()), revelDownloadPath, dataName); + if (revelFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); + } + + logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); + + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFiles.get(0)))); ZipEntry zipEntry = zis.getNextEntry(); - ZipFile zipFile = new ZipFile(String.valueOf(revelFilePath)); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); InputStream inputStream = zipFile.getInputStream(zipEntry); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); - - // skip header - String line = bufferedReader.readLine(); - String[] fields = null; - String lastEntry = null; - String currentEntry = null; - List scores = new ArrayList<>(); - MissenseVariantFunctionalScore predictions = null; - while ((line = bufferedReader.readLine()) != null) { - fields = line.split(","); - String chromosome = fields[0]; - if (".".equalsIgnoreCase(fields[2])) { - // 1,12855835,.,C,A,A,D,0.175 - // skip if invalid position - continue; - } - int position = Integer.parseInt(fields[2]); - String reference = fields[3]; - String alternate = fields[4]; - String aaReference = fields[5]; - String
aaAlternate = fields[6]; - double score = Double.parseDouble(fields[7]); - - currentEntry = chromosome + position; - - // new chromosome + position, store previous entry - if (lastEntry != null && !currentEntry.equals(lastEntry)) { - serializer.serialize(predictions); - scores = new ArrayList<>(); - predictions = null; + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { + // Skip header + bufferedReader.readLine(); + String[] fields; + String lastEntry = null; + String currentEntry; + List scores = new ArrayList<>(); + MissenseVariantFunctionalScore predictions = null; + String line; + while ((line = bufferedReader.readLine()) != null) { + fields = line.split(","); + String chromosome = fields[0]; + if (".".equalsIgnoreCase(fields[2])) { + // 1,12855835,.,C,A,A,D,0.175 + // skip if invalid position + continue; + } + int position = Integer.parseInt(fields[2]); + String reference = fields[3]; + String alternate = fields[4]; + String aaReference = fields[5]; + String aaAlternate = fields[6]; + double score = Double.parseDouble(fields[7]); + + currentEntry = chromosome + position; + + // new chromosome + position, store previous entry + if (lastEntry != null && !currentEntry.equals(lastEntry)) { + serializer.serialize(predictions); + scores = new ArrayList<>(); + predictions = null; + } + + if (predictions == null) { + predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, REVEL_DATA, scores); + } + + TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", alternate, + aaReference, aaAlternate, score); + scores.add(predictedScore); + lastEntry = chromosome + position; } - if (predictions == null) { - predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, SOURCE, scores); - } - - TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", - alternate, aaReference, aaAlternate, 
score); - scores.add(predictedScore); - lastEntry = chromosome + position; + // Serialise last entry + serializer.serialize(predictions); } - // serialise last entry - serializer.serialize(predictions); + logger.info(PARSING_DONE_LOG_MESSAGE, revelFiles.get(0)); + // Close zis.close(); zipFile.close(); inputStream.close(); - bufferedReader.close(); + + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java index cf8351cc54..3a178b4828 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java @@ -60,8 +60,11 @@ public RocksDB getDBConnection(String dbLocation) { Options options = new Options().setCreateIfMissing(true); RocksDB db = null; try { + if (!Files.exists(Paths.get(dbLocation))) { + Files.createDirectories(Paths.get(dbLocation)); + } return RocksDB.open(options, dbLocation); - } catch (RocksDBException e) { + } catch (RocksDBException | IOException e) { // do some error handling e.printStackTrace(); System.exit(1); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java index ddff52328b..bbd82344e7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/SpliceBuilder.java @@ -24,7 +24,6 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; @@ -35,9 +34,13 @@ 
import java.io.File; import java.io.IOException; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; -public class SpliceBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.MMSPLICE_DATA; +import static org.opencb.cellbase.lib.EtlCommons.SPLICEAI_DATA; + +public class SpliceBuilder extends AbstractBuilder { private Path spliceDir; private CellBaseFileSerializer fileSerializer; @@ -58,14 +61,14 @@ public void parse() throws Exception { logger.info("Parsing splice files..."); - Path splicePath = spliceDir.resolve(EtlCommons.MMSPLICE_SUBDIRECTORY); + Path splicePath = spliceDir.resolve(MMSPLICE_DATA); if (splicePath.toFile().exists()) { logger.info("Parsing MMSplice data..."); mmspliceParser(splicePath); } else { logger.debug("MMSplice data not found: " + splicePath); } - splicePath = spliceDir.resolve(EtlCommons.SPLICEAI_SUBDIRECTORY); + splicePath = spliceDir.resolve(SPLICEAI_DATA); if (splicePath.toFile().exists()) { logger.info("Parsing SpliceAI data..."); spliceaiParser(splicePath); @@ -85,7 +88,7 @@ public void parse() throws Exception { */ private void mmspliceParser(Path mmsplicePath) throws IOException { // Check output folder: MMSplice - Path mmspliceOutFolder = fileSerializer.getOutdir().resolve(EtlCommons.MMSPLICE_SUBDIRECTORY); + Path mmspliceOutFolder = fileSerializer.getOutdir().resolve(MMSPLICE_DATA); if (!mmspliceOutFolder.toFile().exists()) { mmspliceOutFolder.toFile().mkdirs(); } @@ -177,7 +180,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException { } // Dump rocksDB to JSON file - dumpRocksDB(EtlCommons.MMSPLICE_SUBDIRECTORY + "/splice_score_mmsplice_chr", rocksDB); + dumpRocksDB(MMSPLICE_DATA + "/splice_score_mmsplice_chr", rocksDB); // Clean up rocksDB.close(); @@ -195,7 +198,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException { */ private void spliceaiParser(Path spliceaiPath) throws IOException { // Check output folder: 
MMSplice - Path spliceaiOutFolder = fileSerializer.getOutdir().resolve(EtlCommons.SPLICEAI_SUBDIRECTORY); + Path spliceaiOutFolder = fileSerializer.getOutdir().resolve(SPLICEAI_DATA); if (!spliceaiOutFolder.toFile().exists()) { spliceaiOutFolder.toFile().mkdirs(); } @@ -292,7 +295,7 @@ private void spliceaiParser(Path spliceaiPath) throws IOException { } // Dump rocksDB to JSON file - dumpRocksDB(EtlCommons.SPLICEAI_SUBDIRECTORY + "/splice_score_spliceai_chr", rocksDB); + dumpRocksDB(SPLICEAI_DATA + "/splice_score_spliceai_chr", rocksDB); // Clean up rocksDB.close(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java index 087a4aed36..33ffe2e337 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -16,33 +16,172 @@ package org.opencb.cellbase.lib.builders; +import org.apache.commons.collections4.MapUtils; +import org.opencb.biodata.formats.variant.io.VariantReader; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.VariantFileMetadata; +import org.opencb.biodata.models.variant.avro.AdditionalAttribute; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.biodata.models.variant.avro.Xref; +import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; +import org.opencb.biodata.tools.variant.VariantNormalizer; +import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; -import org.slf4j.LoggerFactory; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; +import java.io.IOException; +import java.nio.file.DirectoryStream; 
+import java.nio.file.Files; import java.nio.file.Path; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.DBSNP_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS; /** - * Created by imedina on 06/11/15. + * Created by jtarraga on 01/08/24. */ -public class VariationBuilder extends CellBaseBuilder { +public class VariationBuilder extends AbstractBuilder { + + private Path downloadPath; + private String species; private DbSnpBuilder dbSnpBuilder; - public VariationBuilder(Path downloadVariationPath, CellBaseFileSerializer fileSerializer, CellBaseConfiguration configuration) { + public static final String VARIATION_CHR_PREFIX = "variation_chr"; + public static final String VCF_ID_KEY = "VCF_ID"; + public static final String EVA_PREFIX = "EVA_"; + public static final String RS_PREFIX = "rs"; + + private static final String VARIANTS_PARSED_LOG_MESSAGE = "{} variants parsed"; + + public static final Map SV_VALUES_MAP; + + static { + Map tempMap = new HashMap<>(); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + tempMap.put("", ""); + SV_VALUES_MAP = Collections.unmodifiableMap(tempMap); + } + + public VariationBuilder(Path downloadPath, String species, CellBaseFileSerializer fileSerializer, CellBaseConfiguration configuration) { super(fileSerializer); + this.downloadPath = downloadPath; + this.species = species; + // dbSNP DownloadProperties.URLProperties dbSnpUrlProperties = configuration.getDownload().getDbSNP(); - dbSnpBuilder = new DbSnpBuilder(downloadVariationPath, dbSnpUrlProperties, fileSerializer); - - logger = LoggerFactory.getLogger(VariationBuilder.class); + dbSnpBuilder = new DbSnpBuilder(downloadPath.resolve(DBSNP_DATA), dbSnpUrlProperties, fileSerializer); } @Override public void parse() throws Exception { - // Parsing dbSNP data - dbSnpBuilder.parse(); + if 
(species.equalsIgnoreCase(HOMO_SAPIENS)) { + // Parsing dbSNP data + dbSnpBuilder.parse(); + } else { + // Parsing VCF files + parseVcf(); + } + } + + private void parseVcf() throws IOException { + VariantNormalizer.VariantNormalizerConfig normalizerConfig = new VariantNormalizer.VariantNormalizerConfig() + .setReuseVariants(true) + .setNormalizeAlleles(true) + .setDecomposeMNVs(false); + + CellBaseJsonFileSerializer fileSerializer = (CellBaseJsonFileSerializer) this.serializer; + + // Usually we expect two VCF files prefixed by the species scientific name + // e.g., for 'Mus musculus' the VCF files are 'mus_musculus.vcf.gz' and 'mus_musculus_structural_variations.vcf.gz' + String prefix = species.toLowerCase(Locale.ROOT).replace(" ", "_"); + + try (DirectoryStream vcfPaths = Files.newDirectoryStream(downloadPath, + entry -> entry.getFileName().toString().startsWith(prefix))) { + for (Path vcfPath : vcfPaths) { + logger.info(PARSING_LOG_MESSAGE, vcfPath); + + VariantStudyMetadata variantStudyMetadata = new VariantFileMetadata(vcfPath.getFileName().toString(), + vcfPath.toAbsolutePath().toString()).toVariantStudyMetadata(""); + VariantReader variantVcfReader = new VariantVcfHtsjdkReader(vcfPath, variantStudyMetadata, + new VariantNormalizer(normalizerConfig)); + + // Write variant to the JSON files according to the chromosome + int count = 0; + Iterator iterator = variantVcfReader.iterator(); + while (iterator.hasNext()) { + Variant variant = iterator.next(); + // Convert alternate for structural variants + if (SV_VALUES_MAP.containsKey(variant.getAlternate())) { + variant.setAlternate(SV_VALUES_MAP.get(variant.getAlternate())); + } + // Set variant ID (after converting the alternate) + variant.setId(variant.toString()); + // Set variant annotation: chrom, start, end, ref, alt, xrefs and additional attributes + VariantAnnotation variantAnnotation = new VariantAnnotation(); + variantAnnotation.setChromosome(variant.getChromosome()); + 
variantAnnotation.setStart(variant.getStart()); + variantAnnotation.setEnd(variant.getEnd()); + variantAnnotation.setReference(variant.getReference()); + variantAnnotation.setAlternate(variant.getAlternate()); + try { + Xref xref = null; + Map attributes = new HashMap<>(); + Map data = variant.getStudies().get(0).getFiles().get(0).getData(); + for (Map.Entry entry : data.entrySet()) { + if (entry.getKey().startsWith(EVA_PREFIX)) { + if (xref == null && data.containsKey(VCF_ID_KEY) && data.get(VCF_ID_KEY).startsWith(RS_PREFIX)) { + xref = new Xref(data.get(VCF_ID_KEY), entry.getKey()); + } + } else if (!entry.getKey().equals(VCF_ID_KEY)) { + attributes.put(entry.getKey(), entry.getValue()); + } + } + if (xref != null) { + variantAnnotation.setXrefs(Collections.singletonList(xref)); + } + if (MapUtils.isNotEmpty(attributes)) { + AdditionalAttribute additionalAttribute = new AdditionalAttribute(attributes); + Map additionalAttributeMap = new HashMap<>(); + additionalAttributeMap.put(vcfPath.getFileName().toString(), additionalAttribute); + variantAnnotation.setAdditionalAttributes(additionalAttributeMap); + } + } catch (Exception e) { + logger.warn("Error setting annotation for variant {}: {}", variant.getId(), Arrays.toString(e.getStackTrace())); + } + if (variantAnnotation != null) { + variant.setAnnotation(variantAnnotation); + } + variant.setAnnotation(variantAnnotation); + + // Remove study info + variant.setStudies(null); + + // Serialize + fileSerializer.serialize(variant, VARIATION_CHR_PREFIX + variant.getChromosome()); + if (++count % 1000000 == 0) { + logger.info(VARIANTS_PARSED_LOG_MESSAGE, count); + } + } + variantVcfReader.close(); + + logger.info(VARIANTS_PARSED_LOG_MESSAGE, count); + logger.info(PARSING_DONE_LOG_MESSAGE); + } + } + + fileSerializer.close(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index a31bd8d5e6..951ea5c530 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -41,11 +41,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION; - -//import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*; - /** * Created by fjlopez on 28/09/16. */ @@ -78,11 +73,15 @@ public class ClinVarIndexer extends ClinicalIndexer { private static final String DIPLOTYPE = "Diplotype"; private static final String VARIANT = "Variant"; private static final char CLINICAL_SIGNIFICANCE_SEPARATOR = '/'; + private final Path clinvarXMLFiles; private final Path clinvarSummaryFile; private final Path clinvarVariationAlleleFile; private final Path clinvarEFOFile; + + private final String version; private final String assembly; + private int numberSomaticRecords = 0; private int numberGermlineRecords = 0; private int numberNoDiseaseTrait = 0; @@ -94,15 +93,15 @@ public class ClinVarIndexer extends ClinicalIndexer { private static final Set RECESSIVE_TERM_SET = new HashSet<>(Arrays.asList(ModeOfInheritance.biallelic)); - public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, - Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly, - RocksDB rdb) throws IOException { + public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, Path clinvarEFOFile, + String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); this.rdb = rdb; this.clinvarXMLFiles = clinvarXMLFiles; 
this.clinvarSummaryFile = clinvarSummaryFile; this.clinvarVariationAlleleFile = clinvarVariationAlleleFile; this.clinvarEFOFile = clinvarEFOFile; + this.version = version; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; @@ -310,7 +309,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation String mateVariantString, String clinicalHaplotypeString, Map traitsToEfoTermsMap) { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // Create a set to avoid situations like germline;germline;germline List alleleOrigin = null; if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) { @@ -391,7 +390,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java index e44ce53e90..4a95b65757 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarParser.java @@ -20,7 +20,7 @@ import org.opencb.biodata.formats.variant.clinvar.rcv.v64jaxb.*; import org.opencb.cellbase.core.common.clinical.ClinvarPublicSet; import 
org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; @@ -45,7 +45,7 @@ * Created by imedina on 26/09/14. */ @Deprecated -public class ClinVarParser extends CellBaseBuilder { +public class ClinVarParser extends AbstractBuilder { private static final String ASSEMBLY_PREFIX = "GRCh"; public static final String GRCH37_ASSEMBLY = "37"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..3f6e87b89c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -83,7 +83,7 @@ public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { .setDecomposeMNVs(true); if (genomeSequenceFilePath != null) { - logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); + logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath); variantNormalizerConfig.enableLeftAlign(genomeSequenceFilePath.toString()); } else { logger.info("Left alignment is NOT enabled."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index f574133ad7..9b3457dc78 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -19,165 +19,159 @@ import 
com.fasterxml.jackson.databind.ObjectMapper; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; +import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by fjlopez on 26/09/16. */ -public class ClinicalVariantBuilder extends CellBaseBuilder { - - private final Path clinvarXMLFile; - private final Path clinvarSummaryFile; - private final Path clinvarVariationAlleleFile; - private final Path clinvarEFOFile; - private final Path cosmicFile; - private final Path gwasFile; - private final Path dbsnpFile; +public class ClinicalVariantBuilder extends AbstractBuilder { + + private final Path clinicalVariantPath; private final String assembly; - private final Path iarctp53GermlineFile; - private final Path iarctp53SomaticFile; - private final Path iarctp53GermlineReferencesFile; - private final Path iarctp53SomaticReferencesFile; private final Path genomeSequenceFilePath; - private final Path docmFile; - private final Path hgmdFile; - private boolean normalize = true; + private boolean normalize; - public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, - String assembly, CellBaseSerializer serializer) { - this(clinicalVariantFolder.resolve(EtlCommons.CLINVAR_XML_FILE), - 
clinicalVariantFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE), - clinicalVariantFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE), - clinicalVariantFolder.resolve(EtlCommons.CLINVAR_EFO_FILE), - clinicalVariantFolder.resolve(EtlCommons.COSMIC_FILE), - clinicalVariantFolder.resolve(EtlCommons.GWAS_FILE), - clinicalVariantFolder.resolve(EtlCommons.DBSNP_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_GERMLINE_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_GERMLINE_REFERENCES_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_SOMATIC_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_SOMATIC_REFERENCES_FILE), - clinicalVariantFolder.resolve(EtlCommons.DOCM_FILE), - clinicalVariantFolder.resolve(EtlCommons.HGMD_FILE), - normalize, - genomeSequenceFilePath, assembly, serializer); - } + private Path clinvarFullReleaseFilePath; + private Path clinvarSummaryFilePath; + private Path clinvarVariationAlleleFilePath; + private Path clinvarEFOFilePath; + private Path cosmicFilePath; + private Path hgmdFilePath; + private Path gwasFilePath; + private Path gwasDbSnpFilePath; - public ClinicalVariantBuilder(Path clinvarXMLFile, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, - Path clinvarEFOFile, Path cosmicFile, Path gwasFile, Path dbsnpFile, - Path iarctp53GermlineFile, Path iarctp53GermlineReferencesFile, - Path iarctp53SomaticFile, Path iarctp53SomaticReferencesFile, Path docmFile, Path hgmdFile, - boolean normalize, Path genomeSequenceFilePath, String assembly, - CellBaseSerializer serializer) { + private final CellBaseConfiguration configuration; + + public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, + String assembly, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); - this.clinvarXMLFile = clinvarXMLFile; - this.clinvarSummaryFile = clinvarSummaryFile; - 
this.clinvarVariationAlleleFile = clinvarVariationAlleleFile; - this.clinvarEFOFile = clinvarEFOFile; - this.cosmicFile = cosmicFile; - this.gwasFile = gwasFile; - this.dbsnpFile = dbsnpFile; - this.iarctp53GermlineFile = iarctp53GermlineFile; - this.iarctp53GermlineReferencesFile = iarctp53GermlineReferencesFile; - this.iarctp53SomaticFile = iarctp53SomaticFile; - this.iarctp53SomaticReferencesFile = iarctp53SomaticReferencesFile; - this.docmFile = docmFile; - this.hgmdFile = hgmdFile; + this.clinicalVariantPath = clinicalVariantFolder; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; + this.configuration = configuration; + } + + public void check() throws CellBaseException, IOException { + if (checked) { + return; + } + + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + + // Sanity check + checkDirectory(clinicalVariantPath, getDataName(CLINICAL_VARIANT_DATA)); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check genome file + logger.info("Checking genome FASTA file ..."); + if (!Files.exists(genomeSequenceFilePath)) { + throw new CellBaseException("Genome file path does not exist " + genomeSequenceFilePath); + } + logger.info(OK_MSG); + logger.info("Checking index for genome FASTA file ..."); + getIndexFastaReferenceGenome(genomeSequenceFilePath); + logger.info(OK_MSG); + + // Check ClinVar files + clinvarFullReleaseFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_FULL_RELEASE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarSummaryFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_SUMMARY_FILE_ID, + clinicalVariantPath).toPath(); + clinvarVariationAlleleFilePath = checkFile(CLINVAR_DATA, 
configuration.getDownload().getClinvar(), CLINVAR_ALLELE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarEFOFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_EFO_TERMS_FILE_ID, + clinicalVariantPath).toPath(); + + // Check COSMIC file + cosmicFilePath = checkFiles(COSMIC_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check HGMD file + hgmdFilePath = checkFiles(HGMD_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check GWAS files + gwasFilePath = checkFiles(GWAS_DATA, clinicalVariantPath, 1).get(0).toPath(); + String dbSnpFilename = Paths.get(configuration.getDownload().getGwasCatalog().getFiles().get(GWAS_DBSNP_FILE_ID)).getFileName() + .toString(); + gwasDbSnpFilePath = clinicalVariantPath.resolve(dbSnpFilename); + if (!Files.exists(gwasDbSnpFilePath)) { + throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbSnpFilename + " is missing at " + + clinicalVariantPath); + } + if (!Files.exists(clinicalVariantPath.resolve(dbSnpFilename + TBI_EXTENSION))) { + throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + dbSnpFilename + TBI_EXTENSION + + " is missing at " + clinicalVariantPath); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + checked = true; } - public void parse() throws IOException, RocksDBException { + public void parse() throws IOException, RocksDBException, CellBaseException { + check(); + + // Prepare ClinVar chunk files before building (if necessary) + Path chunksPath = serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY); + if (Files.notExists(chunksPath)) { + Files.createDirectories(chunksPath); + logger.info("Splitting ClinVar file {} in {} ...", clinvarFullReleaseFilePath, chunksPath); + splitClinvar(clinvarFullReleaseFilePath, chunksPath); + logger.info(OK_MSG); + } RocksDB rdb = null; Options dbOption = null; String dbLocation = null; try { - Object[] dbConnection =
getDBConnection(clinvarXMLFile.getParent().toString() + "/integration.idx", true); + Object[] dbConnection = getDBConnection(clinicalVariantPath.toString() + "/integration.idx", true); rdb = (RocksDB) dbConnection[0]; dbOption = (Options) dbConnection[1]; dbLocation = (String) dbConnection[2]; // COSMIC - // IMPORTANT: COSMIC must be indexed first (before ClinVar, IARC TP53, DOCM, HGMD,...)!!! - if (this.cosmicFile != null && Files.exists(this.cosmicFile)) { - CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, normalize, genomeSequenceFilePath, assembly, rdb); - cosmicIndexer.index(); - } else { - logger.warn("Cosmic file {} missing. Skipping Cosmic data", cosmicFile); - } + // IMPORTANT: COSMIC must be indexed first (before ClinVar, HGMD,...)!!! + CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFilePath, configuration.getDownload().getCosmic().getVersion(), + normalize, genomeSequenceFilePath, assembly, rdb); + cosmicIndexer.index(); // ClinVar - if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null - && this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile) - && Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) { - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, - clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb); - clinvarIndexer.index(); - } else { - logger.warn("One or more of required ClinVar files are missing. 
Skipping ClinVar data.\n" - + "Please, ensure that these two files exist:\n" - + "{}\n" - + "{}", this.clinvarXMLFile.toString(), this.clinvarSummaryFile.toString()); - } - - // IARC TP53 - if (this.iarctp53GermlineFile != null && this.iarctp53SomaticFile != null - && Files.exists(iarctp53GermlineFile) && Files.exists(iarctp53SomaticFile)) { - IARCTP53Indexer iarctp53Indexer = new IARCTP53Indexer(iarctp53GermlineFile, - iarctp53GermlineReferencesFile, iarctp53SomaticFile, iarctp53SomaticReferencesFile, - normalize, genomeSequenceFilePath, assembly, rdb); - iarctp53Indexer.index(); - } else { - logger.warn("One or more of required IARCTP53 files are missing. Skipping IARCTP53 data."); - } - - // DOCM - if (this.docmFile != null && Files.exists(docmFile)) { - DOCMIndexer docmIndexer = new DOCMIndexer(docmFile, normalize, genomeSequenceFilePath, assembly, rdb); - docmIndexer.index(); - } else { - logger.warn("The DOCM file {} is missing. Skipping DOCM data.", docmFile); - } + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY), + clinvarSummaryFilePath, clinvarVariationAlleleFilePath, clinvarEFOFilePath, configuration.getDownload().getClinvar() + .getVersion(), normalize, genomeSequenceFilePath, assembly, rdb); + clinvarIndexer.index(); // HGMD - if (this.hgmdFile != null && Files.exists(hgmdFile)) { - HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, normalize, genomeSequenceFilePath, assembly, rdb); - hgmdIndexer.index(); - } else { - logger.warn("The HGMD file {} is missing. 
Skipping HGMD data.", hgmdFile); - } + HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFilePath, configuration.getDownload().getHgmd().getVersion(), normalize, + genomeSequenceFilePath, assembly, rdb); + hgmdIndexer.index(); // GWAS catalog - if (gwasFile != null && Files.exists(gwasFile)) { - if (dbsnpFile != null && Files.exists(dbsnpFile)) { - Path tabixFile = Paths.get(dbsnpFile.toAbsolutePath() + ".tbi"); - if (tabixFile != null && Files.exists(tabixFile)) { - GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbsnpFile, genomeSequenceFilePath, assembly, rdb); - gwasIndexer.index(); - } else { - logger.warn("The dbSNP tabix file {} is missing. Skipping GWAS catalog data.", tabixFile); - } - } else { - logger.warn("The dbSNP file {} is missing. Skipping GWAS catalog data.", dbsnpFile); - } - } else { - logger.warn("The GWAS catalog file {} is missing. Skipping GWAS catalog data.", gwasFile); - } + GwasIndexer gwasIndexer = new GwasIndexer(gwasFilePath, gwasDbSnpFilePath, genomeSequenceFilePath, assembly, rdb); + gwasIndexer.index(); + // Serialize serializeRDB(rdb); closeIndex(rdb, dbOption, dbLocation); serializer.close(); @@ -186,7 +180,6 @@ public void parse() throws IOException, RocksDBException { serializer.close(); throw e; } - } private void serializeRDB(RocksDB rdb) throws IOException { @@ -223,7 +216,7 @@ private Variant parseVariantFromVariantId(String variantId) { return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]); } } catch (Exception e) { - logger.warn(e.getMessage() + ". Impossible to create the variant object from the variant ID: " + variantId); + logger.warn("{}. 
Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId); return null; } } @@ -275,4 +268,53 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { } + private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + if (pw != null) { + pw.print(""); + pw.close(); + } + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java index 0a8931b536..e103385556 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicBuilder.java @@ -16,7 +16,7 @@ package org.opencb.cellbase.lib.builders.clinical.variant; -import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.cellbase.lib.builders.AbstractBuilder; import org.opencb.cellbase.core.common.clinical.Cosmic; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; @@ -37,7 +37,7 @@ * @since October 08, 2014 */ @Deprecated -public class CosmicBuilder extends CellBaseBuilder { +public class CosmicBuilder extends AbstractBuilder { private final Path cosmicFilePath; private static final String CHROMOSOME = "CHR"; diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index f8d2f16d15..51be2b6f31 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -37,12 +37,12 @@ public class CosmicIndexer extends ClinicalIndexer { private final Path cosmicFile; + private final String version; private final String assembly; + private Pattern mutationGRCh37GenomePositionPattern; private Pattern snvPattern; - private static final String COSMIC_VERSION = "v95"; - private static final int GENE_NAMES_COLUMN = 0; private static final int HGNC_COLUMN = 3; private static final int PRIMARY_SITE_COLUMN = 7; @@ -84,10 +84,12 @@ public class CosmicIndexer extends ClinicalIndexer { private int rocksDBNewVariants = 0; private int rocksDBUpdateVariants = 0; - public CosmicIndexer(Path cosmicFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { + public CosmicIndexer(Path cosmicFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) + throws IOException { super(genomeSequenceFilePath); this.cosmicFile = cosmicFile; + this.version = version; this.normalize = normalize; this.assembly = assembly; this.rdb = rdb; @@ -469,7 +471,7 @@ private EvidenceEntry buildCosmic(String[] fields) { String id = fields[ID_COLUMN]; String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, COSMIC_VERSION, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, version, null); SomaticInformation somaticInformation = getSomaticInformation(fields); List genomicFeatureList = getGenomicFeature(fields); diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java index b77f238432..a150e042dd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java @@ -178,7 +178,7 @@ private VariantAnnotation parseVariantAnnotation(Map map) { List bibliography = getBibliography(evidenceEntry); bibliography.add(PMID + diseaseMap.get(SOURCE_PUBMED_ID)); } else { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.DOCM_DATA, null, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.DOCM_NAME, null, null); HeritableTrait heritableTrait = new HeritableTrait((String) diseaseMap.get(DISEASE), null); List genomicFeatureList = getGenomicFeature(map); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java index d2ce12dee8..f132f4b9e8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java @@ -36,15 +36,17 @@ */ public class HGMDIndexer extends ClinicalIndexer { private final Path hgmdFile; + private final String version; private final String assembly; - public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) + public HGMDIndexer(Path hgmdFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); - this.rdb = rdb; - this.assembly = assembly; this.hgmdFile = hgmdFile; + this.version = version; this.normalize = normalize; + this.assembly = 
assembly; + this.rdb = rdb; } public void index() throws RocksDBException, IOException { @@ -93,7 +95,7 @@ private void parseHgmdInfo(Variant variant) { } // Source - entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, "2020.3", "2020")); + entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, null)); // Assembly entry.setAssembly(assembly); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java index d78c0446c8..6c5d4cf679 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/db/MongoDBManager.java @@ -26,6 +26,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.core.utils.DatabaseNameUtils; import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.impl.core.ReleaseMongoDBAdaptor; import org.opencb.commons.datastore.core.DataStoreServerAddress; @@ -49,10 +50,8 @@ public class MongoDBManager { - public static final String DBNAME_SEPARATOR = "_"; - private MongoDataStoreManager mongoDataStoreManager; - private CellBaseConfiguration cellBaseConfiguration; + private final CellBaseConfiguration cellBaseConfiguration; private Logger logger; @@ -99,19 +98,19 @@ public MongoDataStore createMongoDBDatastore(String speciesStr, String assemblyS // cellbase_speciesId_assembly_cellbaseVersion // Example: // cellbase_hsapiens_grch37_v3 - String database = getDatabaseName(species.getId(), species.getAssembly(), cellBaseConfiguration.getVersion()); + String database = DatabaseNameUtils.getDatabaseName(species.getId(), species.getAssembly(), cellBaseConfiguration.getVersion()); logger.debug("Database for the species is '{}'", database); return createMongoDBDatastore(database); } catch 
(CellBaseException e) { e.printStackTrace(); logger.error("Species name is not valid: '{}'. Valid species: {}", speciesStr, - String.join(",", cellBaseConfiguration.getAllSpecies().stream().map((tmpSpeciesObject) - -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) + String.join(",", SpeciesUtils.getAllSpecies(cellBaseConfiguration).stream().map((tmpSpeciesObject) + -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) .collect(Collectors.toList()))); throw new InvalidParameterException("Species name is not valid: '" + speciesStr + "'. Please provide one" + " of supported species: {" - + String.join(",", cellBaseConfiguration.getAllSpecies().stream().map((tmpSpeciesObject) - -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) + + String.join(",", SpeciesUtils.getAllSpecies(cellBaseConfiguration).stream().map((tmpSpeciesObject) + -> (tmpSpeciesObject.getCommonName() + "|" + tmpSpeciesObject.getScientificName())) .collect(Collectors.toList())) + "}"); } } @@ -162,29 +161,6 @@ public MongoDataStore createMongoDBDatastore(String database) { return mongoDatastore; } - public static String getDatabaseName(String species, String assembly, String version) { - if (StringUtils.isEmpty(species) || StringUtils.isEmpty(assembly)) { - throw new InvalidParameterException("Species and assembly are required"); - } - - String cleanAssembly = assembly - .replaceAll("\\.", "") - .replaceAll("-", "") - .replaceAll("_", ""); - - // Process version from the configuration file, in order to suffix the database name - // - Production environment, e.g.: if version is "v5", the suffix added wil be "_v5" - // - Test environment, e.g.: if version is "v5.6" or "v5.6.0-SNAPSHOT", the suffix added will be "_v5_6" - String auxVersion = version.replace(".", DBNAME_SEPARATOR).replace("-", DBNAME_SEPARATOR); - String[] split = auxVersion.split(DBNAME_SEPARATOR); - String dbName = "cellbase" + 
DBNAME_SEPARATOR + species.toLowerCase() + DBNAME_SEPARATOR + cleanAssembly.toLowerCase() - + DBNAME_SEPARATOR + split[0]; - if (split.length > 1) { - dbName += (DBNAME_SEPARATOR + split[1]); - } - return dbName; - } - public Map getDatabaseStatus(String species, String assembly) { MongoDataStore mongoDatastore = createMongoDBDatastore(species, assembly); try { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a4ade6603e..d88ef5d389 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -24,33 +24,38 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedReader; -import java.io.File; import java.io.FileReader; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.time.LocalDateTime; import java.util.*; -public class AbstractDownloadManager { +import static org.opencb.cellbase.lib.EtlCommons.*; - private static final String DGV_NAME = "DGV"; +public abstract class AbstractDownloadManager { - private static final 
String GNOMAD_NAME = "gnomAD"; + protected static final String DOWNLOADING_MSG = "Downloading {} ..."; + protected static final String DOWNLOADING_DONE_MSG = "Downloading {} done."; + protected static final String CATEGORY_DOWNLOADING_MSG = "Downloading {}/{} ..."; + protected static final String CATEGORY_DOWNLOADING_DONE_MSG = "Downloading {}/{} done."; + protected static final String DOWNLOADING_FROM_TO_MSG = "Downloading {} to {} ..."; + protected static final String DATA_ALREADY_DOWNLOADED_MSG = "The file {} already exists, indicating that the data {} has already been" + + " downloaded."; protected String species; protected String assembly; @@ -66,15 +71,23 @@ public class AbstractDownloadManager { protected Path downloadFolder; protected Path downloadLogFolder; // /download/log protected Path buildFolder; // /_/generated-json + + protected ObjectReader dataSourceReader; + protected ObjectWriter dataSourceWriter; + protected Logger logger; - public AbstractDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) + protected AbstractDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) throws IOException, CellBaseException { this.species = species; this.assembly = assembly; this.outdir = outdir; this.configuration = configuration; + ObjectMapper jsonObjectMapper = new ObjectMapper(); + this.dataSourceReader = jsonObjectMapper.readerFor(DataSource.class); + this.dataSourceWriter = jsonObjectMapper.writerFor(DataSource.class); + this.init(); } @@ -104,83 +117,130 @@ private void init() throws CellBaseException, IOException { // Prepare outdir Path speciesFolder = outdir.resolve(speciesShortName + "_" + assemblyConfiguration.getName().toLowerCase()); downloadFolder = outdir.resolve(speciesFolder + "/download"); - logger.info("Creating download dir " + downloadFolder.toString()); + logger.info("Creating download dir: {}", downloadFolder); Files.createDirectories(downloadFolder); 
downloadLogFolder = outdir.resolve(speciesFolder + "/download/log"); - logger.info("Creating download log dir " + downloadLogFolder.toString()); + logger.info("Creating download log dir: {}", downloadLogFolder); Files.createDirectories(downloadLogFolder); // /_/generated_json buildFolder = outdir.resolve(speciesFolder + "/generated_json"); - logger.info("Creating build dir " + buildFolder.toString()); + logger.info("Creating build dir: {}", buildFolder); Files.createDirectories(buildFolder); - logger.info("Processing species " + speciesConfiguration.getScientificName()); + logger.info("Processing species {}", speciesConfiguration.getScientificName()); } - public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - return null; - } + public abstract List download() throws IOException, InterruptedException, CellBaseException; -// public DownloadFile downloadStructuralVariants() throws IOException, InterruptedException { -// if (!speciesHasInfoToDownload(speciesConfiguration, "svs")) { -// return null; -// } -// if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { -// logger.info("Downloading DGV data ..."); -// -// Path structuralVariantsFolder = downloadFolder.resolve(EtlCommons.STRUCTURAL_VARIANTS_FOLDER); -// Files.createDirectories(structuralVariantsFolder); -// String sourceFilename = (assemblyConfiguration.getName().equalsIgnoreCase("grch37") ? 
"GRCh37_hg19" : "GRCh38_hg38") -// + "_variants_2016-05-15.txt"; -// String url = configuration.getDownload().getDgv().getHost() + "/" + sourceFilename; -// saveVersionData(EtlCommons.STRUCTURAL_VARIANTS_DATA, DGV_NAME, getDGVVersion(sourceFilename), getTimeStamp(), -// Collections.singletonList(url), structuralVariantsFolder.resolve(EtlCommons.DGV_VERSION_FILE)); -// return downloadFile(url, structuralVariantsFolder.resolve(EtlCommons.DGV_FILE).toString()); -// } -// return null; -// } - -// private String getDGVVersion(String sourceFilename) { -// return sourceFilename.split("\\.")[0].split("_")[3]; -// } - - protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { + protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String data) { boolean hasInfo = true; - if (sp.getData() == null || !sp.getData().contains(info)) { - logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), info); + if (sp.getData() == null || !sp.getData().contains(data)) { + logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), data); hasInfo = false; } return hasInfo; } - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); + protected String getConfigurationFileIdPrefix(String scientificSpecies) { + String prefix = ""; + if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { + char c = scientificSpecies.charAt(0); + prefix = (c + scientificSpecies.split(" ")[1] + "_").toUpperCase(); + } + return prefix; } - protected void saveVersionData(String data, String name, String version, String date, List url, Path outputFilePath) - throws IOException { - Map versionDataMap = new HashMap<>(); - versionDataMap.put("data", data); - versionDataMap.put("name", name); - versionDataMap.put("version", version); - versionDataMap.put("date", 
date); - versionDataMap.put("url", url); + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, data, null, outPath); + } - ObjectMapper jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.writeValue(outputFilePath.toFile(), versionDataMap); + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String versionFilename = getDataVersionFilename(data); + + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + Path outPath) throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveEnsemblDataSource(ensemblProps, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), + Collections.singletonList(downloadFile.getUrl()), outPath.resolve(getDataVersionFilename(data))); + + return downloadFile; + } + + protected DownloadFile 
downloadDataSource(DownloadProperties.URLProperties props, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadDataSource(props, fileId, null, outPath); + } + + protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); + Path outFile = outPath.resolve(getFilenameFromUrl(url)); + return downloadFile(url, outFile); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadEnsemblDataSource(ensemblProps, fileId, null, outPath); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), + chromosome); + Path outFile = outPath.resolve(getFilenameFromUrl(url)); + return downloadFile(url, outFile); + } + + protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) + throws IOException, CellBaseException { + String name = getDataName(data); + String category = getDataCategory(data); + DataSource dataSource = new DataSource(data, name, category, version, date, urls); + + if (StringUtils.isEmpty(version)) { + logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date); + dataSource.setVersion(date); + } + + dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource); + logger.info("Created the {} version file {} at {}", getDataName(data), 
versionFilePath.getFileName(), versionFilePath.getParent()); + } + + protected String getTimeStamp() { + return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); } protected String getLine(Path readmePath, int lineNumber) { Files.exists(readmePath); - try { - BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset()); + try (BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset())) { String line = null; for (int i = 0; i < lineNumber; i++) { line = reader.readLine(); } - reader.close(); return line; } catch (IOException e) { e.printStackTrace(); @@ -216,115 +276,107 @@ protected String getPhylo(SpeciesConfiguration sp) { } } - - - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException { - return downloadFile(url, outputFileName, null); + protected DownloadFile downloadFile(String url, Path outputFile) throws IOException, InterruptedException, CellBaseException { + return downloadFile(url, outputFile, null); } - protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) - throws IOException, InterruptedException { - DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); + protected DownloadFile downloadFile(String url, Path outputFile, List wgetAdditionalArgs) + throws IOException, InterruptedException, CellBaseException { + DownloadFile downloadFile = new DownloadFile(url, outputFile.toAbsolutePath().toString(), + Timestamp.valueOf(LocalDateTime.now()).toString()); Long startTime = System.currentTimeMillis(); - if (Paths.get(outputFileName).toFile().exists()) { - logger.warn("File '{}' is already downloaded", outputFileName); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, "File '" + outputFileName + "' is already downloaded", true); + final Path outputLog = 
downloadLogFolder.resolve(outputFile.getFileName().toString() + ".log"); + if (Files.exists(outputFile)) { + logger.warn("File '{}' is already downloaded", outputFile); + setDownloadStatusAndMessage(outputFile, downloadFile, outputLog, true); + downloadFile.setMessage("File is already downloaded"); } else { - final String outputLog = downloadLogFolder + "/" + Paths.get(outputFileName).toFile().getName() + ".log"; - List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, "-O", outputFileName, "-o", outputLog)); + logger.info(DOWNLOADING_FROM_TO_MSG, url, outputFile); + List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, + "-O", outputFile.toAbsolutePath().toString(), + "-o", outputLog.toAbsolutePath().toString())); if (wgetAdditionalArgs != null && !wgetAdditionalArgs.isEmpty()) { wgetArgs.addAll(wgetAdditionalArgs); } boolean downloaded = EtlCommons.runCommandLineProcess(null, "wget", wgetArgs, outputLog); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, outputLog, downloaded); + setDownloadStatusAndMessage(outputFile, downloadFile, outputLog, downloaded); + logger.info(OK_MSG); } - downloadFileInfo.setElapsedTime(startTime, System.currentTimeMillis()); - return downloadFileInfo; + downloadFile.setElapsedTime(startTime, System.currentTimeMillis()); + return downloadFile; } - private void setDownloadStatusAndMessage(String outputFileName, DownloadFile downloadFile, String outputLog, boolean downloaded) { + private void setDownloadStatusAndMessage(Path outputFile, DownloadFile downloadFile, Path logFile, boolean downloaded) { if (downloaded) { - boolean validFileSize = validateDownloadFile(downloadFile, outputFileName, outputLog); + boolean validFileSize = validateDownloadFile(downloadFile, outputFile, logFile); if (validFileSize) { downloadFile.setStatus(DownloadFile.Status.OK); downloadFile.setMessage("File downloaded successfully"); } else { downloadFile.setStatus(DownloadFile.Status.ERROR); downloadFile.setMessage("Expected 
downloaded file size " + downloadFile.getExpectedFileSize() - + ", Actual file size " + downloadFile.getActualFileSize()); + + ", actual file size " + downloadFile.getActualFileSize()); } } else { - downloadFile.setMessage("See full error message in " + outputLog); + downloadFile.setMessage("See full error message in " + logFile); downloadFile.setStatus(DownloadFile.Status.ERROR); - // because we use the -O flag, a file will be written, even on error. See #467 -// Files.deleteIfExists((new File(outputFileName)).toPath()); } } - public static void writeDownloadLogFile(Path downloadFolder, List downloadFiles) throws IOException { + public void writeDownloadLogFile(Map params, List downloadFiles) throws IOException { + // Get current date and time + String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); + Path summaryPath = downloadLogFolder.resolve(timeStamp + "_summary.json"); + + Map summary = new HashMap<>(); + summary.put("params", params); + summary.put("downloadFiles", downloadFiles); + ObjectMapper mapper = new ObjectMapper(); ObjectWriter writer = mapper.writer(new DefaultPrettyPrinter()); - writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles); + writer.writeValue(summaryPath.toFile(), summary); } - private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { - long expectedFileSize = getExpectedFileSize(outputFileLog); - long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); + public boolean isAlreadyDownloaded(Path path, String dataName) { + if (Files.exists(path)) { + logger.info(DATA_ALREADY_DOWNLOADED_MSG, path.getFileName(), dataName); + return true; + } + return false; + } + + private boolean validateDownloadFile(DownloadFile downloadFile, Path outputFile, Path logFile) { + long expectedFileSize = getExpectedFileSize(logFile); + long actualFileSize = FileUtils.sizeOf(outputFile.toFile()); downloadFile.setActualFileSize(actualFileSize); 
downloadFile.setExpectedFileSize(expectedFileSize); return expectedFileSize == actualFileSize; } - private long getExpectedFileSize(String outputFileLog) { - try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { - String line = null; + private long getExpectedFileSize(Path path) { + try (BufferedReader reader = new BufferedReader(new FileReader(path.toFile()))) { + String line; while ((line = reader.readLine()) != null) { // looking for: Length: 13846591 (13M) if (line.startsWith("Length:")) { String[] parts = line.split("\\s"); - return Long.valueOf(parts[1]); + return Long.parseLong(parts[1]); } } } catch (Exception e) { - logger.info("Error getting expected file size " + e.getMessage()); + logger.info("Error getting expected file size: {}. Stack trace: {}", e.getMessage(), Arrays.toString(e.getStackTrace())); } return -1; } - protected String getVersionFromVersionLine(Path path, String tag) { - Files.exists(path); - try { - BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset()); - String line = reader.readLine(); - // There shall be a line at the README.txt containing the version. - // e.g. The files in the current directory contain the data corresponding to the latest release - // (version 4.0, April 2016). ... - while (line != null) { - // tag specifies a certain string that must be found within the line supposed to contain the version - // info - if (line.contains(tag)) { - String version = line.split("\\(")[1].split("\\)")[0]; - reader.close(); - return version; - } - line = reader.readLine(); - } - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - private String getEnsemblURL(SpeciesConfiguration sp) { // We need to find which is the correct Ensembl host URL. // This can different depending on if is a vertebrate species. 
- String ensemblHostUrl; if (configuration.getSpecies().getVertebrates().contains(sp)) { - ensemblHostUrl = configuration.getDownload().getEnsembl().getUrl().getHost(); + return configuration.getDownload().getEnsembl().getUrl().getHost(); } else { - ensemblHostUrl = configuration.getDownload().getEnsemblGenomes().getUrl().getHost(); + return configuration.getDownload().getEnsemblGenomes().getUrl().getHost(); } - return ensemblHostUrl; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index e0cae1250e..c128b1d67d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -18,7 +18,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.core.utils.SpeciesUtils; import java.io.IOException; import java.nio.file.Files; @@ -26,36 +26,35 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class CaddDownloadManager extends AbstractDownloadManager { - private static final String CADD_NAME = "CADD"; public CaddDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); } @Override - public List download() throws IOException, InterruptedException { - return Collections.singletonList(downloadCaddScores()); - } - - public DownloadFile downloadCaddScores() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "variation_functional_score")) { - return null; + public List download() throws IOException, 
InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_FUNCTIONAL_SCORE_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(VARIATION_FUNCTIONAL_SCORE_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); } - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading CADD scores information ..."); - Path variationFunctionalScoreFolder = downloadFolder.resolve("variation_functional_score"); - Files.createDirectories(variationFunctionalScoreFolder); + logger.info(CATEGORY_DOWNLOADING_MSG, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); - // Downloads CADD scores - String url = configuration.getDownload().getCadd().getHost(); + // Create the CADD download path + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Files.createDirectories(caddDownloadPath); - saveVersionData(EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, CADD_NAME, url.split("/")[5], getTimeStamp(), - Collections.singletonList(url), variationFunctionalScoreFolder.resolve("caddVersion.json")); - return downloadFile(url, variationFunctionalScoreFolder.resolve("whole_genome_SNVs.tsv.gz").toString()); - } - return null; + // Download CADD and save data source + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, + caddDownloadPath); + + logger.info(CATEGORY_DOWNLOADING_DONE_MSG, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + + return Collections.singletonList(downloadFile); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index eb1f28db2d..e70e3d297b 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -19,27 +19,20 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; -import javax.ws.rs.client.Client; -import javax.ws.rs.client.ClientBuilder; -import javax.ws.rs.client.WebTarget; -import java.io.*; -import java.net.URI; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; -public class ClinicalDownloadManager extends AbstractDownloadManager { - - private static final String CLINVAR_NAME = "ClinVar"; - private static final String GWAS_NAME = "GWAS catalog"; - private static final String IARCTP53_NAME = "IARC TP53 Database"; +import static org.opencb.cellbase.lib.EtlCommons.*; +public class ClinicalDownloadManager extends AbstractDownloadManager { public ClinicalDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) throws IOException, CellBaseException { @@ -47,199 +40,64 @@ public ClinicalDownloadManager(String species, String assembly, Path outdir, Cel } @Override - public List download() throws IOException, InterruptedException { - List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadClinical()); - return downloadFiles; + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadClinical(); } - public List downloadClinical() throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - 
logger.info("Downloading clinical variant information ..."); - - String url; - List downloadFiles = new ArrayList<>(); - - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); - Files.createDirectories(clinicalFolder); - logger.info("\t\tDownloading ClinVar files ..."); - - List clinvarUrls = new ArrayList<>(3); - url = configuration.getDownload().getClinvar().getHost(); - - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE).toString())); - clinvarUrls.add(url); - - url = configuration.getDownload().getClinvarEfoTerms().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_EFO_FILE).toString())); - clinvarUrls.add(url); - - url = configuration.getDownload().getClinvarSummary().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE).toString())); - clinvarUrls.add(url); - - url = configuration.getDownload().getClinvarVariationAllele().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); - clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); - - // Gwas catalog - logger.info("\t\tDownloading GWAS catalog file ..."); - DownloadProperties.URLProperties gwasCatalog = configuration.getDownload().getGwasCatalog(); - url = gwasCatalog.getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), - Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); - -// List hgvsList = getDocmHgvsList(); -// if (!hgvsList.isEmpty()) { -// downloadDocm(hgvsList, clinicalFolder.resolve(EtlCommons.DOCM_FILE)); -// 
downloadFiles.add(downloadFile(configuration.getDownload().getDocmVersion().getHost(), -// clinicalFolder.resolve("docmIndex.html").toString())); -// saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.DOCM_NAME, -// getDocmVersion(clinicalFolder.resolve("docmIndex.html")), getTimeStamp(), -// Arrays.asList(configuration.getDownload().getDocm().getHost() + "v1/variants.json", -// configuration.getDownload().getDocm().getHost() + "v1/variants/{hgvs}.json"), -// clinicalFolder.resolve("docmVersion.json")); -// } else { -// logger.warn("No DOCM variants found for assembly {}. Please double-check that this is the correct " -// + "assembly", assemblyConfiguration.getName()); -// } - - // I am only able to download these files manually -// if (assemblyConfiguration.getName().equalsIgnoreCase("grch38")) { -// url = configuration.getDownload().getIarctp53().getHost(); -// downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.IARCTP53_FILE).toString())); -// -// ZipFile zipFile = new ZipFile(clinicalFolder.resolve(EtlCommons.IARCTP53_FILE).toString()); -// Enumeration entries = zipFile.entries(); -// while (entries.hasMoreElements()) { -// ZipEntry entry = entries.nextElement(); -// File entryDestination = new File(clinicalFolder.toFile(), entry.getName()); -// if (entry.isDirectory()) { -// entryDestination.mkdirs(); -// } else { -// entryDestination.getParentFile().mkdirs(); -// InputStream in = zipFile.getInputStream(entry); -// OutputStream out = new FileOutputStream(entryDestination); -// IOUtils.copy(in, out); -// IOUtils.closeQuietly(in); -// out.close(); -// } -// } -// saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, IARCTP53_NAME, -// getVersionFromVersionLine(clinicalFolder.resolve("Disclaimer.txt"), -// "The version of the database should be identified"), getTimeStamp(), -// Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json")); -// } - - if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) { - 
Files.createDirectories(clinicalFolder.resolve("clinvar_chunks")); - splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks")); - } - - return downloadFiles; + public List downloadClinical() throws IOException, InterruptedException, CellBaseException { + // Check if the species supports this data + if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CLINICAL_VARIANT_DATA)) { + logger.info(DATA_NOT_SUPPORTED_MSG, getDataName(CLINICAL_VARIANT_DATA), speciesConfiguration.getScientificName()); + return Collections.emptyList(); } - return null; - } - private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); - } + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); - private String getDocmVersion(Path docmIndexHtml) { - return getVersionFromVersionLine(docmIndexHtml, "