diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java index 3f286d2e..9c4088c7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java @@ -22,11 +22,15 @@ import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.hateoas.config.EnableHypermediaSupport; import org.springframework.retry.annotation.EnableRetry; +import org.springframework.scheduling.annotation.EnableAsync; import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.transaction.annotation.EnableTransactionManagement; @EnableScheduling +@EnableAsync @SpringBootApplication @EnableRetry +@EnableTransactionManagement @EnableHypermediaSupport(type = EnableHypermediaSupport.HypermediaType.HAL) public class ContigAliasApplication extends SpringBootServletInitializer { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java b/src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java new file mode 100644 index 00000000..ffd62ea6 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java @@ -0,0 +1,21 @@ +package uk.ac.ebi.eva.contigalias.conf; + +import org.springframework.beans.BeansException; +import org.springframework.context.ApplicationContext; +import org.springframework.context.ApplicationContextAware; +import org.springframework.stereotype.Component; + +@Component +public class ApplicationContextHolder implements ApplicationContextAware { + + private static ApplicationContext applicationContext; + + @Override + public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { + ApplicationContextHolder.applicationContext = applicationContext; + } + + public static ApplicationContext getApplicationContext() { + return applicationContext; + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index 37e2d29a..e35a6ab7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -28,13 +28,13 @@ import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; -import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; +import java.util.Optional; @RequestMapping("/v1/admin") @RestController @@ -63,6 +63,9 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( "GCA_000001405.10") String asmAccession) throws IOException { try { handler.fetchAndInsertAssemblyByAccession(asmAccession); + // submit jobs for updating ena sequence name and md5 checksum for assembly + handler.retrieveAndInsertENASequenceNameForAssembly(asmAccession); + handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession); } catch (IllegalArgumentException e) { return new ResponseEntity<>(e.getMessage(), HttpStatus.BAD_REQUEST); } @@ -82,43 +85,124 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( "parallel manner.") @PutMapping(value = "assemblies") public ResponseEntity fetchAndInsertAssemblyByAccession( - @RequestBody(required = false) @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { if (accessions == null || accessions.size() <= 0) { return new ResponseEntity<>(HttpStatus.BAD_REQUEST); } Map> accessionResult = handler.fetchAndInsertAssemblyByAccession(accessions); + // submit jobs for updating ena sequence names and md5 checksum for all successfully inserted assemblies + if (accessionResult.get("SUCCESS").size() > 0) { + handler.retrieveAndInsertENASequenceNameForAssembly(accessionResult.get("SUCCESS")); + handler.retrieveAndInsertMd5ChecksumForAssembly(accessionResult.get("SUCCESS")); + } return new ResponseEntity<>("Accession Processing Result : " + accessionResult, HttpStatus.MULTI_STATUS); } @ApiOperation(value = "Given an assembly accession, retrieve MD5 checksum for all chromosomes belonging to assembly and update") - @PutMapping(value = "assemblies/{accession}/md5checksum") + @PutMapping(value = "assemblies/md5checksum/{accession}") public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. Eg: " + "GCA_000001405.10") String asmAccession) { - try { - handler.getAssemblyByAccession(asmAccession); - handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession); - return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " + - "in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " + - "this might take some time to complete"); - } catch (AssemblyNotFoundException e) { + Optional assemblyOpt = handler.getAssemblyByAccession(asmAccession); + if (assemblyOpt.isPresent()) { + handler.retrieveAndInsertMd5ChecksumForAssembly(assemblyOpt.get().getInsdcAccession()); + return ResponseEntity.ok("A task has been submitted for updating md5checksum for assembly " + asmAccession + + "\nDepending upon the size of assembly and other scheduled jobs, this might take some time to complete"); + } else { return ResponseEntity.ok("Could not find assembly " + asmAccession + - ". Please insert the assembly first (md5checksum will be updated as part of the insertion process"); + ". Please insert the assembly first. MD5 checksum will be updated as part of the insertion process"); } } - @ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ") - @GetMapping(value = "assemblies/md5checksum/status") - public ResponseEntity getMD5ChecksumUpdateTaskStatus() { - Map> md5ChecksumUpdateTasks = handler.getMD5ChecksumUpdateTaskStatus(); - Set runningTasks = md5ChecksumUpdateTasks.get("running"); - Set scheduledTasks = md5ChecksumUpdateTasks.get("scheduled"); - String runningTaskRes = runningTasks == null || runningTasks.isEmpty() ? "No running MD5 checksum update tasks" : - runningTasks.stream().collect(Collectors.joining(",")); - String scheduledTaskRes = scheduledTasks == null || scheduledTasks.isEmpty() ? "No scheduled MD5 checksum update tasks" : - scheduledTasks.stream().collect(Collectors.joining(",")); - return ResponseEntity.ok("running: " + runningTaskRes + "\nscheduled: " + scheduledTaskRes); + @ApiOperation(value = "Given a list of assembly accessions, retrieve MD5 checksum for all chromosomes belonging to all the assemblies and update") + @PutMapping(value = "assemblies/md5checksum") + public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly( + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { + if (accessions == null || accessions.size() <= 0) { + return new ResponseEntity<>(HttpStatus.BAD_REQUEST); + } + + List asmInsdcAccessionsList = new ArrayList<>(); + List asmNotPresent = new ArrayList<>(); + for (String accession : accessions) { + Optional assemblyOpt = handler.getAssemblyByAccession(accession); + if (assemblyOpt.isPresent()) { + asmInsdcAccessionsList.add(assemblyOpt.get().getInsdcAccession()); + } else { + asmNotPresent.add(accession); + } + } + + handler.retrieveAndInsertMd5ChecksumForAssembly(asmInsdcAccessionsList); + + accessions.removeAll(asmNotPresent); + String responseText = "A task has been submitted for updating MD5 checksum for assemblies: " + accessions + "." + + "\nDepending upon other scheduled jobs and the size of assembly, this might take some time to complete"; + if (!asmNotPresent.isEmpty()) { + responseText = responseText + "\nThe following assemblies are not present: " + asmNotPresent + "." + + "\nPlease insert the assembly first, MD5 Checksum will be updated as part of the insertion process"; + } + + return ResponseEntity.ok(responseText); + } + + @ApiOperation(value = "Given an assembly accession, retrieve ENA sequence name for all chromosomes belonging to assembly and update") + @PutMapping(value = "assemblies/ena-sequence-name/{accession}") + public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathVariable(name = "accession") + @ApiParam(value = "INSDC or RefSeq assembly accession. " + + "Eg: GCA_000001405.10") String asmAccession) { + Optional assemblyOpt = handler.getAssemblyByAccession(asmAccession); + if (assemblyOpt.isPresent()) { + handler.retrieveAndInsertENASequenceNameForAssembly(assemblyOpt.get().getInsdcAccession()); + return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for assembly " + asmAccession + + "\nDepending upon the size of assembly and other scheduled jobs, this might take some time to complete"); + } else { + return ResponseEntity.ok("Could not find assembly " + asmAccession + + ". Please insert the assembly first. ENA sequence name will be updated as part of the insertion process"); + } + } + + @ApiOperation(value = "Given a list of assembly accessions, retrieve ENA sequence name for all chromosomes belonging to all the assemblies and update") + @PutMapping(value = "assemblies/ena-sequence-name") + public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { + if (accessions == null || accessions.size() <= 0) { + return new ResponseEntity<>(HttpStatus.BAD_REQUEST); + } + + List asmInsdcAccessionsList = new ArrayList<>(); + List asmNotPresent = new ArrayList<>(); + for (String accession : accessions) { + Optional assemblyOpt = handler.getAssemblyByAccession(accession); + if (assemblyOpt.isPresent()) { + asmInsdcAccessionsList.add(assemblyOpt.get().getInsdcAccession()); + } else { + asmNotPresent.add(accession); + } + } + + handler.retrieveAndInsertENASequenceNameForAssembly(asmInsdcAccessionsList); + + accessions.removeAll(asmNotPresent); + String responseText = "A task has been submitted for updating ENA Sequence Name for assemblies: " + accessions + + "\nDepending upon other scheduled jobs and the size of assembly, this might take some time to complete"; + if (!asmNotPresent.isEmpty()) { + responseText = responseText + "\nThe following assemblies are not present: " + asmNotPresent + "." + + "\nPlease insert the assembly first, ENA Sequence Name will be updated as part of the insertion process"; + } + + return ResponseEntity.ok(responseText); + } + + + @ApiOperation(value = "Retrieve list of Jobs that are running or scheduled to run") + @GetMapping(value = "assemblies/scheduled-jobs") + public ResponseEntity> getMD5ChecksumUpdateTaskStatus() { + List scheduledJobStatus = handler.getScheduledJobStatus(); + return ResponseEntity.ok(scheduledJobStatus); } // This endpoint can be enabled in the future when checksums for assemblies are added to the project. diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java index 2007cd17..b0b18ccf 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java @@ -19,16 +19,13 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.web.PagedResourcesAssembler; import org.springframework.stereotype.Service; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.service.AssemblyService; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; @Service public class AdminHandler { @@ -52,7 +49,7 @@ public Optional getAssemblyByAccession(String accession) { return assemblyService.getAssemblyByAccession(accession); } - public void fetchAndInsertAssemblyByAccession(String accession) throws IOException { + public void fetchAndInsertAssemblyByAccession(String accession) { assemblyService.fetchAndInsertAssembly(accession); } @@ -64,8 +61,20 @@ public void retrieveAndInsertMd5ChecksumForAssembly(String accession) { assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accession); } - public Map> getMD5ChecksumUpdateTaskStatus() { - return assemblyService.getMD5ChecksumUpdateTaskStatus(); + public void retrieveAndInsertMd5ChecksumForAssembly(List accessions) { + assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accessions); + } + + public void retrieveAndInsertENASequenceNameForAssembly(String accession) { + assemblyService.retrieveAndInsertENASequenceNameForAssembly(accession); + } + + public void retrieveAndInsertENASequenceNameForAssembly(List accessions) { + assemblyService.retrieveAndInsertENASequenceNameForAssembly(accessions); + } + + public List getScheduledJobStatus() { + return assemblyService.getScheduledJobStatus(); } public void deleteAssemblyByAccession(String accession) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java index f280ef16..11f07306 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java @@ -19,7 +19,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.PageImpl; -import org.springframework.data.domain.PageRequest; import org.springframework.data.domain.Pageable; import org.springframework.data.web.PagedResourcesAssembler; import org.springframework.hateoas.EntityModel; @@ -83,6 +82,7 @@ public PagedModel> getAssemblyByRefseq(String refseq public PagedModel> getAssembliesByTaxid(long taxid, Pageable request) { Page page = assemblyService.getAssembliesByTaxid(taxid, request); + page.forEach(it->it.setChromosomes(null)); return generatePagedModelFromPage(page, assemblyAssembler); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java deleted file mode 100644 index 1c646ca8..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.datasource; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; - -import java.io.IOException; -import java.util.Optional; - -public interface AssemblyDataSource { - - Optional getAssemblyByAccession(String accession) throws IOException; - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 37a5c791..80d8bdf7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -25,75 +25,48 @@ import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReader; -import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReaderFactory; import uk.ac.ebi.eva.contigalias.dus.ENABrowser; import uk.ac.ebi.eva.contigalias.dus.ENABrowserFactory; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import uk.ac.ebi.eva.contigalias.exception.DownloadFailedException; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Objects; import java.util.Optional; @Repository("ENADataSource") -public class ENAAssemblyDataSource implements AssemblyDataSource { +public class ENAAssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(ENAAssemblyDataSource.class); private final ENABrowserFactory factory; - private final ENAAssemblyReportReaderFactory readerFactory; - @Value("${asm.file.download.dir}") private String asmFileDownloadDir; @Autowired - public ENAAssemblyDataSource(ENABrowserFactory factory, - ENAAssemblyReportReaderFactory readerFactory) { + public ENAAssemblyDataSource(ENABrowserFactory factory) { this.factory = factory; - this.readerFactory = readerFactory; } - @Override - public Optional getAssemblyByAccession(String accession) throws IOException { + public Optional downloadAssemblyReport(String accession) throws IOException { ENABrowser enaBrowser = factory.build(); enaBrowser.connect(); try { - Optional downloadFilePath = downloadAssemblyReport(enaBrowser, accession); - if (!downloadFilePath.isPresent()) { - return Optional.empty(); - } - - AssemblyEntity assemblyEntity; - try (InputStream stream = new FileInputStream(downloadFilePath.get().toFile())) { - ENAAssemblyReportReader reader = readerFactory.build(stream); - assemblyEntity = reader.getAssemblyEntity(); - logger.info("ENA: Number of chromosomes in " + accession + " : " + assemblyEntity.getChromosomes().size()); - } finally { - try { - enaBrowser.disconnect(); - Files.deleteIfExists(downloadFilePath.get()); - } catch (IOException e) { - logger.warn("Error while trying to disconnect - enaBrowser (assembly: " + accession + ") : " + e); - } - } - return Optional.of(assemblyEntity); + enaBrowser.connect(); + return downloadAssemblyReport(enaBrowser, accession); } catch (Exception e) { logger.warn("Could not fetch Assembly Report from ENA for accession " + accession + "Exception: " + e); return Optional.empty(); + } finally { + try { + enaBrowser.disconnect(); + } catch (IOException e) { + logger.warn("Error while trying to disconnect - enaBrowser (assembly: " + accession + ") : " + e); + } } - } @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) @@ -105,58 +78,23 @@ public Optional downloadAssemblyReport(ENABrowser enaBrowser, String acces try { boolean success = enaBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); if (success) { - logger.info("ENA assembly report downloaded successfully for accession "+ accession); + logger.info("ENA assembly report downloaded successfully for accession " + accession); return Optional.of(downloadFilePath); } else { - logger.warn("ENA assembly report could not be downloaded successfully for accession "+accession); + logger.warn("ENA assembly report could not be downloaded successfully for accession " + accession); return Optional.empty(); } } catch (IOException | DownloadFailedException e) { - logger.warn("Error downloading ENA assembly report for accession "+ accession + e); + logger.warn("Error downloading ENA assembly report for accession " + accession + e); return Optional.empty(); } } - /** - * Adds ENA sequence names to chromosomes and scaffolds in an assembly. Will modify the AssemblyEntity in-place. - * - * @param optional {@link AssemblyEntity} to add ENA sequence names to - * @throws IOException Passes IOException thrown by {@link #getAssemblyByAccession(String)} - */ - public void addENASequenceNamesToAssembly(AssemblyEntity targetAssembly) throws IOException { - if (!hasAllEnaSequenceNames(targetAssembly)) { - String insdcAccession = targetAssembly.getInsdcAccession(); - Optional enaAssembly = getAssemblyByAccession(insdcAccession); - - if (enaAssembly.isPresent()) { - AssemblyEntity sourceAssembly = enaAssembly.get(); - addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ? - sourceAssembly.getChromosomes() : Collections.emptyList(), - Objects.nonNull(targetAssembly.getChromosomes()) ? - targetAssembly.getChromosomes() : Collections.emptyList()); - } - } - } - - public boolean hasAllEnaSequenceNames(AssemblyEntity assembly) { - List chromosomes = Objects.nonNull(assembly.getChromosomes()) ? - assembly.getChromosomes() : Collections.emptyList(); - return chromosomes.stream().allMatch(sequence -> sequence.getEnaSequenceName() != null); + public List getChromosomeEntityList(List chrDataList) { + return ENAAssemblyReportReader.getChromosomeEntity(chrDataList); } - private void addENASequenceNames( - List sourceSequences, List targetSequences) { - Map insdcToSequenceEntity = new HashMap<>(); - for (SequenceEntity targetSeq : targetSequences) { - insdcToSequenceEntity.put(targetSeq.getInsdcAccession(), targetSeq); - } - for (SequenceEntity sourceSeq : sourceSequences) { - String sourceInsdcAccession = sourceSeq.getInsdcAccession(); - if (insdcToSequenceEntity.containsKey(sourceInsdcAccession)) { - insdcToSequenceEntity.get(sourceInsdcAccession).setEnaSequenceName(sourceSeq.getEnaSequenceName()); - } else { - insdcToSequenceEntity.put(sourceInsdcAccession, sourceSeq); - } - } + public ChromosomeEntity getChromosomeEntity(String chrLine) { + return ENAAssemblyReportReader.getChromosomeEntity(chrLine); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index 08226ff2..2ef26d09 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -25,64 +25,74 @@ import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReader; -import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReaderFactory; import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; @Repository("NCBIDataSource") -public class NCBIAssemblyDataSource implements AssemblyDataSource { +public class NCBIAssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(NCBIAssemblyDataSource.class); private final NCBIBrowserFactory factory; - private final NCBIAssemblyReportReaderFactory readerFactory; - @Value("${asm.file.download.dir}") private String asmFileDownloadDir; @Autowired - public NCBIAssemblyDataSource(NCBIBrowserFactory factory, - NCBIAssemblyReportReaderFactory readerFactory) { + public NCBIAssemblyDataSource(NCBIBrowserFactory factory) { this.factory = factory; - this.readerFactory = readerFactory; } - @Override - public Optional getAssemblyByAccession( - String accession) throws IOException, IllegalArgumentException { - NCBIBrowser ncbiBrowser = factory.build(); - ncbiBrowser.connect(); + public AssemblyEntity getAssemblyEntity(Path downloadFilePath) throws IOException { + List asmDataLines = Files.lines(downloadFilePath) + .filter(line -> line.startsWith("#")) + .collect(Collectors.toList()); + return getAssemblyEntity(asmDataLines); + } - Optional downloadFilePath = downloadAssemblyReport(accession, ncbiBrowser); - if (!downloadFilePath.isPresent()) { - return Optional.empty(); + public AssemblyEntity getAssemblyEntity(List asmDataLines) { + return NCBIAssemblyReportReader.getAssemblyEntity(asmDataLines); + } + + public List getChromosomeEntityList(AssemblyEntity assemblyEntity, List chrDataList) { + List chromosomeEntityList = NCBIAssemblyReportReader.getChromosomeEntity(chrDataList); + chromosomeEntityList.stream().forEach(c -> c.setAssembly(assemblyEntity)); + return chromosomeEntityList; + } + + public ChromosomeEntity getChromosomeEntity(AssemblyEntity assemblyEntity, String chrLine) { + ChromosomeEntity chromosomeEntity = NCBIAssemblyReportReader.getChromosomeEntity(chrLine); + if (chromosomeEntity != null) { + chromosomeEntity.setAssembly(assemblyEntity); } + return chromosomeEntity; + } - AssemblyEntity assemblyEntity; - try (InputStream stream = new FileInputStream(downloadFilePath.get().toFile())) { - NCBIAssemblyReportReader reader = readerFactory.build(stream); - assemblyEntity = reader.getAssemblyEntity(); - logger.info("NCBI: Number of chromosomes in " + accession + " : " + - (assemblyEntity.getChromosomes() != null ? assemblyEntity.getChromosomes().size() : 0)); + public Optional downloadAssemblyReport(String accession) throws IOException { + NCBIBrowser ncbiBrowser = factory.build(); + Optional downloadPath; + try { + ncbiBrowser.connect(); + downloadPath = downloadAssemblyReport(accession, ncbiBrowser); } finally { try { ncbiBrowser.disconnect(); - Files.deleteIfExists(downloadFilePath.get()); } catch (IOException e) { logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); } } - return Optional.of(assemblyEntity); + + return downloadPath; } @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) @@ -105,5 +115,4 @@ public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiB return Optional.empty(); } } - } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java index af5be00e..96512f98 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java @@ -16,89 +16,57 @@ package uk.ac.ebi.eva.contigalias.dus; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.LinkedList; +import java.util.ArrayList; import java.util.List; -public class ENAAssemblyReportReader extends AssemblyReportReader { +public class ENAAssemblyReportReader { - public ENAAssemblyReportReader(InputStreamReader inputStreamReader, boolean isScaffoldsEnabled) { - super(inputStreamReader, isScaffoldsEnabled); + public static List getChromosomeEntity(List lines) { + List chromosomeEntityList = new ArrayList<>(); + for (String line : lines) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(line); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); + } + } + + return chromosomeEntityList; } - protected void parseReport() throws IOException, NullPointerException { - if (reader == null) { - throw new NullPointerException("Cannot use AssemblyReportReader without having a valid InputStreamReader."); - } - String line = reader.readLine(); - while (line != null) { - if (line.startsWith("accession")) { - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - parseAssemblyData(line); - } else if (!line.startsWith("accession")) { - String[] columns = line.split("\t", -1); - if (columns.length >= 6) { - if (columns[5].equals("Chromosome") && columns[3].equals("assembled-molecule")) { - parseChromosomeLine(columns); - } else if (isScaffoldsEnabled) { - parseScaffoldLine(columns); - } + public static ChromosomeEntity getChromosomeEntity(String line) { + if (!line.startsWith("accession")) { + String[] columns = line.split("\t", -1); + if (columns.length >= 6) { + if (columns[5].equals("Chromosome") && columns[3].equals("assembled-molecule")) { + return getChromosome(columns); + } else { + return getScaffold(columns); } } - line = reader.readLine(); } - reportParsed = true; - reader.close(); - } - // Not present in ENA assembly reports - protected void parseAssemblyData(String line) {} + return null; + } - protected void parseChromosomeLine(String[] columns) { + public static ChromosomeEntity getChromosome(String[] columns) { ChromosomeEntity chromosomeEntity = new ChromosomeEntity(); - chromosomeEntity.setInsdcAccession(columns[0]); chromosomeEntity.setEnaSequenceName(columns[1]); - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - chromosomeEntity.setAssembly(this.assemblyEntity); chromosomeEntity.setContigType(SequenceEntity.ContigType.CHROMOSOME); - List chromosomes = this.assemblyEntity.getChromosomes(); - if (chromosomes == null) { - chromosomes = new LinkedList<>(); - assemblyEntity.setChromosomes(chromosomes); - } - chromosomes.add(chromosomeEntity); + return chromosomeEntity; } - protected void parseScaffoldLine(String[] columns) { + public static ChromosomeEntity getScaffold(String[] columns) { ChromosomeEntity scaffoldEntity = new ChromosomeEntity(); - scaffoldEntity.setInsdcAccession(columns[0]); scaffoldEntity.setEnaSequenceName(columns[1]); - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - scaffoldEntity.setAssembly(this.assemblyEntity); scaffoldEntity.setContigType(SequenceEntity.ContigType.SCAFFOLD); - List scaffolds = this.assemblyEntity.getChromosomes(); - if (scaffolds == null) { - scaffolds = new LinkedList<>(); - assemblyEntity.setChromosomes(scaffolds); - } - scaffolds.add(scaffoldEntity); + return scaffoldEntity; } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java deleted file mode 100644 index f6d608ae..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2021 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.dus; - -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -import java.io.InputStream; -import java.io.InputStreamReader; - -@Component -public class ENAAssemblyReportReaderFactory { - - @Value("${config.scaffolds.enabled:false}") - private boolean SCAFFOLDS_ENABLED; - - public ENAAssemblyReportReader build(InputStream inputStream) { - return new ENAAssemblyReportReader(new InputStreamReader(inputStream), SCAFFOLDS_ENABLED); - } - - public ENAAssemblyReportReader build(InputStreamReader inputStreamReader) { - return new ENAAssemblyReportReader(inputStreamReader, SCAFFOLDS_ENABLED); - } - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java index 28ad9329..fcf99462 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java @@ -20,81 +20,80 @@ import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.LinkedList; +import java.util.ArrayList; import java.util.List; - -public class NCBIAssemblyReportReader extends AssemblyReportReader { - - public NCBIAssemblyReportReader(InputStreamReader inputStreamReader, boolean isScaffoldsEnabled) { - super(inputStreamReader, isScaffoldsEnabled); - } - - protected void parseReport() throws IOException, NullPointerException { - if (reader == null) { - throw new NullPointerException("Cannot use AssemblyReportReader without having a valid InputStreamReader."); - } - String line = reader.readLine(); - while (line != null) { - if (line.startsWith("# ")) { - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); +import java.util.Map; +import java.util.stream.Collectors; + +public class NCBIAssemblyReportReader { + + public static AssemblyEntity getAssemblyEntity(List lines) { + Map tagAndValuesMap = lines.stream() + .filter(line -> line.startsWith("#")) + .filter(line -> line.indexOf(':') != -1) + .collect(Collectors.toMap(l -> l.substring(2, l.indexOf(':')), l -> l.substring(l.indexOf(':') + 1).trim())); + + AssemblyEntity asmEntity = new AssemblyEntity(); + for (Map.Entry entry : tagAndValuesMap.entrySet()) { + String tag = entry.getKey(); + String tagData = entry.getValue(); + switch (tag) { + case "Assembly name": { + asmEntity.setName(tagData); + break; + } + case "Organism name": { + asmEntity.setOrganism(tagData); + break; + } + case "Taxid": { + asmEntity.setTaxid(Long.parseLong(tagData)); + break; + } + case "GenBank assembly accession": { + asmEntity.setInsdcAccession(tagData); + break; } - parseAssemblyData(line); - } else if (!line.startsWith("#")) { - String[] columns = line.split("\t", -1); - if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && - (columns[4] != null && !columns[4].isEmpty() && !columns[4].equals("na"))) { - if (columns[3].equals("Chromosome") && columns[1].equals("assembled-molecule")) { - parseChromosomeLine(columns); - } else if (isScaffoldsEnabled) { - parseScaffoldLine(columns); - } + case "RefSeq assembly accession": { + asmEntity.setRefseq(tagData); + break; + } + case "RefSeq assembly and GenBank assemblies identical": { + asmEntity.setGenbankRefseqIdentical(tagData.equals("yes")); + break; } } - line = reader.readLine(); } - reportParsed = true; - reader.close(); + + return asmEntity; } - protected void parseAssemblyData(String line) { - int tagEnd = line.indexOf(':'); - if (tagEnd == -1) { - return; - } - String tag = line.substring(2, tagEnd); - String tagData = line.substring(tagEnd + 1).trim(); - switch (tag) { - case "Assembly name": { - assemblyEntity.setName(tagData); - break; - } - case "Organism name": { - assemblyEntity.setOrganism(tagData); - break; - } - case "Taxid": { - assemblyEntity.setTaxid(Long.parseLong(tagData)); - break; - } - case "GenBank assembly accession": { - assemblyEntity.setInsdcAccession(tagData); - break; - } - case "RefSeq assembly accession": { - assemblyEntity.setRefseq(tagData); - break; + public static List getChromosomeEntity(List lines) { + List chromosomeEntityList = new ArrayList<>(); + for (String line : lines) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(line); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); } - case "RefSeq assembly and GenBank assemblies identical": { - assemblyEntity.setGenbankRefseqIdentical(tagData.equals("yes")); - break; + } + return chromosomeEntityList; + } + + public static ChromosomeEntity getChromosomeEntity(String line) { + String[] columns = line.split("\t", -1); + if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && + (columns[4] != null && !columns[4].isEmpty() && !columns[4].equals("na"))) { + if (columns[3].equals("Chromosome") && columns[1].equals("assembled-molecule")) { + return getChromosome(columns); + } else { + return getScaffold(columns); } } + + return null; } - protected void parseChromosomeLine(String[] columns) { + public static ChromosomeEntity getChromosome(String[] columns) { ChromosomeEntity chromosomeEntity = new ChromosomeEntity(); chromosomeEntity.setGenbankSequenceName(columns[0]); @@ -104,7 +103,6 @@ protected void parseChromosomeLine(String[] columns) { } else { chromosomeEntity.setRefseq(columns[6]); } - if (columns.length > 8) { try { Long seqLength = Long.parseLong(columns[8]); @@ -113,26 +111,15 @@ protected void parseChromosomeLine(String[] columns) { } } - if (columns.length > 9 && !columns[9].equals("na")) { chromosomeEntity.setUcscName(columns[9]); } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - chromosomeEntity.setAssembly(this.assemblyEntity); chromosomeEntity.setContigType(SequenceEntity.ContigType.CHROMOSOME); - List chromosomes = this.assemblyEntity.getChromosomes(); - if (chromosomes == null) { - chromosomes = new LinkedList<>(); - assemblyEntity.setChromosomes(chromosomes); - } - chromosomes.add(chromosomeEntity); + return chromosomeEntity; } - protected void parseScaffoldLine(String[] columns) { + public static ChromosomeEntity getScaffold(String[] columns) { ChromosomeEntity scaffoldEntity = new ChromosomeEntity(); scaffoldEntity.setGenbankSequenceName(columns[0]); @@ -142,7 +129,6 @@ protected void parseScaffoldLine(String[] columns) { } else { scaffoldEntity.setRefseq(columns[6]); } - if (columns.length > 8) { try { Long seqLength = Long.parseLong(columns[8]); @@ -151,27 +137,15 @@ protected void parseScaffoldLine(String[] columns) { } } - - if (columns.length >= 10) { String ucscName = columns[9]; if (!ucscName.equals("na")) { scaffoldEntity.setUcscName(ucscName); } } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - scaffoldEntity.setAssembly(this.assemblyEntity); scaffoldEntity.setContigType(SequenceEntity.ContigType.SCAFFOLD); - List scaffolds = this.assemblyEntity.getChromosomes(); - if (scaffolds == null) { - scaffolds = new LinkedList<>(); - assemblyEntity.setChromosomes(scaffolds); - } - scaffolds.add(scaffoldEntity); + return scaffoldEntity; } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java deleted file mode 100644 index 6dfb49a8..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2020 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.dus; - -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -import java.io.InputStream; -import java.io.InputStreamReader; - -@Component -public class NCBIAssemblyReportReaderFactory { - - @Value("${config.scaffolds.enabled:false}") - private boolean SCAFFOLDS_ENABLED; - - public NCBIAssemblyReportReader build(InputStream inputStream) { - return new NCBIAssemblyReportReader(new InputStreamReader(inputStream), SCAFFOLDS_ENABLED); - } - - public NCBIAssemblyReportReader build(InputStreamReader inputStreamReader) { - return new NCBIAssemblyReportReader(inputStreamReader, SCAFFOLDS_ENABLED); - } - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java index 7ada7de5..129764e0 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java @@ -24,6 +24,7 @@ import javax.persistence.CascadeType; import javax.persistence.Column; import javax.persistence.Entity; +import javax.persistence.FetchType; import javax.persistence.Id; import javax.persistence.OneToMany; import javax.persistence.Table; @@ -63,8 +64,8 @@ public class AssemblyEntity { @JsonInclude(JsonInclude.Include.NON_NULL) @ApiModelProperty(value = "List of all chromosomes of the assembly present in the database.") - @LazyCollection(LazyCollectionOption.FALSE) - @OneToMany(mappedBy = "assembly", cascade = CascadeType.ALL) + @LazyCollection(LazyCollectionOption.TRUE) + @OneToMany(mappedBy = "assembly", cascade = CascadeType.ALL, fetch = FetchType.LAZY) private List chromosomes; public AssemblyEntity() { @@ -178,11 +179,6 @@ public String toString() { .append("trunc512checksum :\t") .append(this.trunc512checksum) .append("\n"); - if (this.chromosomes != null) { - builder.append("Number of chromosomes :\t") - .append(this.chromosomes.size()) - .append("\n"); - } return builder.toString(); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java new file mode 100644 index 00000000..c60b42ac --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class AssemblyIngestionException extends RuntimeException { + + public AssemblyIngestionException(String accession) { + super("Error Ingesting assembly with accession " + accession); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java index 93d5c868..2d6c0d3b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java @@ -30,4 +30,9 @@ public ResponseEntity handleExceptions(DownloadFailedException exception return new ResponseEntity<>(exception.getMessage(), HttpStatus.INTERNAL_SERVER_ERROR); } + @ExceptionHandler(AssemblyIngestionException.class) + public ResponseEntity handleExceptions(AssemblyIngestionException exception, WebRequest webRequest){ + return new ResponseEntity<>(exception.getMessage(), HttpStatus.INTERNAL_SERVER_ERROR); + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java index 9ffe65d8..62881539 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java @@ -20,6 +20,9 @@ import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.data.jpa.repository.JpaSpecificationExecutor; +import org.springframework.data.jpa.repository.Modifying; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; @@ -46,8 +49,17 @@ default Optional findAssemblyEntityByAccession(String accession) Page findAssemblyEntitiesByTaxid(long taxid, Pageable pageable); @Transactional - void deleteAssemblyEntityByInsdcAccession(String insdcAccession); + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.insdcAccession=:asmInsdcAccession") + void deleteAssemblyEntityByInsdcAccession(@Param("asmInsdcAccession") String asmInsdcAccession); @Transactional - void deleteAssemblyEntityByRefseq(String refseq); + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.refseq=:asmRefSeq") + void deleteAssemblyEntityByRefseq(@Param("asmRefSeq") String asmRefSeq); + + @Transactional + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.insdcAccession=:asmAccession OR a.refseq=:asmAccession") + void deleteAssemblyEntityByInsdcAccessionOrRefseq(@Param("asmAccession") String asmAccession); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index 2a1ae338..5979af07 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -27,6 +27,7 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import javax.transaction.Transactional; import java.util.List; @Repository @@ -50,6 +51,15 @@ public interface ChromosomeRepository extends JpaRepository findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request); Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java deleted file mode 100644 index 27bb2b92..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java +++ /dev/null @@ -1,137 +0,0 @@ -package uk.ac.ebi.eva.contigalias.scheduler; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.jdbc.core.JdbcTemplate; -import org.springframework.jdbc.core.ResultSetExtractor; -import org.springframework.scheduling.annotation.Scheduled; -import org.springframework.stereotype.Component; -import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.service.ChromosomeService; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutionException; - -@Component -public class ChecksumSetter { - private final Logger logger = LoggerFactory.getLogger(ChecksumSetter.class); - private final Map> runningMD5ChecksumUpdateTasks = new ConcurrentHashMap<>(); - private Set scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>(); - private int DEFAULT_PAGE_SIZE = 10000; - private JdbcTemplate jdbcTemplate; - private ChromosomeService chromosomeService; - private Md5ChecksumRetriever md5ChecksumRetriever; - - @Autowired - public ChecksumSetter(ChromosomeService chromosomeService, Md5ChecksumRetriever md5ChecksumRetriever, - JdbcTemplate jdbcTemplate) { - this.chromosomeService = chromosomeService; - this.md5ChecksumRetriever = md5ChecksumRetriever; - this.jdbcTemplate = jdbcTemplate; - } - - @Scheduled(cron = "0 0 1 ? * TUE") - public void updateMd5CheckSumForAllAssemblies() { - List assemblyList = chromosomeService.getAssembliesWhereChromosomeMd5ChecksumIsNull(); - logger.info("List of assemblies to be updated for MD5 Checksum: " + assemblyList); - scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>(assemblyList); - - for (String assembly : assemblyList) { - scheduledToRunMD5ChecksumUpdateTasks.remove(assembly); - CompletableFuture future = updateMd5CheckSumForAssemblyAsync(assembly); - try { - future.get(); - } catch (InterruptedException | ExecutionException e) { - logger.error("Encountered an error when running MD5Checksum update for assembly: " + assembly); - } finally { - scheduledToRunMD5ChecksumUpdateTasks.remove(assembly); - } - } - } - - public CompletableFuture updateMd5CheckSumForAssemblyAsync(String assembly) { - logger.info("Submitted job for updating MD5 Checksum for assembly (asynchronously)"); - // Check if the async task for this assembly is already running - CompletableFuture existingTask = runningMD5ChecksumUpdateTasks.get(assembly); - if (existingTask != null && !existingTask.isDone()) { - logger.info("Async task is still running for assembly: " + assembly); - return existingTask; - } - // Start the async task (removing existing run if present) - runningMD5ChecksumUpdateTasks.remove(assembly); - CompletableFuture future = CompletableFuture.runAsync(() -> { - updateMD5ChecksumForAllChromosomesInAssembly(assembly); - }); - // Store the future in the map for the given assembly - runningMD5ChecksumUpdateTasks.put(assembly, future); - - // check the status of task upon completion and remove from running tasks - future.whenComplete((result, exception) -> { - if (exception != null) { - logger.error("Async task (MD5Checksum setter) failed for assembly: " + assembly, exception); - } else { - logger.info("Async task (MD5Checksum setter) completed successfully for assembly: " + assembly); - } - runningMD5ChecksumUpdateTasks.remove(assembly); - }); - - return future; - } - - public void updateMD5ChecksumForAllChromosomesInAssembly(String assembly) { - logger.info("Trying to update md5checksum for assembly: " + assembly); - String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly - + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; - jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { - long chromosomeUpdated = 0; - List chromosomeEntityList = new ArrayList<>(); - while (rs.next()) { - ChromosomeEntity chromosome = new ChromosomeEntity(); - chromosome.setInsdcAccession(rs.getString(1)); - chromosomeEntityList.add(chromosome); - - if (chromosomeEntityList.size() == DEFAULT_PAGE_SIZE) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); - chromosomeEntityList = new ArrayList<>(); - } - } - if (chromosomeEntityList.size() > 0) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); - } - - return null; - }); - } - - public void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { - chromosomesList.parallelStream().forEach(chromosome -> { - try { - String md5Checksum = md5ChecksumRetriever.retrieveMd5Checksum(chromosome.getInsdcAccession()); - chromosome.setMd5checksum(md5Checksum); - } catch (Exception e) { - logger.info("Could not retrieve md5Checksum for insdc accession: " + chromosome.getInsdcAccession()); - } - }); - - chromosomeService.updateMd5ChecksumForAllChromosomeInAssembly(assembly, chromosomesList); - } - - public Map> getMD5ChecksumUpdateTaskStatus() { - Map> taskStatus = new HashMap<>(); - taskStatus.put("running", runningMD5ChecksumUpdateTasks.keySet()); - taskStatus.put("scheduled", scheduledToRunMD5ChecksumUpdateTasks); - return taskStatus; - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java new file mode 100644 index 00000000..f39ad94b --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -0,0 +1,84 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.conf.ApplicationContextHolder; +import uk.ac.ebi.eva.contigalias.scheduler.job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobSubmittedEvent; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobType; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +@Service +public class ChromosomeUpdater { + private final Logger logger = LoggerFactory.getLogger(ChromosomeUpdater.class); + private final BlockingQueue jobQueue = new LinkedBlockingQueue<>(); + private final ENASequenceNameUpdater enaSequenceNameUpdater; + private final MD5ChecksumUpdater md5ChecksumUpdater; + private final AtomicBoolean running = new AtomicBoolean(false); + private Job currentJob; + + @Autowired + public ChromosomeUpdater(ENASequenceNameUpdater enaSequenceNameUpdater, MD5ChecksumUpdater md5ChecksumUpdater) { + this.md5ChecksumUpdater = md5ChecksumUpdater; + this.enaSequenceNameUpdater = enaSequenceNameUpdater; + } + + public void submitJob(Job job) { + jobQueue.add(job); + logger.info("Submitted Job : " + job); + JobSubmittedEvent event = new JobSubmittedEvent(this); + ApplicationContextHolder.getApplicationContext().publishEvent(event); + } + + public void submitJob(List jobList) { + jobQueue.addAll(jobList); + jobList.stream().forEach(job -> logger.info("Submitted Job : " + job)); + JobSubmittedEvent event = new JobSubmittedEvent(this); + ApplicationContextHolder.getApplicationContext().publishEvent(event); + } + + @Async + public void processJobs() { + running.set(true); + currentJob = null; + while (!jobQueue.isEmpty()) { + try { + currentJob = jobQueue.take(); + if (currentJob.getType() == JobType.ENA_SEQUENCE_NAME_UPDATE) { + enaSequenceNameUpdater.updateENASequenceNameForAssembly(currentJob.getParameter()); + } else if (currentJob.getType() == JobType.MD5_CHECKSUM_UPDATE) { + md5ChecksumUpdater.updateMD5ChecksumForAssembly(currentJob.getParameter()); + } + } catch (Exception e) { + logger.error("Exception while running job : " + currentJob); + } + } + currentJob = null; + running.set(false); + } + + public List getScheduledJobStatus() { + List jobList = new ArrayList<>(); + if (currentJob != null) { + jobList.add(currentJob.toString()); + } + jobList.addAll(jobQueue.stream() + .map(j -> j.getType().toString() + " : " + j.getParameter()) + .collect(Collectors.toList())); + + return jobList; + } + + public AtomicBoolean isRunning() { + return running; + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java new file mode 100644 index 00000000..5a28fe80 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java @@ -0,0 +1,92 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Component; +import uk.ac.ebi.eva.contigalias.datasource.ENAAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +@Component +public class ENASequenceNameUpdater { + private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class); + private final int DEFAULT_BATCH_SIZE = 100000; + private final ENAAssemblyDataSource enaDataSource; + + private final ChromosomeService chromosomeService; + + public ENASequenceNameUpdater(ENAAssemblyDataSource enaDataSource, ChromosomeService chromosomeService) { + this.enaDataSource = enaDataSource; + this.chromosomeService = chromosomeService; + } + + public void updateENASequenceNameForAssembly(String assembly) { + Path downloadedENAFilePath = null; + try { + logger.info("Trying to update ENA Sequence Name for assembly: " + assembly); + Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(assembly); + if (downloadENAFilePathOpt.isPresent()) { + downloadedENAFilePath = downloadENAFilePathOpt.get(); + + long numberOfChromosomesInFile = Files.lines(downloadedENAFilePath) + .filter(line -> !line.startsWith("accession")).count(); + logger.info("Number of chromosomes in assembly (" + assembly + "): " + numberOfChromosomesInFile); + + // retrieve and save ena sequence names + retrieveAndUpdateENASequenceNames(assembly, downloadedENAFilePath); + } else { + logger.warn("Could not download assembly report for assembly : " + assembly); + } + } catch (Exception e) { + logger.error("Error while updating ENA Sequence Name for assembly : " + assembly + "\n" + e); + } finally { + try { + if (downloadedENAFilePath != null) { + Files.deleteIfExists(downloadedENAFilePath); + } + } catch (IOException e) { + logger.error("Error while deleting downloaded ENA assembly report file with path " + downloadedENAFilePath + + " for assembly : " + assembly); + } + } + } + + private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedENAFilePath) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { + long chromosomesProcessedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("accession")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == DEFAULT_BATCH_SIZE) { + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); + chromosomesProcessedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesProcessedTillNow); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); + chromosomesProcessedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesProcessedTillNow); + } + } + + logger.info("Finished updating ENA Sequence Name for assembly: " + assembly); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java new file mode 100644 index 00000000..b9db041d --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -0,0 +1,88 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import com.fasterxml.jackson.databind.JsonNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.jdbc.core.ResultSetExtractor; +import org.springframework.stereotype.Component; +import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.ArrayList; +import java.util.List; + +@Component +public class MD5ChecksumUpdater { + private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class); + private final int DEFAULT_BATCH_SIZE = 10000; + private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; + private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; + private RestTemplate restTemplate; + private final JdbcTemplate jdbcTemplate; + private final ChromosomeService chromosomeService; + + @Autowired + public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) { + this.restTemplate = restTemplate; + this.jdbcTemplate = jdbcTemplate; + this.chromosomeService = chromosomeService; + } + + public void updateMD5ChecksumForAssembly(String assembly) { + try { + logger.info("Trying to update MD5 Checksum for assembly: " + assembly); + String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly + + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; + jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { + long chromosomeProcessed = 0; + List chromosomeEntityList = new ArrayList<>(); + while (rs.next()) { + ChromosomeEntity chromosome = new ChromosomeEntity(); + chromosome.setInsdcAccession(rs.getString(1)); + chromosomeEntityList.add(chromosome); + + if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) { + updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); + chromosomeProcessed += chromosomeEntityList.size(); + logger.info("Chromosomes Processed till now: " + chromosomeProcessed); + chromosomeEntityList = new ArrayList<>(); + } + } + if (chromosomeEntityList.size() > 0) { + updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); + chromosomeProcessed += chromosomeEntityList.size(); + logger.info("Chromosomes Processed till now: " + chromosomeProcessed); + } + + logger.info("Finished updating MD5 Checksum for assembly: " + assembly); + + return null; + }); + } catch (Exception e) { + logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e); + } + } + + private void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { + chromosomesList.parallelStream().forEach(chromosome -> { + try { + String md5Checksum = retrieveMd5Checksum(chromosome.getInsdcAccession()); + chromosome.setMd5checksum(md5Checksum); + } catch (Exception e) { + logger.info("Could not retrieve MD5 Checksum for insdc accession: " + chromosome.getInsdcAccession()); + } + }); + + chromosomeService.updateMd5ChecksumForAllChromosomeInAssembly(assembly, chromosomesList); + } + + public String retrieveMd5Checksum(String insdcAccession) { + String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession); + JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class); + String md5Checksum = jsonResponse.get("metadata").get("md5").asText(); + return md5Checksum; + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java deleted file mode 100644 index 912e5d6c..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java +++ /dev/null @@ -1,29 +0,0 @@ -package uk.ac.ebi.eva.contigalias.scheduler; - -import com.fasterxml.jackson.databind.JsonNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; -import org.springframework.web.client.RestTemplate; - -@Component -public class Md5ChecksumRetriever { - private final Logger logger = LoggerFactory.getLogger(Md5ChecksumRetriever.class); - private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; - private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - - private RestTemplate restTemplate; - - @Autowired - public Md5ChecksumRetriever(RestTemplate restTemplate) { - this.restTemplate = restTemplate; - } - - public String retrieveMd5Checksum(String insdcAccession) { - String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession); - JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class); - String md5Checksum = jsonResponse.get("metadata").get("md5").asText(); - return md5Checksum; - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java new file mode 100644 index 00000000..715381ba --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java @@ -0,0 +1,24 @@ +package uk.ac.ebi.eva.contigalias.scheduler.job; + +public class Job { + private final JobType type; + private final String parameter; + + public Job(JobType type, String parameter) { + this.type = type; + this.parameter = parameter; + } + + public JobType getType() { + return type; + } + + public String getParameter() { + return parameter; + } + + @Override + public String toString() { + return type + " : " + parameter; + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java new file mode 100644 index 00000000..464af04e --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java @@ -0,0 +1,10 @@ +package uk.ac.ebi.eva.contigalias.scheduler.job; + + +import org.springframework.context.ApplicationEvent; + +public class JobSubmittedEvent extends ApplicationEvent { + public JobSubmittedEvent(Object source) { + super(source); + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java new file mode 100644 index 00000000..e0df9888 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java @@ -0,0 +1,23 @@ +package uk.ac.ebi.eva.contigalias.scheduler.job; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.ApplicationListener; +import org.springframework.stereotype.Component; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; + +@Component +public class JobSubmittedEventHandler implements ApplicationListener { + private ChromosomeUpdater chromosomeUpdater; + + @Autowired + public JobSubmittedEventHandler(ChromosomeUpdater chromosomeUpdater) { + this.chromosomeUpdater = chromosomeUpdater; + } + + @Override + public void onApplicationEvent(JobSubmittedEvent event) { + if (!chromosomeUpdater.isRunning().get()) { + chromosomeUpdater.processJobs(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java new file mode 100644 index 00000000..47397dd4 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java @@ -0,0 +1,6 @@ +package uk.ac.ebi.eva.contigalias.scheduler.job; + +public enum JobType { + ENA_SEQUENCE_NAME_UPDATE, + MD5_CHECKSUM_UPDATE +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index e97adc2f..c5a9e2e9 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -26,130 +26,183 @@ import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblyIngestionException; import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblyException; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; -import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; +import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; +import uk.ac.ebi.eva.contigalias.scheduler.job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobType; import javax.transaction.Transactional; +import java.io.BufferedReader; +import java.io.FileReader; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; @Service public class AssemblyService { + private final int BATCH_SIZE = 100000; - private final AssemblyRepository repository; + private final ChromosomeService chromosomeService; + + private final AssemblyRepository assemblyRepository; + + private final ChromosomeRepository chromosomeRepository; private final NCBIAssemblyDataSource ncbiDataSource; private final ENAAssemblyDataSource enaDataSource; - private final ChecksumSetter checksumSetter; + private final ChromosomeUpdater chromosomeUpdater; private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @Autowired - public AssemblyService(AssemblyRepository repository, NCBIAssemblyDataSource ncbiDataSource, - ENAAssemblyDataSource enaDataSource, ChecksumSetter checksumSetter) { - this.repository = repository; + public AssemblyService(ChromosomeService chromosomeService, AssemblyRepository repository, ChromosomeRepository chromosomeRepository, + NCBIAssemblyDataSource ncbiDataSource, ENAAssemblyDataSource enaDataSource, + ChromosomeUpdater chromosomeUpdater) { + this.chromosomeService = chromosomeService; + this.assemblyRepository = repository; + this.chromosomeRepository = chromosomeRepository; this.ncbiDataSource = ncbiDataSource; this.enaDataSource = enaDataSource; - this.checksumSetter = checksumSetter; + this.chromosomeUpdater = chromosomeUpdater; } public Optional getAssemblyByInsdcAccession(String insdcAccession) { - Optional entity = repository.findAssemblyEntityByInsdcAccession(insdcAccession); - stripAssemblyFromChromosomes(entity); + Optional entity = assemblyRepository.findAssemblyEntityByInsdcAccession(insdcAccession); return entity; } public Optional getAssemblyByRefseq(String refseq) { - Optional entity = repository.findAssemblyEntityByRefseq(refseq); - stripAssemblyFromChromosomes(entity); + Optional entity = assemblyRepository.findAssemblyEntityByRefseq(refseq); return entity; } public Page getAssembliesByTaxid(long taxid, Pageable request) { - Page page = repository.findAssemblyEntitiesByTaxid(taxid, request); - page.forEach(this::stripAssemblyFromChromosomes); + Page page = assemblyRepository.findAssemblyEntitiesByTaxid(taxid, request); return page; } public void putAssemblyChecksumsByAccession(String accession, String md5, String trunc512) { - Optional entity = repository.findAssemblyEntityByAccession(accession); + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (!entity.isPresent()) { throw new IllegalArgumentException( "No assembly corresponding to accession " + accession + " found in the database"); } AssemblyEntity assemblyEntity = entity.get(); assemblyEntity.setMd5checksum(md5).setTrunc512checksum(trunc512); - repository.save(assemblyEntity); + assemblyRepository.save(assemblyEntity); } - public void fetchAndInsertAssembly(String accession) throws IOException { - Optional entity = repository.findAssemblyEntityByAccession(accession); + public void fetchAndInsertAssembly(String accession) { + // check if assembly already exists in db + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (entity.isPresent()) { throw duplicateAssemblyInsertionException(accession, entity.get()); } - Optional fetchAssembly = ncbiDataSource.getAssemblyByAccession(accession); - if (!fetchAssembly.isPresent()) { - throw new AssemblyNotFoundException(accession); + + try { + // download file and save assembly and chromosome data + logger.info("Start inserting assembly for accession " + accession); + parseFileAndInsertAssembly(accession); + logger.info("Successfully inserted assembly for accession " + accession); + } catch (Exception e) { + // roll back inserted entries in case of any exception or error + logger.error("Exception while inserting assembly " + accession + " Rolling back changes. \n" + e); + deleteEntriesForAssembly(accession); + throw new AssemblyIngestionException(accession); } - if (fetchAssembly.isPresent()) { - AssemblyEntity assemblyEntity = fetchAssembly.get(); - enaDataSource.addENASequenceNamesToAssembly(assemblyEntity); - if (assemblyEntity.getChromosomes() != null && assemblyEntity.getChromosomes().size() > 0) { - insertAssembly(assemblyEntity); - logger.info("Successfully inserted assembly for accession " + accession); - // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) - checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); - } else { - logger.error("Skipping inserting assembly : No chromosome in assembly " + accession); + } + + public void parseFileAndInsertAssembly(String accession) throws IOException { + Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); + Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); + + AssemblyEntity assemblyEntity = ncbiDataSource.getAssemblyEntity(downloadedNCBIFilePath); + assemblyRepository.save(assemblyEntity); + + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + long chromosomesSavedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + chromosomeService.insertAllChromosomes(chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + chromosomeService.insertAllChromosomes(chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); } - } else { - logger.error("Could not get assembly from NCBI"); } + + // delete the files after assembly insertion + Files.deleteIfExists(downloadedNCBIFilePath); } - public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { - checksumSetter.updateMd5CheckSumForAssemblyAsync(assembly); + public void deleteEntriesForAssembly(String accession) { + chromosomeRepository.deleteChromosomeEntitiesByAssembly_InsdcAccession(accession); + assemblyRepository.deleteAssemblyEntityByInsdcAccessionOrRefseq(accession); } - public Map> getMD5ChecksumUpdateTaskStatus() { - return checksumSetter.getMD5ChecksumUpdateTaskStatus(); + public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { + Job md5ChecksumupdateJob = new Job(JobType.MD5_CHECKSUM_UPDATE, assembly); + chromosomeUpdater.submitJob(md5ChecksumupdateJob); } - public Optional getAssemblyByAccession(String accession) { - Optional entity = repository.findAssemblyEntityByAccession(accession); - if (entity.isPresent()) { - stripAssemblyFromChromosomes(entity); - return entity; - } else { - throw new AssemblyNotFoundException(accession); + public void retrieveAndInsertMd5ChecksumForAssembly(List assemblies) { + List jobsList = new ArrayList(); + for (String assembly : assemblies) { + jobsList.add(new Job(JobType.MD5_CHECKSUM_UPDATE, assembly)); } + chromosomeUpdater.submitJob(jobsList); } - public void stripAssemblyFromChromosomes(Optional optional) { - if (optional.isPresent()) { - AssemblyEntity entity = optional.get(); - stripAssemblyFromChromosomes(entity); - } + public void retrieveAndInsertENASequenceNameForAssembly(String assembly) { + Job enaSequenceNameupdateJob = new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, assembly); + chromosomeUpdater.submitJob(enaSequenceNameupdateJob); } - private void stripAssemblyFromChromosomes(AssemblyEntity assembly) { - List chromosomes = assembly.getChromosomes(); - if (chromosomes != null && chromosomes.size() > 0) { - chromosomes.forEach(it -> it.setAssembly(null)); - } else { - assembly.setChromosomes(Collections.emptyList()); + public void retrieveAndInsertENASequenceNameForAssembly(List assemblies) { + List jobsList = new ArrayList(); + for (String assembly : assemblies) { + jobsList.add(new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, assembly)); } + chromosomeUpdater.submitJob(jobsList); + } + + public List getScheduledJobStatus() { + return chromosomeUpdater.getScheduledJobStatus(); + } + + public Optional getAssemblyByAccession(String accession) { + Optional assemblyEntity = assemblyRepository.findAssemblyEntityByAccession(accession); + return assemblyEntity; } @Transactional @@ -157,7 +210,7 @@ public void insertAssembly(AssemblyEntity entity) { if (isEntityPresent(entity)) { throw duplicateAssemblyInsertionException(null, entity); } else { - repository.save(entity); + assemblyRepository.save(entity); } } @@ -168,7 +221,7 @@ public boolean isEntityPresent(AssemblyEntity entity) { if (insdcAccession == null && refseq == null) { return false; } - Optional existingAssembly = repository.findAssemblyEntityByInsdcAccessionOrRefseq( + Optional existingAssembly = assemblyRepository.findAssemblyEntityByInsdcAccessionOrRefseq( // Setting to invalid prevents finding random accessions with null GCA/GCF insdcAccession == null ? "##########" : insdcAccession, refseq == null ? "##########" : refseq); @@ -197,11 +250,11 @@ public Map> fetchAndInsertAssembly(List accessions) } public void deleteAssemblyByInsdcAccession(String insdcAccession) { - repository.deleteAssemblyEntityByInsdcAccession(insdcAccession); + assemblyRepository.deleteAssemblyEntityByInsdcAccession(insdcAccession); } public void deleteAssemblyByRefseq(String refseq) { - repository.deleteAssemblyEntityByRefseq(refseq); + assemblyRepository.deleteAssemblyEntityByRefseq(refseq); } public void deleteAssemblyByAccession(String accession) { @@ -210,7 +263,7 @@ public void deleteAssemblyByAccession(String accession) { } public void deleteAssembly(AssemblyEntity entity) { - repository.delete(entity); + assemblyRepository.delete(entity); } private DuplicateAssemblyException duplicateAssemblyInsertionException(String accession, AssemblyEntity present) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 8b33659d..9fd11976 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -19,13 +19,16 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; +import org.springframework.jdbc.core.BatchPreparedStatementSetter; +import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Service; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import javax.transaction.Transactional; +import java.sql.PreparedStatement; +import java.sql.SQLException; import java.util.LinkedList; import java.util.List; @@ -33,10 +36,12 @@ public class ChromosomeService { private final ChromosomeRepository repository; + private final JdbcTemplate jdbcTemplate; @Autowired - public ChromosomeService(ChromosomeRepository repository) { + public ChromosomeService(ChromosomeRepository repository, JdbcTemplate jdbcTemplate) { this.repository = repository; + this.jdbcTemplate = jdbcTemplate; } @@ -71,6 +76,13 @@ public void updateMd5ChecksumForAllChromosomeInAssembly(String assembly, List chromosomeEntityList) { + for (ChromosomeEntity chromosome : chromosomeEntityList) { + repository.updateENASequenceNameByInsdcAccession(assembly, chromosome.getInsdcAccession(), chromosome.getEnaSequenceName()); + } + } + public Page getChromosomesByAssemblyRefseq(String asmRefseq, Pageable request) { Page chromosomes = repository.findChromosomeEntitiesByAssembly_Refseq(asmRefseq, request); return stripAssembliesFromChromosomes(chromosomes); @@ -196,7 +208,7 @@ private void stripAssemblyFromChromosome(ChromosomeEntity chromosome) { public void putChromosomeChecksumsByAccession(String accession, String md5, String trunc512) { Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseq( accession, accession, Pageable.unpaged()); - if (page.isEmpty()){ + if (page.isEmpty()) { throw new IllegalArgumentException( "No chromosomes corresponding to accession " + accession + " found in the database"); } @@ -272,4 +284,31 @@ public long countChromosomeEntitiesByEnaName(String enaName) { return repository.countChromosomeEntitiesByEnaSequenceName(enaName); } + public void insertAllChromosomes(List chromosomeEntityList) { + String sql = "INSERT INTO chromosome (assembly_insdc_accession,contig_type,ena_sequence_name," + + "genbank_sequence_name,insdc_accession,md5checksum,refseq,seq_length,trunc512checksum,ucsc_name) " + + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; + jdbcTemplate.batchUpdate(sql, new BatchPreparedStatementSetter() { + @Override + public void setValues(PreparedStatement ps, int i) throws SQLException { + ChromosomeEntity chromosome = chromosomeEntityList.get(i); + ps.setString(1, chromosome.getAssembly().getInsdcAccession()); + ps.setString(2, chromosome.getContigType().toString()); + ps.setString(3, chromosome.getEnaSequenceName()); + ps.setString(4, chromosome.getGenbankSequenceName()); + ps.setString(5, chromosome.getInsdcAccession()); + ps.setString(6, chromosome.getMd5checksum()); + ps.setString(7, chromosome.getRefseq()); + ps.setLong(8, chromosome.getSeqLength()); + ps.setString(9, chromosome.getTrunc512checksum()); + ps.setString(10, chromosome.getUcscName()); + } + + @Override + public int getBatchSize() { + return chromosomeEntityList.size(); + } + }); + } + } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 514ac4f3..0e2bef2d 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +spring.jpa.open-in-view=false controller.auth.admin.username=@contig-alias.admin-user@ controller.auth.admin.password=@contig-alias.admin-password@ diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java index 11f7bd5c..b4b6ff26 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java @@ -20,16 +20,16 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @ActiveProfiles("test") @@ -41,24 +41,22 @@ public class ENAAssemblyDataSourceTest { @Autowired private ENAAssemblyDataSource enaDataSource; - @Autowired - private NCBIAssemblyDataSource ncbiDataSource; - @Test - public void getAssemblyByAccessionGCAHavingChromosomes() throws IOException { - Optional accession = enaDataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes(); - assertNotNull(chromosomes); - assertFalse(chromosomes.isEmpty()); + public void testDownloadAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = enaDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + assertTrue(downloadedAssemblyReport.isPresent()); + assertTrue(Files.exists(downloadedAssemblyReport.get())); } @Test - public void getENASequenceNamesForAssembly() throws IOException { - Optional assembly = ncbiDataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - enaDataSource.addENASequenceNamesToAssembly(assembly.get()); - assertTrue(assembly.isPresent()); - assertTrue(enaDataSource.hasAllEnaSequenceNames(assembly.get())); + public void getChromosomeEntityFromAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = enaDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("accession")) + .collect(Collectors.toList()); + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + assertEquals(3143, chromosomeEntityList.size()); + chromosomeEntityList.stream().forEach(c -> assertTrue(!c.getEnaSequenceName().isEmpty())); } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java index 589b441e..9440ca3d 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java @@ -20,19 +20,18 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @ActiveProfiles("test") @@ -46,23 +45,40 @@ public class NCBIAssemblyDataSourceTest { @Autowired private NCBIAssemblyDataSource dataSource; + @Test + public void testDownloadAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + assertTrue(downloadedAssemblyReport.isPresent()); + assertTrue(Files.exists(downloadedAssemblyReport.get())); + } + @Test public void getAssemblyByAccessionGCAHavingChromosomes() throws IOException { - Optional accession = dataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes(); - assertNotNull(chromosomes); - assertFalse(chromosomes.isEmpty()); + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + AssemblyEntity assembly = dataSource.getAssemblyEntity(downloadedAssemblyReport.get()); + assertEquals(GCA_ACCESSION_HAVING_CHROMOSOMES, assembly.getInsdcAccession()); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("#")) + .collect(Collectors.toList()); + List chromosomeEntityList = dataSource.getChromosomeEntityList(assembly, chrLines); + assertEquals(3143, chromosomeEntityList.size()); } @Test public void getAssemblyByAccessionGCFNoChromosomes() throws IOException { - Optional accession = dataSource.getAssemblyByAccession(GCF_ACCESSION_NO_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes().stream() - .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.CHROMOSOME)) + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCF_ACCESSION_NO_CHROMOSOMES); + AssemblyEntity assembly = dataSource.getAssemblyEntity(downloadedAssemblyReport.get()); + assertEquals("GCA_006125015.1", assembly.getInsdcAccession()); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("#")) .collect(Collectors.toList()); - assertEquals(0, chromosomes.size()); + List chromosomeEntityList = dataSource.getChromosomeEntityList(assembly, chrLines); + long numOfChromosomes = chromosomeEntityList.stream() + .filter(c -> c.getContigType() == SequenceEntity.ContigType.CHROMOSOME).count(); + long numOfScaffolds = chromosomeEntityList.stream() + .filter(c -> c.getContigType() == SequenceEntity.ContigType.SCAFFOLD).count(); + assertEquals(0, numOfChromosomes); + assertEquals(2, numOfScaffolds); } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java index 2276c1c9..5d5d71c0 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java @@ -16,22 +16,16 @@ package uk.ac.ebi.eva.contigalias.dus; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -52,49 +46,23 @@ class ENAAssemblyReportReaderTest { private static final String SCAFFOLD_GENBANK_ACCESSION = "GJ057137.1"; - private InputStreamReader streamReader; - - private InputStream stream; - - @Autowired - private ENAAssemblyReportReaderFactory readerFactory; - - private ENAAssemblyReportReader reader; - - @BeforeEach - void setup() throws FileNotFoundException { - stream = new FileInputStream("src/test/resources/GCA_000003055.3_sequence_report.txt"); - streamReader = new InputStreamReader(stream); - reader = readerFactory.build(streamReader); - } - - @AfterEach - void tearDown() throws IOException { - stream.close(); - streamReader.close(); - } - - @Test - void getAssemblyReportReader() throws IOException { - assertTrue(reader.ready()); - } + private static final Path assemblyReportPath = Paths.get("src/test/resources/GCA_000003055.3_sequence_report.txt"); - AssemblyEntity getAssemblyEntity() throws IOException { - return reader.getAssemblyEntity(); + List getChromosomes() throws IOException { + List lines = Files.lines(assemblyReportPath).collect(Collectors.toList()); + return ENAAssemblyReportReader.getChromosomeEntity(lines); } @Test void verifyAssemblyHasChromosomes() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); assertNotNull(chromosomes); assertEquals(3316, chromosomes.size()); } @Test void verifyChromosomeMetadata() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); ChromosomeEntity chromosome = chromosomes.get(0); assertEquals(CHROMOSOME_ENA_SEQUENCE_NAME, chromosome.getEnaSequenceName()); assertEquals(CHROMOSOME_GENBANK_ACCESSION, chromosome.getInsdcAccession()); @@ -103,8 +71,7 @@ void verifyChromosomeMetadata() throws IOException { @Test void verifyAssemblyHasScaffolds() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertEquals(3286, scaffolds.size()); @@ -112,8 +79,7 @@ void verifyAssemblyHasScaffolds() throws IOException { @Test void assertParsedScaffoldValid() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertTrue(scaffolds.size() > 0); diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java index 9107baeb..ed32c95c 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java @@ -16,23 +16,18 @@ package uk.ac.ebi.eva.contigalias.dus; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -65,23 +60,11 @@ class NCBIAssemblyReportReaderTest { private static final Long CHROMOSOME_CHR1_SEQ_LENGTH = 158337067l; + private static final Path assemblyReportPath = Paths.get("src/test/resources/GCA_000003055.3_Bos_taurus_UMD_3.1_assembly_report.txt"); private ChromosomeEntity scaffoldEntity; - private InputStreamReader streamReader; - - private InputStream stream; - - @Autowired - private NCBIAssemblyReportReaderFactory readerFactory; - - private NCBIAssemblyReportReader reader; - @BeforeEach - void setup() throws FileNotFoundException { - stream = new FileInputStream( - new File("src/test/resources/GCA_000003055.3_Bos_taurus_UMD_3.1_assembly_report.txt")); - streamReader = new InputStreamReader(stream); - reader = readerFactory.build(streamReader); + void setup() { scaffoldEntity = (ChromosomeEntity) new ChromosomeEntity() .setGenbankSequenceName("ChrU_1") .setInsdcAccession("GJ057137.1") @@ -90,19 +73,18 @@ void setup() throws FileNotFoundException { .setUcscName(null); } - @AfterEach - void tearDown() throws IOException { - stream.close(); - streamReader.close(); - } - - @Test - void getAssemblyReportReader() throws IOException { - assertTrue(reader.ready()); + AssemblyEntity getAssemblyEntity() throws IOException { + List asmDataLines = Files.lines(assemblyReportPath) + .filter(line -> line.startsWith("#")) + .collect(Collectors.toList()); + return NCBIAssemblyReportReader.getAssemblyEntity(asmDataLines); } - AssemblyEntity getAssemblyEntity() throws IOException { - return reader.getAssemblyEntity(); + List getChromosomes() throws IOException { + List chrDataLines = Files.lines(assemblyReportPath) + .filter(line -> !line.startsWith("#")) + .collect(Collectors.toList()); + return NCBIAssemblyReportReader.getChromosomeEntity(chrDataLines); } @Test @@ -118,16 +100,14 @@ void verifyAssemblyMetadata() throws IOException { @Test void verifyAssemblyHasChromosomes() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); assertNotNull(chromosomes); assertEquals(3316, chromosomes.size()); } @Test void verifyChromosomeMetadata() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); ChromosomeEntity chromosome = chromosomes.get(0); assertEquals(CHROMOSOME_CHR1_SEQUENCE_NAME, chromosome.getGenbankSequenceName()); assertEquals(CHROMOSOME_CHR1_GENBANK_ACCESSION, chromosome.getInsdcAccession()); @@ -138,8 +118,7 @@ void verifyChromosomeMetadata() throws IOException { @Test void verifyAssemblyHasScaffolds() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertEquals(3286, scaffolds.size()); @@ -147,8 +126,7 @@ void verifyAssemblyHasScaffolds() throws IOException { @Test void assertParsedScaffoldValid() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertTrue(scaffolds.size() > 0); diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java new file mode 100644 index 00000000..b6dd79db --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java @@ -0,0 +1,61 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.data.domain.PageRequest; +import org.springframework.test.annotation.DirtiesContext; +import org.springframework.test.context.ActiveProfiles; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.AssemblyService; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.LinkedList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; + + +@ActiveProfiles("test") +@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) +@SpringBootTest +public class ENASequenceNameUpdaterTest { + private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; + + @Autowired + private ENASequenceNameUpdater enaSequenceNameUpdater; + + @Autowired + private AssemblyService assemblyService; + + @Autowired + private ChromosomeService chromosomeService; + private final List chromosomeEntities = new LinkedList<>(); + + @BeforeEach + void setup() { + assemblyService.fetchAndInsertAssembly(GCA_ACCESSION_HAVING_CHROMOSOMES); + } + + @AfterEach + void tearDown() { + chromosomeEntities.stream().forEach(c -> chromosomeService.deleteChromosome(c)); + assemblyService.deleteAssembly(assemblyService.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES).get()); + } + + @Test + public void testUpdateENASequenceName() { + List chromosomeListBeforeUpdate = chromosomeService.getChromosomesByInsdcAccession(GCA_ACCESSION_HAVING_CHROMOSOMES, + PageRequest.of(0, 5000)).getContent(); + chromosomeListBeforeUpdate.stream().forEach(c -> assertNull(c.getEnaSequenceName())); + + enaSequenceNameUpdater.updateENASequenceNameForAssembly(GCA_ACCESSION_HAVING_CHROMOSOMES); + + List chromosomeListAfterUpdate = chromosomeService.getChromosomesByInsdcAccession(GCA_ACCESSION_HAVING_CHROMOSOMES, + PageRequest.of(0, 5000)).getContent(); + chromosomeListAfterUpdate.stream().forEach(c -> assertNotNull(c.getEnaSequenceName())); + } +} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java new file mode 100644 index 00000000..6969b670 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -0,0 +1,70 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.data.domain.PageRequest; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.test.annotation.DirtiesContext; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; +import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.mock; + +@ActiveProfiles("test") +@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) +@SpringBootTest +class MD5ChecksumUpdaterTest { + private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; + private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); + private List chromosomeEntityList = new ArrayList<>(); + @Autowired + private JdbcTemplate jdbcTemplate; + @Autowired + private ChromosomeService chromosomeService; + private MD5ChecksumUpdater md5ChecksumUpdater; + + @BeforeEach + void setup() throws JsonProcessingException { + RestTemplate restTemplate = mock(RestTemplate.class); + md5ChecksumUpdater = new MD5ChecksumUpdater(restTemplate, jdbcTemplate, chromosomeService); + for (int i = 0; i < 5; i++) { + ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); + chromosomeEntityList.add(chromosomeEntity); + chromosomeService.insertChromosome(chromosomeEntity); + + String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; + Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", + chromosomeEntity.getInsdcAccession()), JsonNode.class)) + .thenReturn(new ObjectMapper().readTree(jsonMD5Response)); + } + } + + @Test + void testUpdateMD5ChecksumForAssembly() { + chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), + PageRequest.of(0, 100)) + .forEach(c -> assertNull(c.getMd5checksum())); + + md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); + + chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), + PageRequest.of(0, 100)) + .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); + } +} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java index 02789609..b9447aaf 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java @@ -184,6 +184,9 @@ void setup() { @AfterEach void tearDown() { + for(ChromosomeEntity chromosomeEntity: chromosomeEntities){ + service.deleteChromosome(chromosomeEntity); + } for (AssemblyEntity assemblyEntity : assemblyEntities) { assemblyService.deleteAssembly(assemblyEntity); } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java index d713e2e9..5b1ac463 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java @@ -20,7 +20,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.data.domain.Page; @@ -31,12 +30,11 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; -import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; +import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; -import java.io.IOException; import java.util.List; import java.util.Optional; -import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -49,7 +47,6 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest public class AssemblyServiceIntegrationTest { - private static final int TEST_ENTITIES_NUMBERS = 11; private final AssemblyEntity[] entities = new AssemblyEntity[TEST_ENTITIES_NUMBERS]; @@ -57,25 +54,26 @@ public class AssemblyServiceIntegrationTest { @Autowired AssemblyRepository repository; + @Autowired + ChromosomeRepository chromosomeRepository; + + @Autowired + ChromosomeService chromosomeService; + @Autowired private AssemblyService service; @BeforeEach - void setup() throws IOException { + void setup() { NCBIAssemblyDataSource mockNcbiDataSource = mock(NCBIAssemblyDataSource.class); ENAAssemblyDataSource mockEnaDataSource = mock(ENAAssemblyDataSource.class); - ChecksumSetter mockChecksumSetter = mock(ChecksumSetter.class); + ChromosomeUpdater chromosomeUpdater = mock(ChromosomeUpdater.class); for (int i = 0; i < entities.length; i++) { AssemblyEntity generate = AssemblyGenerator.generate(i); entities[i] = generate; - Mockito.when(mockNcbiDataSource.getAssemblyByAccession(generate.getInsdcAccession())) - .thenReturn(Optional.of(generate)); - Mockito.when(mockNcbiDataSource.getAssemblyByAccession(generate.getRefseq())) - .thenReturn(Optional.of(generate)); - Mockito.when(mockChecksumSetter.updateMd5CheckSumForAssemblyAsync(generate.getInsdcAccession())) - .thenReturn(new CompletableFuture<>()); } - service = new AssemblyService(repository, mockNcbiDataSource, mockEnaDataSource, mockChecksumSetter); + service = new AssemblyService(chromosomeService, repository, chromosomeRepository, mockNcbiDataSource, + mockEnaDataSource, chromosomeUpdater); } @AfterEach diff --git a/src/test/resources/application-test.properties b/src/test/resources/application-test.properties index 166fc87b..83395a0e 100644 --- a/src/test/resources/application-test.properties +++ b/src/test/resources/application-test.properties @@ -25,6 +25,7 @@ spring.h2.console.path=/console/ spring.jpa.hibernate.ddl-auto=create-drop spring.jpa.show-sql=false spring.jpa.properties.hibernate.format_sql=false +spring.jpa.open-in-view=false ftp.proxy.host=null ftp.proxy.port=0