Skip to content

Commit

Permalink
Merge pull request #349 from pixlise/feature/archive-optimiser-tool
Browse files Browse the repository at this point in the history
Feature/archive optimiser tool
  • Loading branch information
pnemere authored Nov 11, 2024
2 parents a77ef16 + 7a0c922 commit 496f49d
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 94 deletions.
28 changes: 14 additions & 14 deletions api/dataimport/datasetArchive/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,35 +59,35 @@ func NewDatasetArchiveDownloader(
// Unzipped files path (archive zips unzipped here),
// Ordered list of the archive zip file paths that were loaded (empty if none were found)
// Error (if any)
func (dl *DatasetArchiveDownloader) DownloadFromDatasetArchive(datasetID string, workingDir string) (string, string, int, string, error) {
func (dl *DatasetArchiveDownloader) DownloadFromDatasetArchive(datasetID string, workingDir string) (string, string, []string, error) {
// Create directories to process data in
dl.log.Debugf("Preparing to download archived dataset %v...", datasetID)

downloadPath, err := fileaccess.MakeEmptyLocalDirectory(workingDir, "download")
if err != nil {
err = fmt.Errorf("Failed to generate directory for importer downloads: %v", err)
//dl.log.Errorf("%v", err)
return "", "", 0, "", err
return "", "", []string{}, err
}
unzippedPath, err := fileaccess.MakeEmptyLocalDirectory(workingDir, "unzipped")
if err != nil {
err = fmt.Errorf("Failed to generate directory for importer unzips: %v", err)
//dl.log.Errorf("%v", err)
return "", "", 0, "", err
return "", "", []string{}, err
}

// Download all zip files from archive for this dataset ID, and extract them as required
dl.log.Debugf("Downloading archived zip files...")

zipCount, lastZipName, err := dl.downloadArchivedZipsForDataset(datasetID, downloadPath, unzippedPath)
zipFilesOrdered, err := dl.downloadArchivedZipsForDataset(datasetID, downloadPath, unzippedPath)
if err != nil {
err = fmt.Errorf("Failed to download archived zip files for dataset ID: %v. Error: %v", datasetID, err)
//dl.log.Errorf("%v", err)
return downloadPath, unzippedPath, zipCount, lastZipName, err
return downloadPath, unzippedPath, zipFilesOrdered, err
}

dl.log.Debugf("Dataset %v downloaded %v zip files from archive", datasetID, zipCount)
return downloadPath, unzippedPath, zipCount, lastZipName, nil
dl.log.Debugf("Dataset %v downloaded %v zip files from archive", datasetID, len(zipFilesOrdered))
return downloadPath, unzippedPath, zipFilesOrdered, nil
}

func (dl *DatasetArchiveDownloader) DownloadPseudoIntensityRangesFile(configBucket string, downloadPath string, version string) (string, error) {
Expand Down Expand Up @@ -121,45 +121,45 @@ func (dl *DatasetArchiveDownloader) fetchFile(bucketFrom string, pathFrom string
// Returns 2 things:
// Ordered list of the archived zip file paths that were downloaded and unzipped
// Error if there was one
func (dl *DatasetArchiveDownloader) downloadArchivedZipsForDataset(datasetID string, downloadPath string, unzippedPath string) (int, string, error) {
func (dl *DatasetArchiveDownloader) downloadArchivedZipsForDataset(datasetID string, downloadPath string, unzippedPath string) ([]string, error) {
// Download all zip files that have the dataset ID prefixed in their file name
// Save them into downloadPath, then unzip them in timestamp order into unzippedPath
archiveSearchPath := path.Join(filepaths.RootArchive, datasetID)
dl.log.Infof("Searching for archived files in: s3://%v/%v", dl.datasetBucket, archiveSearchPath)

archivedFiles, err := dl.remoteFS.ListObjects(dl.datasetBucket, archiveSearchPath)
if err != nil {
return 0, "", err
return []string{}, err
}

orderedArchivedFiles, err := getOrderedArchiveFiles(archivedFiles)

if err != nil {
// Stop here if we find a bad file
return 0, "", err
return []string{}, err
}

fileCount := 0

for _, filePath := range orderedArchivedFiles {
fileName := path.Base(filePath)
if !strings.HasSuffix(fileName, ".zip") {
return 0, "", errors.New("Expected zip file, got: " + fileName)
return []string{}, errors.New("Expected zip file, got: " + fileName)
}

savePath := filepath.Join(downloadPath, fileName)
err = dl.fetchFile(dl.datasetBucket, filePath, savePath)

if err != nil {
return 0, "", err
return []string{}, err
}

dl.log.Debugf("Unzipping: \"%v\"", savePath)

// Unzip the file
unzippedFileNames, err := utils.UnzipDirectory(savePath, unzippedPath, false)
if err != nil {
return 0, "", err
return []string{}, err
}

fileCount += len(unzippedFileNames)
Expand All @@ -181,7 +181,7 @@ func (dl *DatasetArchiveDownloader) downloadArchivedZipsForDataset(datasetID str
}

dl.log.Infof("Downloaded %v zip files, unzipped %v files. Last file name: %v", len(orderedArchivedFiles), fileCount, lastFileName)
return len(orderedArchivedFiles), filepath.Base(lastFileName), nil
return orderedArchivedFiles, nil
}

func (dl *DatasetArchiveDownloader) DownloadUserCustomisationsForDataset(datasetID string, downloadPath string) error {
Expand Down
6 changes: 3 additions & 3 deletions api/dataimport/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ func ImportDataset(

// Firstly, we download from the archive
archive := datasetArchive.NewDatasetArchiveDownloader(remoteFS, localFS, log, datasetBucket, manualUploadBucket)
localDownloadPath, localUnzippedPath, zipCount, _, err := archive.DownloadFromDatasetArchive(datasetID, workingDir)
localDownloadPath, localUnzippedPath, zipFiles, err := archive.DownloadFromDatasetArchive(datasetID, workingDir)
if err != nil {
return workingDir, savedSummary, "", false, err
}

// If no zip files were loaded, maybe this dataset is a manually uploaded one, try to import from there instead
if zipCount == 0 {
if len(zipFiles) == 0 {
log.Infof("No zip files found in archive, dataset may have been manually uploaded. Trying to download...")
localDownloadPath, localUnzippedPath, err = archive.DownloadFromDatasetUploads(datasetID, workingDir)
if err != nil {
Expand Down Expand Up @@ -154,7 +154,7 @@ func ImportDataset(
}
}

return workingDir, savedSummary, updatenotificationtype, !justArchived && zipCount > 1, err
return workingDir, savedSummary, updatenotificationtype, !justArchived && len(zipFiles) > 1, err
}

// ImportFromLocalFileSystem - As the name says, imports from directory on local file system
Expand Down
Loading

0 comments on commit 496f49d

Please sign in to comment.