From c2cd1cfe2305d1c933e7753a6823d2df4fb0c469 Mon Sep 17 00:00:00 2001 From: Joren Six Date: Tue, 11 Oct 2022 14:47:10 +0200 Subject: [PATCH] Caching improvements: to avoid repeated feature extraction caching is now implemented with a content based hash and not on a path of a file. --- resources/defaults/logging.properties | 3 +- src/main/java/be/panako/cli/Application.java | 4 +- src/main/java/be/panako/cli/Delete.java | 58 ++-- src/main/java/be/panako/cli/Monitor.java | 4 +- src/main/java/be/panako/cli/Print.java | 2 +- src/main/java/be/panako/cli/Query.java | 4 +- src/main/java/be/panako/cli/Resolve.java | 4 +- src/main/java/be/panako/cli/Store.java | 61 ++-- .../be/panako/strategy/olaf/OlafStrategy.java | 107 +++---- .../olaf/storage/OlafCachingStorage.java | 102 ++++++ .../olaf/storage/OlafStorageFile.java | 7 +- .../strategy/panako/PanakoStrategy.java | 132 ++++---- .../panako/storage/PanakoCachingStorage.java | 102 ++++++ .../panako/storage/PanakoStorageFile.java | 36 +-- .../java/be/panako/util/AudioFileUtils.java | 55 ++++ src/main/java/be/panako/util/FileUtils.java | 86 ++++- src/main/java/be/panako/util/Key.java | 8 + src/main/java/be/panako/util/MurmurHash3.java | 301 ++++++++++++++++++ .../be/panako/tests/OlafStorageKVTest.java | 32 ++ .../be/panako/tests/PanakoStrategyTest.java | 121 +------ src/test/java/be/panako/tests/TestData.java | 119 +++++++ src/test/java/be/panako/tests/UtilsTest.java | 45 +++ 22 files changed, 1021 insertions(+), 372 deletions(-) create mode 100644 src/main/java/be/panako/strategy/olaf/storage/OlafCachingStorage.java create mode 100644 src/main/java/be/panako/strategy/panako/storage/PanakoCachingStorage.java create mode 100644 src/main/java/be/panako/util/AudioFileUtils.java create mode 100644 src/main/java/be/panako/util/MurmurHash3.java create mode 100644 src/test/java/be/panako/tests/TestData.java create mode 100644 src/test/java/be/panako/tests/UtilsTest.java diff --git a/resources/defaults/logging.properties 
b/resources/defaults/logging.properties index c21fe54..0e54731 100644 --- a/resources/defaults/logging.properties +++ b/resources/defaults/logging.properties @@ -55,4 +55,5 @@ java.util.logging.FileHandler.limit=5000000 # integer to the base file name: java.util.logging.FileHandler.count=5 -# Style of output (Simple or XML): \ No newline at end of file +# Style of output (Simple or XML): +java.util.logging.FileHandler.formatter = java.util.logging.SimpleFormatter \ No newline at end of file diff --git a/src/main/java/be/panako/cli/Application.java b/src/main/java/be/panako/cli/Application.java index 26ec34a..7786a08 100644 --- a/src/main/java/be/panako/cli/Application.java +++ b/src/main/java/be/panako/cli/Application.java @@ -140,9 +140,9 @@ protected boolean checkFile(String file){ File f = new File(file); boolean fileOk = false; if(f.exists() && f.canRead()){ - fileOk = true; + fileOk = FileUtils.checkFileSize(f,Config.getInt(Key.MAX_FILE_SIZE)); }else{ - String message = "Could not read " + f.getAbsolutePath() + " it does not exist or is not accesible at the moment.)"; + String message = "Could not read " + f.getAbsolutePath() + " it does not exist or is not accessible at the moment.)"; LOG.warning(message); System.out.println(message); } diff --git a/src/main/java/be/panako/cli/Delete.java b/src/main/java/be/panako/cli/Delete.java index 752d461..00df610 100644 --- a/src/main/java/be/panako/cli/Delete.java +++ b/src/main/java/be/panako/cli/Delete.java @@ -45,10 +45,7 @@ import be.panako.strategy.Strategy; import be.panako.strategy.olaf.OlafStrategy; -import be.panako.util.Config; -import be.panako.util.Key; -import be.panako.util.StopWatch; -import be.panako.util.TimeUnit; +import be.panako.util.*; /** * Delete fingerptings from the index. 
@@ -121,42 +118,29 @@ public DeleteTask(File file,int taskID,int totalTasks){ public void run() { StopWatch w = new StopWatch(); - if(checkFile(file)){ - - Strategy strategy = Strategy.getInstance(); - - boolean hasResource = false; - hasResource = strategy.hasResource(file.getAbsolutePath()); - - String message=null; - if(hasResource){ - message = String.format("%d/%d;%s;%s;%s",taskID,totalTasks,file.getName(),StopWatch.toTime("", 0),"Deletion skipped: resource not in the key value store;"); - }else{ - double durationInSeconds = strategy.delete(file.getAbsolutePath()); - - double cpuSecondsPassed = w.timePassed(TimeUnit.SECONDS); - String audioDuration = StopWatch.toTime("", (int) Math.round(durationInSeconds)); - String cpuTimeDuration = w.formattedToString(); - double timeRatio = durationInSeconds/cpuSecondsPassed; - message = String.format("%d/%d;%s;%s;%s;%.2f",taskID,totalTasks,file.getName(),audioDuration,cpuTimeDuration,timeRatio); - } - LOG.info(message); - System.out.println(message); - } - } - - private boolean checkFile(File file){ - boolean fileOk = false; - //file must be smaller than a configured number of bytes - if(file.length() != 0 && file.length() < Config.getInt(Key.MAX_FILE_SIZE)){ - fileOk = true; + Strategy strategy = Strategy.getInstance(); + + boolean hasResource = false; + hasResource = strategy.hasResource(file.getAbsolutePath()); + + String message=null; + if(!hasResource){ + message = String.format("%d/%d;%s;%s;%s",taskID,totalTasks,file.getName(),StopWatch.toTime("", 0),"Deletion skipped: resource not in the key value store;"); }else{ - String message = "Could not process " + file.getName() + " it has an unacceptable file size: zero or larger than " + Config.getInt(Key.MAX_FILE_SIZE) + "bytes )."; - LOG.warning(message); - System.out.println(message); + double durationInSeconds = strategy.delete(file.getAbsolutePath()); + + double cpuSecondsPassed = w.timePassed(TimeUnit.SECONDS); + String audioDuration = StopWatch.toTime("", (int) 
Math.round(durationInSeconds)); + String cpuTimeDuration = w.formattedToString(); + double timeRatio = durationInSeconds/cpuSecondsPassed; + message = String.format("%d/%d;%s;%s;%s;%.2f",taskID,totalTasks,file.getName(),audioDuration,cpuTimeDuration,timeRatio); } - return fileOk; + LOG.info(message); + System.out.println(message); + } + + } diff --git a/src/main/java/be/panako/cli/Monitor.java b/src/main/java/be/panako/cli/Monitor.java index b300341..27cffe0 100644 --- a/src/main/java/be/panako/cli/Monitor.java +++ b/src/main/java/be/panako/cli/Monitor.java @@ -83,14 +83,14 @@ public void run(String... args) { if(hasArgument("debug", args) || processors==1){ int taskNumber = 1; for(File file: files){ - new Monitor.MonitorTask(file.getPath(),taskNumber,files.size()).run(); + new Monitor.MonitorTask(file.getAbsolutePath(),taskNumber,files.size()).run(); taskNumber++; } }else{ ExecutorService executor = Executors.newFixedThreadPool(processors); int taskNumber = 1; for(File file: files){ - executor.submit(new Monitor.MonitorTask(file.getPath(),taskNumber,files.size())); + executor.submit(new Monitor.MonitorTask(file.getAbsolutePath(),taskNumber,files.size())); taskNumber++; } executor.shutdown(); diff --git a/src/main/java/be/panako/cli/Print.java b/src/main/java/be/panako/cli/Print.java index 3077d6a..a07ea56 100644 --- a/src/main/java/be/panako/cli/Print.java +++ b/src/main/java/be/panako/cli/Print.java @@ -59,7 +59,7 @@ public void run(String... args) { Strategy strategy = Strategy.getInstance(); for(File file: files){ - strategy.print(file.getPath(),sonicVisualizerOutput); + strategy.print(file.getAbsolutePath(),sonicVisualizerOutput); } } diff --git a/src/main/java/be/panako/cli/Query.java b/src/main/java/be/panako/cli/Query.java index cafcad9..7aa5fb4 100644 --- a/src/main/java/be/panako/cli/Query.java +++ b/src/main/java/be/panako/cli/Query.java @@ -70,14 +70,14 @@ public void run(String... 
args) { if(hasArgument("debug", args) || processors==1){ int taskNumber = 1; for(File file: files){ - new QueryTask(file.getPath(),taskNumber,files.size()).run(); + new QueryTask(file.getAbsolutePath(),taskNumber,files.size()).run(); taskNumber++; } }else{ ExecutorService executor = Executors.newFixedThreadPool(processors); int taskNumber = 1; for(File file: files){ - executor.submit(new QueryTask(file.getPath(),taskNumber,files.size())); + executor.submit(new QueryTask(file.getAbsolutePath(),taskNumber,files.size())); taskNumber++; } executor.shutdown(); diff --git a/src/main/java/be/panako/cli/Resolve.java b/src/main/java/be/panako/cli/Resolve.java index 742878a..84e3d82 100644 --- a/src/main/java/be/panako/cli/Resolve.java +++ b/src/main/java/be/panako/cli/Resolve.java @@ -47,11 +47,11 @@ class Resolve extends Application { @Override public void run(String... args) { - Strategy strat = Strategy.getInstance(); + Strategy strategy = Strategy.getInstance(); List files = getFilesFromArguments(args); for(File f : files) { - System.out.println(strat.resolve(f.getPath())); + System.out.println(strategy.resolve(f.getAbsolutePath())); } } diff --git a/src/main/java/be/panako/cli/Store.java b/src/main/java/be/panako/cli/Store.java index 2eb490d..b3300ee 100644 --- a/src/main/java/be/panako/cli/Store.java +++ b/src/main/java/be/panako/cli/Store.java @@ -42,10 +42,7 @@ import java.util.logging.Logger; import be.panako.strategy.Strategy; -import be.panako.util.Config; -import be.panako.util.Key; -import be.panako.util.StopWatch; -import be.panako.util.TimeUnit; +import be.panako.util.*; /** * Store audio fingerptings in the storage. 
@@ -116,49 +113,31 @@ public StoreTask(File file,int taskID,int totalTasks){ @Override public void run() { - StopWatch w = new StopWatch(); - if(checkFile(file)){ - - Strategy strategy = Strategy.getInstance(); - - boolean isDouble = false; - if(Config.getBoolean(Key.CHECK_DUPLICATE_FILE_NAMES) ){ - isDouble = strategy.hasResource(file.getAbsolutePath()); - } + Strategy strategy = Strategy.getInstance(); - String message=null; - if(isDouble){ - message = String.format("%d/%d;%s;%s",taskID,totalTasks,file.getName(),"Skipped: resource already stored;"); - }else{ - double durationInSeconds = strategy.store(file.getAbsolutePath(), file.getName()); - double cpuSecondsPassed = w.timePassed(TimeUnit.SECONDS); - String audioDuration = StopWatch.toTime("", (int) Math.round(durationInSeconds)); - String cpuTimeDuration = w.formattedToString(); - double timeRatio = durationInSeconds/cpuSecondsPassed; - message = String.format("%d/%d;%s;%s;%s;%.2f",taskID,totalTasks,file.getName(),audioDuration,cpuTimeDuration,timeRatio); - } - LOG.info(message); - System.out.println(message); + boolean isDouble = false; + if(Config.getBoolean(Key.CHECK_DUPLICATE_FILE_NAMES) ){ + isDouble = strategy.hasResource(file.getAbsolutePath()); } - } - - private boolean checkFile(File file){ - boolean fileOk = false; - - //file must be smaller than a configured number of bytes - long maxFileSize = Config.getInt(Key.MAX_FILE_SIZE); - //from megabytes to bytes - maxFileSize = maxFileSize * 1024 * 1024; - if(file.length() != 0 && file.length() < maxFileSize ){ - fileOk = true; + + String message=null; + if(isDouble){ + message = String.format("%d/%d;%s;%s",taskID,totalTasks,file.getName(),"Skipped: resource already stored;"); }else{ - String message = "Could not process " + file.getName() + " it has an unacceptable file size.\n\tFile is " + file.length() + " bytes. 
\n\tShould be more than zero and smaller than " + Config.getInt(Key.MAX_FILE_SIZE) + " bytes )."; - LOG.warning(message); - System.out.println(message); + double durationInSeconds = strategy.store(file.getAbsolutePath(), file.getName()); + double cpuSecondsPassed = w.timePassed(TimeUnit.SECONDS); + String audioDuration = StopWatch.toTime("", (int) Math.round(durationInSeconds)); + String cpuTimeDuration = w.formattedToString(); + double timeRatio = durationInSeconds/cpuSecondsPassed; + message = String.format("%d/%d;%s;%s;%s;%.2f",taskID,totalTasks,file.getName(),audioDuration,cpuTimeDuration,timeRatio); } - return fileOk; + LOG.info(message); + System.out.println(message); + } + + } diff --git a/src/main/java/be/panako/strategy/olaf/OlafStrategy.java b/src/main/java/be/panako/strategy/olaf/OlafStrategy.java index 3ccc0d4..324f687 100644 --- a/src/main/java/be/panako/strategy/olaf/OlafStrategy.java +++ b/src/main/java/be/panako/strategy/olaf/OlafStrategy.java @@ -38,24 +38,14 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; +import java.util.*; import java.util.logging.Logger; import be.panako.strategy.QueryResult; import be.panako.strategy.QueryResultHandler; import be.panako.strategy.Strategy; import be.panako.strategy.olaf.storage.*; -import be.panako.util.Config; -import be.panako.util.FileUtils; -import be.panako.util.Key; -import be.panako.util.StopWatch; +import be.panako.util.*; import be.tarsos.dsp.AudioDispatcher; import be.tarsos.dsp.io.jvm.AudioDispatcherFactory; @@ -72,32 +62,37 @@ public class OlafStrategy extends Strategy { private final static Logger LOG = Logger.getLogger(OlafStrategy.class.getName()); + private final OlafStorage db; + /** * Create an instance */ - public OlafStrategy(){} - - private 
OlafStorage getDbInstance(){ - final OlafStorage db; - - if (Config.get(Key.OLAF_STORAGE).equalsIgnoreCase("MEM")) { - db = OlafStorageMemory.getInstance(); - } else { - //By default use the LMDB storage + public OlafStrategy(){ + OlafStorage db; + if (Config.get(Key.OLAF_STORAGE).equalsIgnoreCase("LMDB")) { db = OlafStorageKV.getInstance(); + }else if (Config.get(Key.OLAF_STORAGE).equalsIgnoreCase("FILE")) { + db = OlafStorageFile.getInstance(); + }else { + db = OlafStorageMemory.getInstance(); } + if(Config.getBoolean(Key.OLAF_CACHE_TO_FILE) && db != OlafStorageFile.getInstance()) { + LOG.info("Using "+ db.getClass().getSimpleName() + " storage with caching front."); + db = new OlafCachingStorage(OlafStorageFile.getInstance(),db); + }else { + LOG.info("Using " + db.getClass().getSimpleName() + " as storage."); + } + this.db = db; + } + + private OlafStorage getStorage(){ return db; } @Override public double store(String resource, String description) { - OlafStorage db = getDbInstance(); - - OlafStorage fileCache = null; - if(Config.getBoolean(Key.OLAF_CACHE_TO_FILE)) { - fileCache = OlafStorageFile.getInstance(); - } + OlafStorage db = getStorage(); List prints = toFingerprints(resource); @@ -107,16 +102,9 @@ public double store(String resource, String description) { long hash = print.hash(); int printT1 = print.t1; db.addToStoreQueue(hash, resourceID, printT1); - if(fileCache!=null) { - fileCache.addToStoreQueue(hash, resourceID, printT1); - } } db.processStoreQueue(); - if(fileCache!=null) { - fileCache.processStoreQueue(); - } - //store meta-data as well float duration = 0; if(prints.size() != 0) { @@ -139,7 +127,7 @@ public double store(String resource, String description) { @Override public double delete(String resource) { - OlafStorage db = getDbInstance(); + OlafStorage db = getStorage(); List prints = toFingerprints(resource); @@ -176,30 +164,37 @@ public double delete(String resource) { * @return A list of fingerprints */ public List 
toFingerprints(String resource){ - + return toFingerprints(resource,0,MAX_TIME); + } + + private List toFingerprints(String resource,double startTimeOffset,double numberOfSeconds){ if(Config.getBoolean(Key.OLAF_USE_CACHED_PRINTS)) { String folder = Config.get(Key.OLAF_CACHE_FOLDER); folder = FileUtils.expandHomeDir(folder); String tdbPath = FileUtils.combine(folder,resolve(resource) + ".tdb"); + if(FileUtils.exists(tdbPath)) { - List prints = new ArrayList<>(); List printData = readFingerprintFile(tdbPath); for(long[] data : printData) { long fingerprintHash = data[0]; int t1 = (int) data[2]; - prints.add(new OlafFingerprint(fingerprintHash,t1)); + float t1InSeconds = blocksToSeconds(t1); + + //skip all fingerprints after stop time + if(t1InSeconds > startTimeOffset + numberOfSeconds) + break; + //only add prints if they are after the start time offset + if(t1InSeconds >= startTimeOffset) + prints.add(new OlafFingerprint(fingerprintHash,t1)); } + LOG.info(String.format("Read %d cached fingerprints from file '%s' (start: %.3f sec, stop: %.3f sec) for '%s'", prints.size(),tdbPath,startTimeOffset,startTimeOffset+numberOfSeconds,resource)); return prints; - + }else{ + LOG.info(String.format("Could not read cached fingerprints from file '%s' for '%s'",tdbPath,resource)); } - } - - return toFingerprints(resource,0,MAX_TIME); - } - + } //else no cached prints are found - private List toFingerprints(String resource,double startTimeOffset,double numberOfSeconds){ int samplerate, size, overlap; samplerate = Config.getInt(Key.OLAF_SAMPLE_RATE); size = Config.getInt(Key.OLAF_SIZE); @@ -219,7 +214,6 @@ private List toFingerprints(String resource,double startTimeOff return eventPointProcessor.getFingerprints(); } - private float blocksToSeconds(int t) { return t * (Config.getInt(Key.OLAF_STEP_SIZE)/(float) Config.getInt(Key.OLAF_SAMPLE_RATE)); } @@ -465,11 +459,9 @@ private void query(String query, int maxNumberOfResults, Set avoid, Que public void monitor(String query, int 
maxNumberOfReqults, Set avoid, QueryResultHandler handler) { int overlapInSeconds = Config.getInt(Key.MONITOR_OVERLAP); // 5 int stepSizeInSeconds = Config.getInt(Key.MONITOR_STEP_SIZE); //25 - - // Get the total duration - AudioDispatcher d = AudioDispatcherFactory.fromPipe(query, 8000, 2048, 0); - d.run(); - double totalDuration = d.secondsProcessed(); + + // Get the total duration efficiently + double totalDuration = AudioFileUtils.audioFileDurationInSeconds(new File(query)); //Steps: 0-25s ; 20-45s ; 40-65s ... int actualStep = stepSizeInSeconds - overlapInSeconds;//20s @@ -631,17 +623,7 @@ public void print(String path, boolean sonicVisualizerOutput) { @Override public void clear() { - if (Config.get(Key.OLAF_STORAGE).equalsIgnoreCase("LMDB")) { - OlafStorageKV.getInstance().clear(); - - if(Config.getBoolean(Key.OLAF_CACHE_TO_FILE)) { - OlafStorageFile.getInstance().clear(); - } - - } else if (Config.get(Key.OLAF_STORAGE).equalsIgnoreCase("MEM")) { - OlafStorageMemory.getInstance().clear(); - } - + getStorage().clear(); } @Override @@ -649,7 +631,6 @@ public String metadata(String path) { final OlafStorage db = storageInstance(); long identifier = FileUtils.getIdentifier(path); OlafResourceMetadata metaData = db.getMetadata(identifier); - return String.format("%d ; %s ; %.3f (s) ; %d (#) ; %.3f (#/s)",metaData.identifier,metaData.path,metaData.duration,metaData.numFingerprints,metaData.printsPerSecond()); } } diff --git a/src/main/java/be/panako/strategy/olaf/storage/OlafCachingStorage.java b/src/main/java/be/panako/strategy/olaf/storage/OlafCachingStorage.java new file mode 100644 index 0000000..ea4b39c --- /dev/null +++ b/src/main/java/be/panako/strategy/olaf/storage/OlafCachingStorage.java @@ -0,0 +1,102 @@ +package be.panako.strategy.olaf.storage; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * This is a caching front for a storage engine. 
+ * + * To save time on feature extraction a cache can store the extracted features (the fingerprints) + * and a store often stores an inverted index: the extracted features in an efficient way to query the features. + * + * During a query operation the cache is first checked for a match, only if no match is found a feature extraction + * takes place. + * + * The query operations are only executed on the inverted index since it is + */ +public class OlafCachingStorage implements OlafStorage{ + + private final OlafStorage cachingIndex; + private final OlafStorage invertedIndex; + + /** + * Create a new caching storage front. + * @param cachingIndex The underlying cache (straight caching index). + * @param invertedIndex The actual inverted index storage (key value store). + */ + public OlafCachingStorage(OlafStorage cachingIndex, OlafStorage invertedIndex){ + this.cachingIndex = cachingIndex; + this.invertedIndex = invertedIndex; + } + + @Override + public void storeMetadata(long resourceID, String resourcePath, float duration, int numberOfFingerprints) { + cachingIndex.storeMetadata(resourceID,resourcePath,duration,numberOfFingerprints); + invertedIndex.storeMetadata(resourceID,resourcePath,duration,numberOfFingerprints); + } + + @Override + public void addToStoreQueue(long fingerprintHash, int resourceIdentifier, int t1) { + cachingIndex.addToStoreQueue(fingerprintHash,resourceIdentifier,t1); + invertedIndex.addToStoreQueue(fingerprintHash,resourceIdentifier,t1); + } + + @Override + public void processStoreQueue() { + cachingIndex.processStoreQueue(); + invertedIndex.processStoreQueue(); + } + + @Override + public void clearStoreQueue() { + cachingIndex.clearStoreQueue(); + invertedIndex.clearStoreQueue(); + } + + @Override + public void printStatistics(boolean printDetailedStats) { + invertedIndex.printStatistics(printDetailedStats); + } + + @Override + public OlafResourceMetadata getMetadata(long identifier) { + return invertedIndex.getMetadata(identifier); + } + 
+ @Override + public void addToQueryQueue(long queryHash) { + //it does not make sense to use the non optimized caching index for query tasks + invertedIndex.addToQueryQueue(queryHash); + } + + @Override + public void processQueryQueue(Map> matchAccumulator, int range, Set resourcesToAvoid) { + //it does not make sense to use the non optimized caching index for query tasks + invertedIndex.processQueryQueue(matchAccumulator,range,resourcesToAvoid); + } + + @Override + public void addToDeleteQueue(long fingerprintHash, int resourceIdentifier, int t1) { + cachingIndex.addToDeleteQueue(fingerprintHash,resourceIdentifier,t1); + invertedIndex.addToDeleteQueue(fingerprintHash,resourceIdentifier,t1); + } + + @Override + public void processDeleteQueue() { + cachingIndex.processDeleteQueue(); + invertedIndex.processDeleteQueue(); + } + + @Override + public void deleteMetadata(long resourceID) { + cachingIndex.deleteMetadata(resourceID); + invertedIndex.deleteMetadata(resourceID); + } + + @Override + public void clear() { + cachingIndex.clear(); + invertedIndex.clear(); + } +} diff --git a/src/main/java/be/panako/strategy/olaf/storage/OlafStorageFile.java b/src/main/java/be/panako/strategy/olaf/storage/OlafStorageFile.java index 4bbb444..7c19075 100644 --- a/src/main/java/be/panako/strategy/olaf/storage/OlafStorageFile.java +++ b/src/main/java/be/panako/strategy/olaf/storage/OlafStorageFile.java @@ -41,6 +41,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.logging.Logger; import be.panako.util.Config; import be.panako.util.FileUtils; @@ -50,6 +51,7 @@ * Stores fingerprints to a file. It is mainly used to cache fingerprint extraction results. */ public class OlafStorageFile implements OlafStorage { + private final static Logger LOG = Logger.getLogger(OlafStorageFile.class.getName()); /** * The single instance of the storage. 
@@ -239,10 +241,11 @@ public void deleteMetadata(long resourceID) { public void clear() { if(!FileUtils.exists(storeDir.getAbsolutePath())) return; - - for(File f : storeDir.listFiles()) { + File[] filesToDelete = storeDir.listFiles(); + for(File f : filesToDelete) { FileUtils.rm(f.getAbsolutePath()); } + LOG.info(String.format("Removed %d files from file storage",filesToDelete.length)); } } diff --git a/src/main/java/be/panako/strategy/panako/PanakoStrategy.java b/src/main/java/be/panako/strategy/panako/PanakoStrategy.java index a6cd204..632ca55 100644 --- a/src/main/java/be/panako/strategy/panako/PanakoStrategy.java +++ b/src/main/java/be/panako/strategy/panako/PanakoStrategy.java @@ -54,16 +54,8 @@ import be.panako.strategy.Strategy; import be.panako.strategy.olaf.storage.OlafResourceMetadata; import be.panako.strategy.olaf.storage.OlafStorage; -import be.panako.strategy.panako.storage.PanakoHit; -import be.panako.strategy.panako.storage.PanakoResourceMetadata; -import be.panako.strategy.panako.storage.PanakoStorage; -import be.panako.strategy.panako.storage.PanakoStorageFile; -import be.panako.strategy.panako.storage.PanakoStorageKV; -import be.panako.strategy.panako.storage.PanakoStorageMemory; -import be.panako.util.Config; -import be.panako.util.FileUtils; -import be.panako.util.Key; -import be.panako.util.StopWatch; +import be.panako.strategy.panako.storage.*; +import be.panako.util.*; import be.tarsos.dsp.AudioDispatcher; import be.tarsos.dsp.io.jvm.AudioDispatcherFactory; import be.tarsos.dsp.util.PitchConverter; @@ -76,29 +68,46 @@ public class PanakoStrategy extends Strategy { private final static Logger LOG = Logger.getLogger(PanakoStrategy.class.getName()); + private final int latency; + + private final PanakoStorage db; + /** * Create a new instance */ public PanakoStrategy(){ - - } - - @Override - public double store(String resource, String description) { + //determine the Gaborator latency to make sure the block time to seconds + //conversion is 
always correct! + int size = Config.getInt(Key.PANAKO_AUDIO_BLOCK_SIZE); + PanakoEventPointProcessor eventPointProcessor = new PanakoEventPointProcessor(size); + latency = eventPointProcessor.latency(); + LOG.info(String.format("Gaborator latency is %d samples",latency)); + eventPointProcessor.processingFinished(); PanakoStorage db; - if (Config.get(Key.PANAKO_STORAGE).equalsIgnoreCase("LMDB")) { db = PanakoStorageKV.getInstance(); + }else if (Config.get(Key.PANAKO_STORAGE).equalsIgnoreCase("FILE")) { + db = PanakoStorageFile.getInstance(); }else { db = PanakoStorageMemory.getInstance(); } - - PanakoStorage fileCache = null; - if(Config.getBoolean(Key.PANAKO_CACHE_TO_FILE)) { - fileCache = PanakoStorageFile.getInstance(); + if(Config.getBoolean(Key.PANAKO_CACHE_TO_FILE) && db != PanakoStorageFile.getInstance()) { + LOG.info("Using "+ db.getClass().getSimpleName() + " storage with caching front."); + db = new PanakoCachingStorage(PanakoStorageFile.getInstance(),db); + }else { + LOG.info("Using " + db.getClass().getSimpleName() + " as storage."); } - + this.db = db; + } + + private PanakoStorage getStorage(){ + return db; + } + + @Override + public double store(String resource, String description) { + PanakoStorage db = getStorage(); List prints = toFingerprints(resource); int resourceID = FileUtils.getIdentifier(resource); @@ -106,16 +115,9 @@ public double store(String resource, String description) { for(PanakoFingerprint print : prints) { long hash = print.hash(); db.addToStoreQueue(hash, resourceID, print.t1,print.f1); - if(fileCache!=null) { - fileCache.addToStoreQueue(hash, resourceID, print.t1, print.f1); - } } db.processStoreQueue(); - - if(fileCache!=null) { - fileCache.processStoreQueue(); - } - + //store meta-data as well float duration = 0; if(prints.size() != 0) { @@ -138,7 +140,7 @@ public double store(String resource, String description) { @Override public double delete(String resource) { - PanakoStorageKV db = PanakoStorageKV.getInstance(); + 
PanakoStorage db = getStorage(); List prints = toFingerprints(resource); @@ -168,30 +170,39 @@ public double delete(String resource) { } private List toFingerprints(String resource){ + return toFingerprints(resource,0,MAX_TIME); + } + + private List toFingerprints(String resource,double startTimeOffset,double numberOfSeconds){ + if(Config.getBoolean(Key.PANAKO_USE_CACHED_PRINTS)) { String folder = Config.get(Key.PANAKO_CACHE_FOLDER); folder = FileUtils.expandHomeDir(folder); String tdbPath = FileUtils.combine(folder,resolve(resource) + ".tdb"); + if(FileUtils.exists(tdbPath)) { - List prints = new ArrayList<>(); List printData = readFingerprintFile(tdbPath); for(long[] data : printData) { long fingerprintHash = data[0]; int t1 = (int) data[2]; int f1 = (int) data[3]; - prints.add(new PanakoFingerprint(fingerprintHash,t1,f1)); + float t1InSeconds = blocksToSeconds(t1); + + //skip all fingerprints after stop time + if(t1InSeconds > startTimeOffset + numberOfSeconds) + break; + //only add prints if they are after the start time offset + if(t1InSeconds >= startTimeOffset) + prints.add(new PanakoFingerprint(fingerprintHash,t1,f1)); } - - LOG.info(String.format("Read %d cached fingerprints from file %s for %s", prints.size(),tdbPath,resource)); + LOG.info(String.format("Read %d cached fingerprints from file '%s' (start: %.3f sec, stop: %.3f sec) for '%s'", prints.size(),tdbPath,startTimeOffset,startTimeOffset+numberOfSeconds,resource)); return prints; + }else{ + LOG.info(String.format("Could not read cached fingerprints from file '%s' for '%s'",tdbPath,resource)); } - } - - return toFingerprints(resource,0,MAX_TIME); - } - - private List toFingerprints(String resource,double startTimeOffset,double numberOfSeconds){ + } //else no cached prints are found + int samplerate, size, overlap; samplerate = Config.getInt(Key.PANAKO_SAMPLE_RATE); size = Config.getInt(Key.PANAKO_AUDIO_BLOCK_SIZE); @@ -205,15 +216,12 @@ private List toFingerprints(String resource,double startTimeO d 
= AudioDispatcherFactory.fromPipe(resource, samplerate, size, overlap,startTimeOffset,numberOfSeconds); PanakoEventPointProcessor eventPointProcessor = new PanakoEventPointProcessor(size); - latency = eventPointProcessor.latency(); d.addAudioProcessor(eventPointProcessor); d.run(); return eventPointProcessor.getFingerprints(); } - - - int latency; + private float blocksToSeconds(int t) { float timeResolution = Config.getFloat(Key.PANAKO_TRANSF_TIME_RESOLUTION); float sampleRate = Config.getFloat(Key.PANAKO_SAMPLE_RATE); @@ -271,7 +279,7 @@ private void query(String query, int maxNumberOfResults, Set avoid, Que prints = toFingerprints(query); } - PanakoStorageKV db = PanakoStorageKV.getInstance(); + PanakoStorage db = getStorage(); Map printMap = new HashMap<>(); @@ -289,14 +297,13 @@ private void query(String query, int maxNumberOfResults, Set avoid, Que int queryRange = Config.getInt(Key.PANAKO_QUERY_RANGE); db.processQueryQueue(matchAccumulator,queryRange , avoid); - LOG.info(String.format("Query for %d prints, %d matches in %s \n", printMap.size(),matchAccumulator.size(), w.formattedToString())); - + HashMap> hitsPerIdentifer = new HashMap<>(); final List queryResults = new ArrayList<>(); - + matchAccumulator.forEach((fingerprintHash, dbHits) -> { - + dbHits.forEach((dbHit)->{ //long matchingHash = data[0]; int identifier = dbHit.resourceID; @@ -317,7 +324,9 @@ private void query(String query, int maxNumberOfResults, Set avoid, Que hitsPerIdentifer.get(identifier).add(hit); }); }); - + + LOG.info(String.format("Query for %d prints, %d matches in %s \n", printMap.size(),matchAccumulator.size(), w.formattedToString())); + int minimumUnfilteredHits = Config.getInt(Key.PANAKO_MIN_HITS_UNFILTERED); int minimumFilteredHits = Config.getInt(Key.PANAKO_MIN_HITS_FILTERED); @@ -330,11 +339,9 @@ private void query(String query, int maxNumberOfResults, Set avoid, Que matchesToDelete.forEach( identifier ->{ hitsPerIdentifer.remove(identifier); - //System.out.println("Removed 
" + identifier); }); hitsPerIdentifer.forEach((identifier, hitlist) -> { - //System.out.println("Matches " + identifier + " matches " + hitlist.size()); //sort by query time Collections.sort(hitlist, (Comparator) (PanakoMatch a, PanakoMatch b) -> Integer.valueOf(a.queryTime).compareTo(Integer.valueOf(b.queryTime))); @@ -474,10 +481,7 @@ public void monitor(String query, int maxNumberOfReqults, Set avoid, Qu int overlapInSeconds = Config.getInt(Key.MONITOR_OVERLAP); // 5 int stepSizeInSeconds = Config.getInt(Key.MONITOR_STEP_SIZE); //25 - // Get the total duration - AudioDispatcher d = AudioDispatcherFactory.fromPipe(query, 8000, 2048, 0); - d.run(); - double totalDuration = d.secondsProcessed(); + float totalDuration = AudioFileUtils.audioFileDurationInSeconds(new File(query)); //Steps: 0-25s ; 20-45s ; 40-65s ... int actualStep = stepSizeInSeconds - overlapInSeconds;//20s @@ -489,9 +493,8 @@ public void monitor(String query, int maxNumberOfReqults, Set avoid, Qu @Override public boolean hasResource(String resource) { int identifier = FileUtils.getIdentifier(resource); - PanakoStorageKV db; - db = PanakoStorageKV.getInstance(); - + PanakoStorage db = getStorage(); + return db.getMetadata(identifier) != null; } @@ -502,7 +505,7 @@ public boolean isStorageAvailable() { @Override public void printStorageStatistics() { - PanakoStorageKV.getInstance().printStatistics(true); + getStorage().printStatistics(true); } @Override @@ -567,16 +570,11 @@ public void print(String path, boolean sonicVisualizerOutput) { @Override public void clear() { - if(Config.getBoolean(Key.PANAKO_CACHE_TO_FILE)) { - PanakoStorageFile.getInstance().clear(); - } - if (Config.get(Key.PANAKO_STORAGE).equalsIgnoreCase("LMDB")) { - PanakoStorageKV.getInstance().clear(); - } + getStorage().clear(); } public String metadata(String path) { - final PanakoStorage db = PanakoStorageKV.getInstance(); + final PanakoStorage db = getStorage(); long identifier = FileUtils.getIdentifier(path); 
PanakoResourceMetadata metaData = db.getMetadata(identifier); diff --git a/src/main/java/be/panako/strategy/panako/storage/PanakoCachingStorage.java b/src/main/java/be/panako/strategy/panako/storage/PanakoCachingStorage.java new file mode 100644 index 0000000..bc0fc52 --- /dev/null +++ b/src/main/java/be/panako/strategy/panako/storage/PanakoCachingStorage.java @@ -0,0 +1,102 @@ +package be.panako.strategy.panako.storage; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * This is a caching front for a storage engine. + * + * To save time on feature extraction a cache can store the extracted features (the fingerprints) + * and a store often stores an inverted index: the extracted features in an efficient way to query the features. + * + * During a query operation the cache is first checked for a match, only if no match is found a feature extraction + * takes place. + * + * The query operations are only executed on the inverted index since it is optimized for queries, the caching index is not. + */ +public class PanakoCachingStorage implements PanakoStorage{ + + private final PanakoStorage cachingIndex; + private final PanakoStorage invertedIndex; + + /** + * Create a new caching storage front. + * @param cachingIndex The underlying cache (straight caching index). + * @param invertedIndex The actual inverted index storage (key value store).
+ */ + public PanakoCachingStorage(PanakoStorage cachingIndex, PanakoStorage invertedIndex){ + this.cachingIndex = cachingIndex; + this.invertedIndex = invertedIndex; + } + + @Override + public void storeMetadata(long resourceID, String resourcePath, float duration, int fingerprints) { + cachingIndex.storeMetadata(resourceID,resourcePath,duration,fingerprints); + invertedIndex.storeMetadata(resourceID,resourcePath,duration,fingerprints); + } + + @Override + public void addToStoreQueue(long fingerprintHash, int resourceIdentifier, int t1, int f1) { + cachingIndex.addToStoreQueue(fingerprintHash,resourceIdentifier,t1,f1); + invertedIndex.addToStoreQueue(fingerprintHash,resourceIdentifier,t1,f1); + } + + @Override + public void processStoreQueue() { + cachingIndex.processStoreQueue(); + invertedIndex.processStoreQueue(); + } + + @Override + public PanakoResourceMetadata getMetadata(long identifier) { + return invertedIndex.getMetadata(identifier); + } + + @Override + public void printStatistics(boolean detailedStats) { + invertedIndex.printStatistics(detailedStats); + } + + @Override + public void deleteMetadata(long resourceID) { + cachingIndex.deleteMetadata(resourceID); + invertedIndex.deleteMetadata(resourceID); + } + + @Override + public void addToQueryQueue(long queryHash) { + //it does not make sense to use the non optimized caching index for query tasks + invertedIndex.addToQueryQueue(queryHash); + } + + @Override + public void processQueryQueue(Map> matchAccumulator, int range) { + //it does not make sense to use the non optimized caching index for query tasks + processQueryQueue(matchAccumulator, range, new HashSet()); + } + + @Override + public void processQueryQueue(Map> matchAccumulator, int range, Set resourcesToAvoid) { + invertedIndex.processQueryQueue(matchAccumulator,range,resourcesToAvoid); + } + + @Override + public void addToDeleteQueue(long fingerprintHash, int resourceIdentifier, int t1, int f1) { +
cachingIndex.addToDeleteQueue(fingerprintHash,resourceIdentifier,t1,f1); + invertedIndex.addToDeleteQueue(fingerprintHash,resourceIdentifier,t1,f1); + } + + @Override + public void processDeleteQueue() { + cachingIndex.processDeleteQueue(); + invertedIndex.processDeleteQueue(); + } + + @Override + public void clear() { + cachingIndex.clear(); + invertedIndex.clear(); + } +} diff --git a/src/main/java/be/panako/strategy/panako/storage/PanakoStorageFile.java b/src/main/java/be/panako/strategy/panako/storage/PanakoStorageFile.java index 8a8f020..e96a20c 100644 --- a/src/main/java/be/panako/strategy/panako/storage/PanakoStorageFile.java +++ b/src/main/java/be/panako/strategy/panako/storage/PanakoStorageFile.java @@ -41,6 +41,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.logging.Logger; import be.panako.util.Config; import be.panako.util.FileUtils; @@ -50,6 +51,7 @@ * Stores fingerprints in flat files. */ public class PanakoStorageFile implements PanakoStorage { + private final static Logger LOG = Logger.getLogger(PanakoStorageFile.class.getName()); /** * The single instance of the storage. 
@@ -105,7 +107,8 @@ public void storeMetadata(long resourceID,String resourcePath,float duration, in String path = FileUtils.combine(storeDir.getAbsolutePath(),resourceID + "_meta_data.txt"); StringBuilder sb = new StringBuilder(); sb.append(resourceID).append("\n").append(duration).append("\n").append(fingerprints).append("\n").append(resourcePath).append("\n"); - FileUtils.writeFile(sb.toString(), path); + FileUtils.writeFile(sb.toString(), path); + LOG.info(String.format("Stored metadata file '%s'.",path)); } @Override @@ -133,6 +136,7 @@ public void printStatistics(boolean detailedStats) { public void deleteMetadata(long resourceID) { String path = FileUtils.combine(storeDir.getAbsolutePath(),resourceID + "_meta_data.txt"); FileUtils.rm(path); + LOG.info(String.format("Deleted metadata file '%s'.",path)); } @@ -160,24 +164,9 @@ public String storeQueueToString(List queue) { sb.append("\n"); } - // Clears the store queue - queue.clear(); - return sb.toString(); } - private String storeQueueToString( ) { - if(storeQueue.isEmpty()) return null; - long threadID = Thread.currentThread().getId(); - if(!storeQueue.containsKey(threadID)) return null; - - List queue = storeQueue.get(threadID); - - if (queue.isEmpty()) return null; - - return storeQueueToString(queue); - } - public void processStoreQueue() { if(storeQueue.isEmpty()) return; @@ -196,6 +185,11 @@ public void processStoreQueue() { String fingerprintsAsString = storeQueueToString(queue); String path = FileUtils.combine(storeDir.getAbsolutePath(),resourceIdentifier + ".tdb"); FileUtils.writeFile(fingerprintsAsString, path); + + LOG.info(String.format("Stored %d fingerprints in file %s",queue.size(),path)); + + // Clears the store queue + queue.clear(); } /** @@ -236,14 +230,16 @@ public void processDeleteQueue() { public void clear() { FileUtils.rm(storeDir.getAbsolutePath()); - + if(!FileUtils.exists(storeDir.getAbsolutePath())) return; - - for(File f : storeDir.listFiles()) { + File[] filesToDelete = 
storeDir.listFiles(); + for(File f : filesToDelete) { FileUtils.rm(f.getAbsolutePath()); } - System.out.println("Removed cached files from " + storeDir.getAbsolutePath()); + String message = String.format("Removed %d files from file storage",filesToDelete.length); + LOG.info(message); + System.out.println(message); } } diff --git a/src/main/java/be/panako/util/AudioFileUtils.java b/src/main/java/be/panako/util/AudioFileUtils.java new file mode 100644 index 0000000..3db286b --- /dev/null +++ b/src/main/java/be/panako/util/AudioFileUtils.java @@ -0,0 +1,55 @@ +package be.panako.util; + + +import be.tarsos.dsp.AudioDispatcher; +import be.tarsos.dsp.io.jvm.AudioDispatcherFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.logging.Logger; + +public class AudioFileUtils { + private final static Logger LOG = Logger.getLogger(AudioFileUtils.class.getName()); + + private static final String NEWLINE = System.getProperty("line.separator"); + + public static String run(String... 
command) throws IOException, InterruptedException { + ProcessBuilder pb = new ProcessBuilder(command).redirectErrorStream(true); + Process process = pb.start(); + StringBuilder result = new StringBuilder(80); + String line; + try (BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream()))) + { + while ((line = in.readLine()) != null) { + result.append(line).append(NEWLINE); + } + } + process.waitFor(); + + return result.toString(); + } + + public static float audioFileDurationInSeconds(File audioFile){ + String command = Config.get(Key.AUDIO_DURATION_COMMAND); + String path = audioFile.getAbsolutePath(); + command = command.replace("%resource%",path); + + float duration = -1; + try{ + String runtime = Config.get(Key.DECODER_PIPE_ENVIRONMENT); + String envArg = Config.get(Key.DECODER_PIPE_ENVIRONMENT_ARG); + String result = run(runtime,envArg,command); + duration = Float.valueOf(result); + LOG.info(String.format("Executed external command '%s' to find duration of %.3f",command,duration)); + }catch (IOException | InterruptedException | NumberFormatException ex){ + // Fallback: get the total duration, very inefficiently, by decoding the complete audio file + AudioDispatcher d = AudioDispatcherFactory.fromPipe(path, 8000, 2048, 0); + d.run(); + duration = d.secondsProcessed(); + LOG.warning(String.format("External command for audio duration failed.
Decoded audio to find duration of %.3f s for '%s'",duration,path)); + } + return duration; + } +} diff --git a/src/main/java/be/panako/util/FileUtils.java b/src/main/java/be/panako/util/FileUtils.java index 4596e4e..1553b45 100644 --- a/src/main/java/be/panako/util/FileUtils.java +++ b/src/main/java/be/panako/util/FileUtils.java @@ -36,19 +36,7 @@ package be.panako.util; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.PrintWriter; +import java.io.*; import java.net.URL; import java.net.URLConnection; import java.nio.channels.FileChannel; @@ -470,11 +458,79 @@ public static int getIdentifier(String resource) { if (tokens.length == 2 && tokens[0].matches("\\d+")) { identifier = Integer.valueOf(tokens[0]); } else { - int hashCode = Math.abs(resource.hashCode()); + int hash = getFileHash(new File(resource)); + if(hash == 0 ) { + hash = Math.abs(resource.hashCode()); + } + //reserve the lower half of ints for sequential identifiers int minValue = Integer.MAX_VALUE / 2; - identifier = minValue + hashCode / 2; + identifier = minValue + hash / 2; } return identifier; } + /** + * A content based file hash. + * 8 * 8K bytes are read in the middle of the file and a hash is calculated + * with murmurhash3. + * + * The method fails silently and returns zero if the file is not accessible. + * + * @param file The file to calculate a hash for. + * @return The murmurhash3 hash of 8 * 8K bytes in the middle of the file. Zero is returned + * if the file is not accessible. 
+ */ + public static int getFileHash(File file){ + final long blockSizeInBytes = 8 * 1024; + final long numberOfBlocksUsedInHash = 8; + + long fileSizeInBytes = file.length(); + long offsetInBytes = fileSizeInBytes / 2; + long offsetInBlocks = offsetInBytes / blockSizeInBytes; + int numberOfBytesToRead = (int) (blockSizeInBytes*numberOfBlocksUsedInHash); + + int fileHash = 0; + byte[] data = new byte[numberOfBytesToRead]; + //try-with-resources closes the file handle again, also when seek or read fails + try (RandomAccessFile f = new RandomAccessFile(file,"r")) { + //start reading at a block boundary near the middle of the file + f.seek(offsetInBlocks * blockSizeInBytes); + //read returns -1 at the end of the file (e.g. an empty file): hash zero bytes in that case, which results in 0 + int bytesRead = Math.max(0, f.read(data)); + if(bytesRead != numberOfBytesToRead){ + LOG.warning(String.format("Will only use %d bytes for hash, expected %d bytes",bytesRead,numberOfBytesToRead)); + } + fileHash = MurmurHash3.murmurhash3_x86_32(data,0,bytesRead,0); + } catch (FileNotFoundException e) { + LOG.warning(String.format("Could not determine file hash for '%s': %s",file.getAbsolutePath(),e.getMessage())); + } catch (IOException e) { + LOG.warning(String.format("Could not determine file hash for '%s': %s", file.getAbsolutePath(), e.getMessage())); + } + return fileHash; + } + + /** + * Checks the size of a file. + * @param file the file to check. + * @param maxFileSizeInMB the maximum file size in MB. + * @return Returns true if the file is not zero bytes and smaller than the given maximum file size in MB.
+ */ + public static boolean checkFileSize(File file,long maxFileSizeInMB){ + boolean fileOk = false; + //from bytes to megabytes + long fileSizeInBytes = file.length(); + long fileSizeInMB = fileSizeInBytes/(1024*1024); + + //file must not be empty and must be smaller than the configured number of megabytes + if(fileSizeInBytes != 0 && fileSizeInMB < maxFileSizeInMB ){ + fileOk = true; + }else{ + String message = String.format("Could not process %s it has an unacceptable file size of %d MB (zero or larger than %d MB ).", file.getName(), fileSizeInMB, maxFileSizeInMB ); + LOG.warning(message); + System.err.println(message); + } + return fileOk; + + } + } diff --git a/src/main/java/be/panako/util/Key.java b/src/main/java/be/panako/util/Key.java index 288a98a..2977e53 100644 --- a/src/main/java/be/panako/util/Key.java +++ b/src/main/java/be/panako/util/Key.java @@ -94,6 +94,14 @@ public enum Key{ * The log file for the pipe decoder. */ DECODER_PIPE_LOG_FILE("decoder_log.txt"), + + /** + * By default ffprobe is used to determine the duration - in seconds - + * of an audio file. + * + * Alternatively any command which returns the duration in seconds can be used. + */ + AUDIO_DURATION_COMMAND("ffprobe -i \"%resource%\" -v quiet -show_entries format=duration -hide_banner -of default=noprint_wrappers=1:nokey=1"), /** diff --git a/src/main/java/be/panako/util/MurmurHash3.java b/src/main/java/be/panako/util/MurmurHash3.java new file mode 100644 index 0000000..f010e03 --- /dev/null +++ b/src/main/java/be/panako/util/MurmurHash3.java @@ -0,0 +1,301 @@ +package be.panako.util; + +/** + * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. + * This java port was authored by Yonik Seeley and also placed into the public domain. + * The author hereby disclaims copyright to this source code. + *

+ * This produces exactly the same hash values as the final C++ + * version of MurmurHash3 and is thus suitable for producing the same hash values across + * platforms. + *

+ * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids. + * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash. + *

+ * Note - The x86 and x64 versions do _not_ produce the same results, as the + * algorithms are optimized for their respective platforms. + *

+ * See http://github.com/yonik/java_util for future updates to this file. + */ +public final class MurmurHash3 { + + /** 128 bits of state */ + public static final class LongPair { + public long val1; + public long val2; + } + + public static final int fmix32(int h) { + h ^= h >>> 16; + h *= 0x85ebca6b; + h ^= h >>> 13; + h *= 0xc2b2ae35; + h ^= h >>> 16; + return h; + } + + public static final long fmix64(long k) { + k ^= k >>> 33; + k *= 0xff51afd7ed558ccdL; + k ^= k >>> 33; + k *= 0xc4ceb9fe1a85ec53L; + k ^= k >>> 33; + return k; + } + + /** Gets a long from a byte buffer in little endian byte order. */ + public static final long getLongLittleEndian(byte[] buf, int offset) { + return ((long)buf[offset+7] << 56) // no mask needed + | ((buf[offset+6] & 0xffL) << 48) + | ((buf[offset+5] & 0xffL) << 40) + | ((buf[offset+4] & 0xffL) << 32) + | ((buf[offset+3] & 0xffL) << 24) + | ((buf[offset+2] & 0xffL) << 16) + | ((buf[offset+1] & 0xffL) << 8) + | ((buf[offset ] & 0xffL)); // no shift needed + } + + + /** Returns the MurmurHash3_x86_32 hash. 
*/ + @SuppressWarnings("fallthrough") + public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { + + final int c1 = 0xcc9e2d51; + final int c2 = 0x1b873593; + + int h1 = seed; + int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block + + for (int i=offset; i>> 17); // ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + // tail + int k1 = 0; + + switch(len & 0x03) { + case 3: + k1 = (data[roundedEnd + 2] & 0xff) << 16; + // fallthrough + case 2: + k1 |= (data[roundedEnd + 1] & 0xff) << 8; + // fallthrough + case 1: + k1 |= (data[roundedEnd] & 0xff); + k1 *= c1; + k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + } + + // finalization + h1 ^= len; + + // fmix(h1); + h1 ^= h1 >>> 16; + h1 *= 0x85ebca6b; + h1 ^= h1 >>> 13; + h1 *= 0xc2b2ae35; + h1 ^= h1 >>> 16; + + return h1; + } + + + /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding + * the string to a temporary buffer. This is more than 2x faster than hashing the result + * of String.getBytes(). + */ + public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) { + + final int c1 = 0xcc9e2d51; + final int c2 = 0x1b873593; + + int h1 = seed; + + int pos = offset; + int end = offset + len; + int k1 = 0; + int k2 = 0; + int shift = 0; + int bits = 0; + int nBytes = 0; // length in UTF8 bytes + + + while (pos < end) { + int code = data.charAt(pos++); + if (code < 0x80) { + k2 = code; + bits = 8; + + /*** + // optimized ascii implementation (currently slower!!! code size?) 
+ if (shift == 24) { + k1 = k1 | (code << 24); + + k1 *= c1; + k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + + shift = 0; + nBytes += 4; + k1 = 0; + } else { + k1 |= code << shift; + shift += 8; + } + continue; + ***/ + + } + else if (code < 0x800) { + k2 = (0xC0 | (code >> 6)) + | ((0x80 | (code & 0x3F)) << 8); + bits = 16; + } + else if (code < 0xD800 || code > 0xDFFF || pos>=end) { + // we check for pos>=end to encode an unpaired surrogate as 3 bytes. + k2 = (0xE0 | (code >> 12)) + | ((0x80 | ((code >> 6) & 0x3F)) << 8) + | ((0x80 | (code & 0x3F)) << 16); + bits = 24; + } else { + // surrogate pair + // int utf32 = pos < end ? (int) data.charAt(pos++) : 0; + int utf32 = (int) data.charAt(pos++); + utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); + k2 = (0xff & (0xF0 | (utf32 >> 18))) + | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8 + | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16 + | (0x80 | (utf32 & 0x3F)) << 24; + bits = 32; + } + + + k1 |= k2 << shift; + + // int used_bits = 32 - shift; // how many bits of k2 were used in k1. 
+ // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift + + shift += bits; + if (shift >= 32) { + // mix after we have a complete word + + k1 *= c1; + k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + + shift -= 32; + // unfortunately, java won't let you shift 32 bits off, so we need to check for 0 + if (shift != 0) { + k1 = k2 >>> (bits-shift); // bits used == bits - newshift + } else { + k1 = 0; + } + nBytes += 4; + } + + } // inner + + // handle tail + if (shift > 0) { + nBytes += shift >> 3; + k1 *= c1; + k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + } + + // finalization + h1 ^= nBytes; + + // fmix(h1); + h1 ^= h1 >>> 16; + h1 *= 0x85ebca6b; + h1 ^= h1 >>> 13; + h1 *= 0xc2b2ae35; + h1 ^= h1 >>> 16; + + return h1; + } + + + /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */ + @SuppressWarnings("fallthrough") + public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) { + // The original algorithm does have a 32 bit unsigned seed. + // We have to mask to match the behavior of the unsigned types and prevent sign extension. 
+ long h1 = seed & 0x00000000FFFFFFFFL; + long h2 = seed & 0x00000000FFFFFFFFL; + + final long c1 = 0x87c37b91114253d5L; + final long c2 = 0x4cf5ad432745937fL; + + int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block + for (int i=offset; i references; + List queries; + @BeforeEach void setUp() { + references = TestData.referenceFiles(); + queries = TestData.queryFiles(); Config config = Config.getInstance(); String tempStoragePath = FileUtils.combine(FileUtils.temporaryDirectory(),"olaf_test"); config.set(Key.OLAF_LMDB_FOLDER,tempStoragePath); @@ -58,4 +69,25 @@ void storeFingerprint() { s.processQueryQueue(matchAccumulator,2,new HashSet<>()); assertEquals(1 ,matchAccumulator.size(),"Expected only one match"); } + + @Test + void testMatching(){ + Strategy s = new OlafStrategy(); + for(File ref : references){ + s.store(ref.getAbsolutePath(),ref.getName()); + } + + s.query(queries.get(1).getAbsolutePath(), 1, new HashSet<>(), new QueryResultHandler() { + @Override + public void handleQueryResult(QueryResult result) { + assertTrue(result.refIdentifier.equalsIgnoreCase(1051039 + "")); + assertEquals(34,result.refStart,3.5,"Expect start to be close to 34s"); + } + + @Override + public void handleEmptyResult(QueryResult result) { + assertTrue(false); + } + }); + } } \ No newline at end of file diff --git a/src/test/java/be/panako/tests/PanakoStrategyTest.java b/src/test/java/be/panako/tests/PanakoStrategyTest.java index 2aed63c..85ad79e 100644 --- a/src/test/java/be/panako/tests/PanakoStrategyTest.java +++ b/src/test/java/be/panako/tests/PanakoStrategyTest.java @@ -12,132 +12,21 @@ import org.junit.jupiter.api.Test; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.channels.Channels; -import java.nio.channels.FileChannel; -import java.nio.channels.ReadableByteChannel; -import java.util.ArrayList; 
-import java.util.Date; import java.util.HashSet; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; class PanakoStrategyTest { - String DATASET_URL = "https://panako.be/releases/Panako-test-dataset/"; - - private boolean downloadFile(String url,String targetLocation){ - - System.out.println("Downloading " + url + " to " + targetLocation + " ..."); - URL uri = null; - try { - uri = new URL(url); - } catch (MalformedURLException e) { - e.printStackTrace(); - System.err.println("Failed to download " + url); - return false; - } - - try { - ReadableByteChannel readableByteChannel = Channels.newChannel(uri.openStream()); - FileOutputStream fileOutputStream = new FileOutputStream(targetLocation); - FileChannel fileChannel = fileOutputStream.getChannel(); - fileOutputStream.getChannel() - .transferFrom(readableByteChannel, 0, Long.MAX_VALUE); - } catch (FileNotFoundException e) { - System.err.println("Failed to download " + url); - return false; - } catch (IOException e) { - System.err.println("Failed to download " + url); - return false; - } - long fileSize = new File(targetLocation).length()/1024/1024; - System.out.println("Downloaded " + url + " to " + targetLocation + " size " + fileSize + "MB" ); - return true; - } - - private boolean cacheDownloadedFile(String url,String targetLocation){ - if (!FileUtils.exists(targetLocation) || new File(targetLocation).length() < 100) - return downloadFile(url, targetLocation); - else{ - System.out.println("Using cached audio file " + targetLocation); - } - return true; - } - - private List downloadDataset(String filenames[] , String foldername){ - List files = new ArrayList<>(); - for(String f : filenames) { - String path = FileUtils.combine(FileUtils.temporaryDirectory(),f); - - if(cacheDownloadedFile(DATASET_URL + foldername + "/" + f , path)) - System.out.println("Successfully 
download " + f); - else - System.err.println("Failed to download " + f); - - files.add(new File(path)); - } - - return files; - - } - - List referenceFiles(){ - String[] references = {"1051039.mp3" , - "1071559.mp3" , - "1075784.mp3" , - "11266.mp3" , - "147199.mp3" , - "173050.mp3" , - "189211.mp3" , - "297888.mp3" , - "612409.mp3" , - "852601.mp3" ,}; - return downloadDataset(references,"reference"); - } - - List queryFiles(){ - String[] queries = {"1024035_55s-75s.mp3", - "1051039_34s-54s.mp3", - "1071559_60s-80s.mp3", - "1075784_78s-98s.mp3", - "11266_69s-89s.mp3", - "132755_137s-157s.mp3", - "147199_115s-135s.mp3", - "173050_86s-106s.mp3", - "189211_60s-80s.mp3", - "295781_88s-108s.mp3", - "297888_45s-65s.mp3", - "361430_180s-200s.mp3", - "371009_187s-207s.mp3", - "378501_59s-79s.mp3", - "384991_294s-314s.mp3", - "432279_81s-101s.mp3", - "43383_224s-244s.mp3", - "478466_24s-44s.mp3", - "602848_242s-262s.mp3", - "604705_154s-174s.mp3", - "612409_73s-93s.mp3", - "824093_182s-202s.mp3", - "84302_232s-252s.mp3", - "852601_43s-63s.mp3", - "96644_84s-104s.mp3"}; - return downloadDataset(queries,"queries"); - } - List references; List queries; @BeforeEach void setUp() { - references = referenceFiles(); - queries = queryFiles(); + references = TestData.referenceFiles(); + queries = TestData.queryFiles(); Config.set(Key.PANAKO_LMDB_FOLDER,FileUtils.combine(FileUtils.temporaryDirectory(),"panako_test_data")); - } @AfterEach @@ -147,8 +36,6 @@ void tearDown() { @Test void testPanakoStrategy(){ - - Strategy s = new PanakoStrategy(); for(File ref : references){ s.store(ref.getAbsolutePath(),ref.getName()); diff --git a/src/test/java/be/panako/tests/TestData.java b/src/test/java/be/panako/tests/TestData.java new file mode 100644 index 0000000..7a4a714 --- /dev/null +++ b/src/test/java/be/panako/tests/TestData.java @@ -0,0 +1,119 @@ +package be.panako.tests; + +import be.panako.util.FileUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import 
java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.channels.ReadableByteChannel; +import java.util.ArrayList; +import java.util.List; + +public class TestData { + + private static final String DATASET_URL = "https://panako.be/releases/Panako-test-dataset/"; + + public static List referenceFiles(){ + String[] references = {"1051039.mp3" , + "1071559.mp3" , + "1075784.mp3" , + "11266.mp3" , + "147199.mp3" , + "173050.mp3" , + "189211.mp3" , + "297888.mp3" , + "612409.mp3" , + "852601.mp3" ,}; + return downloadDataset(references,"reference"); + } + + public static List queryFiles(){ + String[] queries = {"1024035_55s-75s.mp3", + "1051039_34s-54s.mp3", + "1071559_60s-80s.mp3", + "1075784_78s-98s.mp3", + "11266_69s-89s.mp3", + "132755_137s-157s.mp3", + "147199_115s-135s.mp3", + "173050_86s-106s.mp3", + "189211_60s-80s.mp3", + "295781_88s-108s.mp3", + "297888_45s-65s.mp3", + "361430_180s-200s.mp3", + "371009_187s-207s.mp3", + "378501_59s-79s.mp3", + "384991_294s-314s.mp3", + "432279_81s-101s.mp3", + "43383_224s-244s.mp3", + "478466_24s-44s.mp3", + "602848_242s-262s.mp3", + "604705_154s-174s.mp3", + "612409_73s-93s.mp3", + "824093_182s-202s.mp3", + "84302_232s-252s.mp3", + "852601_43s-63s.mp3", + "96644_84s-104s.mp3"}; + return downloadDataset(queries,"queries"); + } + + private static List downloadDataset(String filenames[] , String foldername){ + List files = new ArrayList<>(); + for(String f : filenames) { + String path = FileUtils.combine(FileUtils.temporaryDirectory(),f); + + if(cacheDownloadedFile(DATASET_URL + foldername + "/" + f , path)) + System.out.println("Successfully download " + f); + else + System.err.println("Failed to download " + f); + + files.add(new File(path)); + } + + return files; + + } + private static boolean cacheDownloadedFile(String url,String targetLocation){ + if 
(!FileUtils.exists(targetLocation) || new File(targetLocation).length() < 100) + return downloadFile(url, targetLocation); + else{ + System.out.println("Using cached audio file " + targetLocation); + } + return true; + } + + private static boolean downloadFile(String url,String targetLocation){ + + System.out.println("Downloading " + url + " to " + targetLocation + " ..."); + URL uri = null; + try { + uri = new URL(url); + } catch (MalformedURLException e) { + e.printStackTrace(); + System.err.println("Failed to download " + url); + return false; + } + + try { + ReadableByteChannel readableByteChannel = Channels.newChannel(uri.openStream()); + FileOutputStream fileOutputStream = new FileOutputStream(targetLocation); + FileChannel fileChannel = fileOutputStream.getChannel(); + fileOutputStream.getChannel() + .transferFrom(readableByteChannel, 0, Long.MAX_VALUE); + } catch (FileNotFoundException e) { + System.err.println("Failed to download " + url); + return false; + } catch (IOException e) { + System.err.println("Failed to download " + url); + return false; + } + long fileSize = new File(targetLocation).length()/1024/1024; + System.out.println("Downloaded " + url + " to " + targetLocation + " size " + fileSize + "MB" ); + return true; + } + +} diff --git a/src/test/java/be/panako/tests/UtilsTest.java b/src/test/java/be/panako/tests/UtilsTest.java new file mode 100644 index 0000000..f63b5f6 --- /dev/null +++ b/src/test/java/be/panako/tests/UtilsTest.java @@ -0,0 +1,45 @@ +package be.panako.tests; + + +import be.panako.util.AudioFileUtils; +import be.panako.util.Config; +import be.panako.util.FileUtils; +import be.panako.util.Key; +import org.junit.jupiter.api.Test; + +import java.io.File; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; + + +public class UtilsTest { + + @Test + void testAudioDuration(){ + File durationTestFile = TestData.queryFiles().get(0); + //extract duration with 
ffprobe + float actualDuration = AudioFileUtils.audioFileDurationInSeconds(durationTestFile); + float expectedDuration = 20; + assertEquals(expectedDuration,actualDuration,0.05,"Expect duration to be close to 20s"); + + //test fallback with nonsense command + Config.set(Key.AUDIO_DURATION_COMMAND,"non_existing_command arg arg2"); + actualDuration = AudioFileUtils.audioFileDurationInSeconds(durationTestFile); + assertEquals(expectedDuration,actualDuration,0.05,"Expect duration to be close to 20s"); + } + + @Test + void testFileHash(){ + File hashTestFile = TestData.queryFiles().get(0); + int expectedHash = 2035021894; + // The expected hash should be the same on each platform + int calculatedHash = FileUtils.getFileHash(hashTestFile); + assertEquals(expectedHash,calculatedHash,"The expected hash should be equal to " + expectedHash + " on each platform"); + // The hash file should differ from other files + File otherTestFile = TestData.queryFiles().get(1); + calculatedHash = FileUtils.getFileHash(otherTestFile); + assertNotEquals(expectedHash,calculatedHash,"Other files should have a hash different from " + expectedHash); + } + +}