From e5a140d538d7f74a54afc7142bf1ee2924ca6be7 Mon Sep 17 00:00:00 2001
From: Alain Riou
Date: Mon, 16 Oct 2023 15:50:16 +0200
Subject: [PATCH 1/4] relative import in utils

---
 pesto/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pesto/utils.py b/pesto/utils.py
index bc6f314..46281eb 100644
--- a/pesto/utils.py
+++ b/pesto/utils.py
@@ -3,9 +3,9 @@
 
 import torch
 
-from pesto.config import model_args, cqt_args, bins_per_semitone
-from pesto.data import DataProcessor
-from pesto.model import PESTOEncoder
+from .config import model_args, cqt_args, bins_per_semitone
+from .data import DataProcessor
+from .model import PESTOEncoder
 
 
 def load_dataprocessor(step_size, device: Optional[torch.device] = None):

From a9dcc805f844017eda67929635493ce131a79c2c Mon Sep 17 00:00:00 2001
From: Alain Riou
Date: Mon, 16 Oct 2023 16:07:42 +0200
Subject: [PATCH 2/4] split cqt into chunks

---
 README.md       |  4 ++--
 pesto/core.py   | 27 +++++++++++++++++----------
 pesto/parser.py |  3 +++
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index fe79b7b..c827b27 100644
--- a/README.md
+++ b/README.md
@@ -119,20 +119,20 @@
 re-initialize the same model for each tensor.
 To avoid this time-consuming step, one can manually instantiate the model and data processor, then pass them directly
 as args to the `predict` function. To do so, one has to use the underlying methods from `pesto.utils`:
+
 ```python
 import torch
 
 from pesto import predict
 from pesto.utils import load_model, load_dataprocessor
 
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model = load_model("mir-1k", device=device)
 data_processor = load_dataprocessor(step_size=0.01, device=device)
 
 for x, sr in ...:
     data_processor.sampling_rate = sr  # The data_processor handles waveform->CQT conversion so it must know the sampling rate
-    predictions = predict(x, sr, model=model, data_processor=data_processor)
+    predictions = predict(x, sr, model=model)
     ...
 ```
 Note that when passing a list of files to `pesto.predict_from_files(...)` or the CLI directly, the model is loaded only
diff --git a/pesto/core.py b/pesto/core.py
index 9521cfc..060a860 100644
--- a/pesto/core.py
+++ b/pesto/core.py
@@ -18,6 +18,7 @@ def predict(
         data_preprocessor=None,
         step_size: Optional[float] = None,
         reduction: str = "argmax",
+        num_chunks: int = 1,
         convert_to_freq: bool = False
 ):
     r"""Main prediction function.
@@ -31,7 +32,10 @@
         data_preprocessor: Module handling the data processing pipeline (waveform to CQT, cropping, etc.)
         step_size (float, optional): step size between each CQT frame in milliseconds.
             If the data_preprocessor is passed, its value will be used instead.
-        reduction (str):
+        reduction (str): reduction method for converting activation probabilities to log-frequencies.
+        num_chunks (int): number of chunks to split the input audios in.
+            Default is 1 (all CQT frames in parallel) but it can be increased to reduce memory usage
+            and prevent out-of-memory errors.
         convert_to_freq (bool): whether predictions should be converted to frequencies or not.
     """
     # convert to mono
@@ -53,7 +57,15 @@
 
     # apply model
     cqt = data_preprocessor(x)
-    activations = model(cqt)
+    try:
+        activations = torch.stack([
+            model(chunk) for chunk in cqt.chunk(chunks=num_chunks)
+        ])
+    except torch.cuda.OutOfMemoryError:
+        raise torch.cuda.OutOfMemoryError("Got an out-of-memory error while performing pitch estimation. "
" + "Please increase the number of chunks with option `-c`/`--chunks` " + "to reduce GPU memory usage.") + if batch_size: total_batch_size, num_predictions = activations.size() activations = activations.view(batch_size, total_batch_size // batch_size, num_predictions) @@ -84,6 +96,7 @@ def predict_from_files( reduction: str = "alwa", export_format: Sequence[str] = ("csv",), no_convert_to_freq: bool = False, + num_chunks: int = 1, gpu: int = -1 ): r""" @@ -130,14 +143,8 @@ def predict_from_files( x = x.to(device) # compute the predictions - predictions = predict( - x, - sr, - model=model, - data_preprocessor=data_preprocessor, - reduction=reduction, - convert_to_freq=not no_convert_to_freq - ) + predictions = predict(x, sr, model=model, data_preprocessor=data_preprocessor, reduction=reduction, + convert_to_freq=not no_convert_to_freq, num_chunks=num_chunks) output_file = file.rsplit('.', 1)[0] + "." + ("semitones" if no_convert_to_freq else "f0") if output is not None: diff --git a/pesto/parser.py b/pesto/parser.py index f4957f9..39ba83d 100644 --- a/pesto/parser.py +++ b/pesto/parser.py @@ -17,6 +17,9 @@ def parse_args(): parser.add_argument('-F', '--no_convert_to_freq', action='store_true', help='if true, does not convert the predicted pitch to frequency domain and ' 'returns predictions as semitones') + parser.add_argument('-c', '--num_chunks', type=int, default=1, + help='number of chunks to split the input data into (default: 1). ' + 'Can be increased to prevent out-of-memory errors.') parser.add_argument('--gpu', type=int, default=-1, help='the index of the GPU to use, -1 for CPU') return parser.parse_args() From 11711876bda6b4478c52db72aae678559d887820 Mon Sep 17 00:00:00 2001 From: Alain Riou Date: Mon, 16 Oct 2023 16:15:01 +0200 Subject: [PATCH 3/4] correct typo --- pesto/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pesto/core.py b/pesto/core.py index 060a860..c55ac4a 100644 --- a/pesto/core.py +++ b/pesto/core.py @@ -58,7 +58,7 @@ def predict( # apply model cqt = data_preprocessor(x) try: - activations = torch.stack([ + activations = torch.cat([ model(chunk) for chunk in cqt.chunk(chunks=num_chunks) ]) except torch.cuda.OutOfMemoryError: From d9d7c64793725e9f79a732ce0becd5078cf990ef Mon Sep 17 00:00:00 2001 From: Alain Riou Date: Mon, 16 Oct 2023 16:46:49 +0200 Subject: [PATCH 4/4] add info about chunks --- README.md | 14 ++++++++++++++ pesto/model.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c827b27..b9cc81c 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,20 @@ Note that the *y*-axis is in log-scale: with a step size of 10ms (the default), PESTO would perform pitch estimation of the file in 13 seconds (~12 times faster than real-time) while CREPE would take 12 minutes! It is therefore more suited to applications that need very fast pitch estimation without relying on GPU resources. +### Inference on GPU + +The underlying PESTO pitch estimator is a standard PyTorch module and can therefore use the GPU, +if available, by setting option `--gpu` to the id of the device you want to use for pitch estimation. + +Under the hood, the input is passed to the model as a single batch of CQT frames, +so pitch is estimated for the whole track in parallel, making inference extremely fast. + +However, when dealing with very large audio files, processing the whole track at once can lead to OOM errors. 
+To circumvent this, one can split the batch of CQT frames into multiple chunks by setting option `-c`/`--num_chunks`.
+Chunks will be processed sequentially, thus reducing memory usage.
+
+As an example, a 48kHz audio file of 1 hour can be processed in only 20 seconds on a single GTX 1080 Ti when split into 10 chunks.
+
 ## Contributing
 
 - Currently, only a single model trained on [MIR-1K](https://zenodo.org/record/3532216#.ZG0kWhlBxhE) is provided.
diff --git a/pesto/model.py b/pesto/model.py
index e66557f..94337ec 100644
--- a/pesto/model.py
+++ b/pesto/model.py
@@ -56,7 +56,7 @@ def __init__(
         if len(n_ch) < 5:
             n_ch.append(1)
 
-        # Layer normalization over frequency and channels (harmonics of HCQT)
+        # Layer normalization over frequency
         self.layernorm = nn.LayerNorm(normalized_shape=[1, n_bins_in])
 
         # Prefiltering
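
For reference, the chunked inference introduced in patches 2 and 3 boils down to the pattern sketched below. This is a standalone illustration with a dummy model and made-up shapes, not code from the repository (the actual implementation lives in `pesto/core.py`): split the batch of CQT frames into chunks, run them through the model one by one, and concatenate the results.

```python
import torch

# Dummy stand-in for the PESTO encoder: any module mapping (n_frames, n_bins) -> (n_frames, n_out).
model = torch.nn.Linear(in_features=128, out_features=384)

cqt = torch.randn(10_000, 128)  # one CQT frame per row, as produced by the data preprocessor
num_chunks = 10                 # larger values lower peak memory at the cost of less parallelism

# Run the chunks through the model sequentially, then concatenate along the frame axis.
# torch.cat (rather than torch.stack, the typo fixed in patch 3) keeps the result identical to model(cqt).
activations = torch.cat([model(chunk) for chunk in cqt.chunk(chunks=num_chunks)])
assert activations.shape == (10_000, 384)
```

From the command line, the same behaviour is exposed through the `-c`/`--num_chunks` flag added in `pesto/parser.py`.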