From e5a140d538d7f74a54afc7142bf1ee2924ca6be7 Mon Sep 17 00:00:00 2001
From: Alain Riou
Date: Mon, 16 Oct 2023 15:50:16 +0200
Subject: [PATCH 1/4] relative import in utils

---
 pesto/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pesto/utils.py b/pesto/utils.py
index bc6f314..46281eb 100644
--- a/pesto/utils.py
+++ b/pesto/utils.py
@@ -3,9 +3,9 @@
 
 import torch
 
-from pesto.config import model_args, cqt_args, bins_per_semitone
-from pesto.data import DataProcessor
-from pesto.model import PESTOEncoder
+from .config import model_args, cqt_args, bins_per_semitone
+from .data import DataProcessor
+from .model import PESTOEncoder
 
 
 def load_dataprocessor(step_size, device: Optional[torch.device] = None):

From a9dcc805f844017eda67929635493ce131a79c2c Mon Sep 17 00:00:00 2001
From: Alain Riou
Date: Mon, 16 Oct 2023 16:07:42 +0200
Subject: [PATCH 2/4] split cqt into chunks

---
 README.md       |  4 ++--
 pesto/core.py   | 27 +++++++++++++++++----------
 pesto/parser.py |  3 +++
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index fe79b7b..c827b27 100644
--- a/README.md
+++ b/README.md
@@ -119,20 +119,20 @@
 re-initialize the same model for each tensor.
 To avoid this time-consuming step, one can manually instantiate the model and data processor, then pass them directly
 as args to the `predict` function. To do so, one has to use the underlying methods from `pesto.utils`:
+
 ```python
 import torch
 
 from pesto import predict
 from pesto.utils import load_model, load_dataprocessor
 
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model = load_model("mir-1k", device=device)
 data_processor = load_dataprocessor(step_size=0.01, device=device)
 
 for x, sr in ...:
     data_processor.sampling_rate = sr  # The data_processor handles waveform->CQT conversion so it must know the sampling rate
-    predictions = predict(x, sr, model=model, data_processor=data_processor)
+    predictions = predict(x, sr, model=model)
     ...
 ```
 Note that when passing a list of files to `pesto.predict_from_files(...)` or the CLI directly, the model is loaded only
diff --git a/pesto/core.py b/pesto/core.py
index 9521cfc..060a860 100644
--- a/pesto/core.py
+++ b/pesto/core.py
@@ -18,6 +18,7 @@ def predict(
         data_preprocessor=None,
         step_size: Optional[float] = None,
         reduction: str = "argmax",
+        num_chunks: int = 1,
         convert_to_freq: bool = False
 ):
     r"""Main prediction function.
@@ -31,7 +32,10 @@
         data_preprocessor: Module handling the data processing pipeline (waveform to CQT, cropping, etc.)
         step_size (float, optional): step size between each CQT frame in milliseconds.
             If the data_preprocessor is passed, its value will be used instead.
-        reduction (str):
+        reduction (str): reduction method for converting activation probabilities to log-frequencies.
+        num_chunks (int): number of chunks to split the input audios in.
+            Default is 1 (all CQT frames in parallel) but it can be increased to reduce memory usage
+            and prevent out-of-memory errors.
         convert_to_freq (bool): whether predictions should be converted to frequencies or not.
     """
     # convert to mono
@@ -53,7 +57,15 @@
 
     # apply model
     cqt = data_preprocessor(x)
-    activations = model(cqt)
+    try:
+        activations = torch.stack([
+            model(chunk) for chunk in cqt.chunk(chunks=num_chunks)
+        ])
+    except torch.cuda.OutOfMemoryError:
+        raise torch.cuda.OutOfMemoryError("Got an out-of-memory error while performing pitch estimation. "
" + "Please increase the number of chunks with option `-c`/`--chunks` " + "to reduce GPU memory usage.") + if batch_size: total_batch_size, num_predictions = activations.size() activations = activations.view(batch_size, total_batch_size // batch_size, num_predictions) @@ -84,6 +96,7 @@ def predict_from_files( reduction: str = "alwa", export_format: Sequence[str] = ("csv",), no_convert_to_freq: bool = False, + num_chunks: int = 1, gpu: int = -1 ): r""" @@ -130,14 +143,8 @@ def predict_from_files( x = x.to(device) # compute the predictions - predictions = predict( - x, - sr, - model=model, - data_preprocessor=data_preprocessor, - reduction=reduction, - convert_to_freq=not no_convert_to_freq - ) + predictions = predict(x, sr, model=model, data_preprocessor=data_preprocessor, reduction=reduction, + convert_to_freq=not no_convert_to_freq, num_chunks=num_chunks) output_file = file.rsplit('.', 1)[0] + "." + ("semitones" if no_convert_to_freq else "f0") if output is not None: diff --git a/pesto/parser.py b/pesto/parser.py index f4957f9..39ba83d 100644 --- a/pesto/parser.py +++ b/pesto/parser.py @@ -17,6 +17,9 @@ def parse_args(): parser.add_argument('-F', '--no_convert_to_freq', action='store_true', help='if true, does not convert the predicted pitch to frequency domain and ' 'returns predictions as semitones') + parser.add_argument('-c', '--num_chunks', type=int, default=1, + help='number of chunks to split the input data into (default: 1). ' + 'Can be increased to prevent out-of-memory errors.') parser.add_argument('--gpu', type=int, default=-1, help='the index of the GPU to use, -1 for CPU') return parser.parse_args() From 11711876bda6b4478c52db72aae678559d887820 Mon Sep 17 00:00:00 2001 From: Alain Riou Date: Mon, 16 Oct 2023 16:15:01 +0200 Subject: [PATCH 3/4] correct typo --- pesto/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pesto/core.py b/pesto/core.py index 060a860..c55ac4a 100644 --- a/pesto/core.py +++ b/pesto/core.py @@ -58,7 +58,7 @@ def predict( # apply model cqt = data_preprocessor(x) try: - activations = torch.stack([ + activations = torch.cat([ model(chunk) for chunk in cqt.chunk(chunks=num_chunks) ]) except torch.cuda.OutOfMemoryError: From d9d7c64793725e9f79a732ce0becd5078cf990ef Mon Sep 17 00:00:00 2001 From: Alain Riou Date: Mon, 16 Oct 2023 16:46:49 +0200 Subject: [PATCH 4/4] add info about chunks --- README.md | 14 ++++++++++++++ pesto/model.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c827b27..b9cc81c 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,20 @@ Note that the *y*-axis is in log-scale: with a step size of 10ms (the default), PESTO would perform pitch estimation of the file in 13 seconds (~12 times faster than real-time) while CREPE would take 12 minutes! It is therefore more suited to applications that need very fast pitch estimation without relying on GPU resources. +### Inference on GPU + +The underlying PESTO pitch estimator is a standard PyTorch module and can therefore use the GPU, +if available, by setting option `--gpu` to the id of the device you want to use for pitch estimation. + +Under the hood, the input is passed to the model as a single batch of CQT frames, +so pitch is estimated for the whole track in parallel, making inference extremely fast. + +However, when dealing with very large audio files, processing the whole track at once can lead to OOM errors. 
+To circumvent this, one can split the batch of CQT frames into multiple chunks by setting option `-c`/`--num_chunks`.
+Chunks will be processed sequentially, thus reducing memory usage.
+
+As an example, a 48kHz audio file of 1 hour can be processed in only 20 seconds on a single GTX 1080 Ti when split into 10 chunks.
+
 ## Contributing
 
 - Currently, only a single model trained on [MIR-1K](https://zenodo.org/record/3532216#.ZG0kWhlBxhE) is provided.
diff --git a/pesto/model.py b/pesto/model.py
index e66557f..94337ec 100644
--- a/pesto/model.py
+++ b/pesto/model.py
@@ -56,7 +56,7 @@ def __init__(
         if len(n_ch) < 5:
             n_ch.append(1)
 
-        # Layer normalization over frequency and channels (harmonics of HCQT)
+        # Layer normalization over frequency
         self.layernorm = nn.LayerNorm(normalized_shape=[1, n_bins_in])
 
         # Prefiltering
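
For reference, the chunked inference introduced in patches 2 and 3 boils down to the pattern sketched below. This is a standalone illustration with a dummy model and made-up shapes, not code from the repository (the actual implementation lives in `pesto/core.py`): split the batch of CQT frames into chunks, run them through the model one by one, and concatenate the results.

```python
import torch

# Dummy stand-in for the PESTO encoder: any module mapping (n_frames, n_bins) -> (n_frames, n_out).
model = torch.nn.Linear(in_features=128, out_features=384)

cqt = torch.randn(10_000, 128)  # one CQT frame per row, as produced by the data preprocessor
num_chunks = 10                 # larger values lower peak memory at the cost of less parallelism

# Run the chunks through the model sequentially, then concatenate along the frame axis.
# torch.cat (rather than torch.stack, the typo fixed in patch 3) keeps the result identical to model(cqt).
activations = torch.cat([model(chunk) for chunk in cqt.chunk(chunks=num_chunks)])
assert activations.shape == (10_000, 384)
```

From the command line, the same behaviour is exposed through the `-c`/`--num_chunks` flag added in `pesto/parser.py`.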