From c1ca3533ee4c566baf135373c0e262b968940436 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Tue, 16 Mar 2021 05:06:10 -0700 Subject: [PATCH 01/35] add some centralized paths support --- Python/evaluation.py | 5 ++--- Python/feature_extraction.py | 13 ++++++------- Python/paths.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 Python/paths.py diff --git a/Python/evaluation.py b/Python/evaluation.py index 9bc138e..0a5e63e 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -11,11 +11,10 @@ import pickle import peakutils import mir_eval +import paths predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -beats_folder_path = '../Audio' -annotations_folder_path = '../Data/salami-data-public/annotations/' f_measure_thresh = 3 # tolerance window in seconds @@ -74,7 +73,7 @@ def post_processing(preds_track): print("Evaluating {}".format(f)) # load annotations - segment_times = get_segment_times(f, annotations_folder_path) + segment_times = get_segment_times(f, paths.annotations_path) # get beat times beat_times = get_beat_times(f, beats_folder_path) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 6c831d6..508b881 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -19,12 +19,11 @@ import librosa import random import pickle +import paths + from utils import * import scipy -audio_folder_path = '../Audio' -beats_folder_path = '../Audio' -annotations_folder_path = '../Data/salami-data-public/annotations/' context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands neg_frames_factor = 5 # how many more negative examples than segment boundaries @@ -317,12 +316,12 @@ def load_raw_features(file): print("Extracting MLS features") train_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, - beats_folder_path, - 
annotations_folder_path) + paths.beats_path, + paths.annotations_path) test_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, - beats_folder_path, - annotations_folder_path) + paths.beats_path, + paths.annotations_path) print("Extracted features for {} training and {} test tracks".format(len(train_features), len(test_features))) diff --git a/Python/paths.py b/Python/paths.py new file mode 100644 index 0000000..61c5a5b --- /dev/null +++ b/Python/paths.py @@ -0,0 +1,18 @@ +# encoding: utf-8 +""" + Define path locations and helpful functions +""" + +import os + +audio_path = '../Audio' +beats_path = '../Audio/beats' +mls_path = '../Audio/features' +annotations_path = '../Data/salami-data-public/annotations/' + +def remove_suffix(filename): + return os.path.splitext(os.path.basename(filename))[0] + +def get_mls_path(audio_filename): + return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') + From ea218dc8775709b940fb330751c310692d0749ce Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 17 Mar 2021 07:35:43 -0700 Subject: [PATCH 02/35] latest --- Python/evaluation.py | 18 ++++++---- Python/feature_extraction.py | 62 +++++++++++++++++++++----------- Python/paths.py | 4 +++ Python/track_segmentation.py | 39 +++++++++++++++----- Python/train_segmentation_cnn.py | 2 +- Python/utils.py | 29 +++++++++++---- Python/visualization.py | 17 +++++++-- 7 files changed, 126 insertions(+), 45 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 0a5e63e..db3c54e 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -37,10 +37,11 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx -def post_processing(preds_track): +def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): """ Post processing of prediction probabilities, applies smoothing window and emphasizes beats by multiplying with running avarage. + Also weights predictions towards beat "1". 
:param preds_track: CNN predictions per beat :return: post-processed predictions @@ -53,6 +54,11 @@ def post_processing(preds_track): preds_track = np.multiply(preds_track, np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + + # emphasize downbeeat + if emphasize_downbeat: + preds_track = np.multiply(preds_track, np.where(beat_numbers == 1, 1, 0.5)) + # unit maximum preds_track /= np.max(preds_track) @@ -76,19 +82,19 @@ def post_processing(preds_track): segment_times = get_segment_times(f, paths.annotations_path) # get beat times - beat_times = get_beat_times(f, beats_folder_path) + beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) # get predictions for current track preds_track = np.squeeze(np.asarray(preds[test_idx == i])) # post processing - preds_track = post_processing(preds_track) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) + preds_track = post_processing(preds_track, beat_numbers) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.2) - pred_times = beat_times[peak_loc] - 1 + pred_times = beat_times[peak_loc] # compute f-measure - f_score, p, r = mir_eval.onset.f_measure(segment_times, pred_times, window=f_measure_thresh) + f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) f_measures.append(f_score) precisions.append(p) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 508b881..146e768 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -48,7 +48,18 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 :return: beat Mel spectrogram (mel_bands x frames) """ - y, sr = librosa.load(os.path.join(audio_folder_path, filename), sr=22050, mono=True) + computed_mls_file = paths.get_mls_path(filename) + + if os.path.exists(computed_mls_file): + return np.load(computed_mls_file) + + + if "/" in filename: + path = filename + 
else: + path = os.path.join(paths.audio_path, filename) + + y, sr = librosa.load(path, sr=22050, mono=True) spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) @@ -72,6 +83,15 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 return beat_melspec +def compute_features(logger, f, i, audio_files): + logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) + + beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) + + beat_mls = compute_beat_mls(f, beat_times) + beat_mls /= np.max(beat_mls) + return beat_mls, beat_times + def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ Extract Mel log spectrogram features from a folder of audio files given pre-analysed @@ -88,31 +108,33 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - for i, f in enumerate(audio_files): - - print("Track {} / {}".format(i, len(audio_files))) - - beat_times = get_beat_times(f, beats_folder) + async_res = [] - beat_mls = compute_beat_mls(f, beat_times) - beat_mls /= np.max(beat_mls) + logger = multiprocessing.log_to_stderr() + logger.setLevel(logging.INFO) - label_vec = np.zeros(beat_mls.shape[1],) - segment_times = get_segment_times(f, annotation_folder) + with multiprocessing.Pool(processes=8) as pool: + #for i, f in enumerate(audio_files): + # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) - if isinstance(segment_times, int): - failed_tracks_idx.append(i) - print("Extraction failed - no annotation found for " + f) - continue + for i, f in enumerate(audio_files): + #beat_mls, beat_times = async_res[i].get() + beat_mls, beat_times = compute_features(logger, f, i , audio_files) + label_vec = np.zeros(beat_mls.shape[1],) + segment_times = get_segment_times(f, paths.annotations_path) - for segment_start in segment_times: + if 
isinstance(segment_times, int): + failed_tracks_idx.append(i) + print("Extraction failed - no annotation found for " + f) + continue - closest_beat = np.argmin(np.abs(beat_times - segment_start)) - if closest_beat < len(label_vec): - label_vec[closest_beat] = 1. + for segment_start in segment_times: + closest_beat = np.argmin(np.abs(beat_times - segment_start)) + if closest_beat < len(label_vec): + label_vec[closest_beat] = 1. - feature_list.append(beat_mls) - labels_list.append(label_vec) + feature_list.append(beat_mls) + labels_list.append(label_vec) return feature_list, labels_list, failed_tracks_idx diff --git a/Python/paths.py b/Python/paths.py index 61c5a5b..af52dac 100644 --- a/Python/paths.py +++ b/Python/paths.py @@ -8,11 +8,15 @@ audio_path = '../Audio' beats_path = '../Audio/beats' mls_path = '../Audio/features' +viz_path = '../Audio/viz' annotations_path = '../Data/salami-data-public/annotations/' def remove_suffix(filename): return os.path.splitext(os.path.basename(filename))[0] +def with_suffix(path, ext): + return remove_suffix(path) + '.' 
+ ext + def get_mls_path(audio_filename): return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 5bdc4c9..f3d277a 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -49,7 +49,8 @@ def extract_features(audio_file, beats_file): """ t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values + beat_times = t[0].values + beat_numbers = t[1].values beat_mls = compute_beat_mls(filename=audio_file, beat_times=beat_times) beat_mls /= np.max(beat_mls) @@ -60,7 +61,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_times + return features, beat_times, beat_numbers def compute_context_windows(features): @@ -96,16 +97,34 @@ def compute_context_windows(features): return data_x -def compute_segments_from_predictions(predictions, beat_times): +def print_predictions(p, beat_times): + for i in range(len(p)): + print("%i:\t%.3f\t%.1f" % (i, p[i], beat_times[i])) + + +def compute_segments_from_predictions(predictions, beat_times, beat_numbers): """ Computes the segment times from a prediction curve and the beat times using peak picking. 
""" predictions = np.squeeze(predictions) - predictions = post_processing(predictions) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.05) + + breakpoint() + print("raw predicitions:") + print_predictions(predictions, beat_times) + + predictions = post_processing(predictions, beat_numbers, emphasize_downbeat=True) + + print("after post-processing:") + print_predictions(predictions, beat_times) + + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) segment_times = beat_times[peak_loc] + print("beat_num\ttime:") + for i in peak_loc: + print("%i\t%.2f" % (i, beat_times[i])) + return segment_times @@ -127,16 +146,20 @@ def compute_segments_from_predictions(predictions, beat_times): if not os.path.isfile(out_dir + file_name + '.beats.txt'): print("Extracting beat times (this might take a while)...") - os.system('DBNBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') + os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - mls_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") predictions = compute_cnn_predictions(mls_features) print("Get segment times") - segment_times = compute_segments_from_predictions(predictions, beat_times) + segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) + + print("\n") + for f in segment_times: + print(f) print("The result has been stored in " + output_file) np.savetxt(output_file, segment_times, fmt='%4.2f', delimiter='\n') diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 1b81912..9d3fd17 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -114,7 +114,7 @@ def train_model(batch_size=128, nb_epoch=100, 
save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - #early_stopping = EarlyStopping(monitor='val_loss', patience=5) + early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, diff --git a/Python/utils.py b/Python/utils.py index a55b571..30a9145 100644 --- a/Python/utils.py +++ b/Python/utils.py @@ -89,22 +89,29 @@ def get_segment_times(audio_file, annotation_folder): # for some tracks, only one annotation is available, take first one as default # if there is no annotation available, store -1 as error code + try: - label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile1_uppercase.txt') + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile3_uppercase.txt') t = pd.read_table(label_file, header=None) except IOError: try: - label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile2_uppercase.txt') + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile1_uppercase.txt') t = pd.read_table(label_file, header=None) except IOError: - return -1 + try: + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile2_uppercase.txt') + t = pd.read_table(label_file, header=None) + except IOError: + return -1 + + if t[1].dtype == 'O': + t = t[~(t[1].str.lower().isin(['silence', 'end']))] segment_times = t.iloc[:, 0].values return segment_times - -def get_beat_times(audio_file, beats_folder): +def get_beat_times(audio_file, beats_folder, include_beat_numbers=False): """ Read beat times from annotation file. 
:param audio_file: path to audio files @@ -114,7 +121,15 @@ def get_beat_times(audio_file, beats_folder): file_name = os.path.splitext(os.path.basename(audio_file))[0] beats_file = os.path.join(beats_folder, file_name + '.beats.txt') + + if not os.path.isfile(beats_file): + print(f"Extracting beat times for {audio_file}") + os.system(f"DBNDownBeatTracker single '{audio_file}' -o '{beats_file}'") + t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values - return beat_times + if include_beat_numbers: + return t[0].values, t[1].values + else: + return t[0].values + diff --git a/Python/visualization.py b/Python/visualization.py index ea540d3..b602dcc 100644 --- a/Python/visualization.py +++ b/Python/visualization.py @@ -8,8 +8,11 @@ import numpy as np from feature_extraction import load_raw_features from evaluation import post_processing +from utils import get_beat_times import matplotlib.pyplot as plt import pickle +import paths +import os def visualize_predictions(): @@ -31,19 +34,27 @@ def visualize_predictions(): for i in range(len(test_labels)): f = test_files[i] - print f + beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) + print(f) idx = np.where(test_idx == i)[0] labels = test_y[idx] preds_track = np.squeeze(np.asarray(preds[idx])) - preds_track = post_processing(preds_track) + processed_preds_track = post_processing(preds_track, beat_numbers) + with_downbeat_preds = post_processing(preds_track, beat_numbers, emphasize_downbeat=True) + preds_track = 0.5 + 0.5 * preds_track + processed_preds_track = 1.0 + 0.5 * processed_preds_track + with_downbeat_preds = 1.5 + 0.5 * with_downbeat_preds labels *= 0.5 plt.plot(labels) plt.plot(preds_track) - plt.show() + plt.plot(processed_preds_track) + plt.plot(with_downbeat_preds) + plt.savefig(os.path.join(paths.viz_path, paths.with_suffix(test_files[i], 'svg')), dpi=400) + plt.clf() def visualize_training_data(): From 37eee1a39e22af8c004915bacce1bdd71975e1ab Mon 
Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:12:55 -0700 Subject: [PATCH 03/35] update data, clear up variable names in track_segmentation --- Data/test_tracks.txt | 110 ++-- Data/train_tracks.txt | 944 ++++++++++++++++++++--------------- Python/track_segmentation.py | 5 +- 3 files changed, 606 insertions(+), 453 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index bdad88d..62f4bd4 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,46 +1,72 @@ -4.m4a +1166.mp3 40.m4a -46.m4a -5.m4a -6.m4a -8.m4a -955.mp3 -956.mp3 -957.mp3 -958.mp3 -959.mp3 -960.mp3 -962.mp3 -963.mp3 +1090.mp3 +584.m4a +346.m4a +1026.mp3 +1142.mp3 +1302.mp3 +1131.mp3 +608.m4a +1274.mp3 +1376.mp3 +670.m4a +1399.mp3 +1319.mp3 +18.m4a +1123.mp3 +342.m4a +10013.mp3 +642.m4a +306.m4a +1488.mp3 +516.m4a +1192.mp3 +10024.mp3 +1357.mp3 +404.m4a +1063.mp3 +1331.mp3 +1356.mp3 +1322.mp3 +1170.mp3 +1440.mp3 +1091.mp3 964.mp3 -965.mp3 -966.mp3 -967.mp3 -968.mp3 +1436.mp3 +1414.mp3 +1474.mp3 +1036.mp3 +1040.mp3 +426.m4a +1087.mp3 +1301.mp3 970.mp3 -971.mp3 -972.mp3 -973.mp3 -974.mp3 -975.mp3 -976.mp3 -978.mp3 -979.mp3 -980.mp3 -981.mp3 -982.mp3 -983.mp3 -984.mp3 -986.mp3 -987.mp3 -988.mp3 -989.mp3 -990.mp3 -991.mp3 +1141.mp3 +1250.mp3 +1483.mp3 992.mp3 -994.mp3 -995.mp3 -996.mp3 -997.mp3 -998.mp3 -999.mp3 +1223.mp3 +1284.mp3 +10012.mp3 +472.m4a +6.m4a +986.mp3 +678.m4a +1227.mp3 +1152.mp3 +5.m4a +1270.mp3 +488.m4a +1311.mp3 +1421.mp3 +1402.mp3 +522.m4a +354.m4a +1276.mp3 +1339.mp3 +1236.mp3 +1445.mp3 +1221.mp3 +1244.mp3 +1080.mp3 diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 95ed51d..e2e2baf 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,449 +1,577 @@ -10.m4a -1000.mp3 -1003.mp3 +1136.mp3 +1343.mp3 +1027.mp3 +971.mp3 +484.m4a +1130.mp3 +10032.mp3 +991.mp3 +616.m4a +1076.mp3 +478.m4a +1300.mp3 +1333.mp3 +1395.mp3 +440.m4a 1004.mp3 -1005.mp3 -1006.mp3 -1007.mp3 -1008.mp3 -1011.mp3 -1012.mp3 -1013.mp3 -1014.mp3 
-1015.mp3 -1018.mp3 -1019.mp3 -1020.mp3 -1021.mp3 -1022.mp3 -1023.mp3 +1372.mp3 +512.m4a +1155.mp3 +1397.mp3 +1485.mp3 1024.mp3 -1026.mp3 -1027.mp3 -1029.mp3 +1093.mp3 +660.m4a +1254.mp3 +1460.mp3 +1149.mp3 +338.m4a +1396.mp3 +52.m4a +987.mp3 +1384.mp3 +1423.mp3 +594.m4a +1107.mp3 +1410.mp3 1030.mp3 -1032.mp3 -1034.mp3 +1403.mp3 +14.m4a +20.m4a +480.m4a +1455.mp3 +37.m4a +995.mp3 +1430.mp3 +1147.mp3 +1392.mp3 +1164.mp3 +1205.mp3 +626.m4a +1182.mp3 +444.m4a +1448.mp3 +4.m4a +1374.mp3 +996.mp3 +1328.mp3 +1365.mp3 +1358.mp3 +989.mp3 +1478.mp3 +1157.mp3 +1144.mp3 +1286.mp3 +384.m4a +1179.mp3 +1404.mp3 +1256.mp3 +974.mp3 +1271.mp3 +498.m4a +1327.mp3 +618.m4a +1354.mp3 +966.mp3 +955.mp3 1035.mp3 -1036.mp3 -1037.mp3 -1038.mp3 -1039.mp3 -1040.mp3 -1042.mp3 -1043.mp3 -1044.mp3 -1045.mp3 1046.mp3 -1047.mp3 +1352.mp3 +10023.mp3 +1224.mp3 +1204.mp3 +1038.mp3 +1059.mp3 +534.m4a +420.m4a +1490.mp3 +474.m4a +1243.mp3 +1086.mp3 +1226.mp3 1048.mp3 -1051.mp3 +1476.mp3 +1214.mp3 +10033.mp3 +1162.mp3 +340.m4a +13.m4a +10025.mp3 +450.m4a +1138.mp3 +1359.mp3 +1219.mp3 +10.m4a +1202.mp3 +965.mp3 +1023.mp3 +1375.mp3 +1140.mp3 +1039.mp3 +1083.mp3 +1092.mp3 1052.mp3 -1053.mp3 -1054.mp3 +1310.mp3 +1462.mp3 +10021.mp3 +1007.mp3 +690.m4a +1242.mp3 +1120.mp3 +1496.mp3 +576.m4a +1167.mp3 +652.m4a 1055.mp3 -1056.mp3 -1058.mp3 -1059.mp3 -1060.mp3 -1061.mp3 -1062.mp3 -1063.mp3 +1419.mp3 +676.m4a +416.m4a +1316.mp3 +1288.mp3 +634.m4a +1299.mp3 +648.m4a +1268.mp3 +1078.mp3 +1459.mp3 +524.m4a +978.mp3 +1114.mp3 +614.m4a +1218.mp3 1064.mp3 -1066.mp3 -1067.mp3 -1068.mp3 -1069.mp3 -1070.mp3 -1071.mp3 -1072.mp3 +1463.mp3 +612.m4a +1122.mp3 +1232.mp3 +1258.mp3 +408.m4a +1408.mp3 +402.m4a +1306.mp3 1074.mp3 -1075.mp3 -1076.mp3 -1077.mp3 -1078.mp3 +983.mp3 +1069.mp3 +8.m4a +1126.mp3 +1335.mp3 +1062.mp3 +10008.mp3 +370.m4a +1272.mp3 +1326.mp3 +1429.mp3 +1124.mp3 +320.m4a +1196.mp3 +1464.mp3 +1350.mp3 +12.m4a +1099.mp3 +1054.mp3 +1435.mp3 +1439.mp3 +372.m4a +1269.mp3 +568.m4a +1422.mp3 +10020.mp3 +10009.mp3 
+307.m4a +1109.mp3 +1206.mp3 +1318.mp3 +350.m4a +1450.mp3 +360.m4a +963.mp3 +476.m4a +1251.mp3 +1132.mp3 +1011.mp3 +1424.mp3 +492.m4a +1005.mp3 +1266.mp3 1079.mp3 -1080.mp3 -1082.mp3 -1083.mp3 +1115.mp3 +1360.mp3 +1175.mp3 +1431.mp3 +1294.mp3 +520.m4a +1245.mp3 +410.m4a +1239.mp3 +468.m4a +16.m4a +1195.mp3 +1151.mp3 +1493.mp3 1084.mp3 -1085.mp3 -1086.mp3 -1087.mp3 -1088.mp3 -1090.mp3 -1091.mp3 -1092.mp3 -1093.mp3 -1095.mp3 -1096.mp3 -1098.mp3 -1099.mp3 -1101.mp3 -1102.mp3 -1103.mp3 +1240.mp3 +1378.mp3 +1037.mp3 +988.mp3 +324.m4a 1104.mp3 -1106.mp3 -1107.mp3 -1108.mp3 -1109.mp3 -1110.mp3 -1111.mp3 +979.mp3 +424.m4a +1467.mp3 +975.mp3 +364.m4a +1171.mp3 +10026.mp3 +1285.mp3 +668.m4a +1189.mp3 +1291.mp3 +596.m4a +1261.mp3 +1072.mp3 +442.m4a +356.m4a +1148.mp3 +956.mp3 +1070.mp3 +482.m4a +396.m4a +1067.mp3 +486.m4a 1112.mp3 -1114.mp3 -1115.mp3 -1116.mp3 -1117.mp3 -1118.mp3 +358.m4a +982.mp3 +1173.mp3 +334.m4a +1262.mp3 +1412.mp3 +1315.mp3 +1309.mp3 +1106.mp3 +1287.mp3 +570.m4a +1389.mp3 +1135.mp3 1119.mp3 -1120.mp3 -1122.mp3 -1123.mp3 -1124.mp3 +1407.mp3 +1075.mp3 +666.m4a +1207.mp3 +1367.mp3 +1362.mp3 +1451.mp3 +998.mp3 +1246.mp3 +1381.mp3 +1101.mp3 +1003.mp3 1125.mp3 -1126.mp3 -1127.mp3 -1128.mp3 -1130.mp3 -1131.mp3 -1132.mp3 -1133.mp3 -1134.mp3 -1135.mp3 -1136.mp3 -1138.mp3 -1139.mp3 -1140.mp3 -1141.mp3 -1142.mp3 -1143.mp3 -1144.mp3 -1146.mp3 -1147.mp3 -1148.mp3 -1149.mp3 -1150.mp3 -1151.mp3 -1152.mp3 +1386.mp3 +536.m4a +1238.mp3 +1095.mp3 +994.mp3 +1088.mp3 +394.m4a +46.m4a 1154.mp3 -1155.mp3 -1156.mp3 -1157.mp3 +1264.mp3 +1077.mp3 +1188.mp3 +1472.mp3 +1134.mp3 +1293.mp3 +1117.mp3 +1053.mp3 +658.m4a +1461.mp3 +422.m4a +1215.mp3 +1045.mp3 +317.m4a 1158.mp3 +1346.mp3 +1194.mp3 +1446.mp3 +10022.mp3 1159.mp3 -1160.mp3 -1162.mp3 -1163.mp3 -1164.mp3 -1165.mp3 -1166.mp3 -1167.mp3 +1368.mp3 +1332.mp3 +1096.mp3 +502.m4a +1394.mp3 1168.mp3 -1170.mp3 -1171.mp3 -1172.mp3 -1173.mp3 +1181.mp3 +610.m4a +392.m4a +322.m4a +1371.mp3 +39.m4a +560.m4a +1180.mp3 +1338.mp3 +1443.mp3 
+1111.mp3 +1432.mp3 +532.m4a +496.m4a +1482.mp3 +981.mp3 +311.m4a +366.m4a +694.m4a +1212.mp3 +1102.mp3 +997.mp3 +646.m4a +1042.mp3 +1060.mp3 1174.mp3 -1175.mp3 +1382.mp3 +959.mp3 +554.m4a +510.m4a +1247.mp3 +1213.mp3 +323.m4a +10017.mp3 +1082.mp3 +1110.mp3 +1307.mp3 +1495.mp3 +1296.mp3 +10016.mp3 +1108.mp3 +1364.mp3 +1470.mp3 +1021.mp3 +1492.mp3 +1484.mp3 +654.m4a +504.m4a +30.m4a +1235.mp3 +10027.mp3 +1211.mp3 1176.mp3 -1178.mp3 -1179.mp3 -1180.mp3 -1181.mp3 -1182.mp3 -1183.mp3 +1015.mp3 +574.m4a +1314.mp3 +1494.mp3 +1405.mp3 +999.mp3 +10014.mp3 +990.mp3 +1071.mp3 1184.mp3 -1186.mp3 -1187.mp3 -1188.mp3 -1189.mp3 -1190.mp3 -1191.mp3 -1192.mp3 -1194.mp3 -1195.mp3 -1196.mp3 -1197.mp3 -1198.mp3 +506.m4a +1336.mp3 1199.mp3 -12.m4a -1200.mp3 -1202.mp3 -1203.mp3 -1204.mp3 -1205.mp3 -1206.mp3 -1207.mp3 -1208.mp3 -1210.mp3 -1211.mp3 -1212.mp3 -1213.mp3 -1214.mp3 -1215.mp3 -1216.mp3 -1218.mp3 -1219.mp3 -1220.mp3 -1221.mp3 1222.mp3 -1223.mp3 -1224.mp3 -1226.mp3 -1227.mp3 -1228.mp3 -1229.mp3 -1230.mp3 -1231.mp3 -1232.mp3 -1234.mp3 -1235.mp3 -1236.mp3 -1237.mp3 -1238.mp3 -1239.mp3 -1240.mp3 -1242.mp3 -1243.mp3 -1244.mp3 -1245.mp3 -1246.mp3 -1247.mp3 -1248.mp3 -1250.mp3 -1251.mp3 +976.mp3 +1128.mp3 +1044.mp3 +1000.mp3 +1051.mp3 +1442.mp3 +24.m4a +1210.mp3 +578.m4a +564.m4a +1032.mp3 +1437.mp3 +10029.mp3 +1406.mp3 +1379.mp3 +1347.mp3 +1456.mp3 +1438.mp3 +508.m4a +1022.mp3 +1308.mp3 +1413.mp3 +1012.mp3 +3.m4a +1127.mp3 1253.mp3 -1254.mp3 -1256.mp3 -1258.mp3 -1259.mp3 +10035.mp3 +1390.mp3 +980.mp3 +1351.mp3 +368.m4a +1317.mp3 +1150.mp3 +550.m4a +967.mp3 +630.m4a +1342.mp3 +968.mp3 1260.mp3 -1261.mp3 -1262.mp3 -1263.mp3 -1264.mp3 -1266.mp3 -1267.mp3 -1268.mp3 -1269.mp3 -1270.mp3 -1271.mp3 -1272.mp3 -1274.mp3 +1383.mp3 +1428.mp3 +590.m4a +1468.mp3 +1133.mp3 +1324.mp3 +1444.mp3 +1118.mp3 +1008.mp3 +10019.mp3 +1420.mp3 +448.m4a +606.m4a +1029.mp3 +10007.mp3 +1160.mp3 +1447.mp3 +548.m4a +1415.mp3 +604.m4a +1220.mp3 1275.mp3 -1276.mp3 -1277.mp3 -1278.mp3 +10034.mp3 +336.m4a +1186.mp3 
+1469.mp3 +1475.mp3 +1454.mp3 +1434.mp3 +1418.mp3 +1014.mp3 +686.m4a +1427.mp3 +10031.mp3 1279.mp3 -1280.mp3 +1006.mp3 1282.mp3 +1325.mp3 +1172.mp3 +1280.mp3 +957.mp3 +632.m4a +1043.mp3 +556.m4a +1387.mp3 +1230.mp3 +10030.mp3 +984.mp3 +1278.mp3 +1400.mp3 +1143.mp3 +10011.mp3 +1103.mp3 +1491.mp3 +662.m4a 1283.mp3 -1284.mp3 -1285.mp3 -1286.mp3 -1287.mp3 -1288.mp3 -1290.mp3 -1291.mp3 +1334.mp3 +1068.mp3 +1228.mp3 +1066.mp3 +696.m4a +1116.mp3 +1056.mp3 +335.m4a +1348.mp3 +674.m4a 1292.mp3 -1293.mp3 -1294.mp3 -1295.mp3 -1296.mp3 -1298.mp3 -1299.mp3 -13.m4a -1300.mp3 -1301.mp3 -1302.mp3 -1303.mp3 +1156.mp3 1304.mp3 -1306.mp3 -1307.mp3 -1308.mp3 -1309.mp3 -1310.mp3 -1311.mp3 -1312.mp3 -1314.mp3 -1315.mp3 -1316.mp3 -1317.mp3 -1318.mp3 -1319.mp3 +1197.mp3 +1013.mp3 +1355.mp3 +1216.mp3 +1380.mp3 +1426.mp3 1320.mp3 -1322.mp3 -1323.mp3 -1324.mp3 -1325.mp3 -1326.mp3 -1327.mp3 -1328.mp3 -1330.mp3 -1331.mp3 -1332.mp3 -1333.mp3 -1334.mp3 -1335.mp3 -1336.mp3 -1338.mp3 -1339.mp3 +352.m4a +1267.mp3 +1085.mp3 +325.m4a +620.m4a +640.m4a +1234.mp3 +1203.mp3 +1163.mp3 +22.m4a +10018.mp3 +1479.mp3 +622.m4a +1487.mp3 +1486.mp3 +344.m4a +1200.mp3 1340.mp3 +1018.mp3 +1388.mp3 +1363.mp3 +1187.mp3 +1139.mp3 +960.mp3 +1229.mp3 +1208.mp3 +1034.mp3 +1178.mp3 +562.m4a 1341.mp3 -1342.mp3 -1343.mp3 -1346.mp3 -1347.mp3 -1348.mp3 +1303.mp3 +1477.mp3 +1058.mp3 +1020.mp3 1349.mp3 -1350.mp3 -1351.mp3 -1352.mp3 -1354.mp3 -1355.mp3 -1356.mp3 -1357.mp3 -1358.mp3 -1359.mp3 -1360.mp3 -1362.mp3 -1363.mp3 -1364.mp3 -1365.mp3 -1366.mp3 -1367.mp3 -1368.mp3 -1370.mp3 -1371.mp3 -1372.mp3 -1373.mp3 -1374.mp3 -1375.mp3 -1376.mp3 -1378.mp3 -1379.mp3 -1380.mp3 -1381.mp3 -1382.mp3 -1383.mp3 -1384.mp3 -1386.mp3 -1387.mp3 -1388.mp3 -1389.mp3 -1390.mp3 -1391.mp3 -1392.mp3 -1394.mp3 -1395.mp3 -1396.mp3 -1397.mp3 -1398.mp3 -1399.mp3 -14.m4a -1400.mp3 -1402.mp3 -1403.mp3 -1404.mp3 -1405.mp3 -1406.mp3 -1407.mp3 -1408.mp3 -1410.mp3 +650.m4a +1190.mp3 +1295.mp3 +962.mp3 +514.m4a +972.mp3 +586.m4a +1312.mp3 +664.m4a 1411.mp3 
-1412.mp3 -1413.mp3 -1414.mp3 -1415.mp3 -1418.mp3 -1419.mp3 -1420.mp3 -1421.mp3 -1422.mp3 -1423.mp3 -1424.mp3 -1426.mp3 -1427.mp3 -1428.mp3 -1429.mp3 -1430.mp3 -1431.mp3 -1432.mp3 -1434.mp3 -1435.mp3 -1436.mp3 -1437.mp3 -1438.mp3 -1439.mp3 -1440.mp3 -1442.mp3 -1443.mp3 -1444.mp3 -1445.mp3 -1446.mp3 -1447.mp3 -1448.mp3 -1450.mp3 -1451.mp3 -1452.mp3 -1453.mp3 -1454.mp3 -1455.mp3 -1456.mp3 +1277.mp3 +1366.mp3 +1231.mp3 +386.m4a 1458.mp3 -1459.mp3 -1460.mp3 -1461.mp3 -1462.mp3 -1463.mp3 -1464.mp3 -1466.mp3 -1467.mp3 -1468.mp3 -1469.mp3 -1470.mp3 -1472.mp3 -1474.mp3 -1475.mp3 -1476.mp3 -1477.mp3 -1478.mp3 -1479.mp3 -1482.mp3 -1483.mp3 -1484.mp3 -1485.mp3 -1486.mp3 -1487.mp3 -1488.mp3 -1490.mp3 -1491.mp3 -1492.mp3 -1493.mp3 -1494.mp3 -1495.mp3 -1496.mp3 +1263.mp3 +602.m4a +382.m4a +1248.mp3 +1146.mp3 +328.m4a +10028.mp3 +1061.mp3 +466.m4a +528.m4a +1452.mp3 1498.mp3 -16.m4a -18.m4a -20.m4a -22.m4a -24.m4a -3.m4a -30.m4a -306.m4a -307.m4a +636.m4a +1398.mp3 +1373.mp3 +1290.mp3 +1183.mp3 +1298.mp3 +1237.mp3 +1323.mp3 +10015.mp3 +1198.mp3 +518.m4a +10010.mp3 +1098.mp3 +1047.mp3 +1165.mp3 +1191.mp3 +348.m4a +1466.mp3 +1019.mp3 +1453.mp3 +428.m4a +624.m4a +1391.mp3 +958.mp3 +973.mp3 +1259.mp3 +1370.mp3 310.m4a -311.m4a -317.m4a -320.m4a -322.m4a -323.m4a -324.m4a -325.m4a -328.m4a -334.m4a -335.m4a -336.m4a -338.m4a -37.m4a +1330.mp3 +692.m4a diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index f3d277a..b13f211 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -81,10 +81,9 @@ def compute_context_windows(features): data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) feature_count = 0 - num_beats = features.shape[1] - - for k in range(padding, num_beats-padding): + num_padded_features = features.shape[1] + for k in range(padding, num_padded_features - padding): if feature_count > n_preallocate: break From 27c885470d961a9866df3d9acaba817ad7e87fbe Mon Sep 17 00:00:00 2001 From: Ben 
Osheroff Date: Thu, 18 Mar 2021 00:13:46 -0700 Subject: [PATCH 04/35] help peak-finding algo along --- Python/track_segmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index b13f211..a7181ae 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -108,7 +108,6 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): """ predictions = np.squeeze(predictions) - breakpoint() print("raw predicitions:") print_predictions(predictions, beat_times) @@ -117,7 +116,8 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): print("after post-processing:") print_predictions(predictions, beat_times) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) + predictions = np.insert(predictions, 0, 0) + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 segment_times = beat_times[peak_loc] print("beat_num\ttime:") From 1856582cc5c00b1dd027739f0d0b596a273a37b5 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:16:30 -0700 Subject: [PATCH 05/35] script to load more files into the dataset --- add_files.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 add_files.sh diff --git a/add_files.sh b/add_files.sh new file mode 100755 index 0000000..1216e78 --- /dev/null +++ b/add_files.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +IN_LIST=/tmp/add_files.present +CURRENT=/tmp/add_files.exist + +cat Data/test_tracks.txt Data/train_tracks.txt | sort > $IN_LIST +(cd ~/src/salami-audio && ls -1 *.{mp3,m4a}) | sort > $CURRENT + +newfiles=`comm -3 $IN_LIST $CURRENT | sort -R` +count=`comm -3 $IN_LIST $CURRENT | wc -l` + +i=0 +for x in $newfiles +do + if [ "$i" -gt "$(($count / 9 - 1))" ] + then + echo "$x" to train_tracks + echo $x >> Data/train_tracks.txt + else + echo "$x" to test_tracks + echo $x >> Data/test_tracks.txt + fi + i=$(($i + 1)) +done From 
835cd91aacdd7c3c3027be7d3e22a3544a66c79a Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:16:50 -0700 Subject: [PATCH 06/35] convenience script to predict uploaded stuffs --- track_segment.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 track_segment.sh diff --git a/track_segment.sh b/track_segment.sh new file mode 100755 index 0000000..9002756 --- /dev/null +++ b/track_segment.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd Python +mkdir -p ~/src/salami-data-public/annotations/$1/parsed +python ./track_segmentation.py ~/src/salami-audio/$1.* ~/src/salami-data-public/annotations/$1/parsed/predicted.txt From 959e5c1045d98a61e515579d2730377f0f203b02 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 07:00:22 -0700 Subject: [PATCH 07/35] latest --- Python/evaluation.py | 23 +++++++++-------------- Python/feature_extraction.py | 16 ++++++++++------ Python/train_segmentation_cnn.py | 2 +- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index db3c54e..8effcfa 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -15,7 +15,6 @@ predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -f_measure_thresh = 3 # tolerance window in seconds def load_data(preds_file, file_lists): @@ -64,9 +63,7 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): return preds_track - -if __name__ == "__main__": - +def run_eval(f_measure_thresh): f_measures = [] precisions = [] recalls = [] @@ -75,9 +72,6 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds = np.reshape(preds, len(preds)) for i, f in enumerate(test_files): - - print("Evaluating {}".format(f)) - # load annotations segment_times = get_segment_times(f, paths.annotations_path) @@ -88,9 +82,9 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds_track = 
np.squeeze(np.asarray(preds[test_idx == i])) # post processing - preds_track = post_processing(preds_track, beat_numbers) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.2) - + preds_track = post_processing(preds_track, beat_numbers, emphasize_downbeat=False) + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 pred_times = beat_times[peak_loc] # compute f-measure @@ -100,14 +94,15 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): precisions.append(p) recalls.append(r) - print("f-Measure: {}, precision: {}, recall: {}".format(f_score, p, r)) + #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) - print(" ") - print("Mean scores across all test tracks:") - print("f-Measure: {}, precision: {}, recall: {}".format(mean_f, mean_p, mean_r)) + print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) +if __name__ == "__main__": + run_eval(0.5) + run_eval(3.0) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 146e768..0bf4178 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -20,6 +20,7 @@ import random import pickle import paths +import multiprocessing, logging from utils import * import scipy @@ -76,8 +77,11 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 beat_melspec = np.max(mel_spec[:, beat_frames[0]:beat_frames[1]], axis=1) for k in range(1, beat_frames.shape[0]-1): - beat_melspec = np.column_stack((beat_melspec, - np.max(mel_spec[:, beat_frames[k]:beat_frames[k+1]], axis=1))) + try: + beat_melspec = np.column_stack((beat_melspec, + np.max(mel_spec[:, beat_frames[k]:beat_frames[k+1]], axis=1))) + except: + breakpoint() beat_melspec = np.column_stack((beat_melspec, 
mel_spec[:, beat_frames.shape[0]])) return beat_melspec @@ -114,12 +118,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - #for i, f in enumerate(audio_files): - # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - #beat_mls, beat_times = async_res[i].get() - beat_mls, beat_times = compute_features(logger, f, i , audio_files) + beat_mls, beat_times = async_res[i].get() + #beat_mls, beat_times = compute_features(logger, f, i , audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 9d3fd17..d87106e 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -139,4 +139,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh if __name__ == "__main__": - train_model() + train_model(nb_epoch=300) From 89cd35c54fe2bc9f4c8fdb70591b0e816fab50e6 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 19 Mar 2021 03:01:08 -0700 Subject: [PATCH 08/35] quiet, librosa --- Python/feature_extraction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 81e4d99..ee87736 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -20,6 +20,7 @@ import random import pickle import paths +import warnings import multiprocessing, logging @@ -61,7 +62,9 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 else: path = os.path.join(paths.audio_path, filename) - y, sr = librosa.load(path, sr=22050, mono=True) + with warnings.catch_warnings(): + 
warnings.simplefilter("ignore") + y, sr = librosa.load(path, sr=22050, mono=True) spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) From 974ddc5c3857c0414ece9e4b118e9b734889031b Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sat, 20 Mar 2021 14:46:25 -0700 Subject: [PATCH 09/35] output best and worst tracks in validation set --- Python/evaluation.py | 21 +++++++++++++++++++++ Python/train_segmentation_cnn.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8effcfa..c5edab4 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -13,6 +13,8 @@ import mir_eval import paths +from operator import itemgetter + predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' @@ -63,6 +65,10 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): return preds_track +def get_sort_key(item): + return item[1] + + def run_eval(f_measure_thresh): f_measures = [] precisions = [] @@ -102,6 +108,21 @@ def run_eval(f_measure_thresh): print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) + combined_tracks = list(zip(test_files, f_measures, precisions, recalls)) + sorted_tracks = sorted(combined_tracks, key=get_sort_key) + print("worst:") + for x in range(3): + track = sorted_tracks[x] + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}".format(*track)) + + print("best:") + for x in range(1,4): + track = sorted_tracks[-x] + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}".format(*track)) + + + + if __name__ == "__main__": run_eval(0.5) run_eval(3.0) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d87106e..58c5947 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -139,4 +139,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', 
weigh if __name__ == "__main__": - train_model(nb_epoch=300) + train_model(nb_epoch=75) From 47fedb0fe9d77d33aec20d13c67dd8263b4cf7c6 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 22 Mar 2021 17:37:49 -0700 Subject: [PATCH 10/35] WIP: sslm --- Python/feature_extraction.py | 141 +++++++++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 22 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index ee87736..2fcc223 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -26,6 +26,9 @@ from utils import * import scipy +import skimage.measure +from scipy.spatial import distance + context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands @@ -33,40 +36,110 @@ pos_frames_oversample = 5 # oversample positive frames because there are too few mid_frames_oversample = 3 # oversample frames between segments label_smearing = 1 # how many frames are positive examples around an annotation +padding_length = int(context_length / 2) + +max_pool = 2 random.seed(1234) # for reproducibility np.random.seed(1234) - -def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): +def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously extracted beat times. 
- :param filename: path to audio file + :param waveform: raw waveform data :param beat_times: list of beat times in seconds :param mel_bands: number of Mel bands :param fft_size: FFT size :param hop_size: hop size for FFT processing - :return: beat Mel spectrogram (mel_bands x frames) + :return: beat sslm """ + S = librosa.feature.melspectrogram(y=waveform, sr=22050, n_fft=fft_size, hop_length=hop_size, n_mels=mel_bands, fmin=80, fmax=16000, win_length=fft_size, window=scipy.signal.hamming) - computed_mls_file = paths.get_mls_path(filename) + S_to_dB = librosa.power_to_db(S,ref=np.max) - if os.path.exists(computed_mls_file): - return np.load(computed_mls_file) + # pad 130 frames (to be 65) with noise at -70dB at the beginning + pad = np.full((S_to_dB.shape[0], context_length * 2), -70) + S_padded = np.concatenate((pad, S_to_dB), axis=1) + # downsample initial spectrogram + x_prime = skimage.measure.block_reduce(S_padded, (1,max_pool), np.max) - if "/" in filename: - path = filename - else: - path = os.path.join(paths.audio_path, filename) + MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') + MFCCs = MFCCs[1:,:] + 1 - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - y, sr = librosa.load(path, sr=22050, mono=True) + # this seems to group two frames together + m = 2 + x = [np.roll(MFCCs,n,axis=1) for n in range(m)] + x_hat = np.concatenate(x, axis=0) - spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + #Cosine distance calculation: D[N/p,L/p] matrix + distances = np.zeros((x_hat.shape[1], context_length)) #D has as dimensions N/p and L/p + for i in range(x_hat.shape[1]): #iteration in columns of x_hat + for l in range(context_length): + if i-(l+1) < 0: + cosine_dist = 1 + elif i-(l+1) < context_length: + cosine_dist = 1 + else: + cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L + distances[i,l] = cosine_dist + + #Threshold epsilon[N/p,L/p] 
calculation + kappa = 0.1 #equalization factor of 10% + epsilon = np.zeros((distances.shape[0], context_length)) #D has as dimensions N/p and L/p + for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat + for l in range(context_length): + epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + + + #Removing initial padding now taking into account the max-poolin factor + distances = distances[context_length:,:] + epsilon = epsilon[context_length:,:] + x_prime = x_prime[:,context_length:] + + + #Self Similarity Lag Matrix + sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + sslm = np.transpose(sslm) + + # the paper further downsamples by 3, but since we're doing beat-frames only might be ok + #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) + #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) + + #Check if SSLM has nans and if it has them, substitute them by 0 + for i in range(sslm.shape[0]): + for j in range(sslm.shape[1]): + if np.isnan(sslm[i,j]): + sslm[i,j] = 0 + + beat_frames = np.round(beat_times * (22050. / hop_size)).astype('int') + beat_sslms = np.zeros((65, 65, beat_frames.shape[0])) + + for k in range(beat_frames.shape[0]): + sslm_frame = beat_frames[k] // max_pool + sslm_frame_min = sslm_frame - context_length // 2 + sslm_frame_max = sslm_frame + context_length // 2 + 1 + breakpoint() + beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] + + breakpoint() + return sslm + +def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + """ + Compute average Mel log spectrogram per beat given previously + extracted beat times. 
+ + :param filename: path to audio file + :param beat_times: list of beat times in seconds + :param mel_bands: number of Mel bands + :param fft_size: FFT size + :param hop_size: hop size for FFT processing + :return: beat Mel spectrogram (mel_bands x frames) + """ + spec = np.abs(librosa.stft(y=features, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) @@ -86,17 +159,41 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 beat_melspec = np.column_stack((beat_melspec, mel_spec[:, beat_frames.shape[0]])) - np.save(computed_mls_file, beat_melspec) - return beat_melspec +def load_waveform(filename): + if "/" in filename: + path = filename + else: + path = os.path.join(paths.audio_path, filename) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + y, sr = librosa.load(path, sr=22050, mono=True) + return y + +def get_cached_features(filename): + computed_mls_file = paths.get_mls_path(filename) + + if os.path.exists(computed_mls_file): + return np.load(computed_mls_file) + else: + return None def compute_features(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - beat_mls = compute_beat_mls(f, beat_times) + cached_features = get_cached_features(f) + + if cached_features is not None: + return cached_features + + waveform = load_waveform(f) + + beat_mls = compute_beat_mls(waveform, beat_times) + compute_sslm(waveform, beat_times) beat_mls /= np.max(beat_mls) return beat_mls, beat_times @@ -122,11 +219,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, 
f, i, audio_files, ))) + #for i, f in enumerate(audio_files): + # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - beat_mls, beat_times = async_res[i].get() + #beat_mls, beat_times = async_res[i].get() + beat_mls, beat_times = compute_features(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -201,7 +299,6 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): feature_count = 0 current_track = 0 - padding_length = int(context_length / 2) for features, labels in zip(feature_list, labels_list): From c38c8fa949d68eaacd95d92f91d15bab5e218680 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 24 Mar 2021 23:01:33 -0700 Subject: [PATCH 11/35] this does something!!!! --- Data/test_tracks.txt | 69 ---- Data/train_tracks.txt | 574 ------------------------------- Python/feature_extraction.py | 107 +++--- Python/train_segmentation_cnn.py | 53 ++- 4 files changed, 106 insertions(+), 697 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 62f4bd4..fb1b766 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,72 +1,3 @@ 1166.mp3 40.m4a 1090.mp3 -584.m4a -346.m4a -1026.mp3 -1142.mp3 -1302.mp3 -1131.mp3 -608.m4a -1274.mp3 -1376.mp3 -670.m4a -1399.mp3 -1319.mp3 -18.m4a -1123.mp3 -342.m4a -10013.mp3 -642.m4a -306.m4a -1488.mp3 -516.m4a -1192.mp3 -10024.mp3 -1357.mp3 -404.m4a -1063.mp3 -1331.mp3 -1356.mp3 -1322.mp3 -1170.mp3 -1440.mp3 -1091.mp3 -964.mp3 -1436.mp3 -1414.mp3 -1474.mp3 -1036.mp3 -1040.mp3 -426.m4a -1087.mp3 -1301.mp3 -970.mp3 -1141.mp3 -1250.mp3 -1483.mp3 -992.mp3 -1223.mp3 -1284.mp3 -10012.mp3 -472.m4a -6.m4a -986.mp3 -678.m4a -1227.mp3 -1152.mp3 -5.m4a -1270.mp3 -488.m4a -1311.mp3 -1421.mp3 -1402.mp3 -522.m4a -354.m4a -1276.mp3 -1339.mp3 -1236.mp3 -1445.mp3 -1221.mp3 -1244.mp3 -1080.mp3 diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 
e2e2baf..d45159d 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,577 +1,3 @@ 1136.mp3 1343.mp3 1027.mp3 -971.mp3 -484.m4a -1130.mp3 -10032.mp3 -991.mp3 -616.m4a -1076.mp3 -478.m4a -1300.mp3 -1333.mp3 -1395.mp3 -440.m4a -1004.mp3 -1372.mp3 -512.m4a -1155.mp3 -1397.mp3 -1485.mp3 -1024.mp3 -1093.mp3 -660.m4a -1254.mp3 -1460.mp3 -1149.mp3 -338.m4a -1396.mp3 -52.m4a -987.mp3 -1384.mp3 -1423.mp3 -594.m4a -1107.mp3 -1410.mp3 -1030.mp3 -1403.mp3 -14.m4a -20.m4a -480.m4a -1455.mp3 -37.m4a -995.mp3 -1430.mp3 -1147.mp3 -1392.mp3 -1164.mp3 -1205.mp3 -626.m4a -1182.mp3 -444.m4a -1448.mp3 -4.m4a -1374.mp3 -996.mp3 -1328.mp3 -1365.mp3 -1358.mp3 -989.mp3 -1478.mp3 -1157.mp3 -1144.mp3 -1286.mp3 -384.m4a -1179.mp3 -1404.mp3 -1256.mp3 -974.mp3 -1271.mp3 -498.m4a -1327.mp3 -618.m4a -1354.mp3 -966.mp3 -955.mp3 -1035.mp3 -1046.mp3 -1352.mp3 -10023.mp3 -1224.mp3 -1204.mp3 -1038.mp3 -1059.mp3 -534.m4a -420.m4a -1490.mp3 -474.m4a -1243.mp3 -1086.mp3 -1226.mp3 -1048.mp3 -1476.mp3 -1214.mp3 -10033.mp3 -1162.mp3 -340.m4a -13.m4a -10025.mp3 -450.m4a -1138.mp3 -1359.mp3 -1219.mp3 -10.m4a -1202.mp3 -965.mp3 -1023.mp3 -1375.mp3 -1140.mp3 -1039.mp3 -1083.mp3 -1092.mp3 -1052.mp3 -1310.mp3 -1462.mp3 -10021.mp3 -1007.mp3 -690.m4a -1242.mp3 -1120.mp3 -1496.mp3 -576.m4a -1167.mp3 -652.m4a -1055.mp3 -1419.mp3 -676.m4a -416.m4a -1316.mp3 -1288.mp3 -634.m4a -1299.mp3 -648.m4a -1268.mp3 -1078.mp3 -1459.mp3 -524.m4a -978.mp3 -1114.mp3 -614.m4a -1218.mp3 -1064.mp3 -1463.mp3 -612.m4a -1122.mp3 -1232.mp3 -1258.mp3 -408.m4a -1408.mp3 -402.m4a -1306.mp3 -1074.mp3 -983.mp3 -1069.mp3 -8.m4a -1126.mp3 -1335.mp3 -1062.mp3 -10008.mp3 -370.m4a -1272.mp3 -1326.mp3 -1429.mp3 -1124.mp3 -320.m4a -1196.mp3 -1464.mp3 -1350.mp3 -12.m4a -1099.mp3 -1054.mp3 -1435.mp3 -1439.mp3 -372.m4a -1269.mp3 -568.m4a -1422.mp3 -10020.mp3 -10009.mp3 -307.m4a -1109.mp3 -1206.mp3 -1318.mp3 -350.m4a -1450.mp3 -360.m4a -963.mp3 -476.m4a -1251.mp3 -1132.mp3 -1011.mp3 -1424.mp3 -492.m4a -1005.mp3 -1266.mp3 -1079.mp3 -1115.mp3 
-1360.mp3 -1175.mp3 -1431.mp3 -1294.mp3 -520.m4a -1245.mp3 -410.m4a -1239.mp3 -468.m4a -16.m4a -1195.mp3 -1151.mp3 -1493.mp3 -1084.mp3 -1240.mp3 -1378.mp3 -1037.mp3 -988.mp3 -324.m4a -1104.mp3 -979.mp3 -424.m4a -1467.mp3 -975.mp3 -364.m4a -1171.mp3 -10026.mp3 -1285.mp3 -668.m4a -1189.mp3 -1291.mp3 -596.m4a -1261.mp3 -1072.mp3 -442.m4a -356.m4a -1148.mp3 -956.mp3 -1070.mp3 -482.m4a -396.m4a -1067.mp3 -486.m4a -1112.mp3 -358.m4a -982.mp3 -1173.mp3 -334.m4a -1262.mp3 -1412.mp3 -1315.mp3 -1309.mp3 -1106.mp3 -1287.mp3 -570.m4a -1389.mp3 -1135.mp3 -1119.mp3 -1407.mp3 -1075.mp3 -666.m4a -1207.mp3 -1367.mp3 -1362.mp3 -1451.mp3 -998.mp3 -1246.mp3 -1381.mp3 -1101.mp3 -1003.mp3 -1125.mp3 -1386.mp3 -536.m4a -1238.mp3 -1095.mp3 -994.mp3 -1088.mp3 -394.m4a -46.m4a -1154.mp3 -1264.mp3 -1077.mp3 -1188.mp3 -1472.mp3 -1134.mp3 -1293.mp3 -1117.mp3 -1053.mp3 -658.m4a -1461.mp3 -422.m4a -1215.mp3 -1045.mp3 -317.m4a -1158.mp3 -1346.mp3 -1194.mp3 -1446.mp3 -10022.mp3 -1159.mp3 -1368.mp3 -1332.mp3 -1096.mp3 -502.m4a -1394.mp3 -1168.mp3 -1181.mp3 -610.m4a -392.m4a -322.m4a -1371.mp3 -39.m4a -560.m4a -1180.mp3 -1338.mp3 -1443.mp3 -1111.mp3 -1432.mp3 -532.m4a -496.m4a -1482.mp3 -981.mp3 -311.m4a -366.m4a -694.m4a -1212.mp3 -1102.mp3 -997.mp3 -646.m4a -1042.mp3 -1060.mp3 -1174.mp3 -1382.mp3 -959.mp3 -554.m4a -510.m4a -1247.mp3 -1213.mp3 -323.m4a -10017.mp3 -1082.mp3 -1110.mp3 -1307.mp3 -1495.mp3 -1296.mp3 -10016.mp3 -1108.mp3 -1364.mp3 -1470.mp3 -1021.mp3 -1492.mp3 -1484.mp3 -654.m4a -504.m4a -30.m4a -1235.mp3 -10027.mp3 -1211.mp3 -1176.mp3 -1015.mp3 -574.m4a -1314.mp3 -1494.mp3 -1405.mp3 -999.mp3 -10014.mp3 -990.mp3 -1071.mp3 -1184.mp3 -506.m4a -1336.mp3 -1199.mp3 -1222.mp3 -976.mp3 -1128.mp3 -1044.mp3 -1000.mp3 -1051.mp3 -1442.mp3 -24.m4a -1210.mp3 -578.m4a -564.m4a -1032.mp3 -1437.mp3 -10029.mp3 -1406.mp3 -1379.mp3 -1347.mp3 -1456.mp3 -1438.mp3 -508.m4a -1022.mp3 -1308.mp3 -1413.mp3 -1012.mp3 -3.m4a -1127.mp3 -1253.mp3 -10035.mp3 -1390.mp3 -980.mp3 -1351.mp3 -368.m4a -1317.mp3 -1150.mp3 
-550.m4a -967.mp3 -630.m4a -1342.mp3 -968.mp3 -1260.mp3 -1383.mp3 -1428.mp3 -590.m4a -1468.mp3 -1133.mp3 -1324.mp3 -1444.mp3 -1118.mp3 -1008.mp3 -10019.mp3 -1420.mp3 -448.m4a -606.m4a -1029.mp3 -10007.mp3 -1160.mp3 -1447.mp3 -548.m4a -1415.mp3 -604.m4a -1220.mp3 -1275.mp3 -10034.mp3 -336.m4a -1186.mp3 -1469.mp3 -1475.mp3 -1454.mp3 -1434.mp3 -1418.mp3 -1014.mp3 -686.m4a -1427.mp3 -10031.mp3 -1279.mp3 -1006.mp3 -1282.mp3 -1325.mp3 -1172.mp3 -1280.mp3 -957.mp3 -632.m4a -1043.mp3 -556.m4a -1387.mp3 -1230.mp3 -10030.mp3 -984.mp3 -1278.mp3 -1400.mp3 -1143.mp3 -10011.mp3 -1103.mp3 -1491.mp3 -662.m4a -1283.mp3 -1334.mp3 -1068.mp3 -1228.mp3 -1066.mp3 -696.m4a -1116.mp3 -1056.mp3 -335.m4a -1348.mp3 -674.m4a -1292.mp3 -1156.mp3 -1304.mp3 -1197.mp3 -1013.mp3 -1355.mp3 -1216.mp3 -1380.mp3 -1426.mp3 -1320.mp3 -352.m4a -1267.mp3 -1085.mp3 -325.m4a -620.m4a -640.m4a -1234.mp3 -1203.mp3 -1163.mp3 -22.m4a -10018.mp3 -1479.mp3 -622.m4a -1487.mp3 -1486.mp3 -344.m4a -1200.mp3 -1340.mp3 -1018.mp3 -1388.mp3 -1363.mp3 -1187.mp3 -1139.mp3 -960.mp3 -1229.mp3 -1208.mp3 -1034.mp3 -1178.mp3 -562.m4a -1341.mp3 -1303.mp3 -1477.mp3 -1058.mp3 -1020.mp3 -1349.mp3 -650.m4a -1190.mp3 -1295.mp3 -962.mp3 -514.m4a -972.mp3 -586.m4a -1312.mp3 -664.m4a -1411.mp3 -1277.mp3 -1366.mp3 -1231.mp3 -386.m4a -1458.mp3 -1263.mp3 -602.m4a -382.m4a -1248.mp3 -1146.mp3 -328.m4a -10028.mp3 -1061.mp3 -466.m4a -528.m4a -1452.mp3 -1498.mp3 -636.m4a -1398.mp3 -1373.mp3 -1290.mp3 -1183.mp3 -1298.mp3 -1237.mp3 -1323.mp3 -10015.mp3 -1198.mp3 -518.m4a -10010.mp3 -1098.mp3 -1047.mp3 -1165.mp3 -1191.mp3 -348.m4a -1466.mp3 -1019.mp3 -1453.mp3 -428.m4a -624.m4a -1391.mp3 -958.mp3 -973.mp3 -1259.mp3 -1370.mp3 -310.m4a -1330.mp3 -692.m4a diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 2fcc223..df62fc0 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -21,6 +21,7 @@ import pickle import paths import warnings +import time import multiprocessing, logging @@ -55,16 +56,23 @@ 
def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h :param hop_size: hop size for FFT processing :return: beat sslm """ - S = librosa.feature.melspectrogram(y=waveform, sr=22050, n_fft=fft_size, hop_length=hop_size, n_mels=mel_bands, fmin=80, fmax=16000, win_length=fft_size, window=scipy.signal.hamming) + spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + window=scipy.signal.hamming)) + + mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) + s = np.sum(mel_fb, axis=1) + mel_fb = np.divide(mel_fb, s[:, np.newaxis]) + + mel_spec = np.dot(mel_fb, spec) - S_to_dB = librosa.power_to_db(S,ref=np.max) + S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) # pad 130 frames (to be 65) with noise at -70dB at the beginning - pad = np.full((S_to_dB.shape[0], context_length * 2), -70) - S_padded = np.concatenate((pad, S_to_dB), axis=1) + #pad = np.full((S_to_dB.shape[0], context_length * 2), -70) + #S_padded = np.concatenate((pad, S_to_dB), axis=1) + - # downsample initial spectrogram - x_prime = skimage.measure.block_reduce(S_padded, (1,max_pool), np.max) + x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 @@ -74,34 +82,33 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) + # create circular foo + x_hat_length = x_hat.shape[1] + x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) + print("pre-padded: {}, post-padded: {}".format(x_hat.shape, x_padded.shape)) + #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.zeros((x_hat.shape[1], context_length)) #D has as dimensions N/p and L/p - for i in range(x_hat.shape[1]): #iteration in columns of x_hat 
+ distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p + for i in range(context_length, x_padded.shape[1] - context_length): #iteration in columns of x_hat for l in range(context_length): - if i-(l+1) < 0: - cosine_dist = 1 - elif i-(l+1) < context_length: - cosine_dist = 1 - else: - cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L + cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - epsilon = np.zeros((distances.shape[0], context_length)) #D has as dimensions N/p and L/p + t1 = time.time() + epsilon = np.full((distances.shape[0], context_length), 1.0) for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + t2 = time.time() - #Removing initial padding now taking into account the max-poolin factor - distances = distances[context_length:,:] - epsilon = epsilon[context_length:,:] - x_prime = x_prime[:,context_length:] - + print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + #sslm = scipy.special.expit(1-distances) sslm = np.transpose(sslm) # the paper further downsamples by 3, but since we're doing beat-frames only might be ok @@ -109,23 +116,21 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) #Check if SSLM has nans and if it has them, substitute them by 0 - for i in range(sslm.shape[0]): - for j in range(sslm.shape[1]): - if np.isnan(sslm[i,j]): - sslm[i,j] = 0 + #for i in range(sslm.shape[0]): + # for j in range(sslm.shape[1]): + # if np.isnan(sslm[i,j]): + # sslm[i,j] = 0 beat_frames = 
np.round(beat_times * (22050. / hop_size)).astype('int') - beat_sslms = np.zeros((65, 65, beat_frames.shape[0])) + beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) for k in range(beat_frames.shape[0]): - sslm_frame = beat_frames[k] // max_pool + sslm_frame = beat_frames[k] // max_pool + context_length sslm_frame_min = sslm_frame - context_length // 2 sslm_frame_max = sslm_frame + context_length // 2 + 1 - breakpoint() beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] - breakpoint() - return sslm + return beat_sslms def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ @@ -193,9 +198,9 @@ def compute_features(logger, f, i, audio_files): waveform = load_waveform(f) beat_mls = compute_beat_mls(waveform, beat_times) - compute_sslm(waveform, beat_times) + beat_sslm = compute_sslm(waveform, beat_times) beat_mls /= np.max(beat_mls) - return beat_mls, beat_times + return beat_mls, beat_sslm, beat_times def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ @@ -210,6 +215,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ feature_list = [] + sslm_feature_list = [] labels_list = [] failed_tracks_idx = [] @@ -219,12 +225,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - #for i, f in enumerate(audio_files): - # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - #beat_mls, beat_times = async_res[i].get() - beat_mls, beat_times = compute_features(logger, f, i, audio_files) + beat_mls, beat_sslm, beat_times = async_res[i].get() + #beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) label_vec = 
np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -239,9 +245,10 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): label_vec[closest_beat] = 1. feature_list.append(beat_mls) + sslm_feature_list.append(beat_sslm) labels_list.append(label_vec) - return feature_list, labels_list, failed_tracks_idx + return feature_list, sslm_feature_list, labels_list, failed_tracks_idx def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample=10000): @@ -277,7 +284,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample return features, mean_vec, std_vec -def prepare_batch_data(feature_list, labels_list, is_training=True): +def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training=True): """ Reads precomputed beat Mel spectrograms and slices them into context windows for CNN training. For the training set, subsampling is @@ -293,6 +300,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -300,7 +308,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): feature_count = 0 current_track = 0 - for features, labels in zip(feature_list, labels_list): + for features, sslm_features, labels in zip(feature_list, sslm_feature_list, labels_list): print("Processed {} examples from {} tracks".format(feature_count, current_track+1)) @@ -325,6 +333,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1 data_x[feature_count, :, :] = next_window + 
data_sslm_x[feature_count] = sslm_features[:, :, k] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -342,6 +351,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) data_x[feature_count, :, :] = next_window + data_sslm_x[feature_count] = sslm_features[:, :, l] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -361,6 +371,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_window = features[:, l-padding_length: l+padding_length+1] + data_sslm_x[feature_count] = sslm_features[:, :, l] data_x[feature_count, :, :] = next_window data_y[feature_count] = 0 data_weight[feature_count] = 1 @@ -384,6 +395,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1 data_x[feature_count, :, :] = next_window + data_sslm_x[feature_count] = sslm_features[:, :, next_idx - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -399,6 +411,8 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): data_x[feature_count, :, :] = next_window data_y[feature_count] = next_label + data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] + data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -410,11 +424,12 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): break data_x = data_x[:feature_count, :, :] + data_sslm_x = data_sslm_x[:feature_count, :, :] data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] - return data_x, data_y, data_weight, track_idx + return data_x, data_sslm_x, data_y, data_weight, track_idx def load_raw_features(file): @@ -441,11 +456,11 @@ def load_raw_features(file): print("Extracting 
MLS features") - train_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, + train_features, train_sslm_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, paths.beats_path, paths.annotations_path) - test_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, + test_features, test_sslm_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, paths.beats_path, paths.annotations_path) @@ -459,12 +474,12 @@ def load_raw_features(file): del test_files[i] with open('../Data/rawFeatures.pickle', 'wb') as f: - pickle.dump((train_features, train_labels, test_features, test_labels), f) + pickle.dump((train_features, train_sslm_features, train_labels, test_features, test_sslm_features, test_labels), f) # train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - train_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_labels, is_training=True) - test_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_labels, is_training=False) + train_x, train_sslm_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_labels, is_training=True) + test_x, test_sslm_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_labels, is_training=False) train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) @@ -472,8 +487,8 @@ def load_raw_features(file): print("Prepared {} training items and {} test items".format(train_x.shape[0], test_x.shape[0])) # store normalized features for CNN training - np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_y=train_y, train_weights=train_weights) - np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_y=test_y, test_weights=test_weights) + 
np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_y=train_y, train_weights=train_weights) + np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_y=test_y, test_weights=test_weights) np.savez('../Data/normalization.npz', mean_vec=mean_vec, std_vec=std_vec) # store file lists and index mapping to training and test data diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 58c5947..c089c2d 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -12,6 +12,14 @@ from keras.models import Sequential from keras.layers.core import Dense, Dropout, Activation, Flatten from keras.layers.convolutional import Convolution2D, MaxPooling2D +import tensorflow.keras.layers +from tensorflow.keras.models import Model + + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + from keras.callbacks import EarlyStopping from keras.optimizers import SGD @@ -32,10 +40,11 @@ def load_training_data(dataset): data = np.load(dataset) train_x = data['train_x'] + train_sslm_x = data['train_sslm_x'] train_y = data['train_y'] train_weights = data['train_weights'] - return train_x, train_y, train_weights + return train_x, train_sslm_x, train_y, train_weights def load_test_data(dataset): @@ -52,10 +61,11 @@ def load_test_data(dataset): data = np.load(dataset) test_x = data['test_x'] + test_sslm_x = data['test_sslm_x'] test_y = data['test_y'] test_weights = data['test_weights'] - return test_x, test_y, test_weights + return test_x, test_sslm_x, test_y, test_weights def build_model(img_rows, img_cols): @@ -78,6 +88,24 @@ def build_model(img_rows, img_cols): return model +def build_sslm_model(img_rows, img_cols): + + input = layers.Input(shape=(img_rows, img_cols, 1)) + x = layers.Conv2D(16, (8, 8))(input) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D(pool_size=(6, 6))(x) + x = layers.Conv2D(64, (4, 4))(x) + x 
= layers.Activation('relu')(x) + x = layers.Dropout(0.5)(x) + x = layers.Flatten()(x) + x = layers.Dense(256)(x) + x = layers.Activation('relu')(x) + x = layers.Dropout(0.5)(x) + x = layers.Dense(1)(x) + x = layers.Activation('sigmoid')(x) + return Model(inputs = [input], outputs = x) + + def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -90,23 +118,27 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh """ print('loading training data...') - X_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') + X_train, x_sslm_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') print('training data size:') print(X_train.shape) p = np.random.permutation(X_train.shape[0]) X_train = X_train[p, :, :] + x_sslm_train = x_sslm_train[p, :, :] y_train = y_train[p] w_train = w_train[p] X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) + x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] - model = build_model(img_rows, img_cols) + #model = build_model(img_rows, img_cols) + breakpoint() + model = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) if weights_file is not None: model.load_weights(weights_file) @@ -117,21 +149,26 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') - model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x_sslm_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') - X_test, y_test, w_test = 
load_test_data('../Data/testDataNormalized.npz') + X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) + x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict(X_test, batch_size=1, verbose=1) + preds = model.predict(x_sslm_test, batch_size=1, verbose=1) + #preds = model.predict(X_test, batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate(X_test, y_test, verbose=1) + score = model.evaluate(x_sslm_test, y_test, verbose=1) + #score = model.evaluate(X_test, y_test, verbose=1) print('Test score:', score) # save model From a1c271d13d3e5fb693254b7b214112afdf1bfba2 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 05:00:39 -0700 Subject: [PATCH 12/35] train both models --- Python/train_segmentation_cnn.py | 55 +++++++++++--------------------- 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index c089c2d..d9f3277 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -69,42 +69,25 @@ def load_test_data(dataset): def build_model(img_rows, img_cols): - - model = Sequential() - - model.add(Convolution2D(32, (6, 8), input_shape=(img_rows, img_cols, 1))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=(5, 2))) - model.add(Convolution2D(64, (4, 6))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=(2, 2))) - model.add(Dropout(0.5)) - model.add(Flatten()) - model.add(Dense(256)) - model.add(Activation('relu')) - model.add(Dropout(0.5)) - model.add(Dense(1)) - model.add(Activation('sigmoid')) - - return model + input = layers.Input(shape=(img_rows, img_cols, 1)) + x = layers.Conv2D(16, (6, 8), activation='relu')(input) + x = layers.MaxPooling2D(pool_size=(3, 6))(x) + return input, x 
def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) - x = layers.Conv2D(16, (8, 8))(input) - x = layers.Activation('relu')(x) + x = layers.Conv2D(16, (8, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(6, 6))(x) - x = layers.Conv2D(64, (4, 4))(x) - x = layers.Activation('relu')(x) - x = layers.Dropout(0.5)(x) + return input, x + +def build_fused_model(inputs, outputs): + x = layers.Concatenate(axis=1)(outputs) + x = layers.Conv2D(32, (6, 3), activation='relu')(x) x = layers.Flatten()(x) - x = layers.Dense(256)(x) - x = layers.Activation('relu')(x) + x = layers.Dense(256, activation='relu')(x) x = layers.Dropout(0.5)(x) - x = layers.Dense(1)(x) - x = layers.Activation('sigmoid')(x) - return Model(inputs = [input], outputs = x) - + x = layers.Dense(1, activation='sigmoid')(x) + return Model(inputs = inputs, outputs = x) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): @@ -136,9 +119,9 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = X_train.shape[2] - #model = build_model(img_rows, img_cols) - breakpoint() - model = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) + mls_input, mls_output = build_model(img_rows, img_cols) + sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) + model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) if weights_file is not None: model.load_weights(weights_file) @@ -149,7 +132,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') - model.fit(x_sslm_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, 
sample_weight=w_train, callbacks=[]) #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, @@ -161,13 +144,13 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict(x_sslm_test, batch_size=1, verbose=1) + preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) #preds = model.predict(X_test, batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate(x_sslm_test, y_test, verbose=1) + score = model.evaluate([X_test, x_sslm_test], y_test, verbose=1) #score = model.evaluate(X_test, y_test, verbose=1) print('Test score:', score) From bd4551347b41405dd727fcab52718acf15922820 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 05:04:39 -0700 Subject: [PATCH 13/35] put. the. candle. back. --- Data/test_tracks.txt | 69 +++++ Data/train_tracks.txt | 574 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 643 insertions(+) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index fb1b766..62f4bd4 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,3 +1,72 @@ 1166.mp3 40.m4a 1090.mp3 +584.m4a +346.m4a +1026.mp3 +1142.mp3 +1302.mp3 +1131.mp3 +608.m4a +1274.mp3 +1376.mp3 +670.m4a +1399.mp3 +1319.mp3 +18.m4a +1123.mp3 +342.m4a +10013.mp3 +642.m4a +306.m4a +1488.mp3 +516.m4a +1192.mp3 +10024.mp3 +1357.mp3 +404.m4a +1063.mp3 +1331.mp3 +1356.mp3 +1322.mp3 +1170.mp3 +1440.mp3 +1091.mp3 +964.mp3 +1436.mp3 +1414.mp3 +1474.mp3 +1036.mp3 +1040.mp3 +426.m4a +1087.mp3 +1301.mp3 +970.mp3 +1141.mp3 +1250.mp3 +1483.mp3 +992.mp3 +1223.mp3 +1284.mp3 +10012.mp3 +472.m4a +6.m4a +986.mp3 +678.m4a +1227.mp3 +1152.mp3 +5.m4a +1270.mp3 +488.m4a +1311.mp3 +1421.mp3 +1402.mp3 +522.m4a +354.m4a +1276.mp3 +1339.mp3 +1236.mp3 +1445.mp3 +1221.mp3 +1244.mp3 +1080.mp3 diff --git a/Data/train_tracks.txt 
b/Data/train_tracks.txt index d45159d..e2e2baf 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,3 +1,577 @@ 1136.mp3 1343.mp3 1027.mp3 +971.mp3 +484.m4a +1130.mp3 +10032.mp3 +991.mp3 +616.m4a +1076.mp3 +478.m4a +1300.mp3 +1333.mp3 +1395.mp3 +440.m4a +1004.mp3 +1372.mp3 +512.m4a +1155.mp3 +1397.mp3 +1485.mp3 +1024.mp3 +1093.mp3 +660.m4a +1254.mp3 +1460.mp3 +1149.mp3 +338.m4a +1396.mp3 +52.m4a +987.mp3 +1384.mp3 +1423.mp3 +594.m4a +1107.mp3 +1410.mp3 +1030.mp3 +1403.mp3 +14.m4a +20.m4a +480.m4a +1455.mp3 +37.m4a +995.mp3 +1430.mp3 +1147.mp3 +1392.mp3 +1164.mp3 +1205.mp3 +626.m4a +1182.mp3 +444.m4a +1448.mp3 +4.m4a +1374.mp3 +996.mp3 +1328.mp3 +1365.mp3 +1358.mp3 +989.mp3 +1478.mp3 +1157.mp3 +1144.mp3 +1286.mp3 +384.m4a +1179.mp3 +1404.mp3 +1256.mp3 +974.mp3 +1271.mp3 +498.m4a +1327.mp3 +618.m4a +1354.mp3 +966.mp3 +955.mp3 +1035.mp3 +1046.mp3 +1352.mp3 +10023.mp3 +1224.mp3 +1204.mp3 +1038.mp3 +1059.mp3 +534.m4a +420.m4a +1490.mp3 +474.m4a +1243.mp3 +1086.mp3 +1226.mp3 +1048.mp3 +1476.mp3 +1214.mp3 +10033.mp3 +1162.mp3 +340.m4a +13.m4a +10025.mp3 +450.m4a +1138.mp3 +1359.mp3 +1219.mp3 +10.m4a +1202.mp3 +965.mp3 +1023.mp3 +1375.mp3 +1140.mp3 +1039.mp3 +1083.mp3 +1092.mp3 +1052.mp3 +1310.mp3 +1462.mp3 +10021.mp3 +1007.mp3 +690.m4a +1242.mp3 +1120.mp3 +1496.mp3 +576.m4a +1167.mp3 +652.m4a +1055.mp3 +1419.mp3 +676.m4a +416.m4a +1316.mp3 +1288.mp3 +634.m4a +1299.mp3 +648.m4a +1268.mp3 +1078.mp3 +1459.mp3 +524.m4a +978.mp3 +1114.mp3 +614.m4a +1218.mp3 +1064.mp3 +1463.mp3 +612.m4a +1122.mp3 +1232.mp3 +1258.mp3 +408.m4a +1408.mp3 +402.m4a +1306.mp3 +1074.mp3 +983.mp3 +1069.mp3 +8.m4a +1126.mp3 +1335.mp3 +1062.mp3 +10008.mp3 +370.m4a +1272.mp3 +1326.mp3 +1429.mp3 +1124.mp3 +320.m4a +1196.mp3 +1464.mp3 +1350.mp3 +12.m4a +1099.mp3 +1054.mp3 +1435.mp3 +1439.mp3 +372.m4a +1269.mp3 +568.m4a +1422.mp3 +10020.mp3 +10009.mp3 +307.m4a +1109.mp3 +1206.mp3 +1318.mp3 +350.m4a +1450.mp3 +360.m4a +963.mp3 +476.m4a +1251.mp3 +1132.mp3 +1011.mp3 +1424.mp3 +492.m4a +1005.mp3 
+1266.mp3 +1079.mp3 +1115.mp3 +1360.mp3 +1175.mp3 +1431.mp3 +1294.mp3 +520.m4a +1245.mp3 +410.m4a +1239.mp3 +468.m4a +16.m4a +1195.mp3 +1151.mp3 +1493.mp3 +1084.mp3 +1240.mp3 +1378.mp3 +1037.mp3 +988.mp3 +324.m4a +1104.mp3 +979.mp3 +424.m4a +1467.mp3 +975.mp3 +364.m4a +1171.mp3 +10026.mp3 +1285.mp3 +668.m4a +1189.mp3 +1291.mp3 +596.m4a +1261.mp3 +1072.mp3 +442.m4a +356.m4a +1148.mp3 +956.mp3 +1070.mp3 +482.m4a +396.m4a +1067.mp3 +486.m4a +1112.mp3 +358.m4a +982.mp3 +1173.mp3 +334.m4a +1262.mp3 +1412.mp3 +1315.mp3 +1309.mp3 +1106.mp3 +1287.mp3 +570.m4a +1389.mp3 +1135.mp3 +1119.mp3 +1407.mp3 +1075.mp3 +666.m4a +1207.mp3 +1367.mp3 +1362.mp3 +1451.mp3 +998.mp3 +1246.mp3 +1381.mp3 +1101.mp3 +1003.mp3 +1125.mp3 +1386.mp3 +536.m4a +1238.mp3 +1095.mp3 +994.mp3 +1088.mp3 +394.m4a +46.m4a +1154.mp3 +1264.mp3 +1077.mp3 +1188.mp3 +1472.mp3 +1134.mp3 +1293.mp3 +1117.mp3 +1053.mp3 +658.m4a +1461.mp3 +422.m4a +1215.mp3 +1045.mp3 +317.m4a +1158.mp3 +1346.mp3 +1194.mp3 +1446.mp3 +10022.mp3 +1159.mp3 +1368.mp3 +1332.mp3 +1096.mp3 +502.m4a +1394.mp3 +1168.mp3 +1181.mp3 +610.m4a +392.m4a +322.m4a +1371.mp3 +39.m4a +560.m4a +1180.mp3 +1338.mp3 +1443.mp3 +1111.mp3 +1432.mp3 +532.m4a +496.m4a +1482.mp3 +981.mp3 +311.m4a +366.m4a +694.m4a +1212.mp3 +1102.mp3 +997.mp3 +646.m4a +1042.mp3 +1060.mp3 +1174.mp3 +1382.mp3 +959.mp3 +554.m4a +510.m4a +1247.mp3 +1213.mp3 +323.m4a +10017.mp3 +1082.mp3 +1110.mp3 +1307.mp3 +1495.mp3 +1296.mp3 +10016.mp3 +1108.mp3 +1364.mp3 +1470.mp3 +1021.mp3 +1492.mp3 +1484.mp3 +654.m4a +504.m4a +30.m4a +1235.mp3 +10027.mp3 +1211.mp3 +1176.mp3 +1015.mp3 +574.m4a +1314.mp3 +1494.mp3 +1405.mp3 +999.mp3 +10014.mp3 +990.mp3 +1071.mp3 +1184.mp3 +506.m4a +1336.mp3 +1199.mp3 +1222.mp3 +976.mp3 +1128.mp3 +1044.mp3 +1000.mp3 +1051.mp3 +1442.mp3 +24.m4a +1210.mp3 +578.m4a +564.m4a +1032.mp3 +1437.mp3 +10029.mp3 +1406.mp3 +1379.mp3 +1347.mp3 +1456.mp3 +1438.mp3 +508.m4a +1022.mp3 +1308.mp3 +1413.mp3 +1012.mp3 +3.m4a +1127.mp3 +1253.mp3 +10035.mp3 +1390.mp3 +980.mp3 +1351.mp3 
+368.m4a +1317.mp3 +1150.mp3 +550.m4a +967.mp3 +630.m4a +1342.mp3 +968.mp3 +1260.mp3 +1383.mp3 +1428.mp3 +590.m4a +1468.mp3 +1133.mp3 +1324.mp3 +1444.mp3 +1118.mp3 +1008.mp3 +10019.mp3 +1420.mp3 +448.m4a +606.m4a +1029.mp3 +10007.mp3 +1160.mp3 +1447.mp3 +548.m4a +1415.mp3 +604.m4a +1220.mp3 +1275.mp3 +10034.mp3 +336.m4a +1186.mp3 +1469.mp3 +1475.mp3 +1454.mp3 +1434.mp3 +1418.mp3 +1014.mp3 +686.m4a +1427.mp3 +10031.mp3 +1279.mp3 +1006.mp3 +1282.mp3 +1325.mp3 +1172.mp3 +1280.mp3 +957.mp3 +632.m4a +1043.mp3 +556.m4a +1387.mp3 +1230.mp3 +10030.mp3 +984.mp3 +1278.mp3 +1400.mp3 +1143.mp3 +10011.mp3 +1103.mp3 +1491.mp3 +662.m4a +1283.mp3 +1334.mp3 +1068.mp3 +1228.mp3 +1066.mp3 +696.m4a +1116.mp3 +1056.mp3 +335.m4a +1348.mp3 +674.m4a +1292.mp3 +1156.mp3 +1304.mp3 +1197.mp3 +1013.mp3 +1355.mp3 +1216.mp3 +1380.mp3 +1426.mp3 +1320.mp3 +352.m4a +1267.mp3 +1085.mp3 +325.m4a +620.m4a +640.m4a +1234.mp3 +1203.mp3 +1163.mp3 +22.m4a +10018.mp3 +1479.mp3 +622.m4a +1487.mp3 +1486.mp3 +344.m4a +1200.mp3 +1340.mp3 +1018.mp3 +1388.mp3 +1363.mp3 +1187.mp3 +1139.mp3 +960.mp3 +1229.mp3 +1208.mp3 +1034.mp3 +1178.mp3 +562.m4a +1341.mp3 +1303.mp3 +1477.mp3 +1058.mp3 +1020.mp3 +1349.mp3 +650.m4a +1190.mp3 +1295.mp3 +962.mp3 +514.m4a +972.mp3 +586.m4a +1312.mp3 +664.m4a +1411.mp3 +1277.mp3 +1366.mp3 +1231.mp3 +386.m4a +1458.mp3 +1263.mp3 +602.m4a +382.m4a +1248.mp3 +1146.mp3 +328.m4a +10028.mp3 +1061.mp3 +466.m4a +528.m4a +1452.mp3 +1498.mp3 +636.m4a +1398.mp3 +1373.mp3 +1290.mp3 +1183.mp3 +1298.mp3 +1237.mp3 +1323.mp3 +10015.mp3 +1198.mp3 +518.m4a +10010.mp3 +1098.mp3 +1047.mp3 +1165.mp3 +1191.mp3 +348.m4a +1466.mp3 +1019.mp3 +1453.mp3 +428.m4a +624.m4a +1391.mp3 +958.mp3 +973.mp3 +1259.mp3 +1370.mp3 +310.m4a +1330.mp3 +692.m4a From 6e3f6dd1b39792048df03ef3482c0e5d48634569 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 13:46:23 -0700 Subject: [PATCH 14/35] put in do_async switch, fix padding issue w/ sslm features --- Python/feature_extraction.py | 14 +++++++++----- 1 file 
changed, 9 insertions(+), 5 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index df62fc0..5c1508b 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -219,18 +219,22 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] + do_async = False async_res = [] logger = multiprocessing.log_to_stderr() logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + if do_async: + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - beat_mls, beat_sslm, beat_times = async_res[i].get() - #beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + if do_async: + beat_mls, beat_sslm, beat_times = async_res[i].get() + else: + beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -333,7 +337,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_weight = 1 data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, k] + data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track From af4995179aad3603c395595bc56bd9fd4ec2a470 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sun, 28 Mar 2021 04:18:34 -0700 Subject: [PATCH 15/35] checkpoint --- Data/train_tracks.txt | 2 +- Python/feature_extraction.py | 28 +++++++++++++--------------- Python/train_segmentation_cnn.py | 6 ++++++ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/Data/train_tracks.txt 
b/Data/train_tracks.txt index e2e2baf..5341be2 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,8 +1,8 @@ +484.m4a 1136.mp3 1343.mp3 1027.mp3 971.mp3 -484.m4a 1130.mp3 10032.mp3 991.mp3 diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 5c1508b..2a20ac6 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -67,17 +67,13 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) - # pad 130 frames (to be 65) with noise at -70dB at the beginning - #pad = np.full((S_to_dB.shape[0], context_length * 2), -70) - #S_padded = np.concatenate((pad, S_to_dB), axis=1) - - + # first max-pooling: by 2. x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 - # this seems to group two frames together + # stack (bag) two frames m = 2 x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) @@ -85,11 +81,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h # create circular foo x_hat_length = x_hat.shape[1] x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) - print("pre-padded: {}, post-padded: {}".format(x_hat.shape, x_padded.shape)) #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p - for i in range(context_length, x_padded.shape[1] - context_length): #iteration in columns of x_hat + for i in range(context_length, x_padded.shape[1] - context_length): for l in range(context_length): cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist @@ -98,20 +93,22 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, 
fft_size=1024, h kappa = 0.1 #equalization factor of 10% t1 = time.time() epsilon = np.full((distances.shape[0], context_length), 1.0) - for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat + for i in range(context_length, distances.shape[0]): for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + if epsilon[i,l] == 0: + epsilon[i,l] = 0.000000001 - t2 = time.time() - print(t2-t1) + t2 = time.time() + #print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide - #sslm = scipy.special.expit(1-distances) sslm = np.transpose(sslm) - # the paper further downsamples by 3, but since we're doing beat-frames only might be ok + + #breakpoint() #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) @@ -235,6 +232,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): beat_mls, beat_sslm, beat_times = async_res[i].get() else: beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -355,7 +353,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) 
data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, l] + data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -375,7 +373,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_window = features[:, l-padding_length: l+padding_length+1] - data_sslm_x[feature_count] = sslm_features[:, :, l] + data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] data_x[feature_count, :, :] = next_window data_y[feature_count] = 0 data_weight[feature_count] = 1 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d9f3277..9c57d95 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -83,6 +83,8 @@ def build_sslm_model(img_rows, img_cols): def build_fused_model(inputs, outputs): x = layers.Concatenate(axis=1)(outputs) x = layers.Conv2D(32, (6, 3), activation='relu')(x) + #x = layers.Conv2D(64, (6, 3), activation='relu')(outputs[0]) + x = layers.Dropout(0.5)(x) x = layers.Flatten()(x) x = layers.Dense(256, activation='relu')(x) x = layers.Dropout(0.5)(x) @@ -122,6 +124,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh mls_input, mls_output = build_model(img_rows, img_cols) sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) + #model = build_fused_model([mls_input], [mls_output]) if weights_file is not None: model.load_weights(weights_file) @@ -135,6 +138,9 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, 
batch_size=batch_size, epochs=nb_epoch, shuffle=True, + # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') From a2cd9898c40bb2f2db353ebb896b3b6813fea901 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sun, 28 Mar 2021 13:58:38 -0700 Subject: [PATCH 16/35] checkpoint, non-padded way of making sslm circular --- Python/feature_extraction.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 2a20ac6..02d1980 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -73,27 +73,25 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 - # stack (bag) two frames + # stack (bag?) 
two frames m = 2 x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) - # create circular foo x_hat_length = x_hat.shape[1] - x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) - #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p - for i in range(context_length, x_padded.shape[1] - context_length): + distances = np.full((x_hat_length, context_length), 1.0) #D has as dimensions N/p and L/p + for i in range(x_hat_length): for l in range(context_length): - cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L + # note that negative indices here make our matrix 'time-circular' + cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% t1 = time.time() epsilon = np.full((distances.shape[0], context_length), 1.0) - for i in range(context_length, distances.shape[0]): + for i in range(distances.shape[0]): for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) if epsilon[i,l] == 0: @@ -122,10 +120,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) for k in range(beat_frames.shape[0]): - sslm_frame = beat_frames[k] // max_pool + context_length + sslm_frame = beat_frames[k] // max_pool sslm_frame_min = sslm_frame - context_length // 2 sslm_frame_max = sslm_frame + context_length // 2 + 1 - beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] + beat_sslms[:,:,k] = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) return beat_sslms @@ -216,7 +214,7 @@ def 
batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - do_async = False + do_async = True async_res = [] logger = multiprocessing.log_to_stderr() From dd8bf84e997dfeeb33486b95ed7b8207fdb53e00 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 29 Mar 2021 16:20:35 -0700 Subject: [PATCH 17/35] checkpoint SSLM work increase the number of lag buffers out. This is the first SSLM that seems to actually work. mean f-Measure for 0.5: 0.2997474318247059, precision: 0.2868324565503678, recall: 0.354656707784005 mean f-Measure for 3.0: 0.5864113051042869, precision: 0.5659258978868614, recall: 0.6849318477316697 --- Python/feature_extraction.py | 58 ++++++++++++++++++++++---------- Python/track_segmentation.py | 22 ++++++------ Python/train_segmentation_cnn.py | 14 ++++---- Python/visualization.py | 7 ++-- 4 files changed, 61 insertions(+), 40 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 02d1980..892c245 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -22,6 +22,7 @@ import paths import warnings import time +import pdb import multiprocessing, logging @@ -44,6 +45,10 @@ random.seed(1234) # for reproducibility np.random.seed(1234) +def debug_signal_handler(signal, frame): + pdb.set_trace() + + def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously @@ -80,26 +85,31 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h x_hat_length = x_hat.shape[1] #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.full((x_hat_length, context_length), 1.0) #D has as dimensions N/p and L/p + + sslm_shape = context_length * 3 # because we'll max pool it down at the end + + distances = np.full((x_hat_length, sslm_shape), 1.0) #D has as dimensions N/p and L/p for i in range(x_hat_length): - for l in 
range(context_length): + for l in range(sslm_shape): # note that negative indices here make our matrix 'time-circular' cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - t1 = time.time() - epsilon = np.full((distances.shape[0], context_length), 1.0) + + epsilon_buf = np.empty((sslm_shape, sslm_shape * 2)) + epsilon = np.empty((distances.shape[0], sslm_shape)) + for i in range(distances.shape[0]): - for l in range(context_length): - epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) - if epsilon[i,l] == 0: - epsilon[i,l] = 0.000000001 + for l in range(sslm_shape): + epsilon_buf[l] = np.concatenate((distances[i-l,:], distances[i,:])) + epsilon[i] = np.quantile(epsilon_buf, kappa, axis=1) + for l in range(sslm_shape): + if epsilon[i, l] == 0: + epsilon[i,l] = 0.000000001 - t2 = time.time() - #print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide @@ -121,9 +131,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool - sslm_frame_min = sslm_frame - context_length // 2 - sslm_frame_max = sslm_frame + context_length // 2 + 1 - beat_sslms[:,:,k] = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) + sslm_frame_min = sslm_frame - sslm_shape // 2 + sslm_frame_max = sslm_frame + sslm_shape // 2 + 1 + beat_sslm = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) + beat_sslms[:,:,k] = skimage.measure.block_reduce(beat_sslm, (3,3), np.max) return beat_sslms @@ -180,9 +191,8 @@ def get_cached_features(filename): else: return None -def compute_features(logger, f, i, audio_files): - logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) +def compute_features(f): 
beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) cached_features = get_cached_features(f) @@ -197,6 +207,11 @@ def compute_features(logger, f, i, audio_files): beat_mls /= np.max(beat_mls) return beat_mls, beat_sslm, beat_times +def compute_features_async(logger, f, i, audio_files): + logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) + + return compute_features(f) + def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ Extract Mel log spectrogram features from a folder of audio files given pre-analysed @@ -223,13 +238,18 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): with multiprocessing.Pool(processes=8) as pool: if do_async: for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + async_res.append(pool.apply_async(compute_features_async, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): if do_async: - beat_mls, beat_sslm, beat_times = async_res[i].get() + try: + beat_mls, beat_sslm, beat_times = async_res[i].get() + except Exception as inst: + print("error processing {}".format(f)) + print(inst) + continue else: - beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -447,6 +467,8 @@ def load_raw_features(file): if __name__ == "__main__": + #import signal + #signal.signal(signal.SIGINT, debug_signal_handler) train_frame = pd.read_csv('../Data/train_tracks.txt', header=None) test_frame = pd.read_csv('../Data/test_tracks.txt', header=None) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 27daf95..7b368f2 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -13,7 +13,7 @@ import os, sys import numpy as np 
import pandas as pd -from feature_extraction import compute_beat_mls, normalize_features_per_band +from feature_extraction import compute_features, normalize_features_per_band from evaluation import post_processing from train_segmentation_cnn import build_model import peakutils @@ -26,16 +26,19 @@ padding = int(context_length / 2) -def compute_cnn_predictions(features): +def compute_cnn_predictions(mls_features, sslm_features): """ Apply pretrained CNN model to features and return predictions. """ - model = build_model(num_mel_bands, context_length) + model = build_model(num_mel_bands, context_length, context_length) model.load_weights(model_weights) model.compile(loss='binary_crossentropy', optimizer='sgd') - features = np.expand_dims(features, 3) - predictions = model.predict(features, batch_size=1) + mls_features = np.expand_dims(mls_features, 3) + sslm_features = np.transpose(sslm_features, (2, 0, 1)) + sslm_features = np.expand_dims(sslm_features, 3) + + predictions = model.predict([mls_features, sslm_features], batch_size=1) return predictions @@ -52,8 +55,7 @@ def extract_features(audio_file, beats_file): beat_times = t[0].values beat_numbers = t[1].values - beat_mls = compute_beat_mls(filename=audio_file, beat_times=beat_times) - beat_mls /= np.max(beat_mls) + beat_mls, beat_sslm, beat_times = compute_features(audio_file) features = compute_context_windows(beat_mls) norm_data = np.load(normalization_path) @@ -61,7 +63,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_times, beat_numbers + return features, beat_sslm, beat_times, beat_numbers def compute_context_windows(features): @@ -148,10 +150,10 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - 
mls_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, sslm_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") - predictions = compute_cnn_predictions(mls_features) + predictions = compute_cnn_predictions(mls_features, sslm_features) print("Get segment times") segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 9c57d95..3f1ccfa 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -68,7 +68,7 @@ def load_test_data(dataset): return test_x, test_sslm_x, test_y, test_weights -def build_model(img_rows, img_cols): +def build_mls_model(img_rows, img_cols): input = layers.Input(shape=(img_rows, img_cols, 1)) x = layers.Conv2D(16, (6, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(3, 6))(x) @@ -82,8 +82,7 @@ def build_sslm_model(img_rows, img_cols): def build_fused_model(inputs, outputs): x = layers.Concatenate(axis=1)(outputs) - x = layers.Conv2D(32, (6, 3), activation='relu')(x) - #x = layers.Conv2D(64, (6, 3), activation='relu')(outputs[0]) + x = layers.Conv2D(64, (6, 3), activation='relu')(x) x = layers.Dropout(0.5)(x) x = layers.Flatten()(x) x = layers.Dense(256, activation='relu')(x) @@ -91,6 +90,10 @@ def build_fused_model(inputs, outputs): x = layers.Dense(1, activation='sigmoid')(x) return Model(inputs = inputs, outputs = x) +def build_model(mls_rows, mls_cols, sslm_shape): + mls_input, mls_output = build_mls_model(mls_rows, mls_cols) + sslm_input, sslm_output = build_sslm_model(sslm_shape, sslm_shape) + return build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -121,10 +124,7 @@ def train_model(batch_size=128, 
nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = X_train.shape[2] - mls_input, mls_output = build_model(img_rows, img_cols) - sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) - model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) - #model = build_fused_model([mls_input], [mls_output]) + model = build_model(img_rows, img_cols, sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) diff --git a/Python/visualization.py b/Python/visualization.py index b602dcc..8259051 100644 --- a/Python/visualization.py +++ b/Python/visualization.py @@ -22,17 +22,14 @@ def visualize_predictions(): """ preds = np.load('../Data/predsTestTracks_100epochs_lr005.npy') - train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - data = np.load('../Data/testDataNormalized.npz') test_y = data['test_y'] # load file lists and indices with open('../Data/fileListsAndIndex.pickle', 'rb') as f: - train_files, train_idx, test_files, test_idx = pickle.load(f) - - for i in range(len(test_labels)): + train_files, train_idx, test_files, test_idx = pickle.load(f) + for i in range(len(test_files)): f = test_files[i] beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) print(f) From e64d33c27104d2124b331c5dc90ec1a2010f8333 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 29 Mar 2021 19:14:23 -0700 Subject: [PATCH 18/35] fix training, early stopping back on --- Python/train_segmentation_cnn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 3f1ccfa..7d75f4a 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -124,7 +124,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = 
X_train.shape[2] - model = build_model(img_rows, img_cols, sslm_train.shape[1]) + model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) @@ -132,11 +132,11 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - early_stopping = EarlyStopping(monitor='val_loss', patience=5) + early_stopping = EarlyStopping(monitor='val_loss', patience=10) print('train model...') model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) From aa7912953e71cd92709b28dee097e2522896e654 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Tue, 30 Mar 2021 10:14:57 -0700 Subject: [PATCH 19/35] fix possible epsilon calculation error --- Python/feature_extraction.py | 3 ++- Python/train_segmentation_cnn.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 892c245..1b50ad9 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -103,7 +103,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h for i in range(distances.shape[0]): for l in range(sslm_shape): - epsilon_buf[l] = np.concatenate((distances[i-l,:], distances[i,:])) + epsilon_buf[l] = np.concatenate((distances[i-(l+1),:], distances[i,:])) epsilon[i] = np.quantile(epsilon_buf, kappa, axis=1) for l in range(sslm_shape): @@ -247,6 +247,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): except Exception as 
inst: print("error processing {}".format(f)) print(inst) + failed_tracks_idx.append(i) continue else: beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 7d75f4a..e27c181 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -23,7 +23,7 @@ from keras.callbacks import EarlyStopping from keras.optimizers import SGD -np.random.seed(1234) # for reproducibility +np.random.seed(1235) # for reproducibility def load_training_data(dataset): From 77bca47d26b8c0645b4998da113d8b1c652c2d12 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 31 Mar 2021 03:51:57 -0700 Subject: [PATCH 20/35] remove bum track --- Data/test_tracks.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 62f4bd4..596411a 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,5 +1,4 @@ 1166.mp3 -40.m4a 1090.mp3 584.m4a 346.m4a From c567be8a149e3fd8d654753a3d1f10b2c5850605 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 31 Mar 2021 13:45:31 -0700 Subject: [PATCH 21/35] caching, float32 --- Python/feature_extraction.py | 56 +++++++++++++++++++----------------- Python/paths.py | 4 +-- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 1b50ad9..10b03d1 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -25,6 +25,7 @@ import pdb import multiprocessing, logging +from contextlib import contextmanager from utils import * import scipy @@ -88,7 +89,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h sslm_shape = context_length * 3 # because we'll max pool it down at the end - distances = np.full((x_hat_length, sslm_shape), 1.0) #D has as dimensions N/p and L/p + distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions 
N/p and L/p for i in range(x_hat_length): for l in range(sslm_shape): # note that negative indices here make our matrix 'time-circular' @@ -98,8 +99,8 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - epsilon_buf = np.empty((sslm_shape, sslm_shape * 2)) - epsilon = np.empty((distances.shape[0], sslm_shape)) + epsilon_buf = np.empty((sslm_shape, sslm_shape * 2), dtype=np.float32) + epsilon = np.empty((distances.shape[0], sslm_shape), dtype=np.float32) for i in range(distances.shape[0]): for l in range(sslm_shape): @@ -115,19 +116,8 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide sslm = np.transpose(sslm) - - #breakpoint() - #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) - #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) - - #Check if SSLM has nans and if it has them, substitute them by 0 - #for i in range(sslm.shape[0]): - # for j in range(sslm.shape[1]): - # if np.isnan(sslm[i,j]): - # sslm[i,j] = 0 - beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') - beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) + beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0]), dtype=np.float32) for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool @@ -183,28 +173,35 @@ def load_waveform(filename): y, sr = librosa.load(path, sr=22050, mono=True) return y -def get_cached_features(filename): - computed_mls_file = paths.get_mls_path(filename) +def get_audio_cache(filename, ext): + path = paths.get_audio_cache_path(filename, ext) - if os.path.exists(computed_mls_file): - return np.load(computed_mls_file) + if os.path.exists(path): + return np.load(path) else: return None +def set_audio_cache(filename, ext, data): + path = paths.get_audio_cache_path(filename, ext) + np.save(path, data) def compute_features(f): beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - cached_features = get_cached_features(f) + waveform = load_waveform(f) + + beat_mls = get_audio_cache(f, '.mls.npy') + if beat_mls is None: + beat_mls = compute_beat_mls(waveform, beat_times) + beat_mls /= np.max(beat_mls) + set_audio_cache(f, '.mls.npy', beat_mls) - if cached_features is not None: - return cached_features + beat_sslm = get_audio_cache(f, '.mls_sslm.npy') - waveform = load_waveform(f) + if beat_sslm is None: + beat_sslm = compute_sslm(waveform, beat_times) + set_audio_cache(f, '.mls_sslm.npy', beat_sslm) - beat_mls = compute_beat_mls(waveform, beat_times) - beat_sslm = compute_sslm(waveform, beat_times) - beat_mls /= np.max(beat_mls) return beat_mls, beat_sslm, beat_times def compute_features_async(logger, f, i, audio_files): @@ -230,11 +227,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): failed_tracks_idx = [] do_async = True + max_tracks = None + async_res = [] logger = multiprocessing.log_to_stderr() logger.setLevel(logging.INFO) + n_tracks = 0 with multiprocessing.Pool(processes=8) as pool: 
if do_async: for i, f in enumerate(audio_files): @@ -269,6 +269,10 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): sslm_feature_list.append(beat_sslm) labels_list.append(label_vec) + if max_tracks is not None and n_tracks > max_tracks: + break + n_tracks += 1 + return feature_list, sslm_feature_list, labels_list, failed_tracks_idx diff --git a/Python/paths.py b/Python/paths.py index 2bea298..55856e4 100644 --- a/Python/paths.py +++ b/Python/paths.py @@ -25,6 +25,6 @@ def remove_suffix(filename): def with_suffix(path, ext): return remove_suffix(path) + '.' + ext -def get_mls_path(audio_filename): - return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') +def get_audio_cache_path(audio_filename, ext): + return os.path.join(mls_path, remove_suffix(audio_filename) + ext) From acb4404d44f79fab2f0d826c8d90e49f3bf49b16 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 09:34:22 -0700 Subject: [PATCH 22/35] chroma sslm! --- Python/feature_extraction.py | 128 +++++++++++++++++-------------- Python/train_segmentation_cnn.py | 4 +- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 10b03d1..0008c48 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -43,52 +43,30 @@ max_pool = 2 +# for debugging +# do_async = False +# max_tracks = 1 + +do_async = True +max_tracks = None + random.seed(1234) # for reproducibility np.random.seed(1234) def debug_signal_handler(signal, frame): pdb.set_trace() - -def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): - """ - Compute average Mel log spectrogram per beat given previously - extracted beat times. 
- - :param waveform: raw waveform data - :param beat_times: list of beat times in seconds - :param mel_bands: number of Mel bands - :param fft_size: FFT size - :param hop_size: hop size for FFT processing - :return: beat sslm - """ - spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, - window=scipy.signal.hamming)) - - mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) - s = np.sum(mel_fb, axis=1) - mel_fb = np.divide(mel_fb, s[:, np.newaxis]) - - mel_spec = np.dot(mel_fb, spec) - - S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) - - # first max-pooling: by 2. - x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) - - MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') - MFCCs = MFCCs[1:,:] + 1 - +def compute_sslm(input_vector, beat_times, hop_size): # stack (bag?) two frames m = 2 - x = [np.roll(MFCCs,n,axis=1) for n in range(m)] + x = [np.roll(input_vector,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) x_hat_length = x_hat.shape[1] - #Cosine distance calculation: D[N/p,L/p] matrix sslm_shape = context_length * 3 # because we'll max pool it down at the end + #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions N/p and L/p for i in range(x_hat_length): for l in range(sslm_shape): @@ -112,8 +90,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h epsilon[i,l] = 0.000000001 - #Self Similarity Lag Matrix - sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + sslm = scipy.special.expit(1-distances/epsilon) # sigmoid sslm = np.transpose(sslm) beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') @@ -128,6 +105,48 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h return beat_sslms + + +def compute_mls_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + """ + Compute self-similarilty lag matrix (SSLM) using mel-log spectrogram as input + + :param waveform: raw waveform data + :param beat_times: list of beat times in seconds + :param mel_bands: number of Mel bands + :param fft_size: FFT size + :param hop_size: hop size for FFT processing + :return: beat sslm + """ + spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + window=scipy.signal.hamming)) + + mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) + s = np.sum(mel_fb, axis=1) + mel_fb = np.divide(mel_fb, s[:, np.newaxis]) + + mel_spec = np.dot(mel_fb, spec) + + S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) + + # first max-pooling: by 2. 
+ x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) + + MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') + MFCCs = MFCCs[1:,:] + 1 + + return compute_sslm(MFCCs, beat_times, hop_size) + +def compute_chroma_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + spec = librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming) + spec = np.abs(spec) + x_prime = skimage.measure.block_reduce(spec, (1,max_pool), np.max) + + chroma_fb = librosa.filters.chroma(22050, fft_size, n_chroma=12) + chromagram = np.dot(chroma_fb, x_prime) + + return compute_sslm(chromagram + 1, beat_times, hop_size) + def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously @@ -173,36 +192,35 @@ def load_waveform(filename): y, sr = librosa.load(path, sr=22050, mono=True) return y -def get_audio_cache(filename, ext): +def with_audio_cache(filename, ext, waveform, beat_times, genf): path = paths.get_audio_cache_path(filename, ext) if os.path.exists(path): - return np.load(path) + return np.load(path), waveform else: - return None + if waveform is None: + waveform = load_waveform(filename) -def set_audio_cache(filename, ext, data): - path = paths.get_audio_cache_path(filename, ext) - np.save(path, data) + data = genf(waveform, beat_times) + np.save(path, data) + return data, waveform def compute_features(f): beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - waveform = load_waveform(f) - - beat_mls = get_audio_cache(f, '.mls.npy') - if beat_mls is None: + def gen_beat_mls(waveform, beat_times): beat_mls = compute_beat_mls(waveform, beat_times) beat_mls /= np.max(beat_mls) - set_audio_cache(f, '.mls.npy', beat_mls) + return beat_mls - beat_sslm = get_audio_cache(f, '.mls_sslm.npy') + waveform = None + beat_mls, waveform = with_audio_cache(f, 
'.mls.npy', waveform, beat_times, gen_beat_mls) + beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) + chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) - if beat_sslm is None: - beat_sslm = compute_sslm(waveform, beat_times) - set_audio_cache(f, '.mls_sslm.npy', beat_sslm) + beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) - return beat_mls, beat_sslm, beat_times + return beat_mls, beat_sslm, chroma_sslm, beat_times def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) @@ -226,8 +244,6 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - do_async = True - max_tracks = None async_res = [] @@ -243,14 +259,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): for i, f in enumerate(audio_files): if do_async: try: - beat_mls, beat_sslm, beat_times = async_res[i].get() + beat_mls, beat_sslm, chroma_sslm, beat_times = async_res[i].get() except Exception as inst: print("error processing {}".format(f)) print(inst) failed_tracks_idx.append(i) continue else: - beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) + beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features_async(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -325,7 +341,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length, 2), dtype=np.float32) data_y = 
np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -449,7 +465,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x = data_sslm_x[:feature_count, :, :] + data_sslm_x = data_sslm_x[:feature_count, :, :, :] data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index e27c181..7f3cdab 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -75,7 +75,7 @@ def build_mls_model(img_rows, img_cols): return input, x def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) + input = layers.Input(shape=(img_rows, img_cols, 2)) x = layers.Conv2D(16, (8, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(6, 6))(x) return input, x @@ -119,7 +119,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) - x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] @@ -147,7 +146,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) - x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) From e0e20d922b4c031d0b1d9fbdc38f2f69764ff378 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:03:23 -0700 Subject: [PATCH 23/35] extract with downbeat info, add more output to evaluation.py --- Python/evaluation.py | 43 
+++++++++++++++++++++++++----------- Python/track_segmentation.py | 1 + Python/utils.py | 15 +++++++++---- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 36a2b97..6e1a5bc 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -13,9 +13,10 @@ import mir_eval import paths +from operator import itemgetter + predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -f_measure_thresh = 3 # tolerance window in seconds def load_data(preds_file, file_lists): @@ -53,14 +54,14 @@ def post_processing(preds_track): preds_track = np.multiply(preds_track, np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + # unit maximum preds_track /= np.max(preds_track) return preds_track -if __name__ == "__main__": - +def run_eval(f_measure_thresh): f_measures = [] precisions = [] recalls = [] @@ -69,9 +70,6 @@ def post_processing(preds_track): preds = np.reshape(preds, len(preds)) for i, f in enumerate(test_files): - - print("Evaluating {}".format(f)) - # load annotations segment_times = get_segment_times(f, paths.annotations_path) @@ -83,25 +81,44 @@ def post_processing(preds_track): # post processing preds_track = post_processing(preds_track) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - pred_times = beat_times[peak_loc] - 1 + # insert a zero value at the beginning of the predictions to help the peak-finding algorithm + # identify the first beat of a track + + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 + pred_times = beat_times[peak_loc] # compute f-measure - f_score, p, r = mir_eval.onset.f_measure(segment_times, pred_times, window=f_measure_thresh) + f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) f_measures.append(f_score) precisions.append(p) recalls.append(r) - 
print("f-Measure: {}, precision: {}, recall: {}".format(f_score, p, r)) + #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) - print(" ") - print("Mean scores across all test tracks:") - print("f-Measure: {}, precision: {}, recall: {}".format(mean_f, mean_p, mean_r)) + print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) + + return list(zip(test_files, f_measures, precisions, recalls)) + +def get_sort_key(item): + return item[1] + +if __name__ == "__main__": + short = run_eval(0.5) + long = run_eval(3.0) + + for i in range(len(short)): + short[i] += long[i][1:4] + + sorted_tracks = sorted(short, key=get_sort_key) + print("{:<20}{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}".format("filename", "f0.5", "p0.5", "r0.5", "f3", "p3", "r3")) + for track in sorted_tracks: + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}".format(*track)) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 8f4e538..1ad79b8 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -128,6 +128,7 @@ def compute_segments_from_predictions(predictions, beat_times): if not os.path.isfile(out_dir + file_name + '.beats.txt'): print("Extracting beat times (this might take a while)...") os.system('DBNBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') + os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") mls_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') diff --git a/Python/utils.py b/Python/utils.py index a55b571..fd697e3 100644 --- a/Python/utils.py +++ b/Python/utils.py @@ -103,8 +103,7 @@ def get_segment_times(audio_file, annotation_folder): return segment_times - -def 
get_beat_times(audio_file, beats_folder): +def get_beat_times(audio_file, beats_folder, include_beat_numbers=False): """ Read beat times from annotation file. :param audio_file: path to audio files @@ -114,7 +113,15 @@ def get_beat_times(audio_file, beats_folder): file_name = os.path.splitext(os.path.basename(audio_file))[0] beats_file = os.path.join(beats_folder, file_name + '.beats.txt') + + if not os.path.isfile(beats_file): + print(f"Extracting beat times for {audio_file}") + os.system(f"DBNDownBeatTracker single '{audio_file}' -o '{beats_file}'") + t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values - return beat_times + if include_beat_numbers: + return t[0].values, t[1].values + else: + return t[0].values + From e8d14708dda4255cca902dc3c1ab6f2349cfa940 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:05:23 -0700 Subject: [PATCH 24/35] BeatTracker -> DownBeatTracker --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0701ca2..16eba4c 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ After that the beat tracking from the MADMOM library can be run on all files wit ```bash cd ./Audio mkdir beats -DBNBeatTracker batch -o ./beats $(ls *.mp3) +DBNDownBeatTracker batch -o ./beats $(ls *.mp3) ``` This will take quite some time and use a lot of memory. After finishing, the beat files (`*.beats.txt`) will be placed next to the audio files. 
From f1115a7cceecb88925e2e4a26a2e29fe5b9e42a1 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:06:08 -0700 Subject: [PATCH 25/35] remove commented code --- Python/evaluation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 6e1a5bc..9cfdc52 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -96,8 +96,6 @@ def run_eval(f_measure_thresh): precisions.append(p) recalls.append(r) - #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) - mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) From d326179965d7f5686f708e8b524ed461a1ec3385 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:07:49 -0700 Subject: [PATCH 26/35] protect against overly short tracks --- Python/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index c5edab4..eb17baa 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -52,8 +52,9 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds_track = np.convolve(preds_track, np.hamming(4) / np.sum(np.hamming(4)), 'same') # emphasize peaks - preds_track = np.multiply(preds_track, - np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + if len(preds_track) >= 32: + preds_track = np.multiply(preds_track, + np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) # emphasize downbeeat From d692b3a565d3a939e26afe1dc67c33cdca9a908b Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:08:43 -0700 Subject: [PATCH 27/35] get much more memory efficiency by memmaping feature files --- Python/feature_extraction.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 0008c48..af4d404 100644 --- 
a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -196,7 +196,7 @@ def with_audio_cache(filename, ext, waveform, beat_times, genf): path = paths.get_audio_cache_path(filename, ext) if os.path.exists(path): - return np.load(path), waveform + return np.load(path, mmap_mode='r'), waveform else: if waveform is None: waveform = load_waveform(filename) @@ -224,8 +224,7 @@ def gen_beat_mls(waveform, beat_times): def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) - - return compute_features(f) + compute_features(f) def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ @@ -259,7 +258,11 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): for i, f in enumerate(audio_files): if do_async: try: - beat_mls, beat_sslm, chroma_sslm, beat_times = async_res[i].get() + # have child process actually write features to disk + async_res[i].get() + + # now reload them in mmap + beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features(f) except Exception as inst: print("error processing {}".format(f)) print(inst) @@ -337,7 +340,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training :return: batch data in the form (n_items, n_melbands, n_context) """ - n_preallocate = 250000 + n_preallocate = 500000 # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) @@ -465,7 +468,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x = data_sslm_x[:feature_count, :, :, :] + data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2], data_sslm_x.shape[3])) data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] From 8513ad3881028f8d769f7aa93d61fcd3c2ca55cd Mon Sep 17 00:00:00 
2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:09:39 -0700 Subject: [PATCH 28/35] more tracks! --- Data/salami-data-public | 1 + Data/test_tracks.txt | 21 +++++ Data/train_tracks.txt | 169 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+) create mode 120000 Data/salami-data-public diff --git a/Data/salami-data-public b/Data/salami-data-public new file mode 120000 index 0000000..a66aeab --- /dev/null +++ b/Data/salami-data-public @@ -0,0 +1 @@ +/Users/ben/src/salami-data-public \ No newline at end of file diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 596411a..dd02f71 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -69,3 +69,24 @@ 1221.mp3 1244.mp3 1080.mp3 +824.m4a +752.m4a +10043.mp3 +543.m4a +587.m4a +818.m4a +950.m4a +615.m4a +1640.m4a +1654.m4a +663.m4a +1648.m4a +359.m4a +1602.m4a +39.m4a +1624.m4a +807.m4a +459.m4a +355.m4a +728.m4a +531.m4a diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 5341be2..dab82f3 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -575,3 +575,172 @@ 310.m4a 1330.mp3 692.m4a +571.m4a +575.m4a +10042.mp3 +339.m4a +842.m4a +411.m4a +379.m4a +63.m4a +791.m4a +746.m4a +852.m4a +483.m4a +795.m4a +774.m4a +739.m4a +1642.m4a +732.m4a +491.m4a +27.m4a +802.m4a +882.m4a +659.m4a +43.m4a +906.m4a +691.m4a +535.m4a +371.m4a +651.m4a +455.m4a +7.m4a +675.m4a +744.m4a +399.m4a +431.m4a +75.m4a +15.m4a +51.m4a +515.m4a +836.m4a +407.m4a +551.m4a +783.m4a +846.m4a +10036.mp3 +667.m4a +892.m4a +555.m4a +832.m4a +1632.m4a +647.m4a +11.m4a +687.m4a +603.m4a +427.m4a +419.m4a +591.m4a +936.m4a +655.m4a +695.m4a +708.m4a +816.m4a +706.m4a +866.m4a +864.m4a +447.m4a +1610.m4a +511.m4a +731.m4a +704.m4a +1650.m4a +527.m4a +750.m4a +567.m4a +1644.m4a +367.m4a +363.m4a +803.m4a +702.m4a +786.m4a +559.m4a +946.m4a +1628.m4a +862.m4a +579.m4a +583.m4a +475.m4a +1620.m4a +10041.mp3 +727.m4a +343.m4a +35.m4a +707.m4a +10037.mp3 +1600.m4a +914.m4a +643.m4a +1634.m4a 
+627.m4a +794.m4a +683.m4a +1010.m4a +858.m4a +619.m4a +10039.mp3 +930.m4a +700.m4a +747.m4a +768.m4a +1630.m4a +10038.mp3 +1604.m4a +19.m4a +811.m4a +815.m4a +1614.m4a +471.m4a +611.m4a +607.m4a +479.m4a +799.m4a +563.m4a +635.m4a +822.m4a +669.m4a +47.m4a +910.m4a +682.m4a +599.m4a +767.m4a +1612.m4a +71.m4a +828.m4a +1626.m4a +31.m4a +23.m4a +1622.m4a +595.m4a +463.m4a +854.m4a +1638.m4a +1652.m4a +860.m4a +784.m4a +787.m4a +10040.mp3 +755.m4a +1646.m4a +631.m4a +760.m4a +770.m4a +1618.m4a +703.m4a +623.m4a +1636.m4a +519.m4a +451.m4a +726.m4a +1608.m4a +539.m4a +898.m4a +1606.m4a +814.m4a +639.m4a +834.m4a +954.m4a +790.m4a +439.m4a +79.m4a +782.m4a From 798b3e9c5475806844dacc35fe51a5b3e57b52a8 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:14:59 -0700 Subject: [PATCH 29/35] fix merge --- Python/evaluation.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8cd339f..ac64f10 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -38,11 +38,10 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx -def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): +def post_processing(preds_track): """ Post processing of prediction probabilities, applies smoothing window and emphasizes beats by multiplying with running avarage. - Also weights predictions towards beat "1". 
:param preds_track: CNN predictions per beat @@ -57,11 +56,6 @@ def post_processing(preds_track): :return: post-processed predictions np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) - # emphasize downbeeat - if emphasize_downbeat: - preds_track = np.multiply(preds_track, np.where(beat_numbers == 1, 1, 0.5)) - - # unit maximum preds_track /= np.max(preds_track) From 4d7e6af66fb5af98ad83543f22fb49da148b8ed0 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 8 Apr 2021 01:06:45 -0700 Subject: [PATCH 30/35] large changes - bring in beat-number as a new dimension - crib a new prediction-thresholding algo from a research paper - refactor feature extraction --- Data/train_tracks.txt | 2 +- Python/evaluation.py | 50 ++++++++-- Python/feature_extraction.py | 157 ++++++++++++++----------------- Python/track_segmentation.py | 49 ++++++---- Python/train_segmentation_cnn.py | 89 ++++++++---------- 5 files changed, 179 insertions(+), 168 deletions(-) diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index dab82f3..7992c7c 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,3 +1,4 @@ +10007.mp3 484.m4a 1136.mp3 1343.mp3 @@ -421,7 +422,6 @@ 448.m4a 606.m4a 1029.mp3 -10007.mp3 1160.mp3 1447.mp3 548.m4a diff --git a/Python/evaluation.py b/Python/evaluation.py index ac64f10..4d38983 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -17,7 +17,7 @@ predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' - +prediction_threshold = 0.3 def load_data(preds_file, file_lists): """ @@ -38,6 +38,33 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx +def choose_preds(preds, beat_times): + # At test time, we apply the trained network to each position in the + spectrogram of the music piece to be segmented, obtaining a boundary + probability for each frame. 
We then employ a simple means of peak-picking + on this boundary activation curve: Every output value that is not + surpassed within ±6 seconds is a boundary candidate. From each candidate + value we subtract the average of the activation curve in the past 12 and + future 6 seconds, to compensate for long-term trends. We end up with a + list of boundary candidates along with strength values that can be + thresholded at will. We found that more elaborate peak picking methods + did not improve results. + preds_out = np.zeros((len(preds))) + + for i in range(len(preds)): + pred_time = beat_times[i] + in_window = (beat_times > pred_time - 6) & (beat_times <= pred_time + 6) + max_in_window = np.argmax(np.where(in_window, preds, 0)) + if i == max_in_window: + in_avg_window = (beat_times > pred_time - 12) & (beat_times <= pred_time + 6) + window_avg = np.mean(preds[in_avg_window]) + preds_out[i] = preds[i] - window_avg + else: + preds_out[i] = 0 + + return np.flatnonzero(preds_out > prediction_threshold) + + def post_processing(preds_track): """ Post processing of prediction probabilities, applies smoothing @@ -47,7 +74,6 @@ def post_processing(preds_track): :return: post-processed predictions """ - # smoothing preds_track = np.convolve(preds_track, np.hamming(4) / np.sum(np.hamming(4)), 'same') # emphasize peaks @@ -82,15 +108,21 @@ def run_eval(f_measure_thresh): # get predictions for current track preds_track = np.squeeze(np.asarray(preds[test_idx == i])) - # post processing - preds_track = post_processing(preds_track) + if len(preds_track) == 0: + continue + + if True: + pred_indexes = choose_preds(preds_track, beat_times) + pred_times = beat_times[pred_indexes] + else: + preds_track = post_processing(preds_track) - # insert a zero value at the beginning of the predictions to help the peak-finding algorithm - # identify the first beat of a track + # insert a zero value at the beginning of the predictions to help the peak-finding algorithm + # identify 
the first beat of a track - peds_track = np.insert(preds_track, 0, 0) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 - pred_times = beat_times[peak_loc] + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 + pred_times = beat_times[peak_loc] # compute f-measure f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index af4d404..8087a4f 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -36,7 +36,7 @@ context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands neg_frames_factor = 5 # how many more negative examples than segment boundaries -pos_frames_oversample = 5 # oversample positive frames because there are too few +pos_frames_oversample = 5 # oversample positive frames because there are too few mid_frames_oversample = 3 # oversample frames between segments label_smearing = 1 # how many frames are positive examples around an annotation padding_length = int(context_length / 2) @@ -44,11 +44,12 @@ max_pool = 2 # for debugging -# do_async = False -# max_tracks = 1 - -do_async = True -max_tracks = None +if False: + do_async = False + max_tracks = 1 +else: + do_async = True + max_tracks = None random.seed(1234) # for reproducibility np.random.seed(1234) @@ -144,6 +145,7 @@ def compute_chroma_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size= chroma_fb = librosa.filters.chroma(22050, fft_size, n_chroma=12) chromagram = np.dot(chroma_fb, x_prime) + chromagram = librosa.power_to_db(chromagram,ref=np.max) return compute_sslm(chromagram + 1, beat_times, hop_size) @@ -181,6 +183,16 @@ def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=102 return beat_melspec +def compute_time_features(features, beat_times): + length = len(features) / 22050. 
+ time_ratios = np.zeros((len(beat_times), 500), dtype=np.float32) + + for k in range(len(beat_times)): + time_ratios[k, int((beat_times[k] * 500) // length)] = 1.0 + + return time_ratios + + def load_waveform(filename): if "/" in filename: path = filename @@ -205,8 +217,14 @@ def with_audio_cache(filename, ext, waveform, beat_times, genf): np.save(path, data) return data, waveform +def make_beat_time_features(beat_numbers): + times = np.zeros((len(beat_numbers), 4)) + for i in range(len(beat_numbers)): + times[i][beat_numbers[i] - 1] = 1 + return times + def compute_features(f): - beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) + beat_times, beat_numbers = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path, include_beat_numbers=True) def gen_beat_mls(waveform, beat_times): beat_mls = compute_beat_mls(waveform, beat_times) @@ -216,11 +234,14 @@ def gen_beat_mls(waveform, beat_times): waveform = None beat_mls, waveform = with_audio_cache(f, '.mls.npy', waveform, beat_times, gen_beat_mls) beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) - chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) + #times, waveform = with_audio_cache(f, '.beat_time_ratios.npy', waveform, beat_times, compute_time_features) + times = make_beat_time_features(beat_numbers) + + #chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) - beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) + #beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) - return beat_mls, beat_sslm, chroma_sslm, beat_times + return beat_mls, beat_mls_sslm, times, beat_times def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) @@ -240,6 +261,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): feature_list = 
[] sslm_feature_list = [] + time_feature_list = [] labels_list = [] failed_tracks_idx = [] @@ -262,14 +284,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): async_res[i].get() # now reload them in mmap - beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features(f) + beat_mls, beat_sslm, time_features, beat_times = compute_features(f) except Exception as inst: print("error processing {}".format(f)) print(inst) failed_tracks_idx.append(i) continue else: - beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features_async(logger, f, i, audio_files) + beat_mls, beat_sslm, time_features, beat_times = compute_features(f) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -286,13 +308,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): feature_list.append(beat_mls) sslm_feature_list.append(beat_sslm) + time_feature_list.append(time_features) labels_list.append(label_vec) if max_tracks is not None and n_tracks > max_tracks: break n_tracks += 1 - return feature_list, sslm_feature_list, labels_list, failed_tracks_idx + return feature_list, sslm_feature_list, time_feature_list, labels_list, failed_tracks_idx def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample=10000): @@ -328,7 +351,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample return features, mean_vec, std_vec -def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training=True): +def prepare_batch_data(feature_list, sslm_feature_list, time_feature_list, labels_list, is_training=True): """ Reads precomputed beat Mel spectrograms and slices them into context windows for CNN training. 
For the training set, subsampling is @@ -344,7 +367,8 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length, 2), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) + data_time_x = np.zeros(shape=(n_preallocate, time_feature_list[0].shape[1]), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -352,8 +376,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training feature_count = 0 current_track = 0 - for features, sslm_features, labels in zip(feature_list, sslm_feature_list, labels_list): - + for features, sslm_features, time_features, labels in zip(feature_list, sslm_feature_list, time_feature_list, labels_list): print("Processed {} examples from {} tracks".format(feature_count, current_track+1)) num_beats = features.shape[1] @@ -363,6 +386,17 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training labels = np.concatenate((np.zeros(padding_length), labels, np.zeros(padding_length)), axis=0) + def add_feature(idx, label, weight=1): + nonlocal feature_count + data_x[feature_count, :, :] = features[:, idx - padding_length: idx + padding_length + 1] + data_sslm_x[feature_count] = sslm_features[:, :, idx - padding_length] + data_time_x[feature_count] = time_features[idx - padding_length] + data_y[feature_count] = label + data_weight[feature_count] = weight + track_idx[feature_count] = current_track + + feature_count += 1 + if is_training is True: # take all positive frames. these are indexes into the already padded features. 
@@ -371,36 +405,15 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for rep in range(pos_frames_oversample): for k in positive_frames_idx: + add_feature(k, label=1) - next_window = features[:, k - padding_length: k + padding_length + 1] - next_label = 1 - next_weight = 1 - - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 - - # apply label smearing: set labels around annotation to 1 and give them a triangular weight + ## apply label smearing: set labels around annotation to 1 and give them a triangular weight for l in range(k - label_smearing, k + label_smearing + 1): # don't smear into padding. if padding_length <= l < num_beats + padding_length and l != k: - - next_window = features[:, l-padding_length: l+padding_length+1] - next_label = 1 next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) 
- - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(l, label=0.5, weight=next_weight) # take all frames in the middle between two boundaries (typical false positives) mid_segment_frames_idx = (positive_frames_idx[1:] + positive_frames_idx[:-1]) / 2 @@ -412,16 +425,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for l in range(k - label_smearing, k + label_smearing + 1): if padding_length <= l < num_beats + padding_length: - - next_window = features[:, l-padding_length: l+padding_length+1] - - data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] - data_x[feature_count, :, :] = next_window - data_y[feature_count] = 0 - data_weight[feature_count] = 1 - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(l, label=0) # sample randomly from the remaining frames remaining_frames_idx = [] @@ -434,33 +438,11 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for k in range(num_neg_frames): next_idx = random.sample(remaining_frames_idx, 1)[0] - next_window = features[:, next_idx-padding_length: next_idx+padding_length+1] - next_label = 0 - next_weight = 1 - - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, next_idx - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(next_idx, label=0) else: # test data -> extract all context windows and keep track of track indices for k in range(padding_length, num_beats + padding_length): - - next_window = features[:, k-padding_length: k+padding_length+1] - next_label = labels[k] - next_weight = 1 - - data_x[feature_count, :, :] = 
next_window - data_y[feature_count] = next_label - data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] - - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(k, label=labels[k]) current_track += 1 @@ -468,12 +450,13 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2], data_sslm_x.shape[3])) + data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2])) + data_time_x.resize((feature_count, data_time_x.shape[1])) data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] - return data_x, data_sslm_x, data_y, data_weight, track_idx + return data_x, data_sslm_x, data_time_x, data_y, data_weight, track_idx def load_raw_features(file): @@ -502,13 +485,13 @@ def load_raw_features(file): print("Extracting MLS features") - train_features, train_sslm_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, - paths.beats_path, - paths.annotations_path) + train_features, train_sslm_features, train_time_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, + paths.beats_path, + paths.annotations_path) - test_features, test_sslm_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, - paths.beats_path, - paths.annotations_path) + test_features, test_sslm_features, test_time_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, + paths.beats_path, + paths.annotations_path) print("Extracted features for {} training and {} test tracks".format(len(train_features), len(test_features))) @@ -524,8 +507,8 @@ def load_raw_features(file): # train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - train_x, 
train_sslm_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_labels, is_training=True) - test_x, test_sslm_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_labels, is_training=False) + train_x, train_sslm_x, train_time_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_time_features, train_labels, is_training=True) + test_x, test_sslm_x, test_time_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_time_features, test_labels, is_training=False) train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) @@ -533,8 +516,8 @@ def load_raw_features(file): print("Prepared {} training items and {} test items".format(train_x.shape[0], test_x.shape[0])) # store normalized features for CNN training - np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_y=train_y, train_weights=train_weights) - np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_y=test_y, test_weights=test_weights) + np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_time_x=train_time_x, train_y=train_y, train_weights=train_weights) + np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_time_x=test_time_x, test_y=test_y, test_weights=test_weights) np.savez('../Data/normalization.npz', mean_vec=mean_vec, std_vec=std_vec) # store file lists and index mapping to training and test data diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 7b368f2..a01f2ae 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -14,7 +14,7 @@ import numpy as np import pandas as pd from feature_extraction import compute_features, normalize_features_per_band 
-from evaluation import post_processing +from evaluation import post_processing, choose_preds from train_segmentation_cnn import build_model import peakutils @@ -26,19 +26,23 @@ padding = int(context_length / 2) -def compute_cnn_predictions(mls_features, sslm_features): - """ - Apply pretrained CNN model to features and return predictions. - """ +def build_full_model(): model = build_model(num_mel_bands, context_length, context_length) model.load_weights(model_weights) model.compile(loss='binary_crossentropy', optimizer='sgd') + return model + +def compute_cnn_predictions(mls_features, sslm_features, time_features): + """ + Apply pretrained CNN model to features and return predictions. + """ + model = build_full_model() mls_features = np.expand_dims(mls_features, 3) sslm_features = np.transpose(sslm_features, (2, 0, 1)) - sslm_features = np.expand_dims(sslm_features, 3) + #sslm_features = sslm_features[:, :, :, 0] # remove chroma for now - predictions = model.predict([mls_features, sslm_features], batch_size=1) + predictions = model.predict([mls_features, sslm_features, time_features], batch_size=1) return predictions @@ -55,7 +59,7 @@ def extract_features(audio_file, beats_file): beat_times = t[0].values beat_numbers = t[1].values - beat_mls, beat_sslm, beat_times = compute_features(audio_file) + beat_mls, sslm, time_features, beat_times = compute_features(audio_file) features = compute_context_windows(beat_mls) norm_data = np.load(normalization_path) @@ -63,7 +67,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_sslm, beat_times, beat_numbers + return features, sslm, time_features, beat_times def compute_context_windows(features): @@ -103,7 +107,7 @@ def print_predictions(p, beat_times): print("%i:\t%.3f\t%.1f" % (i, p[i], beat_times[i])) -def compute_segments_from_predictions(predictions, beat_times, beat_numbers): +def 
compute_segments_from_predictions(predictions, beat_times): """ Computes the segment times from a prediction curve and the beat times using peak picking. @@ -113,14 +117,21 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): print("raw predicitions:") print_predictions(predictions, beat_times) - predictions = post_processing(predictions, beat_numbers, emphasize_downbeat=True) + if True: + peak_loc = choose_preds(predictions, beat_times) + segment_times = beat_times[peak_loc] + #print("after post-processing:") + #print_predictions(peak_loc, beat_times) - print("after post-processing:") - print_predictions(predictions, beat_times) + else: + predictions = post_processing(predictions) + + print("after post-processing:") + print_predictions(predictions, beat_times) - predictions = np.insert(predictions, 0, 0) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 - segment_times = beat_times[peak_loc] + predictions = np.insert(predictions, 0, 0) + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 + segment_times = beat_times[peak_loc] print("beat_num\ttime:") for i in peak_loc: @@ -150,13 +161,13 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - mls_features, sslm_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, sslm, time_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") - predictions = compute_cnn_predictions(mls_features, sslm_features) + predictions = compute_cnn_predictions(mls_features, sslm, time_features) print("Get segment times") - segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) + segment_times = compute_segments_from_predictions(predictions, 
beat_times) print("\n") for f in segment_times: diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 7f3cdab..8658cda 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -39,12 +39,7 @@ def load_training_data(dataset): """ data = np.load(dataset) - train_x = data['train_x'] - train_sslm_x = data['train_sslm_x'] - train_y = data['train_y'] - train_weights = data['train_weights'] - - return train_x, train_sslm_x, train_y, train_weights + return data['train_x'], data['train_sslm_x'], data['train_time_x'], data['train_y'], data['train_weights'] def load_test_data(dataset): @@ -60,40 +55,34 @@ def load_test_data(dataset): """ data = np.load(dataset) - test_x = data['test_x'] - test_sslm_x = data['test_sslm_x'] - test_y = data['test_y'] - test_weights = data['test_weights'] - - return test_x, test_sslm_x, test_y, test_weights - - -def build_mls_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) - x = layers.Conv2D(16, (6, 8), activation='relu')(input) - x = layers.MaxPooling2D(pool_size=(3, 6))(x) - return input, x - -def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 2)) - x = layers.Conv2D(16, (8, 8), activation='relu')(input) - x = layers.MaxPooling2D(pool_size=(6, 6))(x) - return input, x - -def build_fused_model(inputs, outputs): - x = layers.Concatenate(axis=1)(outputs) - x = layers.Conv2D(64, (6, 3), activation='relu')(x) - x = layers.Dropout(0.5)(x) - x = layers.Flatten()(x) - x = layers.Dense(256, activation='relu')(x) - x = layers.Dropout(0.5)(x) - x = layers.Dense(1, activation='sigmoid')(x) - return Model(inputs = inputs, outputs = x) + return data['test_x'], data['test_sslm_x'], data['test_time_x'], data['test_y'], data['test_weights'] + def build_model(mls_rows, mls_cols, sslm_shape): - mls_input, mls_output = build_mls_model(mls_rows, mls_cols) - sslm_input, sslm_output = build_sslm_model(sslm_shape, sslm_shape) - 
return build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) + mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') + mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) + mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + + sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), name='sslm_input') + sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) + sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + + merged = layers.Concatenate(axis=1, name='mls_slsm_concat')([mls, sslm]) + merged = layers.Conv2D(64, (6, 3), activation='relu', name='concat_conv')(merged) + merged = layers.Dropout(0.5, name='concat_dropout')(merged) + + merged = layers.Flatten()(merged) + + merged = layers.Dense(256, activation='relu', name='final_dense')(merged) + merged = layers.Dropout(0.5, name='final_dropout')(merged) + + time_input = layers.Input(shape=(4,), name='time_input') + time = layers.Dense(1, activation='relu', name='time_dense')(time_input) + merged = layers.Concatenate(name='final_concat')([merged, time]) + + merged = layers.Dense(1, activation='sigmoid', name='final_sigmoid')(merged) + + return Model(inputs=[mls_input, sslm_input, time_input], outputs = merged) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -106,7 +95,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh """ print('loading training data...') - X_train, x_sslm_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') + X_train, x_sslm_train, x_time_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') print('training data size:') print(X_train.shape) @@ -114,11 +103,13 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh p = np.random.permutation(X_train.shape[0]) X_train = X_train[p, :, :] x_sslm_train = 
x_sslm_train[p, :, :] + x_time_train = x_time_train[p] y_train = y_train[p] w_train = w_train[p] X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) + x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] @@ -131,31 +122,25 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - early_stopping = EarlyStopping(monitor='val_loss', patience=10) + early_stopping = EarlyStopping(monitor='val_loss', patience=15) print('train model...') - model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x=[X_train, x_sslm_train, x_time_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) - #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) - - #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') - X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') + X_test, x_sslm_test, x_time_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) + x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) - #preds = model.predict(X_test, batch_size=1, verbose=1) + preds = model.predict([X_test, x_sslm_test, x_time_test], batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate([X_test, x_sslm_test], 
y_test, verbose=1) - #score = model.evaluate(X_test, y_test, verbose=1) + score = model.evaluate([X_test, x_sslm_test, x_time_test], y_test, verbose=1) print('Test score:', score) # save model @@ -163,4 +148,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh if __name__ == "__main__": - train_model(nb_epoch=75) + train_model(nb_epoch=200) From 669a0150c2e48ecb299540a20d2dafa131af411f Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 8 Apr 2021 10:18:56 -0700 Subject: [PATCH 31/35] code cleanup --- Python/evaluation.py | 15 +++------------ Python/track_segmentation.py | 18 ++---------------- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 4d38983..8fff7db 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -111,18 +111,8 @@ def run_eval(f_measure_thresh): if len(preds_track) == 0: continue - if True: - pred_indexes = choose_preds(preds_track, beat_times) - pred_times = beat_times[pred_indexes] - else: - preds_track = post_processing(preds_track) - - # insert a zero value at the beginning of the predictions to help the peak-finding algorithm - # identify the first beat of a track - - peds_track = np.insert(preds_track, 0, 0) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 - pred_times = beat_times[peak_loc] + pred_indexes = choose_preds(preds_track, beat_times) + pred_times = beat_times[pred_indexes] # compute f-measure f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) @@ -142,6 +132,7 @@ def get_sort_key(item): return item[1] if __name__ == "__main__": + run_eval(0.2) short = run_eval(0.5) long = run_eval(3.0) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index a01f2ae..86ae88e 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -40,7 +40,6 @@ def compute_cnn_predictions(mls_features, sslm_features, time_features): 
mls_features = np.expand_dims(mls_features, 3) sslm_features = np.transpose(sslm_features, (2, 0, 1)) - #sslm_features = sslm_features[:, :, :, 0] # remove chroma for now predictions = model.predict([mls_features, sslm_features, time_features], batch_size=1) @@ -116,22 +115,9 @@ def compute_segments_from_predictions(predictions, beat_times): print("raw predicitions:") print_predictions(predictions, beat_times) + peak_loc = choose_preds(predictions, beat_times) - if True: - peak_loc = choose_preds(predictions, beat_times) - segment_times = beat_times[peak_loc] - #print("after post-processing:") - #print_predictions(peak_loc, beat_times) - - else: - predictions = post_processing(predictions) - - print("after post-processing:") - print_predictions(predictions, beat_times) - - predictions = np.insert(predictions, 0, 0) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 - segment_times = beat_times[peak_loc] + segment_times = beat_times[peak_loc] print("beat_num\ttime:") for i in peak_loc: From aff6e94272a3643b1398ad815308ae83b8bcc1bd Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sat, 10 Apr 2021 04:04:30 -0700 Subject: [PATCH 32/35] move some parameters out to parameters.py --- Python/parameters.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Python/parameters.py diff --git a/Python/parameters.py b/Python/parameters.py new file mode 100644 index 0000000..4ad6222 --- /dev/null +++ b/Python/parameters.py @@ -0,0 +1,29 @@ +# thresholding value for prediction-choice algorithm. trade recall for accuracy here. 
+prediction_threshold = 0.3 + +# how many beats make up a context window for the MLS part of the network +context_length = 115 + +# number of Mel bands +num_mel_bands = 80 + +# how many frames to max-pool in building the SSLM +max_pool = 2 + +# how far back to calculate the SSLM (note that actual length will be max_pool * sslm_length) +sslm_length = 65 + +# how many more negative examples than segment boundaries +neg_frames_factor = 5 + +# oversample positive frames because there are too few +pos_frames_oversample = 5 + +# oversample frames between segments +mid_frames_oversample = 3 + +# how many frames are semi-positive examples around an annotation +label_smearing = 1 + +padding_length = int(context_length / 2) + From 3ae11035038aa3e3b5ef8fcbe1a5d2f64e0fefda Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 14 Apr 2021 23:16:28 -0700 Subject: [PATCH 33/35] move numbers into parameters.py. increase SSLM length. --- Python/evaluation.py | 4 ++-- Python/feature_extraction.py | 32 +++++++++++++------------------- Python/parameters.py | 2 +- Python/track_segmentation.py | 14 ++++++-------- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8fff7db..05c50eb 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -12,12 +12,12 @@ import peakutils import mir_eval import paths +import parameters from operator import itemgetter predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -prediction_threshold = 0.3 def load_data(preds_file, file_lists): """ @@ -62,7 +62,7 @@ def choose_preds(preds, beat_times): else: preds_out[i] = 0 - return np.flatnonzero(preds_out > prediction_threshold) + return np.flatnonzero(preds_out > parameters.prediction_threshold) def post_processing(preds_track): diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 8087a4f..503b711 100644 --- a/Python/feature_extraction.py +++ 
b/Python/feature_extraction.py @@ -32,16 +32,7 @@ import skimage.measure from scipy.spatial import distance - -context_length = 65 # how many beats make up a context window for the CNN -num_mel_bands = 80 # number of Mel bands -neg_frames_factor = 5 # how many more negative examples than segment boundaries -pos_frames_oversample = 5 # oversample positive frames because there are too few -mid_frames_oversample = 3 # oversample frames between segments -label_smearing = 1 # how many frames are positive examples around an annotation -padding_length = int(context_length / 2) - -max_pool = 2 +from parameters import * # for debugging if False: @@ -65,7 +56,7 @@ def compute_sslm(input_vector, beat_times, hop_size): x_hat_length = x_hat.shape[1] - sslm_shape = context_length * 3 # because we'll max pool it down at the end + sslm_shape = sslm_length * 3 # because we'll max pool it down at the end #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions N/p and L/p @@ -95,7 +86,7 @@ def compute_sslm(input_vector, beat_times, hop_size): sslm = np.transpose(sslm) beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') - beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0]), dtype=np.float32) + beat_sslms = np.zeros((sslm_length, sslm_length, beat_frames.shape[0]), dtype=np.float32) for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool @@ -232,8 +223,8 @@ def gen_beat_mls(waveform, beat_times): return beat_mls waveform = None - beat_mls, waveform = with_audio_cache(f, '.mls.npy', waveform, beat_times, gen_beat_mls) - beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) + beat_mls, waveform = with_audio_cache(f, '.mls_115.npy', waveform, beat_times, gen_beat_mls) + beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm_115.npy', waveform, beat_times, compute_mls_sslm) #times, waveform = with_audio_cache(f, '.beat_time_ratios.npy', waveform, beat_times, compute_time_features) times = make_beat_time_features(beat_numbers) @@ -331,6 +322,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample if mean_vec is None: # subsample features + print("sampling") idx = random.sample(range(features.shape[0]), min(features.shape[0], subsample)) temp_features = features[idx, :, :] @@ -345,8 +337,9 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample mean_vec = np.mean(temp_features, axis=0) std_vec = np.std(temp_features, axis=0) - features = features - mean_vec[np.newaxis, :, np.newaxis] - features = features / std_vec[np.newaxis, :, np.newaxis] + print("modifying...") + features -= mean_vec[np.newaxis, :, np.newaxis] + features /= std_vec[np.newaxis, :, np.newaxis] return features, mean_vec, std_vec @@ -367,7 +360,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, time_feature_list, label # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, 
context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, sslm_length, sslm_length), dtype=np.float32) data_time_x = np.zeros(shape=(n_preallocate, time_feature_list[0].shape[1]), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) @@ -407,7 +400,7 @@ def add_feature(idx, label, weight=1): for k in positive_frames_idx: add_feature(k, label=1) - ## apply label smearing: set labels around annotation to 1 and give them a triangular weight + # apply label smearing: set labels around annotation to 1 and give them a triangular weight for l in range(k - label_smearing, k + label_smearing + 1): # don't smear into padding. @@ -449,7 +442,7 @@ def add_feature(idx, label, weight=1): if feature_count > n_preallocate: break - data_x = data_x[:feature_count, :, :] + data_x.resize((feature_count, data_x.shape[1], data_x.shape[2])) data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2])) data_time_x.resize((feature_count, data_time_x.shape[1])) data_y = data_y[:feature_count] @@ -510,6 +503,7 @@ def load_raw_features(file): train_x, train_sslm_x, train_time_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_time_features, train_labels, is_training=True) test_x, test_sslm_x, test_time_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_time_features, test_labels, is_training=False) + print("normalizing features") train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) diff --git a/Python/parameters.py b/Python/parameters.py index 4ad6222..3d0e161 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -11,7 +11,7 @@ max_pool = 2 # how far back to calculate the SSLM (note that actual length will be max_pool * sslm_length) -sslm_length = 65 +sslm_length = 
115 # how many more negative examples than segment boundaries neg_frames_factor = 5 diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 86ae88e..ecce646 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -16,14 +16,12 @@ from feature_extraction import compute_features, normalize_features_per_band from evaluation import post_processing, choose_preds from train_segmentation_cnn import build_model -import peakutils normalization_path = '../Data/normalization.npz' model_weights = '../Data/model_weights_100epochs_lr005.h5' out_dir = '../Temp/' -num_mel_bands = 80 -context_length = 65 -padding = int(context_length / 2) + +from parameters import context_length, num_mel_bands, padding_length def build_full_model(): @@ -79,8 +77,8 @@ def compute_context_windows(features): n_preallocate = 10000 - features = np.hstack((0.001 * np.random.rand(num_mel_bands, padding), features, - 0.001 * np.random.rand(num_mel_bands, padding))) + features = np.hstack((0.001 * np.random.rand(num_mel_bands, padding_length), features, + 0.001 * np.random.rand(num_mel_bands, padding_length))) # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) @@ -88,11 +86,11 @@ def compute_context_windows(features): feature_count = 0 num_padded_features = features.shape[1] - for k in range(padding, num_padded_features - padding): + for k in range(padding_length, num_padded_features - padding_length): if feature_count > n_preallocate: break - next_window = features[:, k-padding: k+padding+1] + next_window = features[:, k-padding_length: k+padding_length+1] data_x[feature_count, :, :] = next_window feature_count += 1 From 73914d5b5b145b4415b0b2dbfebf547d7870b180 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 16 Apr 2021 04:35:16 -0700 Subject: [PATCH 34/35] allow for toggling different parts of the network for testing --- Python/parameters.py | 4 ++ 
Python/train_segmentation_cnn.py | 74 ++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/Python/parameters.py b/Python/parameters.py index 3d0e161..e9b5ad4 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -1,6 +1,10 @@ # thresholding value for prediction-choice algorithm. trade recall for accuracy here. prediction_threshold = 0.3 +# should we include (MLS, SSLM, beat #) features when training? +#training_features = {'mls', 'sslm', 'beat_numbers'} +training_features = {'mls', 'beat_numbers'} + # how many beats make up a context window for the MLS part of the network context_length = 115 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 8658cda..d951ab2 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -25,6 +25,7 @@ np.random.seed(1235) # for reproducibility +import parameters def load_training_data(dataset): """ @@ -59,15 +60,29 @@ def load_test_data(dataset): def build_model(mls_rows, mls_cols, sslm_shape): - mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') - mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) - mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + inputs = [] + merged_input = [] - sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), name='sslm_input') - sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) - sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + if 'mls' in parameters.training_features: + mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') + mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) + mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + merged_input.append(mls) + inputs.append(mls_input) + + if 'sslm' in parameters.training_features: + sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), 
name='sslm_input') + sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) + sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + + merged_input.append(sslm) + inputs.append(sslm_input) + + if len(merged_input) > 1: + merged = layers.Concatenate(axis=1, name='mls_sslm_concat')(merged_input) + else: + merged = merged_input[0] - merged = layers.Concatenate(axis=1, name='mls_slsm_concat')([mls, sslm]) merged = layers.Conv2D(64, (6, 3), activation='relu', name='concat_conv')(merged) merged = layers.Dropout(0.5, name='concat_dropout')(merged) @@ -76,13 +91,34 @@ def build_model(mls_rows, mls_cols, sslm_shape): merged = layers.Dense(256, activation='relu', name='final_dense')(merged) merged = layers.Dropout(0.5, name='final_dropout')(merged) - time_input = layers.Input(shape=(4,), name='time_input') - time = layers.Dense(1, activation='relu', name='time_dense')(time_input) - merged = layers.Concatenate(name='final_concat')([merged, time]) + final_dense_input = [merged] + if 'beat_numbers' in parameters.training_features: + time_input = layers.Input(shape=(4,), name='time_input') + time = layers.Dense(1, activation='relu', name='time_dense')(time_input) + final_dense_input.append(time) + inputs.append(time_input) + + if len(final_dense_input) > 1: + merged = layers.Concatenate(name='final_concat')(final_dense_input) + else: + merged = final_dense_input[0] merged = layers.Dense(1, activation='sigmoid', name='final_sigmoid')(merged) - return Model(inputs=[mls_input, sslm_input, time_input], outputs = merged) + return Model(inputs=inputs, outputs = merged) + +def make_input(mls, sslm, time): + input = [] + if 'mls' in parameters.training_features: + input.append(mls) + + if 'sslm' in parameters.training_features: + input.append(sslm) + + if 'beat_numbers' in parameters.training_features: + input.append(time) + + return input def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ 
-100,7 +136,12 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh print('training data size:') print(X_train.shape) + img_rows = X_train.shape[1] + img_cols = X_train.shape[2] + model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) + p = np.random.permutation(X_train.shape[0]) + X_train = X_train[p, :, :] x_sslm_train = x_sslm_train[p, :, :] x_time_train = x_time_train[p] @@ -111,10 +152,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_train = np.expand_dims(X_train, 3) x_sslm_train = np.expand_dims(x_sslm_train, 3) - img_rows = X_train.shape[1] - img_cols = X_train.shape[2] - - model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) @@ -125,7 +162,8 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=15) print('train model...') - model.fit(x=[X_train, x_sslm_train, x_time_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + + model.fit(x=make_input(X_train, x_sslm_train, x_time_train), y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) print('load test data...') @@ -135,12 +173,12 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict([X_test, x_sslm_test, x_time_test], batch_size=1, verbose=1) + preds = model.predict(make_input(X_test, x_sslm_test, x_time_test), batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate([X_test, x_sslm_test, x_time_test], y_test, verbose=1) + score = model.evaluate(make_input(X_test, x_sslm_test, x_time_test), y_test, verbose=1) print('Test score:', score) # save model 
From edefd022310a198ad1db315306795bfd76b62a3f Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 16 Apr 2021 13:51:55 -0700 Subject: [PATCH 35/35] mmap stuff. new tracks. --- Data/test_tracks.txt | 14 ++++ Data/train_tracks.txt | 120 +++++++++++++++++++++++++++++++ Python/parameters.py | 3 +- Python/train_segmentation_cnn.py | 4 +- 4 files changed, 137 insertions(+), 4 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index dd02f71..d7d6f00 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -90,3 +90,17 @@ 355.m4a 728.m4a 531.m4a +549.m4a +10050.mp3 +437.m4a +855.m4a +951.m4a +653.m4a +879.m4a +935.m4a +835.m4a +629.m4a +10051.mp3 +541.m4a +893.m4a +341.m4a diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 7992c7c..d6a95cc 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -744,3 +744,123 @@ 439.m4a 79.m4a 782.m4a +803.m4a +605.m4a +10086.m4a +53.m4a +10075.m4a +823.m4a +10079.m4a +685.m4a +10087.m4a +10088.m4a +533.m4a +10081.m4a +701.m4a +901.m4a +39.m4a +39.m4a +827.m4a +525.m4a +933.m4a +10045.mp3 +389.m4a +10078.m4a +1655.m4a +10070.m4a +799.m4a +581.m4a +85.m4a +10062.m4a +597.m4a +943.m4a +565.m4a +10068.m4a +10074.m4a +445.m4a +10044.mp3 +1651.m4a +10058.m4a +829.m4a +909.m4a +557.m4a +381.m4a +621.m4a +485.m4a +931.m4a +413.m4a +357.m4a +839.m4a +10072.m4a +911.m4a +493.m4a +1635.m4a +1647.m4a +733.m4a +10091.m4a +837.m4a +10052.mp3 +1627.m4a +10054.mp3 +429.m4a +10071.m4a +10059.m4a +645.m4a +859.m4a +10063.m4a +501.m4a +21.m4a +10049.mp3 +10056.mp3 +10084.m4a +863.m4a +10090.m4a +10053.mp3 +10076.m4a +1607.m4a +895.m4a +10083.m4a +795.m4a +10048.mp3 +517.m4a +10080.m4a +853.m4a +851.m4a +847.m4a +10069.m4a +477.m4a +589.m4a +861.m4a +333.m4a +10073.m4a +10057.m4a +941.m4a +1643.m4a +677.m4a +661.m4a +10067.m4a +10082.m4a +10089.m4a +1619.m4a +1623.m4a +1615.m4a +831.m4a +10047.mp3 +397.m4a +693.m4a +10066.m4a +10055.mp3 +10046.mp3 +573.m4a +10077.m4a +819.m4a +461.m4a +10085.m4a +813.m4a 
+10061.m4a +10065.m4a +949.m4a +469.m4a +309.m4a +709.m4a +10060.m4a diff --git a/Python/parameters.py b/Python/parameters.py index e9b5ad4..b048dc7 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -2,8 +2,7 @@ prediction_threshold = 0.3 # should we include (MLS, SSLM, beat #) features when training? -#training_features = {'mls', 'sslm', 'beat_numbers'} -training_features = {'mls', 'beat_numbers'} +training_features = {'mls', 'sslm', 'beat_numbers'} # how many beats make up a context window for the MLS part of the network context_length = 115 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d951ab2..c2680e7 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -39,7 +39,7 @@ def load_training_data(dataset): :return train_weights (n_items x 1) """ - data = np.load(dataset) + data = np.load(dataset, mmap_mode='r') return data['train_x'], data['train_sslm_x'], data['train_time_x'], data['train_y'], data['train_weights'] @@ -55,7 +55,7 @@ def load_test_data(dataset): :return test_weights (n_items x 1) """ - data = np.load(dataset) + data = np.load(dataset, mmap_mode='r') return data['test_x'], data['test_sslm_x'], data['test_time_x'], data['test_y'], data['test_weights']