From c1ca3533ee4c566baf135373c0e262b968940436 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Tue, 16 Mar 2021 05:06:10 -0700 Subject: [PATCH 01/35] add some centralized paths support --- Python/evaluation.py | 5 ++--- Python/feature_extraction.py | 13 ++++++------- Python/paths.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 Python/paths.py diff --git a/Python/evaluation.py b/Python/evaluation.py index 9bc138e..0a5e63e 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -11,11 +11,10 @@ import pickle import peakutils import mir_eval +import paths predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -beats_folder_path = '../Audio' -annotations_folder_path = '../Data/salami-data-public/annotations/' f_measure_thresh = 3 # tolerance window in seconds @@ -74,7 +73,7 @@ def post_processing(preds_track): print("Evaluating {}".format(f)) # load annotations - segment_times = get_segment_times(f, annotations_folder_path) + segment_times = get_segment_times(f, paths.annotations_path) # get beat times beat_times = get_beat_times(f, beats_folder_path) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 6c831d6..508b881 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -19,12 +19,11 @@ import librosa import random import pickle +import paths + from utils import * import scipy -audio_folder_path = '../Audio' -beats_folder_path = '../Audio' -annotations_folder_path = '../Data/salami-data-public/annotations/' context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands neg_frames_factor = 5 # how many more negative examples than segment boundaries @@ -317,12 +316,12 @@ def load_raw_features(file): print("Extracting MLS features") train_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, - beats_folder_path, - 
annotations_folder_path) + paths.beats_path, + paths.annotations_path) test_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, - beats_folder_path, - annotations_folder_path) + paths.beats_path, + paths.annotations_path) print("Extracted features for {} training and {} test tracks".format(len(train_features), len(test_features))) diff --git a/Python/paths.py b/Python/paths.py new file mode 100644 index 0000000..61c5a5b --- /dev/null +++ b/Python/paths.py @@ -0,0 +1,18 @@ +# encoding: utf-8 +""" + Define path locations and helpful functions +""" + +import os + +audio_path = '../Audio' +beats_path = '../Audio/beats' +mls_path = '../Audio/features' +annotations_path = '../Data/salami-data-public/annotations/' + +def remove_suffix(filename): + return os.path.splitext(os.path.basename(filename))[0] + +def get_mls_path(audio_filename): + return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') + From ea218dc8775709b940fb330751c310692d0749ce Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 17 Mar 2021 07:35:43 -0700 Subject: [PATCH 02/35] latest --- Python/evaluation.py | 18 ++++++---- Python/feature_extraction.py | 62 +++++++++++++++++++++----------- Python/paths.py | 4 +++ Python/track_segmentation.py | 39 +++++++++++++++----- Python/train_segmentation_cnn.py | 2 +- Python/utils.py | 29 +++++++++++---- Python/visualization.py | 17 +++++++-- 7 files changed, 126 insertions(+), 45 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 0a5e63e..db3c54e 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -37,10 +37,11 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx -def post_processing(preds_track): +def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): """ Post processing of prediction probabilities, applies smoothing window and emphasizes beats by multiplying with running avarage. + Also weights predictions towards beat "1". 
:param preds_track: CNN predictions per beat :return: post-processed predictions @@ -53,6 +54,11 @@ def post_processing(preds_track): preds_track = np.multiply(preds_track, np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + + # emphasize downbeeat + if emphasize_downbeat: + preds_track = np.multiply(preds_track, np.where(beat_numbers == 1, 1, 0.5)) + # unit maximum preds_track /= np.max(preds_track) @@ -76,19 +82,19 @@ def post_processing(preds_track): segment_times = get_segment_times(f, paths.annotations_path) # get beat times - beat_times = get_beat_times(f, beats_folder_path) + beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) # get predictions for current track preds_track = np.squeeze(np.asarray(preds[test_idx == i])) # post processing - preds_track = post_processing(preds_track) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) + preds_track = post_processing(preds_track, beat_numbers) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.2) - pred_times = beat_times[peak_loc] - 1 + pred_times = beat_times[peak_loc] # compute f-measure - f_score, p, r = mir_eval.onset.f_measure(segment_times, pred_times, window=f_measure_thresh) + f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) f_measures.append(f_score) precisions.append(p) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 508b881..146e768 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -48,7 +48,18 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 :return: beat Mel spectrogram (mel_bands x frames) """ - y, sr = librosa.load(os.path.join(audio_folder_path, filename), sr=22050, mono=True) + computed_mls_file = paths.get_mls_path(filename) + + if os.path.exists(computed_mls_file): + return np.load(computed_mls_file) + + + if "/" in filename: + path = filename + 
else: + path = os.path.join(paths.audio_path, filename) + + y, sr = librosa.load(path, sr=22050, mono=True) spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) @@ -72,6 +83,15 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 return beat_melspec +def compute_features(logger, f, i, audio_files): + logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) + + beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) + + beat_mls = compute_beat_mls(f, beat_times) + beat_mls /= np.max(beat_mls) + return beat_mls, beat_times + def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ Extract Mel log spectrogram features from a folder of audio files given pre-analysed @@ -88,31 +108,33 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - for i, f in enumerate(audio_files): - - print("Track {} / {}".format(i, len(audio_files))) - - beat_times = get_beat_times(f, beats_folder) + async_res = [] - beat_mls = compute_beat_mls(f, beat_times) - beat_mls /= np.max(beat_mls) + logger = multiprocessing.log_to_stderr() + logger.setLevel(logging.INFO) - label_vec = np.zeros(beat_mls.shape[1],) - segment_times = get_segment_times(f, annotation_folder) + with multiprocessing.Pool(processes=8) as pool: + #for i, f in enumerate(audio_files): + # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) - if isinstance(segment_times, int): - failed_tracks_idx.append(i) - print("Extraction failed - no annotation found for " + f) - continue + for i, f in enumerate(audio_files): + #beat_mls, beat_times = async_res[i].get() + beat_mls, beat_times = compute_features(logger, f, i , audio_files) + label_vec = np.zeros(beat_mls.shape[1],) + segment_times = get_segment_times(f, paths.annotations_path) - for segment_start in segment_times: + if 
isinstance(segment_times, int): + failed_tracks_idx.append(i) + print("Extraction failed - no annotation found for " + f) + continue - closest_beat = np.argmin(np.abs(beat_times - segment_start)) - if closest_beat < len(label_vec): - label_vec[closest_beat] = 1. + for segment_start in segment_times: + closest_beat = np.argmin(np.abs(beat_times - segment_start)) + if closest_beat < len(label_vec): + label_vec[closest_beat] = 1. - feature_list.append(beat_mls) - labels_list.append(label_vec) + feature_list.append(beat_mls) + labels_list.append(label_vec) return feature_list, labels_list, failed_tracks_idx diff --git a/Python/paths.py b/Python/paths.py index 61c5a5b..af52dac 100644 --- a/Python/paths.py +++ b/Python/paths.py @@ -8,11 +8,15 @@ audio_path = '../Audio' beats_path = '../Audio/beats' mls_path = '../Audio/features' +viz_path = '../Audio/viz' annotations_path = '../Data/salami-data-public/annotations/' def remove_suffix(filename): return os.path.splitext(os.path.basename(filename))[0] +def with_suffix(path, ext): + return remove_suffix(path) + '.' 
+ ext + def get_mls_path(audio_filename): return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 5bdc4c9..f3d277a 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -49,7 +49,8 @@ def extract_features(audio_file, beats_file): """ t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values + beat_times = t[0].values + beat_numbers = t[1].values beat_mls = compute_beat_mls(filename=audio_file, beat_times=beat_times) beat_mls /= np.max(beat_mls) @@ -60,7 +61,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_times + return features, beat_times, beat_numbers def compute_context_windows(features): @@ -96,16 +97,34 @@ def compute_context_windows(features): return data_x -def compute_segments_from_predictions(predictions, beat_times): +def print_predictions(p, beat_times): + for i in range(len(p)): + print("%i:\t%.3f\t%.1f" % (i, p[i], beat_times[i])) + + +def compute_segments_from_predictions(predictions, beat_times, beat_numbers): """ Computes the segment times from a prediction curve and the beat times using peak picking. 
""" predictions = np.squeeze(predictions) - predictions = post_processing(predictions) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.05) + + breakpoint() + print("raw predicitions:") + print_predictions(predictions, beat_times) + + predictions = post_processing(predictions, beat_numbers, emphasize_downbeat=True) + + print("after post-processing:") + print_predictions(predictions, beat_times) + + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) segment_times = beat_times[peak_loc] + print("beat_num\ttime:") + for i in peak_loc: + print("%i\t%.2f" % (i, beat_times[i])) + return segment_times @@ -127,16 +146,20 @@ def compute_segments_from_predictions(predictions, beat_times): if not os.path.isfile(out_dir + file_name + '.beats.txt'): print("Extracting beat times (this might take a while)...") - os.system('DBNBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') + os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - mls_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") predictions = compute_cnn_predictions(mls_features) print("Get segment times") - segment_times = compute_segments_from_predictions(predictions, beat_times) + segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) + + print("\n") + for f in segment_times: + print(f) print("The result has been stored in " + output_file) np.savetxt(output_file, segment_times, fmt='%4.2f', delimiter='\n') diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 1b81912..9d3fd17 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -114,7 +114,7 @@ def train_model(batch_size=128, nb_epoch=100, 
save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - #early_stopping = EarlyStopping(monitor='val_loss', patience=5) + early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, diff --git a/Python/utils.py b/Python/utils.py index a55b571..30a9145 100644 --- a/Python/utils.py +++ b/Python/utils.py @@ -89,22 +89,29 @@ def get_segment_times(audio_file, annotation_folder): # for some tracks, only one annotation is available, take first one as default # if there is no annotation available, store -1 as error code + try: - label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile1_uppercase.txt') + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile3_uppercase.txt') t = pd.read_table(label_file, header=None) except IOError: try: - label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile2_uppercase.txt') + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile1_uppercase.txt') t = pd.read_table(label_file, header=None) except IOError: - return -1 + try: + label_file = os.path.join(annotation_folder, file_name, 'parsed', 'textfile2_uppercase.txt') + t = pd.read_table(label_file, header=None) + except IOError: + return -1 + + if t[1].dtype == 'O': + t = t[~(t[1].str.lower().isin(['silence', 'end']))] segment_times = t.iloc[:, 0].values return segment_times - -def get_beat_times(audio_file, beats_folder): +def get_beat_times(audio_file, beats_folder, include_beat_numbers=False): """ Read beat times from annotation file. 
:param audio_file: path to audio files @@ -114,7 +121,15 @@ def get_beat_times(audio_file, beats_folder): file_name = os.path.splitext(os.path.basename(audio_file))[0] beats_file = os.path.join(beats_folder, file_name + '.beats.txt') + + if not os.path.isfile(beats_file): + print(f"Extracting beat times for {audio_file}") + os.system(f"DBNDownBeatTracker single '{audio_file}' -o '{beats_file}'") + t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values - return beat_times + if include_beat_numbers: + return t[0].values, t[1].values + else: + return t[0].values + diff --git a/Python/visualization.py b/Python/visualization.py index ea540d3..b602dcc 100644 --- a/Python/visualization.py +++ b/Python/visualization.py @@ -8,8 +8,11 @@ import numpy as np from feature_extraction import load_raw_features from evaluation import post_processing +from utils import get_beat_times import matplotlib.pyplot as plt import pickle +import paths +import os def visualize_predictions(): @@ -31,19 +34,27 @@ def visualize_predictions(): for i in range(len(test_labels)): f = test_files[i] - print f + beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) + print(f) idx = np.where(test_idx == i)[0] labels = test_y[idx] preds_track = np.squeeze(np.asarray(preds[idx])) - preds_track = post_processing(preds_track) + processed_preds_track = post_processing(preds_track, beat_numbers) + with_downbeat_preds = post_processing(preds_track, beat_numbers, emphasize_downbeat=True) + preds_track = 0.5 + 0.5 * preds_track + processed_preds_track = 1.0 + 0.5 * processed_preds_track + with_downbeat_preds = 1.5 + 0.5 * with_downbeat_preds labels *= 0.5 plt.plot(labels) plt.plot(preds_track) - plt.show() + plt.plot(processed_preds_track) + plt.plot(with_downbeat_preds) + plt.savefig(os.path.join(paths.viz_path, paths.with_suffix(test_files[i], 'svg')), dpi=400) + plt.clf() def visualize_training_data(): From 37eee1a39e22af8c004915bacce1bdd71975e1ab Mon 
Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:12:55 -0700 Subject: [PATCH 03/35] update data, clear up variable names in track_segmentation --- Data/test_tracks.txt | 110 ++-- Data/train_tracks.txt | 944 ++++++++++++++++++++--------------- Python/track_segmentation.py | 5 +- 3 files changed, 606 insertions(+), 453 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index bdad88d..62f4bd4 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,46 +1,72 @@ -4.m4a +1166.mp3 40.m4a -46.m4a -5.m4a -6.m4a -8.m4a -955.mp3 -956.mp3 -957.mp3 -958.mp3 -959.mp3 -960.mp3 -962.mp3 -963.mp3 +1090.mp3 +584.m4a +346.m4a +1026.mp3 +1142.mp3 +1302.mp3 +1131.mp3 +608.m4a +1274.mp3 +1376.mp3 +670.m4a +1399.mp3 +1319.mp3 +18.m4a +1123.mp3 +342.m4a +10013.mp3 +642.m4a +306.m4a +1488.mp3 +516.m4a +1192.mp3 +10024.mp3 +1357.mp3 +404.m4a +1063.mp3 +1331.mp3 +1356.mp3 +1322.mp3 +1170.mp3 +1440.mp3 +1091.mp3 964.mp3 -965.mp3 -966.mp3 -967.mp3 -968.mp3 +1436.mp3 +1414.mp3 +1474.mp3 +1036.mp3 +1040.mp3 +426.m4a +1087.mp3 +1301.mp3 970.mp3 -971.mp3 -972.mp3 -973.mp3 -974.mp3 -975.mp3 -976.mp3 -978.mp3 -979.mp3 -980.mp3 -981.mp3 -982.mp3 -983.mp3 -984.mp3 -986.mp3 -987.mp3 -988.mp3 -989.mp3 -990.mp3 -991.mp3 +1141.mp3 +1250.mp3 +1483.mp3 992.mp3 -994.mp3 -995.mp3 -996.mp3 -997.mp3 -998.mp3 -999.mp3 +1223.mp3 +1284.mp3 +10012.mp3 +472.m4a +6.m4a +986.mp3 +678.m4a +1227.mp3 +1152.mp3 +5.m4a +1270.mp3 +488.m4a +1311.mp3 +1421.mp3 +1402.mp3 +522.m4a +354.m4a +1276.mp3 +1339.mp3 +1236.mp3 +1445.mp3 +1221.mp3 +1244.mp3 +1080.mp3 diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 95ed51d..e2e2baf 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,449 +1,577 @@ -10.m4a -1000.mp3 -1003.mp3 +1136.mp3 +1343.mp3 +1027.mp3 +971.mp3 +484.m4a +1130.mp3 +10032.mp3 +991.mp3 +616.m4a +1076.mp3 +478.m4a +1300.mp3 +1333.mp3 +1395.mp3 +440.m4a 1004.mp3 -1005.mp3 -1006.mp3 -1007.mp3 -1008.mp3 -1011.mp3 -1012.mp3 -1013.mp3 -1014.mp3 
-1015.mp3 -1018.mp3 -1019.mp3 -1020.mp3 -1021.mp3 -1022.mp3 -1023.mp3 +1372.mp3 +512.m4a +1155.mp3 +1397.mp3 +1485.mp3 1024.mp3 -1026.mp3 -1027.mp3 -1029.mp3 +1093.mp3 +660.m4a +1254.mp3 +1460.mp3 +1149.mp3 +338.m4a +1396.mp3 +52.m4a +987.mp3 +1384.mp3 +1423.mp3 +594.m4a +1107.mp3 +1410.mp3 1030.mp3 -1032.mp3 -1034.mp3 +1403.mp3 +14.m4a +20.m4a +480.m4a +1455.mp3 +37.m4a +995.mp3 +1430.mp3 +1147.mp3 +1392.mp3 +1164.mp3 +1205.mp3 +626.m4a +1182.mp3 +444.m4a +1448.mp3 +4.m4a +1374.mp3 +996.mp3 +1328.mp3 +1365.mp3 +1358.mp3 +989.mp3 +1478.mp3 +1157.mp3 +1144.mp3 +1286.mp3 +384.m4a +1179.mp3 +1404.mp3 +1256.mp3 +974.mp3 +1271.mp3 +498.m4a +1327.mp3 +618.m4a +1354.mp3 +966.mp3 +955.mp3 1035.mp3 -1036.mp3 -1037.mp3 -1038.mp3 -1039.mp3 -1040.mp3 -1042.mp3 -1043.mp3 -1044.mp3 -1045.mp3 1046.mp3 -1047.mp3 +1352.mp3 +10023.mp3 +1224.mp3 +1204.mp3 +1038.mp3 +1059.mp3 +534.m4a +420.m4a +1490.mp3 +474.m4a +1243.mp3 +1086.mp3 +1226.mp3 1048.mp3 -1051.mp3 +1476.mp3 +1214.mp3 +10033.mp3 +1162.mp3 +340.m4a +13.m4a +10025.mp3 +450.m4a +1138.mp3 +1359.mp3 +1219.mp3 +10.m4a +1202.mp3 +965.mp3 +1023.mp3 +1375.mp3 +1140.mp3 +1039.mp3 +1083.mp3 +1092.mp3 1052.mp3 -1053.mp3 -1054.mp3 +1310.mp3 +1462.mp3 +10021.mp3 +1007.mp3 +690.m4a +1242.mp3 +1120.mp3 +1496.mp3 +576.m4a +1167.mp3 +652.m4a 1055.mp3 -1056.mp3 -1058.mp3 -1059.mp3 -1060.mp3 -1061.mp3 -1062.mp3 -1063.mp3 +1419.mp3 +676.m4a +416.m4a +1316.mp3 +1288.mp3 +634.m4a +1299.mp3 +648.m4a +1268.mp3 +1078.mp3 +1459.mp3 +524.m4a +978.mp3 +1114.mp3 +614.m4a +1218.mp3 1064.mp3 -1066.mp3 -1067.mp3 -1068.mp3 -1069.mp3 -1070.mp3 -1071.mp3 -1072.mp3 +1463.mp3 +612.m4a +1122.mp3 +1232.mp3 +1258.mp3 +408.m4a +1408.mp3 +402.m4a +1306.mp3 1074.mp3 -1075.mp3 -1076.mp3 -1077.mp3 -1078.mp3 +983.mp3 +1069.mp3 +8.m4a +1126.mp3 +1335.mp3 +1062.mp3 +10008.mp3 +370.m4a +1272.mp3 +1326.mp3 +1429.mp3 +1124.mp3 +320.m4a +1196.mp3 +1464.mp3 +1350.mp3 +12.m4a +1099.mp3 +1054.mp3 +1435.mp3 +1439.mp3 +372.m4a +1269.mp3 +568.m4a +1422.mp3 +10020.mp3 +10009.mp3 
+307.m4a +1109.mp3 +1206.mp3 +1318.mp3 +350.m4a +1450.mp3 +360.m4a +963.mp3 +476.m4a +1251.mp3 +1132.mp3 +1011.mp3 +1424.mp3 +492.m4a +1005.mp3 +1266.mp3 1079.mp3 -1080.mp3 -1082.mp3 -1083.mp3 +1115.mp3 +1360.mp3 +1175.mp3 +1431.mp3 +1294.mp3 +520.m4a +1245.mp3 +410.m4a +1239.mp3 +468.m4a +16.m4a +1195.mp3 +1151.mp3 +1493.mp3 1084.mp3 -1085.mp3 -1086.mp3 -1087.mp3 -1088.mp3 -1090.mp3 -1091.mp3 -1092.mp3 -1093.mp3 -1095.mp3 -1096.mp3 -1098.mp3 -1099.mp3 -1101.mp3 -1102.mp3 -1103.mp3 +1240.mp3 +1378.mp3 +1037.mp3 +988.mp3 +324.m4a 1104.mp3 -1106.mp3 -1107.mp3 -1108.mp3 -1109.mp3 -1110.mp3 -1111.mp3 +979.mp3 +424.m4a +1467.mp3 +975.mp3 +364.m4a +1171.mp3 +10026.mp3 +1285.mp3 +668.m4a +1189.mp3 +1291.mp3 +596.m4a +1261.mp3 +1072.mp3 +442.m4a +356.m4a +1148.mp3 +956.mp3 +1070.mp3 +482.m4a +396.m4a +1067.mp3 +486.m4a 1112.mp3 -1114.mp3 -1115.mp3 -1116.mp3 -1117.mp3 -1118.mp3 +358.m4a +982.mp3 +1173.mp3 +334.m4a +1262.mp3 +1412.mp3 +1315.mp3 +1309.mp3 +1106.mp3 +1287.mp3 +570.m4a +1389.mp3 +1135.mp3 1119.mp3 -1120.mp3 -1122.mp3 -1123.mp3 -1124.mp3 +1407.mp3 +1075.mp3 +666.m4a +1207.mp3 +1367.mp3 +1362.mp3 +1451.mp3 +998.mp3 +1246.mp3 +1381.mp3 +1101.mp3 +1003.mp3 1125.mp3 -1126.mp3 -1127.mp3 -1128.mp3 -1130.mp3 -1131.mp3 -1132.mp3 -1133.mp3 -1134.mp3 -1135.mp3 -1136.mp3 -1138.mp3 -1139.mp3 -1140.mp3 -1141.mp3 -1142.mp3 -1143.mp3 -1144.mp3 -1146.mp3 -1147.mp3 -1148.mp3 -1149.mp3 -1150.mp3 -1151.mp3 -1152.mp3 +1386.mp3 +536.m4a +1238.mp3 +1095.mp3 +994.mp3 +1088.mp3 +394.m4a +46.m4a 1154.mp3 -1155.mp3 -1156.mp3 -1157.mp3 +1264.mp3 +1077.mp3 +1188.mp3 +1472.mp3 +1134.mp3 +1293.mp3 +1117.mp3 +1053.mp3 +658.m4a +1461.mp3 +422.m4a +1215.mp3 +1045.mp3 +317.m4a 1158.mp3 +1346.mp3 +1194.mp3 +1446.mp3 +10022.mp3 1159.mp3 -1160.mp3 -1162.mp3 -1163.mp3 -1164.mp3 -1165.mp3 -1166.mp3 -1167.mp3 +1368.mp3 +1332.mp3 +1096.mp3 +502.m4a +1394.mp3 1168.mp3 -1170.mp3 -1171.mp3 -1172.mp3 -1173.mp3 +1181.mp3 +610.m4a +392.m4a +322.m4a +1371.mp3 +39.m4a +560.m4a +1180.mp3 +1338.mp3 +1443.mp3 
+1111.mp3 +1432.mp3 +532.m4a +496.m4a +1482.mp3 +981.mp3 +311.m4a +366.m4a +694.m4a +1212.mp3 +1102.mp3 +997.mp3 +646.m4a +1042.mp3 +1060.mp3 1174.mp3 -1175.mp3 +1382.mp3 +959.mp3 +554.m4a +510.m4a +1247.mp3 +1213.mp3 +323.m4a +10017.mp3 +1082.mp3 +1110.mp3 +1307.mp3 +1495.mp3 +1296.mp3 +10016.mp3 +1108.mp3 +1364.mp3 +1470.mp3 +1021.mp3 +1492.mp3 +1484.mp3 +654.m4a +504.m4a +30.m4a +1235.mp3 +10027.mp3 +1211.mp3 1176.mp3 -1178.mp3 -1179.mp3 -1180.mp3 -1181.mp3 -1182.mp3 -1183.mp3 +1015.mp3 +574.m4a +1314.mp3 +1494.mp3 +1405.mp3 +999.mp3 +10014.mp3 +990.mp3 +1071.mp3 1184.mp3 -1186.mp3 -1187.mp3 -1188.mp3 -1189.mp3 -1190.mp3 -1191.mp3 -1192.mp3 -1194.mp3 -1195.mp3 -1196.mp3 -1197.mp3 -1198.mp3 +506.m4a +1336.mp3 1199.mp3 -12.m4a -1200.mp3 -1202.mp3 -1203.mp3 -1204.mp3 -1205.mp3 -1206.mp3 -1207.mp3 -1208.mp3 -1210.mp3 -1211.mp3 -1212.mp3 -1213.mp3 -1214.mp3 -1215.mp3 -1216.mp3 -1218.mp3 -1219.mp3 -1220.mp3 -1221.mp3 1222.mp3 -1223.mp3 -1224.mp3 -1226.mp3 -1227.mp3 -1228.mp3 -1229.mp3 -1230.mp3 -1231.mp3 -1232.mp3 -1234.mp3 -1235.mp3 -1236.mp3 -1237.mp3 -1238.mp3 -1239.mp3 -1240.mp3 -1242.mp3 -1243.mp3 -1244.mp3 -1245.mp3 -1246.mp3 -1247.mp3 -1248.mp3 -1250.mp3 -1251.mp3 +976.mp3 +1128.mp3 +1044.mp3 +1000.mp3 +1051.mp3 +1442.mp3 +24.m4a +1210.mp3 +578.m4a +564.m4a +1032.mp3 +1437.mp3 +10029.mp3 +1406.mp3 +1379.mp3 +1347.mp3 +1456.mp3 +1438.mp3 +508.m4a +1022.mp3 +1308.mp3 +1413.mp3 +1012.mp3 +3.m4a +1127.mp3 1253.mp3 -1254.mp3 -1256.mp3 -1258.mp3 -1259.mp3 +10035.mp3 +1390.mp3 +980.mp3 +1351.mp3 +368.m4a +1317.mp3 +1150.mp3 +550.m4a +967.mp3 +630.m4a +1342.mp3 +968.mp3 1260.mp3 -1261.mp3 -1262.mp3 -1263.mp3 -1264.mp3 -1266.mp3 -1267.mp3 -1268.mp3 -1269.mp3 -1270.mp3 -1271.mp3 -1272.mp3 -1274.mp3 +1383.mp3 +1428.mp3 +590.m4a +1468.mp3 +1133.mp3 +1324.mp3 +1444.mp3 +1118.mp3 +1008.mp3 +10019.mp3 +1420.mp3 +448.m4a +606.m4a +1029.mp3 +10007.mp3 +1160.mp3 +1447.mp3 +548.m4a +1415.mp3 +604.m4a +1220.mp3 1275.mp3 -1276.mp3 -1277.mp3 -1278.mp3 +10034.mp3 +336.m4a +1186.mp3 
+1469.mp3 +1475.mp3 +1454.mp3 +1434.mp3 +1418.mp3 +1014.mp3 +686.m4a +1427.mp3 +10031.mp3 1279.mp3 -1280.mp3 +1006.mp3 1282.mp3 +1325.mp3 +1172.mp3 +1280.mp3 +957.mp3 +632.m4a +1043.mp3 +556.m4a +1387.mp3 +1230.mp3 +10030.mp3 +984.mp3 +1278.mp3 +1400.mp3 +1143.mp3 +10011.mp3 +1103.mp3 +1491.mp3 +662.m4a 1283.mp3 -1284.mp3 -1285.mp3 -1286.mp3 -1287.mp3 -1288.mp3 -1290.mp3 -1291.mp3 +1334.mp3 +1068.mp3 +1228.mp3 +1066.mp3 +696.m4a +1116.mp3 +1056.mp3 +335.m4a +1348.mp3 +674.m4a 1292.mp3 -1293.mp3 -1294.mp3 -1295.mp3 -1296.mp3 -1298.mp3 -1299.mp3 -13.m4a -1300.mp3 -1301.mp3 -1302.mp3 -1303.mp3 +1156.mp3 1304.mp3 -1306.mp3 -1307.mp3 -1308.mp3 -1309.mp3 -1310.mp3 -1311.mp3 -1312.mp3 -1314.mp3 -1315.mp3 -1316.mp3 -1317.mp3 -1318.mp3 -1319.mp3 +1197.mp3 +1013.mp3 +1355.mp3 +1216.mp3 +1380.mp3 +1426.mp3 1320.mp3 -1322.mp3 -1323.mp3 -1324.mp3 -1325.mp3 -1326.mp3 -1327.mp3 -1328.mp3 -1330.mp3 -1331.mp3 -1332.mp3 -1333.mp3 -1334.mp3 -1335.mp3 -1336.mp3 -1338.mp3 -1339.mp3 +352.m4a +1267.mp3 +1085.mp3 +325.m4a +620.m4a +640.m4a +1234.mp3 +1203.mp3 +1163.mp3 +22.m4a +10018.mp3 +1479.mp3 +622.m4a +1487.mp3 +1486.mp3 +344.m4a +1200.mp3 1340.mp3 +1018.mp3 +1388.mp3 +1363.mp3 +1187.mp3 +1139.mp3 +960.mp3 +1229.mp3 +1208.mp3 +1034.mp3 +1178.mp3 +562.m4a 1341.mp3 -1342.mp3 -1343.mp3 -1346.mp3 -1347.mp3 -1348.mp3 +1303.mp3 +1477.mp3 +1058.mp3 +1020.mp3 1349.mp3 -1350.mp3 -1351.mp3 -1352.mp3 -1354.mp3 -1355.mp3 -1356.mp3 -1357.mp3 -1358.mp3 -1359.mp3 -1360.mp3 -1362.mp3 -1363.mp3 -1364.mp3 -1365.mp3 -1366.mp3 -1367.mp3 -1368.mp3 -1370.mp3 -1371.mp3 -1372.mp3 -1373.mp3 -1374.mp3 -1375.mp3 -1376.mp3 -1378.mp3 -1379.mp3 -1380.mp3 -1381.mp3 -1382.mp3 -1383.mp3 -1384.mp3 -1386.mp3 -1387.mp3 -1388.mp3 -1389.mp3 -1390.mp3 -1391.mp3 -1392.mp3 -1394.mp3 -1395.mp3 -1396.mp3 -1397.mp3 -1398.mp3 -1399.mp3 -14.m4a -1400.mp3 -1402.mp3 -1403.mp3 -1404.mp3 -1405.mp3 -1406.mp3 -1407.mp3 -1408.mp3 -1410.mp3 +650.m4a +1190.mp3 +1295.mp3 +962.mp3 +514.m4a +972.mp3 +586.m4a +1312.mp3 +664.m4a 1411.mp3 
-1412.mp3 -1413.mp3 -1414.mp3 -1415.mp3 -1418.mp3 -1419.mp3 -1420.mp3 -1421.mp3 -1422.mp3 -1423.mp3 -1424.mp3 -1426.mp3 -1427.mp3 -1428.mp3 -1429.mp3 -1430.mp3 -1431.mp3 -1432.mp3 -1434.mp3 -1435.mp3 -1436.mp3 -1437.mp3 -1438.mp3 -1439.mp3 -1440.mp3 -1442.mp3 -1443.mp3 -1444.mp3 -1445.mp3 -1446.mp3 -1447.mp3 -1448.mp3 -1450.mp3 -1451.mp3 -1452.mp3 -1453.mp3 -1454.mp3 -1455.mp3 -1456.mp3 +1277.mp3 +1366.mp3 +1231.mp3 +386.m4a 1458.mp3 -1459.mp3 -1460.mp3 -1461.mp3 -1462.mp3 -1463.mp3 -1464.mp3 -1466.mp3 -1467.mp3 -1468.mp3 -1469.mp3 -1470.mp3 -1472.mp3 -1474.mp3 -1475.mp3 -1476.mp3 -1477.mp3 -1478.mp3 -1479.mp3 -1482.mp3 -1483.mp3 -1484.mp3 -1485.mp3 -1486.mp3 -1487.mp3 -1488.mp3 -1490.mp3 -1491.mp3 -1492.mp3 -1493.mp3 -1494.mp3 -1495.mp3 -1496.mp3 +1263.mp3 +602.m4a +382.m4a +1248.mp3 +1146.mp3 +328.m4a +10028.mp3 +1061.mp3 +466.m4a +528.m4a +1452.mp3 1498.mp3 -16.m4a -18.m4a -20.m4a -22.m4a -24.m4a -3.m4a -30.m4a -306.m4a -307.m4a +636.m4a +1398.mp3 +1373.mp3 +1290.mp3 +1183.mp3 +1298.mp3 +1237.mp3 +1323.mp3 +10015.mp3 +1198.mp3 +518.m4a +10010.mp3 +1098.mp3 +1047.mp3 +1165.mp3 +1191.mp3 +348.m4a +1466.mp3 +1019.mp3 +1453.mp3 +428.m4a +624.m4a +1391.mp3 +958.mp3 +973.mp3 +1259.mp3 +1370.mp3 310.m4a -311.m4a -317.m4a -320.m4a -322.m4a -323.m4a -324.m4a -325.m4a -328.m4a -334.m4a -335.m4a -336.m4a -338.m4a -37.m4a +1330.mp3 +692.m4a diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index f3d277a..b13f211 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -81,10 +81,9 @@ def compute_context_windows(features): data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) feature_count = 0 - num_beats = features.shape[1] - - for k in range(padding, num_beats-padding): + num_padded_features = features.shape[1] + for k in range(padding, num_padded_features - padding): if feature_count > n_preallocate: break From 27c885470d961a9866df3d9acaba817ad7e87fbe Mon Sep 17 00:00:00 2001 From: Ben 
Osheroff Date: Thu, 18 Mar 2021 00:13:46 -0700 Subject: [PATCH 04/35] help peak-finding algo along --- Python/track_segmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index b13f211..a7181ae 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -108,7 +108,6 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): """ predictions = np.squeeze(predictions) - breakpoint() print("raw predicitions:") print_predictions(predictions, beat_times) @@ -117,7 +116,8 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): print("after post-processing:") print_predictions(predictions, beat_times) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) + predictions = np.insert(predictions, 0, 0) + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 segment_times = beat_times[peak_loc] print("beat_num\ttime:") From 1856582cc5c00b1dd027739f0d0b596a273a37b5 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:16:30 -0700 Subject: [PATCH 05/35] script to load more files into the dataset --- add_files.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 add_files.sh diff --git a/add_files.sh b/add_files.sh new file mode 100755 index 0000000..1216e78 --- /dev/null +++ b/add_files.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +IN_LIST=/tmp/add_files.present +CURRENT=/tmp/add_files.exist + +cat Data/test_tracks.txt Data/train_tracks.txt | sort > $IN_LIST +(cd ~/src/salami-audio && ls -1 *.{mp3,m4a}) | sort > $CURRENT + +newfiles=`comm -3 $IN_LIST $CURRENT | sort -R` +count=`comm -3 $IN_LIST $CURRENT | wc -l` + +i=0 +for x in $newfiles +do + if [ "$i" -gt "$(($count / 9 - 1))" ] + then + echo "$x" to train_tracks + echo $x >> Data/train_tracks.txt + else + echo "$x" to test_tracks + echo $x >> Data/test_tracks.txt + fi + i=$(($i + 1)) +done From 
835cd91aacdd7c3c3027be7d3e22a3544a66c79a Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 00:16:50 -0700 Subject: [PATCH 06/35] convenience script to predict uploaded stuffs --- track_segment.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 track_segment.sh diff --git a/track_segment.sh b/track_segment.sh new file mode 100755 index 0000000..9002756 --- /dev/null +++ b/track_segment.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd Python +mkdir -p ~/src/salami-data-public/annotations/$1/parsed +python ./track_segmentation.py ~/src/salami-audio/$1.* ~/src/salami-data-public/annotations/$1/parsed/predicted.txt From 959e5c1045d98a61e515579d2730377f0f203b02 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 18 Mar 2021 07:00:22 -0700 Subject: [PATCH 07/35] latest --- Python/evaluation.py | 23 +++++++++-------------- Python/feature_extraction.py | 16 ++++++++++------ Python/train_segmentation_cnn.py | 2 +- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index db3c54e..8effcfa 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -15,7 +15,6 @@ predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -f_measure_thresh = 3 # tolerance window in seconds def load_data(preds_file, file_lists): @@ -64,9 +63,7 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): return preds_track - -if __name__ == "__main__": - +def run_eval(f_measure_thresh): f_measures = [] precisions = [] recalls = [] @@ -75,9 +72,6 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds = np.reshape(preds, len(preds)) for i, f in enumerate(test_files): - - print("Evaluating {}".format(f)) - # load annotations segment_times = get_segment_times(f, paths.annotations_path) @@ -88,9 +82,9 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds_track = 
np.squeeze(np.asarray(preds[test_idx == i])) # post processing - preds_track = post_processing(preds_track, beat_numbers) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.2) - + preds_track = post_processing(preds_track, beat_numbers, emphasize_downbeat=False) + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 pred_times = beat_times[peak_loc] # compute f-measure @@ -100,14 +94,15 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): precisions.append(p) recalls.append(r) - print("f-Measure: {}, precision: {}, recall: {}".format(f_score, p, r)) + #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) - print(" ") - print("Mean scores across all test tracks:") - print("f-Measure: {}, precision: {}, recall: {}".format(mean_f, mean_p, mean_r)) + print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) +if __name__ == "__main__": + run_eval(0.5) + run_eval(3.0) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 146e768..0bf4178 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -20,6 +20,7 @@ import random import pickle import paths +import multiprocessing, logging from utils import * import scipy @@ -76,8 +77,11 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 beat_melspec = np.max(mel_spec[:, beat_frames[0]:beat_frames[1]], axis=1) for k in range(1, beat_frames.shape[0]-1): - beat_melspec = np.column_stack((beat_melspec, - np.max(mel_spec[:, beat_frames[k]:beat_frames[k+1]], axis=1))) + try: + beat_melspec = np.column_stack((beat_melspec, + np.max(mel_spec[:, beat_frames[k]:beat_frames[k+1]], axis=1))) + except: + breakpoint() beat_melspec = np.column_stack((beat_melspec, 
mel_spec[:, beat_frames.shape[0]])) return beat_melspec @@ -114,12 +118,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - #for i, f in enumerate(audio_files): - # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - #beat_mls, beat_times = async_res[i].get() - beat_mls, beat_times = compute_features(logger, f, i , audio_files) + beat_mls, beat_times = async_res[i].get() + #beat_mls, beat_times = compute_features(logger, f, i , audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 9d3fd17..d87106e 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -139,4 +139,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh if __name__ == "__main__": - train_model() + train_model(nb_epoch=300) From 89cd35c54fe2bc9f4c8fdb70591b0e816fab50e6 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 19 Mar 2021 03:01:08 -0700 Subject: [PATCH 08/35] quiet, librosa --- Python/feature_extraction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 81e4d99..ee87736 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -20,6 +20,7 @@ import random import pickle import paths +import warnings import multiprocessing, logging @@ -61,7 +62,9 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 else: path = os.path.join(paths.audio_path, filename) - y, sr = librosa.load(path, sr=22050, mono=True) + with warnings.catch_warnings(): + 
warnings.simplefilter("ignore") + y, sr = librosa.load(path, sr=22050, mono=True) spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) From 974ddc5c3857c0414ece9e4b118e9b734889031b Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sat, 20 Mar 2021 14:46:25 -0700 Subject: [PATCH 09/35] output best and worst tracks in validation set --- Python/evaluation.py | 21 +++++++++++++++++++++ Python/train_segmentation_cnn.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8effcfa..c5edab4 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -13,6 +13,8 @@ import mir_eval import paths +from operator import itemgetter + predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' @@ -63,6 +65,10 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): return preds_track +def get_sort_key(item): + return item[1] + + def run_eval(f_measure_thresh): f_measures = [] precisions = [] @@ -102,6 +108,21 @@ def run_eval(f_measure_thresh): print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) + combined_tracks = list(zip(test_files, f_measures, precisions, recalls)) + sorted_tracks = sorted(combined_tracks, key=get_sort_key) + print("worst:") + for x in range(3): + track = sorted_tracks[x] + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}".format(*track)) + + print("best:") + for x in range(1,4): + track = sorted_tracks[-x] + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}".format(*track)) + + + + if __name__ == "__main__": run_eval(0.5) run_eval(3.0) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d87106e..58c5947 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -139,4 +139,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', 
weigh if __name__ == "__main__": - train_model(nb_epoch=300) + train_model(nb_epoch=75) From 47fedb0fe9d77d33aec20d13c67dd8263b4cf7c6 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 22 Mar 2021 17:37:49 -0700 Subject: [PATCH 10/35] WIP: sslm --- Python/feature_extraction.py | 141 +++++++++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 22 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index ee87736..2fcc223 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -26,6 +26,9 @@ from utils import * import scipy +import skimage.measure +from scipy.spatial import distance + context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands @@ -33,40 +36,110 @@ pos_frames_oversample = 5 # oversample positive frames because there are too few mid_frames_oversample = 3 # oversample frames between segments label_smearing = 1 # how many frames are positive examples around an annotation +padding_length = int(context_length / 2) + +max_pool = 2 random.seed(1234) # for reproducibility np.random.seed(1234) - -def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): +def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously extracted beat times. 
- :param filename: path to audio file + :param waveform: raw waveform data :param beat_times: list of beat times in seconds :param mel_bands: number of Mel bands :param fft_size: FFT size :param hop_size: hop size for FFT processing - :return: beat Mel spectrogram (mel_bands x frames) + :return: beat sslm """ + S = librosa.feature.melspectrogram(y=waveform, sr=22050, n_fft=fft_size, hop_length=hop_size, n_mels=mel_bands, fmin=80, fmax=16000, win_length=fft_size, window=scipy.signal.hamming) - computed_mls_file = paths.get_mls_path(filename) + S_to_dB = librosa.power_to_db(S,ref=np.max) - if os.path.exists(computed_mls_file): - return np.load(computed_mls_file) + # pad 130 frames (to be 65) with noise at -70dB at the beginning + pad = np.full((S_to_dB.shape[0], context_length * 2), -70) + S_padded = np.concatenate((pad, S_to_dB), axis=1) + # downsample initial spectrogram + x_prime = skimage.measure.block_reduce(S_padded, (1,max_pool), np.max) - if "/" in filename: - path = filename - else: - path = os.path.join(paths.audio_path, filename) + MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') + MFCCs = MFCCs[1:,:] + 1 - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - y, sr = librosa.load(path, sr=22050, mono=True) + # this seems to group two frames together + m = 2 + x = [np.roll(MFCCs,n,axis=1) for n in range(m)] + x_hat = np.concatenate(x, axis=0) - spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + #Cosine distance calculation: D[N/p,L/p] matrix + distances = np.zeros((x_hat.shape[1], context_length)) #D has as dimensions N/p and L/p + for i in range(x_hat.shape[1]): #iteration in columns of x_hat + for l in range(context_length): + if i-(l+1) < 0: + cosine_dist = 1 + elif i-(l+1) < context_length: + cosine_dist = 1 + else: + cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L + distances[i,l] = cosine_dist + + #Threshold epsilon[N/p,L/p] 
calculation + kappa = 0.1 #equalization factor of 10% + epsilon = np.zeros((distances.shape[0], context_length)) #D has as dimensions N/p and L/p + for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat + for l in range(context_length): + epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + + + #Removing initial padding now taking into account the max-poolin factor + distances = distances[context_length:,:] + epsilon = epsilon[context_length:,:] + x_prime = x_prime[:,context_length:] + + + #Self Similarity Lag Matrix + sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + sslm = np.transpose(sslm) + + # the paper further downsamples by 3, but since we're doing beat-frames only might be ok + #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) + #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) + + #Check if SSLM has nans and if it has them, substitute them by 0 + for i in range(sslm.shape[0]): + for j in range(sslm.shape[1]): + if np.isnan(sslm[i,j]): + sslm[i,j] = 0 + + beat_frames = np.round(beat_times * (22050. / hop_size)).astype('int') + beat_sslms = np.zeros((65, 65, beat_frames.shape[0])) + + for k in range(beat_frames.shape[0]): + sslm_frame = beat_frames[k] // max_pool + sslm_frame_min = sslm_frame - context_length // 2 + sslm_frame_max = sslm_frame + context_length // 2 + 1 + breakpoint() + beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] + + breakpoint() + return sslm + +def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + """ + Compute average Mel log spectrogram per beat given previously + extracted beat times. 
+ + :param filename: path to audio file + :param beat_times: list of beat times in seconds + :param mel_bands: number of Mel bands + :param fft_size: FFT size + :param hop_size: hop size for FFT processing + :return: beat Mel spectrogram (mel_bands x frames) + """ + spec = np.abs(librosa.stft(y=features, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming)) mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) @@ -86,17 +159,41 @@ def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=102 beat_melspec = np.column_stack((beat_melspec, mel_spec[:, beat_frames.shape[0]])) - np.save(computed_mls_file, beat_melspec) - return beat_melspec +def load_waveform(filename): + if "/" in filename: + path = filename + else: + path = os.path.join(paths.audio_path, filename) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + y, sr = librosa.load(path, sr=22050, mono=True) + return y + +def get_cached_features(filename): + computed_mls_file = paths.get_mls_path(filename) + + if os.path.exists(computed_mls_file): + return np.load(computed_mls_file) + else: + return None def compute_features(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - beat_mls = compute_beat_mls(f, beat_times) + cached_features = get_cached_features(f) + + if cached_features is not None: + return cached_features + + waveform = load_waveform(f) + + beat_mls = compute_beat_mls(waveform, beat_times) + compute_sslm(waveform, beat_times) beat_mls /= np.max(beat_mls) return beat_mls, beat_times @@ -122,11 +219,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, 
f, i, audio_files, ))) + #for i, f in enumerate(audio_files): + # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - beat_mls, beat_times = async_res[i].get() + #beat_mls, beat_times = async_res[i].get() + beat_mls, beat_times = compute_features(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -201,7 +299,6 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): feature_count = 0 current_track = 0 - padding_length = int(context_length / 2) for features, labels in zip(feature_list, labels_list): From c38c8fa949d68eaacd95d92f91d15bab5e218680 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 24 Mar 2021 23:01:33 -0700 Subject: [PATCH 11/35] this does something!!!! --- Data/test_tracks.txt | 69 ---- Data/train_tracks.txt | 574 ------------------------------- Python/feature_extraction.py | 107 +++--- Python/train_segmentation_cnn.py | 53 ++- 4 files changed, 106 insertions(+), 697 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 62f4bd4..fb1b766 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,72 +1,3 @@ 1166.mp3 40.m4a 1090.mp3 -584.m4a -346.m4a -1026.mp3 -1142.mp3 -1302.mp3 -1131.mp3 -608.m4a -1274.mp3 -1376.mp3 -670.m4a -1399.mp3 -1319.mp3 -18.m4a -1123.mp3 -342.m4a -10013.mp3 -642.m4a -306.m4a -1488.mp3 -516.m4a -1192.mp3 -10024.mp3 -1357.mp3 -404.m4a -1063.mp3 -1331.mp3 -1356.mp3 -1322.mp3 -1170.mp3 -1440.mp3 -1091.mp3 -964.mp3 -1436.mp3 -1414.mp3 -1474.mp3 -1036.mp3 -1040.mp3 -426.m4a -1087.mp3 -1301.mp3 -970.mp3 -1141.mp3 -1250.mp3 -1483.mp3 -992.mp3 -1223.mp3 -1284.mp3 -10012.mp3 -472.m4a -6.m4a -986.mp3 -678.m4a -1227.mp3 -1152.mp3 -5.m4a -1270.mp3 -488.m4a -1311.mp3 -1421.mp3 -1402.mp3 -522.m4a -354.m4a -1276.mp3 -1339.mp3 -1236.mp3 -1445.mp3 -1221.mp3 -1244.mp3 -1080.mp3 diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 
e2e2baf..d45159d 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,577 +1,3 @@ 1136.mp3 1343.mp3 1027.mp3 -971.mp3 -484.m4a -1130.mp3 -10032.mp3 -991.mp3 -616.m4a -1076.mp3 -478.m4a -1300.mp3 -1333.mp3 -1395.mp3 -440.m4a -1004.mp3 -1372.mp3 -512.m4a -1155.mp3 -1397.mp3 -1485.mp3 -1024.mp3 -1093.mp3 -660.m4a -1254.mp3 -1460.mp3 -1149.mp3 -338.m4a -1396.mp3 -52.m4a -987.mp3 -1384.mp3 -1423.mp3 -594.m4a -1107.mp3 -1410.mp3 -1030.mp3 -1403.mp3 -14.m4a -20.m4a -480.m4a -1455.mp3 -37.m4a -995.mp3 -1430.mp3 -1147.mp3 -1392.mp3 -1164.mp3 -1205.mp3 -626.m4a -1182.mp3 -444.m4a -1448.mp3 -4.m4a -1374.mp3 -996.mp3 -1328.mp3 -1365.mp3 -1358.mp3 -989.mp3 -1478.mp3 -1157.mp3 -1144.mp3 -1286.mp3 -384.m4a -1179.mp3 -1404.mp3 -1256.mp3 -974.mp3 -1271.mp3 -498.m4a -1327.mp3 -618.m4a -1354.mp3 -966.mp3 -955.mp3 -1035.mp3 -1046.mp3 -1352.mp3 -10023.mp3 -1224.mp3 -1204.mp3 -1038.mp3 -1059.mp3 -534.m4a -420.m4a -1490.mp3 -474.m4a -1243.mp3 -1086.mp3 -1226.mp3 -1048.mp3 -1476.mp3 -1214.mp3 -10033.mp3 -1162.mp3 -340.m4a -13.m4a -10025.mp3 -450.m4a -1138.mp3 -1359.mp3 -1219.mp3 -10.m4a -1202.mp3 -965.mp3 -1023.mp3 -1375.mp3 -1140.mp3 -1039.mp3 -1083.mp3 -1092.mp3 -1052.mp3 -1310.mp3 -1462.mp3 -10021.mp3 -1007.mp3 -690.m4a -1242.mp3 -1120.mp3 -1496.mp3 -576.m4a -1167.mp3 -652.m4a -1055.mp3 -1419.mp3 -676.m4a -416.m4a -1316.mp3 -1288.mp3 -634.m4a -1299.mp3 -648.m4a -1268.mp3 -1078.mp3 -1459.mp3 -524.m4a -978.mp3 -1114.mp3 -614.m4a -1218.mp3 -1064.mp3 -1463.mp3 -612.m4a -1122.mp3 -1232.mp3 -1258.mp3 -408.m4a -1408.mp3 -402.m4a -1306.mp3 -1074.mp3 -983.mp3 -1069.mp3 -8.m4a -1126.mp3 -1335.mp3 -1062.mp3 -10008.mp3 -370.m4a -1272.mp3 -1326.mp3 -1429.mp3 -1124.mp3 -320.m4a -1196.mp3 -1464.mp3 -1350.mp3 -12.m4a -1099.mp3 -1054.mp3 -1435.mp3 -1439.mp3 -372.m4a -1269.mp3 -568.m4a -1422.mp3 -10020.mp3 -10009.mp3 -307.m4a -1109.mp3 -1206.mp3 -1318.mp3 -350.m4a -1450.mp3 -360.m4a -963.mp3 -476.m4a -1251.mp3 -1132.mp3 -1011.mp3 -1424.mp3 -492.m4a -1005.mp3 -1266.mp3 -1079.mp3 -1115.mp3 
-1360.mp3 -1175.mp3 -1431.mp3 -1294.mp3 -520.m4a -1245.mp3 -410.m4a -1239.mp3 -468.m4a -16.m4a -1195.mp3 -1151.mp3 -1493.mp3 -1084.mp3 -1240.mp3 -1378.mp3 -1037.mp3 -988.mp3 -324.m4a -1104.mp3 -979.mp3 -424.m4a -1467.mp3 -975.mp3 -364.m4a -1171.mp3 -10026.mp3 -1285.mp3 -668.m4a -1189.mp3 -1291.mp3 -596.m4a -1261.mp3 -1072.mp3 -442.m4a -356.m4a -1148.mp3 -956.mp3 -1070.mp3 -482.m4a -396.m4a -1067.mp3 -486.m4a -1112.mp3 -358.m4a -982.mp3 -1173.mp3 -334.m4a -1262.mp3 -1412.mp3 -1315.mp3 -1309.mp3 -1106.mp3 -1287.mp3 -570.m4a -1389.mp3 -1135.mp3 -1119.mp3 -1407.mp3 -1075.mp3 -666.m4a -1207.mp3 -1367.mp3 -1362.mp3 -1451.mp3 -998.mp3 -1246.mp3 -1381.mp3 -1101.mp3 -1003.mp3 -1125.mp3 -1386.mp3 -536.m4a -1238.mp3 -1095.mp3 -994.mp3 -1088.mp3 -394.m4a -46.m4a -1154.mp3 -1264.mp3 -1077.mp3 -1188.mp3 -1472.mp3 -1134.mp3 -1293.mp3 -1117.mp3 -1053.mp3 -658.m4a -1461.mp3 -422.m4a -1215.mp3 -1045.mp3 -317.m4a -1158.mp3 -1346.mp3 -1194.mp3 -1446.mp3 -10022.mp3 -1159.mp3 -1368.mp3 -1332.mp3 -1096.mp3 -502.m4a -1394.mp3 -1168.mp3 -1181.mp3 -610.m4a -392.m4a -322.m4a -1371.mp3 -39.m4a -560.m4a -1180.mp3 -1338.mp3 -1443.mp3 -1111.mp3 -1432.mp3 -532.m4a -496.m4a -1482.mp3 -981.mp3 -311.m4a -366.m4a -694.m4a -1212.mp3 -1102.mp3 -997.mp3 -646.m4a -1042.mp3 -1060.mp3 -1174.mp3 -1382.mp3 -959.mp3 -554.m4a -510.m4a -1247.mp3 -1213.mp3 -323.m4a -10017.mp3 -1082.mp3 -1110.mp3 -1307.mp3 -1495.mp3 -1296.mp3 -10016.mp3 -1108.mp3 -1364.mp3 -1470.mp3 -1021.mp3 -1492.mp3 -1484.mp3 -654.m4a -504.m4a -30.m4a -1235.mp3 -10027.mp3 -1211.mp3 -1176.mp3 -1015.mp3 -574.m4a -1314.mp3 -1494.mp3 -1405.mp3 -999.mp3 -10014.mp3 -990.mp3 -1071.mp3 -1184.mp3 -506.m4a -1336.mp3 -1199.mp3 -1222.mp3 -976.mp3 -1128.mp3 -1044.mp3 -1000.mp3 -1051.mp3 -1442.mp3 -24.m4a -1210.mp3 -578.m4a -564.m4a -1032.mp3 -1437.mp3 -10029.mp3 -1406.mp3 -1379.mp3 -1347.mp3 -1456.mp3 -1438.mp3 -508.m4a -1022.mp3 -1308.mp3 -1413.mp3 -1012.mp3 -3.m4a -1127.mp3 -1253.mp3 -10035.mp3 -1390.mp3 -980.mp3 -1351.mp3 -368.m4a -1317.mp3 -1150.mp3 
-550.m4a -967.mp3 -630.m4a -1342.mp3 -968.mp3 -1260.mp3 -1383.mp3 -1428.mp3 -590.m4a -1468.mp3 -1133.mp3 -1324.mp3 -1444.mp3 -1118.mp3 -1008.mp3 -10019.mp3 -1420.mp3 -448.m4a -606.m4a -1029.mp3 -10007.mp3 -1160.mp3 -1447.mp3 -548.m4a -1415.mp3 -604.m4a -1220.mp3 -1275.mp3 -10034.mp3 -336.m4a -1186.mp3 -1469.mp3 -1475.mp3 -1454.mp3 -1434.mp3 -1418.mp3 -1014.mp3 -686.m4a -1427.mp3 -10031.mp3 -1279.mp3 -1006.mp3 -1282.mp3 -1325.mp3 -1172.mp3 -1280.mp3 -957.mp3 -632.m4a -1043.mp3 -556.m4a -1387.mp3 -1230.mp3 -10030.mp3 -984.mp3 -1278.mp3 -1400.mp3 -1143.mp3 -10011.mp3 -1103.mp3 -1491.mp3 -662.m4a -1283.mp3 -1334.mp3 -1068.mp3 -1228.mp3 -1066.mp3 -696.m4a -1116.mp3 -1056.mp3 -335.m4a -1348.mp3 -674.m4a -1292.mp3 -1156.mp3 -1304.mp3 -1197.mp3 -1013.mp3 -1355.mp3 -1216.mp3 -1380.mp3 -1426.mp3 -1320.mp3 -352.m4a -1267.mp3 -1085.mp3 -325.m4a -620.m4a -640.m4a -1234.mp3 -1203.mp3 -1163.mp3 -22.m4a -10018.mp3 -1479.mp3 -622.m4a -1487.mp3 -1486.mp3 -344.m4a -1200.mp3 -1340.mp3 -1018.mp3 -1388.mp3 -1363.mp3 -1187.mp3 -1139.mp3 -960.mp3 -1229.mp3 -1208.mp3 -1034.mp3 -1178.mp3 -562.m4a -1341.mp3 -1303.mp3 -1477.mp3 -1058.mp3 -1020.mp3 -1349.mp3 -650.m4a -1190.mp3 -1295.mp3 -962.mp3 -514.m4a -972.mp3 -586.m4a -1312.mp3 -664.m4a -1411.mp3 -1277.mp3 -1366.mp3 -1231.mp3 -386.m4a -1458.mp3 -1263.mp3 -602.m4a -382.m4a -1248.mp3 -1146.mp3 -328.m4a -10028.mp3 -1061.mp3 -466.m4a -528.m4a -1452.mp3 -1498.mp3 -636.m4a -1398.mp3 -1373.mp3 -1290.mp3 -1183.mp3 -1298.mp3 -1237.mp3 -1323.mp3 -10015.mp3 -1198.mp3 -518.m4a -10010.mp3 -1098.mp3 -1047.mp3 -1165.mp3 -1191.mp3 -348.m4a -1466.mp3 -1019.mp3 -1453.mp3 -428.m4a -624.m4a -1391.mp3 -958.mp3 -973.mp3 -1259.mp3 -1370.mp3 -310.m4a -1330.mp3 -692.m4a diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 2fcc223..df62fc0 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -21,6 +21,7 @@ import pickle import paths import warnings +import time import multiprocessing, logging @@ -55,16 +56,23 @@ 
def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h :param hop_size: hop size for FFT processing :return: beat sslm """ - S = librosa.feature.melspectrogram(y=waveform, sr=22050, n_fft=fft_size, hop_length=hop_size, n_mels=mel_bands, fmin=80, fmax=16000, win_length=fft_size, window=scipy.signal.hamming) + spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + window=scipy.signal.hamming)) + + mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) + s = np.sum(mel_fb, axis=1) + mel_fb = np.divide(mel_fb, s[:, np.newaxis]) + + mel_spec = np.dot(mel_fb, spec) - S_to_dB = librosa.power_to_db(S,ref=np.max) + S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) # pad 130 frames (to be 65) with noise at -70dB at the beginning - pad = np.full((S_to_dB.shape[0], context_length * 2), -70) - S_padded = np.concatenate((pad, S_to_dB), axis=1) + #pad = np.full((S_to_dB.shape[0], context_length * 2), -70) + #S_padded = np.concatenate((pad, S_to_dB), axis=1) + - # downsample initial spectrogram - x_prime = skimage.measure.block_reduce(S_padded, (1,max_pool), np.max) + x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 @@ -74,34 +82,33 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) + # create circular foo + x_hat_length = x_hat.shape[1] + x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) + print("pre-padded: {}, post-padded: {}".format(x_hat.shape, x_padded.shape)) + #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.zeros((x_hat.shape[1], context_length)) #D has as dimensions N/p and L/p - for i in range(x_hat.shape[1]): #iteration in columns of x_hat 
+ distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p + for i in range(context_length, x_padded.shape[1] - context_length): #iteration in columns of x_hat for l in range(context_length): - if i-(l+1) < 0: - cosine_dist = 1 - elif i-(l+1) < context_length: - cosine_dist = 1 - else: - cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L + cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - epsilon = np.zeros((distances.shape[0], context_length)) #D has as dimensions N/p and L/p + t1 = time.time() + epsilon = np.full((distances.shape[0], context_length), 1.0) for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + t2 = time.time() - #Removing initial padding now taking into account the max-poolin factor - distances = distances[context_length:,:] - epsilon = epsilon[context_length:,:] - x_prime = x_prime[:,context_length:] - + print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + #sslm = scipy.special.expit(1-distances) sslm = np.transpose(sslm) # the paper further downsamples by 3, but since we're doing beat-frames only might be ok @@ -109,23 +116,21 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) #Check if SSLM has nans and if it has them, substitute them by 0 - for i in range(sslm.shape[0]): - for j in range(sslm.shape[1]): - if np.isnan(sslm[i,j]): - sslm[i,j] = 0 + #for i in range(sslm.shape[0]): + # for j in range(sslm.shape[1]): + # if np.isnan(sslm[i,j]): + # sslm[i,j] = 0 beat_frames = 
np.round(beat_times * (22050. / hop_size)).astype('int') - beat_sslms = np.zeros((65, 65, beat_frames.shape[0])) + beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) for k in range(beat_frames.shape[0]): - sslm_frame = beat_frames[k] // max_pool + sslm_frame = beat_frames[k] // max_pool + context_length sslm_frame_min = sslm_frame - context_length // 2 sslm_frame_max = sslm_frame + context_length // 2 + 1 - breakpoint() beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] - breakpoint() - return sslm + return beat_sslms def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ @@ -193,9 +198,9 @@ def compute_features(logger, f, i, audio_files): waveform = load_waveform(f) beat_mls = compute_beat_mls(waveform, beat_times) - compute_sslm(waveform, beat_times) + beat_sslm = compute_sslm(waveform, beat_times) beat_mls /= np.max(beat_mls) - return beat_mls, beat_times + return beat_mls, beat_sslm, beat_times def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ @@ -210,6 +215,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ feature_list = [] + sslm_feature_list = [] labels_list = [] failed_tracks_idx = [] @@ -219,12 +225,12 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - #for i, f in enumerate(audio_files): - # async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - #beat_mls, beat_times = async_res[i].get() - beat_mls, beat_times = compute_features(logger, f, i, audio_files) + beat_mls, beat_sslm, beat_times = async_res[i].get() + #beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) label_vec = 
np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -239,9 +245,10 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): label_vec[closest_beat] = 1. feature_list.append(beat_mls) + sslm_feature_list.append(beat_sslm) labels_list.append(label_vec) - return feature_list, labels_list, failed_tracks_idx + return feature_list, sslm_feature_list, labels_list, failed_tracks_idx def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample=10000): @@ -277,7 +284,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample return features, mean_vec, std_vec -def prepare_batch_data(feature_list, labels_list, is_training=True): +def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training=True): """ Reads precomputed beat Mel spectrograms and slices them into context windows for CNN training. For the training set, subsampling is @@ -293,6 +300,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -300,7 +308,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): feature_count = 0 current_track = 0 - for features, labels in zip(feature_list, labels_list): + for features, sslm_features, labels in zip(feature_list, sslm_feature_list, labels_list): print("Processed {} examples from {} tracks".format(feature_count, current_track+1)) @@ -325,6 +333,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1 data_x[feature_count, :, :] = next_window + 
data_sslm_x[feature_count] = sslm_features[:, :, k] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -342,6 +351,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) data_x[feature_count, :, :] = next_window + data_sslm_x[feature_count] = sslm_features[:, :, l] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -361,6 +371,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_window = features[:, l-padding_length: l+padding_length+1] + data_sslm_x[feature_count] = sslm_features[:, :, l] data_x[feature_count, :, :] = next_window data_y[feature_count] = 0 data_weight[feature_count] = 1 @@ -384,6 +395,7 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): next_weight = 1 data_x[feature_count, :, :] = next_window + data_sslm_x[feature_count] = sslm_features[:, :, next_idx - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -399,6 +411,8 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): data_x[feature_count, :, :] = next_window data_y[feature_count] = next_label + data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] + data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -410,11 +424,12 @@ def prepare_batch_data(feature_list, labels_list, is_training=True): break data_x = data_x[:feature_count, :, :] + data_sslm_x = data_sslm_x[:feature_count, :, :] data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] - return data_x, data_y, data_weight, track_idx + return data_x, data_sslm_x, data_y, data_weight, track_idx def load_raw_features(file): @@ -441,11 +456,11 @@ def load_raw_features(file): print("Extracting 
MLS features") - train_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, + train_features, train_sslm_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, paths.beats_path, paths.annotations_path) - test_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, + test_features, test_sslm_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, paths.beats_path, paths.annotations_path) @@ -459,12 +474,12 @@ def load_raw_features(file): del test_files[i] with open('../Data/rawFeatures.pickle', 'wb') as f: - pickle.dump((train_features, train_labels, test_features, test_labels), f) + pickle.dump((train_features, train_sslm_features, train_labels, test_features, test_sslm_features, test_labels), f) # train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - train_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_labels, is_training=True) - test_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_labels, is_training=False) + train_x, train_sslm_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_labels, is_training=True) + test_x, test_sslm_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_labels, is_training=False) train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) @@ -472,8 +487,8 @@ def load_raw_features(file): print("Prepared {} training items and {} test items".format(train_x.shape[0], test_x.shape[0])) # store normalized features for CNN training - np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_y=train_y, train_weights=train_weights) - np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_y=test_y, test_weights=test_weights) + 
np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_y=train_y, train_weights=train_weights) + np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_y=test_y, test_weights=test_weights) np.savez('../Data/normalization.npz', mean_vec=mean_vec, std_vec=std_vec) # store file lists and index mapping to training and test data diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 58c5947..c089c2d 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -12,6 +12,14 @@ from keras.models import Sequential from keras.layers.core import Dense, Dropout, Activation, Flatten from keras.layers.convolutional import Convolution2D, MaxPooling2D +import tensorflow.keras.layers +from tensorflow.keras.models import Model + + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + from keras.callbacks import EarlyStopping from keras.optimizers import SGD @@ -32,10 +40,11 @@ def load_training_data(dataset): data = np.load(dataset) train_x = data['train_x'] + train_sslm_x = data['train_sslm_x'] train_y = data['train_y'] train_weights = data['train_weights'] - return train_x, train_y, train_weights + return train_x, train_sslm_x, train_y, train_weights def load_test_data(dataset): @@ -52,10 +61,11 @@ def load_test_data(dataset): data = np.load(dataset) test_x = data['test_x'] + test_sslm_x = data['test_sslm_x'] test_y = data['test_y'] test_weights = data['test_weights'] - return test_x, test_y, test_weights + return test_x, test_sslm_x, test_y, test_weights def build_model(img_rows, img_cols): @@ -78,6 +88,24 @@ def build_model(img_rows, img_cols): return model +def build_sslm_model(img_rows, img_cols): + + input = layers.Input(shape=(img_rows, img_cols, 1)) + x = layers.Conv2D(16, (8, 8))(input) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D(pool_size=(6, 6))(x) + x = layers.Conv2D(64, (4, 4))(x) + x 
= layers.Activation('relu')(x) + x = layers.Dropout(0.5)(x) + x = layers.Flatten()(x) + x = layers.Dense(256)(x) + x = layers.Activation('relu')(x) + x = layers.Dropout(0.5)(x) + x = layers.Dense(1)(x) + x = layers.Activation('sigmoid')(x) + return Model(inputs = [input], outputs = x) + + def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -90,23 +118,27 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh """ print('loading training data...') - X_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') + X_train, x_sslm_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') print('training data size:') print(X_train.shape) p = np.random.permutation(X_train.shape[0]) X_train = X_train[p, :, :] + x_sslm_train = x_sslm_train[p, :, :] y_train = y_train[p] w_train = w_train[p] X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) + x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] - model = build_model(img_rows, img_cols) + #model = build_model(img_rows, img_cols) + breakpoint() + model = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) if weights_file is not None: model.load_weights(weights_file) @@ -117,21 +149,26 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') - model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x_sslm_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') - X_test, y_test, w_test = 
load_test_data('../Data/testDataNormalized.npz') + X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) + x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict(X_test, batch_size=1, verbose=1) + preds = model.predict(x_sslm_test, batch_size=1, verbose=1) + #preds = model.predict(X_test, batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate(X_test, y_test, verbose=1) + score = model.evaluate(x_sslm_test, y_test, verbose=1) + #score = model.evaluate(X_test, y_test, verbose=1) print('Test score:', score) # save model From a1c271d13d3e5fb693254b7b214112afdf1bfba2 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 05:00:39 -0700 Subject: [PATCH 12/35] train both models --- Python/train_segmentation_cnn.py | 55 +++++++++++--------------------- 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index c089c2d..d9f3277 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -69,42 +69,25 @@ def load_test_data(dataset): def build_model(img_rows, img_cols): - - model = Sequential() - - model.add(Convolution2D(32, (6, 8), input_shape=(img_rows, img_cols, 1))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=(5, 2))) - model.add(Convolution2D(64, (4, 6))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=(2, 2))) - model.add(Dropout(0.5)) - model.add(Flatten()) - model.add(Dense(256)) - model.add(Activation('relu')) - model.add(Dropout(0.5)) - model.add(Dense(1)) - model.add(Activation('sigmoid')) - - return model + input = layers.Input(shape=(img_rows, img_cols, 1)) + x = layers.Conv2D(16, (6, 8), activation='relu')(input) + x = layers.MaxPooling2D(pool_size=(3, 6))(x) + return input, x 
def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) - x = layers.Conv2D(16, (8, 8))(input) - x = layers.Activation('relu')(x) + x = layers.Conv2D(16, (8, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(6, 6))(x) - x = layers.Conv2D(64, (4, 4))(x) - x = layers.Activation('relu')(x) - x = layers.Dropout(0.5)(x) + return input, x + +def build_fused_model(inputs, outputs): + x = layers.Concatenate(axis=1)(outputs) + x = layers.Conv2D(32, (6, 3), activation='relu')(x) x = layers.Flatten()(x) - x = layers.Dense(256)(x) - x = layers.Activation('relu')(x) + x = layers.Dense(256, activation='relu')(x) x = layers.Dropout(0.5)(x) - x = layers.Dense(1)(x) - x = layers.Activation('sigmoid')(x) - return Model(inputs = [input], outputs = x) - + x = layers.Dense(1, activation='sigmoid')(x) + return Model(inputs = inputs, outputs = x) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): @@ -136,9 +119,9 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = X_train.shape[2] - #model = build_model(img_rows, img_cols) - breakpoint() - model = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) + mls_input, mls_output = build_model(img_rows, img_cols) + sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) + model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) if weights_file is not None: model.load_weights(weights_file) @@ -149,7 +132,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=5) print('train model...') - model.fit(x_sslm_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, 
sample_weight=w_train, callbacks=[]) #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, @@ -161,13 +144,13 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict(x_sslm_test, batch_size=1, verbose=1) + preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) #preds = model.predict(X_test, batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate(x_sslm_test, y_test, verbose=1) + score = model.evaluate([X_test, x_sslm_test], y_test, verbose=1) #score = model.evaluate(X_test, y_test, verbose=1) print('Test score:', score) From bd4551347b41405dd727fcab52718acf15922820 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 05:04:39 -0700 Subject: [PATCH 13/35] put. the. candle. back. --- Data/test_tracks.txt | 69 +++++ Data/train_tracks.txt | 574 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 643 insertions(+) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index fb1b766..62f4bd4 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,3 +1,72 @@ 1166.mp3 40.m4a 1090.mp3 +584.m4a +346.m4a +1026.mp3 +1142.mp3 +1302.mp3 +1131.mp3 +608.m4a +1274.mp3 +1376.mp3 +670.m4a +1399.mp3 +1319.mp3 +18.m4a +1123.mp3 +342.m4a +10013.mp3 +642.m4a +306.m4a +1488.mp3 +516.m4a +1192.mp3 +10024.mp3 +1357.mp3 +404.m4a +1063.mp3 +1331.mp3 +1356.mp3 +1322.mp3 +1170.mp3 +1440.mp3 +1091.mp3 +964.mp3 +1436.mp3 +1414.mp3 +1474.mp3 +1036.mp3 +1040.mp3 +426.m4a +1087.mp3 +1301.mp3 +970.mp3 +1141.mp3 +1250.mp3 +1483.mp3 +992.mp3 +1223.mp3 +1284.mp3 +10012.mp3 +472.m4a +6.m4a +986.mp3 +678.m4a +1227.mp3 +1152.mp3 +5.m4a +1270.mp3 +488.m4a +1311.mp3 +1421.mp3 +1402.mp3 +522.m4a +354.m4a +1276.mp3 +1339.mp3 +1236.mp3 +1445.mp3 +1221.mp3 +1244.mp3 +1080.mp3 diff --git a/Data/train_tracks.txt 
b/Data/train_tracks.txt index d45159d..e2e2baf 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,3 +1,577 @@ 1136.mp3 1343.mp3 1027.mp3 +971.mp3 +484.m4a +1130.mp3 +10032.mp3 +991.mp3 +616.m4a +1076.mp3 +478.m4a +1300.mp3 +1333.mp3 +1395.mp3 +440.m4a +1004.mp3 +1372.mp3 +512.m4a +1155.mp3 +1397.mp3 +1485.mp3 +1024.mp3 +1093.mp3 +660.m4a +1254.mp3 +1460.mp3 +1149.mp3 +338.m4a +1396.mp3 +52.m4a +987.mp3 +1384.mp3 +1423.mp3 +594.m4a +1107.mp3 +1410.mp3 +1030.mp3 +1403.mp3 +14.m4a +20.m4a +480.m4a +1455.mp3 +37.m4a +995.mp3 +1430.mp3 +1147.mp3 +1392.mp3 +1164.mp3 +1205.mp3 +626.m4a +1182.mp3 +444.m4a +1448.mp3 +4.m4a +1374.mp3 +996.mp3 +1328.mp3 +1365.mp3 +1358.mp3 +989.mp3 +1478.mp3 +1157.mp3 +1144.mp3 +1286.mp3 +384.m4a +1179.mp3 +1404.mp3 +1256.mp3 +974.mp3 +1271.mp3 +498.m4a +1327.mp3 +618.m4a +1354.mp3 +966.mp3 +955.mp3 +1035.mp3 +1046.mp3 +1352.mp3 +10023.mp3 +1224.mp3 +1204.mp3 +1038.mp3 +1059.mp3 +534.m4a +420.m4a +1490.mp3 +474.m4a +1243.mp3 +1086.mp3 +1226.mp3 +1048.mp3 +1476.mp3 +1214.mp3 +10033.mp3 +1162.mp3 +340.m4a +13.m4a +10025.mp3 +450.m4a +1138.mp3 +1359.mp3 +1219.mp3 +10.m4a +1202.mp3 +965.mp3 +1023.mp3 +1375.mp3 +1140.mp3 +1039.mp3 +1083.mp3 +1092.mp3 +1052.mp3 +1310.mp3 +1462.mp3 +10021.mp3 +1007.mp3 +690.m4a +1242.mp3 +1120.mp3 +1496.mp3 +576.m4a +1167.mp3 +652.m4a +1055.mp3 +1419.mp3 +676.m4a +416.m4a +1316.mp3 +1288.mp3 +634.m4a +1299.mp3 +648.m4a +1268.mp3 +1078.mp3 +1459.mp3 +524.m4a +978.mp3 +1114.mp3 +614.m4a +1218.mp3 +1064.mp3 +1463.mp3 +612.m4a +1122.mp3 +1232.mp3 +1258.mp3 +408.m4a +1408.mp3 +402.m4a +1306.mp3 +1074.mp3 +983.mp3 +1069.mp3 +8.m4a +1126.mp3 +1335.mp3 +1062.mp3 +10008.mp3 +370.m4a +1272.mp3 +1326.mp3 +1429.mp3 +1124.mp3 +320.m4a +1196.mp3 +1464.mp3 +1350.mp3 +12.m4a +1099.mp3 +1054.mp3 +1435.mp3 +1439.mp3 +372.m4a +1269.mp3 +568.m4a +1422.mp3 +10020.mp3 +10009.mp3 +307.m4a +1109.mp3 +1206.mp3 +1318.mp3 +350.m4a +1450.mp3 +360.m4a +963.mp3 +476.m4a +1251.mp3 +1132.mp3 +1011.mp3 +1424.mp3 +492.m4a +1005.mp3 
+1266.mp3 +1079.mp3 +1115.mp3 +1360.mp3 +1175.mp3 +1431.mp3 +1294.mp3 +520.m4a +1245.mp3 +410.m4a +1239.mp3 +468.m4a +16.m4a +1195.mp3 +1151.mp3 +1493.mp3 +1084.mp3 +1240.mp3 +1378.mp3 +1037.mp3 +988.mp3 +324.m4a +1104.mp3 +979.mp3 +424.m4a +1467.mp3 +975.mp3 +364.m4a +1171.mp3 +10026.mp3 +1285.mp3 +668.m4a +1189.mp3 +1291.mp3 +596.m4a +1261.mp3 +1072.mp3 +442.m4a +356.m4a +1148.mp3 +956.mp3 +1070.mp3 +482.m4a +396.m4a +1067.mp3 +486.m4a +1112.mp3 +358.m4a +982.mp3 +1173.mp3 +334.m4a +1262.mp3 +1412.mp3 +1315.mp3 +1309.mp3 +1106.mp3 +1287.mp3 +570.m4a +1389.mp3 +1135.mp3 +1119.mp3 +1407.mp3 +1075.mp3 +666.m4a +1207.mp3 +1367.mp3 +1362.mp3 +1451.mp3 +998.mp3 +1246.mp3 +1381.mp3 +1101.mp3 +1003.mp3 +1125.mp3 +1386.mp3 +536.m4a +1238.mp3 +1095.mp3 +994.mp3 +1088.mp3 +394.m4a +46.m4a +1154.mp3 +1264.mp3 +1077.mp3 +1188.mp3 +1472.mp3 +1134.mp3 +1293.mp3 +1117.mp3 +1053.mp3 +658.m4a +1461.mp3 +422.m4a +1215.mp3 +1045.mp3 +317.m4a +1158.mp3 +1346.mp3 +1194.mp3 +1446.mp3 +10022.mp3 +1159.mp3 +1368.mp3 +1332.mp3 +1096.mp3 +502.m4a +1394.mp3 +1168.mp3 +1181.mp3 +610.m4a +392.m4a +322.m4a +1371.mp3 +39.m4a +560.m4a +1180.mp3 +1338.mp3 +1443.mp3 +1111.mp3 +1432.mp3 +532.m4a +496.m4a +1482.mp3 +981.mp3 +311.m4a +366.m4a +694.m4a +1212.mp3 +1102.mp3 +997.mp3 +646.m4a +1042.mp3 +1060.mp3 +1174.mp3 +1382.mp3 +959.mp3 +554.m4a +510.m4a +1247.mp3 +1213.mp3 +323.m4a +10017.mp3 +1082.mp3 +1110.mp3 +1307.mp3 +1495.mp3 +1296.mp3 +10016.mp3 +1108.mp3 +1364.mp3 +1470.mp3 +1021.mp3 +1492.mp3 +1484.mp3 +654.m4a +504.m4a +30.m4a +1235.mp3 +10027.mp3 +1211.mp3 +1176.mp3 +1015.mp3 +574.m4a +1314.mp3 +1494.mp3 +1405.mp3 +999.mp3 +10014.mp3 +990.mp3 +1071.mp3 +1184.mp3 +506.m4a +1336.mp3 +1199.mp3 +1222.mp3 +976.mp3 +1128.mp3 +1044.mp3 +1000.mp3 +1051.mp3 +1442.mp3 +24.m4a +1210.mp3 +578.m4a +564.m4a +1032.mp3 +1437.mp3 +10029.mp3 +1406.mp3 +1379.mp3 +1347.mp3 +1456.mp3 +1438.mp3 +508.m4a +1022.mp3 +1308.mp3 +1413.mp3 +1012.mp3 +3.m4a +1127.mp3 +1253.mp3 +10035.mp3 +1390.mp3 +980.mp3 +1351.mp3 
+368.m4a +1317.mp3 +1150.mp3 +550.m4a +967.mp3 +630.m4a +1342.mp3 +968.mp3 +1260.mp3 +1383.mp3 +1428.mp3 +590.m4a +1468.mp3 +1133.mp3 +1324.mp3 +1444.mp3 +1118.mp3 +1008.mp3 +10019.mp3 +1420.mp3 +448.m4a +606.m4a +1029.mp3 +10007.mp3 +1160.mp3 +1447.mp3 +548.m4a +1415.mp3 +604.m4a +1220.mp3 +1275.mp3 +10034.mp3 +336.m4a +1186.mp3 +1469.mp3 +1475.mp3 +1454.mp3 +1434.mp3 +1418.mp3 +1014.mp3 +686.m4a +1427.mp3 +10031.mp3 +1279.mp3 +1006.mp3 +1282.mp3 +1325.mp3 +1172.mp3 +1280.mp3 +957.mp3 +632.m4a +1043.mp3 +556.m4a +1387.mp3 +1230.mp3 +10030.mp3 +984.mp3 +1278.mp3 +1400.mp3 +1143.mp3 +10011.mp3 +1103.mp3 +1491.mp3 +662.m4a +1283.mp3 +1334.mp3 +1068.mp3 +1228.mp3 +1066.mp3 +696.m4a +1116.mp3 +1056.mp3 +335.m4a +1348.mp3 +674.m4a +1292.mp3 +1156.mp3 +1304.mp3 +1197.mp3 +1013.mp3 +1355.mp3 +1216.mp3 +1380.mp3 +1426.mp3 +1320.mp3 +352.m4a +1267.mp3 +1085.mp3 +325.m4a +620.m4a +640.m4a +1234.mp3 +1203.mp3 +1163.mp3 +22.m4a +10018.mp3 +1479.mp3 +622.m4a +1487.mp3 +1486.mp3 +344.m4a +1200.mp3 +1340.mp3 +1018.mp3 +1388.mp3 +1363.mp3 +1187.mp3 +1139.mp3 +960.mp3 +1229.mp3 +1208.mp3 +1034.mp3 +1178.mp3 +562.m4a +1341.mp3 +1303.mp3 +1477.mp3 +1058.mp3 +1020.mp3 +1349.mp3 +650.m4a +1190.mp3 +1295.mp3 +962.mp3 +514.m4a +972.mp3 +586.m4a +1312.mp3 +664.m4a +1411.mp3 +1277.mp3 +1366.mp3 +1231.mp3 +386.m4a +1458.mp3 +1263.mp3 +602.m4a +382.m4a +1248.mp3 +1146.mp3 +328.m4a +10028.mp3 +1061.mp3 +466.m4a +528.m4a +1452.mp3 +1498.mp3 +636.m4a +1398.mp3 +1373.mp3 +1290.mp3 +1183.mp3 +1298.mp3 +1237.mp3 +1323.mp3 +10015.mp3 +1198.mp3 +518.m4a +10010.mp3 +1098.mp3 +1047.mp3 +1165.mp3 +1191.mp3 +348.m4a +1466.mp3 +1019.mp3 +1453.mp3 +428.m4a +624.m4a +1391.mp3 +958.mp3 +973.mp3 +1259.mp3 +1370.mp3 +310.m4a +1330.mp3 +692.m4a From 6e3f6dd1b39792048df03ef3482c0e5d48634569 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 25 Mar 2021 13:46:23 -0700 Subject: [PATCH 14/35] put in do_async switch, fix padding issue w/ sslm features --- Python/feature_extraction.py | 14 +++++++++----- 1 file 
changed, 9 insertions(+), 5 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index df62fc0..5c1508b 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -219,18 +219,22 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] + do_async = False async_res = [] logger = multiprocessing.log_to_stderr() logger.setLevel(logging.INFO) with multiprocessing.Pool(processes=8) as pool: - for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + if do_async: + for i, f in enumerate(audio_files): + async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): - beat_mls, beat_sslm, beat_times = async_res[i].get() - #beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + if do_async: + beat_mls, beat_sslm, beat_times = async_res[i].get() + else: + beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -333,7 +337,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_weight = 1 data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, k] + data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track From af4995179aad3603c395595bc56bd9fd4ec2a470 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sun, 28 Mar 2021 04:18:34 -0700 Subject: [PATCH 15/35] checkpoint --- Data/train_tracks.txt | 2 +- Python/feature_extraction.py | 28 +++++++++++++--------------- Python/train_segmentation_cnn.py | 6 ++++++ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/Data/train_tracks.txt 
b/Data/train_tracks.txt index e2e2baf..5341be2 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,8 +1,8 @@ +484.m4a 1136.mp3 1343.mp3 1027.mp3 971.mp3 -484.m4a 1130.mp3 10032.mp3 991.mp3 diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 5c1508b..2a20ac6 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -67,17 +67,13 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) - # pad 130 frames (to be 65) with noise at -70dB at the beginning - #pad = np.full((S_to_dB.shape[0], context_length * 2), -70) - #S_padded = np.concatenate((pad, S_to_dB), axis=1) - - + # first max-pooling: by 2. x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 - # this seems to group two frames together + # stack (bag) two frames m = 2 x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) @@ -85,11 +81,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h # create circular foo x_hat_length = x_hat.shape[1] x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) - print("pre-padded: {}, post-padded: {}".format(x_hat.shape, x_padded.shape)) #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p - for i in range(context_length, x_padded.shape[1] - context_length): #iteration in columns of x_hat + for i in range(context_length, x_padded.shape[1] - context_length): for l in range(context_length): cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist @@ -98,20 +93,22 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, 
fft_size=1024, h kappa = 0.1 #equalization factor of 10% t1 = time.time() epsilon = np.full((distances.shape[0], context_length), 1.0) - for i in range(context_length, distances.shape[0]): #iteration in columns of x_hat + for i in range(context_length, distances.shape[0]): for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) + if epsilon[i,l] == 0: + epsilon[i,l] = 0.000000001 - t2 = time.time() - print(t2-t1) + t2 = time.time() + #print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide - #sslm = scipy.special.expit(1-distances) sslm = np.transpose(sslm) - # the paper further downsamples by 3, but since we're doing beat-frames only might be ok + + #breakpoint() #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) @@ -235,6 +232,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): beat_mls, beat_sslm, beat_times = async_res[i].get() else: beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -355,7 +353,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) 
data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, l] + data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] data_y[feature_count] = next_label data_weight[feature_count] = next_weight track_idx[feature_count] = current_track @@ -375,7 +373,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training next_window = features[:, l-padding_length: l+padding_length+1] - data_sslm_x[feature_count] = sslm_features[:, :, l] + data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] data_x[feature_count, :, :] = next_window data_y[feature_count] = 0 data_weight[feature_count] = 1 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d9f3277..9c57d95 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -83,6 +83,8 @@ def build_sslm_model(img_rows, img_cols): def build_fused_model(inputs, outputs): x = layers.Concatenate(axis=1)(outputs) x = layers.Conv2D(32, (6, 3), activation='relu')(x) + #x = layers.Conv2D(64, (6, 3), activation='relu')(outputs[0]) + x = layers.Dropout(0.5)(x) x = layers.Flatten()(x) x = layers.Dense(256, activation='relu')(x) x = layers.Dropout(0.5)(x) @@ -122,6 +124,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh mls_input, mls_output = build_model(img_rows, img_cols) sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) + #model = build_fused_model([mls_input], [mls_output]) if weights_file is not None: model.load_weights(weights_file) @@ -135,6 +138,9 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, 
batch_size=batch_size, epochs=nb_epoch, shuffle=True, + # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') From a2cd9898c40bb2f2db353ebb896b3b6813fea901 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sun, 28 Mar 2021 13:58:38 -0700 Subject: [PATCH 16/35] checkpoint, non-padded way of making sslm circular --- Python/feature_extraction.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 2a20ac6..02d1980 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -73,27 +73,25 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') MFCCs = MFCCs[1:,:] + 1 - # stack (bag) two frames + # stack (bag?) 
two frames m = 2 x = [np.roll(MFCCs,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) - # create circular foo x_hat_length = x_hat.shape[1] - x_padded = np.concatenate((x_hat[:, x_hat_length - context_length : x_hat_length], x_hat, x_hat[:, 0:context_length]), axis=1) - #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.full((x_padded.shape[1], context_length), 1.0) #D has as dimensions N/p and L/p - for i in range(context_length, x_padded.shape[1] - context_length): + distances = np.full((x_hat_length, context_length), 1.0) #D has as dimensions N/p and L/p + for i in range(x_hat_length): for l in range(context_length): - cosine_dist = distance.cosine(x_padded[:,i], x_padded[:,i-(l+1)]) #cosine distance between columns i and i-L + # note that negative indices here make our matrix 'time-circular' + cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% t1 = time.time() epsilon = np.full((distances.shape[0], context_length), 1.0) - for i in range(context_length, distances.shape[0]): + for i in range(distances.shape[0]): for l in range(context_length): epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) if epsilon[i,l] == 0: @@ -122,10 +120,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) for k in range(beat_frames.shape[0]): - sslm_frame = beat_frames[k] // max_pool + context_length + sslm_frame = beat_frames[k] // max_pool sslm_frame_min = sslm_frame - context_length // 2 sslm_frame_max = sslm_frame + context_length // 2 + 1 - beat_sslms[:,:,k] = sslm[:, sslm_frame_min : sslm_frame_max] + beat_sslms[:,:,k] = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) return beat_sslms @@ -216,7 +214,7 @@ def 
batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - do_async = False + do_async = True async_res = [] logger = multiprocessing.log_to_stderr() From dd8bf84e997dfeeb33486b95ed7b8207fdb53e00 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 29 Mar 2021 16:20:35 -0700 Subject: [PATCH 17/35] checkpoint SSLM work increase the number of lag buffers out. This is the first SSLM that seems to actually work. mean f-Measure for 0.5: 0.2997474318247059, precision: 0.2868324565503678, recall: 0.354656707784005 mean f-Measure for 3.0: 0.5864113051042869, precision: 0.5659258978868614, recall: 0.6849318477316697 --- Python/feature_extraction.py | 58 ++++++++++++++++++++++---------- Python/track_segmentation.py | 22 ++++++------ Python/train_segmentation_cnn.py | 14 ++++---- Python/visualization.py | 7 ++-- 4 files changed, 61 insertions(+), 40 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 02d1980..892c245 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -22,6 +22,7 @@ import paths import warnings import time +import pdb import multiprocessing, logging @@ -44,6 +45,10 @@ random.seed(1234) # for reproducibility np.random.seed(1234) +def debug_signal_handler(signal, frame): + pdb.set_trace() + + def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously @@ -80,26 +85,31 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h x_hat_length = x_hat.shape[1] #Cosine distance calculation: D[N/p,L/p] matrix - distances = np.full((x_hat_length, context_length), 1.0) #D has as dimensions N/p and L/p + + sslm_shape = context_length * 3 # because we'll max pool it down at the end + + distances = np.full((x_hat_length, sslm_shape), 1.0) #D has as dimensions N/p and L/p for i in range(x_hat_length): - for l in 
range(context_length): + for l in range(sslm_shape): # note that negative indices here make our matrix 'time-circular' cosine_dist = distance.cosine(x_hat[:,i], x_hat[:,i-(l+1)]) #cosine distance between columns i and i-L distances[i,l] = cosine_dist #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - t1 = time.time() - epsilon = np.full((distances.shape[0], context_length), 1.0) + + epsilon_buf = np.empty((sslm_shape, sslm_shape * 2)) + epsilon = np.empty((distances.shape[0], sslm_shape)) + for i in range(distances.shape[0]): - for l in range(context_length): - epsilon[i,l] = np.quantile(np.concatenate((distances[i-l,:], distances[i,:])), kappa) - if epsilon[i,l] == 0: - epsilon[i,l] = 0.000000001 + for l in range(sslm_shape): + epsilon_buf[l] = np.concatenate((distances[i-l,:], distances[i,:])) + epsilon[i] = np.quantile(epsilon_buf, kappa, axis=1) + for l in range(sslm_shape): + if epsilon[i, l] == 0: + epsilon[i,l] = 0.000000001 - t2 = time.time() - #print(t2-t1) #Self Similarity Lag Matrix sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide @@ -121,9 +131,10 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool - sslm_frame_min = sslm_frame - context_length // 2 - sslm_frame_max = sslm_frame + context_length // 2 + 1 - beat_sslms[:,:,k] = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) + sslm_frame_min = sslm_frame - sslm_shape // 2 + sslm_frame_max = sslm_frame + sslm_shape // 2 + 1 + beat_sslm = np.take(sslm, range(sslm_frame_min, sslm_frame_max), mode='wrap', axis=1) + beat_sslms[:,:,k] = skimage.measure.block_reduce(beat_sslm, (3,3), np.max) return beat_sslms @@ -180,9 +191,8 @@ def get_cached_features(filename): else: return None -def compute_features(logger, f, i, audio_files): - logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) +def compute_features(f): 
beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) cached_features = get_cached_features(f) @@ -197,6 +207,11 @@ def compute_features(logger, f, i, audio_files): beat_mls /= np.max(beat_mls) return beat_mls, beat_sslm, beat_times +def compute_features_async(logger, f, i, audio_files): + logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) + + return compute_features(f) + def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ Extract Mel log spectrogram features from a folder of audio files given pre-analysed @@ -223,13 +238,18 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): with multiprocessing.Pool(processes=8) as pool: if do_async: for i, f in enumerate(audio_files): - async_res.append(pool.apply_async(compute_features, (logger, f, i, audio_files, ))) + async_res.append(pool.apply_async(compute_features_async, (logger, f, i, audio_files, ))) for i, f in enumerate(audio_files): if do_async: - beat_mls, beat_sslm, beat_times = async_res[i].get() + try: + beat_mls, beat_sslm, beat_times = async_res[i].get() + except Exception as inst: + print("error processing {}".format(f)) + print(inst) + continue else: - beat_mls, beat_sslm, beat_times = compute_features(logger, f, i, audio_files) + beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -447,6 +467,8 @@ def load_raw_features(file): if __name__ == "__main__": + #import signal + #signal.signal(signal.SIGINT, debug_signal_handler) train_frame = pd.read_csv('../Data/train_tracks.txt', header=None) test_frame = pd.read_csv('../Data/test_tracks.txt', header=None) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 27daf95..7b368f2 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -13,7 +13,7 @@ import os, sys import numpy as np 
import pandas as pd -from feature_extraction import compute_beat_mls, normalize_features_per_band +from feature_extraction import compute_features, normalize_features_per_band from evaluation import post_processing from train_segmentation_cnn import build_model import peakutils @@ -26,16 +26,19 @@ padding = int(context_length / 2) -def compute_cnn_predictions(features): +def compute_cnn_predictions(mls_features, sslm_features): """ Apply pretrained CNN model to features and return predictions. """ - model = build_model(num_mel_bands, context_length) + model = build_model(num_mel_bands, context_length, context_length) model.load_weights(model_weights) model.compile(loss='binary_crossentropy', optimizer='sgd') - features = np.expand_dims(features, 3) - predictions = model.predict(features, batch_size=1) + mls_features = np.expand_dims(mls_features, 3) + sslm_features = np.transpose(sslm_features, (2, 0, 1)) + sslm_features = np.expand_dims(sslm_features, 3) + + predictions = model.predict([mls_features, sslm_features], batch_size=1) return predictions @@ -52,8 +55,7 @@ def extract_features(audio_file, beats_file): beat_times = t[0].values beat_numbers = t[1].values - beat_mls = compute_beat_mls(filename=audio_file, beat_times=beat_times) - beat_mls /= np.max(beat_mls) + beat_mls, beat_sslm, beat_times = compute_features(audio_file) features = compute_context_windows(beat_mls) norm_data = np.load(normalization_path) @@ -61,7 +63,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_times, beat_numbers + return features, beat_sslm, beat_times, beat_numbers def compute_context_windows(features): @@ -148,10 +150,10 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - 
mls_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, sslm_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") - predictions = compute_cnn_predictions(mls_features) + predictions = compute_cnn_predictions(mls_features, sslm_features) print("Get segment times") segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 9c57d95..3f1ccfa 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -68,7 +68,7 @@ def load_test_data(dataset): return test_x, test_sslm_x, test_y, test_weights -def build_model(img_rows, img_cols): +def build_mls_model(img_rows, img_cols): input = layers.Input(shape=(img_rows, img_cols, 1)) x = layers.Conv2D(16, (6, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(3, 6))(x) @@ -82,8 +82,7 @@ def build_sslm_model(img_rows, img_cols): def build_fused_model(inputs, outputs): x = layers.Concatenate(axis=1)(outputs) - x = layers.Conv2D(32, (6, 3), activation='relu')(x) - #x = layers.Conv2D(64, (6, 3), activation='relu')(outputs[0]) + x = layers.Conv2D(64, (6, 3), activation='relu')(x) x = layers.Dropout(0.5)(x) x = layers.Flatten()(x) x = layers.Dense(256, activation='relu')(x) @@ -91,6 +90,10 @@ def build_fused_model(inputs, outputs): x = layers.Dense(1, activation='sigmoid')(x) return Model(inputs = inputs, outputs = x) +def build_model(mls_rows, mls_cols, sslm_shape): + mls_input, mls_output = build_mls_model(mls_rows, mls_cols) + sslm_input, sslm_output = build_sslm_model(sslm_shape, sslm_shape) + return build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -121,10 +124,7 @@ def train_model(batch_size=128, 
nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = X_train.shape[2] - mls_input, mls_output = build_model(img_rows, img_cols) - sslm_input, sslm_output = build_sslm_model(x_sslm_train.shape[1], x_sslm_train.shape[2]) - model = build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) - #model = build_fused_model([mls_input], [mls_output]) + model = build_model(img_rows, img_cols, sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) diff --git a/Python/visualization.py b/Python/visualization.py index b602dcc..8259051 100644 --- a/Python/visualization.py +++ b/Python/visualization.py @@ -22,17 +22,14 @@ def visualize_predictions(): """ preds = np.load('../Data/predsTestTracks_100epochs_lr005.npy') - train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - data = np.load('../Data/testDataNormalized.npz') test_y = data['test_y'] # load file lists and indices with open('../Data/fileListsAndIndex.pickle', 'rb') as f: - train_files, train_idx, test_files, test_idx = pickle.load(f) - - for i in range(len(test_labels)): + train_files, train_idx, test_files, test_idx = pickle.load(f) + for i in range(len(test_files)): f = test_files[i] beat_times, beat_numbers = get_beat_times(f, paths.beats_path, include_beat_numbers=True) print(f) From e64d33c27104d2124b331c5dc90ec1a2010f8333 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Mon, 29 Mar 2021 19:14:23 -0700 Subject: [PATCH 18/35] fix training, early stopping back on --- Python/train_segmentation_cnn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 3f1ccfa..7d75f4a 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -124,7 +124,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh img_rows = X_train.shape[1] img_cols = 
X_train.shape[2] - model = build_model(img_rows, img_cols, sslm_train.shape[1]) + model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) @@ -132,11 +132,11 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - early_stopping = EarlyStopping(monitor='val_loss', patience=5) + early_stopping = EarlyStopping(monitor='val_loss', patience=10) print('train model...') model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) + verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) From aa7912953e71cd92709b28dee097e2522896e654 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Tue, 30 Mar 2021 10:14:57 -0700 Subject: [PATCH 19/35] fix possible epsilon calculation error --- Python/feature_extraction.py | 3 ++- Python/train_segmentation_cnn.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 892c245..1b50ad9 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -103,7 +103,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h for i in range(distances.shape[0]): for l in range(sslm_shape): - epsilon_buf[l] = np.concatenate((distances[i-l,:], distances[i,:])) + epsilon_buf[l] = np.concatenate((distances[i-(l+1),:], distances[i,:])) epsilon[i] = np.quantile(epsilon_buf, kappa, axis=1) for l in range(sslm_shape): @@ -247,6 +247,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): except Exception as 
inst: print("error processing {}".format(f)) print(inst) + failed_tracks_idx.append(i) continue else: beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 7d75f4a..e27c181 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -23,7 +23,7 @@ from keras.callbacks import EarlyStopping from keras.optimizers import SGD -np.random.seed(1234) # for reproducibility +np.random.seed(1235) # for reproducibility def load_training_data(dataset): From 77bca47d26b8c0645b4998da113d8b1c652c2d12 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 31 Mar 2021 03:51:57 -0700 Subject: [PATCH 20/35] remove bum track --- Data/test_tracks.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 62f4bd4..596411a 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -1,5 +1,4 @@ 1166.mp3 -40.m4a 1090.mp3 584.m4a 346.m4a From c567be8a149e3fd8d654753a3d1f10b2c5850605 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 31 Mar 2021 13:45:31 -0700 Subject: [PATCH 21/35] caching, float32 --- Python/feature_extraction.py | 56 +++++++++++++++++++----------------- Python/paths.py | 4 +-- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 1b50ad9..10b03d1 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -25,6 +25,7 @@ import pdb import multiprocessing, logging +from contextlib import contextmanager from utils import * import scipy @@ -88,7 +89,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h sslm_shape = context_length * 3 # because we'll max pool it down at the end - distances = np.full((x_hat_length, sslm_shape), 1.0) #D has as dimensions N/p and L/p + distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions 
N/p and L/p for i in range(x_hat_length): for l in range(sslm_shape): # note that negative indices here make our matrix 'time-circular' @@ -98,8 +99,8 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h #Threshold epsilon[N/p,L/p] calculation kappa = 0.1 #equalization factor of 10% - epsilon_buf = np.empty((sslm_shape, sslm_shape * 2)) - epsilon = np.empty((distances.shape[0], sslm_shape)) + epsilon_buf = np.empty((sslm_shape, sslm_shape * 2), dtype=np.float32) + epsilon = np.empty((distances.shape[0], sslm_shape), dtype=np.float32) for i in range(distances.shape[0]): for l in range(sslm_shape): @@ -115,19 +116,8 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide sslm = np.transpose(sslm) - - #breakpoint() - #sslm = skimage.measure.block_reduce(sslm, (1,3), np.max) - #x_prime = skimage.measure.block_reduce(x_prime, (1,3), np.max) - - #Check if SSLM has nans and if it has them, substitute them by 0 - #for i in range(sslm.shape[0]): - # for j in range(sslm.shape[1]): - # if np.isnan(sslm[i,j]): - # sslm[i,j] = 0 - beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') - beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0])) + beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0]), dtype=np.float32) for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool @@ -183,28 +173,35 @@ def load_waveform(filename): y, sr = librosa.load(path, sr=22050, mono=True) return y -def get_cached_features(filename): - computed_mls_file = paths.get_mls_path(filename) +def get_audio_cache(filename, ext): + path = paths.get_audio_cache_path(filename, ext) - if os.path.exists(computed_mls_file): - return np.load(computed_mls_file) + if os.path.exists(path): + return np.load(path) else: return None +def set_audio_cache(filename, ext, data): + path = paths.get_audio_cache_path(filename, ext) + np.save(path, data) def compute_features(f): beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - cached_features = get_cached_features(f) + waveform = load_waveform(f) + + beat_mls = get_audio_cache(f, '.mls.npy') + if beat_mls is None: + beat_mls = compute_beat_mls(waveform, beat_times) + beat_mls /= np.max(beat_mls) + set_audio_cache(f, '.mls.npy', beat_mls) - if cached_features is not None: - return cached_features + beat_sslm = get_audio_cache(f, '.mls_sslm.npy') - waveform = load_waveform(f) + if beat_sslm is None: + beat_sslm = compute_sslm(waveform, beat_times) + set_audio_cache(f, '.mls_sslm.npy', beat_sslm) - beat_mls = compute_beat_mls(waveform, beat_times) - beat_sslm = compute_sslm(waveform, beat_times) - beat_mls /= np.max(beat_mls) return beat_mls, beat_sslm, beat_times def compute_features_async(logger, f, i, audio_files): @@ -230,11 +227,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): failed_tracks_idx = [] do_async = True + max_tracks = None + async_res = [] logger = multiprocessing.log_to_stderr() logger.setLevel(logging.INFO) + n_tracks = 0 with multiprocessing.Pool(processes=8) as pool: 
if do_async: for i, f in enumerate(audio_files): @@ -269,6 +269,10 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): sslm_feature_list.append(beat_sslm) labels_list.append(label_vec) + if max_tracks is not None and n_tracks > max_tracks: + break + n_tracks += 1 + return feature_list, sslm_feature_list, labels_list, failed_tracks_idx diff --git a/Python/paths.py b/Python/paths.py index 2bea298..55856e4 100644 --- a/Python/paths.py +++ b/Python/paths.py @@ -25,6 +25,6 @@ def remove_suffix(filename): def with_suffix(path, ext): return remove_suffix(path) + '.' + ext -def get_mls_path(audio_filename): - return os.path.join(mls_path, remove_suffix(audio_filename) + '.mls.npy') +def get_audio_cache_path(audio_filename, ext): + return os.path.join(mls_path, remove_suffix(audio_filename) + ext) From acb4404d44f79fab2f0d826c8d90e49f3bf49b16 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 09:34:22 -0700 Subject: [PATCH 22/35] chroma sslm! --- Python/feature_extraction.py | 128 +++++++++++++++++-------------- Python/train_segmentation_cnn.py | 4 +- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 10b03d1..0008c48 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -43,52 +43,30 @@ max_pool = 2 +# for debugging +# do_async = False +# max_tracks = 1 + +do_async = True +max_tracks = None + random.seed(1234) # for reproducibility np.random.seed(1234) def debug_signal_handler(signal, frame): pdb.set_trace() - -def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): - """ - Compute average Mel log spectrogram per beat given previously - extracted beat times. 
- - :param waveform: raw waveform data - :param beat_times: list of beat times in seconds - :param mel_bands: number of Mel bands - :param fft_size: FFT size - :param hop_size: hop size for FFT processing - :return: beat sslm - """ - spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, - window=scipy.signal.hamming)) - - mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) - s = np.sum(mel_fb, axis=1) - mel_fb = np.divide(mel_fb, s[:, np.newaxis]) - - mel_spec = np.dot(mel_fb, spec) - - S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) - - # first max-pooling: by 2. - x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) - - MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') - MFCCs = MFCCs[1:,:] + 1 - +def compute_sslm(input_vector, beat_times, hop_size): # stack (bag?) two frames m = 2 - x = [np.roll(MFCCs,n,axis=1) for n in range(m)] + x = [np.roll(input_vector,n,axis=1) for n in range(m)] x_hat = np.concatenate(x, axis=0) x_hat_length = x_hat.shape[1] - #Cosine distance calculation: D[N/p,L/p] matrix sslm_shape = context_length * 3 # because we'll max pool it down at the end + #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions N/p and L/p for i in range(x_hat_length): for l in range(sslm_shape): @@ -112,8 +90,7 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h epsilon[i,l] = 0.000000001 - #Self Similarity Lag Matrix - sslm = scipy.special.expit(1-distances/epsilon) #aplicación de la sigmoide + sslm = scipy.special.expit(1-distances/epsilon) # sigmoid sslm = np.transpose(sslm) beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') @@ -128,6 +105,48 @@ def compute_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, h return beat_sslms + + +def compute_mls_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + """ + Compute self-similarilty lag matrix (SSLM) using mel-log spectrogram as input + + :param waveform: raw waveform data + :param beat_times: list of beat times in seconds + :param mel_bands: number of Mel bands + :param fft_size: FFT size + :param hop_size: hop size for FFT processing + :return: beat sslm + """ + spec = np.abs(librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, + window=scipy.signal.hamming)) + + mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True) + s = np.sum(mel_fb, axis=1) + mel_fb = np.divide(mel_fb, s[:, np.newaxis]) + + mel_spec = np.dot(mel_fb, spec) + + S_to_dB = librosa.power_to_db(mel_spec,ref=np.max) + + # first max-pooling: by 2. 
+ x_prime = skimage.measure.block_reduce(S_to_dB, (1,max_pool), np.max) + + MFCCs = scipy.fftpack.dct(x_prime, axis=0, type=2, norm='ortho') + MFCCs = MFCCs[1:,:] + 1 + + return compute_sslm(MFCCs, beat_times, hop_size) + +def compute_chroma_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): + spec = librosa.stft(y=waveform, n_fft=fft_size, hop_length=hop_size, win_length=fft_size, window=scipy.signal.hamming) + spec = np.abs(spec) + x_prime = skimage.measure.block_reduce(spec, (1,max_pool), np.max) + + chroma_fb = librosa.filters.chroma(22050, fft_size, n_chroma=12) + chromagram = np.dot(chroma_fb, x_prime) + + return compute_sslm(chromagram + 1, beat_times, hop_size) + def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512): """ Compute average Mel log spectrogram per beat given previously @@ -173,36 +192,35 @@ def load_waveform(filename): y, sr = librosa.load(path, sr=22050, mono=True) return y -def get_audio_cache(filename, ext): +def with_audio_cache(filename, ext, waveform, beat_times, genf): path = paths.get_audio_cache_path(filename, ext) if os.path.exists(path): - return np.load(path) + return np.load(path), waveform else: - return None + if waveform is None: + waveform = load_waveform(filename) -def set_audio_cache(filename, ext, data): - path = paths.get_audio_cache_path(filename, ext) - np.save(path, data) + data = genf(waveform, beat_times) + np.save(path, data) + return data, waveform def compute_features(f): beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) - waveform = load_waveform(f) - - beat_mls = get_audio_cache(f, '.mls.npy') - if beat_mls is None: + def gen_beat_mls(waveform, beat_times): beat_mls = compute_beat_mls(waveform, beat_times) beat_mls /= np.max(beat_mls) - set_audio_cache(f, '.mls.npy', beat_mls) + return beat_mls - beat_sslm = get_audio_cache(f, '.mls_sslm.npy') + waveform = None + beat_mls, waveform = with_audio_cache(f, 
'.mls.npy', waveform, beat_times, gen_beat_mls) + beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) + chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) - if beat_sslm is None: - beat_sslm = compute_sslm(waveform, beat_times) - set_audio_cache(f, '.mls_sslm.npy', beat_sslm) + beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) - return beat_mls, beat_sslm, beat_times + return beat_mls, beat_sslm, chroma_sslm, beat_times def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) @@ -226,8 +244,6 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): labels_list = [] failed_tracks_idx = [] - do_async = True - max_tracks = None async_res = [] @@ -243,14 +259,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): for i, f in enumerate(audio_files): if do_async: try: - beat_mls, beat_sslm, beat_times = async_res[i].get() + beat_mls, beat_sslm, chroma_sslm, beat_times = async_res[i].get() except Exception as inst: print("error processing {}".format(f)) print(inst) failed_tracks_idx.append(i) continue else: - beat_mls, beat_sslm, beat_times = compute_features_async(logger, f, i, audio_files) + beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features_async(logger, f, i, audio_files) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -325,7 +341,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length, 2), dtype=np.float32) data_y = 
np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -449,7 +465,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x = data_sslm_x[:feature_count, :, :] + data_sslm_x = data_sslm_x[:feature_count, :, :, :] data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index e27c181..7f3cdab 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -75,7 +75,7 @@ def build_mls_model(img_rows, img_cols): return input, x def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) + input = layers.Input(shape=(img_rows, img_cols, 2)) x = layers.Conv2D(16, (8, 8), activation='relu')(input) x = layers.MaxPooling2D(pool_size=(6, 6))(x) return input, x @@ -119,7 +119,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) - x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] @@ -147,7 +146,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) - x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) From e0e20d922b4c031d0b1d9fbdc38f2f69764ff378 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:03:23 -0700 Subject: [PATCH 23/35] extract with downbeat info, add more output to evaluation.py --- Python/evaluation.py | 43 
+++++++++++++++++++++++++----------- Python/track_segmentation.py | 1 + Python/utils.py | 15 +++++++++---- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 36a2b97..6e1a5bc 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -13,9 +13,10 @@ import mir_eval import paths +from operator import itemgetter + predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -f_measure_thresh = 3 # tolerance window in seconds def load_data(preds_file, file_lists): @@ -53,14 +54,14 @@ def post_processing(preds_track): preds_track = np.multiply(preds_track, np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + # unit maximum preds_track /= np.max(preds_track) return preds_track -if __name__ == "__main__": - +def run_eval(f_measure_thresh): f_measures = [] precisions = [] recalls = [] @@ -69,9 +70,6 @@ def post_processing(preds_track): preds = np.reshape(preds, len(preds)) for i, f in enumerate(test_files): - - print("Evaluating {}".format(f)) - # load annotations segment_times = get_segment_times(f, paths.annotations_path) @@ -83,25 +81,44 @@ def post_processing(preds_track): # post processing preds_track = post_processing(preds_track) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - pred_times = beat_times[peak_loc] - 1 + # insert a zero value at the beginning of the predictions to help the peak-finding algorithm + # identify the first beat of a track + + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 + pred_times = beat_times[peak_loc] # compute f-measure - f_score, p, r = mir_eval.onset.f_measure(segment_times, pred_times, window=f_measure_thresh) + f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) f_measures.append(f_score) precisions.append(p) recalls.append(r) - 
print("f-Measure: {}, precision: {}, recall: {}".format(f_score, p, r)) + #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) - print(" ") - print("Mean scores across all test tracks:") - print("f-Measure: {}, precision: {}, recall: {}".format(mean_f, mean_p, mean_r)) + print("mean f-Measure for {}: {}, precision: {}, recall: {}".format(f_measure_thresh, mean_f, mean_p, mean_r)) + + return list(zip(test_files, f_measures, precisions, recalls)) + +def get_sort_key(item): + return item[1] + +if __name__ == "__main__": + short = run_eval(0.5) + long = run_eval(3.0) + + for i in range(len(short)): + short[i] += long[i][1:4] + + sorted_tracks = sorted(short, key=get_sort_key) + print("{:<20}{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}".format("filename", "f0.5", "p0.5", "r0.5", "f3", "p3", "r3")) + for track in sorted_tracks: + print("{:<20}{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}\t{:4.2}".format(*track)) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 8f4e538..1ad79b8 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -128,6 +128,7 @@ def compute_segments_from_predictions(predictions, beat_times): if not os.path.isfile(out_dir + file_name + '.beats.txt'): print("Extracting beat times (this might take a while)...") os.system('DBNBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') + os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") mls_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') diff --git a/Python/utils.py b/Python/utils.py index a55b571..fd697e3 100644 --- a/Python/utils.py +++ b/Python/utils.py @@ -103,8 +103,7 @@ def get_segment_times(audio_file, annotation_folder): return segment_times - -def 
get_beat_times(audio_file, beats_folder): +def get_beat_times(audio_file, beats_folder, include_beat_numbers=False): """ Read beat times from annotation file. :param audio_file: path to audio files @@ -114,7 +113,15 @@ def get_beat_times(audio_file, beats_folder): file_name = os.path.splitext(os.path.basename(audio_file))[0] beats_file = os.path.join(beats_folder, file_name + '.beats.txt') + + if not os.path.isfile(beats_file): + print(f"Extracting beat times for {audio_file}") + os.system(f"DBNDownBeatTracker single '{audio_file}' -o '{beats_file}'") + t = pd.read_table(beats_file, header=None) - beat_times = t.iloc[:, 0].values - return beat_times + if include_beat_numbers: + return t[0].values, t[1].values + else: + return t[0].values + From e8d14708dda4255cca902dc3c1ab6f2349cfa940 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:05:23 -0700 Subject: [PATCH 24/35] BeatTracker -> DownBeatTracker --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0701ca2..16eba4c 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ After that the beat tracking from the MADMOM library can be run on all files wit ```bash cd ./Audio mkdir beats -DBNBeatTracker batch -o ./beats $(ls *.mp3) +DBNDownBeatTracker batch -o ./beats $(ls *.mp3) ``` This will take quite some time and use a lot of memory. After finishing, the beat files (`*.beats.txt`) will be placed next to the audio files. 
From f1115a7cceecb88925e2e4a26a2e29fe5b9e42a1 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 1 Apr 2021 11:06:08 -0700 Subject: [PATCH 25/35] remove commented code --- Python/evaluation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 6e1a5bc..9cfdc52 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -96,8 +96,6 @@ def run_eval(f_measure_thresh): precisions.append(p) recalls.append(r) - #print("{} f-Measure: {}, precision: {}, recall: {}".format(f, f_score, p, r)) - mean_f = np.mean(np.asarray(f_measures)) mean_p = np.mean(np.asarray(precisions)) mean_r = np.mean(np.asarray(recalls)) From d326179965d7f5686f708e8b524ed461a1ec3385 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:07:49 -0700 Subject: [PATCH 26/35] protect against overly short tracks --- Python/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index c5edab4..eb17baa 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -52,8 +52,9 @@ def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): preds_track = np.convolve(preds_track, np.hamming(4) / np.sum(np.hamming(4)), 'same') # emphasize peaks - preds_track = np.multiply(preds_track, - np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) + if len(preds_track) >= 32: + preds_track = np.multiply(preds_track, + np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) # emphasize downbeeat From d692b3a565d3a939e26afe1dc67c33cdca9a908b Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:08:43 -0700 Subject: [PATCH 27/35] get much more memory efficiency by memmaping feature files --- Python/feature_extraction.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 0008c48..af4d404 100644 --- 
a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -196,7 +196,7 @@ def with_audio_cache(filename, ext, waveform, beat_times, genf): path = paths.get_audio_cache_path(filename, ext) if os.path.exists(path): - return np.load(path), waveform + return np.load(path, mmap_mode='r'), waveform else: if waveform is None: waveform = load_waveform(filename) @@ -224,8 +224,7 @@ def gen_beat_mls(waveform, beat_times): def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) - - return compute_features(f) + compute_features(f) def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): """ @@ -259,7 +258,11 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): for i, f in enumerate(audio_files): if do_async: try: - beat_mls, beat_sslm, chroma_sslm, beat_times = async_res[i].get() + # have child process actually write features to disk + async_res[i].get() + + # now reload them in mmap + beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features(f) except Exception as inst: print("error processing {}".format(f)) print(inst) @@ -337,7 +340,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training :return: batch data in the form (n_items, n_melbands, n_context) """ - n_preallocate = 250000 + n_preallocate = 500000 # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) @@ -465,7 +468,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x = data_sslm_x[:feature_count, :, :, :] + data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2], data_sslm_x.shape[3])) data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] From 8513ad3881028f8d769f7aa93d61fcd3c2ca55cd Mon Sep 17 00:00:00 
2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:09:39 -0700 Subject: [PATCH 28/35] more tracks! --- Data/salami-data-public | 1 + Data/test_tracks.txt | 21 +++++ Data/train_tracks.txt | 169 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+) create mode 120000 Data/salami-data-public diff --git a/Data/salami-data-public b/Data/salami-data-public new file mode 120000 index 0000000..a66aeab --- /dev/null +++ b/Data/salami-data-public @@ -0,0 +1 @@ +/Users/ben/src/salami-data-public \ No newline at end of file diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index 596411a..dd02f71 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -69,3 +69,24 @@ 1221.mp3 1244.mp3 1080.mp3 +824.m4a +752.m4a +10043.mp3 +543.m4a +587.m4a +818.m4a +950.m4a +615.m4a +1640.m4a +1654.m4a +663.m4a +1648.m4a +359.m4a +1602.m4a +39.m4a +1624.m4a +807.m4a +459.m4a +355.m4a +728.m4a +531.m4a diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 5341be2..dab82f3 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -575,3 +575,172 @@ 310.m4a 1330.mp3 692.m4a +571.m4a +575.m4a +10042.mp3 +339.m4a +842.m4a +411.m4a +379.m4a +63.m4a +791.m4a +746.m4a +852.m4a +483.m4a +795.m4a +774.m4a +739.m4a +1642.m4a +732.m4a +491.m4a +27.m4a +802.m4a +882.m4a +659.m4a +43.m4a +906.m4a +691.m4a +535.m4a +371.m4a +651.m4a +455.m4a +7.m4a +675.m4a +744.m4a +399.m4a +431.m4a +75.m4a +15.m4a +51.m4a +515.m4a +836.m4a +407.m4a +551.m4a +783.m4a +846.m4a +10036.mp3 +667.m4a +892.m4a +555.m4a +832.m4a +1632.m4a +647.m4a +11.m4a +687.m4a +603.m4a +427.m4a +419.m4a +591.m4a +936.m4a +655.m4a +695.m4a +708.m4a +816.m4a +706.m4a +866.m4a +864.m4a +447.m4a +1610.m4a +511.m4a +731.m4a +704.m4a +1650.m4a +527.m4a +750.m4a +567.m4a +1644.m4a +367.m4a +363.m4a +803.m4a +702.m4a +786.m4a +559.m4a +946.m4a +1628.m4a +862.m4a +579.m4a +583.m4a +475.m4a +1620.m4a +10041.mp3 +727.m4a +343.m4a +35.m4a +707.m4a +10037.mp3 +1600.m4a +914.m4a +643.m4a +1634.m4a 
+627.m4a +794.m4a +683.m4a +1010.m4a +858.m4a +619.m4a +10039.mp3 +930.m4a +700.m4a +747.m4a +768.m4a +1630.m4a +10038.mp3 +1604.m4a +19.m4a +811.m4a +815.m4a +1614.m4a +471.m4a +611.m4a +607.m4a +479.m4a +799.m4a +563.m4a +635.m4a +822.m4a +669.m4a +47.m4a +910.m4a +682.m4a +599.m4a +767.m4a +1612.m4a +71.m4a +828.m4a +1626.m4a +31.m4a +23.m4a +1622.m4a +595.m4a +463.m4a +854.m4a +1638.m4a +1652.m4a +860.m4a +784.m4a +787.m4a +10040.mp3 +755.m4a +1646.m4a +631.m4a +760.m4a +770.m4a +1618.m4a +703.m4a +623.m4a +1636.m4a +519.m4a +451.m4a +726.m4a +1608.m4a +539.m4a +898.m4a +1606.m4a +814.m4a +639.m4a +834.m4a +954.m4a +790.m4a +439.m4a +79.m4a +782.m4a From 798b3e9c5475806844dacc35fe51a5b3e57b52a8 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 2 Apr 2021 17:14:59 -0700 Subject: [PATCH 29/35] fix merge --- Python/evaluation.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8cd339f..ac64f10 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -38,11 +38,10 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx -def post_processing(preds_track, beat_numbers, emphasize_downbeat=False): +def post_processing(preds_track): """ Post processing of prediction probabilities, applies smoothing window and emphasizes beats by multiplying with running avarage. - Also weights predictions towards beat "1". 
:param preds_track: CNN predictions per beat @@ -57,11 +56,6 @@ def post_processing(preds_track): :return: post-processed predictions np.convolve(preds_track, np.hamming(32) / np.sum(np.hamming(32)), 'same')) - # emphasize downbeeat - if emphasize_downbeat: - preds_track = np.multiply(preds_track, np.where(beat_numbers == 1, 1, 0.5)) - - # unit maximum preds_track /= np.max(preds_track) From 4d7e6af66fb5af98ad83543f22fb49da148b8ed0 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 8 Apr 2021 01:06:45 -0700 Subject: [PATCH 30/35] large changes - bring in beat-number as a new dimension - crib a new prediction-thresholding algo from a research paper - refactor feature extraction --- Data/train_tracks.txt | 2 +- Python/evaluation.py | 50 ++++++++-- Python/feature_extraction.py | 157 ++++++++++++++----------------- Python/track_segmentation.py | 49 ++++++---- Python/train_segmentation_cnn.py | 89 ++++++++---------- 5 files changed, 179 insertions(+), 168 deletions(-) diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index dab82f3..7992c7c 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -1,3 +1,4 @@ +10007.mp3 484.m4a 1136.mp3 1343.mp3 @@ -421,7 +422,6 @@ 448.m4a 606.m4a 1029.mp3 -10007.mp3 1160.mp3 1447.mp3 548.m4a diff --git a/Python/evaluation.py b/Python/evaluation.py index ac64f10..4d38983 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -17,7 +17,7 @@ predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' - +prediction_threshold = 0.3 def load_data(preds_file, file_lists): """ @@ -38,6 +38,33 @@ def load_data(preds_file, file_lists): return preds, test_files, test_idx +def choose_preds(preds, beat_times): + # At test time, we apply the trained network to each position in the + spectrogram of the music piece to be segmented, obtaining a boundary + probability for each frame. 
We then employ a simple means of peak-picking + on this boundary activation curve: Every output value that is not + surpassed within ±6 seconds is a boundary candidate. From each candidate + value we subtract the average of the activation curve in the past 12 and + future 6 seconds, to compensate for long-term trends. We end up with a + list of boundary candidates along with strength values that can be + thresholded at will. We found that more elaborate peak picking methods + did not improve results. + preds_out = np.zeros((len(preds))) + + for i in range(len(preds)): + pred_time = beat_times[i] + in_window = (beat_times > pred_time - 6) & (beat_times <= pred_time + 6) + max_in_window = np.argmax(np.where(in_window, preds, 0)) + if i == max_in_window: + in_avg_window = (beat_times > pred_time - 12) & (beat_times <= pred_time + 6) + window_avg = np.mean(preds[in_avg_window]) + preds_out[i] = preds[i] - window_avg + else: + preds_out[i] = 0 + + return np.flatnonzero(preds_out > prediction_threshold) + + def post_processing(preds_track): """ Post processing of prediction probabilities, applies smoothing @@ -47,7 +74,6 @@ def post_processing(preds_track): :return: post-processed predictions """ - # smoothing preds_track = np.convolve(preds_track, np.hamming(4) / np.sum(np.hamming(4)), 'same') # emphasize peaks @@ -82,15 +108,21 @@ def run_eval(f_measure_thresh): # get predictions for current track preds_track = np.squeeze(np.asarray(preds[test_idx == i])) - # post processing - preds_track = post_processing(preds_track) + if len(preds_track) == 0: + continue + + if True: + pred_indexes = choose_preds(preds_track, beat_times) + pred_times = beat_times[pred_indexes] + else: + preds_track = post_processing(preds_track) - # insert a zero value at the beginning of the predictions to help the peak-finding algorithm - # identify the first beat of a track + # insert a zero value at the beginning of the predictions to help the peak-finding algorithm + # identify 
the first beat of a track - peds_track = np.insert(preds_track, 0, 0) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 - pred_times = beat_times[peak_loc] + peds_track = np.insert(preds_track, 0, 0) + peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 + pred_times = beat_times[peak_loc] # compute f-measure f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index af4d404..8087a4f 100644 --- a/Python/feature_extraction.py +++ b/Python/feature_extraction.py @@ -36,7 +36,7 @@ context_length = 65 # how many beats make up a context window for the CNN num_mel_bands = 80 # number of Mel bands neg_frames_factor = 5 # how many more negative examples than segment boundaries -pos_frames_oversample = 5 # oversample positive frames because there are too few +pos_frames_oversample = 5 # oversample positive frames because there are too few mid_frames_oversample = 3 # oversample frames between segments label_smearing = 1 # how many frames are positive examples around an annotation padding_length = int(context_length / 2) @@ -44,11 +44,12 @@ max_pool = 2 # for debugging -# do_async = False -# max_tracks = 1 - -do_async = True -max_tracks = None +if False: + do_async = False + max_tracks = 1 +else: + do_async = True + max_tracks = None random.seed(1234) # for reproducibility np.random.seed(1234) @@ -144,6 +145,7 @@ def compute_chroma_sslm(waveform, beat_times, mel_bands=num_mel_bands, fft_size= chroma_fb = librosa.filters.chroma(22050, fft_size, n_chroma=12) chromagram = np.dot(chroma_fb, x_prime) + chromagram = librosa.power_to_db(chromagram,ref=np.max) return compute_sslm(chromagram + 1, beat_times, hop_size) @@ -181,6 +183,16 @@ def compute_beat_mls(features, beat_times, mel_bands=num_mel_bands, fft_size=102 return beat_melspec +def compute_time_features(features, beat_times): + length = len(features) / 22050. 
+ time_ratios = np.zeros((len(beat_times), 500), dtype=np.float32) + + for k in range(len(beat_times)): + time_ratios[k, int((beat_times[k] * 500) // length)] = 1.0 + + return time_ratios + + def load_waveform(filename): if "/" in filename: path = filename @@ -205,8 +217,14 @@ def with_audio_cache(filename, ext, waveform, beat_times, genf): np.save(path, data) return data, waveform +def make_beat_time_features(beat_numbers): + times = np.zeros((len(beat_numbers), 4)) + for i in range(len(beat_numbers)): + times[i][beat_numbers[i] - 1] = 1 + return times + def compute_features(f): - beat_times = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path) + beat_times, beat_numbers = get_beat_times(os.path.join(paths.audio_path, f), paths.beats_path, include_beat_numbers=True) def gen_beat_mls(waveform, beat_times): beat_mls = compute_beat_mls(waveform, beat_times) @@ -216,11 +234,14 @@ def gen_beat_mls(waveform, beat_times): waveform = None beat_mls, waveform = with_audio_cache(f, '.mls.npy', waveform, beat_times, gen_beat_mls) beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) - chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) + #times, waveform = with_audio_cache(f, '.beat_time_ratios.npy', waveform, beat_times, compute_time_features) + times = make_beat_time_features(beat_numbers) + + #chroma_sslm, waveform = with_audio_cache(f, '.chroma_sslm.npy', waveform, beat_times, compute_chroma_sslm) - beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) + #beat_sslm = np.stack((beat_mls_sslm, chroma_sslm), axis=3) - return beat_mls, beat_sslm, chroma_sslm, beat_times + return beat_mls, beat_mls_sslm, times, beat_times def compute_features_async(logger, f, i, audio_files): logger.info("Track {} / {} ({})".format(i, len(audio_files), f)) @@ -240,6 +261,7 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): feature_list = 
[] sslm_feature_list = [] + time_feature_list = [] labels_list = [] failed_tracks_idx = [] @@ -262,14 +284,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): async_res[i].get() # now reload them in mmap - beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features(f) + beat_mls, beat_sslm, time_features, beat_times = compute_features(f) except Exception as inst: print("error processing {}".format(f)) print(inst) failed_tracks_idx.append(i) continue else: - beat_mls, beat_sslm, chroma_sslm, beat_times = compute_features_async(logger, f, i, audio_files) + beat_mls, beat_sslm, time_features, beat_times = compute_features(f) label_vec = np.zeros(beat_mls.shape[1],) segment_times = get_segment_times(f, paths.annotations_path) @@ -286,13 +308,14 @@ def batch_extract_mls_and_labels(audio_files, beats_folder, annotation_folder): feature_list.append(beat_mls) sslm_feature_list.append(beat_sslm) + time_feature_list.append(time_features) labels_list.append(label_vec) if max_tracks is not None and n_tracks > max_tracks: break n_tracks += 1 - return feature_list, sslm_feature_list, labels_list, failed_tracks_idx + return feature_list, sslm_feature_list, time_feature_list, labels_list, failed_tracks_idx def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample=10000): @@ -328,7 +351,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample return features, mean_vec, std_vec -def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training=True): +def prepare_batch_data(feature_list, sslm_feature_list, time_feature_list, labels_list, is_training=True): """ Reads precomputed beat Mel spectrograms and slices them into context windows for CNN training. 
For the training set, subsampling is @@ -344,7 +367,8 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length, 2), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, context_length, context_length), dtype=np.float32) + data_time_x = np.zeros(shape=(n_preallocate, time_feature_list[0].shape[1]), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) track_idx = np.zeros(shape=(n_preallocate,), dtype=int) @@ -352,8 +376,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training feature_count = 0 current_track = 0 - for features, sslm_features, labels in zip(feature_list, sslm_feature_list, labels_list): - + for features, sslm_features, time_features, labels in zip(feature_list, sslm_feature_list, time_feature_list, labels_list): print("Processed {} examples from {} tracks".format(feature_count, current_track+1)) num_beats = features.shape[1] @@ -363,6 +386,17 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training labels = np.concatenate((np.zeros(padding_length), labels, np.zeros(padding_length)), axis=0) + def add_feature(idx, label, weight=1): + nonlocal feature_count + data_x[feature_count, :, :] = features[:, idx - padding_length: idx + padding_length + 1] + data_sslm_x[feature_count] = sslm_features[:, :, idx - padding_length] + data_time_x[feature_count] = time_features[idx - padding_length] + data_y[feature_count] = label + data_weight[feature_count] = weight + track_idx[feature_count] = current_track + + feature_count += 1 + if is_training is True: # take all positive frames. these are indexes into the already padded features. 
@@ -371,36 +405,15 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for rep in range(pos_frames_oversample): for k in positive_frames_idx: + add_feature(k, label=1) - next_window = features[:, k - padding_length: k + padding_length + 1] - next_label = 1 - next_weight = 1 - - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 - - # apply label smearing: set labels around annotation to 1 and give them a triangular weight + ## apply label smearing: set labels around annotation to 1 and give them a triangular weight for l in range(k - label_smearing, k + label_smearing + 1): # don't smear into padding. if padding_length <= l < num_beats + padding_length and l != k: - - next_window = features[:, l-padding_length: l+padding_length+1] - next_label = 1 next_weight = 1. - np.abs(l-k) / (label_smearing + 1.) 
- - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(l, label=0.5, weight=next_weight) # take all frames in the middle between two boundaries (typical false positives) mid_segment_frames_idx = (positive_frames_idx[1:] + positive_frames_idx[:-1]) / 2 @@ -412,16 +425,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for l in range(k - label_smearing, k + label_smearing + 1): if padding_length <= l < num_beats + padding_length: - - next_window = features[:, l-padding_length: l+padding_length+1] - - data_sslm_x[feature_count] = sslm_features[:, :, l - padding_length] - data_x[feature_count, :, :] = next_window - data_y[feature_count] = 0 - data_weight[feature_count] = 1 - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(l, label=0) # sample randomly from the remaining frames remaining_frames_idx = [] @@ -434,33 +438,11 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training for k in range(num_neg_frames): next_idx = random.sample(remaining_frames_idx, 1)[0] - next_window = features[:, next_idx-padding_length: next_idx+padding_length+1] - next_label = 0 - next_weight = 1 - - data_x[feature_count, :, :] = next_window - data_sslm_x[feature_count] = sslm_features[:, :, next_idx - padding_length] - data_y[feature_count] = next_label - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(next_idx, label=0) else: # test data -> extract all context windows and keep track of track indices for k in range(padding_length, num_beats + padding_length): - - next_window = features[:, k-padding_length: k+padding_length+1] - next_label = labels[k] - next_weight = 1 - - data_x[feature_count, :, :] = 
next_window - data_y[feature_count] = next_label - data_sslm_x[feature_count] = sslm_features[:, :, k - padding_length] - - data_weight[feature_count] = next_weight - track_idx[feature_count] = current_track - - feature_count += 1 + add_feature(k, label=labels[k]) current_track += 1 @@ -468,12 +450,13 @@ def prepare_batch_data(feature_list, sslm_feature_list, labels_list, is_training break data_x = data_x[:feature_count, :, :] - data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2], data_sslm_x.shape[3])) + data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2])) + data_time_x.resize((feature_count, data_time_x.shape[1])) data_y = data_y[:feature_count] data_weight = data_weight[:feature_count] track_idx = track_idx[:feature_count] - return data_x, data_sslm_x, data_y, data_weight, track_idx + return data_x, data_sslm_x, data_time_x, data_y, data_weight, track_idx def load_raw_features(file): @@ -502,13 +485,13 @@ def load_raw_features(file): print("Extracting MLS features") - train_features, train_sslm_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, - paths.beats_path, - paths.annotations_path) + train_features, train_sslm_features, train_time_features, train_labels, train_failed_idx = batch_extract_mls_and_labels(train_files, + paths.beats_path, + paths.annotations_path) - test_features, test_sslm_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, - paths.beats_path, - paths.annotations_path) + test_features, test_sslm_features, test_time_features, test_labels, test_failed_idx = batch_extract_mls_and_labels(test_files, + paths.beats_path, + paths.annotations_path) print("Extracted features for {} training and {} test tracks".format(len(train_features), len(test_features))) @@ -524,8 +507,8 @@ def load_raw_features(file): # train_features, train_labels, test_features, test_labels = load_raw_features('../Data/rawFeatures.pickle') - train_x, 
train_sslm_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_labels, is_training=True) - test_x, test_sslm_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_labels, is_training=False) + train_x, train_sslm_x, train_time_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_time_features, train_labels, is_training=True) + test_x, test_sslm_x, test_time_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_time_features, test_labels, is_training=False) train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) @@ -533,8 +516,8 @@ def load_raw_features(file): print("Prepared {} training items and {} test items".format(train_x.shape[0], test_x.shape[0])) # store normalized features for CNN training - np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_y=train_y, train_weights=train_weights) - np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_y=test_y, test_weights=test_weights) + np.savez('../Data/trainDataNormalized.npz', train_x=train_x, train_sslm_x=train_sslm_x, train_time_x=train_time_x, train_y=train_y, train_weights=train_weights) + np.savez('../Data/testDataNormalized.npz', test_x=test_x, test_sslm_x=test_sslm_x, test_time_x=test_time_x, test_y=test_y, test_weights=test_weights) np.savez('../Data/normalization.npz', mean_vec=mean_vec, std_vec=std_vec) # store file lists and index mapping to training and test data diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 7b368f2..a01f2ae 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -14,7 +14,7 @@ import numpy as np import pandas as pd from feature_extraction import compute_features, normalize_features_per_band 
-from evaluation import post_processing +from evaluation import post_processing, choose_preds from train_segmentation_cnn import build_model import peakutils @@ -26,19 +26,23 @@ padding = int(context_length / 2) -def compute_cnn_predictions(mls_features, sslm_features): - """ - Apply pretrained CNN model to features and return predictions. - """ +def build_full_model(): model = build_model(num_mel_bands, context_length, context_length) model.load_weights(model_weights) model.compile(loss='binary_crossentropy', optimizer='sgd') + return model + +def compute_cnn_predictions(mls_features, sslm_features, time_features): + """ + Apply pretrained CNN model to features and return predictions. + """ + model = build_full_model() mls_features = np.expand_dims(mls_features, 3) sslm_features = np.transpose(sslm_features, (2, 0, 1)) - sslm_features = np.expand_dims(sslm_features, 3) + #sslm_features = sslm_features[:, :, :, 0] # remove chroma for now - predictions = model.predict([mls_features, sslm_features], batch_size=1) + predictions = model.predict([mls_features, sslm_features, time_features], batch_size=1) return predictions @@ -55,7 +59,7 @@ def extract_features(audio_file, beats_file): beat_times = t[0].values beat_numbers = t[1].values - beat_mls, beat_sslm, beat_times = compute_features(audio_file) + beat_mls, sslm, time_features, beat_times = compute_features(audio_file) features = compute_context_windows(beat_mls) norm_data = np.load(normalization_path) @@ -63,7 +67,7 @@ def extract_features(audio_file, beats_file): std_vec = norm_data['std_vec'] features, mean_vec, std_vec = normalize_features_per_band(features, mean_vec, std_vec) - return features, beat_sslm, beat_times, beat_numbers + return features, sslm, time_features, beat_times def compute_context_windows(features): @@ -103,7 +107,7 @@ def print_predictions(p, beat_times): print("%i:\t%.3f\t%.1f" % (i, p[i], beat_times[i])) -def compute_segments_from_predictions(predictions, beat_times, beat_numbers): +def 
compute_segments_from_predictions(predictions, beat_times): """ Computes the segment times from a prediction curve and the beat times using peak picking. @@ -113,14 +117,21 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): print("raw predicitions:") print_predictions(predictions, beat_times) - predictions = post_processing(predictions, beat_numbers, emphasize_downbeat=True) + if True: + peak_loc = choose_preds(predictions, beat_times) + segment_times = beat_times[peak_loc] + #print("after post-processing:") + #print_predictions(peak_loc, beat_times) - print("after post-processing:") - print_predictions(predictions, beat_times) + else: + predictions = post_processing(predictions) + + print("after post-processing:") + print_predictions(predictions, beat_times) - predictions = np.insert(predictions, 0, 0) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 - segment_times = beat_times[peak_loc] + predictions = np.insert(predictions, 0, 0) + peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 + segment_times = beat_times[peak_loc] print("beat_num\ttime:") for i in peak_loc: @@ -150,13 +161,13 @@ def compute_segments_from_predictions(predictions, beat_times, beat_numbers): os.system('DBNDownBeatTracker \'single\' "' + audio_file + '" -o "' + out_dir + file_name + '.beats.txt"') print("Computing features") - mls_features, sslm_features, beat_times, beat_numbers = extract_features(audio_file, out_dir + file_name + '.beats.txt') + mls_features, sslm, time_features, beat_times = extract_features(audio_file, out_dir + file_name + '.beats.txt') print("Computing CNN predictions") - predictions = compute_cnn_predictions(mls_features, sslm_features) + predictions = compute_cnn_predictions(mls_features, sslm, time_features) print("Get segment times") - segment_times = compute_segments_from_predictions(predictions, beat_times, beat_numbers) + segment_times = compute_segments_from_predictions(predictions, 
beat_times) print("\n") for f in segment_times: diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 7f3cdab..8658cda 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -39,12 +39,7 @@ def load_training_data(dataset): """ data = np.load(dataset) - train_x = data['train_x'] - train_sslm_x = data['train_sslm_x'] - train_y = data['train_y'] - train_weights = data['train_weights'] - - return train_x, train_sslm_x, train_y, train_weights + return data['train_x'], data['train_sslm_x'], data['train_time_x'], data['train_y'], data['train_weights'] def load_test_data(dataset): @@ -60,40 +55,34 @@ def load_test_data(dataset): """ data = np.load(dataset) - test_x = data['test_x'] - test_sslm_x = data['test_sslm_x'] - test_y = data['test_y'] - test_weights = data['test_weights'] - - return test_x, test_sslm_x, test_y, test_weights - - -def build_mls_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 1)) - x = layers.Conv2D(16, (6, 8), activation='relu')(input) - x = layers.MaxPooling2D(pool_size=(3, 6))(x) - return input, x - -def build_sslm_model(img_rows, img_cols): - input = layers.Input(shape=(img_rows, img_cols, 2)) - x = layers.Conv2D(16, (8, 8), activation='relu')(input) - x = layers.MaxPooling2D(pool_size=(6, 6))(x) - return input, x - -def build_fused_model(inputs, outputs): - x = layers.Concatenate(axis=1)(outputs) - x = layers.Conv2D(64, (6, 3), activation='relu')(x) - x = layers.Dropout(0.5)(x) - x = layers.Flatten()(x) - x = layers.Dense(256, activation='relu')(x) - x = layers.Dropout(0.5)(x) - x = layers.Dense(1, activation='sigmoid')(x) - return Model(inputs = inputs, outputs = x) + return data['test_x'], data['test_sslm_x'], data['test_time_x'], data['test_y'], data['test_weights'] + def build_model(mls_rows, mls_cols, sslm_shape): - mls_input, mls_output = build_mls_model(mls_rows, mls_cols) - sslm_input, sslm_output = build_sslm_model(sslm_shape, sslm_shape) - 
return build_fused_model([mls_input, sslm_input], [mls_output, sslm_output]) + mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') + mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) + mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + + sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), name='sslm_input') + sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) + sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + + merged = layers.Concatenate(axis=1, name='mls_slsm_concat')([mls, sslm]) + merged = layers.Conv2D(64, (6, 3), activation='relu', name='concat_conv')(merged) + merged = layers.Dropout(0.5, name='concat_dropout')(merged) + + merged = layers.Flatten()(merged) + + merged = layers.Dense(256, activation='relu', name='final_dense')(merged) + merged = layers.Dropout(0.5, name='final_dropout')(merged) + + time_input = layers.Input(shape=(4,), name='time_input') + time = layers.Dense(1, activation='relu', name='time_dense')(time_input) + merged = layers.Concatenate(name='final_concat')([merged, time]) + + merged = layers.Dense(1, activation='sigmoid', name='final_sigmoid')(merged) + + return Model(inputs=[mls_input, sslm_input, time_input], outputs = merged) def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ -106,7 +95,7 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh """ print('loading training data...') - X_train, x_sslm_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') + X_train, x_sslm_train, x_time_train, y_train, w_train = load_training_data('../Data/trainDataNormalized.npz') print('training data size:') print(X_train.shape) @@ -114,11 +103,13 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh p = np.random.permutation(X_train.shape[0]) X_train = X_train[p, :, :] x_sslm_train = 
x_sslm_train[p, :, :] + x_time_train = x_time_train[p] y_train = y_train[p] w_train = w_train[p] X_train = X_train.astype('float32') X_train = np.expand_dims(X_train, 3) + x_sslm_train = np.expand_dims(x_sslm_train, 3) img_rows = X_train.shape[1] img_cols = X_train.shape[2] @@ -131,31 +122,25 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh sgd = SGD(lr=0.05, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd) - early_stopping = EarlyStopping(monitor='val_loss', patience=10) + early_stopping = EarlyStopping(monitor='val_loss', patience=15) print('train model...') - model.fit(x=[X_train, x_sslm_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + model.fit(x=[X_train, x_sslm_train, x_time_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) - #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) - - #model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, - # verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[]) print('load test data...') - X_test, x_sslm_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') + X_test, x_sslm_test, x_time_test, y_test, w_test = load_test_data('../Data/testDataNormalized.npz') X_test = X_test.astype('float32') X_test = np.expand_dims(X_test, 3) + x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict([X_test, x_sslm_test], batch_size=1, verbose=1) - #preds = model.predict(X_test, batch_size=1, verbose=1) + preds = model.predict([X_test, x_sslm_test, x_time_test], batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate([X_test, x_sslm_test], 
y_test, verbose=1) - #score = model.evaluate(X_test, y_test, verbose=1) + score = model.evaluate([X_test, x_sslm_test, x_time_test], y_test, verbose=1) print('Test score:', score) # save model @@ -163,4 +148,4 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh if __name__ == "__main__": - train_model(nb_epoch=75) + train_model(nb_epoch=200) From 669a0150c2e48ecb299540a20d2dafa131af411f Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Thu, 8 Apr 2021 10:18:56 -0700 Subject: [PATCH 31/35] code cleanup --- Python/evaluation.py | 15 +++------------ Python/track_segmentation.py | 18 ++---------------- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 4d38983..8fff7db 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -111,18 +111,8 @@ def run_eval(f_measure_thresh): if len(preds_track) == 0: continue - if True: - pred_indexes = choose_preds(preds_track, beat_times) - pred_times = beat_times[pred_indexes] - else: - preds_track = post_processing(preds_track) - - # insert a zero value at the beginning of the predictions to help the peak-finding algorithm - # identify the first beat of a track - - peds_track = np.insert(preds_track, 0, 0) - peak_loc = peakutils.indexes(preds_track, min_dist=8, thres=0.1) - 1 - pred_times = beat_times[peak_loc] + pred_indexes = choose_preds(preds_track, beat_times) + pred_times = beat_times[pred_indexes] # compute f-measure f_score, p, r = mir_eval.onset.f_measure(np.sort(segment_times), np.sort(pred_times), window=f_measure_thresh) @@ -142,6 +132,7 @@ def get_sort_key(item): return item[1] if __name__ == "__main__": + run_eval(0.2) short = run_eval(0.5) long = run_eval(3.0) diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index a01f2ae..86ae88e 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -40,7 +40,6 @@ def compute_cnn_predictions(mls_features, sslm_features, time_features): 
mls_features = np.expand_dims(mls_features, 3) sslm_features = np.transpose(sslm_features, (2, 0, 1)) - #sslm_features = sslm_features[:, :, :, 0] # remove chroma for now predictions = model.predict([mls_features, sslm_features, time_features], batch_size=1) @@ -116,22 +115,9 @@ def compute_segments_from_predictions(predictions, beat_times): print("raw predicitions:") print_predictions(predictions, beat_times) + peak_loc = choose_preds(predictions, beat_times) - if True: - peak_loc = choose_preds(predictions, beat_times) - segment_times = beat_times[peak_loc] - #print("after post-processing:") - #print_predictions(peak_loc, beat_times) - - else: - predictions = post_processing(predictions) - - print("after post-processing:") - print_predictions(predictions, beat_times) - - predictions = np.insert(predictions, 0, 0) - peak_loc = peakutils.indexes(predictions, min_dist=8, thres=0.1) - 1 - segment_times = beat_times[peak_loc] + segment_times = beat_times[peak_loc] print("beat_num\ttime:") for i in peak_loc: From aff6e94272a3643b1398ad815308ae83b8bcc1bd Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Sat, 10 Apr 2021 04:04:30 -0700 Subject: [PATCH 32/35] move some parameters out to parameters.py --- Python/parameters.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Python/parameters.py diff --git a/Python/parameters.py b/Python/parameters.py new file mode 100644 index 0000000..4ad6222 --- /dev/null +++ b/Python/parameters.py @@ -0,0 +1,29 @@ +# thresholding value for prediction-choice algorithm. trade recall for accuracy here. 
+prediction_threshold = 0.3 + +# how many beats make up a context window for the MLS part of the network +context_length = 115 + +# number of Mel bands +num_mel_bands = 80 + +# how many frames to max-pool in building the SSLM +max_pool = 2 + +# how far back to calculate the SSLM (note that actual length will be max_pool * sslm_length) +sslm_length = 65 + +# how many more negative examples than segment boundaries +neg_frames_factor = 5 + +# oversample positive frames because there are too few +pos_frames_oversample = 5 + +# oversample frames between segments +mid_frames_oversample = 3 + +# how many frames are semi-positive examples around an annotation +label_smearing = 1 + +padding_length = int(context_length / 2) + From 3ae11035038aa3e3b5ef8fcbe1a5d2f64e0fefda Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Wed, 14 Apr 2021 23:16:28 -0700 Subject: [PATCH 33/35] move numbers into parameters.py. increase SSLM length. --- Python/evaluation.py | 4 ++-- Python/feature_extraction.py | 32 +++++++++++++------------------- Python/parameters.py | 2 +- Python/track_segmentation.py | 14 ++++++-------- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/Python/evaluation.py b/Python/evaluation.py index 8fff7db..05c50eb 100644 --- a/Python/evaluation.py +++ b/Python/evaluation.py @@ -12,12 +12,12 @@ import peakutils import mir_eval import paths +import parameters from operator import itemgetter predictions_path = '../Data/predsTestTracks_100epochs_lr005.npy' file_list_path = '../Data/fileListsAndIndex.pickle' -prediction_threshold = 0.3 def load_data(preds_file, file_lists): """ @@ -62,7 +62,7 @@ def choose_preds(preds, beat_times): else: preds_out[i] = 0 - return np.flatnonzero(preds_out > prediction_threshold) + return np.flatnonzero(preds_out > parameters.prediction_threshold) def post_processing(preds_track): diff --git a/Python/feature_extraction.py b/Python/feature_extraction.py index 8087a4f..503b711 100644 --- a/Python/feature_extraction.py +++ 
b/Python/feature_extraction.py @@ -32,16 +32,7 @@ import skimage.measure from scipy.spatial import distance - -context_length = 65 # how many beats make up a context window for the CNN -num_mel_bands = 80 # number of Mel bands -neg_frames_factor = 5 # how many more negative examples than segment boundaries -pos_frames_oversample = 5 # oversample positive frames because there are too few -mid_frames_oversample = 3 # oversample frames between segments -label_smearing = 1 # how many frames are positive examples around an annotation -padding_length = int(context_length / 2) - -max_pool = 2 +from parameters import * # for debugging if False: @@ -65,7 +56,7 @@ def compute_sslm(input_vector, beat_times, hop_size): x_hat_length = x_hat.shape[1] - sslm_shape = context_length * 3 # because we'll max pool it down at the end + sslm_shape = sslm_length * 3 # because we'll max pool it down at the end #Cosine distance calculation: D[N/p,L/p] matrix distances = np.full((x_hat_length, sslm_shape), 1.0, dtype=np.float32) #D has as dimensions N/p and L/p @@ -95,7 +86,7 @@ def compute_sslm(input_vector, beat_times, hop_size): sslm = np.transpose(sslm) beat_frames = np.round(beat_times * (22050. 
/ hop_size)).astype('int') - beat_sslms = np.zeros((context_length, context_length, beat_frames.shape[0]), dtype=np.float32) + beat_sslms = np.zeros((sslm_length, sslm_length, beat_frames.shape[0]), dtype=np.float32) for k in range(beat_frames.shape[0]): sslm_frame = beat_frames[k] // max_pool @@ -232,8 +223,8 @@ def gen_beat_mls(waveform, beat_times): return beat_mls waveform = None - beat_mls, waveform = with_audio_cache(f, '.mls.npy', waveform, beat_times, gen_beat_mls) - beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm.npy', waveform, beat_times, compute_mls_sslm) + beat_mls, waveform = with_audio_cache(f, '.mls_115.npy', waveform, beat_times, gen_beat_mls) + beat_mls_sslm, waveform = with_audio_cache(f, '.mls_sslm_115.npy', waveform, beat_times, compute_mls_sslm) #times, waveform = with_audio_cache(f, '.beat_time_ratios.npy', waveform, beat_times, compute_time_features) times = make_beat_time_features(beat_numbers) @@ -331,6 +322,7 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample if mean_vec is None: # subsample features + print("sampling") idx = random.sample(range(features.shape[0]), min(features.shape[0], subsample)) temp_features = features[idx, :, :] @@ -345,8 +337,9 @@ def normalize_features_per_band(features, mean_vec=None, std_vec=None, subsample mean_vec = np.mean(temp_features, axis=0) std_vec = np.std(temp_features, axis=0) - features = features - mean_vec[np.newaxis, :, np.newaxis] - features = features / std_vec[np.newaxis, :, np.newaxis] + print("modifying...") + features -= mean_vec[np.newaxis, :, np.newaxis] + features /= std_vec[np.newaxis, :, np.newaxis] return features, mean_vec, std_vec @@ -367,7 +360,7 @@ def prepare_batch_data(feature_list, sslm_feature_list, time_feature_list, label # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) - data_sslm_x = np.zeros(shape=(n_preallocate, context_length, 
context_length), dtype=np.float32) + data_sslm_x = np.zeros(shape=(n_preallocate, sslm_length, sslm_length), dtype=np.float32) data_time_x = np.zeros(shape=(n_preallocate, time_feature_list[0].shape[1]), dtype=np.float32) data_y = np.zeros(shape=(n_preallocate,), dtype=np.float32) data_weight = np.zeros(shape=(n_preallocate,), dtype=np.float32) @@ -407,7 +400,7 @@ def add_feature(idx, label, weight=1): for k in positive_frames_idx: add_feature(k, label=1) - ## apply label smearing: set labels around annotation to 1 and give them a triangular weight + # apply label smearing: set labels around annotation to 1 and give them a triangular weight for l in range(k - label_smearing, k + label_smearing + 1): # don't smear into padding. @@ -449,7 +442,7 @@ def add_feature(idx, label, weight=1): if feature_count > n_preallocate: break - data_x = data_x[:feature_count, :, :] + data_x.resize((feature_count, data_x.shape[1], data_x.shape[2])) data_sslm_x.resize((feature_count, data_sslm_x.shape[1], data_sslm_x.shape[2])) data_time_x.resize((feature_count, data_time_x.shape[1])) data_y = data_y[:feature_count] @@ -510,6 +503,7 @@ def load_raw_features(file): train_x, train_sslm_x, train_time_x, train_y, train_weights, train_idx = prepare_batch_data(train_features, train_sslm_features, train_time_features, train_labels, is_training=True) test_x, test_sslm_x, test_time_x, test_y, test_weights, test_idx = prepare_batch_data(test_features, test_sslm_features, test_time_features, test_labels, is_training=False) + print("normalizing features") train_x, mean_vec, std_vec = normalize_features_per_band(train_x) test_x, mean_vec, std_vec = normalize_features_per_band(test_x, mean_vec, std_vec) diff --git a/Python/parameters.py b/Python/parameters.py index 4ad6222..3d0e161 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -11,7 +11,7 @@ max_pool = 2 # how far back to calculate the SSLM (note that actual length will be max_pool * sslm_length) -sslm_length = 65 +sslm_length = 
115 # how many more negative examples than segment boundaries neg_frames_factor = 5 diff --git a/Python/track_segmentation.py b/Python/track_segmentation.py index 86ae88e..ecce646 100644 --- a/Python/track_segmentation.py +++ b/Python/track_segmentation.py @@ -16,14 +16,12 @@ from feature_extraction import compute_features, normalize_features_per_band from evaluation import post_processing, choose_preds from train_segmentation_cnn import build_model -import peakutils normalization_path = '../Data/normalization.npz' model_weights = '../Data/model_weights_100epochs_lr005.h5' out_dir = '../Temp/' -num_mel_bands = 80 -context_length = 65 -padding = int(context_length / 2) + +from parameters import context_length, num_mel_bands, padding_length def build_full_model(): @@ -79,8 +77,8 @@ def compute_context_windows(features): n_preallocate = 10000 - features = np.hstack((0.001 * np.random.rand(num_mel_bands, padding), features, - 0.001 * np.random.rand(num_mel_bands, padding))) + features = np.hstack((0.001 * np.random.rand(num_mel_bands, padding_length), features, + 0.001 * np.random.rand(num_mel_bands, padding_length))) # initialize arrays for storing context windows data_x = np.zeros(shape=(n_preallocate, num_mel_bands, context_length), dtype=np.float32) @@ -88,11 +86,11 @@ def compute_context_windows(features): feature_count = 0 num_padded_features = features.shape[1] - for k in range(padding, num_padded_features - padding): + for k in range(padding_length, num_padded_features - padding_length): if feature_count > n_preallocate: break - next_window = features[:, k-padding: k+padding+1] + next_window = features[:, k-padding_length: k+padding_length+1] data_x[feature_count, :, :] = next_window feature_count += 1 From 73914d5b5b145b4415b0b2dbfebf547d7870b180 Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 16 Apr 2021 04:35:16 -0700 Subject: [PATCH 34/35] allow for toggling different parts of the network for testing --- Python/parameters.py | 4 ++ 
Python/train_segmentation_cnn.py | 74 ++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/Python/parameters.py b/Python/parameters.py index 3d0e161..e9b5ad4 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -1,6 +1,10 @@ # thresholding value for prediction-choice algorithm. trade recall for accuracy here. prediction_threshold = 0.3 +# should we include (MLS, SSLM, beat #) features when training? +#training_features = {'mls', 'sslm', 'beat_numbers'} +training_features = {'mls', 'beat_numbers'} + # how many beats make up a context window for the MLS part of the network context_length = 115 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index 8658cda..d951ab2 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -25,6 +25,7 @@ np.random.seed(1235) # for reproducibility +import parameters def load_training_data(dataset): """ @@ -59,15 +60,29 @@ def load_test_data(dataset): def build_model(mls_rows, mls_cols, sslm_shape): - mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') - mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) - mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + inputs = [] + merged_input = [] - sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), name='sslm_input') - sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) - sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + if 'mls' in parameters.training_features: + mls_input = layers.Input(shape=(mls_rows, mls_cols, 1), name='mls_input') + mls = layers.Conv2D(16, (6, 8), activation='relu', name='mls_conv')(mls_input) + mls = layers.MaxPooling2D(pool_size=(3, 6), name='mls_maxpool')(mls) + merged_input.append(mls) + inputs.append(mls_input) + + if 'sslm' in parameters.training_features: + sslm_input = layers.Input(shape=(sslm_shape, sslm_shape, 1), 
name='sslm_input') + sslm = layers.Conv2D(16, (8, 8), activation='relu', name='sslm_conv')(sslm_input) + sslm = layers.MaxPooling2D(pool_size=(6, 6), name='sslm_maxpool')(sslm) + + merged_input.append(sslm) + inputs.append(sslm_input) + + if len(merged_input) > 1: + merged = layers.Concatenate(axis=1, name='mls_sslm_concat')(merged_input) + else: + merged = merged_input[0] - merged = layers.Concatenate(axis=1, name='mls_slsm_concat')([mls, sslm]) merged = layers.Conv2D(64, (6, 3), activation='relu', name='concat_conv')(merged) merged = layers.Dropout(0.5, name='concat_dropout')(merged) @@ -76,13 +91,34 @@ def build_model(mls_rows, mls_cols, sslm_shape): merged = layers.Dense(256, activation='relu', name='final_dense')(merged) merged = layers.Dropout(0.5, name='final_dropout')(merged) - time_input = layers.Input(shape=(4,), name='time_input') - time = layers.Dense(1, activation='relu', name='time_dense')(time_input) - merged = layers.Concatenate(name='final_concat')([merged, time]) + final_dense_input = [merged] + if 'beat_numbers' in parameters.training_features: + time_input = layers.Input(shape=(4,), name='time_input') + time = layers.Dense(1, activation='relu', name='time_dense')(time_input) + final_dense_input.append(time) + inputs.append(time_input) + + if len(final_dense_input) > 1: + merged = layers.Concatenate(name='final_concat')(final_dense_input) + else: + merged = final_dense_input[0] merged = layers.Dense(1, activation='sigmoid', name='final_sigmoid')(merged) - return Model(inputs=[mls_input, sslm_input, time_input], outputs = merged) + return Model(inputs=inputs, outputs = merged) + +def make_input(mls, sslm, time): + input = [] + if 'mls' in parameters.training_features: + input.append(mls) + + if 'sslm' in parameters.training_features: + input.append(sslm) + + if 'beat_numbers' in parameters.training_features: + input.append(time) + + return input def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weights_file=None): """ @@ 
-100,7 +136,12 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh print('training data size:') print(X_train.shape) + img_rows = X_train.shape[1] + img_cols = X_train.shape[2] + model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) + p = np.random.permutation(X_train.shape[0]) + X_train = X_train[p, :, :] x_sslm_train = x_sslm_train[p, :, :] x_time_train = x_time_train[p] @@ -111,10 +152,6 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh X_train = np.expand_dims(X_train, 3) x_sslm_train = np.expand_dims(x_sslm_train, 3) - img_rows = X_train.shape[1] - img_cols = X_train.shape[2] - - model = build_model(img_rows, img_cols, x_sslm_train.shape[1]) if weights_file is not None: model.load_weights(weights_file) @@ -125,7 +162,8 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh early_stopping = EarlyStopping(monitor='val_loss', patience=15) print('train model...') - model.fit(x=[X_train, x_sslm_train, x_time_train], y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, + + model.fit(x=make_input(X_train, x_sslm_train, x_time_train), y=y_train, batch_size=batch_size, epochs=nb_epoch, shuffle=True, verbose=1, validation_split=0.1, sample_weight=w_train, callbacks=[early_stopping]) print('load test data...') @@ -135,12 +173,12 @@ def train_model(batch_size=128, nb_epoch=100, save_ext='_100epochs_lr005', weigh x_sslm_test = np.expand_dims(x_sslm_test, 3) print('predict test data...') - preds = model.predict([X_test, x_sslm_test, x_time_test], batch_size=1, verbose=1) + preds = model.predict(make_input(X_test, x_sslm_test, x_time_test), batch_size=1, verbose=1) print('saving results...') np.save('../Data/predsTestTracks' + save_ext + '.npy', preds) - score = model.evaluate([X_test, x_sslm_test, x_time_test], y_test, verbose=1) + score = model.evaluate(make_input(X_test, x_sslm_test, x_time_test), y_test, verbose=1) print('Test score:', score) # save model 
From edefd022310a198ad1db315306795bfd76b62a3f Mon Sep 17 00:00:00 2001 From: Ben Osheroff Date: Fri, 16 Apr 2021 13:51:55 -0700 Subject: [PATCH 35/35] mmap stuff. new tracks. --- Data/test_tracks.txt | 14 ++++ Data/train_tracks.txt | 120 +++++++++++++++++++++++++++++++ Python/parameters.py | 3 +- Python/train_segmentation_cnn.py | 4 +- 4 files changed, 137 insertions(+), 4 deletions(-) diff --git a/Data/test_tracks.txt b/Data/test_tracks.txt index dd02f71..d7d6f00 100644 --- a/Data/test_tracks.txt +++ b/Data/test_tracks.txt @@ -90,3 +90,17 @@ 355.m4a 728.m4a 531.m4a +549.m4a +10050.mp3 +437.m4a +855.m4a +951.m4a +653.m4a +879.m4a +935.m4a +835.m4a +629.m4a +10051.mp3 +541.m4a +893.m4a +341.m4a diff --git a/Data/train_tracks.txt b/Data/train_tracks.txt index 7992c7c..d6a95cc 100644 --- a/Data/train_tracks.txt +++ b/Data/train_tracks.txt @@ -744,3 +744,123 @@ 439.m4a 79.m4a 782.m4a +803.m4a +605.m4a +10086.m4a +53.m4a +10075.m4a +823.m4a +10079.m4a +685.m4a +10087.m4a +10088.m4a +533.m4a +10081.m4a +701.m4a +901.m4a +39.m4a +39.m4a +827.m4a +525.m4a +933.m4a +10045.mp3 +389.m4a +10078.m4a +1655.m4a +10070.m4a +799.m4a +581.m4a +85.m4a +10062.m4a +597.m4a +943.m4a +565.m4a +10068.m4a +10074.m4a +445.m4a +10044.mp3 +1651.m4a +10058.m4a +829.m4a +909.m4a +557.m4a +381.m4a +621.m4a +485.m4a +931.m4a +413.m4a +357.m4a +839.m4a +10072.m4a +911.m4a +493.m4a +1635.m4a +1647.m4a +733.m4a +10091.m4a +837.m4a +10052.mp3 +1627.m4a +10054.mp3 +429.m4a +10071.m4a +10059.m4a +645.m4a +859.m4a +10063.m4a +501.m4a +21.m4a +10049.mp3 +10056.mp3 +10084.m4a +863.m4a +10090.m4a +10053.mp3 +10076.m4a +1607.m4a +895.m4a +10083.m4a +795.m4a +10048.mp3 +517.m4a +10080.m4a +853.m4a +851.m4a +847.m4a +10069.m4a +477.m4a +589.m4a +861.m4a +333.m4a +10073.m4a +10057.m4a +941.m4a +1643.m4a +677.m4a +661.m4a +10067.m4a +10082.m4a +10089.m4a +1619.m4a +1623.m4a +1615.m4a +831.m4a +10047.mp3 +397.m4a +693.m4a +10066.m4a +10055.mp3 +10046.mp3 +573.m4a +10077.m4a +819.m4a +461.m4a +10085.m4a +813.m4a 
+10061.m4a +10065.m4a +949.m4a +469.m4a +309.m4a +709.m4a +10060.m4a diff --git a/Python/parameters.py b/Python/parameters.py index e9b5ad4..b048dc7 100644 --- a/Python/parameters.py +++ b/Python/parameters.py @@ -2,8 +2,7 @@ prediction_threshold = 0.3 # should we include (MLS, SSLM, beat #) features when training? -#training_features = {'mls', 'sslm', 'beat_numbers'} -training_features = {'mls', 'beat_numbers'} +training_features = {'mls', 'sslm', 'beat_numbers'} # how many beats make up a context window for the MLS part of the network context_length = 115 diff --git a/Python/train_segmentation_cnn.py b/Python/train_segmentation_cnn.py index d951ab2..c2680e7 100644 --- a/Python/train_segmentation_cnn.py +++ b/Python/train_segmentation_cnn.py @@ -39,7 +39,7 @@ def load_training_data(dataset): :return train_weights (n_items x 1) """ - data = np.load(dataset) + data = np.load(dataset, mmap_mode='r') return data['train_x'], data['train_sslm_x'], data['train_time_x'], data['train_y'], data['train_weights'] @@ -55,7 +55,7 @@ def load_test_data(dataset): :return test_weights (n_items x 1) """ - data = np.load(dataset) + data = np.load(dataset, mmap_mode='r') return data['test_x'], data['test_sslm_x'], data['test_time_x'], data['test_y'], data['test_weights']