Skip to content

Commit

Permalink
Merge pull request #89 from valentinp72/master
Browse files Browse the repository at this point in the history
Added options to bypass ffmpeg usage in some cases; Pyro client now uses argparse
  • Loading branch information
DavidDoukhan authored Dec 2, 2024
2 parents 69a78bc + 76df66e commit 2cac771
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 18 deletions.
23 changes: 22 additions & 1 deletion inaSpeechSegmenter/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,26 @@ def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmp
Convert media to temp wav 16k mono and return signal
"""

if ffmpeg is None:
if start_sec is not None or stop_sec is not None:
raise NotImplementedError(
f'start_sec={start_sec} and stop_sec={stop_sec} cannot be set ' \
f' when running inaSpeechSegmenter without ffmpeg. Please cut '\
f'down your audio files beforehand or use ffmpeg.'
)
if medianame.startswith('http://') or medianame.startswith('https://'):
raise NotImplementedError(
f'Without ffmpeg you cannot process media content on http ' \
f'servers. You need to download your audio files beforehand ' \
f'or use ffmpeg. You gave medianame={medianame}.'
)

sig, sr = sf.read(medianame, dtype=dtype)
assert sr == 16_000, \
f'Without ffmpeg, inaSpeechSegmenter can only take files sampled ' \
f'at 16000 Hz. The file {medianame} is sampled at {sr} Hz.'
return sig

base, _ = os.path.splitext(os.path.basename(medianame))

with tempfile.TemporaryDirectory(dir=tmpdir) as tmpdirname:
Expand All @@ -57,4 +77,5 @@ def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmp
# Get Mel Power Spectrogram and Energy
sig, sr = sf.read(tmpwav, dtype=dtype)
assert sr == 16000
return sig
return sig

7 changes: 4 additions & 3 deletions inaSpeechSegmenter/segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,10 @@ def __init__(self, vad_engine='smn', detect_gender=True, ffmpeg='ffmpeg', batch_
default value (32) is slow, but works on any hardware
"""

# test ffmpeg installation
if shutil.which(ffmpeg) is None:
raise(Exception("""ffmpeg program not found"""))
if ffmpeg is not None:
# test ffmpeg installation
if shutil.which(ffmpeg) is None:
raise(Exception("""ffmpeg program not found"""))
self.ffmpeg = ffmpeg

# set energic ratio for 1st VAD
Expand Down
Binary file added media/musanmix.wav
Binary file not shown.
22 changes: 22 additions & 0 deletions run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ def seg2str(iseg, tseg):
self.assertEqual(curstop, nextstart,
'%s VS %s' % (seg2str(i, ret[i]), seg2str(i+1, ret[i+1])))

seg = Segmenter(ffmpeg=None)
ret = seg('./media/musanmix.wav')
for i in range(len(ret) -1):
curstop = ret[i][2]
nextstart = ret[i+1][1]
self.assertEqual(curstop, nextstart,
'%s VS %s' % (seg2str(i, ret[i]), seg2str(i+1, ret[i+1])))

def test_processingresult(self):
seg = Segmenter(vad_engine='sm')
ret = seg('./media/musanmix.mp3')
Expand All @@ -87,6 +95,14 @@ def test_processingresult(self):
self.assertEqual([e[0] for e in ref], [e[0] for e in ret])
np.testing.assert_almost_equal([e[1] for e in ref], [e[1] for e in ret])
np.testing.assert_almost_equal([e[2] for e in ref], [e[2] for e in ret])

seg = Segmenter(vad_engine='sm', ffmpeg=None)
ret = seg('./media/musanmix.wav')
df = pd.read_csv('./media/musanmix-sm-gender.csv', sep='\t')
ref = [(l.labels, float(l.start), float(l.stop)) for _, l in df.iterrows()]
self.assertEqual([e[0] for e in ref], [e[0] for e in ret])
np.testing.assert_almost_equal([e[1] for e in ref], [e[1] for e in ret])
np.testing.assert_almost_equal([e[2] for e in ref], [e[2] for e in ret])

def test_batch(self):
seg = Segmenter(vad_engine='sm')
Expand All @@ -96,6 +112,12 @@ def test_batch(self):
self.assertTrue(filecmp.cmp(lout[0], lout[1]))
self.assertTrue(filecmp.cmp(lout[0], './media/musanmix-sm-gender.csv'))

seg = Segmenter(vad_engine='sm', ffmpeg=None)
with tempfile.TemporaryDirectory() as tmpdirname:
lout = [os.path.join(tmpdirname, '1.1.csv'), os.path.join(tmpdirname, '2.1.csv')]
ret = seg.batch_process(['./media/musanmix.wav', './media/musanmix.wav'], lout)
self.assertTrue(filecmp.cmp(lout[0], lout[1]))
self.assertTrue(filecmp.cmp(lout[0], './media/musanmix-sm-gender.csv'))

def test_praat_export(self):
seg = Segmenter()
Expand Down
6 changes: 5 additions & 1 deletion scripts/ina_speech_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,16 @@
parser.add_argument('-s', '--batch_size', type=int, default=32, help="(default: 32 - we recommend 1024). Size of batches to be sent to the GPU. Larger values allow faster processings, but require GPU with more memories. Default 32 size is fine even with a baseline laptop GPU.")
parser.add_argument('-d', '--vad_engine', choices=['sm', 'smn'], default='smn', help="Voice activity detection (VAD) engine to be used (default: 'smn'). 'smn' split signal into 'speech', 'music' and 'noise' (better). 'sm' split signal into 'speech' and 'music' and do not take noise into account, which is either classified as music or speech. Results presented in ICASSP were obtained using 'sm' option")
parser.add_argument('-g', '--detect_gender', choices = ['true', 'false'], default='True', help="(default: 'true'). If set to 'true', segments detected as speech will be splitted into 'male' and 'female' segments. If set to 'false', segments corresponding to speech will be labelled as 'speech' (faster)")
parser.add_argument('-b', '--ffmpeg_binary', default='ffmpeg', help='Your custom binary of ffmpeg', required=False)
parser.add_argument('-b', '--ffmpeg_binary', default='ffmpeg', help='Your custom binary of ffmpeg. Set `None` to disable ffmpeg.', required=False)
parser.add_argument('-e', '--export_format', choices = ['csv', 'textgrid'], default='csv', help="(default: 'csv'). If set to 'csv', result will be exported in csv. If set to 'textgrid', results will be exported to praat Textgrid")
parser.add_argument('-r', '--energy_ratio', default=0.03, type=float, help="(default: 0.03). Energetic threshold used to detect activity (percentage of mean energy of the signal)")

args = parser.parse_args()

if args.ffmpeg_binary.lower() == "none" or args.ffmpeg_binary == "":
print("Disabling ffmpeg. Make sure your audio files are already sampled at 16kHz.")
args.ffmpeg_binary = None

# Preprocess arguments and check their consistency
input_files = []
for e in args.input:
Expand Down
43 changes: 30 additions & 13 deletions scripts/ina_speech_segmenter_pyro_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,33 +25,50 @@


import Pyro4
import sys
import os
import socket

from inaSpeechSegmenter import Segmenter

import argparse

if __name__ == '__main__':
dname = os.path.dirname(os.path.realpath(__file__))
parser = argparse.ArgumentParser(
description='Start a inaSpeechSegmenter Pyro client.'
)
parser.add_argument(
'uri', type=str,
help='URI of the Pyro server to connect and get jobs from.'
)
parser.add_argument(
'--batch_size', type=int, default=1024,
help='Batch size to use. Use lower values with small GPUs.'
)
parser.add_argument(
'--ffmpeg_binary', default='ffmpeg', type=str,
help='Your custom binary of ffmpeg. Set `None` to disable ffmpeg.'
)
args = parser.parse_args()

hostname = socket.gethostname()
if args.ffmpeg_binary.lower() == "none" or args.ffmpeg_binary == "":
print("Disabling ffmpeg. Make sure your audio files are already sampled at 16kHz.")
args.ffmpeg_binary = None

uri = sys.argv[1]
jobserver = Pyro4.Proxy(uri)
dname = os.path.dirname(os.path.realpath(__file__))
hostname = socket.gethostname()
jobserver = Pyro4.Proxy(args.uri)

ret = -1
outname = 'init'

# batch size set at 1024. Use lower values with small gpus
g = Segmenter(batch_size=1024)

from inaSpeechSegmenter import Segmenter
g = Segmenter(batch_size=args.batch_size, ffmpeg=args.ffmpeg_binary)

while True:
lsrc, ldst = jobserver.get_njobs('%s %s' % (hostname, ret))

print(lsrc, ldst)
if len(lsrc) == 0:
print('job list finished')
break
ret = g.batch_process(lsrc, ldst, skipifexist=True, nbtry=3)

ret = g.batch_process(lsrc, ldst, skipifexist=True, nbtry=3)

0 comments on commit 2cac771

Please sign in to comment.