Skip to content

Commit

Permalink
Update registry with more languages commonvoice
Browse files Browse the repository at this point in the history
  • Loading branch information
liPatrick committed Oct 31, 2024
1 parent 487e939 commit df39685
Showing 1 changed file with 28 additions and 0 deletions.
28 changes: 28 additions & 0 deletions ultravox/data/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,34 @@
splits=[types.DatasetSplitConfig(name="train", num_samples=26_377)],
)

# Registry entry "commonvoice-hi": selects the "hi" subset on top of the
# "commonvoice" base config and declares one "train" split of 4,690 samples.
CV_HI_CONFIG = types.DatasetConfig(
    name="commonvoice-hi",
    base="commonvoice",
    subset="hi",
    splits=[
        types.DatasetSplitConfig(name="train", num_samples=4_690),
    ],
)

# Registry entry "commonvoice-sv-se": selects the "sv-SE" subset on top of the
# "commonvoice" base config and declares one "train" split of 7,740 samples.
# Note that the registry name is lower-case while the subset id keeps "SE"
# upper-case.
CV_SV_SE_CONFIG = types.DatasetConfig(
    name="commonvoice-sv-se",
    base="commonvoice",
    subset="sv-SE",
    splits=[
        types.DatasetSplitConfig(name="train", num_samples=7_740),
    ],
)

# Registry entry "commonvoice-tr": selects the "tr" subset on top of the
# "commonvoice" base config and declares one "train" split of 35,100 samples.
CV_TR_CONFIG = types.DatasetConfig(
    name="commonvoice-tr",
    base="commonvoice",
    subset="tr",
    splits=[
        types.DatasetSplitConfig(name="train", num_samples=35_100),
    ],
)

# Registry entry "commonvoice-uk": selects the "uk" subset on top of the
# "commonvoice" base config and declares one "train" split of 25,100 samples.
CV_UK_CONFIG = types.DatasetConfig(
    name="commonvoice-uk",
    base="commonvoice",
    subset="uk",
    splits=[
        types.DatasetSplitConfig(name="train", num_samples=25_100),
    ],
)

GS_XL_CONFIG = types.DatasetConfig(
name="gigaspeech",
path="speechcolab/gigaspeech",
Expand Down

0 comments on commit df39685

Please sign in to comment.