Update

RobinSmits · Jul 2, 2023 · 152e010 · 152e010
1 parent bfd1d1a
commit 152e010
Show file tree

Hide file tree

Showing 7 changed files with 160 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,2 +1,37 @@
-# Dutch-LLMs
+# Dutch LLMs
 Various training, inference and validation code and results related to Open LLMs that were pretrained (fully or partially) on the Dutch language.
+
+## Training
+
+<< TODO >>
+
+## Evaluation
+
+<< TODO >>
+
+
+## Datasets
+
+Below is a description of the various dataset(s) that have been used for training and evaluation.
+
+### Alpaca Dutch Cleaned
+
+Alpaca is a dataset containing roughly 51K rows of data that can be used to finetune any Large Language Model. The original dataset is in the English language only.
+
+Recently I came across a version of the dataset that was completely translated into the Dutch language. Use the following link for the dataset: [Alpaca Dutch Cleaned](https://huggingface.co/datasets/BramVanroy/alpaca-cleaned-dutch)
+
+During training of the first Colab Notebook, the dataset was split into a training and a validation part. The size of the validation set is 2048 rows.
+Since I would like to be able to compare the various training runs and evaluation results, the training and validation datasets are stored within a subfolder (alpaca_clean_dutch) in this GitHub repo.
+
+## References
+
+```
+@misc{https://doi.org/10.57967/hf/0530,
+  doi = {10.57967/HF/0530},
+  url = {https://huggingface.co/datasets/BramVanroy/alpaca-cleaned-dutch},
+  author = {{Bram Vanroy}},
+  title = {{A}lpaca {C}leaned {D}utch},
+  publisher = {Hugging Face},
+  year = {2023}
+}
+```
diff --git a/alpaca_clean_dutch/train_data/data-00000-of-00001.arrow b/alpaca_clean_dutch/train_data/data-00000-of-00001.arrow
diff --git a/alpaca_clean_dutch/train_data/dataset_info.json b/alpaca_clean_dutch/train_data/dataset_info.json
@@ -0,0 +1,49 @@
+{
+  "builder_name": "json",
+  "citation": "",
+  "config_name": "BramVanroy--alpaca-cleaned-dutch",
+  "dataset_size": 22014685,
+  "description": "",
+  "download_checksums": {
+    "https://huggingface.co/datasets/BramVanroy/alpaca-cleaned-dutch/resolve/79e4cc109558b26e8f30f44adb768b8f9709dfba/alpaca_data_cleaned-dutch.jsonl": {
+      "num_bytes": 24355992,
+      "checksum": null
+    }
+  },
+  "download_size": 24355992,
+  "features": {
+    "id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "instruction": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "input": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "output": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 46370677,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 22014685,
+      "num_examples": 51712,
+      "dataset_name": "json"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
diff --git a/alpaca_clean_dutch/train_data/state.json b/alpaca_clean_dutch/train_data/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "0f7260bc068c7e5e",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
diff --git a/alpaca_clean_dutch/val_data/data-00000-of-00001.arrow b/alpaca_clean_dutch/val_data/data-00000-of-00001.arrow
diff --git a/alpaca_clean_dutch/val_data/dataset_info.json b/alpaca_clean_dutch/val_data/dataset_info.json
@@ -0,0 +1,49 @@
+{
+  "builder_name": "json",
+  "citation": "",
+  "config_name": "BramVanroy--alpaca-cleaned-dutch",
+  "dataset_size": 22014685,
+  "description": "",
+  "download_checksums": {
+    "https://huggingface.co/datasets/BramVanroy/alpaca-cleaned-dutch/resolve/79e4cc109558b26e8f30f44adb768b8f9709dfba/alpaca_data_cleaned-dutch.jsonl": {
+      "num_bytes": 24355992,
+      "checksum": null
+    }
+  },
+  "download_size": 24355992,
+  "features": {
+    "id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "instruction": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "input": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "output": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 46370677,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 22014685,
+      "num_examples": 51712,
+      "dataset_name": "json"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
diff --git a/alpaca_clean_dutch/val_data/state.json b/alpaca_clean_dutch/val_data/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "40ef988d605ceaae",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}