From 48d5d4bf6c54418cc5f6963c04ff894cc6b98233 Mon Sep 17 00:00:00 2001
From: Aravind Putrevu <aravind.putrevu@gmail.com>
Date: Fri, 30 Aug 2024 22:12:33 +0200
Subject: [PATCH] Synthetic Instruction Dataset Notebook (#86)

* Synthetic Instruction Dataset Notebook

* minor fix
---
 .../Llama_3_1_Synthetic_Data.ipynb            | 392 ++++++++++++++++++
 1 file changed, 392 insertions(+)
 create mode 100644 examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb

diff --git a/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb b/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb
new file mode 100644
index 0000000..75cacb9
--- /dev/null
+++ b/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb
@@ -0,0 +1,392 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "71a43144",
+      "metadata": {
+        "id": "71a43144"
+      },
+      "source": [
+        "# Get Started with Llama 3.1 Models\n",
+        "\n",
+        "\n",
+        "The Llama 3.1 release comes in three model sizes: 8B, 70B, and 405B.\n",
+        "\n",
+        "In this notebook, we will look at:\n",
+        "\n",
+        "*  How to access the Llama 3.1 models over an API\n",
+        "*  Generate Structured Synthetic Instruction Dataset with Llama 3.1 405B\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "TWZqnPr0pcv8",
+      "metadata": {
+        "id": "TWZqnPr0pcv8"
+      },
+      "source": [
+        "## Setup\n",
+        "\n",
+        "Install all the dependencies and import the required Python modules."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "YYRdRzHuHkjz",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "YYRdRzHuHkjz",
+        "outputId": "c497ab98-5b58-44cc-e1ef-3a67c65a4e7e"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Collecting fireworks-ai\n",
+            "  Downloading fireworks_ai-0.15.0-py3-none-any.whl.metadata (5.3 kB)\n",
+            "Collecting httpx (from fireworks-ai)\n",
+            "  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)\n",
+            "Collecting httpx-sse (from fireworks-ai)\n",
+            "  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
+            "Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (from fireworks-ai) (2.8.2)\n",
+            "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from fireworks-ai) (9.4.0)\n",
+            "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (3.7.1)\n",
+            "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (2024.7.4)\n",
+            "Collecting httpcore==1.* (from httpx->fireworks-ai)\n",
+            "  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)\n",
+            "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (3.7)\n",
+            "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (1.3.1)\n",
+            "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->fireworks-ai)\n",
+            "  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n",
+            "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (0.7.0)\n",
+            "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (2.20.1)\n",
+            "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (4.12.2)\n",
+            "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx->fireworks-ai) (1.2.2)\n",
+            "Downloading fireworks_ai-0.15.0-py3-none-any.whl (83 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.0/84.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
+            "Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hInstalling collected packages: httpx-sse, h11, httpcore, httpx, fireworks-ai\n",
+            "Successfully installed fireworks-ai-0.15.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 httpx-sse-0.4.0\n"
+          ]
+        }
+      ],
+      "source": [
+        "pip install --upgrade fireworks-ai"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "sV4u2hezqV4X",
+      "metadata": {
+        "id": "sV4u2hezqV4X"
+      },
+      "source": [
+        "## Set Up Your API Key\n",
+        "\n",
+        "To use Llama 3.1, you must first obtain a Fireworks API key. If you don't already have one, you can get one by following the instructions [here](https://docs.fireworks.ai/getting-started/quickstart)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "M9EtJxjJrlyD",
+      "metadata": {
+        "id": "M9EtJxjJrlyD"
+      },
+      "outputs": [],
+      "source": [
+        "from fireworks.client import Fireworks\n",
+        "\n",
+        "#replace the FIREWORKS_API_KEY with the key copied in the above step.\n",
+        "client = Fireworks(api_key=\"FIREWORKS_API_KEY\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "FrFtrGQ6r-Mk",
+      "metadata": {
+        "id": "FrFtrGQ6r-Mk"
+      },
+      "source": [
+        "## Accessing Llama 3.1 Models using API\n",
+        "\n",
+        "We are sending a request to the Llama 3.1 405B model. Alternatively, you can change the model string to access the other models:\n",
+        "\n",
+        "* accounts/fireworks/models/llama-v3p1-70b-instruct\n",
+        "* accounts/fireworks/models/llama-v3p1-8b-instruct"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "TVGsp9wvtO2Q",
+      "metadata": {
+        "id": "TVGsp9wvtO2Q"
+      },
+      "source": [
+        "### Chat Completions API"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "DHAgBTk8Hpg_",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DHAgBTk8Hpg_",
+        "outputId": "a2060088-0fac-4c29-c388-60c3760e8c0f"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "I'm an artificial intelligence model known as Llama. Llama stands for \"Large Language Model Meta AI.\"\n"
+          ]
+        }
+      ],
+      "source": [
+        "model_name = \"accounts/fireworks/models/llama-v3p1-405b-instruct\"\n",
+        "\n",
+        "response = client.chat.completions.create(\n",
+        "\tmodel=model_name,\n",
+        "\tmessages=[{\n",
+        "\t\t\"role\": \"user\",\n",
+        "\t\t\"content\": \"Who are you?\",\n",
+        "\t}],\n",
+        ")\n",
+        "print(response.choices[0].message.content)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "O5i7rqkhJQJN",
+      "metadata": {
+        "id": "O5i7rqkhJQJN"
+      },
+      "source": [
+        "## Generate Synthetic Data\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "yxVgS4xuBhFF",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "yxVgS4xuBhFF",
+        "outputId": "1ce8ebe2-c924-4065-d2c7-87e2a8d5e019"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (2.8.2)\n",
+            "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic) (0.7.0)\n",
+            "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic) (2.20.1)\n",
+            "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic) (4.12.2)\n"
+          ]
+        }
+      ],
+      "source": [
+        "pip install pydantic"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "wcOA_lp1B9lD",
+      "metadata": {
+        "id": "wcOA_lp1B9lD"
+      },
+      "outputs": [],
+      "source": [
+        "from pydantic import BaseModel, Field"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "D9xgnLuGCAQi",
+      "metadata": {
+        "id": "D9xgnLuGCAQi"
+      },
+      "outputs": [],
+      "source": [
+        "from pydantic import BaseModel, Field\n",
+        "from typing import List, Optional\n",
+        "from enum import Enum\n",
+        "\n",
+        "\n",
+        "class Category(str, Enum):\n",
+        "    COUNTRIES = \"Countries\"\n",
+        "    CAPITALS = \"Capitals\"\n",
+        "    RIVERS = \"Rivers\"\n",
+        "    MOUNTAINS = \"Mountains\"\n",
+        "    LANDMARKS = \"Landmarks\"\n",
+        "    CLIMATE = \"Climate\"\n",
+        "    CULTURE = \"Culture\"\n",
+        "\n",
+        "class Difficulty(str, Enum):\n",
+        "    EASY = \"Easy\"\n",
+        "    MEDIUM = \"Medium\"\n",
+        "    HARD = \"Hard\"\n",
+        "    EXPERT = \"Expert\"\n",
+        "\n",
+        "class QuestionType(str, Enum):\n",
+        "    MULTIPLE_CHOICE = \"Multiple Choice\"\n",
+        "    TRUE_FALSE = \"True/False\"\n",
+        "    FILL_IN_THE_BLANK = \"Fill in the Blank\"\n",
+        "    SHORT_ANSWER = \"Short Answer\"\n",
+        "\n",
+        "class Question(BaseModel):\n",
+        "    instruction: str\n",
+        "    context: str\n",
+        "    response: str\n",
+        "    question_type: QuestionType\n",
+        "    category: Category\n",
+        "    difficulty: Difficulty\n",
+        "\n",
+        "class GeographyQuizDataset(BaseModel):\n",
+        "    title: str = \"World Geography Challenge Dataset\"\n",
+        "    description: str = \"Dataset for geography quiz questions and answers\"\n",
+        "    questions: List[Question]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "5fdLrBLx9vwL",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "5fdLrBLx9vwL",
+        "outputId": "f4e84aff-af6c-4075-e48f-7674d764ab06"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "{'instruction': \"What is the world's largest desert, covering over 9,000,000 square kilometers?\", 'context': 'Deserts are large areas of land that receive very little precipitation. The largest desert in the world is also the driest and hottest, covering much of North Africa.', 'response': 'Sahara', 'question_type': 'Multiple Choice', 'category': 'Landmarks', 'difficulty': 'Easy'}\n",
+            "Generated question 1/10: What is the world's largest desert, covering over 9,000,000 square kilometers?\n",
+            "{'instruction': 'Which African country has a coastline on the Mediterranean Sea and is home to the ancient city of Carthage?', 'context': 'Carthage was a major trading hub in the ancient world and is now a UNESCO World Heritage Site.', 'response': 'Tunisia', 'question_type': 'Multiple Choice', 'category': 'Countries', 'difficulty': 'Medium'}\n",
+            "Generated question 2/10: Which African country has a coastline on the Mediterranean Sea and is home to the ancient city of Carthage?\n",
+            "{'instruction': \"What is the world's longest river, which flows through 10 countries in northeastern Africa?\", 'context': 'This river is a primary source of water, transportation, and livelihood for millions of people in the region.', 'response': 'Nile', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Easy'}\n",
+            "Generated question 3/10: What is the world's longest river, which flows through 10 countries in northeastern Africa?\n",
+            "{'instruction': 'Which river is the longest in South America?', 'context': 'The river in question is the largest river in the world by discharge volume, and flows through Brazil, Peru, and Colombia.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 4/10: Which river is the longest in South America?\n",
+            "{'instruction': 'Which river is the longest in South America and flows through Brazil, Peru, and Colombia before emptying into the Pacific Ocean?', 'context': 'Rivers of South America', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 5/10: Which river is the longest in South America and flows through Brazil, Peru, and Colombia before emptying into the Pacific Ocean?\n",
+            "{'instruction': 'Which of the following rivers is the longest in South America?', 'context': 'The question requires the test-taker to identify the longest river in South America, which is a key geographical feature of the continent.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 6/10: Which of the following rivers is the longest in South America?\n",
+            "{'instruction': 'What is the longest river in South America?', 'context': 'This river flows through several countries in South America, including Brazil, Colombia, and Peru, before emptying into the Pacific Ocean.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 7/10: What is the longest river in South America?\n",
+            "{'instruction': \"What is the world's longest river, which flows through 11 countries in northeastern Africa?\", 'context': 'The river originates in Burundi and flows northwards through Rwanda and Tanzania before emptying into the Mediterranean Sea.', 'response': 'Nile', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Easy'}\n",
+            "Generated question 8/10: What is the world's longest river, which flows through 11 countries in northeastern Africa?\n",
+            "{'instruction': 'What is the longest river in South America?', 'context': 'South America is home to many significant rivers, but one stands out as the longest.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 9/10: What is the longest river in South America?\n",
+            "{'instruction': 'Which river is the longest in South America?', 'context': 'The question requires the test-taker to identify the longest river in South America.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n",
+            "Generated question 10/10: Which river is the longest in South America?\n",
+            "Generated and saved 10 questions to geography_quiz_dataset.jsonl\n"
+          ]
+        }
+      ],
+      "source": [
+        "import json\n",
+        "def generate_question():\n",
+        "    prompt = \"\"\"Generate a geography quiz question. Format your response as a JSON object with the following structure:\n",
+        "    {\n",
+        "        \"instruction\": \"The full question text\",\n",
+        "        \"context\": \"Provide context about the question\",\n",
+        "        \"response\": \"The correct answer\",\n",
+        "        \"question_type\": \"The type of question (e.g., 'Multiple Choice')\",\n",
+        "        \"category\": \"The category should be marked as one of these: Countries, Capitals, Rivers, Mountains, Landmarks, Climate, Culture\",\n",
+        "        \"difficulty\": \"The difficulty level of the question (e.g., 'Easy')\"\n",
+        "    }\"\"\"\n",
+        "\n",
+        "    response = client.chat.completions.create(\n",
+        "        model=\"accounts/fireworks/models/llama-v3p1-405b-instruct\",\n",
+        "        response_format={\"type\": \"json_object\"},\n",
+        "        messages=[\n",
+        "            {\"role\": \"system\", \"content\": \"You are a geography expert creating quiz questions.\"},\n",
+        "            {\"role\": \"user\", \"content\": prompt}\n",
+        "        ]\n",
+        "    )\n",
+        "\n",
+        "    question_data = json.loads(response.choices[0].message.content)\n",
+        "    print(question_data)\n",
+        "    return Question(**question_data)\n",
+        "\n",
+        "def main(num_questions=10):\n",
+        "    with open(\"geography_quiz_dataset.jsonl\", \"w\") as f:\n",
+        "        for i in range(num_questions):\n",
+        "            question = generate_question()\n",
+        "            json.dump(question.dict(), f)\n",
+        "            f.write(\"\\n\")\n",
+        "            print(f\"Generated question {i+1}/{num_questions}: {question.instruction}\")\n",
+        "\n",
+        "    print(f\"Generated and saved {num_questions} questions to geography_quiz_dataset.jsonl\")\n",
+        "\n",
+        "if __name__ == \"__main__\":\n",
+        "    main()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "sDoxKlYwDORC",
+      "metadata": {
+        "id": "sDoxKlYwDORC"
+      },
+      "source": [
+        "## Conclusion\n",
+        "\n",
+        "We’re excited to see how the community leverages the Llama 3.1 API to create interesting applications.\n",
+        "\n",
+        "\n",
+        "For more information and to get started with Llama 3.1, visit [docs.fireworks.ai](https://docs.fireworks.ai) or join our [Discord community](https://discord.gg/fireworks-ai)."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}