From 48d5d4bf6c54418cc5f6963c04ff894cc6b98233 Mon Sep 17 00:00:00 2001 From: Aravind Putrevu Date: Fri, 30 Aug 2024 22:12:33 +0200 Subject: [PATCH] Synthetic Instruction Dataset Notebook (#86) * Synthetic Instruction Dataset Notebook * minor fix --- .../Llama_3_1_Synthetic_Data.ipynb | 392 ++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100644 examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb diff --git a/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb b/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb new file mode 100644 index 0000000..75cacb9 --- /dev/null +++ b/examples/synthetic_data/Llama_3_1_Synthetic_Data.ipynb @@ -0,0 +1,392 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "71a43144", + "metadata": { + "id": "71a43144" + }, + "source": [ + "# Get Started with Llama 3.1 Models\n", + "\n", + "\n", + "Llama 3.1 release comes with three sizes of models 7B, 70B and 405B\n", + "\n", + "In this notebook, we will look at :\n", + "\n", + "* How to access the Llama 3.1 models over a API?\n", + "* Generate Structured Synthetic Instruction Dataset with Llama 3.1 405B\n" + ] + }, + { + "cell_type": "markdown", + "id": "TWZqnPr0pcv8", + "metadata": { + "id": "TWZqnPr0pcv8" + }, + "source": [ + "## Setup\n", + "\n", + "Install all the dependencies and import the required python modules." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "YYRdRzHuHkjz", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YYRdRzHuHkjz", + "outputId": "c497ab98-5b58-44cc-e1ef-3a67c65a4e7e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting fireworks-ai\n", + " Downloading fireworks_ai-0.15.0-py3-none-any.whl.metadata (5.3 kB)\n", + "Collecting httpx (from fireworks-ai)\n", + " Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)\n", + "Collecting httpx-sse (from fireworks-ai)\n", + " Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n", + "Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (from fireworks-ai) (2.8.2)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from fireworks-ai) (9.4.0)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (3.7.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (2024.7.4)\n", + "Collecting httpcore==1.* (from httpx->fireworks-ai)\n", + " Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (3.7)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx->fireworks-ai) (1.3.1)\n", + "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->fireworks-ai)\n", + " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (2.20.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic->fireworks-ai) (4.12.2)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx->fireworks-ai) (1.2.2)\n", + "Downloading fireworks_ai-0.15.0-py3-none-any.whl (83 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.0/84.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n", + "Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: httpx-sse, h11, httpcore, httpx, fireworks-ai\n", + "Successfully installed fireworks-ai-0.15.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 httpx-sse-0.4.0\n" + ] + } + ], + "source": [ + "pip install --upgrade fireworks-ai" + ] + }, + { + "cell_type": "markdown", + "id": "sV4u2hezqV4X", + "metadata": { + "id": "sV4u2hezqV4X" + }, + "source": [ + "## Setup your API Key\n", + "\n", + "In order to use the Llama 3.1, you must first obtain Fireworks API Keys. If you don't already have one, you can one by following the instructions [here](https://docs.fireworks.ai/getting-started/quickstart)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "M9EtJxjJrlyD", + "metadata": { + "id": "M9EtJxjJrlyD" + }, + "outputs": [], + "source": [ + "from fireworks.client import Fireworks\n", + "\n", + "#replace the FIREWORKS_API_KEY with the key copied in the above step.\n", + "client = Fireworks(api_key=\"FIREWORKS_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "id": "FrFtrGQ6r-Mk", + "metadata": { + "id": "FrFtrGQ6r-Mk" + }, + "source": [ + "## Accessing Llama 3.1 Models using API\n", + "\n", + "We are sending a request to Llama 3.1 405B model, alternatively you can change the model string to access the otherm models.\n", + "\n", + "* accounts/fireworks/models/llama-v3p1-70b-instruct\n", + "* accounts/fireworks/models/llama-v3p1-8B-instruct" + ] + }, + { + "cell_type": "markdown", + "id": "TVGsp9wvtO2Q", + "metadata": { + "id": "TVGsp9wvtO2Q" + }, + "source": [ + "### Chat Completions API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "DHAgBTk8Hpg_", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DHAgBTk8Hpg_", + "outputId": "a2060088-0fac-4c29-c388-60c3760e8c0f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'm an artificial intelligence model known as Llama. Llama stands for \"Large Language Model Meta AI.\"\n" + ] + } + ], + "source": [ + "model_name = \"accounts/fireworks/models/llama-v3p1-405b-instruct\"\n", + "\n", + "response = client.chat.completions.create(\n", + "\tmodel=model_name,\n", + "\tmessages=[{\n", + "\t\t\"role\": \"user\",\n", + "\t\t\"content\": \"Who are you?\",\n", + "\t}],\n", + ")\n", + "print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "O5i7rqkhJQJN", + "metadata": { + "id": "O5i7rqkhJQJN" + }, + "source": [ + "## Generate Synthetic Data\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "yxVgS4xuBhFF", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yxVgS4xuBhFF", + "outputId": "1ce8ebe2-c924-4065-d2c7-87e2a8d5e019" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (2.8.2)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic) (2.20.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic) (4.12.2)\n" + ] + } + ], + "source": [ + "pip install pydantic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "wcOA_lp1B9lD", + "metadata": { + "id": "wcOA_lp1B9lD" + }, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "D9xgnLuGCAQi", + "metadata": { + "id": "D9xgnLuGCAQi" + }, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field\n", + "from typing import List, Optional\n", + "from enum import Enum\n", + "\n", + "\n", + "class Category(str, Enum):\n", + " COUNTRIES = \"Countries\"\n", + " CAPITALS = \"Capitals\"\n", + " RIVERS = \"Rivers\"\n", + " MOUNTAINS = \"Mountains\"\n", + " LANDMARKS = \"Landmarks\"\n", + " CLIMATE = \"Climate\"\n", + " CULTURE = \"Culture\"\n", + "\n", + "class Difficulty(str, Enum):\n", + " EASY = \"Easy\"\n", + " MEDIUM = \"Medium\"\n", + " HARD = \"Hard\"\n", + " EXPERT = \"Expert\"\n", + "\n", + "class QuestionType(str, Enum):\n", + " MULTIPLE_CHOICE = \"Multiple Choice\"\n", + " TRUE_FALSE = \"True/False\"\n", + " FILL_IN_THE_BLANK = \"Fill in the Blank\"\n", + " SHORT_ANSWER = \"Short Answer\"\n", + "\n", + "class Question(BaseModel):\n", + " instruction: str\n", + " context: str\n", + " response: str\n", + " question_type: QuestionType\n", + " category: Category\n", + " difficulty: Difficulty\n", + "\n", + "class GeographyQuizDataset(BaseModel):\n", + " title: str = \"World Geography Challenge Dataset\"\n", + " description: str = \"Dataset for geography quiz questions and answers\"\n", + " questions: List[Question]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fdLrBLx9vwL", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5fdLrBLx9vwL", + "outputId": "f4e84aff-af6c-4075-e48f-7674d764ab06" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'instruction': \"What is the world's largest desert, covering over 9,000,000 square kilometers?\", 'context': 'Deserts are large areas of land that receive very little precipitation. The largest desert in the world is also the driest and hottest, covering much of North Africa.', 'response': 'Sahara', 'question_type': 'Multiple Choice', 'category': 'Landmarks', 'difficulty': 'Easy'}\n", + "Generated question 1/10: What is the world's largest desert, covering over 9,000,000 square kilometers?\n", + "{'instruction': 'Which African country has a coastline on the Mediterranean Sea and is home to the ancient city of Carthage?', 'context': 'Carthage was a major trading hub in the ancient world and is now a UNESCO World Heritage Site.', 'response': 'Tunisia', 'question_type': 'Multiple Choice', 'category': 'Countries', 'difficulty': 'Medium'}\n", + "Generated question 2/10: Which African country has a coastline on the Mediterranean Sea and is home to the ancient city of Carthage?\n", + "{'instruction': \"What is the world's longest river, which flows through 10 countries in northeastern Africa?\", 'context': 'This river is a primary source of water, transportation, and livelihood for millions of people in the region.', 'response': 'Nile', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Easy'}\n", + "Generated question 3/10: What is the world's longest river, which flows through 10 countries in northeastern Africa?\n", + "{'instruction': 'Which river is the longest in South America?', 'context': 'The river in question is the largest river in the world by discharge volume, and flows through Brazil, Peru, and Colombia.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 4/10: Which river is the longest in South America?\n", + "{'instruction': 'Which river is the longest in South America and flows through Brazil, Peru, and Colombia before emptying into the Pacific Ocean?', 'context': 'Rivers of South America', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 5/10: Which river is the longest in South America and flows through Brazil, Peru, and Colombia before emptying into the Pacific Ocean?\n", + "{'instruction': 'Which of the following rivers is the longest in South America?', 'context': 'The question requires the test-taker to identify the longest river in South America, which is a key geographical feature of the continent.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 6/10: Which of the following rivers is the longest in South America?\n", + "{'instruction': 'What is the longest river in South America?', 'context': 'This river flows through several countries in South America, including Brazil, Colombia, and Peru, before emptying into the Pacific Ocean.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 7/10: What is the longest river in South America?\n", + "{'instruction': \"What is the world's longest river, which flows through 11 countries in northeastern Africa?\", 'context': 'The river originates in Burundi and flows northwards through Rwanda and Tanzania before emptying into the Mediterranean Sea.', 'response': 'Nile', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Easy'}\n", + "Generated question 8/10: What is the world's longest river, which flows through 11 countries in northeastern Africa?\n", + "{'instruction': 'What is the longest river in South America?', 'context': 'South America is home to many significant rivers, but one stands out as the longest.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 9/10: What is the longest river in South America?\n", + "{'instruction': 'Which river is the longest in South America?', 'context': 'The question requires the test-taker to identify the longest river in South America.', 'response': 'Amazon River', 'question_type': 'Multiple Choice', 'category': 'Rivers', 'difficulty': 'Medium'}\n", + "Generated question 10/10: Which river is the longest in South America?\n", + "Generated and saved 10 questions to geography_quiz_dataset.jsonl\n" + ] + } + ], + "source": [ + "import json\n", + "def generate_question():\n", + " prompt = \"\"\"Generate a geography quiz question. Format your response as a JSON object with the following structure:\n", + " {\n", + " \"instruction\": \"The full question text\",\n", + " \"context\": \"Provide context about the question\",\n", + " \"response\": \"The correct answer\",\n", + " \"question_type\": \"The type of question (e.g., 'Multiple Choice')\",\n", + " \"category\": \"The category should be marked as one of these: Countries, Capitals, Rivers, Mountains, Landmarks, Climate, Culture\",\n", + " \"difficulty\": \"The difficulty level of the question (e.g., 'Easy')\"\n", + " }\"\"\"\n", + "\n", + " response = client.chat.completions.create(\n", + " model=\"accounts/fireworks/models/llama-v3p1-405b-instruct\",\n", + " response_format={\"type\": \"json_object\"},\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a geography expert creating quiz questions.\"},\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " )\n", + "\n", + " question_data = json.loads(response.choices[0].message.content)\n", + " print(question_data)\n", + " return Question(**question_data)\n", + "\n", + "def main(num_questions=10):\n", + " with open(\"geography_quiz_dataset.jsonl\", \"w\") as f:\n", + " for i in range(num_questions):\n", + " question = generate_question()\n", + " json.dump(question.dict(), f)\n", + " f.write(\"\\n\")\n", + " print(f\"Generated question {i+1}/{num_questions}: {question.instruction}\")\n", + "\n", + " print(f\"Generated and saved {num_questions} questions to geography_quiz_dataset.jsonl\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "id": "sDoxKlYwDORC", + "metadata": { + "id": "sDoxKlYwDORC" + }, + "source": [ + "## Conclusion\n", + "\n", + "We’re excited to see how the community leverages Llama 3.1 API to create interesting applications.\n", + "\n", + "\n", + "For more information and to get started with Llama 3.1, visit [docs.fireworks.ai](https://docs.fireworks.ai) or join our [discord community](https://discord.gg/fireworks-ai)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}