Merge pull request #3 from PacktPublishing/generate-instruct-dataset
feat: Add new dataset instruction generation logic
iusztinpaul authored Aug 7, 2024
2 parents 9f84695 + a34016a commit 0b32e11
Showing 19 changed files with 496 additions and 942 deletions.
5 changes: 4 additions & 1 deletion .env.example
@@ -10,8 +10,11 @@ LINKEDIN_USERNAME = "str"
LINKEDIN_PASSWORD = "str"

# OpenAI API Config
OPENAI_MODEL_ID = "gpt-3.5-turbo"
OPENAI_MODEL_ID = "gpt-4o-mini"
OPENAI_API_KEY = "str"

+# Huggingface API Config
+HUGGINGFACE_ACCESS_TOKEN = "str"

# RAG
RAG_MODEL_DEVICE = "cpu"
5 changes: 4 additions & 1 deletion configs/generate_instruct_datasets.yaml
@@ -1,2 +1,5 @@
 parameters:
+  test_split_size: 0.1
+  push_to_huggingface: true
+  dataset_id: pauliusztin/llmtwin
+  mock: false
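
The new keys mirror the parameters the code below consumes: test_split_size presumably maps to generate()'s test_size, and mock toggles the fake LLM. A quick sketch of reading them, assuming plain PyYAML rather than the project's pipeline tooling:

# Hypothetical sketch: read the generation parameters with PyYAML.
import yaml

with open("configs/generate_instruct_datasets.yaml") as f:
    params = yaml.safe_load(f)["parameters"]

assert 0.0 < params["test_split_size"] < 1.0  # 0.1 -> hold out 10% for testing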
7 changes: 7 additions & 0 deletions llm_engineering/application/dataset/constants.py
@@ -0,0 +1,7 @@
MOCKED_RESPONSE = """
[
{"instruction": "<mocked generated instruction> 1", "answer": "<mocked generated answer> 1"},
{"instruction": "<mocked generated instruction> 2", "answer": "<mocked generated answer> 2"},
{"instruction": "<mocked generated instruction> 3", "answer": "<mocked generated answer> 3"}
]
"""
106 changes: 47 additions & 59 deletions llm_engineering/application/dataset/generation.py
@@ -1,10 +1,10 @@
-from typing import Generator
-
 import tiktoken
+from langchain_core.exceptions import OutputParserException
 from langchain_core.language_models.fake import FakeListLLM
 from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
+from loguru import logger

from llm_engineering import domain
from llm_engineering.application import utils
@@ -13,27 +13,39 @@
from llm_engineering.domain.types import DataCategory
from llm_engineering.settings import settings

from . import constants, splits
from .output_parsers import ListPydanticOutputParser


class DatasetGenerator:
tokenizer = tiktoken.encoding_for_model(settings.OPENAI_MODEL_ID)

system_prompt_template: str = "You are a technical writer creating posts and articles about AI and MLOps."
prompt_template_str: str = """I will give you batches of contents of {{ data_category }}. Generate me exactly 1 instruction for each of them. The {{ data_category }} text
for which you have to generate the instructions is under 'Content number' x lines.
system_prompt_template: str = "You are a helpful assistant who \
generates instruction-answer pairs based on the given context. \
You will imitate the casual tone and writing style of the context."
prompt_template_str: str = """I want to create an AI assistant that can write paragraphs and \
{{ data_category }} about machine learning topics. Based on the following extract, \
generate three instruction-answer pairs. Each instruction should ask \
to write about a specific topic contained in the context, and each answer \
should provide a relevant paragraph based on the information found in the \
context. Only use concepts from the context to generate the instructions. \
Copy the writing style from the extract to imitate the author's personality. \
Do not use markdown.
Extract:
{{ extract }}
Structure the answer in JSON format, ready to be loaded in Python by json.loads(), a list of objects only with fields called instruction and content.
Do not add any extra characters and make sure it is a list with objects in valid json format following exactly the next structure:\n
'```json\n[{"instruction": "<generated instruction>"}, {"instruction": "<generated instruction here>"}, ...]\n```'
You must generate exactly a list of {{ len_documents }} json objects, using the contents provided under CONTENTS FOR GENERATION\n
\nCONTENTS FOR GENERATION:\n
{% for doc in documents %}
Content number {{ doc.index }}:
{{ doc.content | e }}
{% endfor %}"""
Output JSON format. Make sure that you generate exactly three instruction-answer pairs, and the keys of the JSON object are 'instruction' and 'answer':
[
{"instruction": "<generated instruction> 1", "answer": "<generated answer> 1"},
{"instruction": "<generated instruction> 2", "answer": "<generated answer> 2"},
{"instruction": "<generated instruction> 3", "answer": "<generated answer> 3"}
]
"""

@classmethod
def get_system_prompt(cls) -> Prompt:
@@ -47,44 +47,59 @@ def get_system_prompt(cls) -> Prompt:
def get_prompts(cls, documents: list[CleanedDocument]) -> dict[DataCategory, list[GenerateDatasetSamplesPrompt]]:
grouped_prompts = {}
grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
-        for category, documents in grouped_cleaned_documents.items():
-            batched_documents_generator = cls._batch_by_category(category, documents)
-            category_prompts = [cls.get_prompt(batch) for batch in batched_documents_generator]
+        for category, category_documents in grouped_cleaned_documents.items():
+            category_prompts = [cls.get_prompt(document) for document in category_documents]
grouped_prompts[category] = category_prompts

return grouped_prompts

-    @classmethod
-    def _batch_by_category(
-        cls, category: DataCategory, documents: list[CleanedDocument]
-    ) -> Generator[list, None, None]:
-        match category:
-            case DataCategory.ARTICLES:
-                return utils.misc.batch(documents, size=1)
-            case DataCategory.POSTS:
-                return utils.misc.batch(documents, size=5)
-            case DataCategory.REPOSITORIES:
-                return utils.misc.batch(documents, size=1)
-            case _:
-                raise ValueError(f"Unsupported category: {category}")

@classmethod
-    def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesPrompt:
-        assert len(documents) > 0, "At least one document is required"
-
-        data_category = documents[0].get_category()
-        assert all(
-            data_category == document.get_category() for document in documents
-        ), "All documents must be of the same category"
+    def get_prompt(cls, document: CleanedDocument) -> GenerateDatasetSamplesPrompt:
+        data_category = document.get_category()

prompt_template = PromptTemplate.from_template(
template=cls.prompt_template_str,
template_format="jinja2",
)
         input_variables = {
             "data_category": data_category,
-            "len_documents": len(documents),
-            "documents": [{"index": i, "content": doc.content} for i, doc in enumerate(documents)],
+            "extract": document.content,
         }
prompt = prompt_template.format(**input_variables)
prompt_tokens = cls.tokenizer.encode(prompt)
Expand All @@ -98,7 +89,7 @@ def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesP
content=prompt,
num_tokens=len(prompt_tokens),
data_category=data_category,
-            documents=documents,
+            document=document,
)

return prompt
@@ -107,9 +98,10 @@ def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesP
def generate(
cls,
prompts: dict[DataCategory, list[GenerateDatasetSamplesPrompt]],
+        test_size: float = 0.2,
         mock: bool = False,
-    ) -> dict[DataCategory, domain.dataset.InstructDataset]:
+    ) -> domain.dataset.TrainTestSplit:
-        def _batch_to_langchain_prompt(
+        def _to_langchain(
prompt: GenerateDatasetSamplesPrompt,
) -> list[BaseMessage]:
messages = [
@@ -120,35 +112,31 @@ def _batch_to_langchain_prompt(
return messages

         if mock:
-            llm = FakeListLLM(
-                responses=[
-                    '```json\n[{"instruction": "mock instruction"}, {"instruction": "mock instruction"}, {"instruction": "mock instruction"}]\n```'
-                ]
-            )
+            llm = FakeListLLM(responses=[constants.MOCKED_RESPONSE])
         else:
-            llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
+            llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, max_tokens=800, temperature=0.7, n=1)
         parser = ListPydanticOutputParser(pydantic_object=domain.dataset.InstructDatasetSample)

         chain = llm | parser

         datasets = {}
         for category, category_prompts in prompts.items():
-            langchain_category_prompts = [_batch_to_langchain_prompt(batch) for batch in category_prompts]
-            batched_instruct_dataset_samples = chain.batch(langchain_category_prompts)
+            langchain_category_prompts = [_to_langchain(prompt) for prompt in category_prompts]
+            batches = utils.misc.batch(langchain_category_prompts, size=4)

             flattened_instruct_dataset_samples = []
-            for prompt, per_prompt_instruct_dataset_samples in zip(
-                category_prompts, batched_instruct_dataset_samples, strict=False
-            ):
-                prompt_documents_as_response = prompt.documents
-                for document_as_response, instruct_dataset_sample in zip(
-                    prompt_documents_as_response, per_prompt_instruct_dataset_samples, strict=False
-                ):
-                    instruct_dataset_sample.response = document_as_response.content
-                    flattened_instruct_dataset_samples.append(instruct_dataset_sample)
+            for batch in batches:
+                try:
+                    batched_instruct_dataset_samples = chain.batch(batch, stop=None)
+                except OutputParserException:
+                    logger.error(f"Failed to parse the output JSON for a batch for category {category}")
+
+                    # Skip the failed batch: otherwise the loop below would reuse the previous
+                    # batch's samples (or raise a NameError on the very first batch).
+                    continue
+
+                for instruct_dataset_sample_batch in batched_instruct_dataset_samples:
+                    flattened_instruct_dataset_samples.extend(instruct_dataset_sample_batch)

             dataset = domain.dataset.InstructDataset(category=category, samples=flattened_instruct_dataset_samples)
             datasets[category] = dataset

-        return datasets
+        train_test_split = splits.create_train_test_split(datasets, test_size=test_size, random_state=42)
+
+        return train_test_split
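
Put together, generation now flows one prompt per document, batched four prompts per LLM call, and returns a train/test split instead of raw datasets. A hedged usage sketch; load_cleaned_documents is a stand-in for however CleanedDocument instances are actually fetched in the pipeline:

# Hypothetical usage sketch of the new DatasetGenerator API.
documents = load_cleaned_documents()  # stand-in -> list[CleanedDocument]

prompts = DatasetGenerator.get_prompts(documents)  # one prompt per document, grouped by DataCategory
train_test_split = DatasetGenerator.generate(
    prompts,
    test_size=0.1,  # mirrors test_split_size in configs/generate_instruct_datasets.yaml
    mock=True,      # FakeListLLM replays constants.MOCKED_RESPONSE; no OpenAI calls
)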
30 changes: 30 additions & 0 deletions llm_engineering/application/dataset/splits.py
@@ -0,0 +1,30 @@
from sklearn.model_selection import train_test_split

from llm_engineering.domain.dataset import InstructDataset, InstructDatasetSample, TrainTestSplit
from llm_engineering.domain.types import DataCategory


def create_train_test_split(
    data: dict[DataCategory, InstructDataset], test_size=0.2, random_state=42
) -> TrainTestSplit:
    train_data = {}
    test_data = {}

    for category, dataset in data.items():
        samples = dataset.samples
        samples_dicts = [sample.model_dump() for sample in samples]

        train_samples_dicts, test_samples_dicts = train_test_split(
            samples_dicts, test_size=test_size, random_state=random_state
        )

        train_samples = [InstructDatasetSample(**sample_dict) for sample_dict in train_samples_dicts]
        test_samples = [InstructDatasetSample(**sample_dict) for sample_dict in test_samples_dicts]

        train_dataset = InstructDataset(category=category, samples=train_samples)
        test_dataset = InstructDataset(category=category, samples=test_samples)

        train_data[category] = train_dataset
        test_data[category] = test_dataset

    return TrainTestSplit(train=train_data, test=test_data, test_split_size=test_size)
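
Because the split runs per category, each category keeps its own partition and its label survives into both halves. An illustrative call, with samples assumed to be a list of InstructDatasetSample:

# Illustrative only: split one category's samples with a fixed seed.
datasets = {DataCategory.ARTICLES: InstructDataset(category=DataCategory.ARTICLES, samples=samples)}
split = create_train_test_split(datasets, test_size=0.1, random_state=42)

train_samples = split.train[DataCategory.ARTICLES].samples  # ~90% of the input
test_samples = split.test[DataCategory.ARTICLES].samples    # ~10% of the input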
69 changes: 35 additions & 34 deletions llm_engineering/application/preprocessing/chunking_data_handlers.py
@@ -11,7 +11,7 @@
CleanedRepositoryDocument,
)

-from .operations import chunk_text
+from .operations import chunk_article, chunk_text

CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
ChunkT = TypeVar("ChunkT", bound=Chunk)
@@ -24,12 +24,11 @@ class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]):
"""

@property
-    def chunk_size(self) -> int:
-        return 500
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 50
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 500,
+            "chunk_overlap": 50,
+        }

@abstractmethod
def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:
@@ -38,18 +37,19 @@ def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:

class PostChunkingHandler(ChunkingDataHandler):
@property
-    def chunk_size(self) -> int:
-        return 250
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 25
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 250,
+            "chunk_overlap": 25,
+        }

def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_text(
+            cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -61,22 +61,28 @@ def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
image=data_model.image if data_model.image else None,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

return data_models_list


class ArticleChunkingHandler(ChunkingDataHandler):
+    @property
+    def metadata(self) -> dict:
+        return {
+            "min_length": 1000,
+            "max_length": 1000,
+        }

def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_article(
+            cleaned_content, min_length=self.metadata["min_length"], max_length=self.metadata["max_length"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -88,10 +94,7 @@ def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

@@ -100,18 +103,19 @@

class RepositoryChunkingHandler(ChunkingDataHandler):
@property
-    def chunk_size(self) -> int:
-        return 750
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 75
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 1500,
+            "chunk_overlap": 100,
+        }

def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_text(
+            cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -124,10 +128,7 @@ def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

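Each handler now exposes its chunking parameters through a single metadata property, and that same dict is stamped onto every chunk it produces. A brief hedged sketch, with the CleanedPostDocument construction elided:

# Hypothetical usage: chunk a cleaned post and inspect the stamped metadata.
handler = PostChunkingHandler()
chunks = handler.chunk(cleaned_post_document)  # `cleaned_post_document` assumed to exist

assert handler.metadata == {"chunk_size": 250, "chunk_overlap": 25}
assert all(chunk.metadata == handler.metadata for chunk in chunks)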
3 changes: 2 additions & 1 deletion llm_engineering/application/preprocessing/operations/__init__.py
@@ -1,7 +1,8 @@
-from .chunking import chunk_text
+from .chunking import chunk_article, chunk_text
 from .cleaning import clean_text

 __all__ = [
+    "chunk_article",
     "chunk_text",
     "clean_text",
 ]
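
The implementation of the new chunk_article is not part of this diff. Purely as an assumption, a min/max-length sentence-packing chunker could look like the sketch below; the repo's actual operations/chunking.py may differ:

# Hypothetical sketch of chunk_article: pack sentences into chunks whose
# character length stays between min_length and max_length.
import re


def chunk_article(text: str, min_length: int, max_length: int) -> list[str]:
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 <= max_length:
            current = f"{current} {sentence}".strip()
        else:
            if len(current) >= min_length:
                chunks.append(current)
            current = sentence
    if len(current) >= min_length:
        chunks.append(current)

    return chunks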