Merge pull request #3 from PacktPublishing/generate-instruct-dataset
feat: Add new dataset instruction generation logic
iusztinpaul authored Aug 7, 2024
2 parents 9f84695 + a34016a commit 0b32e11
Showing 19 changed files with 496 additions and 942 deletions.
5 changes: 4 additions & 1 deletion .env.example
@@ -10,8 +10,11 @@ LINKEDIN_USERNAME = "str"
LINKEDIN_PASSWORD = "str"

# OpenAI API Config
OPENAI_MODEL_ID = "gpt-3.5-turbo"
OPENAI_MODEL_ID = "gpt-4o-mini"
OPENAI_API_KEY = "str"

+# Huggingface API Config
+HUGGINGFACE_ACCESS_TOKEN = "str"

# RAG
RAG_MODEL_DEVICE = "cpu"
5 changes: 4 additions & 1 deletion configs/generate_instruct_datasets.yaml
@@ -1,2 +1,5 @@
 parameters:
+  test_split_size: 0.1
+  push_to_huggingface: true
+  dataset_id: pauliusztin/llmtwin
+  mock: false
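
The new keys mirror the parameters the code below consumes: test_split_size presumably maps to generate()'s test_size, and mock toggles the fake LLM. A quick sketch of reading them, assuming plain PyYAML rather than the project's pipeline tooling:

# Hypothetical sketch: read the generation parameters with PyYAML.
import yaml

with open("configs/generate_instruct_datasets.yaml") as f:
    params = yaml.safe_load(f)["parameters"]

assert 0.0 < params["test_split_size"] < 1.0  # 0.1 -> hold out 10% for testing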
7 changes: 7 additions & 0 deletions llm_engineering/application/dataset/constants.py
@@ -0,0 +1,7 @@
MOCKED_RESPONSE = """
[
{"instruction": "<mocked generated instruction> 1", "answer": "<mocked generated answer> 1"},
{"instruction": "<mocked generated instruction> 2", "answer": "<mocked generated answer> 2"},
{"instruction": "<mocked generated instruction> 3", "answer": "<mocked generated answer> 3"}
]
"""
106 changes: 47 additions & 59 deletions llm_engineering/application/dataset/generation.py
@@ -1,10 +1,10 @@
-from typing import Generator
-
 import tiktoken
+from langchain_core.exceptions import OutputParserException
 from langchain_core.language_models.fake import FakeListLLM
 from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
+from loguru import logger

from llm_engineering import domain
from llm_engineering.application import utils
@@ -13,27 +13,39 @@
from llm_engineering.domain.types import DataCategory
from llm_engineering.settings import settings

from . import constants, splits
from .output_parsers import ListPydanticOutputParser


class DatasetGenerator:
tokenizer = tiktoken.encoding_for_model(settings.OPENAI_MODEL_ID)

system_prompt_template: str = "You are a technical writer creating posts and articles about AI and MLOps."
prompt_template_str: str = """I will give you batches of contents of {{ data_category }}. Generate me exactly 1 instruction for each of them. The {{ data_category }} text
for which you have to generate the instructions is under 'Content number' x lines.
system_prompt_template: str = "You are a helpful assistant who \
generates instruction-answer pairs based on the given context. \
You will imitate the casual tone and writing style of the context."
prompt_template_str: str = """I want to create an AI assistant that can write paragraphs and \
{{ data_category }} about machine learning topics. Based on the following extract, \
generate three instruction-answer pairs. Each instruction should ask \
to write about a specific topic contained in the context, and each answer \
should provide a relevant paragraph based on the information found in the \
context. Only use concepts from the context to generate the instructions. \
Copy the writing style from the extract to imitate the author's personality. \
Do not use markdown.
Extract:
{{ extract }}
Structure the answer in JSON format, ready to be loaded in Python by json.loads(), a list of objects only with fields called instruction and content.
Do not add any extra characters and make sure it is a list with objects in valid json format following exactly the next structure:\n
'```json\n[{"instruction": "<generated instruction>"}, {"instruction": "<generated instruction here>"}, ...]\n```'
You must generate exactly a list of {{ len_documents }} json objects, using the contents provided under CONTENTS FOR GENERATION\n
\nCONTENTS FOR GENERATION:\n
{% for doc in documents %}
Content number {{ doc.index }}:
{{ doc.content | e }}
{% endfor %}"""
Output JSON format. Make sure that you generate exactly three instruction-answer pairs, and the keys of the JSON object are 'instruction' and 'answer':
[
{"instruction": "<generated instruction> 1", "answer": "<generated answer> 1"},
{"instruction": "<generated instruction> 2", "answer": "<generated answer> 2"},
{"instruction": "<generated instruction> 3", "answer": "<generated answer> 3"}
]
"""

@classmethod
def get_system_prompt(cls) -> Prompt:
@@ -47,44 +47,59 @@ def get_system_prompt(cls) -> Prompt:
def get_prompts(cls, documents: list[CleanedDocument]) -> dict[DataCategory, list[GenerateDatasetSamplesPrompt]]:
grouped_prompts = {}
grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
-        for category, documents in grouped_cleaned_documents.items():
-            batched_documents_generator = cls._batch_by_category(category, documents)
-            category_prompts = [cls.get_prompt(batch) for batch in batched_documents_generator]
+        for category, category_documents in grouped_cleaned_documents.items():
+            category_prompts = [cls.get_prompt(document) for document in category_documents]
grouped_prompts[category] = category_prompts

return grouped_prompts

-    @classmethod
-    def _batch_by_category(
-        cls, category: DataCategory, documents: list[CleanedDocument]
-    ) -> Generator[list, None, None]:
-        match category:
-            case DataCategory.ARTICLES:
-                return utils.misc.batch(documents, size=1)
-            case DataCategory.POSTS:
-                return utils.misc.batch(documents, size=5)
-            case DataCategory.REPOSITORIES:
-                return utils.misc.batch(documents, size=1)
-            case _:
-                raise ValueError(f"Unsupported category: {category}")

@classmethod
-    def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesPrompt:
-        assert len(documents) > 0, "At least one document is required"
-
-        data_category = documents[0].get_category()
-        assert all(
-            data_category == document.get_category() for document in documents
-        ), "All documents must be of the same category"
+    def get_prompt(cls, document: CleanedDocument) -> GenerateDatasetSamplesPrompt:
+        data_category = document.get_category()

prompt_template = PromptTemplate.from_template(
template=cls.prompt_template_str,
template_format="jinja2",
)
         input_variables = {
             "data_category": data_category,
-            "len_documents": len(documents),
-            "documents": [{"index": i, "content": doc.content} for i, doc in enumerate(documents)],
+            "extract": document.content,
         }
prompt = prompt_template.format(**input_variables)
prompt_tokens = cls.tokenizer.encode(prompt)
Expand All @@ -98,7 +89,7 @@ def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesP
content=prompt,
num_tokens=len(prompt_tokens),
data_category=data_category,
-            documents=documents,
+            document=document,
)

return prompt
@@ -107,9 +98,10 @@ def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesP
def generate(
cls,
prompts: dict[DataCategory, list[GenerateDatasetSamplesPrompt]],
+        test_size: float = 0.2,
         mock: bool = False,
-    ) -> dict[DataCategory, domain.dataset.InstructDataset]:
+    ) -> domain.dataset.TrainTestSplit:
-        def _batch_to_langchain_prompt(
+        def _to_langchain(
prompt: GenerateDatasetSamplesPrompt,
) -> list[BaseMessage]:
messages = [
@@ -120,35 +112,31 @@ def _batch_to_langchain_prompt(
return messages

         if mock:
-            llm = FakeListLLM(
-                responses=[
-                    '```json\n[{"instruction": "mock instruction"}, {"instruction": "mock instruction"}, {"instruction": "mock instruction"}]\n```'
-                ]
-            )
+            llm = FakeListLLM(responses=[constants.MOCKED_RESPONSE])
         else:
-            llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
+            llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, max_tokens=800, temperature=0.7, n=1)
         parser = ListPydanticOutputParser(pydantic_object=domain.dataset.InstructDatasetSample)

         chain = llm | parser

         datasets = {}
         for category, category_prompts in prompts.items():
-            langchain_category_prompts = [_batch_to_langchain_prompt(batch) for batch in category_prompts]
-            batched_instruct_dataset_samples = chain.batch(langchain_category_prompts)
+            langchain_category_prompts = [_to_langchain(prompt) for prompt in category_prompts]
+            batches = utils.misc.batch(langchain_category_prompts, size=4)

             flattened_instruct_dataset_samples = []
-            for prompt, per_prompt_instruct_dataset_samples in zip(
-                category_prompts, batched_instruct_dataset_samples, strict=False
-            ):
-                prompt_documents_as_response = prompt.documents
-                for document_as_response, instruct_dataset_sample in zip(
-                    prompt_documents_as_response, per_prompt_instruct_dataset_samples, strict=False
-                ):
-                    instruct_dataset_sample.response = document_as_response.content
-                    flattened_instruct_dataset_samples.append(instruct_dataset_sample)
+            for batch in batches:
+                try:
+                    batched_instruct_dataset_samples = chain.batch(batch, stop=None)
+                except OutputParserException:
+                    logger.error(f"Failed to parse the output JSON for a batch for category {category}")
+
+                    # Skip the failed batch: otherwise the loop below would reuse the previous
+                    # batch's samples (or raise a NameError on the very first batch).
+                    continue
+
+                for instruct_dataset_sample_batch in batched_instruct_dataset_samples:
+                    flattened_instruct_dataset_samples.extend(instruct_dataset_sample_batch)

             dataset = domain.dataset.InstructDataset(category=category, samples=flattened_instruct_dataset_samples)
             datasets[category] = dataset

-        return datasets
+        train_test_split = splits.create_train_test_split(datasets, test_size=test_size, random_state=42)
+
+        return train_test_split
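
Put together, generation now flows one prompt per document, batched four prompts per LLM call, and returns a train/test split instead of raw datasets. A hedged usage sketch; load_cleaned_documents is a stand-in for however CleanedDocument instances are actually fetched in the pipeline:

# Hypothetical usage sketch of the new DatasetGenerator API.
documents = load_cleaned_documents()  # stand-in -> list[CleanedDocument]

prompts = DatasetGenerator.get_prompts(documents)  # one prompt per document, grouped by DataCategory
train_test_split = DatasetGenerator.generate(
    prompts,
    test_size=0.1,  # mirrors test_split_size in configs/generate_instruct_datasets.yaml
    mock=True,      # FakeListLLM replays constants.MOCKED_RESPONSE; no OpenAI calls
)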
30 changes: 30 additions & 0 deletions llm_engineering/application/dataset/splits.py
@@ -0,0 +1,30 @@
from sklearn.model_selection import train_test_split

from llm_engineering.domain.dataset import InstructDataset, InstructDatasetSample, TrainTestSplit
from llm_engineering.domain.types import DataCategory


def create_train_test_split(
    data: dict[DataCategory, InstructDataset], test_size=0.2, random_state=42
) -> TrainTestSplit:
    train_data = {}
    test_data = {}

    for category, dataset in data.items():
        samples = dataset.samples
        samples_dicts = [sample.model_dump() for sample in samples]

        train_samples_dicts, test_samples_dicts = train_test_split(
            samples_dicts, test_size=test_size, random_state=random_state
        )

        train_samples = [InstructDatasetSample(**sample_dict) for sample_dict in train_samples_dicts]
        test_samples = [InstructDatasetSample(**sample_dict) for sample_dict in test_samples_dicts]

        train_dataset = InstructDataset(category=category, samples=train_samples)
        test_dataset = InstructDataset(category=category, samples=test_samples)

        train_data[category] = train_dataset
        test_data[category] = test_dataset

    return TrainTestSplit(train=train_data, test=test_data, test_split_size=test_size)
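
Because the split runs per category, each category keeps its own partition and its label survives into both halves. An illustrative call, with samples assumed to be a list of InstructDatasetSample:

# Illustrative only: split one category's samples with a fixed seed.
datasets = {DataCategory.ARTICLES: InstructDataset(category=DataCategory.ARTICLES, samples=samples)}
split = create_train_test_split(datasets, test_size=0.1, random_state=42)

train_samples = split.train[DataCategory.ARTICLES].samples  # ~90% of the input
test_samples = split.test[DataCategory.ARTICLES].samples    # ~10% of the input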
69 changes: 35 additions & 34 deletions llm_engineering/application/preprocessing/chunking_data_handlers.py
@@ -11,7 +11,7 @@
CleanedRepositoryDocument,
)

-from .operations import chunk_text
+from .operations import chunk_article, chunk_text

CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
ChunkT = TypeVar("ChunkT", bound=Chunk)
@@ -24,12 +24,11 @@ class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]):
"""

@property
-    def chunk_size(self) -> int:
-        return 500
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 50
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 500,
+            "chunk_overlap": 50,
+        }

@abstractmethod
def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:
@@ -38,18 +37,19 @@ def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:

class PostChunkingHandler(ChunkingDataHandler):
@property
-    def chunk_size(self) -> int:
-        return 250
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 25
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 250,
+            "chunk_overlap": 25,
+        }

def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_text(
+            cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -61,22 +61,28 @@ def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
image=data_model.image if data_model.image else None,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

return data_models_list


class ArticleChunkingHandler(ChunkingDataHandler):
+    @property
+    def metadata(self) -> dict:
+        return {
+            "min_length": 1000,
+            "max_length": 1000,
+        }

def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_article(
+            cleaned_content, min_length=self.metadata["min_length"], max_length=self.metadata["max_length"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -88,10 +94,7 @@ def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

@@ -100,18 +103,19 @@

class RepositoryChunkingHandler(ChunkingDataHandler):
@property
-    def chunk_size(self) -> int:
-        return 750
-
-    @property
-    def chunk_overlap(self) -> int:
-        return 75
+    def metadata(self) -> dict:
+        return {
+            "chunk_size": 1500,
+            "chunk_overlap": 100,
+        }

def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
data_models_list = []

cleaned_content = data_model.content
-        chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        chunks = chunk_text(
+            cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
+        )

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
@@ -124,10 +128,7 @@ def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
-                metadata={
-                    "chunk_size": self.chunk_size,
-                    "chunk_overlap": self.chunk_overlap,
-                },
+                metadata=self.metadata,
)
data_models_list.append(model)

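Each handler now exposes its chunking parameters through a single metadata property, and that same dict is stamped onto every chunk it produces. A brief hedged sketch, with the CleanedPostDocument construction elided:

# Hypothetical usage: chunk a cleaned post and inspect the stamped metadata.
handler = PostChunkingHandler()
chunks = handler.chunk(cleaned_post_document)  # `cleaned_post_document` assumed to exist

assert handler.metadata == {"chunk_size": 250, "chunk_overlap": 25}
assert all(chunk.metadata == handler.metadata for chunk in chunks)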
3 changes: 2 additions & 1 deletion llm_engineering/application/preprocessing/operations/__init__.py
@@ -1,7 +1,8 @@
-from .chunking import chunk_text
+from .chunking import chunk_article, chunk_text
 from .cleaning import clean_text

 __all__ = [
+    "chunk_article",
     "chunk_text",
     "clean_text",
 ]
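
The implementation of the new chunk_article is not part of this diff. Purely as an assumption, a min/max-length sentence-packing chunker could look like the sketch below; the repo's actual operations/chunking.py may differ:

# Hypothetical sketch of chunk_article: pack sentences into chunks whose
# character length stays between min_length and max_length.
import re


def chunk_article(text: str, min_length: int, max_length: int) -> list[str]:
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 <= max_length:
            current = f"{current} {sentence}".strip()
        else:
            if len(current) >= min_length:
                chunks.append(current)
            current = sentence
    if len(current) >= min_length:
        chunks.append(current)

    return chunks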