-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Flexible chunking. Add metadata to FE steps
- Loading branch information
1 parent
3abe5d4
commit cd1945b
Showing
11 changed files
with
139 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 0 additions & 2 deletions
2
llm_engineering/application/preprocessing/operations/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,7 @@ | ||
from .chunking import chunk_text | ||
from .cleaning import clean_text | ||
from .embeddings import embedd_text | ||
|
||
__all__ = [ | ||
"chunk_text", | ||
"clean_text", | ||
"embedd_text", | ||
] |
22 changes: 9 additions & 13 deletions
22
llm_engineering/application/preprocessing/operations/chunking.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,21 @@ | ||
from langchain.text_splitter import ( | ||
RecursiveCharacterTextSplitter, | ||
SentenceTransformersTokenTextSplitter, | ||
) | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter | ||
|
||
from llm_engineering.application.networks import EmbeddingModelSingleton | ||
|
||
embedding_model = EmbeddingModelSingleton() | ||
|
||
|
||
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]:
    """Split *text* into embedding-ready chunks in two passes.

    Pass 1 splits on blank lines (paragraph boundaries) into sections of at
    most ``chunk_size`` characters, with no overlap. Pass 2 re-splits each
    section by tokens so every chunk fits the embedding model's input window,
    overlapping consecutive chunks by ``chunk_overlap`` tokens.

    Args:
        text: Raw text to chunk.
        chunk_size: Maximum character length used by the paragraph splitter.
        chunk_overlap: Token overlap between consecutive token-level chunks
            (applies to the token splitter only; the character splitter always
            uses zero overlap).

    Returns:
        The list of token-bounded text chunks, in document order.
    """
    paragraph_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n"], chunk_size=chunk_size, chunk_overlap=0
    )
    sections = paragraph_splitter.split_text(text)

    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=chunk_overlap,
        tokens_per_chunk=embedding_model.max_input_length,
        model_name=embedding_model.model_id,
    )

    # Flatten the per-section token splits into a single ordered list.
    return [chunk for section in sections for chunk in token_splitter.split_text(section)]
7 changes: 0 additions & 7 deletions
7
llm_engineering/application/preprocessing/operations/embeddings.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 36 additions & 4 deletions
40
llm_engineering/interfaces/orchestrator/steps/feature_engineering/rag.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,51 @@ | ||
from typing_extensions import Annotated | ||
from zenml import step | ||
from zenml import get_step_context, step | ||
|
||
from llm_engineering.application import utils | ||
from llm_engineering.application.preprocessing import ChunkingDispatcher, EmbeddingDispatcher | ||
from llm_engineering.domain.chunks import Chunk | ||
from llm_engineering.domain.embedded_chunks import EmbeddedChunk | ||
|
||
|
||
@step
def chunk_and_embed(
    cleaned_documents: Annotated[list, "cleaned_documents"],
) -> Annotated[list, "embedded_documents"]:
    """Chunk every cleaned document and embed the resulting chunks.

    For each document, chunks are produced via ``ChunkingDispatcher`` and then
    embedded in batches of 10 via ``EmbeddingDispatcher``. Chunking and
    embedding statistics are attached as ZenML output metadata on the
    ``embedded_documents`` artifact.

    Args:
        cleaned_documents: Cleaned documents to process.

    Returns:
        All embedded chunks across all input documents, in processing order.
    """
    metadata = {"chunking": {}, "embedding": {}, "num_documents": len(cleaned_documents)}

    embedded_chunks = []
    for document in cleaned_documents:
        chunks = ChunkingDispatcher.dispatch(document)
        metadata["chunking"] = _add_chunks_metadata(chunks, metadata["chunking"])

        # Batch size of 10 bounds the payload handed to the embedding dispatcher.
        for chunk_batch in utils.misc.batch(chunks, 10):
            embedded_chunks.extend(EmbeddingDispatcher.dispatch(chunk_batch))

    metadata["embedding"] = _add_embeddings_metadata(embedded_chunks, metadata["embedding"])
    metadata["num_chunks"] = len(embedded_chunks)
    metadata["num_embedded_chunks"] = len(embedded_chunks)

    get_step_context().add_output_metadata(output_name="embedded_documents", metadata=metadata)

    return embedded_chunks
|
||
|
||
def _add_chunks_metadata(chunks: list[Chunk], metadata: dict) -> dict: | ||
for chunk in chunks: | ||
category = chunk.get_category() | ||
if category not in metadata: | ||
metadata[category] = chunk.metadata | ||
metadata[category]["num_chunks"] = metadata[category].get("num_chunks", 0) + 1 | ||
|
||
return metadata | ||
|
||
|
||
def _add_embeddings_metadata(embedded_chunks: list[EmbeddedChunk], metadata: dict) -> dict: | ||
for embedded_chunk in embedded_chunks: | ||
category = embedded_chunk.get_category() | ||
if category not in metadata: | ||
metadata[category] = embedded_chunk.metadata | ||
|
||
return embedded_documents | ||
return metadata |