Skip to content

Commit

Permalink
fix: Added chunk_size and chunk_overlap to the chunking method
Browse files (browse the repository at this point in the history)
  • Loading branch information
iusztinpaul committed Jul 27, 2024
1 parent b8612d7 commit cd5c962
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
from typing import Generic, TypeVar
from uuid import UUID

from llm_engineering.domain.chunks import (
ArticleChunk,
Chunk,
PostChunk,
RepositoryChunk,
)
from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk
from llm_engineering.domain.cleaned_documents import (
CleanedArticleDocument,
CleanedDocument,
Expand Down Expand Up @@ -42,17 +37,19 @@ def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:


class PostChunkingHandler(ChunkingDataHandler):
@property
def chunk_size(self) -> int:
return 250

@property
def chunk_overlap(self) -> int:
return 25

def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
data_models_list = []

cleaned_content = data_model.content
chunks = chunk_text(cleaned_content)
chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
Expand All @@ -79,7 +76,7 @@ def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
data_models_list = []

cleaned_content = data_model.content
chunks = chunk_text(cleaned_content)
chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
Expand All @@ -102,17 +99,19 @@ def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:


class RepositoryChunkingHandler(ChunkingDataHandler):
@property
def chunk_size(self) -> int:
return 750

@property
def chunk_overlap(self) -> int:
return 75

def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
data_models_list = []

cleaned_content = data_model.content
chunks = chunk_text(cleaned_content)
chunks = chunk_text(cleaned_content, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
Expand Down
29 changes: 22 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,36 @@ build-backend = "poetry.core.masonry.api"
run-digital-data-etl-alex = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_alex_vesa.yaml"
run-digital-data-etl-maxime = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml"
run-digital-data-etl-paul = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml"
run-digital-data-etl = ["run-digital-data-etl-alex", "run-digital-data-etl-maxime", "run-digital-data-etl-paul"]
run-digital-data-etl = [
"run-digital-data-etl-alex",
"run-digital-data-etl-maxime",
"run-digital-data-etl-paul",
]

run-feature-engineering-pipeline = "python -m llm_engineering.interfaces.orchestrator.run --no-cache --run-feature-engineering"
run-generate-instruct-datasets-pipeline = "python -m llm_engineering.interfaces.orchestrator.run --no-cache --run-generate-instruct-datasets"
run-preprocessing-pipeline = ["run-digital-data-etl", "run-feature-engineering-pipeline", "run-generate-instruct-datasets-pipeline"]
run-preprocessing-pipeline = [
"run-digital-data-etl",
"run-feature-engineering-pipeline",
"run-generate-instruct-datasets-pipeline",
]
run-export-artifact-to-json-pipeline = "python -m llm_engineering.interfaces.orchestrator.run --no-cache --run-export-artifact-to-json"

run-training-pipeline = "python -m llm_engineering.interfaces.orchestrator.run --no-cache --run-training"

# Local infrastructure
local-docker-infrastructure-up = "docker-compose up -d"
local-docker-infrastructure-down = "docker-compose stop"
local-docker-infrastructure-up = "docker compose up -d"
local-docker-infrastructure-down = "docker compose stop"
local-zenml-server-down = "poetry run zenml down"
local-infrastructure-up = ["local-docker-infrastructure-up", "local-zenml-server-down", "local-zenml-server-up"]
local-infrastructure-down = ["local-docker-infrastructure-down", "local-zenml-server-down"]
local-infrastructure-up = [
"local-docker-infrastructure-up",
"local-zenml-server-down",
"local-zenml-server-up",
]
local-infrastructure-down = [
"local-docker-infrastructure-down",
"local-zenml-server-down",
]

# QA
lint-check = "poetry run ruff check ."
Expand All @@ -88,4 +103,4 @@ cmd = "poetry run zenml up"

[tool.poe.tasks.test]
cmd = "pytest tests/"
env = { ENV_FILE = ".env.testing" }
env = { ENV_FILE = ".env.testing" }

0 comments on commit cd5c962

Please sign in to comment.