From 796b835b5ed84b10f8d6dbf86ba0aaf0d92a7153 Mon Sep 17 00:00:00 2001 From: iusztinpaul Date: Thu, 20 Jun 2024 19:17:18 +0300 Subject: [PATCH] feat: Add docs to the training pipeline --- README.md | 14 +++++++------- .../interfaces/orchestrator/pipelines/training.py | 7 +++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8fb0691..43498c2 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,10 @@ poetry self add 'poethepoet[poetry_plugin]' pre-commit install ``` -We use [Poe the Poet](https://poethepoet.natn.io/index.html) to run all the scripts. You don't have to do anything else than installing it to poetry as a plugin. +We run all the scripts using [Poe the Poet](https://poethepoet.natn.io/index.html). You don't have to do anything else but install it as a Poetry plugin. ### Configure sensitive information -After you have installed all the dependencies, you have to fill an `.env` file. +After you have installed all the dependencies, you have to fill a `.env` file. First, copy our example: ```shell @@ -28,12 +28,12 @@ Now, let's understand how to fill it. ### Selenium Drivers -To run the data collection pipeline, you have to download the Selenium Chrome driver. To proceed, use the links below: +You must download the Selenium Chrome driver to run the data collection pipeline. 
To proceed, use the links below: * https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/ * https://googlechromelabs.github.io/chrome-for-testing/#stable > [!WARNING] -> For MacOS users, after downloading the driver run the following command to give permissions for the driver to be accesible: `xattr -d com.apple.quarantine /path/to/your/driver/chromedriver` +> For macOS users, after downloading the driver, run the following command to give permissions for the driver to be accessible: `xattr -d com.apple.quarantine /path/to/your/driver/chromedriver` The last step is to add the path to the downloaded driver in your `.env` file: ``` @@ -83,7 +83,7 @@ poetry poe local-infrastructure-down ``` > [!WARNING] -> When running on MacOS, before starting the server export the following environment variable: +> When running on macOS, before starting the server, export the following environment variable: > `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES` > Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369). 
@@ -95,7 +95,7 @@ Default credentials: - username: default - password: -**NOTE:** [More on ZenML](https://docs.zenml.io/) +🔗 [More on ZenML](https://docs.zenml.io/) #### Qdrant is now accessible at: REST API: localhost:6333 Web UI: localhost:6333/dashboard GRPC API: localhost:6334 -**NOTE:** [More on Qdrant](https://qdrant.tech/documentation/quick-start/) +🔗 [More on Qdrant](https://qdrant.tech/documentation/quick-start/) #### MongoDB is now accessible at: diff --git a/llm_engineering/interfaces/orchestrator/pipelines/training.py b/llm_engineering/interfaces/orchestrator/pipelines/training.py index bae28e5..c0ff534 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/training.py +++ b/llm_engineering/interfaces/orchestrator/pipelines/training.py @@ -6,6 +6,13 @@ @pipeline def training() -> None: + # NOTE: This is a placeholder pipeline for the training logic. + + # Here is how you can access the instruct datasets generated by the generate_instruct_datasets pipeline. + # 'instruct_datasets' is the ID of the artifact. instruct_datasets = Client().get_artifact_version(name_id_or_prefix="instruct_datasets") + # Based on that, you can retrieve other artifacts, such as raw_documents, cleaned_documents, or embedded_documents. + + # Here is an example of how to start the training logic with the tokenization step. training_steps.tokenize(instruct_datasets=instruct_datasets)