From 796b835b5ed84b10f8d6dbf86ba0aaf0d92a7153 Mon Sep 17 00:00:00 2001
From: iusztinpaul
Date: Thu, 20 Jun 2024 19:17:18 +0300
Subject: [PATCH] feat: Add docs to the training pipeline
---
README.md | 14 +++++++-------
.../interfaces/orchestrator/pipelines/training.py | 7 +++++++
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 8fb0691..43498c2 100644
--- a/README.md
+++ b/README.md
@@ -14,10 +14,10 @@ poetry self add 'poethepoet[poetry_plugin]'
pre-commit install
```
-We use [Poe the Poet](https://poethepoet.natn.io/index.html) to run all the scripts. You don't have to do anything else than installing it to poetry as a plugin.
+We run all the scripts using [Poe the Poet](https://poethepoet.natn.io/index.html). You don't have to do anything else but install it as a Poetry plugin.
### Configure sensitive information
-After you have installed all the dependencies, you have to fill an `.env` file.
+After you have installed all the dependencies, you have to fill a `.env` file.
First, copy our example:
```shell
@@ -28,12 +28,12 @@ Now, let's understand how to fill it.
### Selenium Drivers
-To run the data collection pipeline, you have to download the Selenium Chrome driver. To proceed, use the links below:
+You must download the Selenium Chrome driver to run the data collection pipeline. To proceed, use the links below:
* https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/
* https://googlechromelabs.github.io/chrome-for-testing/#stable
> [!WARNING]
-> For MacOS users, after downloading the driver run the following command to give permissions for the driver to be accesible: `xattr -d com.apple.quarantine /path/to/your/driver/chromedriver`
+> For macOS users, after downloading the driver, run the following command to give permissions for the driver to be accessible: `xattr -d com.apple.quarantine /path/to/your/driver/chromedriver`
The last step is to add the path to the downloaded driver in your `.env` file:
```
@@ -83,7 +83,7 @@ poetry poe local-infrastructure-down
```
> [!WARNING]
-> When running on MacOS, before starting the server export the following environment variable:
+> When running on macOS, before starting the server, export the following environment variable:
> `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
> Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369).
@@ -95,7 +95,7 @@ Default credentials:
- username: default
- password:
-**NOTE:** [More on ZenML](https://docs.zenml.io/)
+🔗 [More on ZenML](https://docs.zenml.io/)
#### Qdrant is now accessible at:
@@ -103,7 +103,7 @@ REST API: localhost:6333
Web UI: localhost:6333/dashboard
GRPC API: localhost:6334
-**NOTE:** [More on Qdrant](https://qdrant.tech/documentation/quick-start/)
+🔗 [More on Qdrant](https://qdrant.tech/documentation/quick-start/)
#### MongoDB is now accessible at:
diff --git a/llm_engineering/interfaces/orchestrator/pipelines/training.py b/llm_engineering/interfaces/orchestrator/pipelines/training.py
index bae28e5..c0ff534 100644
--- a/llm_engineering/interfaces/orchestrator/pipelines/training.py
+++ b/llm_engineering/interfaces/orchestrator/pipelines/training.py
@@ -6,6 +6,13 @@
@pipeline
def training() -> None:
+ # NOTE: This is a placeholder pipeline for the training logic.
+
+ # Here is how you can access the instruct datasets generated by the generate_instruct_datasets pipeline.
+ # 'instruct_datasets' is the name of the artifact (passed as name_id_or_prefix).
instruct_datasets = Client().get_artifact_version(name_id_or_prefix="instruct_datasets")
+ # Based on that, you can retrieve other artifacts such as raw_documents, cleaned_documents, or embedded_documents.
+
+ # Here is an example of how to start the training logic with the tokenization step.
training_steps.tokenize(instruct_datasets=instruct_datasets)