diff --git a/.gitignore b/.gitignore
index 9c5659c..0beb08a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,4 +165,5 @@ cython_debug/
 .vscode/**/launch.json
 
 # Data
-output/
\ No newline at end of file
+output/
+sagemaker_*.json
diff --git a/Makefile b/Makefile
index a619a24..82b091e 100644
--- a/Makefile
+++ b/Makefile
@@ -11,11 +11,11 @@ help:
 
 create-sagemaker-role:
 	@echo "Creating the SageMaker role..."
-	poetry run python llm_engineering/core/aws/create_sagemaker_role.py
+	poetry run python llm_engineering/core/aws/roles/create_sagemaker_role.py
 
 create-sagemaker-execution-role:
 	@echo "Creating the SageMaker execution role..."
-	poetry run python llm_engineering/core/aws/create_sagemaker_execution_role.py
+	poetry run python llm_engineering/core/aws/roles/create_execution_role.py
 
 deploy-inference-endpoint:
 	@echo "Deploying the inference endpoint..."
@@ -27,4 +27,7 @@ delete-inference-endpoint:
 		exit 1; \
 	fi
 	@echo "Deleting the inference endpoint and config..."
-	poetry run python llm_engineering/model/delete_inference_endpoint.py $(ENDPOINT_NAME)
\ No newline at end of file
+	poetry run python llm_engineering/model/delete_inference_endpoint.py $(ENDPOINT_NAME)
+
+test-inference:
+	poetry run python -m llm_engineering.model.inference.test
\ No newline at end of file
diff --git a/llm_engineering/core/aws/roles/create_sagemaker_role.py b/llm_engineering/core/aws/roles/create_sagemaker_role.py
index 964e280..7a4d9a1 100644
--- a/llm_engineering/core/aws/roles/create_sagemaker_role.py
+++ b/llm_engineering/core/aws/roles/create_sagemaker_role.py
@@ -1,6 +1,8 @@
 import json
+from pathlib import Path
 
 import boto3
+from loguru import logger
 
 from llm_engineering.settings import settings
 
@@ -33,19 +35,16 @@ def create_sagemaker_user(username, region_name="eu-central-1"):
     response = iam.create_access_key(UserName=username)
     access_key = response["AccessKey"]
 
-    print(f"User '{username}' created successfully.")
-    print(f"Access Key ID: {access_key['AccessKeyId']}")
-    print(f"Secret Access Key: {access_key['SecretAccessKey']}")
+    logger.info(f"User '{username}' successfully created.")
+    logger.info("Access Key ID and Secret Access Key successfully created.")
 
-    # Return the access key info
     return {"AccessKeyId": access_key["AccessKeyId"], "SecretAccessKey": access_key["SecretAccessKey"]}
 
 
 if __name__ == "__main__":
-    new_user = create_sagemaker_user("sagemaker-deployer-2")
+    new_user = create_sagemaker_user("sagemaker-deployer-3")
 
-    # Save the access keys to a file
-    with open("sagemaker_user_credentials.json", "w") as f:
+    with Path("sagemaker_user_credentials.json").open("w") as f:
         json.dump(new_user, f)
 
-print("Credentials saved to 'sagemaker_user_credentials.json'")
+    logger.info("Credentials saved to 'sagemaker_user_credentials.json'")
diff --git a/llm_engineering/model/deploy/huggingface/config.py b/llm_engineering/model/deploy/huggingface/config.py
index 36ec1e0..290e1ac 100644
--- a/llm_engineering/model/deploy/huggingface/config.py
+++ b/llm_engineering/model/deploy/huggingface/config.py
@@ -1,8 +1,6 @@
 import json
 
-from sagemaker.compute_resource_requirements.resource_requirements import (
-    ResourceRequirements,
-)
+from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements
 
 from llm_engineering.settings import settings
 
@@ -13,7 +11,7 @@
     "MAX_TOTAL_TOKENS": json.dumps(settings.MAX_TOTAL_TOKENS),  # Max length of the generation (including input text)
     "MAX_BATCH_TOTAL_TOKENS": json.dumps(settings.MAX_BATCH_TOTAL_TOKENS),
     "HUGGING_FACE_HUB_TOKEN": settings.HUGGING_FACE_HUB_TOKEN,
-    "MAX_BATCH_PREFILL_TOKENS": "25000",
+    "MAX_BATCH_PREFILL_TOKENS": "10000",
     # 'HF_MODEL_QUANTIZE': "bitsandbytes",
 }
 
diff --git a/llm_engineering/model/inference/run.py b/llm_engineering/model/inference/run.py
index 327e69b..0add42c 100644
--- a/llm_engineering/model/inference/run.py
+++ b/llm_engineering/model/inference/run.py
@@ -26,6 +26,7 @@ def execute(self) -> str:
             },
         )
         extraction = self.llm.inference()[0]["generated_text"]
+
         return extraction
 
 
diff --git a/llm_engineering/model/inference/test.py b/llm_engineering/model/inference/test.py
index 13f3a1d..7f09a25 100644
--- a/llm_engineering/model/inference/test.py
+++ b/llm_engineering/model/inference/test.py
@@ -1,3 +1,5 @@
+from loguru import logger
+
 from llm_engineering.model.inference.inference import LLMInferenceSagemakerEndpoint
 from llm_engineering.model.inference.run import InferenceExecutor
 from llm_engineering.settings import settings
@@ -8,4 +10,6 @@
     llm = LLMInferenceSagemakerEndpoint(
         endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, inference_component_name=None
     )
-    InferenceExecutor(llm, text, prompt).execute()
+    answer = InferenceExecutor(llm, text, prompt).execute()
+
+    logger.info(answer)
diff --git a/llm_engineering/settings.py b/llm_engineering/settings.py
index 9f319cb..0525011 100644
--- a/llm_engineering/settings.py
+++ b/llm_engineering/settings.py
@@ -12,7 +12,7 @@ class Config:
 
     # Selenium Drivers
     SELENIUM_BROWSER_BINARY_PATH: str | None = None
-    SELENIUM_BROWSER_DRIVER_PATH: str
+    SELENIUM_BROWSER_DRIVER_PATH: str | None = None
 
     # LinkedIn Credentials
     LINKEDIN_USERNAME: str | None = None
@@ -42,14 +42,14 @@ class Config:
     COMET_WORKSPACE: str | None = None
     COMET_PROJECT: str | None = None
 
-    ARN_ROLE: str
+    ARN_ROLE: str | None = None
     HUGGING_FACE_HUB_TOKEN: str
 
-    HF_MODEL_ID: str = "test"
-    GPU_INSTANCE_TYPE: str = "test"
+    HF_MODEL_ID: str = "crumb/nano-mistral"
+    GPU_INSTANCE_TYPE: str = "ml.g5.xlarge"
     SM_NUM_GPUS: int = 1
-    MAX_INPUT_LENGTH: int = 20000
-    MAX_TOTAL_TOKENS: int = 32000
+    MAX_INPUT_LENGTH: int = 8000
+    MAX_TOTAL_TOKENS: int = 12000
     MAX_BATCH_TOTAL_TOKENS: int = 12000
     COPIES: int = 4  # Number of replicas
     GPUS: int = 1  # Number of GPUs
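
Note on the MAX_BATCH_PREFILL_TOKENS change in config.py: text-generation-inference rejects a prefill budget larger than the batch total-token budget, so the old value of 25000 could not coexist with MAX_BATCH_TOTAL_TOKENS = 12000, while the new 10000 sits between MAX_INPUT_LENGTH (8000) and MAX_BATCH_TOTAL_TOKENS (12000). Below is a minimal sanity check, not part of the patch, that the numbers in settings.py and config.py stay mutually consistent: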
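from llm_engineering.settings import settings

# Value hardcoded in llm_engineering/model/deploy/huggingface/config.py above.
MAX_BATCH_PREFILL_TOKENS = 10000

# Constraints as enforced by recent TGI releases (hedged): a single input
# must fit inside the total-token budget, and the prefill budget must cover
# one full input without exceeding the batch-wide budget.
assert settings.MAX_INPUT_LENGTH < settings.MAX_TOTAL_TOKENS
assert settings.MAX_INPUT_LENGTH <= MAX_BATCH_PREFILL_TOKENS <= settings.MAX_BATCH_TOTAL_TOKENS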
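
The new test-inference target exercises the endpoint through InferenceExecutor. For comparison, the sketch below makes the equivalent raw call with boto3; it is illustrative only, assuming the endpoint serves the standard TGI JSON schema ("inputs" plus "parameters") and lives in the same eu-central-1 region used by the role scripts: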
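import json

import boto3

from llm_engineering.settings import settings

# Region is an assumption; match it to wherever the endpoint was deployed.
client = boto3.client("sagemaker-runtime", region_name="eu-central-1")

response = client.invoke_endpoint(
    EndpointName=settings.SAGEMAKER_ENDPOINT_INFERENCE,
    ContentType="application/json",
    Body=json.dumps(
        {
            "inputs": "Write a one-sentence summary of SageMaker real-time inference.",
            "parameters": {"max_new_tokens": 256},
        }
    ),
)

# TGI returns a list with one {"generated_text": ...} object,
# matching the indexing used in run.py above.
print(json.loads(response["Body"].read())[0]["generated_text"])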