* feature(intelligent-search): Added API to connect to Llama.cpp in EC2 and filter the response into OR filters
* Updated SQL-to-filter script and added init.sql for tables
* feature(intelligent-search): Replaced llama.cpp with llama, now running on GPU inside the API
* Updated Dockerfile to use the GPU and download the LLM from S3
* Added link to facebook/research/llama
* Updated Dockerfile
* Updated requirements and Dockerfile base images
* Fixed minor issues: removed unused variables, updated COPY and replaced values
* fix(intelligent-search): Fixed WHERE statement filter
* feature(smart-charts): Added method to create charts using llama
* style(intelligent-search): Changed attribute names to match the frontend format
* fix(intelligent-search): Fixed vulnerability in requirements and other small issues
* Added some tests before deploying the service
* Added semaphore to handle concurrency

---------

Co-authored-by: EC2 Default User <ec2-user@ip-10-0-2-226.eu-central-1.compute.internal>
55 lines · 2.3 KiB · Python
from llama import Llama, Dialog
from decouple import config
from utils.contexts import search_context_v2
from threading import Semaphore


class LLM_Model:

    def __init__(self, **params):
        """
        Initialization of the pre-trained model.

        Args:
            ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
            tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
            max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 128.
            max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 4.
        """
        self.generator = Llama.build(**params)
        self.max_queue_size = config('LLM_MAX_QUEUE_SIZE', cast=int, default=1)
        # The semaphore bounds how many prompt batches may hit the model concurrently.
        self.semaphore = Semaphore(config('LLM_MAX_BATCH_SIZE', cast=int, default=1))
        self.queue = list()
        self.responses = list()
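    # Example environment configuration read above via python-decouple
    # (illustrative values only; the deployed settings are not part of this file):
    #   LLM_MAX_QUEUE_SIZE=4   assumed queue capacity (read into self.max_queue_size)
    #   LLM_MAX_BATCH_SIZE=1   concurrent generations permitted by the semaphore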
    def __execute_prompts(self, prompts, **params):
        """
        Generate completions for a batch of prompts using the pretrained model.

        Args:
            prompts (list[str]): Batch of prompts to be asked to the LLM.
            temperature (float, optional): The temperature value for controlling randomness in generation. Defaults to 0.6.
            top_p (float, optional): The top-p sampling parameter for controlling diversity in generation. Defaults to 0.9.
            max_gen_len (int, optional): The maximum length of generated sequences. Defaults to 64.
        """
        return self.generator.text_completion(prompts, **params)
    def execute_prompts(self, prompts, **params):
        if self.semaphore.acquire(timeout=10):
            # Release the semaphore even if generation raises.
            try:
                return self.__execute_prompts(prompts, **params)
            finally:
                self.semaphore.release()
        else:
            raise TimeoutError("[Error] LLM is over-requested")
    async def queue_prompt(self, prompt, force=False, **params):
        if self.semaphore.acquire(timeout=10):
            try:
                if force:
                    # Flush the queued prompts together with the new one as a single batch.
                    # Call the private helper directly: this method already holds the semaphore.
                    self.responses = self.__execute_prompts(self.queue + [prompt], **params)
                    self.queue.clear()
                else:
                    self.queue.append(prompt)
                # Wait until response exists
            finally:
                self.semaphore.release()
        else:
            raise TimeoutError("[Error] LLM is over-requested")
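# --- Usage sketch (illustrative only; not part of the original module) ---
# Shows how LLM_Model might be constructed and queried. The checkpoint
# directory and tokenizer path are assumptions; the keyword arguments are
# forwarded to Llama.build from facebook/research/llama.
if __name__ == "__main__":
    model = LLM_Model(
        ckpt_dir="llama-2-7b/",            # assumed checkpoint directory
        tokenizer_path="tokenizer.model",  # assumed tokenizer path
        max_seq_len=128,
        max_batch_size=4,
    )
    try:
        completions = model.execute_prompts(
            ["Suggest three OR filters for a product search about laptops:"],
            temperature=0.6,
            top_p=0.9,
            max_gen_len=64,
        )
        print(completions)
    except TimeoutError as exc:
        # Raised when the semaphore cannot be acquired within 10 seconds.
        print(exc)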