openreplay/ee/intelligent_search/core/llm_api.py
MauricioGarciaS 16efb1316c
feat(intelligent-search): intelligent search service (#1545)
* feature(intelligent-search): Added an API to connect to llama.cpp on EC2 and filter the response into OR filters

* Updated the SQL-to-filter script and added init.sql for tables

* feature(intelligent-search): Replaced llama.cpp with llama running on GPU, now contained in the API

* Updated Dockerfile to use GPU and download LLM from S3

* Added link to facebookresearch/llama

* Updated Dockerfile

* Updated requirements and Dockerfile base images

* fixed minor issues: removed unused variables, updated COPY and replaced values

* fix(intelligent-search): Fixed WHERE statement filter

* feature(smart-charts): Added method to create charts using llama. style(intelligent-search): Changed attribute names to match the frontend format. fix(intelligent-search): Fixed a vulnerability in requirements and other small issues

* Added some tests before deploying the service

* Added semaphore to handle concurrency

---------

Co-authored-by: EC2 Default User <ec2-user@ip-10-0-2-226.eu-central-1.compute.internal>
2023-10-25 10:13:58 +02:00

55 lines
2.3 KiB
Python

from llama import Llama, Dialog
from decouple import config
from utils.contexts import search_context_v2
from threading import Semaphore


class LLM_Model:
    def __init__(self, **params):
        """
        Initialize the pre-trained model.

        Args (forwarded to Llama.build):
            ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
            tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
            max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 128.
            max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 4.
        """
        self.generator = Llama.build(**params)
        self.max_queue_size = config('LLM_MAX_QUEUE_SIZE', cast=int, default=1)
        # Cap concurrent generation calls; acquire times out when the model is saturated.
        self.semaphore = Semaphore(config('LLM_MAX_BATCH_SIZE', cast=int, default=1))
        self.queue = list()
        self.responses = list()
    def __execute_prompts(self, prompts, **params):
        """
        Generate text completions for a batch of prompts using the pretrained model.

        Args (forwarded to Llama.text_completion):
            prompts (list[str]): batch of prompts to be sent to the LLM.
            temperature (float, optional): The temperature value for controlling randomness in generation. Defaults to 0.6.
            top_p (float, optional): The top-p sampling parameter for controlling diversity in generation. Defaults to 0.9.
            max_gen_len (int, optional): The maximum length of generated sequences. Defaults to 64.
        """
        return self.generator.text_completion(prompts, **params)
    def execute_prompts(self, prompts, **params):
        if not self.semaphore.acquire(timeout=10):
            raise TimeoutError("[Error] LLM is over-requested")
        try:
            return self.__execute_prompts(prompts, **params)
        finally:
            # Release even if generation fails, so the slot is not leaked.
            self.semaphore.release()
    async def queue_prompt(self, prompt, force=False, **params):
        if not self.semaphore.acquire(timeout=10):
            raise TimeoutError("[Error] LLM is over-requested")
        try:
            if force:
                # Flush the pending queue together with the new prompt as one batch.
                # Call the private method directly: execute_prompts would try to
                # re-acquire the semaphore we already hold and deadlock.
                self.responses = self.__execute_prompts(self.queue + [prompt], **params)
                self.queue = list()
            else:
                self.queue.append(prompt)
                # Wait until response exists
        finally:
            self.semaphore.release()
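

# Minimal usage sketch, assuming the reference API from facebookresearch/llama
# (Llama.build / text_completion) and hypothetical checkpoint and tokenizer
# paths that are not part of this repository.
if __name__ == "__main__":
    model = LLM_Model(
        ckpt_dir="llama-2-7b/",            # hypothetical checkpoint directory
        tokenizer_path="tokenizer.model",  # hypothetical tokenizer path
        max_seq_len=128,
        max_batch_size=4,
    )
    # Generation parameters are forwarded to Llama.text_completion via **params.
    results = model.execute_prompts(
        ["Convert this search query into OR filters:"],
        temperature=0.6,
        top_p=0.9,
        max_gen_len=64,
    )
    for result in results:
        # text_completion returns one dict per prompt with a 'generation' field.
        print(result["generation"])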