openreplay/ee/recommendation/ml_trainer/main.py
MauricioGarciaS cea5eda985
feat(recommendations): Added recommendation service (ml_service) and trainer (ml_trainer) (#1275)
* Created two services: recommendation training and recommendation serving

* Temporarily deleted Docker files

* Added features based on signals information

* Added method to get sessions features using PG

* Added the same utils and core elements to ml_trainer

* Added checks before training models, added handler for model serving

* Updated serving API and recommendation functions to use frontend signals features

* Reorganized modules to have a base image for both serving and training

* Added Dockerfiles and base Dockerfile

* Solved issue while ordering sessions by relevance

* Added method to save user feedback of recommendations

* Added security authorization

* Updated Dockerfile

* Fixed issues with secret insertion into the API

* Updated feedback structure

* Added git for DAGs

* Solved insertion issue for recommendation feedback

* Changed update method from def to async def; it is now called during startup

* Solved issues with Airflow running MLflow in a DAG

* Changed sanity checks and added middleware params

* base path renaming

* Changed update method to an interval method which loads one model every 10s if there are models to download

* Added sql files for recommendation service and trainer

* Cleaned files and added documentation for methods and classes

* Added README file

* Renamed endpoints, changed None to an empty array and updated the README

* refactor(recommendation): optimized query

* style(recommendation): moved imports to the top of the file, renamed endpoint parameters, optimized functions

* refactor(recommendation): .gitignore

* refactor(recommendation): .gitignore

* refactor(recommendation): Optimized Dockerfiles

* refactor(recommendation): changed imports

* refactor(recommendation): optimized requests

* refactor(recommendation): optimized requests

* Fixed boot for FastAPI, updated some queries

* Fixed issues while downloading models and while returning json response from API

* Limited the number of recommendations and set a minimum score for presenting them

* fix(recommendation): fixed some queries and updated prediction method

* Added env variable to control the number of predictions to make

* docs(recommendation): Added third party libraries used in recommendation service

* frozen requirements

* Update base_crons.py

added `misfire_grace_time` to the recommendation crons (a short APScheduler sketch follows this list)
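
For context, below is a minimal sketch of how `misfire_grace_time` is typically attached to an APScheduler job such as the recommendation crons in base_crons.py; the job callable, id and timings shown here are illustrative assumptions, not the values used in the repository.

    # Hedged sketch: misfire_grace_time lets a job that missed its scheduled slot
    # (busy event loop, restart, ...) still run if it is no more than 60 seconds late,
    # instead of being skipped entirely. Callable, id and timings are assumptions.
    from apscheduler.schedulers.asyncio import AsyncIOScheduler

    scheduler = AsyncIOScheduler()
    scheduler.add_job(
        func=train_recommendation_models,  # hypothetical cron callable
        trigger='interval',
        hours=24,
        misfire_grace_time=60,             # seconds of tolerated lateness
        id='recommendation_trainer_cron',
    )
    scheduler.start()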

---------

Co-authored-by: Taha Yassine Kraiem <tahayk2@gmail.com>
2023-06-07 15:58:33 +02:00

118 lines
4.9 KiB
Python

import mlflow
import hashlib
import argparse
import numpy as np

from decouple import config
from datetime import datetime

from core.user_features import get_training_database
from core.recommendation_model import SVM_recommendation, sort_database

mlflow.set_tracking_uri(config('MLFLOW_TRACKING_URI'))


def handle_database(x_train, y_train):
    """
    Checks that the training set is reasonably balanced; if it is not and the minority
    class has enough samples, rebalances it by subsampling the majority class.
    Returns (None, None) when there is not enough data to train.
    """
    total = len(y_train)
    if total < 13:
        return None, None
    train_balance = y_train.sum() / total
    if train_balance < 0.4:
        # Positives are under-represented: cap the number of negatives.
        positives = y_train[y_train == 1]
        n_positives = len(positives)
        x_positive = x_train[y_train == 1]
        if n_positives < 7:
            return None, None
        else:
            n_negatives_expected = min(int(n_positives / 0.4), int(total - y_train.sum()))
            negatives = y_train[y_train == 0][:n_negatives_expected]
            x_negative = x_train[y_train == 0][:n_negatives_expected]
            # Keep features and labels in the same order (positives first, then negatives)
            # so every sample stays paired with its own label.
            return (np.concatenate((x_positive, x_negative), axis=0),
                    np.concatenate((positives, negatives), axis=0))
    elif train_balance > 0.6:
        # Negatives are under-represented: cap the number of positives.
        negatives = y_train[y_train == 0]
        n_negatives = len(negatives)
        x_negative = x_train[y_train == 0]
        if n_negatives < 7:
            return None, None
        else:
            n_positives_expected = min(int(n_negatives / 0.4), int(y_train.sum()))
            positives = y_train[y_train == 1][:n_positives_expected]
            x_positive = x_train[y_train == 1][:n_positives_expected]
            return (np.concatenate((x_positive, x_negative), axis=0),
                    np.concatenate((positives, negatives), axis=0))
    else:
        return x_train, y_train
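
# Illustrative example (comment only, not part of the original logic): with 30 labelled
# sessions of which 22 are positives (balance 0.73 > 0.6), the branch above keeps all
# 8 negatives and caps the positives at min(int(8 / 0.4), 22) = 20, returning a
# 28-sample set; if the minority class has fewer than 7 samples, or the whole set has
# fewer than 13, (None, None) is returned and training is skipped.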


def main(experiment_name, projectId, tenantId):
    """
    Main training method using mlflow for tracking and S3 for storage.
    Params:
        experiment_name: experiment name for the mlflow repo.
        projectId: project id of the sessions.
        tenantId: tenant of the project id (used mainly as salt for hashing).
    """
    hashed = hashlib.sha256(f'{projectId}-{tenantId}'.encode('utf-8')).hexdigest()
    # Fixed cutoff timestamp (ms); favorite sessions are used as positive signals.
    x_, y_, d = get_training_database(projectId, max_timestamp=1680248412284, favorites=True)
    x, y = handle_database(x_, y_)
    if x is None:
        print(f'[INFO] Project {projectId}: Not enough data to train model - {y_.sum()}/{len(y_) - y_.sum()}')
        return
    x, y = sort_database(x, y)

    _experiment = mlflow.get_experiment_by_name(experiment_name)
    if _experiment is None:
        artifact_uri = config('MODELS_S3_BUCKET', default='./mlruns')
        mlflow.create_experiment(experiment_name, artifact_uri)
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=f'{hashed}-{datetime.now().strftime("%Y-%m-%d_%H:%M")}'):
        reg_model_name = f"{hashed}-RecModel"
        best_meta = {'score': 0, 'model': None, 'name': 'NoName'}
        for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
            with mlflow.start_run(run_name=f'sub_run_with_{kernel}', nested=True):
                print("--")
                model = SVM_recommendation(kernel=kernel, test=True)
                model.fit(x, y)
                mlflow.sklearn.log_model(model, "sk_learn",
                                         serialization_format="cloudpickle")
                mlflow.log_param("kernel", kernel)
                mlflow.log_metric("score", model.score)
                for _name, displ in model.plots().items():
                    # TODO: close figures after logging to avoid overloading memory
                    mlflow.log_figure(displ, f'{_name}.png')
                if model.score > best_meta['score']:
                    best_meta['score'] = model.score
                    best_meta['model'] = model
                    best_meta['name'] = kernel
        # Register only the best-scoring kernel; skip registration if no kernel produced a usable model.
        if best_meta['model'] is not None:
            mlflow.log_metric("score", best_meta['score'])
            mlflow.log_param("name", best_meta['name'])
            mlflow.sklearn.log_model(best_meta['model'], "sk_learn",
                                     serialization_format="cloudpickle",
                                     registered_model_name=reg_model_name)
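
# Note on the serving side (assumption, the exact logic lives in ml_service): the model
# registered above under f"{hashed}-RecModel" can later be fetched from the MLflow model
# registry, e.g. mlflow.sklearn.load_model("models:/<hashed>-RecModel/1"), where "1" is a
# registry version number.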


if __name__ == '__main__':
    import asyncio
    import os

    os.environ['PG_POOL'] = 'true'
    from utils import pg_client

    asyncio.run(pg_client.init())

    parser = argparse.ArgumentParser(
        prog='Recommendation Trainer',
        description='Trains a model that predicts which sessions are likely to be the most '
                    'interesting ones for users to replay.',
    )
    parser.add_argument('--projects', type=int, nargs='+', required=True)
    parser.add_argument('--tenants', type=int, nargs='+', required=True)
    args = parser.parse_args()

    # Project and tenant ids are paired positionally: one tenant id per project id.
    for project_id, tenant_id in zip(args.projects, args.tenants):
        print(f'Processing project {project_id}...')
        main(experiment_name='s3-recommendations', projectId=project_id, tenantId=tenant_id)
    asyncio.run(pg_client.terminate())
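
# Example invocation (assumption: run from the ml_trainer directory with MLFLOW_TRACKING_URI,
# MODELS_S3_BUCKET and the PG_* settings used by utils.pg_client exported):
#   python main.py --projects 42 57 --tenants 1 1
# Project and tenant ids are paired positionally, so pass one tenant id per project id.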