chore(recommendations): update python modules and add airflow DAG to save session features (#1979)

* fix(trainer): Updated requirements

* fix(recommendations): Downgraded pydantic to 1.10.12 and mlflow to 2.5

* Updated the DAG that fills the database with feedback; moved the feedback file from ml_service/core into the common core

* fix(recommendations): Fixed the database update and added more features to the DB

* Updated modules in recommendations trainer and server

* chore(recommendations): Updated python modules for trainer. Added script to save features from feedback sessions into the ML database (a wiring sketch follows this list).

* updated requirements

* updated requirements
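
The headline change is the Airflow job that saves feedback-session features into the ML database. As a minimal sketch of how such a task can be wired (the DAG id Feedback_DB_FILL and the get_today_feedback callable appear in the diff below; the schedule, default_args, and task id here are illustrative assumptions):

# Minimal sketch, not the repo's DAG file: only "Feedback_DB_FILL" and the
# get_today_feedback name come from the diff below; everything else is assumed.
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator

def get_today_feedback():
    ...  # stands in for the real function defined in the DAG diff below

with DAG(
    "Feedback_DB_FILL",                   # DAG id from the diff
    default_args={"retries": 1},          # assumed
    start_date=datetime(2024, 1, 1),      # assumed
    schedule=timedelta(hours=1),          # assumed interval
    catchup=False,
) as dag:
    save_features = PythonOperator(
        task_id="save_session_features",  # hypothetical task id
        python_callable=get_today_feedback,
    )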
MauricioGarciaS authored 2024-04-24 15:10:18 +02:00, committed by GitHub
parent 76c3ed9966
commit 7ffcf79bf6
8 changed files with 131 additions and 94 deletions

View file

@@ -57,6 +57,17 @@ def preprocess(X):
     return x, transform
 
 
+class RecommendationSystem(mlflow.pyfunc.PythonModel):
+    def __init__(self):
+        ...
+
+    def fit(self, X, y):
+        ...
+
+    def predict(self, X):
+        return None
+
+
 class SVM_recommendation(mlflow.pyfunc.PythonModel):
     def __init__(self, test=False, **params):
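
For context, a minimal sketch of how an mlflow.pyfunc.PythonModel like the classes above gets logged and reloaded. Note that mlflow's documented inference hook is predict(self, context, model_input); the classes in this diff layer their own fit/predict signatures on top, and the example model below is hypothetical:

# Minimal sketch assuming mlflow is installed; ConstantModel is hypothetical.
import mlflow
import pandas as pd

class ConstantModel(mlflow.pyfunc.PythonModel):
    # mlflow's documented hook: predict(self, context, model_input)
    def predict(self, context, model_input):
        return [0] * len(model_input)

with mlflow.start_run():
    mlflow.pyfunc.log_model(artifact_path="recommendations", python_model=ConstantModel())
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/recommendations"

loaded = mlflow.pyfunc.load_model(model_uri)
print(loaded.predict(pd.DataFrame({"x": [1, 2, 3]})))  # -> [0, 0, 0]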

View file

@@ -1,4 +1,3 @@
-fastapi==0.95.2
-apscheduler==3.10.1
-uvicorn==0.22.0
-SQLAlchemy==2.0.15
+fastapi==0.110.0
+apscheduler==3.10.4
+uvicorn==0.27.1

View file

@@ -11,10 +11,13 @@ from airflow.operators.bash import BashOperator
 from airflow.operators.python import PythonOperator, ShortCircuitOperator
 from datetime import datetime, timedelta
 from decouple import config
+import numpy as np
 
 _work_dir = os.getcwd()
 sys.path.insert(1, _work_dir)
 from utils import pg_client
-from utils.feedback import ConnectionHandler
+from utils import ch_client
+from core.feedback import ConnectionHandler
+from copy import copy
 from sqlalchemy import text
@@ -27,28 +30,43 @@ dbname = config('pg_dbname_ml')
 password = config('pg_password_ml')
 tracking_uri = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}"
 
-# 1702296756
 def get_today_feedback():
-    connection_handler = ConnectionHandler(tracking_uri)
+    current_datetime = int((datetime.now() - timedelta(seconds=execute_interval)).timestamp())
+    query = f"SELECT project_id, session_id, user_id as viewer_id, payload FROM recommendation_feedback WHERE insertion_time >= {current_datetime}"
+    connection_handler = ConnectionHandler(tracking_uri)  # Connection to mlflow's database
     with connection_handler.get_live_session() as conn:
-        cur = conn.connection().connection.cursor()
-        query = cur.mogrify(
-            f"""SELECT * FROM recommendation_feedback WHERE insertion_time > %(time_lower_bound)s;""",
-            {'time_lower_bound': int(datetime.now().timestamp()) - execute_interval})
-        conn.execute(text(query.decode("utf-8")))
+        cur = conn.execute(text(query))
+        res = cur.fetchall()
         conn.commit()
+    for i in range(len(res)):
+        payload_i = res[i][3]
+        res[i] = res[i][:3] + (payload_i['reason'], payload_i['comment'], payload_i['interesting'])
+    df = pd.DataFrame(res, columns=["project_id", "session_id", "viewer_id", "reason", "comment", "interesting"])
 
-def get_features_pg(ti):
-    os.environ['PG_POOL'] = 'true'
-    asyncio.run(pg_client.init())
-    sessionIds = ti.xcom_pull(key='sessionIds')
-    userIds = ti.xcom_pull(key='userIds').split(',')
+    sessionsIds_list = df['session_id'].unique()
+    sessionIds = ','.join([str(k) for k in sessionsIds_list])
+    with ch_client.ClickHouseClient() as conn:
+        query = f"""SELECT session_id, issue_type, count(1) as event_count FROM experimental.events WHERE session_id in ({sessionIds}) AND event_type = 'ISSUE' GROUP BY session_id, issue_type;"""
+        res = conn.execute(query)
+    df3 = pd.DataFrame(res)
+    df3 = df3.pivot(index='session_id', columns=['issue_type'], values=['event_count']).event_count
+    issues_type_found = df3.columns
+    df[issues_type_found] = [[0] * len(issues_type_found)] * len(df)
+    for sess in df3.index:
+        tmp = copy(df[df['session_id'] == sess])
+        tmp[issues_type_found] = [df3.loc[sess]] * len(tmp)
+        df.loc[df['session_id'] == sess] = tmp
+
+    asyncio.run(pg_client.init())  # Connection to OR postgres database
     with pg_client.PostgresClient() as conn:
-        conn.execute(
-            """SELECT T.project_id,
+        conn.execute("""SELECT T.project_id,
                       T.session_id,
                       T2.viewer_id,
                       T.pages_count,
@@ -89,29 +107,31 @@ def get_features_pg(ti):
                       AND duration IS NOT NULL) as T
                       USING (session_id);""".format(sessionIds=sessionIds)
         )
-        response = conn.fetchall()
-    sessionIds = [int(sessId) for sessId in sessionIds.split(',')]
-    df = pd.DataFrame(response)
-    df2 = pd.DataFrame(zip(userIds, sessionIds), columns=['viewer_id', 'session_id'])
+        res = conn.fetchall()
+    df2 = pd.DataFrame(res,
+                       columns=["project_id", "session_id", "viewer_id", "pages_count", "events_count", "errors_count",
+                                "duration", "country", "issue_score", "device_type", "replays", "network_access",
+                                "storage_access", "console_access", "stack_access"])
+    df2 = df.merge(df2, on=['session_id', 'project_id', 'viewer_id'], how='inner')
+    for i in range(len(df2.columns)):
+        if df2.dtypes[i] == np.float64:
+            df2[df2.columns[i]] = df2[df2.columns[i]].astype('int')
+    df2.fillna(0, inplace=True)
 
-    base_query = f"""INSERT INTO {features_table_name} (project_id, session_id, viewer_id, pages_count, events_count,
-                     issues_count, duration, country, issue_score, device_type,
-                     replays, network_access, storage_access, console_access,
-                     stack_access) VALUES """
-    count = 0
+    ## Upload df2 to DB table
+    base_query = f"""INSERT INTO {features_table_name} ({', '.join(df2.columns)}) VALUES """
     params = {}
-    for i in range(len(df)):
-        viewer = df['viewer_id'].iloc[i]
-        session = df['session_id'].iloc[i]
-        d = df2[df2['viewer_id'] == viewer]
-        x = d[d['session_id'] == session]
-        if len(x) > 0:
-            template = '('
-            for k, v in x.items():
-                params[f'{k}_{count}'] = v.values[0]
-                template += f's({k}_{count})%'
-            base_query += template + '), '
-            count += 1
+    for i in range(len(df2)):
+        template = '('
+        for k, v in df2.iloc[i].items():
+            try:
+                params[f'{k}_{i}'] = v.item()
+            except Exception:
+                params[f'{k}_{i}'] = v
+            template += f'%({k}_{i})s, '
+        base_query += template[:-2] + '), '
     base_query = base_query[:-2]
     connection_handler = ConnectionHandler(tracking_uri)
    with connection_handler.get_live_session() as conn:
@@ -121,6 +141,10 @@ def get_features_pg(ti):
         conn.commit()
 
 
+def get_features_pg():
+    ...
+
+
 dag = DAG(
     "Feedback_DB_FILL",
     default_args={
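
Two steps in the rewritten DAG are easy to misread: the pivot that turns per-session issue rows from ClickHouse into one column per issue_type, and the INSERT statement built from psycopg2-style %(name)s placeholders. A self-contained toy run of both, with made-up data and a placeholder table name:

# Toy reproduction of the two steps, with made-up data.
import pandas as pd

# 1) Pivot per-session issue counts into one column per issue_type, as done
#    with the ClickHouse result (df3) in the DAG; missing pairs become NaN.
rows = [(1, 'click_rage', 3), (1, 'dead_click', 1), (2, 'click_rage', 2)]
df3 = pd.DataFrame(rows, columns=['session_id', 'issue_type', 'event_count'])
df3 = df3.pivot(index='session_id', columns=['issue_type'], values=['event_count']).event_count

# 2) Build the parameterized INSERT the same way the DAG does: one %(name)s
#    placeholder (psycopg2 named paramstyle) per value, collected in a dict.
#    "features" is a placeholder table name.
df2 = df3.reset_index().fillna(0)
base_query = f"INSERT INTO features ({', '.join(df2.columns)}) VALUES "
params = {}
for i in range(len(df2)):
    template = '('
    for k, v in df2.iloc[i].items():
        params[f'{k}_{i}'] = v.item() if hasattr(v, 'item') else v
        template += f'%({k}_{i})s, '
    base_query += template[:-2] + '), '
base_query = base_query[:-2]
print(base_query)  # INSERT INTO features (session_id, click_rage, dead_click) VALUES (...), (...)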

View file

@@ -7,6 +7,7 @@ find airflow/ -type f -name "*.cfg" -exec sed -i "s/{{pg_dbname_airflow}}/${pg_dbname_airflow}/g" {} \;
 find airflow/ -type f -name "*.cfg" -exec sed -i "s#{{airflow_secret_key}}#${airflow_secret_key}#g" {} \;
 export MLFLOW_TRACKING_URI=postgresql+psycopg2://${pg_user_ml}:${pg_password_ml}@${pg_host_ml}:${pg_port_ml}/${pg_dbname_ml}
 git init airflow/dags
+airflow db upgrade
 
 # Airflow setup
 # airflow db init
 # airflow users create \

View file

@@ -1,3 +1,3 @@
-argcomplete==3.0.8
-apache-airflow==2.6.2
-airflow-code-editor==7.2.1
+argcomplete==3.2.2
+apache-airflow==2.8.2
+airflow-code-editor==7.5.0

View file

@@ -1,19 +1,19 @@
 requests==2.31.0
-urllib3==1.26.16
+urllib3==2.0.7
 pyjwt==2.8.0
-SQLAlchemy==2.0.20
-alembic==1.11.1
-psycopg2-binary==2.9.7
+SQLAlchemy==2.0.28
+alembic==1.13.1
+psycopg2-binary==2.9.9
 joblib==1.3.2
-scipy==1.11.2
-scikit-learn==1.3.0
-mlflow==2.5
-clickhouse-driver==0.2.6
-python3-saml==1.15.0
+scipy==1.12.0
+scikit-learn==1.4.1.post1
+mlflow==2.11.1
+clickhouse-driver==0.2.7
+python3-saml==1.16.0
 python-multipart==0.0.6
 python-decouple==3.8
-pydantic==1.10.12
-boto3==1.28.29
+pydantic==2.6.3
+boto3==1.34.57
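
Note the pydantic pin jumps from the 1.10.x line (the earlier "downgraded to 1.10.12" commit) to 2.6.3, which crosses the v1-to-v2 API break. A minimal example of the renames that typically need handling; the Feedback model here is hypothetical:

# pydantic 2.x renames the common v1 methods; the Feedback model is hypothetical.
from pydantic import BaseModel

class Feedback(BaseModel):
    session_id: int
    interesting: bool = False

fb = Feedback(session_id=42)
# v1 (1.10.x): fb.dict(), fb.json(), Feedback.parse_obj(data)
# v2 (2.6.x) equivalents:
print(fb.model_dump())       # {'session_id': 42, 'interesting': False}
print(fb.model_dump_json())  # {"session_id":42,"interesting":false}
print(Feedback.model_validate({'session_id': 7}))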

View file

@@ -22,6 +22,8 @@ class ClickHouseClient:
         self.__client = clickhouse_driver.Client(host=config("ch_host"),
                                                  database="default",
                                                  port=config("ch_port", cast=int),
+                                                 user=config("ch_user", cast=str),
+                                                 password=config("ch_password", cast=str),
                                                  settings=settings) \
             if self.__client is None else self.__client
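
The two added arguments make the wrapped client authenticate explicitly. A minimal sketch of the underlying clickhouse_driver call; the host and credential values below are placeholders for the config(...) lookups in the diff:

# Minimal sketch of the underlying call; host/credentials are placeholders for
# the config(...) lookups shown in the diff.
import clickhouse_driver

client = clickhouse_driver.Client(
    host="localhost",   # placeholder for config("ch_host")
    database="default",
    port=9000,          # placeholder for config("ch_port", cast=int)
    user="default",     # placeholder for config("ch_user", cast=str)
    password="",        # placeholder for config("ch_password", cast=str)
    settings=None,      # the repo passes its own settings dict here
)
print(client.execute("SELECT 1"))  # -> [(1,)]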