From f12562518fb83ce4b93e6b5ee2d6073f09b8cbfd Mon Sep 17 00:00:00 2001 From: ourvakan Date: Thu, 17 Jun 2021 18:08:44 +0300 Subject: [PATCH] boilerplate --- .../buying_clients.ipynb | 1109 +++++++++++++++++ .../credentials.yml.example | 6 + .../explorational_analysis.ipynb | 560 +++++++++ 3 files changed, 1675 insertions(+) create mode 100644 ee/connectors/data_analysis_cookbook/buying_clients.ipynb create mode 100644 ee/connectors/data_analysis_cookbook/credentials.yml.example create mode 100644 ee/connectors/data_analysis_cookbook/explorational_analysis.ipynb diff --git a/ee/connectors/data_analysis_cookbook/buying_clients.ipynb b/ee/connectors/data_analysis_cookbook/buying_clients.ipynb new file mode 100644 index 000000000..350d3ea97 --- /dev/null +++ b/ee/connectors/data_analysis_cookbook/buying_clients.ipynb @@ -0,0 +1,1109 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Identifying buying clients\n", + "\n", + "In this notebook we will create a decision tree based model to identify clients who pay (buyers) and understand what makes a user a client (most relevant features).\n", + "\n", + "We divide our notebook into four stages: data preparation, feature engineering, model building, feature importance analysis" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import psycopg2\n", + "from IPython.display import display\n", + "import yaml\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Data preparation\n", + "\n", + "In this step we load data from the database (PostgreSQL in this example) and keep it locally as a CSV file.\n", + "The main reason for that is to be able to reproduce results quickly."
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "# Create a connection to the database\n", + "\n", + "# Load a config file with credentials\n", + "with open(\"credentials.yml\") as credentials_file:  # context manager closes the file handle\n", + "    conf = yaml.safe_load(credentials_file)['pg']  # safe_load: plain data only, no arbitrary object construction\n", + "# Create a connection\n", + "conn = psycopg2.connect(\n", + " host=conf['host'],\n", + " port=conf['port'],\n", + " database=conf['database'],\n", + " user=conf['user'],\n", + " password=conf['password']\n", + ")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Here we prepare two functions to obtain data from the databases (or .csv files if they were pre-downloaded)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "def load_events(source='csv') -> pd.DataFrame:\n", + "    \"\"\"Obtain session events from the database ('db', also cached as CSV) or from that CSV cache ('csv').\"\"\"\n", + "    if source == 'db':\n", + "        q = f'select * from connector_events where sessionid IN {sessions}'  # `sessions`, `conn`: notebook globals\n", + "        all_events = pd.read_sql(q, conn)\n", + "        all_events.to_csv('all_events_1454_sep.csv', sep='|', index=False)  # same file and separator the 'csv' branch reads\n", + "    elif source == 'csv':\n", + "        all_events = pd.read_csv('all_events_1454_sep.csv', sep='|')\n", + "    else:\n", + "        raise ValueError(\"source parameter should be either 'csv' or 'db'\")\n", + "    return all_events\n", + "\n", + "def load_sessions(source='csv') -> pd.DataFrame:\n", + "    \"\"\"Obtain sessions information from the database ('db', also cached as CSV) or from that CSV cache ('csv').\"\"\"\n", + "    if source == 'db':\n", + "        q = f\"select * from connector_user_sessions where sessionid in {sessions}\"\n", + "        all_sessions = pd.read_sql(q, conn)\n", + "        # Saving as a CSV file is optional\n", + "        all_sessions.to_csv(\"all_sessions.csv\", sep='|', index=False)\n", + "    elif source == 'csv':\n", + "        all_sessions = pd.read_csv(\"all_sessions.csv\", 
sep='|')\n", + "    else:\n", + "        raise ValueError(\"source parameter should be either 'csv' or 'db'\")\n", + "    return all_sessions\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "In the connector's events table we found a client who had a button with a label \"PAY\" by performing a simple query\n", + "\n", + "`q = \"select session_id from connector_events where mouseclick_label = 'PAY'\";`\n", + "\n", + "We went on to find all session ids of this client from our internal tables and saved them in `all_sessions_1454.csv`.\n", + "This step is unnecessary for clients and only explained for general clarity.\n", + "**The most important takeaway here is that we have prepared a list of sessions for which we know\n", + "whether a click on the \"PAY\" button has been made or not.**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "sessions_info = pd.read_csv(\"all_sessions_1454.csv\")\n", + "sessions = tuple(sessions_info['session_id'])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "First off, let's see that the dataset is highly imbalanced, because the number of buyers is much less than the number of ordinary visitors.\n", + "In fact, the percentage of buying clients (0.04%) is so small, it's not even visible on the pie chart.\n", + "Hence we're going to use special techniques for imbalanced datasets." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "buyers_count = sessions_info[sessions_info.paid == 1].shape[0]\n", + "regular_count = sessions_info.shape[0] - buyers_count\n", + "ax = plt.subplot(111)\n", + "wedges, texts, _ = ax.pie(x=(buyers_count, regular_count),\n", + " shadow=False,\n", + " labels=['Paying visitors', 'Regular visitors'],\n", + " autopct='%1.2f%%',\n", + " explode=(0, 0.8))\n", + "\n", + "for w in wedges:\n", + " w.set_linewidth(1)\n", + " w.set_edgecolor('white')\n", + "\n", + "plt.show()\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\david\\appdata\\local\\programs\\python\\python38\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3361: DtypeWarning: Columns (9,11,12,13,26) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " if (await self.run_code(code, result, async_=asy)):\n" + ] + } + ], + "source": [ + "all_events = load_events()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's take a look at the events dataset by printing one session" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": " sessionid connectioninformation_downlink \\\n0 4207534060820504 NaN \n2 4207534060820504 NaN \n3 4207534060820504 NaN \n4 4207534060820504 NaN \n5 4207534060820504 NaN \n6 4207534060820504 NaN \n7 4207534060820504 NaN \n\n connectioninformation_type consolelog_level consolelog_value \\\n0 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n5 NaN NaN NaN \n6 NaN NaN NaN \n7 NaN NaN NaN \n\n customevent_messageid customevent_name 
customevent_payload \\\n0 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n5 NaN NaN NaN \n6 NaN NaN NaN \n7 NaN NaN NaN \n\n customevent_timestamp errorevent_message ... issueevent_messageid \\\n0 NaN NaN ... 5.680858e+09 \n2 NaN NaN ... 5.680913e+09 \n3 NaN NaN ... NaN \n4 NaN NaN ... 5.680859e+09 \n5 NaN NaN ... NaN \n6 NaN NaN ... NaN \n7 NaN NaN ... 5.680898e+09 \n\n issueevent_timestamp issueevent_type \\\n0 1.614202e+12 click_rage \n2 1.614202e+12 click_rage \n3 NaN NaN \n4 1.614202e+12 click_rage \n5 NaN NaN \n6 NaN NaN \n7 1.614202e+12 cpu \n\n issueevent_contextstring issueevent_context \\\n0 SIGN OUT Triston Armstrong DEVELOPER Join GitS... NaN \n2 SAVE & NEXT NaN \n3 NaN NaN \n4 SIGN OUT Triston Armstrong DEVELOPER Join GitS... NaN \n5 NaN NaN \n6 NaN NaN \n7 https://app.gitstart.com/ NaN \n\n issueevent_payload customissue_name customissue_payload \\\n0 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n5 NaN NaN NaN \n6 NaN NaN NaN \n7 {\"Duration\":10581,\"Rate\":94} NaN NaN \n\n received_at batch_order_number \n0 1616761976450 2962 \n2 1616761976855 3003 \n3 1616761976460 2965 \n4 1616761976460 2966 \n5 1616761976464 2968 \n6 1616761976539 2972 \n7 1616761976661 2988 \n\n[7 rows x 49 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sessionidconnectioninformation_downlinkconnectioninformation_typeconsolelog_levelconsolelog_valuecustomevent_messageidcustomevent_namecustomevent_payloadcustomevent_timestamperrorevent_message...issueevent_messageidissueevent_timestampissueevent_typeissueevent_contextstringissueevent_contextissueevent_payloadcustomissue_namecustomissue_payloadreceived_atbatch_order_number
04207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...5.680858e+091.614202e+12click_rageSIGN OUT Triston Armstrong DEVELOPER Join GitS...NaNNaNNaNNaN16167619764502962
24207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...5.680913e+091.614202e+12click_rageSAVE & NEXTNaNNaNNaNNaN16167619768553003
34207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN16167619764602965
44207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...5.680859e+091.614202e+12click_rageSIGN OUT Triston Armstrong DEVELOPER Join GitS...NaNNaNNaNNaN16167619764602966
54207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN16167619764642968
64207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN16167619765392972
74207534060820504NaNNaNNaNNaNNaNNaNNaNNaNNaN...5.680898e+091.614202e+12cpuhttps://app.gitstart.com/NaN{\"Duration\":10581,\"Rate\":94}NaNNaN16167619766612988
\n

7 rows × 49 columns

\n
" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_events[all_events.sessionid == all_events.iloc[0].sessionid].head(10)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We mostly see NaNs and that's OK.\n", + "One row in the session events file contains the user's actions grouped by timestamp.\n", + "For example, if there was a mouse click event at a time $t$, only the columns corresponding to that click\n", + "(such as mouseclick_label, mouseclick_hesitationtime etc) will be filled and the rest will be NaNs.\n", + "The columns sessionid, received_at, batch_order_number will always be filled as they contain information about the\n", + "session's unique identifier, the time at which the event was received by the connector's worker and the order number\n", + "to ensure the chronological ordering is preserved when inserting into the database in batches." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "['sessionid',\n 'connectioninformation_downlink',\n 'connectioninformation_type',\n 'consolelog_level',\n 'consolelog_value',\n 'customevent_messageid',\n 'customevent_name',\n 'customevent_payload',\n 'customevent_timestamp',\n 'errorevent_message',\n 'errorevent_messageid',\n 'errorevent_name',\n 'errorevent_payload',\n 'errorevent_source',\n 'errorevent_timestamp',\n 'jsexception_message',\n 'jsexception_name',\n 'jsexception_payload',\n 'metadata_key',\n 'metadata_value',\n 'mouseclick_id',\n 'mouseclick_hesitationtime',\n 'mouseclick_label',\n 'pageevent_firstcontentfulpaint',\n 'pageevent_firstpaint',\n 'pageevent_messageid',\n 'pageevent_referrer',\n 'pageevent_speedindex',\n 'pageevent_timestamp',\n 'pageevent_url',\n 'pagerendertiming_timetointeractive',\n 'pagerendertiming_visuallycomplete',\n 'rawcustomevent_name',\n 
'rawcustomevent_payload',\n 'setviewportsize_height',\n 'setviewportsize_width',\n 'timestamp_timestamp',\n 'user_anonymous_id',\n 'user_id',\n 'issueevent_messageid',\n 'issueevent_timestamp',\n 'issueevent_type',\n 'issueevent_contextstring',\n 'issueevent_context',\n 'issueevent_payload',\n 'customissue_name',\n 'customissue_payload',\n 'received_at',\n 'batch_order_number']" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's take a glance at all available features\n", + "list(all_events.columns)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Feature engineering\n", + "\n", + "How are we going to predict buying users?\n", + "What kind of features identify them?\n", + "\n", + "- Did the user visit the website earlier?\n", + "- Did the website fail to display images?\n", + "- Did users experience many issues on the website?\n", + "- What pages did the user visit?\n", + "\n", + "All of those questions sound important.\n", + "In this section we will extract the answers to these questions from the datasets of events and sessions for each user.\n", + "We'll create a vector of numerical features and assign them to each user who visited the site.\n", + "Our goal is to see if the feature sets of buyers and non-buyers are separable by some nonlinear function with good precision. We'll be looking for this function using a decision tree model.\n", + "Of course one can experiment further by applying any other algorithm." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Unfortunately, none of the paying clients received ids.\n", + "Hence we'll only be looking into the session features."
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [], + "source": [ + "# Create a DataFrame for session features\n", + "labels = sessions_info['paid']\n", + "session_features = sessions_info.drop(['paid'], axis=1)\n", + "session_features.rename({'session_id': 'sessionid'}, axis=1, inplace=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's get to some meaningful features. For example, let's create a function\n", + "that will tell if a specific event happened during the session." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [], + "source": [ + "def add_feature_about_event_presence(session_features, column, feature_name, dtype='int'):\n", + "    \"\"\"Add a 0/1 column `feature_name`: 1.0 if the session's last non-null `column` value is positive ('int') or non-empty ('str').\"\"\"\n", + "    temp_sessions = all_events[['sessionid', column]].dropna()  # `all_events` is a notebook global\n", + "    temp_sessions = temp_sessions.drop_duplicates(subset='sessionid', keep='last')\n", + "    if temp_sessions.shape[0] == 0:\n", + "        return session_features  # no such events in any session: the feature column is not added\n", + "    if dtype not in ('int', 'str'):\n", + "        raise ValueError(\"dtype parameter should be either 'int' or 'str'\")\n", + "    session_features = session_features.merge(temp_sessions, how='left', on='sessionid')\n", + "    values = session_features[column]\n", + "    # NaN (session without the event) compares as != '', so missing values must be excluded explicitly\n", + "    present = values.notna() & ((values != '') if dtype == 'str' else (values > 0))\n", + "    session_features[feature_name] = present.astype(float)\n", + "    return session_features.drop([column], axis=1)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "for column, feature_name, dtype in [('errorevent_messageid', 'error_event', 'int'),\n", + " ('customevent_messageid', 'custom_event', 'int'),\n", + " ('jsexception_message', 'js_exception', 'str'),\n", + " ('customissue_name', 'custom_issue', 'str')\n", + " 
]:\n", + "    session_features = add_feature_about_event_presence(session_features, column, feature_name, dtype)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "At some point it's interesting to take a look\n", + "at the maximum values of some parameters during the session.\n", + "These features can be added with the function below:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [], + "source": [ + "def add_max_val(session_features, column):\n", + "    \"\"\"Add the per-session maximum of `column` from `all_events` as a feature (0 for sessions without a value).\"\"\"\n", + "    feature_df_agg = all_events[['sessionid', column]].dropna().groupby('sessionid').agg('max').reset_index()\n", + "    if feature_df_agg.shape[0] > 0:\n", + "        session_features = session_features.merge(feature_df_agg, how='left', on='sessionid')\n", + "        session_features[column] = session_features[column].fillna(0)  # use the parameter, not the caller's loop variable `col`\n", + "    return session_features" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [], + "source": [ + "max_feature_columns = ['pageevent_firstcontentfulpaint',\n", + " 'pageevent_firstpaint',\n", + " 'pageevent_speedindex',\n", + " 'pagerendertiming_timetointeractive',\n", + " 'pagerendertiming_visuallycomplete',\n", + " 'rawcustomevent_name',\n", + " 'rawcustomevent_payload',\n", + " 'setviewportsize_height',\n", + " 'setviewportsize_width']\n", + "\n", + "for col in max_feature_columns:\n", + "    session_features = add_max_val(session_features, col)\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We also should handle categorical variables, the ones that can take on one of a limited,\n", + "and usually fixed, number of possible values (such as user's browser)."
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [], + "source": [ + "def add_categorial_feature(session_features, column, feature_name):  # integer-encode `column` into `feature_name` (NaN -> -1)\n", + "    session_features[feature_name] = pd.Categorical(session_features[column], categories=list(session_features[column].dropna().unique())).codes  # NaN is not a valid category\n", + "    return session_features  # still mutated in place; returned for consistency with the other add_* helpers" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\david\\appdata\\local\\programs\\python\\python38\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3361: DtypeWarning: Columns (1,2,3,4,5,8,9,10,11) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " if (await self.run_code(code, result, async_=asy)):\n" + ] + } + ], + "source": [ + "# Get additional information from sessions table\n", + "sessions_table = load_sessions()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "54057\n" + ] + } + ], + "source": [ + "sessions_table = sessions_table.drop_duplicates(subset=['sessionid'], keep='last')\n", + "print(sessions_table.shape[0])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [], + "source": [ + "for iss in ['click_rage', 'missing_resource', 'dead_click', 'js_exception', 'bad_request', 'cpu', 'memory']:\n", + "    session_features[iss] = session_features['issue_types'].apply(lambda x: 1 if iss in x else 0)\n", + "session_features = session_features.drop(['issue_types'], axis=1)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + 
"execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": " sessionid events_count pages_count error_event \\\n0 4323603410944837 2 1 0.0 \n1 4323604627948361 11 1 0.0 \n2 4323603204776517 1 1 0.0 \n3 4323548146402182 37 7 0.0 \n4 4323554393301661 10 3 0.0 \n5 4323594072990251 17 5 0.0 \n6 4323655897228014 1 1 0.0 \n7 4323630712718240 7 1 0.0 \n8 4323592422793765 11 1 0.0 \n9 4323618781102971 7 2 0.0 \n\n pageevent_firstcontentfulpaint pageevent_firstpaint pageevent_speedindex \\\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n2 0.0 0.0 0.0 \n3 3940.0 3850.0 3825.0 \n4 0.0 0.0 0.0 \n5 0.0 0.0 166.0 \n6 0.0 0.0 0.0 \n7 0.0 0.0 0.0 \n8 2312.0 2251.0 2312.0 \n9 0.0 0.0 0.0 \n\n pagerendertiming_timetointeractive pagerendertiming_visuallycomplete \\\n0 0.0 0.0 \n1 0.0 0.0 \n2 0.0 0.0 \n3 0.0 0.0 \n4 0.0 0.0 \n5 0.0 0.0 \n6 0.0 0.0 \n7 0.0 0.0 \n8 0.0 0.0 \n9 0.0 0.0 \n\n setviewportsize_height setviewportsize_width click_rage \\\n0 0.0 0.0 0 \n1 0.0 0.0 1 \n2 0.0 0.0 0 \n3 0.0 0.0 1 \n4 0.0 0.0 0 \n5 0.0 0.0 0 \n6 0.0 0.0 0 \n7 0.0 0.0 0 \n8 0.0 0.0 0 \n9 0.0 0.0 1 \n\n missing_resource dead_click js_exception bad_request cpu memory \n0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 \n2 0 0 0 0 1 0 \n3 1 0 0 0 1 0 \n4 0 0 0 0 0 0 \n5 0 0 0 0 1 0 \n6 0 0 0 0 0 0 \n7 0 0 0 0 0 0 \n8 0 1 0 0 0 0 \n9 0 0 0 0 0 0 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sessionidevents_countpages_counterror_eventpageevent_firstcontentfulpaintpageevent_firstpaintpageevent_speedindexpagerendertiming_timetointeractivepagerendertiming_visuallycompletesetviewportsize_heightsetviewportsize_widthclick_ragemissing_resourcedead_clickjs_exceptionbad_requestcpumemory
04323603410944837210.00.00.00.00.00.00.00.00000000
143236046279483611110.00.00.00.00.00.00.00.01000000
24323603204776517110.00.00.00.00.00.00.00.00000010
343235481464021823770.03940.03850.03825.00.00.00.00.01100010
443235543933016611030.00.00.00.00.00.00.00.00000000
543235940729902511750.00.00.0166.00.00.00.00.00000010
64323655897228014110.00.00.00.00.00.00.00.00000000
74323630712718240710.00.00.00.00.00.00.00.00000000
843235924227937651110.02312.02251.02312.00.00.00.00.00010000
94323618781102971720.00.00.00.00.00.00.00.01000000
\n
" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_features.head(10)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 19, + "outputs": [ + { + "data": { + "text/plain": "(57794, 18)" + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_features.shape" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 20, + "outputs": [], + "source": [ + "def add_one_hot_encoded_feature(origin_df, session_features, column):\n", + " df = origin_df[['sessionid', column]]\n", + " dummies = pd.get_dummies(df[column], prefix=column, dummy_na=True)\n", + " df = pd.concat([df, dummies], axis=1)\n", + " session_features = session_features.merge(df, how='left', on='sessionid')\n", + " session_features = session_features.drop([column], axis=1)\n", + " return session_features" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 21, + "outputs": [], + "source": [ + "for col in ['user_browser', 'user_country', 'user_device', 'connection_type']:\n", + " session_features = add_one_hot_encoded_feature(sessions_table, session_features, col)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 22, + "outputs": [ + { + "data": { + "text/plain": "(57794, 34)" + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_features.shape" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 23, + "outputs": [], + "source": [ + "columns_to_merge = ['connection_effective_bandwidth', 
'session_start_timestamp',\n", + " 'session_duration', 'user_device_heap_size',\n", + " 'user_device_memory_size', 'avg_cpu', 'avg_fps', 'max_cpu',\n", + " 'max_fps', 'max_total_js_heap_size', 'max_used_js_heap_size',\n", + " 'js_exceptions_count', 'long_tasks_total_duration', 'long_tasks_max_duration',\n", + " 'long_tasks_count', 'inputs_count', 'clicks_count', 'sessionid'\n", + " ]\n", + "session_features = session_features.merge(sessions_table[columns_to_merge],\n", + " how='left',\n", + " on='sessionid')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 24, + "outputs": [ + { + "data": { + "text/plain": " sessionid events_count pages_count error_event \\\n0 4323603410944837 2 1 0.0 \n1 4323604627948361 11 1 0.0 \n2 4323603204776517 1 1 0.0 \n3 4323548146402182 37 7 0.0 \n4 4323554393301661 10 3 0.0 \n\n pageevent_firstcontentfulpaint pageevent_firstpaint pageevent_speedindex \\\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n2 0.0 0.0 0.0 \n3 3940.0 3850.0 3825.0 \n4 0.0 0.0 0.0 \n\n pagerendertiming_timetointeractive pagerendertiming_visuallycomplete \\\n0 0.0 0.0 \n1 0.0 0.0 \n2 0.0 0.0 \n3 0.0 0.0 \n4 0.0 0.0 \n\n setviewportsize_height ... max_cpu max_fps max_total_js_heap_size \\\n0 0.0 ... 89.0 120.0 51399294.0 \n1 0.0 ... 0.0 0.0 0.0 \n2 0.0 ... 90.0 190.0 49323074.0 \n3 0.0 ... 68.0 60.0 140152925.0 \n4 0.0 ... 38.0 61.0 86162824.0 \n\n max_used_js_heap_size js_exceptions_count long_tasks_total_duration \\\n0 46629158.0 0.0 9161.0 \n1 0.0 0.0 0.0 \n2 47140794.0 0.0 407.0 \n3 121761837.0 0.0 0.0 \n4 82145777.0 0.0 0.0 \n\n long_tasks_max_duration long_tasks_count inputs_count clicks_count \n0 68.0 7.0 0.0 0.0 \n1 0.0 0.0 1.0 0.0 \n2 73.0 3.0 1.0 0.0 \n3 0.0 0.0 0.0 0.0 \n4 0.0 0.0 0.0 0.0 \n\n[5 rows x 51 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sessionidevents_countpages_counterror_eventpageevent_firstcontentfulpaintpageevent_firstpaintpageevent_speedindexpagerendertiming_timetointeractivepagerendertiming_visuallycompletesetviewportsize_height...max_cpumax_fpsmax_total_js_heap_sizemax_used_js_heap_sizejs_exceptions_countlong_tasks_total_durationlong_tasks_max_durationlong_tasks_countinputs_countclicks_count
04323603410944837210.00.00.00.00.00.00.0...89.0120.051399294.046629158.00.09161.068.07.00.00.0
143236046279483611110.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
24323603204776517110.00.00.00.00.00.00.0...90.0190.049323074.047140794.00.0407.073.03.01.00.0
343235481464021823770.03940.03850.03825.00.00.00.0...68.060.0140152925.0121761837.00.00.00.00.00.00.0
443235543933016611030.00.00.00.00.00.00.0...38.061.086162824.082145777.00.00.00.00.00.00.0
\n

5 rows × 51 columns

\n
" + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_features = session_features.dropna(how='all', axis=0)\n", + "session_features = session_features.fillna(0)\n", + "session_features.head()\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 25, + "outputs": [ + { + "data": { + "text/plain": "(57794, 51)" + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_features.shape" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 25, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 3. Build model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Decision tree model is chosen because it is known to work great with heterogenous datasets and correlated features" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 26, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import sklearn\n", + "from sklearn.model_selection import train_test_split" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 27, + "outputs": [], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(session_features.drop(['sessionid'], axis=1),\n", + " labels,\n", + " test_size=0.15,\n", + " random_state=42)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 28, + "outputs": [ + { + "data": { + "text/plain": "(6, 17)" + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } 
+ ], + "source": [ + "# Ensure that test set has paying clients\n", + "sum(y_test), sum(y_train)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 29, + "outputs": [ + { + "data": { + "text/plain": "0.0003460630241836984" + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Ratio of paying sessions will be denoted by EPSILON\n", + "EPSILON = y_train[y_train == 1].shape[0]/y_train.shape[0]\n", + "EPSILON" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 30, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\david\\appdata\\local\\programs\\python\\python38\\lib\\site-packages\\xgboost\\sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[17:33:08] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "data": { + "text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=None)" + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xgc0 = xgb.XGBClassifier()\n", + "xgc0.fit(x_train, y_train)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 31, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[17:33:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "data": { + "text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=2889.6470588235293,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=None)" + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xgc = xgb.XGBClassifier(scale_pos_weight=1/EPSILON)\n", + "xgc.fit(x_train, y_train)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 4. Evaluate and choose the best model\n", + "\n", + "In this section we compare the two models built above\n", + "and look at their performance using different metrics" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 32, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "from xgboost import plot_importance\n", + "from sklearn.metrics import plot_roc_curve, recall_score, precision_score, accuracy_score, confusion_matrix\n", + "from sklearn.metrics import plot_confusion_matrix, precision_recall_curve, plot_precision_recall_curve\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 33, + "outputs": [], + "source": [ + "def report(models, x_test, y_test, y_pred=None, model_names=None):\n", + "\n", + " fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 14))\n", + "\n", + " for model, name in zip(models, model_names):\n", + "\n", + " y_out 
= model.predict(x_test)\n", + "\n", + " precision = precision_score(y_test, y_out)\n", + " recall = recall_score(y_test, y_out)\n", + " accuracy = accuracy_score(y_test, y_out)\n", + "\n", + " print(f'Model: {name}')\n", + " print(f'Precision: {precision}')\n", + " print(f'Recall: {recall}')\n", + " print(f'Accuracy: {accuracy}')\n", + " print('-------------------------')\n", + " print()\n", + "\n", + " roc_auc = plot_roc_curve(model, x_test, y_test, ax=ax1)\n", + " precision_recall = plot_precision_recall_curve(model, x_test, y_test, ax=ax2)\n", + "\n", + " date_time = datetime.now().strftime(\"%m_%d_%H_%M_%S\")\n", + " plt.savefig(f'report_{date_time}.png', dpi=300)\n", + " plt.show()\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 34, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: XGBClassifier\n", + "Precision: 0.8\n", + "Recall: 0.6666666666666666\n", + "Accuracy: 0.9996539792387543\n", + "-------------------------\n", + "\n", + "Model: Weighted XGBClassifier\n", + "Precision: 0.6666666666666666\n", + "Recall: 1.0\n", + "Accuracy: 0.9996539792387543\n", + "-------------------------\n", + "\n" + ] + }, + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "report([xgc0, xgc], x_test, y_test, model_names=['XGBClassifier', 'Weighted XGBClassifier'])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "For the best model let's see the features that influenced the decision the most:\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 35, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_importance(xgc0)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 35, + "outputs": [], + "source": [ + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/ee/connectors/data_analysis_cookbook/credentials.yml.example b/ee/connectors/data_analysis_cookbook/credentials.yml.example new file mode 100644 index 000000000..e9b7caedd --- /dev/null +++ b/ee/connectors/data_analysis_cookbook/credentials.yml.example @@ -0,0 +1,6 @@ +pg: + user: user + password: ****** + database: db_name + host: '127.0.0.1' + port: 8080 \ No newline at end of file diff --git a/ee/connectors/data_analysis_cookbook/explorational_analysis.ipynb b/ee/connectors/data_analysis_cookbook/explorational_analysis.ipynb new file mode 100644 index 000000000..6a118e458 --- /dev/null +++ b/ee/connectors/data_analysis_cookbook/explorational_analysis.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import psycopg2\n", + "from IPython.display import display\n", + "import yaml" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "conf = yaml.load(\n", + " open(\"credentials.yml\"), Loader=yaml.FullLoader)['pg']\n", + "\n", + "# Create a connection to the database\n", + "conn = psycopg2.connect(\n", + " host=conf['host'],\n", + " port=conf['port'],\n", + " database=conf['database'],\n", + " user=conf['user'],\n", + " password=conf['password']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simple query can be executed either with psycopg's native interface or directly via pandas.\n", + "As an example let's get the total number of sessions in the database" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " n_sessions\n", + "0 8961277\n" + ] + } + ], + "source": [ + "q = \"select count(*) as n_sessions from connector_user_sessions\"\n", + "df = pd.read_sql(q, conn)\n", + "print(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distributions\n", + "\n", + "One of the basic uses of the data is to view the data distributions\n", + "and to learn the main statistical properties such as mean, median, variance, etc.,\n", + "which eventually leads to understanding your users better.\n", + "Visualization examples in this document include:\n", + "- Session durations\n", + "- User locations\n", + "- Website load (seasonality)\n", + "- Histogram of issue counts\n", + "- Hesitation time distribution\n", + "- URL visits graph" + ] + }, + { + "cell_type": "markdown", + "source": [ + "##### Session duration" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "q = 'select session_duration from connector_user_sessions limit 10000'\n", + "durations = pd.read_sql(q, conn)\n", + "\n", + "# translate duration to minutes (assuming session_duration is stored in milliseconds)\n", + "durations['session_duration'] = durations['session_duration'] / (1000 * 60)\n", + "sns.displot(durations, x=\"session_duration\", bins=23)\n", + "x_mean =durations['session_duration'].mean()\n", + "x_median =durations['session_duration'].median()\n", + "\n", + "plt.axvline(x_mean, c='orange')\n", + "plt.axvline(x_median, c='red')\n", + "plt.text(50, 80, f\"mean = {x_mean:.2f} min\", size=12)\n", + "plt.text(50, 70, f\"median = {x_median:.2f} min\", size=12)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "##### User locations" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "q = 'select count(*) as n_users, user_country from connector_user_sessions group by user_country '\n", + "countries = pd.read_sql(q, conn)\n", + "countries = countries[countries['n_users'] > 900]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " n_users user_country latitude longitude\n", + "0 1567 AU -27.0 133.0\n", + "1 31726 BR -10.0 -55.0\n", + "2 5181 CA 60.0 -95.0\n", + "3 1183 CH 47.0 8.0\n", + "4 1013 CO 4.0 -72.0\n" + ] + } + ], + "source": [ + "coordinates = pd.read_csv('coordinates_and_codes.csv')\n", + "countries_with_coords = pd.merge(countries, coordinates, left_on='user_country', right_on='alpha-2_code')\n", + "countries_with_coords.drop(['alpha-2_code'], axis=1, inplace=True)\n", + "print(countries_with_coords.head())" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } 
+ }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import folium\n", + "\n", + "m = folium.Map(location=[40.18130, 44.5089], zoom_start=1, prefer_canvas=True)\n", + "\n", + "def plotDot(point):\n", + " '''input: series that contains numeric fields named latitude and longitude (and n_users for the popup)\n", + " this function creates a Marker at that position and adds it to the map m'''\n", + " folium.Marker(location=(point.latitude, point.longitude),\n", + " # radius=point.n_users,\n", + " color=\"#3186cc\",\n", + " popup=point.n_users,\n", + " fill=True,\n", + " fill_color=\"#3186cc\").add_to(m)\n", + "countries_with_coords.apply(plotDot, axis=1)\n", + "m.save('users_map.html')\n", + "\n", + "display(m)\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "##### Website load" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "day = 86400 * 1000\n", + "\n", + "q = 'select session_start_timestamp from connector_user_sessions where session_start_timestamp is not null limit 10000'\n", + "starts = pd.read_sql(q, conn)\n", + "starts['session_start_timestamp'] = starts['session_start_timestamp'].apply(lambda x: (x % day)* 24 / day)\n", + "sns.displot(starts, x=\"session_start_timestamp\", kind=\"kde\", bw_adjust=.2, fill=True)\n", + "plt.xlim(0, 24)\n", + "plt.title(\"Website load distribution during the day\")\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "##### Issue counts" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "q = 'select issues_count from connector_user_sessions limit 10000'\n", + "issues = pd.read_sql(q, conn)\n", + "issues = issues.fillna(0)\n", + "sns.histplot(issues[issues > 2])\n", + "plt.title(\"Distribution of the number of issues across sessions\")\n", + "plt.show()\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "##### Hesitation time distribution" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(43, 1)\n" + ] + } + ], + "source": [ + "q = \"select mouseclick_hesitationtime from connector_events where mouseclick_label = 'PAY' \" \\\n", + " \"and mouseclick_hesitationtime is not null limit 10000\"\n", + "\n", + "hesitation = pd.read_sql(q, conn)\n", + "print(hesitation.shape)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.displot(hesitation, x=\"mouseclick_hesitationtime\", kind=\"kde\", bw_adjust=.2, fill=True)\n", + "plt.title(\"Hesitation time distribution for a click on the button PAY\")\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Path illustration" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [ + { + "data": { + "text/plain": " sessionid pageevent_url\n9036 4169875804784252 231\n134 4167791269614996 229\n7115 4169433581028977 136\n24174 4176629796934961 107\n18942 4172534204991174 104", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sessionidpageevent_url
90364169875804784252231
1344167791269614996229
71154169433581028977136
241744176629796934961107
189424172534204991174104
\n
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Select some session events. For illustration purposes, we stick to a fixed number of rows,\n", + "# however it would be more accurate to load all events for each session\n", + "q = 'select pageevent_url, sessionid from connector_events ' \\\n", + " 'where pageevent_url is not null limit 100000'\n", + "urls = pd.read_sql(q, conn)\n", + "\n", + "# Calculate the number of UNIQUE urls per session\n", + "urls_count = urls.groupby('sessionid').agg('nunique').reset_index()\n", + "\n", + "# Select the session with the maximum number of UNIQUE urls\n", + "urls_count = urls_count.sort_values(by='pageevent_url', ascending=False)\n", + "urls_count.head()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [], + "source": [ + "# Get all url visits in correct order\n", + "# (to visualize the full path, not only unique values matter)\n", + "sess_id = urls_count.iloc[0].sessionid\n", + "sess_id = 4592792577630589\n", + "q = f'select pageevent_url from connector_events ' \\\n", + " f'where sessionid = {sess_id} ' \\\n", + " f'and pageevent_url is not null limit 1000'\n", + "\n", + "session_urls = pd.read_sql(q, conn)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "data": { + "text/plain": " pageevent_url\n0 https://atlas.cradle.global/briefs/list\n1 https://atlas.cradle.global/briefs/view/6adf82...\n2 https://atlas.cradle.global/briefs/view/6adf82...\n3 https://atlas.cradle.global/briefs/list\n4 https://atlas.cradle.global/briefs/view/e2b0f0...", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
pageevent_url
0https://atlas.cradle.global/briefs/list
1https://atlas.cradle.global/briefs/view/6adf82...
2https://atlas.cradle.global/briefs/view/6adf82...
3https://atlas.cradle.global/briefs/list
4https://atlas.cradle.global/briefs/view/e2b0f0...
\n
" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We consider URLs with different query parameters to be the same\n", + "# by cutting off the part after question mark:\n", + "\n", + "session_urls['pageevent_url'] = session_urls['pageevent_url'].apply(lambda x: x.split('?')[0])\n", + "session_urls.head()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# For the \"largest\" session draw a graph of page visits\n", + "import networkx as nx\n", + "\n", + "G = nx.DiGraph(directed=True)\n", + "\n", + "for ind in range(session_urls.pageevent_url.shape[0] - 1):\n", + " G.add_edges_from([(session_urls.pageevent_url[ind], session_urls.pageevent_url[ind + 1])])\n", + "\n", + "options = {\n", + " 'node_color': 'blue',\n", + " 'node_size': 50,\n", + " 'width': 1,\n", + " 'alpha': 0.5,\n", + " 'arrowstyle': '-|>',\n", + " 'arrowsize': 12,\n", + "}\n", + "\n", + "nx.draw_networkx(G, arrows=True, with_labels=False, **options)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Analysis of such graphs may be useful in user profiling.\n", + "The graph of page visits can provide essential information to cluster users by their behavior\n", + "even if they don't perform any actions on the website.\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file