ML with Linear Regression

ptyadana · ptyadana · commit a5703c4a91f8 · 2021-01-14T12:26:34.000+08:00
Former-commit-id: aea9705
diff --git a/Spark - Spark SQL and Data Frames/03.Data Analysis with Spark/05.04.Machine Learning - Linear Regression.ipynb b/Spark - Spark SQL and Data Frames/03.Data Analysis with Spark/05.04.Machine Learning - Linear Regression.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Machine Learning - Linear Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "from pyspark.ml.feature import VectorAssembler\n",
+    "from pyspark.ml.regression import LinearRegression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark = SparkSession.builder.getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relative location of the input CSV\n",
+    "data_path = '../Data'\n",
+    "file_path = f'{data_path}/utilization.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the headerless CSV, letting Spark infer the column types\n",
+    "df = (\n",
+    "    spark.read\n",
+    "    .format('csv')\n",
+    "    .options(header=False, inferSchema=True)\n",
+    "    .load(file_path)\n",
+    ")\n",
+    "\n",
+    "# Replace the positional default names (_c0 .. _c4) with descriptive ones\n",
+    "df = (\n",
+    "    df\n",
+    "    .withColumnRenamed('_c0', 'event_datetime')\n",
+    "    .withColumnRenamed('_c1', 'server_id')\n",
+    "    .withColumnRenamed('_c2', 'cpu_utilization')\n",
+    "    .withColumnRenamed('_c3', 'free_memory')\n",
+    "    .withColumnRenamed('_c4', 'session_count')\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "|     event_datetime|server_id|cpu_utilization|free_memory|session_count|\n",
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|\n",
+      "|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|\n",
+      "|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|\n",
+      "|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|\n",
+      "|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|\n",
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---------"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Vectorizing Features: transform using VectorAssembler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pack the single predictor column into a vector column named 'features'\n",
+    "vector_assembler = VectorAssembler(\n",
+    "    inputCols=['cpu_utilization'],\n",
+    "    outputCol='features',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_vutil = vector_assembler.transform(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-------------------+---------+---------------+-----------+-------------+--------+\n",
+      "|     event_datetime|server_id|cpu_utilization|free_memory|session_count|features|\n",
+      "+-------------------+---------+---------------+-----------+-------------+--------+\n",
+      "|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|  [0.57]|\n",
+      "|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|  [0.47]|\n",
+      "|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|  [0.56]|\n",
+      "|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|  [0.57]|\n",
+      "|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|  [0.35]|\n",
+      "+-------------------+---------+---------------+-----------+-------------+--------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_vutil.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model Creation and Prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Estimator that regresses session_count on the assembled 'features' vector\n",
+    "linear_regression = LinearRegression(\n",
+    "    featuresCol='features',\n",
+    "    labelCol='session_count',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lr_model = linear_regression.fit(df_vutil)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Coefficient and Y Intercept"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DenseVector([47.024])"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lr_model.coefficients"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "40.41695103550495"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lr_model.intercept"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Checking RMSE (on the training data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "12.837990225931527"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lr_model.summary.rootMeanSquaredError"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "--------"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv-datascience",
+   "language": "python",
+   "name": "venv-datascience"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}