From 0bfce163293acd0e11b7b6436c00d0c2bb5ed7c2 Mon Sep 17 00:00:00 2001 From: ganiyuolalekan Date: Wed, 15 Apr 2020 15:24:05 +0000 Subject: [PATCH 1/3] Ganiyu Olalekan's Implementation of the Logistic Regression Algorithm --- ...ic Regression Scratch Implementation.ipynb | 281 ++++++++++++++++++ ganiyu_olalekan_matthew/README.md | 15 + 2 files changed, 296 insertions(+) create mode 100644 ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb create mode 100644 ganiyu_olalekan_matthew/README.md diff --git a/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb b/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb new file mode 100644 index 0000000..f85c3cd --- /dev/null +++ b/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn import datasets\n", + "from sklearn import linear_model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "iris = datasets.load_iris()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class LogisticRegression:\n", + " def __init__(self, lr=0.01, num_iter=100, verbose=False, fit_intercept=True):\n", + " self.lr = lr\n", + " self.theta = None\n", + " self.verbose = verbose\n", + " self.num_iter = num_iter\n", + " self.fit_intercept = fit_intercept\n", + "\n", + " @staticmethod\n", + " def __intercept(data):\n", + " intercept = np.ones((data.shape[0], 1))\n", + " return np.concatenate((intercept, data), axis=1)\n", + "\n", + " @staticmethod\n", + " def __sigmoid(z):\n", + " return 1 / (1 + np.exp(-z))\n", + "\n", + " @staticmethod\n", + " def __loss(h, y_hat):\n", + " return (-y_hat * np.log(h) - (1 - y_hat) 
* np.log(1 - h)).mean()\n", + "\n", + " def fit(self, data, target):\n", + " if self.fit_intercept:\n", + " data = self.__intercept(data)\n", + "\n", + " self.theta = np.zeros((data.shape[1], 1))\n", + "\n", + " for _ in range(self.num_iter):\n", + " z = np.dot(data, self.theta)\n", + " h = self.__sigmoid(z)\n", + " loss = self.__loss(h, target)\n", + " gradient_d = np.dot(data.T, (h - target)) / target.shape[0]\n", + " self.theta -= (self.lr * gradient_d)\n", + "\n", + " if self.verbose:\n", + " print(f\"Loss: {loss}\")\n", + "\n", + " def __predict_probability(self, data):\n", + " return self.__sigmoid(np.dot(self.__intercept(data), self.theta))\n", + "\n", + " def predict(self, test):\n", + " return self.__predict_probability(test).round()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X = iris.data[:100, :]\n", + "y = iris.target.reshape(150, 1)[:100, :]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "x_train = np.vstack((X[:32, :], X[50:82, :]))\n", + "x_test = np.vstack((X[32:50, :], X[82:100, :]))\n", + "y_train = np.vstack((y[:32, :], y[50:82, :]))\n", + "y_test = np.vstack((y[32:50, :], y[82:100, :]))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def accuracy(prediction, output):\n", + " bool_elem = (prediction == output)\n", + " elem = bool_elem.size\n", + " return (100 / elem) * bool_elem.ravel().tolist().count(True)\n", + "\n", + "\n", + "def data_visualization(data, target, figsize=(10, 6)):\n", + " plt.figure(figsize=figsize)\n", + " plt.scatter(data[:, :1][target == 0], data[:, 1:2][target == 0], color='b', label='0')\n", + " plt.scatter(data[:, :1][target == 1], data[:, 1:2][target == 1], color='r', label='1')\n", + " plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "data_visualization(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "model = LogisticRegression(num_iter=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 10.6 ms, sys: 0 ns, total: 10.6 ms\n", + "Wall time: 9.62 ms\n" + ] + } + ], + "source": [ + "# Time Measurement\n", + "\n", + "%time model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Accuracy Measurement in %\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 20.9 ms, sys: 159 µs, total: 21 ms\n", + "Wall time: 20.1 ms\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/olalekan/.local/lib/python3.6/site-packages/sklearn/utils/validation.py:760: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Comparing with the scikit learn Logistic Regression class\n", + "\n", + "sk_model = linear_model.LogisticRegression()\n", + "\n", + "%time sk_model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 50.0\n" + ] + } + ], + "source": [ + "# Comparing with the scikit learn Logistic Regression class\n", + "# Accuracy Measurement in %\n", + "\n", + "prediction = sk_model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ganiyu_olalekan_matthew/README.md b/ganiyu_olalekan_matthew/README.md new file mode 100644 index 0000000..2658849 --- /dev/null +++ b/ganiyu_olalekan_matthew/README.md @@ -0,0 +1,15 @@ +# ML-Logistic-regression-algorithm-challenge + +Data Science Nigeria (DSN) open challenge on implementing a Logistic 
Regression algorithm from scratch using Python or R programming. + + +## Logistic Regression + +Logistic regression is the appropriate regression analysis to conduct when the dependent variable is binary. Like all regression analyses, the logistic regression is a predictive analysis. Logistic regression is used to describe data and to explain the relationship between one dependent binary variable and one or more nominal, ordinal, interval or ratio-level independent variables. + +## Implemetation + +The Logistic Regression algorithm was implement with a jupyter notebook using python. + +The iris dataset from sklearn.datasets.load_iris() was used to test the speed and accuracy of the algorithm. Unlike the default iris dataset, my implementation modified it to match the binary requirement of the logistic regression algorithm. Given that the algorithm (Logistic Algorithm) is a Binary Classification Algorithm. + From 30e461e1cb3cc401c86dc57f109e15a762c382f1 Mon Sep 17 00:00:00 2001 From: ganiyuolalekan Date: Fri, 17 Apr 2020 15:07:34 +0000 Subject: [PATCH 2/3] Added Regularization to the Logistic Regression Algorithm --- ...ic Regression Scratch Implementation.ipynb | 156 ++++++++++++++++-- 1 file changed, 142 insertions(+), 14 deletions(-) diff --git a/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb b/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb index f85c3cd..4e0bbe6 100644 --- a/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb +++ b/ganiyu_olalekan_matthew/Logistic Regression Scratch Implementation.ipynb @@ -6,6 +6,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Importing necessary libraries to run and test the Logistic Regression Algorithm\n", + "\n", "%matplotlib inline\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", @@ -19,6 +21,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Loading the sklearn.datasets.load_iris() to use to test the accuracy of the 
algorithm\n", + "\n", "iris = datasets.load_iris()" ] }, @@ -28,14 +32,25 @@ "metadata": {}, "outputs": [], "source": [ + "# The logistic regression class\n", + "\n", "class LogisticRegression:\n", - " def __init__(self, lr=0.01, num_iter=100, verbose=False, fit_intercept=True):\n", + " def __init__(self, lr=0.01, num_iter=100, verbose=False, lambd=0.0, fit_intercept=True):\n", " self.lr = lr\n", " self.theta = None\n", + " self.lambd = lambd\n", " self.verbose = verbose\n", " self.num_iter = num_iter\n", " self.fit_intercept = fit_intercept\n", "\n", + " def __l2_regularization(self):\n", + " \"\"\"\n", + " L2 regularization (Euclidean norm), used to reduce overfitting.\n", + " \n", + " Can be turned off by leaving lambd set to 0\n", + " \"\"\"\n", + " return (self.lambd / 2) * np.sum(np.square(self.theta)).mean() \n", + " \n", " @staticmethod\n", " def __intercept(data):\n", " intercept = np.ones((data.shape[0], 1))\n", @@ -43,13 +58,23 @@ "\n", " @staticmethod\n", " def __sigmoid(z):\n", + " \"\"\"\n", + " The sigmoid activation function, which is well suited to binary classification\n", + " \"\"\"\n", " return 1 / (1 + np.exp(-z))\n", "\n", " @staticmethod\n", " def __loss(h, y_hat):\n", + " \"\"\"\n", + " The cost function\n", + " Turn verbose (verbose=True) ON to examine whether the cost function is being minimized\n", + " \"\"\"\n", " return (-y_hat * np.log(h) - (1 - y_hat) * np.log(1 - h)).mean()\n", "\n", " def fit(self, data, target):\n", + " \"\"\"\n", + " The fit function, used to fit the model to the data\n", + " \"\"\"\n", " if self.fit_intercept:\n", " data = self.__intercept(data)\n", "\n", @@ -59,16 +84,22 @@ " z = np.dot(data, self.theta)\n", " h = self.__sigmoid(z)\n", " loss = self.__loss(h, target)\n", - " gradient_d = np.dot(data.T, (h - target)) / target.shape[0]\n", + " gradient_d = np.dot(data.T, (h - target)) / target.shape[0] + self.__l2_regularization()\n", " self.theta -= (self.lr * gradient_d)\n", "\n", " if self.verbose:\n", " 
print(f\"Loss: {loss}\")\n", "\n", " def __predict_probability(self, data):\n", + " \"\"\"\n", + " Probability prediction function of data (mostly test data).\n", + " \"\"\"\n", " return self.__sigmoid(np.dot(self.__intercept(data), self.theta))\n", "\n", " def predict(self, test):\n", + " \"\"\"\n", + " Rounded prediction of the probability to either 0 or 1, as this is a binary classification model\n", + " \"\"\"\n", " return self.__predict_probability(test).round()" ] }, @@ -78,6 +109,11 @@ "metadata": {}, "outputs": [], "source": [ + "# Initializing data and its target output\n", + "\n", + "# The iris dataset is grouped in fifties and consists of 3 different classes \n", + "# So to test the binary classification algorithm, only the first 2 classes (the first 100 samples) are used\n", + "\n", "X = iris.data[:100, :]\n", "y = iris.target.reshape(150, 1)[:100, :]" ] @@ -88,10 +124,15 @@ "metadata": {}, "outputs": [], "source": [ - "x_train = np.vstack((X[:32, :], X[50:82, :]))\n", - "x_test = np.vstack((X[32:50, :], X[82:100, :]))\n", - "y_train = np.vstack((y[:32, :], y[50:82, :]))\n", - "y_test = np.vstack((y[32:50, :], y[82:100, :]))" + "# Data splitting\n", + "\n", + "# To avoid inaccuracy, the data is split evenly between both classes\n", + "# 70% training and 30% testing in even distribution\n", + "\n", + "x_train = np.vstack((X[:35, :], X[50:85, :]))\n", + "x_test = np.vstack((X[35:50, :], X[85:100, :]))\n", + "y_train = np.vstack((y[:35, :], y[50:85, :]))\n", + "y_test = np.vstack((y[35:50, :], y[85:100, :]))" ] }, { @@ -100,6 +141,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Accuracy and data visualization functions\n", + "\n", "def accuracy(prediction, output):\n", " bool_elem = (prediction == output)\n", " elem = bool_elem.size\n", @@ -108,8 +151,8 @@ "\n", "def data_visualization(data, target, figsize=(10, 6)):\n", " plt.figure(figsize=figsize)\n", - " plt.scatter(data[:, :1][target == 0], data[:, 1:2][target == 0], color='b', label='0')\n", - " plt.scatter(data[:, 
:1][target == 1], data[:, 1:2][target == 1], color='r', label='1')\n", + " plt.scatter(data[:, :1][target == 0], data[:, 1:2][target == 0], color='b', label='Label 0')\n", + " plt.scatter(data[:, :1][target == 1], data[:, 1:2][target == 1], color='r', label='Label 1')\n", " plt.legend()" ] }, @@ -120,7 +163,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -141,7 +184,9 @@ "metadata": {}, "outputs": [], "source": [ - "model = LogisticRegression(num_iter=50)" + "# Initializing the model at the default 100 iteration\n", + "\n", + "model = LogisticRegression()" ] }, { @@ -153,8 +198,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 10.6 ms, sys: 0 ns, total: 10.6 ms\n", - "Wall time: 9.62 ms\n" + "CPU times: user 29.4 ms, sys: 4.28 ms, total: 33.7 ms\n", + "Wall time: 29.9 ms\n" ] } ], @@ -193,8 +238,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 20.9 ms, sys: 159 µs, total: 21 ms\n", - "Wall time: 20.1 ms\n" + "CPU times: user 19.5 ms, sys: 7.93 ms, total: 27.5 ms\n", + "Wall time: 203 ms\n" ] }, { @@ -223,6 +268,8 @@ "source": [ "# Comparing with the scikit learn Logistic Regression class\n", "\n", + "# max_iter=100 the default\n", + "\n", "sk_model = linear_model.LogisticRegression()\n", "\n", "%time sk_model.fit(x_train, y_train)" @@ -249,6 +296,87 @@ "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 15.8 ms, sys: 0 ns, total: 15.8 ms\n", + "Wall time: 14.3 ms\n", + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Testing model with 50 iterations\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=50)\n", + "\n", + "%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.02 ms, sys: 3.61 ms, total: 9.63 ms\n", + "Wall time: 8.48 ms\n", + "Accuracy: 96.66666666666667\n" + ] + } + ], + "source": [ + "# Testing model with 30 iterations\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=30)\n", + "\n", + 
"%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 13.5 ms, sys: 285 µs, total: 13.8 ms\n", + "Wall time: 12.2 ms\n", + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Testing model with 30 iterations and applying regularization\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=30, lambd=0.5)\n", + "\n", + "%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, { "cell_type": "code", "execution_count": null, From 3d342320ad55776fedec3b6ca43b7d4a17036a02 Mon Sep 17 00:00:00 2001 From: ganiyuolalekan Date: Fri, 17 Apr 2020 22:29:59 +0000 Subject: [PATCH 3/3] Modified README.md --- ganiyu_olalekan_matthew/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ganiyu_olalekan_matthew/README.md b/ganiyu_olalekan_matthew/README.md index 2658849..cf67fb8 100644 --- a/ganiyu_olalekan_matthew/README.md +++ b/ganiyu_olalekan_matthew/README.md @@ -9,7 +9,9 @@ Logistic regression is the appropriate regression analysis to conduct when the d ## Implemetation -The Logistic Regression algorithm was implement with a jupyter notebook using python. +The Logistic Regression algorithm was implemented with python using jupyter notebook. The iris dataset from sklearn.datasets.load_iris() was used to test the speed and accuracy of the algorithm. Unlike the default iris dataset, my implementation modified it to match the binary requirement of the logistic regression algorithm. Given that the algorithm (Logistic Algorithm) is a Binary Classification Algorithm. 
+L2 regularization (Euclidean norm) was also implemented to optimize the algorithm, obtaining in fewer iterations the same accuracy that would otherwise require more iterations. +