class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Learning rate (step size) applied to every gradient update.
    num_iter : int
        Number of gradient-descent iterations performed by ``fit``.
    verbose : bool
        When true, ``fit`` prints the current loss on every iteration.
    lambd : float
        L2 regularization strength; ``0.0`` switches regularization off.
    fit_intercept : bool
        When true, a column of ones is prepended to the data so that
        ``theta[0]`` acts as the bias term.

    Attributes
    ----------
    theta : numpy.ndarray or None
        Learned weights as a column vector of shape ``(n_params, 1)``;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr=0.01, num_iter=100, verbose=False, lambd=0.0, fit_intercept=True):
        self.lr = lr
        self.theta = None
        self.lambd = lambd
        self.verbose = verbose
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept

    def __penalized_theta(self):
        """Return a copy of ``theta`` with the intercept weight zeroed.

        The bias term is conventionally left out of the L2 penalty, so it
        must contribute neither to the penalty value nor to its gradient.
        """
        penalized = self.theta.copy()
        if self.fit_intercept:
            penalized[0] = 0.0
        return penalized

    def __l2_regularization(self, n_samples=1):
        """Return the L2 penalty ``lambd / (2 * n_samples) * sum(theta ** 2)``.

        This is the *value* added to the cost function, not its gradient.
        It is 0 when ``lambd`` is left at 0.
        """
        return (self.lambd / (2 * n_samples)) * np.sum(np.square(self.__penalized_theta()))

    def __l2_gradient(self, n_samples):
        """Return the gradient of the L2 penalty, ``lambd / n_samples * theta``."""
        return (self.lambd / n_samples) * self.__penalized_theta()

    @staticmethod
    def __intercept(data):
        """Prepend a column of ones (the bias feature) to ``data``."""
        intercept = np.ones((data.shape[0], 1))
        return np.concatenate((intercept, data), axis=1)

    @staticmethod
    def __sigmoid(z):
        """Numerically stable logistic function ``1 / (1 + exp(-z))``.

        Only ``exp`` of non-positive numbers is evaluated, so very large
        negative ``z`` cannot overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    @staticmethod
    def __loss(h, y):
        """Mean binary cross-entropy between probabilities ``h`` and labels ``y``.

        Turn verbose ON (``verbose=True``) to check that it is being minimized.
        """
        # Keep probabilities strictly inside (0, 1) so log() never sees 0.
        eps = np.finfo(float).eps
        h = np.clip(h, eps, 1 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def __design_matrix(self, data):
        """Convert ``data`` to a 2-D float array, adding the bias column if configured."""
        data = np.asarray(data, dtype=float)
        if data.ndim != 2:
            raise ValueError(f"data must be 2-D (n_samples, n_features), got {data.ndim}-D")
        if self.fit_intercept:
            data = self.__intercept(data)
        return data

    def fit(self, data, target):
        """Fit the model to ``data`` with binary labels ``target``.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)
        target : array-like of shape (n_samples,) or (n_samples, 1)
            Labels, expected to be 0 or 1.

        Raises
        ------
        ValueError
            If ``data`` is not 2-D or ``target`` does not hold exactly one
            label per sample.
        """
        data = self.__design_matrix(data)
        # Force a column vector: a 1-D target would otherwise broadcast
        # ``h - target`` into an (n_samples, n_samples) matrix.
        target = np.asarray(target, dtype=float).reshape(-1, 1)
        n_samples = data.shape[0]
        if target.shape[0] != n_samples:
            raise ValueError(
                f"target has {target.shape[0]} labels but data has {n_samples} samples"
            )

        self.theta = np.zeros((data.shape[1], 1))

        for _ in range(self.num_iter):
            h = self.__sigmoid(np.dot(data, self.theta))

            if self.verbose:
                # Report the full objective (data loss + penalty) for the
                # weights used in this iteration, before they are updated.
                loss = self.__loss(h, target) + self.__l2_regularization(n_samples)
                print(f"Loss: {loss}")

            gradient = np.dot(data.T, h - target) / n_samples + self.__l2_gradient(n_samples)
            self.theta -= self.lr * gradient

    def __predict_probability(self, data):
        """Return the predicted probability of class 1 for every row of ``data``."""
        if self.theta is None:
            raise ValueError("This LogisticRegression instance is not fitted yet; call fit() first.")
        # The bias column is only added when the model was trained with one.
        return self.__sigmoid(np.dot(self.__design_matrix(data), self.theta))

    def predict(self, test):
        """Return class predictions (0.0 or 1.0) as an ``(n_samples, 1)`` array.

        Probabilities are rounded to the nearest class, as this is a
        binary classification model.
        """
        return self.__predict_probability(test).round()
def accuracy(prediction, output):
    """Return the percentage (0-100) of predictions that equal the expected labels.

    Both inputs are flattened before comparison, so a 1-D prediction vector
    of shape ``(n,)`` can be scored against a column vector of shape
    ``(n, 1)``. Comparing them directly would broadcast to an ``(n, n)``
    matrix and report a meaningless score.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of elements.
    ZeroDivisionError
        If the inputs are empty.
    """
    predicted = np.asarray(prediction).ravel()
    expected = np.asarray(output).ravel()
    if predicted.size != expected.size:
        raise ValueError(
            f"prediction has {predicted.size} elements but output has {expected.size}"
        )
    matches = int(np.count_nonzero(predicted == expected))
    return (100 / expected.size) * matches


def data_visualization(data, target, figsize=(10, 6)):
    """Scatter-plot the first two feature columns of ``data``, coloured by label.

    Rows labelled 0 are drawn in blue and rows labelled 1 in red.
    ``target`` may be a 1-D vector or a column vector; it is flattened so
    the boolean mask selects whole rows either way.
    """
    labels = np.asarray(target).ravel()
    plt.figure(figsize=figsize)
    for label, color in ((0, 'b'), (1, 'r')):
        rows = labels == label
        plt.scatter(data[rows, 0], data[rows, 1], color=color, label=f'Label {label}')
    plt.legend()
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "data_visualization(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing the model at the default 100 iteration\n", + "\n", + "model = LogisticRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 29.4 ms, sys: 4.28 ms, total: 33.7 ms\n", + "Wall time: 29.9 ms\n" + ] + } + ], + "source": [ + "# Time Measurement\n", + "\n", + "%time model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Accuracy Measurement in %\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 19.5 ms, sys: 7.93 ms, total: 27.5 ms\n", + "Wall time: 203 ms\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/olalekan/.local/lib/python3.6/site-packages/sklearn/utils/validation.py:760: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Comparing with the scikit learn Logistic Regression class\n", + "\n", + "# max_iter=100 the default\n", + "\n", + "sk_model = linear_model.LogisticRegression()\n", + "\n", + "%time sk_model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 50.0\n" + ] + } + ], + "source": [ + "# Comparing with the scikit learn Logistic Regression class\n", + "# Accuracy Measurement in %\n", + "\n", + "prediction = sk_model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 15.8 ms, sys: 0 ns, total: 15.8 ms\n", + "Wall time: 14.3 ms\n", + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Testing model with 50 iterations\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=50)\n", + "\n", + "%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.02 ms, sys: 3.61 ms, total: 9.63 ms\n", + "Wall time: 8.48 ms\n", + "Accuracy: 
96.66666666666667\n" + ] + } + ], + "source": [ + "# Testing model with 30 iterations\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=30)\n", + "\n", + "%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 13.5 ms, sys: 285 µs, total: 13.8 ms\n", + "Wall time: 12.2 ms\n", + "Accuracy: 100.0\n" + ] + } + ], + "source": [ + "# Testing model with 30 iterations and applying regularization\n", + "\n", + "\n", + "model = LogisticRegression(num_iter=30, lambd=0.5)\n", + "\n", + "%time model.fit(x_train, y_train)\n", + "\n", + "prediction = model.predict(x_test)\n", + "print(f\"Accuracy: {accuracy(prediction, y_test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ganiyu_olalekan_matthew/README.md b/ganiyu_olalekan_matthew/README.md new file mode 100644 index 0000000..cf67fb8 --- /dev/null +++ b/ganiyu_olalekan_matthew/README.md @@ -0,0 +1,17 @@ +# ML-Logistic-regression-algorithm-challenge + +Data Science Nigeria (DSN) open challenge on implementing a Logistic Regression algorithm from scratch using Python or R programming. + + +## Logistic Regression + +Logistic regression is the appropriate regression analysis to conduct when the dependent variable is binary. 
Like all regression analyses, logistic regression is a predictive analysis. Logistic regression is used to describe data and to explain the relationship between one dependent binary variable and one or more nominal, ordinal, interval or ratio-level independent variables. + +## Implementation + +The Logistic Regression algorithm was implemented in Python using a Jupyter notebook. + +The iris dataset from sklearn.datasets.load_iris() was used to test the speed and accuracy of the algorithm. Because logistic regression is a binary classification algorithm, my implementation does not use the full three-class iris dataset: it keeps only the first two classes (the first 100 samples) to match the binary requirement. + +L2 regularization (a penalty on the squared Euclidean norm of the weights) was also implemented to optimize the algorithm, reaching in fewer iterations the same accuracy that would otherwise require more iterations. +