From d06a7b74c5a3c9e301db90a7a772c9a22a8b068b Mon Sep 17 00:00:00 2001 From: Opeyemi Osakuade <59209191+opeyemiferanmi1@users.noreply.github.com> Date: Fri, 17 Apr 2020 23:51:38 +0100 Subject: [PATCH 1/5] Create Opeyemi Osakuade --- Opeyemi Osakuade | 1 + 1 file changed, 1 insertion(+) create mode 100644 Opeyemi Osakuade diff --git a/Opeyemi Osakuade b/Opeyemi Osakuade new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/Opeyemi Osakuade @@ -0,0 +1 @@ + From cf1b3575a9b1f4aa765a34e6ad3de49311345ff0 Mon Sep 17 00:00:00 2001 From: Opeyemi Osakuade <59209191+opeyemiferanmi1@users.noreply.github.com> Date: Fri, 17 Apr 2020 23:53:20 +0100 Subject: [PATCH 2/5] Delete Opeyemi Osakuade --- Opeyemi Osakuade | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Opeyemi Osakuade diff --git a/Opeyemi Osakuade b/Opeyemi Osakuade deleted file mode 100644 index 8b13789..0000000 --- a/Opeyemi Osakuade +++ /dev/null @@ -1 +0,0 @@ - From 288a45bdcd40cee9b35e41655c53bc9dc3c79224 Mon Sep 17 00:00:00 2001 From: Opeyemi Osakuade <59209191+opeyemiferanmi1@users.noreply.github.com> Date: Wed, 22 Apr 2020 21:54:48 +0100 Subject: [PATCH 3/5] Create README.md --- Opeyemi Osakuade/README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Opeyemi Osakuade/README.md diff --git a/Opeyemi Osakuade/README.md b/Opeyemi Osakuade/README.md new file mode 100644 index 0000000..c231d3b --- /dev/null +++ b/Opeyemi Osakuade/README.md @@ -0,0 +1,25 @@ +## Design and develop the Logistic Regression algorithm from scratch using Python +Logistic regression is a classification algorithm,used to model the probability of a certain class or event. It transforms the output into a probability value (i.e. a number between 0 and 1) using logistic sigmoid function. + +For a binary classifier, we want the classifier to output values that are between 0 and 1. +
+i.e. + 0 ≤ yθ(x)≤1 + +### Hypothesis Representation + +The term logistic regression refers to "logit function" which refers to "log odds". Odds refers to the ratio of the probability of an event occurring to the probability it does not occur. +Taking the log, log odds for the model turns out to be the equation of the *Sigmoid Function* + +### Cost function +Since the logistic regression function(sigmoid) is *non linear*, to get a *convex function*, i.e a bowl-shaped function that eases the gradient descent function's work to converge to the optimal minimum point,a logistic regression cost function is derived + +### Gradient descent +To choose the values of weights that corresponds to a convex function and fits the data well(so we reach a global minimum), ensure that + +the prediction(h) is at least close to the actual *y*, minimize the cost function using gradient descent. + +Repeat until convergence, updating all weights. + +#### Data +The data comprises two written test scores from a DMV driving school together with the result (passed or failed); the objective is to predict whether a person with the given test scores passed or failed From 596b00773797ef327f7faae11fd599b8c29a427d Mon Sep 17 00:00:00 2001 From: Opeyemi Osakuade <59209191+opeyemiferanmi1@users.noreply.github.com> Date: Wed, 22 Apr 2020 23:29:44 +0100 Subject: [PATCH 4/5] Add files via upload --- Opeyemi Osakuade/DMV_Written_Tests.csv | 101 ++++++ .../DSN-Algorithm Challenge.ipynb | 332 ++++++++++++++++++ 2 files changed, 433 insertions(+) create mode 100644 Opeyemi Osakuade/DMV_Written_Tests.csv create mode 100644 Opeyemi Osakuade/DSN-Algorithm Challenge.ipynb diff --git a/Opeyemi Osakuade/DMV_Written_Tests.csv b/Opeyemi Osakuade/DMV_Written_Tests.csv new file mode 100644 index 0000000..b9acb0a --- /dev/null +++ b/Opeyemi Osakuade/DMV_Written_Tests.csv @@ -0,0 +1,101 @@ +DMV_Test_1,DMV_Test_2,Results +34.62365962451697,78.0246928153624,0 +30.28671076822607,43.89499752400101,0 
+35.84740876993872,72.90219802708364,0 +60.18259938620976,86.30855209546826,1 +79.0327360507101,75.3443764369103,1 +45.08327747668339,56.3163717815305,0 +61.10666453684766,96.51142588489624,1 +75.02474556738889,46.55401354116538,1 +76.09878670226257,87.42056971926803,1 +84.43281996120035,43.53339331072109,1 +95.86155507093572,38.22527805795094,0 +75.01365838958247,30.60326323428011,0 +82.30705337399482,76.48196330235604,1 +69.36458875970939,97.71869196188608,1 +39.53833914367223,76.03681085115882,0 +53.9710521485623,89.20735013750205,1 +69.07014406283025,52.74046973016765,1 +67.94685547711617,46.67857410673128,0 +70.66150955499435,92.92713789364831,1 +76.97878372747498,47.57596364975532,1 +67.37202754570876,42.83843832029179,0 +89.67677575072079,65.79936592745237,1 +50.534788289883,48.85581152764205,0 +34.21206097786789,44.20952859866288,0 +77.9240914545704,68.9723599933059,1 +62.27101367004632,69.95445795447587,1 +80.1901807509566,44.82162893218353,1 +93.114388797442,38.80067033713209,0 +61.83020602312595,50.25610789244621,0 +38.78580379679423,64.99568095539578,0 +61.379289447425,72.80788731317097,1 +85.40451939411645,57.05198397627122,1 +52.10797973193984,63.12762376881715,0 +52.04540476831827,69.43286012045222,1 +40.23689373545111,71.16774802184875,0 +54.63510555424817,52.21388588061123,0 +33.91550010906887,98.86943574220611,0 +64.17698887494485,80.90806058670817,1 +74.78925295941542,41.57341522824434,0 +34.1836400264419,75.2377203360134,0 +83.90239366249155,56.30804621605327,1 +51.54772026906181,46.85629026349976,0 +94.44336776917852,65.56892160559052,1 +82.36875375713919,40.61825515970618,0 +51.04775177128865,45.82270145776001,0 +62.22267576120188,52.06099194836679,0 +77.19303492601364,70.45820000180959,1 +97.77159928000232,86.7278223300282,1 +62.07306379667647,96.76882412413983,1 +91.56497449807442,88.69629254546599,1 +79.94481794066932,74.16311935043758,1 +99.2725269292572,60.99903099844988,1 +90.54671411399852,43.39060180650027,1 
+34.52451385320009,60.39634245837173,0 +50.2864961189907,49.80453881323059,0 +49.58667721632031,59.80895099453265,0 +97.64563396007767,68.86157272420604,1 +32.57720016809309,95.59854761387875,0 +74.24869136721598,69.82457122657193,1 +71.79646205863379,78.45356224515052,1 +75.3956114656803,85.75993667331619,1 +35.28611281526193,47.02051394723416,0 +56.25381749711624,39.26147251058019,0 +30.05882244669796,49.59297386723685,0 +44.66826172480893,66.45008614558913,0 +66.56089447242954,41.09209807936973,0 +40.45755098375164,97.53518548909936,1 +49.07256321908844,51.88321182073966,0 +80.27957401466998,92.11606081344084,1 +66.74671856944039,60.99139402740988,1 +32.72283304060323,43.30717306430063,0 +64.0393204150601,78.03168802018232,1 +72.34649422579923,96.22759296761404,1 +60.45788573918959,73.09499809758037,1 +58.84095621726802,75.85844831279042,1 +99.82785779692128,72.36925193383885,1 +47.26426910848174,88.47586499559782,1 +50.45815980285988,75.80985952982456,1 +60.45555629271532,42.50840943572217,0 +82.22666157785568,42.71987853716458,0 +88.9138964166533,69.80378889835472,1 +94.83450672430196,45.69430680250754,1 +67.31925746917527,66.58935317747915,1 +57.23870631569862,59.51428198012956,1 +80.36675600171273,90.96014789746954,1 +68.46852178591112,85.59430710452014,1 +42.0754545384731,78.84478600148043,0 +75.47770200533905,90.42453899753964,1 +78.63542434898018,96.64742716885644,1 +52.34800398794107,60.76950525602592,0 +94.09433112516793,77.15910509073893,1 +90.44855097096364,87.50879176484702,1 +55.48216114069585,35.57070347228866,0 +74.49269241843041,84.84513684930135,1 +89.84580670720979,45.35828361091658,1 +83.48916274498238,48.38028579728175,1 +42.2617008099817,87.10385094025457,1 +99.31500880510394,68.77540947206617,1 +55.34001756003703,64.9319380069486,1 +74.77589300092767,89.52981289513276,1 diff --git a/Opeyemi Osakuade/DSN-Algorithm Challenge.ipynb b/Opeyemi Osakuade/DSN-Algorithm Challenge.ipynb new file mode 100644 index 0000000..e5e2318 --- /dev/null +++ 
b/Opeyemi Osakuade/DSN-Algorithm Challenge.ipynb @@ -0,0 +1,332 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Design and develop the Logistic Regression algorithm from scratch using Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For a binary classifier, we want the classifier to output values that are between 0 and 1. \n", + "i.e. $$0\\leq y_\\theta(x)\\leq1$$\n", + "\n", + "\n", + "\n", + "The term logistic regression refers to \"*logit function*\" which refers to \"*log odds*\".\n", + "*Odds refers to the ratio of the probability of an event occurring to the probability it does not occur.*\n", + "\n", + "Consider a model with features $x_{1}$, $x_{2}$,...,$x_{n}$ and output denoted by $y$ that can take values 0 and 1. Let **p** be the probability of **Y=1**\n", + "$$\n", + "odds =\\frac p{1-p}\n", + "$$\n", + "Taking the log, log odds for the model becomes\n", + "$$\n", + "log (\\frac p{1-p}) = \\theta_{0}x_{0} + \\theta_{1}x_{1}+ ... + \\theta_{n}x_{n}\n", + "$$\n", + "where $\\theta_{1},\\theta_{2} ... \\theta_{n}$ are weights, $\\theta_{0}x_{0} $ serves as intercept with $x_{0}$ = 1\n", + "\n", + "Simplifying the equation to obtain **p**\n", + "$$\n", + "p =\\frac 1{1+e^{-( \\theta_{0} + \\theta_{1}x_{1}+ ... + \\theta_{n}x_{n})}}\n", + "$$ which turns out to be the equation of the *Sigmoid Function*\n", + "$$\n", + "S_{z} =\\frac 1{1+e^{-(z)}}\n", + "$$\n", + "\n", + "With $z = \\theta_{0}x_{0} + \\theta_{1}x_{1}+ \\theta_{2}x_{2}+ ... 
+ \\theta_{n}x_{n}$,\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "def sigmoid(z): \n", + " return 1/ (1 + np.exp(-z))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cost function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the logistic regression function(sigmoid) is *non linear*, to get a *convex function*, i.e a bowl-shaped function that eases the gradient descent function's work to converge to the optimal minimum point,a logistic regression cost function is derived as shown below\n", + "\n", + "\n", + "\n", + "$$h = \\frac 1{1+e^{-(\\theta^Tx)}} $$\n", + "\n", + " $$Cost(h,y)= \\left( \\begin{array}{c} -log(h)\\;\\;if\\;y =1 \\\\-log(1-h)\\;\\;if\\;y =0\\end{array}\\right) $$\n", + " \n", + " where $h$ is the predicted probability while $y$ is the actual label\n", + "\n", + "*Log of numbers between 0 and 1 returns a negative value. The negative sign before the log counters that.*\n", + " \n", + " Making the equation more compact into a one-line expression, we have:\n", + "$$\n", + " Cost(h,y)=-ylog(h)-(1-y)log(1-h)\n", + "$$\n", + "substituting y=1 or 0 respectively returns the previous equation\n", + "\n", + "With *m* as the size of my training set and *i* as ith training example\n", + "\n", + "The cost function $J(\\theta)$ can be rewritten as:\n", + "$$J(\\theta)= \\frac1m\\sum \\limits _{i=1} ^{m}Cost(h^{(i)},y^{(i)})$$\n", + "$$J(\\theta)= -\\frac1m\\sum \\limits _{i=1} ^{m}\\left[y^{(i)}log(h^{(i)})+(1-y^{(i)})log(1-h^{(i)})\\right]$$" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def Cost_function(x,y,theta):\n", + " m = len(y)\n", + " y_pred = sigmoid(np.dot(x, theta))\n", + " error =(y * np.log(y_pred)) + ((1-y)* np.log(1-y_pred))\n", + " cost = -1 / m * sum(error)\n", + " gradient = 1 / m * np.dot(x.transpose(), (y_pred - y))\n", + " return cost , gradient" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Gradient descent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To choose the values of $\\theta$=$\\theta_{0}$,$\\theta_{1}$...$\\theta_{n}$ that fit the data well (so that we reach the global minimum of the convex cost function), i.e. so that the prediction $h = \\frac 1{1+e^{-(\\theta^Tx)}}$ is as close as possible to the actual $y$, minimize the cost function $J(\\theta)$ using gradient descent.\n", + "\n", + "Repeat until convergence, updating all $\\theta_j$:$\\left[ \\theta_{0},\\theta_{1},\\theta_{2}\\cdots,\\theta_{n}\\right] $$${\\theta_j}:=\\theta_j-\\alpha\\frac{\\partial}{\\partial \\theta_j}J(\\theta)\n", + "$$\n", + "where $\\alpha$ = Learning rate\n", + "\n", + "And partial derivative of cost function $J(\\theta)$ with respect to $\\theta_j$ :$$\n", + "\\frac{\\partial}{\\partial \\theta_j}J(\\theta)=\\frac1m\\sum \\limits _{i=1} ^{m}(h^{(i)}-y^{(i)})x^{(i)}_j\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def gradient_descent(x, y, theta, alpha, iterations):\n", + " costs = []\n", + " for i in range(iterations):\n", + " cost, gradient = Cost_function(x,y,theta)\n", + " theta -= (alpha * gradient)\n", + " costs.append(cost)\n", + " return theta, costs\n", + "def predict(theta, x):\n", + " results = x.dot(theta)\n", + " return results > 0" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "plt.style.use(\"ggplot\")\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cost at initialization [0.69314718]\n", + "Gradient at initialization: [[-0.1 ]\n", + " [-0.28122914]\n", + " 
[-0.25098615]]\n", + "Theta after running gradient descent: [[1.50850586]\n", + " [3.5468762 ]\n", + " [3.29383709]]\n", + "Resulting cost: [0.20489382]\n", + "Training Accuracy: 89 %\n", + "A person who scores 50 and 79 on their DMV written tests have a 0.71 probability of passing.\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "data = pd.read_csv(\"DMV_Written_Tests.csv\")\n", + "scores = data[['DMV_Test_1', 'DMV_Test_2']].values\n", + "results = data['Results'].values\n", + "#standardization\n", + "mean_scores = np.mean(scores, axis=0)\n", + "std_scores = np.std(scores, axis=0)\n", + "scores = (scores - mean_scores) / std_scores\n", + "\n", + "num_iter = 200\n", + "#include intercept\n", + "rows = scores.shape[0]\n", + "cols = scores.shape[1]\n", + "\n", + "X = np.append(np.ones((rows, 1)), scores, axis=1) \n", + "y = results.reshape(rows, 1)\n", + "\n", + "#initialization\n", + "theta_init = np.zeros((cols + 1, 1))\n", + "cost, gradient = Cost_function(X, y,theta_init)\n", + "\n", + "print(\"Cost at initialization\", cost)\n", + "print(\"Gradient at initialization:\", gradient)\n", + "\n", + "#Compute weights and cost function after running gradient descent\n", + "theta, costs = gradient_descent(X, y, theta_init, 1, 200)\n", + "print(\"Theta after running gradient descent:\", theta)\n", + "print(\"Resulting cost:\", costs[-1])\n", + "plt.plot(costs)\n", + "plt.xlabel(\"Iterations\")\n", + "plt.ylabel(\"$J(\\Theta)$\")\n", + "plt.title(\"Values of Cost Function over iterations of Gradient Descent\")\n", + "p = predict(theta, X)\n", + "print(\"Training Accuracy:\", sum(p==y)[0],\"%\")\n", + "\n", + "#Test\n", + "test = np.array([50,79])\n", + "test = (test - mean_scores)/std_scores\n", + "test = np.append(np.ones(1), test)\n", + "probability = sigmoid(test.dot(theta))\n", + "print(\"A person who scores 50 and 79 on their DMV written tests have a\",\n", + " np.round(probability[0], 2),\"probability of passing.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plotting the decision boundary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$h_\\theta(x) = \\sigma(z)$, where $\\sigma$ is the logistic sigmoid 
function and $z = \\theta^Tx$\n", + "\n", + "When $h_\\theta(x) \\geq 0.5$ the model predicts class \"1\":\n", + "\n", + "$\\implies \\sigma(\\theta^Tx) \\geq 0.5$\n", + "\n", + "$\\implies \\theta^Tx \\geq 0$ predict class \"1\" \n", + "\n", + "Hence, $\\theta_0 + \\theta_1x_1 + \\theta_2x_2 = 0$ is the equation for the decision boundary, giving us \n", + "\n", + "$ x_2 = \\frac{-(\\theta_0+\\theta_1x_1)}{\\theta_2}$" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "passed = (results == 1).reshape(100, 1)\n", + "failed = (results == 0).reshape(100, 1)\n", + "sns.scatterplot(x = X[passed[:, 0], 1],\n", + " y = X[passed[:, 0], 2],\n", + " marker = \"^\",\n", + " color = \"green\",\n", + " s = 60)\n", + "ax = sns.scatterplot(x = X[failed[:, 0], 1],\n", + " y = X[failed[:, 0], 2],\n", + " marker = \"X\",\n", + " color = \"red\",\n", + " s = 60)\n", + "\n", + "ax.legend([\"Passed\", \"Failed\"])\n", + "ax.set(xlabel=\"DMV Written Test 1 Scores\", ylabel=\"DMV Written Test 2 Scores\")\n", + "\n", + "x_boundary = np.array([np.min(X[:, 1]), np.max(X[:, 1])])\n", + "y_boundary = -(theta[0] + theta[1] * x_boundary) / theta[2]\n", + "\n", + "sns.lineplot(x = x_boundary, y = y_boundary, color=\"blue\")\n", + "plt.show();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 5418acc83b275ff4c872bc8c6863a1ac6dc0caec Mon Sep 17 00:00:00 2001 From: Opeyemi Osakuade <59209191+opeyemiferanmi1@users.noreply.github.com> Date: Wed, 22 Apr 2020 23:30:45 +0100 Subject: [PATCH 5/5] Update README.md --- Opeyemi Osakuade/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Opeyemi Osakuade/README.md b/Opeyemi Osakuade/README.md index c231d3b..c9e7f94 100644 --- a/Opeyemi Osakuade/README.md +++ b/Opeyemi Osakuade/README.md @@ -15,9 +15,7 @@ Taking the log, log odds for the model turns out to be the equation of the *Sigm Since the 
logistic regression function(sigmoid) is *non linear*, to get a *convex function*, i.e a bowl-shaped function that eases the gradient descent function's work to converge to the optimal minimum point,a logistic regression cost function is derived ### Gradient descent -To choose the values of weights that corresponds to a convex function and fits the data well(so we reach a global minimum), ensure that - -the prediction(h) is at least close to the actual *y*, minimize the cost function using gradient descent. +To choose the values of weights that corresponds to a convex function and fits the data well(so we reach a global minimum), ensure that the prediction(h) is at least close to the actual *y*, minimize the cost function using gradient descent. Repeat until convergence, updating all weights.