diff --git a/Logistic Regression Algortithm.py b/Logistic Regression Algortithm.py
new file mode 100644
index 0000000..6f35913
--- /dev/null
+++ b/Logistic Regression Algortithm.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+# the main library used to build the algorithm
+import numpy as np
+# for visualization after fitting the algorithm
+import matplotlib.pyplot as plt
+# a sample dataset to test the algorithm
+import sklearn.datasets as ds
+
+
+# In[2]:
+
+
+# the dataset
+iris = ds.load_iris()
+
+
+# In[3]:
+
+
+# features: only the first two (sepal length and sepal width), so the
+# decision boundary can be visualized in 2D
+X = iris.data[:, :2]
+# target: turn the three iris classes into a binary problem (setosa vs. the rest)
+y = (iris.target != 0) * 1
+
+
+# In[4]:
+
+
+class LogisticRegression:
+    """
+    Implementation of Logistic Regression from scratch.
+    """
+    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
+        self.lr = lr
+        self.num_iter = num_iter
+        self.fit_intercept = fit_intercept
+        self.verbose = verbose
+
+    def __add_intercept(self, X):
+        # prepend a column of ones so the intercept is learned as a weight
+        intercept = np.ones((X.shape[0], 1))
+        return np.concatenate((intercept, X), axis=1)
+
+    def __sigmoid(self, z):
+        # maps any real value into the interval (0, 1)
+        return 1 / (1 + np.exp(-z))
+
+    def __loss(self, h, y):
+        # average binary cross-entropy (log loss)
+        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
+
+    def fit(self, X, y):
+        if self.fit_intercept:
+            X = self.__add_intercept(X)
+        # weight initialization
+        self.theta = np.zeros(X.shape[1])
+        for i in range(self.num_iter):
+            z = np.dot(X, self.theta)
+            h = self.__sigmoid(z)
+            # gradient of the average cross-entropy loss w.r.t. theta
+            gradient = np.dot(X.T, (h - y)) / y.size
+            # gradient descent update
+            self.theta -= self.lr * gradient
+            if self.verbose and i % 10000 == 0:
+                z = np.dot(X, self.theta)
+                h = self.__sigmoid(z)
+                print(f'loss: {self.__loss(h, y)}')
+
+    def predict_proba(self, X):
+        if self.fit_intercept:
+            X = self.__add_intercept(X)
+        return self.__sigmoid(np.dot(X, self.theta))
+
+    def predict(self, X):
+        # round the probability at the 0.5 threshold to get the class label
+        return self.predict_proba(X).round()
+
+
+# In[5]:
+
+
+model = LogisticRegression(lr=0.1, num_iter=200)
+
+
+# In[6]:
+
+
+# %time magic from the original notebook; times the fit
+get_ipython().run_line_magic('time', 'model.fit(X, y)')
+
+
+# In[7]:
+
+
+pred = model.predict(X)
+print(f'Accuracy Score: {(pred == y).mean()}')
+
+
+# In[8]:
+
+
+plt.figure(figsize=(10, 6))
+
+plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color='b', label='0')
+plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color='r', label='1')
+plt.legend()
+
+x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
+x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
+
+# evaluate the model on a grid to draw the 0.5 probability contour,
+# i.e. the decision boundary
+xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
+grid = np.c_[xx1.ravel(), xx2.ravel()]
+probs = model.predict_proba(grid).reshape(xx1.shape)
+plt.contour(xx1, xx2, probs, [0.5], linewidths=1, colors='black')
+plt.show()
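+
+
+# In[9]:
+
+
+# The README mentions a comparison with the Logistic Regression provided by
+# scikit-learn. A minimal sketch of such a comparison (an assumption, not part
+# of the original script): a very large C roughly disables regularization so
+# the two models are comparable.
+from sklearn.linear_model import LogisticRegression as SkLogisticRegression
+
+sk_model = SkLogisticRegression(C=1e20, solver='lbfgs')
+sk_model.fit(X, y)
+sk_pred = sk_model.predict(X)
+print(f'scikit-learn Accuracy Score: {(sk_pred == y).mean()}')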
diff --git a/README.md b/README.md
index d3696d5..000b552 100644
--- a/README.md
+++ b/README.md
@@ -4,38 +4,18 @@
 ![DSN logo](DSN_logo.png)|DSN Algorithm Challenge|
 |---|---|
 
-A lot of data scientists or machine learning enthusiasts do use various machine learning algorithms as a black box without knowing how they work or the mathematics behind it. The purpose of this challenge is to encourage the mathematical understanding of machine learning algorithms, their break and yield point.
+I used the NumPy library to build Logistic Regression from scratch.
+I also tested the algorithm and compared it with the Logistic Regression provided by scikit-learn.
+The matplotlib library is used for visualization.
 
-In summary, participants are encouraged to understand the fundamental concepts behind machine learning algorithms/models.
+# My Method
+I created a class, LogisticRegression, which takes four (4) arguments:
+
+argument 1: lr, short for learning rate, which determines the size (not the direction) of the steps the algorithm takes when updating the weights learned during training
+argument 2: num_iter, short for number of iterations; this is the number of times the algorithm goes over the data in order to learn the weights. If it is too high, training takes a long time and may lead to overfitting; if it is too low, it may lead to underfitting
+argument 3: fit_intercept, which rests on the assumption that there is a linear relationship between the features and the target, hence the learned intercept is used when predicting probabilities
+argument 4: verbose, which outputs the training progress (the loss) during fitting
 
-The rules and guidelines for this challenge are as follows:
+Logistic Regression models the probability of a certain class or event existing, such as win or loss.
 
-1. Ensure to register at https://bit.ly/dsnmlhack
-
-2. The algorithm challenge is open to all.
-
-3. Participants are expected to design and develop the Logistic Regression algorithm from scratch using Python or R programming.
-
-4. For python developers (numpy is advisable).
-
-5. To push your solution to us, make a [pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) to DSN's GitHub page at https://www.github.com/datasciencenigeria/ML-Logistic-regression-algorithm-challenge. Ensure to add your readme file to understand your code.
-
-6. The top 3 optimized code will be compensated as follows:
-
-- **1st position**: 20GB data plan.
-- **2nd position**: 15GB data plan.
-- **3rd position**: 10GB data plan.
-
-7. Add your scripts and readme.MD file as a folder saved as your full name (surname_first_middle name) by making a pull request to the repository.
-
----
-For issues on this challenge kindly reach out to the AI+campus/city managers
-
-**Twitter**: [@DataScienceNIG](https://twitter.com/DataScienceNIG), [@elishatofunmi](https://twitter.com/Elishatofunmi), [@o_funminiyi](https://twitter.com/o_funminiyi), [@gbganalyst](https://twitter.com/gbganalyst)
-
-or
-
-**Call**: +2349062000119,+2349080564419.
-
-Good luck!
+The sigmoid function makes this possible because it maps any real value into another value between 0 and 1.
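+
+A minimal sketch of the sigmoid and the resulting decision rule (the standalone helper name `sigmoid` here is illustrative; in the class it is the private method `__sigmoid`):
+
+```python
+import numpy as np
+
+def sigmoid(z):
+    # squashes any real value into the open interval (0, 1)
+    return 1 / (1 + np.exp(-z))
+
+print(sigmoid(-5), sigmoid(0), sigmoid(5))  # ~0.0067, 0.5, ~0.9933
+# predict() rounds this probability, so inputs with sigmoid(z) above 0.5
+# are labelled class 1 and the rest class 0
+```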