diff --git a/module1-vectors-and-matrices/Jacob_Torres_LS_DS_131_Vectors_and_Matrices_Assignment.ipynb b/module1-vectors-and-matrices/Jacob_Torres_LS_DS_131_Vectors_and_Matrices_Assignment.ipynb new file mode 100644 index 00000000..34e8532f --- /dev/null +++ b/module1-vectors-and-matrices/Jacob_Torres_LS_DS_131_Vectors_and_Matrices_Assignment.ipynb @@ -0,0 +1,1172 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Jacob_Torres_LS_DS_131_Vectors_and_Matrices_Assignment.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yXA3GwWhY9KL" + }, + "source": [ + "# Part 1 - Scalars and Vectors\n", + "\n", + "For the questions below it is not sufficient to simply provide answer to the questions, but you must solve the problems and show your work using python (the NumPy library will help a lot!) Translate the vectors and matrices into their appropriate python representations and use numpy or functions that you write yourself to demonstrate the result or property. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oNOTv43_Zi9L" + }, + "source": [ + "## 1.1 Create a two-dimensional vector and plot it on a graph" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5oCHTpESqmk4" + }, + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "XNqjzQzrkVG7", + "outputId": "412d60bd-ee5d-449e-d2a2-033b2310c333", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 281 + } + }, + "source": [ + "# 2D vector\n", + "vec_2d = np.array([2, 5])\n", + "\n", + "# Graph the vector\n", + "plt.arrow(0,0,vec_2d[0],vec_2d[1], color='green')\n", + "plt.xlim(0, 3)\n", + "plt.ylim(0, 5)\n", + "plt.title(\"2-Dimensional Vector\")\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "unKFT619lk3e" + }, + "source": [ + "## 1.2 Create a three-dimensional vector and plot it on a graph" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "atUEd3T6llKm", + "outputId": "20039539-5e28-4523-9bd4-00c2fd415887", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 466 + } + }, + "source": [ + "# 3-dimensional vector\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "\n", + "vectors = np.array([\n", + " [0, 0, 0, 3, 1, -2],\n", + " [0, 0, 0, 1, -2, 2],\n", + " [0, 0, 0, -1, 3, 0]\n", + "])\n", + "X, Y, Z, U, V, W = zip(*vectors)\n", + "fig = plt.figure(figsize=(8, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "ax.quiver(X, Y, Z, U, V, W, length=1)\n", + "ax.set_xlim(-3, 3)\n", + "ax.set_ylim(-3, 3)\n", + "ax.set_zlim(-3, 3)\n", + "ax.set_xlabel('X')\n", + "ax.set_ylabel('Y')\n", + "ax.set_zlabel('Z')\n", + "ax.set_title(\"3-Dimensional Vector\")\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b7qFxbKxZmI2" + }, + "source": [ + "## 1.3 Scale the vectors you created in 1.1 by $5$, $\\pi$, and $-e$ and plot all four vectors (original + 3 scaled vectors) on a graph. What do you notice about these vectors? " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ah6zMSLJdJwL", + "outputId": "d7cb85b7-0337-4278-9e27-5319305406c6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + } + }, + "source": [ + "from math import e, pi\n", + "print(e)\n", + "print(pi)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "2.718281828459045\n", + "3.141592653589793\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3qpwDlzXkVf5", + "outputId": "2d69dd95-2e8a-4d27-e283-8f9730f52143", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + } + }, + "source": [ + "# Scale the original vector\n", + "A = np.multiply(5, vec_2d)\n", + "B = np.multiply(pi, vec_2d)\n", + "C = np.multiply(np.multiply(-1, e), vec_2d)\n", + "\n", + "print(f\"\"\"Original vector = {vec_2d}\n", + "Scaled vector A = {A}\n", + "Scaled vector B = {B}\n", + "Scaled vector C = {C}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Original vector = [2 5]\n", + "Scaled vector A = [10 25]\n", + "Scaled vector B = [ 6.28318531 15.70796327]\n", + "Scaled vector C = [ -5.43656366 -13.59140914]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lIYq9lnSl6_h", + "outputId": "28b4c44d-006e-473e-8500-308f83e4825f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 879 + } + }, + "source": [ + "# Graph all 4 vectors\n", + "fig, ax = plt.subplots(figsize=(10, 15))\n", + "ax.set_title(\"Scaled Vectors\")\n", + "\n", + 
"ax.set_xlim(-10, 10)\n", + "ax.set_xticks(ticks=np.arange(-10, 15, 5))\n", + "\n", + "\n", + "ax.set_ylim(-30, 30)\n", + "ax.set_yticks(ticks=np.arange(-30, 40, 10))\n", + "\n", + "ax.arrow(0,0, vec_2d[0],vec_2d[1], color='g')\n", + "ax.text(x=3, y=6, s='V', fontweight='bold')\n", + "\n", + "ax.arrow(0,0, A[0],A[1], color='r')\n", + "ax.text(x=9, y=26, s='A', fontweight='bold')\n", + "\n", + "ax.arrow(0,0, B[0],B[1], color='b')\n", + "ax.text(x=7, y=16, s='B', fontweight='bold')\n", + "\n", + "ax.arrow(0,0, C[0],C[1], color='y')\n", + "ax.text(x=-6, y=-14, s='C', fontweight='bold')\n", + "\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wrgqa6sWimbH" + }, + "source": [ + "## 1.4 Graph vectors $\\vec{a}$ and $\\vec{b}$ and plot them on a graph\n", + "\n", + "\\begin{align}\n", + "\\vec{a} = \\begin{bmatrix} 5 \\\\ 7 \\end{bmatrix}\n", + "\\qquad\n", + "\\vec{b} = \\begin{bmatrix} 3 \\\\4 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I1BGXA_skV-b", + "outputId": "c8200e87-e544-4017-c968-ed153d48ac0c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 716 + } + }, + "source": [ + "# Graph vectors a and b\n", + "\n", + "a = [5, 7]\n", + "b = [3, 4]\n", + "fig, ax = plt.subplots(figsize=(8, 12))\n", + "ax.set_title(\"Vectors a and b\")\n", + "\n", + "ax.set_xlim(0, 6)\n", + "ax.set_ylim(0, 8)\n", + "\n", + "ax.arrow(0,0, a[0],a[1], color='b')\n", + "ax.text(x=6, y=7, s='a', fontweight='bold')\n", + "ax.arrow(0,0, b[0],b[1], color='y')\n", + "ax.text(x=4, y=4, s='b', fontweight='bold')\n", + "\n", + "\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QN6RU_3gizpw" + }, + "source": [ + "## 1.5 find $\\vec{a} - \\vec{b}$ and plot the result on the same graph as $\\vec{a}$ and $\\vec{b}$. Is there a relationship between vectors $\\vec{a} \\thinspace, \\vec{b} \\thinspace \\text{and} \\thinspace \\vec{a-b}$" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "68sWHIOPkXp5", + "outputId": "d30df900-7c46-483d-86c4-ebef7ca6b028", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + } + }, + "source": [ + "# Find vector a -b\n", + "a_b = np.subtract(a, b)\n", + "print(f\"Vector a -b = {a_b}\")\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Vector a -b = [2 3]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "z7Qm1DgRqh1i", + "outputId": "dd08e253-1221-4296-c3a5-1005bcb04daf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 499 + } + }, + "source": [ + "# Graph vectors a, b, and a -b\n", + "fig, ax = plt.subplots(figsize=(8, 8))\n", + "ax.set_title(\"Vector a -b\")\n", + "\n", + "ax.set_xlim(0, 6)\n", + "ax.set_ylim(0, 8)\n", + "\n", + "ax.arrow(0,0, a[0],a[1], color='b')\n", + "ax.text(x=6, y=7, s='a', fontweight='bold')\n", + "\n", + "ax.arrow(0,0, b[0],b[1], color='y')\n", + "ax.text(x=4, y=4, s='b', fontweight='bold')\n", + "\n", + "ax.arrow(0,0, a_b[0],a_b[1], color='r')\n", + "ax.text(x=3, y=3, s='a -b', fontweight='bold')\n", + "\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1ZPVuJAlehu_" + }, + "source": [ + "## 1.6 Find $c \\cdot d$\n", + "\n", + "\\begin{align}\n", + "\\vec{c} = \\begin{bmatrix}7 & 22 & 4 & 16\\end{bmatrix}\n", + "\\qquad\n", + "\\vec{d} = \\begin{bmatrix}12 & 6 & 2 & 9\\end{bmatrix}\n", + "\\end{align}\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2_cZQFCskYNr", + "outputId": "70f3f4c2-c3a2-438b-dd00-312d0b868e50", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + } + }, + "source": [ + "c = np.array([7, 22, 4, 16])\n", + "d = np.array([12, 6, 2, 9])\n", + "c_dot_d = np.dot(c, d)\n", + "print(f\"\"\"\n", + "c = {c}\n", + "d = {d}\n", + "c . d = {c_dot_d}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "c = [ 7 22 4 16]\n", + "d = [12 6 2 9]\n", + "c . d = 368\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cLm8yokpfg9B" + }, + "source": [ + "## 1.7 Find $e \\times f$\n", + "\n", + "\\begin{align}\n", + "\\vec{e} = \\begin{bmatrix} 5 \\\\ 7 \\\\ 2 \\end{bmatrix}\n", + "\\qquad\n", + "\\vec{f} = \\begin{bmatrix} 3 \\\\4 \\\\ 6 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ku-TdCKAkYs8", + "outputId": "d4d8b704-0369-4701-eb7d-76ca7b9492e2", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + } + }, + "source": [ + "e = np.array([\n", + " [5, 7, 2]\n", + "])\n", + "\n", + "f = np.array([\n", + " [3, 4, 6]\n", + "])\n", + "\n", + "e_mul_f = np.matmul(e.T, f)\n", + "\n", + "print(f\"\"\"\n", + "e = {e}\n", + "f = {f}\n", + "The product of e and f = {e_mul_f}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "e = [[5 7 2]]\n", + "f = [[3 4 6]]\n", + "The product of e and f = [[15 20 
30]\n", + " [21 28 42]\n", + " [ 6 8 12]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-TN8wO2-h53s" + }, + "source": [ + "## 1.8 Find $||g||$ and then find $||h||$. Which is longer?\n", + "\n", + "\\begin{align}\n", + "\\vec{g} = \\begin{bmatrix} 1 \\\\ 1 \\\\ 1 \\\\ 8 \\end{bmatrix}\n", + "\\qquad\n", + "\\vec{h} = \\begin{bmatrix} 3 \\\\3 \\\\ 3 \\\\ 3 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-5VKOMKBlgaA", + "outputId": "8b1e5f46-90a7-4362-fc3c-a86b1077bdd8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + } + }, + "source": [ + "g = np.array([\n", + " [1, 1, 1, 8]\n", + "])\n", + "\n", + "h = np.array([\n", + " [3, 3, 3, 3]\n", + "])\n", + "\n", + "g_norm = np.linalg.norm(g)\n", + "h_norm = np.linalg.norm(h)\n", + "\n", + "print(f\"\"\"\n", + "g norm = {g_norm}\n", + "h norm = {h_norm}\n", + "Therefore, g is longer than h.\n", + "\"\"\")\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "g norm = 8.18535277187245\n", + "h norm = 6.0\n", + "Therefore, g is longer than h.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "njrWIMS-ZAoH" + }, + "source": [ + "# Part 2 - Matrices" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjkcAVIOmOnn" + }, + "source": [ + "## 2.1 What are the dimensions of the following matrices? Which of the following can be multiplied together? 
See if you can find all of the different legal combinations.\n", + "\\begin{align}\n", + "A = \\begin{bmatrix}\n", + "1 & 2 \\\\\n", + "3 & 4 \\\\\n", + "5 & 6\n", + "\\end{bmatrix}\n", + "\\qquad\n", + "B = \\begin{bmatrix}\n", + "2 & 4 & 6 \\\\\n", + "\\end{bmatrix}\n", + "\\qquad\n", + "C = \\begin{bmatrix}\n", + "9 & 6 & 3 \\\\\n", + "4 & 7 & 11\n", + "\\end{bmatrix}\n", + "\\qquad\n", + "D = \\begin{bmatrix}\n", + "1 & 0 & 0 \\\\\n", + "0 & 1 & 0 \\\\\n", + "0 & 0 & 1\n", + "\\end{bmatrix}\n", + "\\qquad\n", + "E = \\begin{bmatrix}\n", + "1 & 3 \\\\\n", + "5 & 7\n", + "\\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z69c-uPtnbIx", + "outputId": "edc42e49-cdf0-464f-8e7e-ab2af9345b70", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + } + }, + "source": [ + "matrix_A = np.array([\n", + " [1, 2],\n", + " [3, 4],\n", + " [5, 6]\n", + "])\n", + "\n", + "matrix_B = np.array([\n", + " [2, 4, 6]\n", + "])\n", + "\n", + "matrix_C = np.array([\n", + " [9, 6, 3],\n", + " [4, 7, 11]\n", + "])\n", + "\n", + "matrix_D = np.array([\n", + " [1, 0, 0],\n", + " [0, 1, 0],\n", + " [0, 0, 1]\n", + "])\n", + "\n", + "matrix_E = np.array([\n", + " [1, 3],\n", + " [5, 7]\n", + "])\n", + "\n", + "print(f\"\"\"\n", + "Matrix A {matrix_A.shape}\n", + "Matrix B {matrix_B.shape}\n", + "Matrix C {matrix_C.shape}\n", + "Matrix D {matrix_D.shape}\n", + "Matrix E {matrix_E.shape}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Matrix A (3, 2)\n", + "Matrix B (1, 3)\n", + "Matrix C (2, 3)\n", + "Matrix D (3, 3)\n", + "Matrix E (2, 2)\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2g-KTo660Z2v", + "outputId": "b55527ef-f9c2-401a-ccfb-7369795afe37", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + } + }, + "source": [ + "# Multiply matrices A and C\n", + "matrix_F = 
np.matmul(matrix_A, matrix_C)\n", + "print(f\"\"\"\n", + "Matrix F {matrix_F.shape}\n", + "{matrix_F}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Matrix F (3, 3)\n", + "[[17 20 25]\n", + " [43 46 53]\n", + " [69 72 81]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "we03LqtBy4IY", + "outputId": "0b167a32-4572-47bd-a30f-91b42489ab0e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + } + }, + "source": [ + "# Multiply F (the product of A and C) by D\n", + "matrix_G = np.matmul(matrix_F, matrix_D)\n", + "print(f\"\"\"\n", + "Matrix G {matrix_G.shape}\n", + "{matrix_G}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Matrix G (3, 3)\n", + "[[17 20 25]\n", + " [43 46 53]\n", + " [69 72 81]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y69ojy6xzEh5" + }, + "source": [ + "Matrices A and C can be multiplied because the number of columns in A (2) equals the number of rows in C (2); the result has A's row count and C's column count, (3, 3). Likewise F and D can be multiplied because F has 3 columns and D has 3 rows. The two matrices do not need to have transposed shapes: a product XY is defined whenever the number of columns of X equals the number of rows of Y." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lMOlCoM3ncGa" + }, + "source": [ + "## 2.2 Find the following products: CD, AE, and BA. What are the dimensions of the resulting matrices? How does that relate to the dimensions of their factor matrices?" 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zhKwiSItoE2F", + "outputId": "745e06c2-22ae-4729-8859-2bd5efa6eb10", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + } + }, + "source": [ + "matrix_H = np.matmul(matrix_C, matrix_D)\n", + "matrix_I = np.matmul(matrix_A, matrix_E)\n", + "matrix_J = np.matmul(matrix_B, matrix_A)\n", + "print(f\"\"\"\n", + "CD = Matrix H {matrix_H.shape}\n", + "AE = Matrix I {matrix_I.shape}\n", + "BA = Matrix J {matrix_J.shape}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "CD = Matrix H (2, 3)\n", + "AE = Matrix I (3, 2)\n", + "BA = Matrix J (1, 2)\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ygWaNQpd9Jrj" + }, + "source": [ + "CD: the product of matrices with dimensions (2, 3) and (3, 3) has dimensions (2, 3)\n", + "\n", + "AE: the product of matrices with dimensions (3, 2) and (2, 2) has dimensions (3, 2)\n", + "\n", + "BA: the product of matrices with dimensions (1, 3) and (3, 2) has dimensions (1, 2)\n", + "\n", + "Based on these results, the pattern in the products of matrices seems to be to inherit the number of rows of the first factor matrix, and the number of columns of the second factor matrix." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p2jmaGLgoFPN" + }, + "source": [ + "## 2.3 Find $F^{T}$. How are the numbers along the main diagonal (top left to bottom right) of the original matrix and its transpose related? What are the dimensions of $F$? 
What are the dimensions of $F^{T}$?\n", + "\n", + "\\begin{align}\n", + "F = \n", + "\\begin{bmatrix}\n", + "20 & 19 & 18 & 17 \\\\\n", + "16 & 15 & 14 & 13 \\\\\n", + "12 & 11 & 10 & 9 \\\\\n", + "8 & 7 & 6 & 5 \\\\\n", + "4 & 3 & 2 & 1\n", + "\\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Wl3ElwgLqaAn", + "outputId": "4a3a3faa-0035-422b-b60a-fc2204338169", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 208 + } + }, + "source": [ + "f = np.array([\n", + " [20, 19, 18, 17],\n", + " [16, 15, 14, 13],\n", + " [12, 11, 10, 9],\n", + " [8, 7, 6, 5],\n", + " [4, 3, 2, 1]\n", + "])\n", + "\n", + "print(f\"\"\"\n", + "f = {f}\n", + "f^T = {f.T}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "f = [[20 19 18 17]\n", + " [16 15 14 13]\n", + " [12 11 10 9]\n", + " [ 8 7 6 5]\n", + " [ 4 3 2 1]]\n", + "f^T = [[20 16 12 8 4]\n", + " [19 15 11 7 3]\n", + " [18 14 10 6 2]\n", + " [17 13 9 5 1]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Adh-UXCoF7Is" + }, + "source": [ + "The main diagonal of both f and its transpose are 20, 15, 10, and 5, in that order. Even though the transpose has most of the other values in different positions, the main diagonal appears not to change." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "13ik2LEEZLHn" + }, + "source": [ + "# Part 3 - Square Matrices" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sDBAPUwfp7f7" + }, + "source": [ + "## 3.1 Find $IG$ (be sure to show your work) 😃\n", + "\n", + "You don't have to do anything crazy complicated here to show your work, just create the G matrix as specified below, and a corresponding 2x2 Identity matrix and then multiply them together to show the result. 
You don't need to write LaTeX or anything like that (unless you want to).\n", + "\n", + "\\begin{align}\n", + "G= \n", + "\\begin{bmatrix}\n", + "13 & 14 \\\\\n", + "21 & 12 \n", + "\\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZnqvZBOYqar3", + "outputId": "db5b8ea6-6606-4599-8bf3-318b048426cc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + } + }, + "source": [ + "g = np.array([\n", + " [13, 21],\n", + " [14, 12]\n", + "])\n", + "\n", + "identity = np.array([\n", + " [1, 0],\n", + " [0, 1]\n", + "])\n", + "\n", + "identity_g = np.matmul(g, identity)\n", + "\n", + "print(f\"\"\"\n", + "g = {g}\n", + "identity matrix = {identity}\n", + "g * identity matrix = {identity_g}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "g = [[13 21]\n", + " [14 12]]\n", + "identity matrix = [[1 0]\n", + " [0 1]]\n", + "g * identity matrix = [[13 21]\n", + " [14 12]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DZ_0XTDQqpMT" + }, + "source": [ + "## 3.2 Find $|H|$ and then find $|J|$.\n", + "\n", + "\\begin{align}\n", + "H= \n", + "\\begin{bmatrix}\n", + "12 & 11 \\\\\n", + "7 & 10 \n", + "\\end{bmatrix}\n", + "\\qquad\n", + "J= \n", + "\\begin{bmatrix}\n", + "0 & 1 & 2 \\\\\n", + "7 & 10 & 4 \\\\\n", + "3 & 2 & 0\n", + "\\end{bmatrix}\n", + "\\end{align}\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5QShhoXyrjDS", + "outputId": "83ea3db5-eb69-4105-e38b-a43e974b2cca", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 87 + } + }, + "source": [ + "h = np.array([\n", + " [12, 11],\n", + " [7, 10]\n", + "])\n", + "\n", + "j = np.array([\n", + " [0, 1, 2],\n", + " [7, 10, 4],\n", + " [3, 2, 0]\n", + "])\n", + "\n", + "# Determinants\n", + "det_h = np.linalg.det(h)\n", + "det_j = np.linalg.det(j)\n", + "\n", + "print(f\"\"\"\n", + "|h| = {round(det_h, 
3)}\n", + "|j| = {round(det_j, 3)}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "|h| = 43.0\n", + "|j| = -20.0\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2gZl1CFwrXSH" + }, + "source": [ + "## 3.3 Find $H^{-1}$ and then find $J^{-1}$" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nyX6De2-rio1", + "outputId": "8a115279-d700-4f25-cd47-04baf06a73f7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + } + }, + "source": [ + "inv_h = np.linalg.inv(h)\n", + "inv_j = np.linalg.inv(j)\n", + "\n", + "print(f\"\"\"\n", + "h^-1 = {inv_h}\n", + "j^-1 = {inv_j}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "h^-1 = [[ 0.23255814 -0.25581395]\n", + " [-0.1627907 0.27906977]]\n", + "j^-1 = [[ 0.4 -0.2 0.8 ]\n", + " [-0.6 0.3 -0.7 ]\n", + " [ 0.8 -0.15 0.35]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vvd4Pe86rjhW" + }, + "source": [ + "## 3.4 Find $HH^{-1}$ and then find $J^{-1}J$. Is $HH^{-1} == J^{-1}J$? Why or Why not? \n", + "\n", + "Please ignore Python rounding errors. If necessary, format your output so that it rounds to 5 significant digits (the fifth decimal place)." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TShQKn7_lfnQ", + "outputId": "234ce224-bd27-436e-b32f-cb6ddaec7734", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + } + }, + "source": [ + "print(f\"\"\"\n", + "h(h^-1) = {np.matmul(h, inv_h)}\n", + "j(j^-1) = {np.matmul(j, inv_j)}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "h(h^-1) = [[1.00000000e+00 5.55111512e-16]\n", + " [2.22044605e-16 1.00000000e+00]]\n", + "j(j^-1) = [[ 1.00000000e+00 -5.55111512e-17 0.00000000e+00]\n", + " [ 0.00000000e+00 1.00000000e+00 0.00000000e+00]\n", + " [ 0.00000000e+00 0.00000000e+00 1.00000000e+00]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p5b6HLjMpYCr" + }, + "source": [ + "The product matrices in both h(h^-1) and j(j^-1) are identity matrices for their respective dimensionalities (the tiny off-diagonal values such as 5.55e-16 are floating-point rounding error). They are not equal to each other, however: h(h^-1) is the 2x2 identity and j(j^-1) is the 3x3 identity, and matrices with different dimensions cannot be equal. (The prompt asks for (j^-1)j rather than j(j^-1); a square matrix commutes with its inverse, so that product is the same 3x3 identity.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V0iTO4McYjtk" + }, + "source": [ + "# Stretch Goals: \n", + "\n", + "A reminder that these challenges are optional. If you finish your work quickly we welcome you to work on them. If there are other activities that you feel like will help your understanding of the above topics more, feel free to work on that. Topics from the Stretch Goals sections will never end up on Sprint Challenges. You don't have to do these in order, you don't have to do all of them. \n", + "\n", + "- Write a function that can calculate the dot product of any two vectors of equal length that are passed to it.\n", + "- Write a function that can calculate the norm of any vector\n", + "- Prove to yourself again that the vectors in 1.9 are orthogonal by graphing them. 
\n", + "- Research how to plot a 3d graph with animations so that you can make the graph rotate (this will be easier in a local notebook than in google colab)\n", + "- Create and plot a matrix on a 2d graph.\n", + "- Create and plot a matrix on a 3d graph.\n", + "- Plot two vectors that are not collinear on a 2d graph. Calculate the determinant of the 2x2 matrix that these vectors form. How does this determinant relate to the graphical interpretation of the vectors?\n", + "\n" + ] + } + ] +} \ No newline at end of file diff --git a/module2-intermediate-linear-algebra/Jacob_Torres_LS_DS_132_Intermediate_Linear_Algebra_Assignment.ipynb b/module2-intermediate-linear-algebra/Jacob_Torres_LS_DS_132_Intermediate_Linear_Algebra_Assignment.ipynb new file mode 100644 index 00000000..0b9794dd --- /dev/null +++ b/module2-intermediate-linear-algebra/Jacob_Torres_LS_DS_132_Intermediate_Linear_Algebra_Assignment.ipynb @@ -0,0 +1,1562 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Jacob_Torres_LS_DS_132_Intermediate_Linear_Algebra_Assignment.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GSNiYn8lr6nN" + }, + "source": [ + "# Statistics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3d4izUhQvh2_" + }, + "source": [ + "## 1.1 Sales for the past week was the following amounts: [3505, 2400, 3027, 2798, 3700, 3250, 2689]. Without using library functions, what is the mean, variance, and standard deviation of of sales from last week? 
(for extra bonus points, write your own function that can calculate these two values for any sized list)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YgCoLzDSwCvf" + }, + "source": [ + "# Function for finding sales stats\n", + "def sales_stats(sales):\n", + " sales_mean = 0.0\n", + " sales_variance = 0.0\n", + " sales_std = 0.0\n", + " \n", + " # Calculate mean\n", + " sum = 0\n", + " for x in range(len(sales)):\n", + " sum += sales[x]\n", + " sales_mean = sum / len(sales)\n", + "\n", + "# Calculate variance\n", + " sum = 0\n", + " for x in range(len(sales)):\n", + " sum += (sales[x] - sales_mean) ** 2\n", + " sales_variance = sum / (len(sales) -1)\n", + "\n", + " # Calculate standard deviation\n", + " sales_std = sales_variance ** (1/2)\n", + "\n", + " return sales_mean, sales_variance, sales_std\n" + ], + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "w1iZfYvBtEA1", + "outputId": "2e9d19e5-7e09-4755-ae8a-ebe232dff5ba", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105 + } + }, + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "weekly_sales = np.array([3505, 2400, 3027, 2798, 3700, 3250, 2689])\n", + "sales_mean, sales_variance, sales_std = sales_stats(weekly_sales)\n", + "\n", + "print(f\"\"\"\n", + "Sales mean = ${round(sales_mean, 2)}\n", + "Variance = ${round(sales_variance, 2)}\n", + "Standard deviation = ${round(sales_std, 2)}\n", + "\"\"\")" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Sales mean = $3052.71\n", + "Variance = $214387.9\n", + "Standard deviation = $463.02\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oh63KaOctEp_" + }, + "source": [ + "## 1.2 Find the covariance between last week's sales numbers and the number of customers that entered the store last week: [127, 80, 105, 92, 120, 115, 93] (you may use librray functions 
for calculating the covariance since we didn't specifically talk about its formula)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G7ZB0krot564", + "outputId": "a4ac2b4d-c595-400b-f456-a49b32f2dc32", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105 + } + }, + "source": [ + "weekly_customers = np.array([127, 80, 105, 92, 120, 115, 93])\n", + "\n", + "# Transform sales and customer data into dataframe\n", + "data = {\n", + " 'weekly_sales': weekly_sales,\n", + " 'weekly_customers': weekly_customers\n", + "}\n", + "sales_df = pd.DataFrame(data)\n", + "\n", + "# Calculate covariance between sales and customers\n", + "cov = sales_df.cov()\n", + "print(f\"\"\"Covariance between weekly sales and customers:\n", + "{cov}\n", + "\"\"\")" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Covariance between weekly sales and customers:\n", + " weekly_sales weekly_customers\n", + "weekly_sales 214387.904762 7604.357143\n", + "weekly_customers 7604.357143 290.952381\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J9SbUY9mt66I" + }, + "source": [ + "## 1.3 Find the standard deviation of customers who entered the store last week. Then, use the standard deviations of both sales and customers to standardize the covariance to find the correlation coefficient that summarizes the relationship between sales and customers. 
(You may use library functions to check your work.)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vFJms2YRrKhY", + "outputId": "6cbb6dcb-01df-4c5d-d836-e4c676a6b03c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105 + } + }, + "source": [ + "customer_mean, customer_variance, customer_std = sales_stats(weekly_customers)\n", + "print(f\"\"\"\n", + "Standard deviation:\n", + " Sales = ${round(sales_std, 2)}\n", + " Customers = {round(customer_std, 3)}\n", + "\"\"\")" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Standard deviation:\n", + " Sales = $463.02\n", + " Customers = 17.057\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ENyVw3U_f-bp", + "outputId": "adc28dc3-22fa-4af1-8aa1-e22ea50904b8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 157 + } + }, + "source": [ + "# Calculate the correlation coefficient\n", + "combined_std = sales_std * customer_std\n", + "combined_cov = cov['weekly_sales']['weekly_customers']\n", + "corr = combined_cov / combined_std\n", + "print(f\"\"\"The correlation between sales and customers last week:\n", + " r = {round(corr, 3)}\n", + "\n", + "Pandas correlation matrix:\n", + "{sales_df.corr()}\n", + "\"\"\")" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "The correlation between sales and customers last week:\n", + " r = 0.963\n", + "\n", + "Pandas correlation matrix:\n", + " weekly_sales weekly_customers\n", + "weekly_sales 1.000000 0.962834\n", + "weekly_customers 0.962834 1.000000\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IbZVf7nmujPJ" + }, + "source": [ + "## 1.4 Use pandas to import a cleaned version of the titanic dataset from the following link: [Titanic Dataset](https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/titanic_clean.csv)\n", + "\n", + "## 
Calculate the variance-covariance matrix and correlation matrix for the titanic dataset's numeric columns. (you can encode some of the categorical variables and include them as a stretch goal if you finish early)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0TWgUIiaCFzq", + "outputId": "d4787592-85be-4079-d13d-ff01d04315b8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 559 + } + }, + "source": [ + "# Load Titanic dataset\n", + "data_url = 'https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/titanic_clean.csv'\n", + "titanic_data = pd.read_csv(data_url)\n", + "titanic_data.drop(columns='Unnamed: 0', inplace=True)\n", + "print(titanic_data.shape)\n", + "titanic_data.head()" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(1310, 15)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhome.desthas_cabin_number
01.01.0Allen, Miss. Elisabeth Waltonfemale29.00000.00.024160211.3375B5S2NaNSt Louis, MO1
11.01.0Allison, Master. Hudson Trevormale0.91671.02.0113781151.5500C22 C26S11NaNMontreal, PQ / Chesterville, ON1
21.00.0Allison, Miss. Helen Lorainefemale2.00001.02.0113781151.5500C22 C26SNaNNaNMontreal, PQ / Chesterville, ON1
31.00.0Allison, Mr. Hudson Joshua Creightonmale30.00001.02.0113781151.5500C22 C26SNaN135.0Montreal, PQ / Chesterville, ON1
41.00.0Allison, Mrs. Hudson J C (Bessie Waldo Daniels)female25.00001.02.0113781151.5500C22 C26SNaNNaNMontreal, PQ / Chesterville, ON1
\n", + "
" + ], + "text/plain": [ + " pclass survived ... home.dest has_cabin_number\n", + "0 1.0 1.0 ... St Louis, MO 1\n", + "1 1.0 1.0 ... Montreal, PQ / Chesterville, ON 1\n", + "2 1.0 0.0 ... Montreal, PQ / Chesterville, ON 1\n", + "3 1.0 0.0 ... Montreal, PQ / Chesterville, ON 1\n", + "4 1.0 0.0 ... Montreal, PQ / Chesterville, ON 1\n", + "\n", + "[5 rows x 15 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FS1vbgWdnEXF", + "outputId": "e513a71f-9012-4d1f-af2d-7a29fb64c2b7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + } + }, + "source": [ + "titanic_data.cov()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pclasssurvivedagesibspparchfarebodyhas_cabin_number
pclass0.701969-0.127248-3.9546050.0530900.013287-24.227788-2.876653-0.249992
survived-0.1272480.236250-0.314343-0.0140880.0347766.1460230.0000000.061406
age-3.954605-0.314343165.850021-2.559806-1.459378114.41661381.6229221.463138
sibsp0.053090-0.014088-2.5598061.0850520.3368338.641768-8.708471-0.003946
parch0.0132870.034776-1.4593780.3368330.7491959.9280314.2371900.013316
fare-24.2277886.146023114.4166138.6417689.9280312678.959738-179.16468410.976961
body-2.8766530.00000081.622922-8.7084714.237190-179.1646849544.6885673.625689
has_cabin_number-0.2499920.0614061.463138-0.0039460.01331610.9769613.6256890.174613
\n", + "
" + ], + "text/plain": [ + " pclass survived ... body has_cabin_number\n", + "pclass 0.701969 -0.127248 ... -2.876653 -0.249992\n", + "survived -0.127248 0.236250 ... 0.000000 0.061406\n", + "age -3.954605 -0.314343 ... 81.622922 1.463138\n", + "sibsp 0.053090 -0.014088 ... -8.708471 -0.003946\n", + "parch 0.013287 0.034776 ... 4.237190 0.013316\n", + "fare -24.227788 6.146023 ... -179.164684 10.976961\n", + "body -2.876653 0.000000 ... 9544.688567 3.625689\n", + "has_cabin_number -0.249992 0.061406 ... 3.625689 0.174613\n", + "\n", + "[8 rows x 8 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oaDDPLoorcKk", + "outputId": "1307d209-c445-447a-c113-d1d0b587007a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + } + }, + "source": [ + "titanic_data.corr()" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pclasssurvivedagesibspparchfarebodyhas_cabin_number
pclass1.000000-0.312469-0.3663700.0608320.018322-0.558629-0.034642-0.713857
survived-0.3124691.000000-0.050199-0.0278250.0826600.244265NaN0.302250
age-0.366370-0.0501991.000000-0.190747-0.1308720.1718920.0590590.271887
sibsp0.060832-0.027825-0.1907471.0000000.3735870.160238-0.099961-0.009064
parch0.0183220.082660-0.1308720.3735871.0000000.2215390.0510990.036806
fare-0.5586290.2442650.1718920.1602380.2215391.000000-0.0431100.507253
body-0.034642NaN0.059059-0.0999610.051099-0.0431101.0000000.083796
has_cabin_number-0.7138570.3022500.271887-0.0090640.0368060.5072530.0837961.000000
\n", + "
" + ], + "text/plain": [ + " pclass survived ... body has_cabin_number\n", + "pclass 1.000000 -0.312469 ... -0.034642 -0.713857\n", + "survived -0.312469 1.000000 ... NaN 0.302250\n", + "age -0.366370 -0.050199 ... 0.059059 0.271887\n", + "sibsp 0.060832 -0.027825 ... -0.099961 -0.009064\n", + "parch 0.018322 0.082660 ... 0.051099 0.036806\n", + "fare -0.558629 0.244265 ... -0.043110 0.507253\n", + "body -0.034642 NaN ... 1.000000 0.083796\n", + "has_cabin_number -0.713857 0.302250 ... 0.083796 1.000000\n", + "\n", + "[8 rows x 8 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7K0Xfh8MvYkl" + }, + "source": [ + "# Orthogonality" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pe3eOZ2fvdZ-" + }, + "source": [ + "## 2.1 Plot two vectors that are orthogonal to each other. What is a synonym for orthogonal?" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YLSBk7hJvvCx", + "outputId": "46292beb-9c3b-4244-ddad-960056292c7b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 516 + } + }, + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "j = np.array([2, -1])\n", + "k = np.array([1, 2])\n", + "j_dot_k = np.vdot(j, k)\n", + "print(f\"The vectors below have a dot product of {j_dot_k}\")\n", + "\n", + "fig = plt.figure(figsize=(8, 8))\n", + "ax = fig.add_subplot()\n", + "ax.set_title(\"Orthogonal Vectors\")\n", + "ax.set_xlim(-3, 3)\n", + "ax.set_ylim(-3, 3)\n", + "ax.arrow(-1, -1, j[0],j[1], color='b')\n", + "ax.arrow(2, -1, k[0],k[1], color='r')\n", + "\n", + "plt.show()" + ], + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "text": [ + "The vectors below have a dot product of 0\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQ3Iq3sOeWuF" + }, + "source": [ + "A synonym for orthogonal is perpendicular. In other words, two vectors that are orthogonal to each other will create a right angle at their intersection." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7AS4V1Nhvvxz" + }, + "source": [ + "## 2.2 Are the following vectors orthogonal? Why or why not?\n", + "\n", + "\\begin{align}\n", + "a = \\begin{bmatrix} -5 \\\\ 3 \\\\ 7 \\end{bmatrix}\n", + "\\qquad\n", + "b = \\begin{bmatrix} 6 \\\\ -8 \\\\ 2 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "F_-y54YSz47k", + "outputId": "608743bf-1fb1-4ba5-c416-264220fd0e61", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 105 + } + }, + "source": [ + "# Calculate the dot product of the vectors\n", + "a = np.array([\n", + " [-5, 3, 7]\n", + "])\n", + "b = np.array([\n", + " [6, -8, 2]\n", + "])\n", + "\n", + "a_dot_b = np.vdot(a, b)\n", + "print(f\"\"\"a.b = {a_dot_b}\n", + "\n", + "Since the dot product = {a_dot_b}, and not 0,\n", + "the vectors are not orthogonal.\n", + "\"\"\")" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "text": [ + "a.b = -40\n", + "\n", + "Since the dot product = -40, and not 0,\n", + "the vectors are not orthogonal.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MiNjyqiEz5SG" + }, + "source": [ + "## 2.3 Compute the following values: What do these quantities have in common?\n", + "\n", + "## What is $||c||^2$? \n", + "\n", + "## What is $c \\cdot c$? 
\n", + "\n", + "## What is $c^{T}c$?\n", + "\n", + "\\begin{align}\n", + "c = \\begin{bmatrix} 2 & -15 & 6 & 20 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IlV_uaYK1EQB", + "outputId": "3e875f9f-d9d3-4254-f5e8-0d10806f372c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 157 + } + }, + "source": [ + "c = np.array([\n", + " [2, -15, 6, 20]\n", + "])\n", + "\n", + "print(f\"\"\"c = {c}\n", + "||c||^2 = {np.linalg.norm(c) ** 2}\n", + "c.c = {np.vdot(c, c)}\n", + "c^Tc = {c.T * c}\n", + "\"\"\")" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "text": [ + "c = [[ 2 -15 6 20]]\n", + "||c||^2 = 665.0\n", + "c.c = 665\n", + "c^Tc = [[ 4 -30 12 40]\n", + " [ -30 225 -90 -300]\n", + " [ 12 -90 36 120]\n", + " [ 40 -300 120 400]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X98kxMKOld9L" + }, + "source": [ + "- The squared norm of c = the dot product of c and itself.\n", + "- c multiplied by its transpose creates a 4x4 square matrix." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MK_TpWqk1Evk" + }, + "source": [ + "# Unit Vectors" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Kpit6WWO1b8l" + }, + "source": [ + "## 3.1 Using Latex, write the following vectors as a linear combination of scalars and unit vectors:\n", + "\n", + "\\begin{align}\n", + "d = \\begin{bmatrix} 7 \\\\ 12 \\end{bmatrix}\n", + "\\qquad\n", + "e = \\begin{bmatrix} 2 \\\\ 11 \\\\ -8 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oBCj1sDW2ouC" + }, + "source": [ + "\\begin{align}\n", + "d = 7\n", + "\\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}\n", + "+ 12\n", + "\\begin{bmatrix} 0 \\\\ 1 \\end{bmatrix}\n", + "\\qquad\n", + "e = 2\n", + "\\begin{bmatrix} 1 \\\\ 0 \\\\ 0 \\end{bmatrix}\n", + "+ 11\n", + "\\begin{bmatrix} 0 \\\\ 1 \\\\ 0 \\end{bmatrix}\n", + "- 8\n", + "\\begin{bmatrix} 0 \\\\ 0 \\\\ 1 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dAdUQuep1_yJ" + }, + "source": [ + "## 3.2 Turn vector $f$ into a unit vector:\n", + "\n", + "\\begin{align}\n", + "f = \\begin{bmatrix} 4 & 12 & 11 & 9 & 2 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I3W8ZiHR1_Fa", + "outputId": "7cd4d363-65ac-49bd-9e73-1d4aac5fb119", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "f = np.array([\n", + " [4, 12, 11, 9, 2]\n", + "])\n", + "\n", + "f_norm = np.linalg.norm(f)\n", + "f_unit = f / f_norm\n", + "print(f_unit)" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[[0.20908335 0.62725005 0.57497921 0.47043754 0.10454167]]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o39UyP-I5lpP" + }, + "source": [ + "# Linear Independence / 
Dependence " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ajfBqYe45sT5" + }, + "source": [ + "## 4.1 Plot two vectors that are linearly dependent and two vectors that are linearly independent (bonus points if done in $\\mathbb{R}^3$)." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GzckBwQKmd-u", + "outputId": "76ac3130-3982-4384-83fc-6220ff5f9145", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 499 + } + }, + "source": [ + "# Linearly dependent vectors\n", + "V = np.array([3, 2])\n", + "V2 = np.multiply(3, V)\n", + "\n", + "plt.style.use('seaborn-bright')\n", + "fig, ax = plt.subplots(figsize=(12, 12))\n", + "ax.set_title(\"Linearly Dependent Vectors\")\n", + "ax.set_xlim(-4, 4)\n", + "ax.set_ylim(-4, 4)\n", + "ax.arrow(0,0, V[0],V[1], color='b')\n", + "ax.arrow(0,0, V2[0],V2[1], color='r')\n", + "plt.show()" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Rl9o6xh8oF2K", + "outputId": "9b7e5e05-0f66-4e35-f9dd-c575ee202413", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 499 + } + }, + "source": [ + "# Linearly Independent Vectors\n", + "V = np.array([-2, 4])\n", + "V2 = np.array([-2, 2])\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 12))\n", + "ax.set_title(\"Linearly Independent Vectors\")\n", + "ax.set_xlim(-4, 4)\n", + "ax.set_ylim(-4, 4)\n", + "ax.arrow(0,0, V[0],V[1], color='b')\n", + "ax.arrow(0,0, V2[0],V2[1], color='r')\n", + "plt.show()" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TrJ0MT_n3SvO" + }, + "source": [ + "# Span" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "86iXLzwM2z8l" + }, + "source": [ + "## 5.1 What is the span of the following vectors?\n", + "\n", + "\\begin{align}\n", + "g = \\begin{bmatrix} 1 & 2 \\end{bmatrix}\n", + "\\qquad\n", + "h = \\begin{bmatrix} 4 & 8 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "G2LK2RWL39Q4", + "outputId": "61677548-5e06-41d2-92e6-ffa985c04c81", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140 + } + }, + "source": [ + "g = np.array([1, 2])\n", + "h = np.array([4, 8])\n", + "matrix_gh = np.array([g, h])\n", + "\n", + "print(f\"\"\"\n", + "Matrix gh = {matrix_gh}\n", + "\n", + "h = 4g; g and h are linearly dependent.\n", + "span(gh) = line (1, 2), (4, 8)\n", + "\"\"\")" + ], + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Matrix gh = [[1 2]\n", + " [4 8]]\n", + "\n", + "h = 4g; g and h are linearly dependent.\n", + "span(gh) = line (1, 2), (4, 8)\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l1deylUj4IHH" + }, + "source": [ + "## 5.2 What is the span of $\\{l, m, n\\}$?\n", + "\n", + "\\begin{align}\n", + "l = \\begin{bmatrix} 1 & 2 & 3 \\end{bmatrix}\n", + "\\qquad\n", + "m = \\begin{bmatrix} -1 & 0 & 7 \\end{bmatrix}\n", + "\\qquad\n", + "n = \\begin{bmatrix} 4 & 8 & 2\\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "p1i_ueD25ZcP", + "outputId": "056671d1-a160-4a4c-a4c8-990ab6273d19", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 157 + } + }, + "source": [ + "l = np.array([1, 2, 3])\n", + "m = np.array([-1, 0, 7])\n", + "n = np.array([4, 8, 2])\n", + "matrix_lmn = np.array([l, m, n])\n", + "\n", + 
"print(f\"\"\"Matrix LMN\n", + "{matrix_lmn}\n", + "\n", + "det(lmn) = {np.linalg.det(matrix_lmn)}\n", + "span(lmn) = 3 dimensional space\n", + "\"\"\")" + ], + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Matrix LMN\n", + "[[ 1 2 3]\n", + " [-1 0 7]\n", + " [ 4 8 2]]\n", + "\n", + "det(lmn) = -19.999999999999996\n", + "span(lmn) = 3 dimensional space\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IBqe7X1732kX" + }, + "source": [ + "# Basis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YeUZVHRM6PpT" + }, + "source": [ + "## 6.1 Graph two vectors that form a basis for $\\mathbb{R}^2$\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "utvF3Pkt8NP6", + "outputId": "bb5ebfee-ee03-4016-af65-4c4a4cc11edb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 281 + } + }, + "source": [ + "i_hat = np.array([1, 0])\n", + "j_hat = np.array([0, 1])\n", + "\n", + "plt.title(\"Basis Vectors for R2\")\n", + "plt.xlim(-2, 2)\n", + "plt.ylim(-2, 2)\n", + "plt.arrow(0,0, i_hat[0],i_hat[1], linewidth=1, color='green')\n", + "plt.arrow(0,0, j_hat[0],j_hat[1], linewidth=1, color='orange')\n", + "plt.show()" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "20yPFBDUxxnS" + }, + "source": [ + "## 6.2 What does it mean to form a basis?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3w4tEayT8M0o" + }, + "source": [ + "A set of vectors can form the basis of a vector space if they are linearly independent and can span the whole vector space by being scaled." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EHmUxbcY6vD3" + }, + "source": [ + "# Rank" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IpJwt9kw6v8U" + }, + "source": [ + "## 7.1 What is the Rank of P?\n", + "\n", + "\\begin{align}\n", + "P = \\begin{bmatrix} \n", + "1 & 2 & 3 \\\\\n", + " -1 & 0 & 7 \\\\\n", + "4 & 8 & 2\n", + "\\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Yezj1zWbELO3", + "outputId": "8bb3ba25-19a9-4b71-e060-22eab8bb1243", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "p = np.array([\n", + " [1, -1, 4],\n", + " [2, 0, 8],\n", + " [3, 7, 2]\n", + " ])\n", + "p_rank = np.linalg.matrix_rank(p)\n", + "\n", + "print(f\"\"\"\n", + "rank(p) = {p_rank}\n", + "\"\"\")" + ], + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "rank(p) = 3\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jGqFMBYY7mHD" + }, + "source": [ + "## 7.2 What does the rank of a matrix tell us?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vjg1IiCD8nnP" + }, + "source": [ + "The rank of a matrix is the number of dimensions that the matrix vectors are able to span. The rank is never less than the smallest possible dimensionality of the matrix, but it may be less than the column space (number of columns.) 
If this is the case, it means that 1 or more of the column vectors are linearly dependent, and the dimensionality of the matrix is collapsed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0Db2sc_V8QD6" + }, + "source": [ + "# Linear Projections\n", + "\n", + "## 8.1 Line $L$ is formed by all of the vectors that can be created by scaling vector $v$ \n", + "\\begin{align}\n", + "v = \\begin{bmatrix} 1 & 3 \\end{bmatrix}\n", + "\\end{align}\n", + "\n", + "\\begin{align}\n", + "w = \\begin{bmatrix} -1 & 2 \\end{bmatrix}\n", + "\\end{align}\n", + "\n", + "## find $proj_{L}(w)$\n", + "\n", + "## graph your projected vector to check your work (make sure your axes are square/even)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hp5z2WTBCNKx", + "outputId": "27e741bc-d4d1-434d-f983-a3c52f064fdd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "v = np.array([\n", + " [1, 3]\n", + "])\n", + "w = np.array([\n", + " [-1, 2]\n", + "])\n", + "# proj_L(w) = ((w . v) / (v . v)) * v -- the component of w that lies along v.\n", + "# Subtracting this from w would instead give the part of w orthogonal to L.\n", + "proj_lw = (np.vdot(w, v) / np.vdot(v, v)) * v\n", + "\n", + "print(f\"\"\"\n", + "proj l(w) = {proj_lw}\n", + "\"\"\")" + ], + "execution_count": 45, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "proj l(w) = [[0.5 1.5]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TKkrPwRM-Oar" + }, + "source": [ + "# Stretch Goal\n", + "\n", + "## For vectors that begin at the origin, the coordinates of where the vector ends can be interpreted as regular data points. (See 3Blue1Brown videos about Spans, Basis, etc.)\n", + "\n", + "## Write a function that can calculate the linear projection of each point (x,y) (vector) onto the line y=x. run the function and plot the original points in blue and the new projected points on the line y=x in red. 
\n", + "\n", + "## For extra points plot the orthogonal vectors as a dashed line from the original blue points to the projected red points." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "cp52kZra-ykj", + "outputId": "b790b243-626b-4008-acc7-df8013af2499", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Creating a dataframe for you to work with -Feel free to not use the dataframe if you don't want to.\n", + "x_values = [1, 4, 7, 3, 9, 4, 5 ]\n", + "y_values = [4, 2, 5, 0, 8, 2, 8]\n", + "\n", + "data = {\"x\": x_values, \"y\": y_values}\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "df.head()\n", + "\n", + "plt.scatter(df.x, df.y)\n", + "plt.show()" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAPjklEQVR4nO3dbYylZX3H8e+PXVwYfMDIQBCE5UVD2pBU6An1qUTANVAtNk0TMGuTmpJpo7VguzEqL4wveNGEqH3RmkwWHxpHjPKQqqGUScSqSUVnAcvDYqMI6yKyx1hF3GYR+PfFOeM+OMuc454z97U7308yuee+zj33/rI585sz132fuVJVSJLadVzXASRJL8yilqTGWdSS1DiLWpIaZ1FLUuM2TuOkp5xySm3evHkap5akY9KOHTt+UlWzKz02laLevHkzS0tL0zi1JB2Tkjx2uMec+pCkxlnUktQ4i1qSGmdRS1LjLGpJatxIRZ3kvUkeTPJAkpuSnDDtYNIkLSzC5ivhuIsH24XFrhPpWDLt59eqRZ3kDODvgF5VnQdsAK6abAxpehYWYe4GeOxJqBps526wrDUZa/H8GnXqYyNwYpKNwAzwo8lFkKbruu2wd9/BY3v3DcalI7UWz69Vi7qqHgduAHYBTwA/r6o7Dz0uyVySpSRL/X5/cgmlI7Rrz3jj0jjW4vk1ytTHy4G3AecArwROSvKOQ4+rqvmq6lVVb3Z2xXdBSp0469TxxqVxrMXza5SpjzcBP6iqflX9CrgVeN3kIkjTdf3VMLPp4LGZTYNx6UitxfNrlKLeBbwmyUySAJcCOycXQZqurVtgfhucfRokg+38tsG4dKTW4vmVUdZMTPJh4ErgWeBe4Oqq2ne443u9XvlHmSRpdEl2VFVvpcdG+ut5VfUh4EMTTSVJGonvTJSkxlnUktQ4i1qSGmdRS1LjLGpJapxFLUmNs6glqXEWtSQ1zqKWpMZZ1JLUOItakhpnUUtS4yxqSWqcRS1JjbOoJalxFrUkNW6UxW3PTXLfAR9PJbl2LcJJkkZY4aWqvgu8GiDJBuBx4LYp55I
kDY079XEp8P2qemwaYSRJv2ncor4KuGmlB5LMJVlKstTv9488mSQJGKOok7wIuAL4wkqPV9V8VfWqqjc7OzupfJK07o3zivpy4J6qenJaYSRJv2mcon47h5n2kCRNz0hFneQkYAtw63TjSJIOterteQBV9UvgFVPOIklage9MlKTGWdSS1DiLWpIaZ1FLUuMsaklqnEUtSY2zqCWpcRa1JDXOopakxlnUktQ4i1qSGmdRS1LjLGpJapxFLUmNs6glqXEWtSQ1btQVXk5OcnOSh5PsTPLaaQeTJA2MtMIL8E/AHVX158PVyGemmEmSdIBVizrJy4CLgL8EqKpngGemG0uStGyUqY9zgD7wyST3Jtk+XOz2IEnmkiwlWer3+xMPKknr1ShFvRG4APh4VZ0P/BJ4/6EHVdV8VfWqqjc7OzvhmJK0fo1S1LuB3VV193D/ZgbFLUlaA6sWdVX9GPhhknOHQ5cCD001lSTp10a96+M9wMLwjo9HgHdOL5Ik6UAjFXVV3Qf0ppxFkrQC35koSY2zqCWpcRa1JDXOopakxlnUktQ4i1qSGmdRS1LjLGpJapxFLUmNs6glqXEWtSQ1zqKWpMZZ1JLUOItakhpnUUtS40Yq6iSPJrk/yX1JlqYdSlK3FhZh85Vw3MWD7cJi14nWt1FXeAG4uKp+MrUkkpqwsAhzN8DefYP9x54c7ANs3dJdrvXMqQ9JB7lu+/6SXrZ332Bc3Ri1qAu4M8mOJHMrHZBkLslSkqV+vz+5hJLW1K49441r+kYt6jdU1QXA5cC7k1x06AFVNV9Vvarqzc7OTjSkpLVz1qnjjWv6Rirqqnp8uN0D3AZcOM1Qkrpz/dUws+ngsZlNg3F1Y9WiTnJSkpcsfw68GXhg2sEkdWPrFpjfBmefBslgO7/NC4ldGuWuj9OA25IsH//ZqrpjqqkkdWrrFou5JasWdVU9Avz+GmSRJK3A2/MkqXEWtSQ1zqKWpMZZ1JLUOItakhpnUUtS4yxqSWqcRS1JjbOoJalxFrUkNc6ilqTGWdSS1DiLWpIaZ1FLUuMsaklqnEUtSY0buaiTbEhyb5IvTyPIwiJsvhKOu3iwXVicxr8iSUefUZbiWnYNsBN46aRDLCzC3A2wd99g/7EnB/vgckCSNNIr6iRnAm8Btk8jxHXb95f0sr37BuOStN6NOvXxMeB9wPOHOyDJXJKlJEv9fn+sELv2jDcuSevJqkWd5K3Anqra8ULHVdV8VfWqqjc7OztWiLNOHW9cktaTUV5Rvx64IsmjwOeAS5J8ZpIhrr8aZjYdPDazaTAuSevdqkVdVR+oqjOrajNwFfCVqnrHJENs3QLz2+Ds0yAZbOe3eSFRkmC8uz6mausWi1mSVjJWUVfVV4GvTiWJJGlFvjNRkhpnUUtS4yxqSWqcRS1JjbOoJalxFrUkNc6ilqTGWdSS1DiLWpIaZ1FLUuMsaklqnEUtSY2zqCWpcRa1JDXOopakxlnUktS4URa3PSHJt5J8J8mDST68FsEkSQOjrPCyD7ikqp5OcjzwjST/XlXfnHI2SRIjFHVVFfD0cPf44UdNM5Qkab+R5qiTbEhyH7AHWKyqu1c4Zi7JUpKlfr8/6ZyStG6NVNRV9VxVvRo4E7gwyXkrHDNfVb2q6s3Ozk46pyStW2Pd9VFVPwPuAi6bThxJ0qFGuetjNsnJw89PBLYAD087mCRpYJS7Pk4HPp1kA4Ni/3xVfXm6sSRJy0a56+O/gfPXIIskaQW+M1GSGmdRS1LjLGpJapxFLUmNs6glqXEWtSQ1zqKWpMZZ1JLUOItakhpnUUtS4yxqSWqcRS1JjbOoJalxFrUkNc6ilqTGjbLCy6uS3JXkoSQPJrlmLYLp6PSuj8LGSyBvHGzf9dGuE0lHv1FWeHkW+IequifJS4AdSRar6qEpZ9NR5l0fhY//2/79557fv/8v7+0mk3QsWPUVdVU9UVX3DD//BbATOGPawXT0mf/SeOOSRjPWHHW
SzQyW5bp7hcfmkiwlWer3+5NJp6PKc8+PNy5pNCMXdZIXA7cA11bVU4c+XlXzVdWrqt7s7OwkM+ooseEwz6bDjUsazUjfQkmOZ1DSC1V163Qj6Wg19yfjjUsazaoXE5MEuBHYWVUfmX4kHa2WLxjOf2kw3bHhuEFJeyFROjKpqhc+IHkD8HXgfmB5tvGDVXX74b6m1+vV0tLSxEJK0rEuyY6q6q302KqvqKvqG0AmnkqSNBIv80hS4yxqSWqcRS1JjbOoJalxFrUkNc6ilqTGWdSS1DiLWpIaZ1FLUuMsaklqnEUtSY2zqCWpcRa1JDXOopakxlnUktQ4i1qSGrdqUSf5RJI9SR5Yi0CSpION8or6U8BlU84hSTqMVYu6qr4G/HQNskiSVjCxOeokc0mWkiz1+/1JnVaS1r2JFXVVzVdVr6p6s7OzkzqtJK173vUhSY2zqCWpcaPcnncT8F/AuUl2J/mr6ceSJC3buNoBVfX2tQgiSVqZUx+S1DiLWpIaZ1FLUuMsaklqnEUtSY2zqCWpcRa1JDXOopakxlnUktQ4i1qSGmdRS1LjLGpJapxFLUmNs6glqXEWtSQ1zqKWpMaNVNRJLkvy3STfS/L+aYfS6hYWYfOVcNzFg+3CYteJJE3Lqiu8JNkA/DOwBdgNfDvJF6vqoWmH08oWFmHuBti7b7D/2JODfYCtW7rLJWk6RnlFfSHwvap6pKqeAT4HvG26sfRCrtu+v6SX7d03GJd07BmlqM8AfnjA/u7h2EGSzCVZSrLU7/cnlU8r2LVnvHFJR7eJXUysqvmq6lVVb3Z2dlKn1QrOOnW8cUlHt1GK+nHgVQfsnzkcU0euvxpmNh08NrNpMC7p2DNKUX8b+J0k5yR5EXAV8MXpxtIL2boF5rfB2adBMtjOb/NConSsWvWuj6p6NsnfAv8BbAA+UVUPTj2ZXtDWLRaztF6sWtQAVXU7cPuUs0iSVuA7EyWpcRa1JDXOopakxlnUktS4VNXkT5r0gcd+yy8/BfjJBONMirnGY67xmGs8x2Kus6tqxXcLTqWoj0SSparqdZ3jUOYaj7nGY67xrLdcTn1IUuMsaklqXItFPd91gMMw13jMNR5zjWdd5WpujlqSdLAWX1FLkg5gUUtS45op6iSfSLInyQNdZ1mW5FVJ7kryUJIHk1zTdSaAJCck+VaS7wxzfbjrTAdKsiHJvUm+3HWWAyV5NMn9Se5LstR1nmVJTk5yc5KHk+xM8toGMp07/H9a/ngqybVd5wJI8t7h8/6BJDclOaHrTABJrhlmenDS/1fNzFEnuQh4GvjXqjqv6zwASU4HTq+qe5K8BNgB/GnXC/smCXBSVT2d5HjgG8A1VfXNLnMtS/L3QA94aVW9tes8y5I8CvSqqqk3SiT5NPD1qto+/JvvM1X1s65zLRsucP048IdV9du+kW1SWc5g8Hz/var6vySfB26vqk91nOs8BuvJXgg8A9wB/E1VfW8S52/mFXVVfQ34adc5DlRVT1TVPcPPfwHsZIX1ItdaDTw93D1++NHET9wkZwJvAVxqdwRJXgZcBNwIUFXPtFTSQ5cC3++6pA+wETgxyUZgBvhRx3kAfhe4u6r2VtWzwH8CfzapkzdT1K1Lshk4H7i72yQDw+mF+4A9wGJVNZEL+BjwPuD5roOsoIA7k+xIMtd1mKFzgD7wyeF00fYkJ3Ud6hBXATd1HQKgqh4HbgB2AU8AP6+qO7tNBcADwB8leUWSGeCPOXgJwyNiUY8gyYuBW4Brq+qprvMAVNVzVfVqBmtYXjj81atTSd4K7KmqHV1nOYw3VNUFwOXAu4fTbV3bCFwAfLyqzgd+Cby/20j7DadirgC+0HUWgCQvB97G4AfcK4GTkryj21RQVTuBfwTuZDDtcR/w3KTOb1GvYjgHfAuwUFW3dp3nUMNfk+8CLus6C/B64IrhXPDngEuSfKbbSPsNX41RVXuA2xjMJ3ZtN7D7gN+IbmZQ3K24HLinqp7
sOsjQm4AfVFW/qn4F3Aq8ruNMAFTVjVX1B1V1EfC/wP9M6twW9QsYXrS7EdhZVR/pOs+yJLNJTh5+fiKwBXi421RQVR+oqjOrajODX5e/UlWdv9oBSHLS8IIww6mFNzP4dbVTVfVj4IdJzh0OXQp0erH6EG+nkWmPoV3Aa5LMDL8/L2Vw7ahzSU4dbs9iMD/92Umde6Q1E9dCkpuANwKnJNkNfKiqbuw2Fa8H/gK4fzgfDPDB4RqSXTod+PTwavxxwOerqqlb4Rp0GnDb4HubjcBnq+qObiP92nuAheE0wyPAOzvOA/z6B9oW4K+7zrKsqu5OcjNwD/AscC/tvJ38liSvAH4FvHuSF4WbuT1PkrQypz4kqXEWtSQ1zqKWpMZZ1JLUOItakhpnUUtS4yxqSWrc/wPp0Tgk3c+srAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EZTA6Tj6BGDb" + }, + "source": [ + "" + ], + "execution_count": 24, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/module3-dimensionality-reduction/Jacob_Torres_LS_DS_133_High_Dimensional_Data_Assignment.ipynb b/module3-dimensionality-reduction/Jacob_Torres_LS_DS_133_High_Dimensional_Data_Assignment.ipynb new file mode 100644 index 00000000..4fa5ad76 --- /dev/null +++ b/module3-dimensionality-reduction/Jacob_Torres_LS_DS_133_High_Dimensional_Data_Assignment.ipynb @@ -0,0 +1,2550 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Jacob_Torres_LS_DS_133_High_Dimensional_Data_Assignment.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7wMWCkE1RZpM" + }, + "source": [ + "# Vertical Line Test" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W0-g7aprRv2j" + }, + "source": [ + "## 1.1 Create two graphs, one that passes the vertical line test and one that does not." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-tPn7C1yChre" + }, + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "fIJhCtF6RW_U", + "outputId": "b5f5f300-63f3-497c-8d82-a5fec1cf1065", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 823 + } + }, + "source": [ + "# Function that passes the vertical line test\n", + "def f(x):\n", + " y = 3 *x -4\n", + " return y\n", + "\n", + "# Graph the function\n", + "x_vals = [-2, -1, 0, 1, 2]\n", + "y_vals = [f(x) for x in x_vals]\n", + "vertical_x = np.ones(13)\n", + "vertical_y = np.arange(-10, 3)\n", + "\n", + "plt.style.use('seaborn-bright')\n", + "fig, ax = plt.subplots(figsize=(12, 16))\n", + "ax.set_title(\"Passes the Vertical Line Test\")\n", + "ax.set_xlim(-3, 3)\n", + "ax.set_ylim(-10, 2)\n", + "ax.plot(x_vals, y_vals, '-', color='g', linewidth=1.5)\n", + "ax.plot(vertical_x, vertical_y, '.', color='k')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "za07xHPQJIGI", + "outputId": "9ee5ee50-fce1-4c43-cb9a-9bde94d14e88", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 607 + } + }, + "source": [ + "# Graph that doesn't pass the vertical line test\n", + "def g(y):\n", + " x = y **2\n", + " return x\n", + "\n", + "y_vals = np.arange(-3, 4)\n", + "x_vals = [g(y) for y in y_vals]\n", + "vertical_y = np.arange(-3, 4)\n", + "vertical_x = np.multiply(5, np.ones(7))\n", + "\n", + "plt.style.use('seaborn-bright')\n", + "fig, ax = plt.subplots(figsize=(8, 10))\n", + "ax.set_title(\"Doesn't Pass the Vertical Line Test\")\n", + "ax.set_xlim(0, 10)\n", + "ax.set_ylim(-4, 4)\n", + "ax.plot(x_vals, y_vals, '-', color='g', linewidth=1.5)\n", + "ax.plot(vertical_x, vertical_y, '.', color='k')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ckeTKqMgRy7g" + }, + "source": [ + "## 1.2 Why are graphs that don't pass the vertical line test not considered \"functions?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gtl20YeDR6x-" + }, + "source": [ + "When an equation fails to pass the vertical line test, it is a graphical representation of there being multiple possible outputs (y values) for at least one input (x value.) This means that the equation is inconsistent in its output depending on any given input, and can't be reliably used." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g21uN62xSKSk" + }, + "source": [ + "# Functions as Relations" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gwkcV-EMSMNd" + }, + "source": [ + "## 2.1 Which of the following relations are functions? Why?\n", + "\n", + "\\begin{align}\n", + "\\text{Relation 1: } \\{(1, 2), (3, 2), (1, 3)\\}\n", + "\\\\\n", + "\\text{Relation 2: } \\{(1, 3), (2, 3), (6, 7)\\}\n", + "\\\\\n", + "\\text{Relation 3: } \\{(9, 4), (2, 1), (9, 6)\\}\n", + "\\\\\n", + "\\text{Relation 4: } \\{(6, 2), (8, 3), (6, 4)\\}\n", + "\\\\\n", + "\\text{Relation 5: } \\{(2, 6), (2, 7), (2, 4)\\}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lPJ_c50EbIPM", + "outputId": "0e148aa4-b87b-4975-cda0-79648c3994a0", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "rel1 = np.array([\n", + " [1, 2],\n", + " [3, 2],\n", + " [1, 3]\n", + "])\n", + "rel2 = np.array([\n", + " [1, 3],\n", + " [2, 3],\n", + " [6, 7]\n", + "])\n", + "rel3 = np.array([\n", + " [9, 4],\n", + " [2, 1],\n", + " [9, 6]\n", + "])\n", + "rel4 = np.array([\n", + " [6, 2],\n", + " [8, 3],\n", + " [6, 4]\n", + "])\n", + "rel5 = np.array([\n", + " [2, 6],\n", + " [2, 7],\n", + " [2,4]\n", + "])\n", + "\n", + "print(f\"\"\"\n", + "Relation 1:\n", 
+ "{rel1}\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 2:\n", + "{rel2}\n", + "This is a function because all inputs have unique outputs.\n", + "\n", + "Relation 3:\n", + "{rel3}\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 4:\n", + "{rel4}\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 5:\n", + "{rel5}\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Relation 1:\n", + "[[1 2]\n", + " [3 2]\n", + " [1 3]]\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 2:\n", + "[[1 3]\n", + " [2 3]\n", + " [6 7]]\n", + "This is a function because all inputs have unique outputs.\n", + "\n", + "Relation 3:\n", + "[[9 4]\n", + " [2 1]\n", + " [9 6]]\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 4:\n", + "[[6 2]\n", + " [8 3]\n", + " [6 4]]\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n", + "Relation 5:\n", + "[[2 6]\n", + " [2 7]\n", + " [2 4]]\n", + "This is not a function because 1 input maps to more than 1 output.\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y0U30PrlTAAa" + }, + "source": [ + "# Functions as a mapping between dimensions\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pw-OU9qmT5Ua" + }, + "source": [ + "## 3.1 for the following functions what is the dimensionality of the domain (input) and codomain (range/output)?\n", + "\n", + "\\begin{align}\n", + "m(𝑥_1,𝑥_2,𝑥_3)=(x_1+x_2, x_1+x_3, x_2+x_3)\n", + "\\\\\n", + "n(𝑥_1,𝑥_2,𝑥_3,𝑥_4)=(x_2^2 + x_3, x_2x_4)\n", + "\\end{align}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k4tKHjdHUevC" + }, + 
"source": [ + "## 3.2 Do you think it's possible to create a function that maps from a lower dimensional space to a higher dimensional space? If so, provide an example." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dy1pJVh0S0P7" + }, + "source": [ + "I do not believe that this is possible, because a function can only map a single output to an input, and cannot add extra dimensionality on its own." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2nEWvwVyVWdW" + }, + "source": [ + "# Vector Transformations" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1n0-6FsYVcVk" + }, + "source": [ + "## 4.1 Plug the corresponding unit vectors into each function. Use the output vectors to create a transformation matrix.\n", + "\n", + "\\begin{align}\n", + "p(\\begin{bmatrix}x_1 \\\\ x_2 \\end{bmatrix}) = \\begin{bmatrix} x_1 + 3x_2 \\\\2 x_2 - x_1 \\\\ \\end{bmatrix}\n", + "\\\\\n", + "\\\\\n", + "q(\\begin{bmatrix}x_1 \\\\ x_2 \\\\ x_3\\end{bmatrix}) = \\begin{bmatrix} 4x_1 + x_2 + 2x_3 \\\\2 x_2 - x_1 + 3x_3 \\\\ 5x_1 - 2x_3 + x_2 \\end{bmatrix}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "CNmFtdKmsSPW", + "outputId": "921d50ee-4e19-49ea-dd2b-ab760c4bd2f2", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# Unit vectors\n", + "i_hat = np.array([1, 0, 0])\n", + "j_hat = np.array([0, 1, 0])\n", + "k_hat = np.array([0, 0, 1])\n", + "\n", + "# Functions\n", + "def p(x1, x2):\n", + " y1 = x1 +3 *x2\n", + " y2 = 2 *x2 -x1\n", + " return np.array([y1, y2])\n", + "\n", + "def q(x1, x2, x3):\n", + " y1 = 4 *x1 +x2 +2 *x3\n", + " y2 = 2 *x2 -x1 +3 *x3\n", + " y3 = 5 *x1 -2 *x3 +x2\n", + " return np.array([y1, y2, y3])\n", + "\n", + "# Output matrices\n", + "p_matrix = p(i_hat, j_hat)\n", + "q_matrix = q(i_hat, j_hat, k_hat)\n", + "\n", + "print(f\"\"\"\n", + "p = {p_matrix}\n", + "q = {q_matrix}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + 
"output_type": "stream", + "text": [ + "\n", + "p = [[ 1 3 0]\n", + " [-1 2 0]]\n", + "q = [[ 4 1 2]\n", + " [-1 2 3]\n", + " [ 5 1 -2]]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2Byn7MZT9wpl", + "outputId": "d343d8fc-91bc-4e95-ca91-dfb047b6cd4f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# Composite transformation matrix\n", + "t = np.matmul(p_matrix, q_matrix)\n", + "print(f\"t = {t}\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "t = [[ 1 7 11]\n", + " [-6 3 4]]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n5HUOQIxZ2gp" + }, + "source": [ + "## 4.2 Verify that your transformation matrices are correct by choosing an input matrix and calculating the result both via the traditional functions above and also via vector-matrix multiplication." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0UANR1IEaVWE", + "outputId": "57ff2b9e-1335-4645-8219-5ede3d3ee0ee", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# Verify transformation matrix\n", + "v = np.array([\n", + " [1, 2, 4], [3, 6, 9]\n", + "])\n", + "v_pq = np.matmul(np.matmul(v.T, p_matrix), q_matrix)\n", + "v_t = np.matmul(v.T, t)\n", + " \n", + "print(f\"\"\"Verifying the transformation matrix:\n", + " Test matrix = {v}\n", + "\n", + " Result of output matrices: {v_pq}\n", + "\n", + " Result of transformation matrix: {v_t}\n", + "\n", + " The results are the same.\n", + " \"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Verifying the transformation matrix:\n", + " Test matrix = [[1 2 4]\n", + " [3 6 9]]\n", + "\n", + " Result of output matrices: [[-17 16 23]\n", + " [-34 32 46]\n", + " [-50 55 80]]\n", + "\n", + " Result of transformation matrix: [[-17 16 23]\n", + " [-34 32 46]\n", + " [-50 55 80]]\n", + "\n", + " The results 
are the same.\n", + " \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vEoiri3mak7j" + }, + "source": [ + "# Eigenvalues and Eigenvectors" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5HY0R4u7anIr" + }, + "source": [ + "## 5.1 In your own words, give an explanation for the intuition behind eigenvalues and eigenvectors.\n", + "\n", + "An eigenvector is a vector which maintains its orientation, even after a transformation is applied to it. An eigenvalue is the non-0 value by which a given eigenvector is scaled." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VdehMwBtatKI" + }, + "source": [ + "# The Curse of Dimensionality" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZiBJxsZla88c" + }, + "source": [ + "## 6.2 What is the rule of thumb for how many observations you should have compared to parameters in your model?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4oVrJax-a3SK" + }, + "source": [ + "## 6.1 What are some of the challenges of working with high dimensional spaces?\n", + "\n", + "- As the number of dimensions increases, the amount of useful information decreases.\n", + "- It's extremely difficult and impractical to visualize more than two dimensions of data.\n", + "- After a certain number of dimensions, the data points become spatially equidistant, so some important statistical calculations become impossible." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NZmqdeygbHJx" + }, + "source": [ + "# Principal Component Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7iEheetpbJdN" + }, + "source": [ + "## 7.1 Code for loading and cleaning the 2013 national dataset from the [Housing Affordability Data System (HADS)](https://www.huduser.gov/portal/datasets/hads/hads.html) --housing data, can be found below. 
\n", + "\n", + "## Perform PCA on the processed dataset `national_processed` (Make sure you standardize your data!) and then make a scatterplot of PC1 against PC2. Some of our discussion and work around PCA with this dataset will continue during tomorrow's lecture and assignment.\n", + "\n", + "Not only does this dataset have a decent number of columns to begin with (99), but in preparing the data for PCA we have also [one-hot-encoded](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f#targetText=One%20hot%20encoding%20is%20a,the%20entry%20in%20the%20dataset.) all of the categorical variables. This has the effect of creating a new column for each individual category of each categorical variable. After processing this dataset has 64738 columns. --Das a lot of columns.\n", + "\n", + "Don't worry too much about the mechanics of one-hot encoding right now, you will learn and experiment with a whole bunch of categorical encoding approaches in unit 2. \n", + "\n", + "The code below will read in the dataset and perform the one-hot encoding of the categorical variables. Start adding your PCA code at the bottom of the provided code." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Us4nNFs02HlD", + "outputId": "658501f1-fab8-4df0-e469-8f946eb2003a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 351 + } + }, + "source": [ + "from urllib.request import urlopen\n", + "from zipfile import ZipFile\n", + "from io import BytesIO\n", + "import os.path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Read National Data \n", + "national_url = 'https://www.huduser.gov/portal/datasets/hads/hads2013n_ASCII.zip'\n", + "national_file = 'thads2013n.txt'\n", + "\n", + "if os.path.exists(national_file):\n", + " national = pd.read_csv(national_file)\n", + "else: \n", + " z_national = urlopen(national_url)\n", + " zip_national = ZipFile(BytesIO(z_national.read())).extract(national_file)\n", + " national = pd.read_csv(zip_national)\n", + "\n", + "print(national.shape)\n", + "national.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(64535, 99)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CONTROLAGE1METRO3REGIONLMEDFMRL30L50L80IPOVBEDRMSBUILTSTATUSTYPEVALUEVACANCYTENURENUNITSROOMSWEIGHTPERZINC2ZADEQZSMHCSTRUCTURETYPEOWNRENTUTILITYOTHERCOSTCOST06COST12COST08COSTMEDTOTSALASSISTEDGLMEDGL30GL50GL80APLMEDABL30...COST08RELPOVCATCOST08RELFMRPCTCOST08RELFMRCATCOST12RELAMIPCTCOST12RELAMICATCOST12RELPOVPCTCOST12RELPOVCATCOST12RELFMRPCTCOST12RELFMRCATCOSTMedRELAMIPCTCOSTMedRELAMICATCOSTMedRELPOVPCTCOSTMedRELPOVCATCOSTMedRELFMRPCTCOSTMedRELFMRCATFMTZADEQFMTMETRO3FMTBUILTFMTSTRUCTURETYPEFMTBEDRMSFMTOWNRENTFMTCOST06RELPOVCATFMTCOST08RELPOVCATFMTCOST12RELPOVCATFMTCOSTMEDRELPOVCATFMTINCRELPOVCATFMTCOST06RELFMRCATFMTCOST08RELFMRCATFMTCOST12RELFMRCATFMTCOSTMEDRELFMRCATFMTINCRELFMRCATFMTCOST06RELAMICATFMTCOST08RELAMICATFMTCOST12RELAMICATFMTCOSTMEDRELAMICATFMTINCRELAMICATFMTASSISTEDFMTBURDENFMTREGIONFMTSTATUS
0'100003130103'82'3''1'737389561573826213403221106722006'1'140000-6'1'163117.394239118021'1'5331'1'169.000000213.750000648.588189803.050535696.905247615.1567120-97373815738262134032251616.620234.571429...472.898038248.4026352290.250487484.001102237.0776242222.339102464.3469362'1 Adequate''-5''2000-2009''1 Single Family''2 2BR''1 Owner''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''3 150-200% Poverty''2 50.1 - 100% FMR''2 50.1 - 100% FMR''2 50.1 - 100% FMR''2 50.1 - 100% FMR''1 LTE 50% FMR''2 30 - 50% AMI''2 30 - 50% AMI''2 30 - 50% AMI''2 30 - 50% AMI''2 30 - 50% AMI''.''2 30% to 50%''-5''-5'
1'100006110249'50'5''3'5584611001716528604457442421841980'1'1130000-6'1'162150.7255444122961'1'4871'1'245.33333358.3333331167.6407811669.6434051324.6712181058.988479123000-95584617165286044574455846.019911.400000...4120.4246563103.0940636275.7689994151.785764365.3884684174.909320396.2716802'1 Adequate''-5''1980-1989''1 Single Family''4 4BR+''1 Owner''3 150-200% Poverty''4 200%+ Poverty''4 200%+ Poverty''3 150-200% Poverty''4 200%+ Poverty''3 GT FMR''3 GT FMR''3 GT FMR''2 50.1 - 100% FMR''3 GT FMR''4 60 - 80% AMI''4 60 - 80% AMI''6 100 - 120% AMI''4 60 - 80% AMI''7 120% AMI +''.''1 Less than 30%''-5''-5'
2'100006370140'53'5''3'5584611001375022897366141547041985'1'1150000-6'1'172213.789404227974'1'14051'1'159.00000037.5000001193.3932091772.6270061374.5821751068.02516828000-95584613750228973661444676.819937.500000...4124.9620163109.4529056458.3392394161.147910365.9464494276.153890497.0931972'1 Adequate''-5''1980-1989''1 Single Family''4 4BR+''1 Owner''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''3 150-200% Poverty''3 GT FMR''3 GT FMR''3 GT FMR''2 50.1 - 100% FMR''2 50.1 - 100% FMR''4 60 - 80% AMI''5 80 - 100% AMI''6 100 - 120% AMI''4 60 - 80% AMI''4 60 - 80% AMI''.''3 50% or More''-5''-5'
3'100006520140'67'5''3'558469491375022897366141396431985'1'1200000-6'1'162364.585097232220'1'2791'1'179.00000070.6666671578.8576122351.1693411820.4429001411.7002240-95584613750228973661444676.817875.000000...4191.8274923161.9267097673.4945124247.752301397.2248015404.3827634148.7566103'1 Adequate''-5''1980-1989''1 Single Family''3 3BR''1 Owner''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''4 200%+ Poverty''3 GT FMR''3 GT FMR''3 GT FMR''3 GT FMR''2 50.1 - 100% FMR''6 100 - 120% AMI''7 120% AMI +''7 120% AMI +''5 80 - 100% AMI''4 60 - 80% AMI''.''1 Less than 30%''-5''-5'
4'100007130148'26'1''3'609917371480124628394211549221980'1'1-6-6'2'10042314.524902296874'1'7595'2'146.00000012.500000759.000000759.000000759.000000759.0000009690006099114801246283942148792.816651.125000...3102.985075355.3087073195.9721153102.985075355.3087073195.9721153102.9850753'1 Adequate''Central City''1980-1989''5 50+ units''2 2BR''2 Renter''3 150-200% Poverty''3 150-200% Poverty''3 150-200% Poverty''3 150-200% Poverty''4 200%+ Poverty''3 GT FMR''3 GT FMR''3 GT FMR''3 GT FMR''3 GT FMR''3 50 - 60% AMI''3 50 - 60% AMI''3 50 - 60% AMI''3 50 - 60% AMI''7 120% AMI +''0 Not Assisted''1 Less than 30%''-5''-5'
\n", + "

5 rows × 99 columns

\n", + "
" + ], + "text/plain": [ + " CONTROL AGE1 METRO3 ... FMTBURDEN FMTREGION FMTSTATUS\n", + "0 '100003130103' 82 '3' ... '2 30% to 50%' '-5' '-5'\n", + "1 '100006110249' 50 '5' ... '1 Less than 30%' '-5' '-5'\n", + "2 '100006370140' 53 '5' ... '3 50% or More' '-5' '-5'\n", + "3 '100006520140' 67 '5' ... '1 Less than 30%' '-5' '-5'\n", + "4 '100007130148' 26 '1' ... '1 Less than 30%' '-5' '-5'\n", + "\n", + "[5 rows x 99 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "O5zrMTud2qFU", + "outputId": "eab96302-82de-497a-a349-171f88926bf5", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# Look at datatypes\n", + "# a lot of object datatypes even though they seem to be strings of numbers.\n", + "national.dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "CONTROL object\n", + "AGE1 int64\n", + "METRO3 object\n", + "REGION object\n", + "LMED int64\n", + " ... 
\n", + "FMTINCRELAMICAT object\n", + "FMTASSISTED object\n", + "FMTBURDEN object\n", + "FMTREGION object\n", + "FMTSTATUS object\n", + "Length: 99, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yIrOjVmd2yLz", + "outputId": "d9774873-8c0b-418b-f3dd-7cb808d0095b", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# check for null values\n", + "national.isnull().sum().any()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2u9tdmwX3KL_", + "outputId": "80d70b05-5549-4fb3-866e-06e3cc1ff01c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# check for number of categorical vs numeric columns\n", + "cat_cols = national.columns[national.dtypes=='object']\n", + "num_cols = national.columns[national.dtypes!='object']\n", + "\n", + "print(f'{len(cat_cols)} categorical columns')\n", + "print(f'{len(num_cols)} numerical columns')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "32 categorical columns\n", + "67 numerical columns\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UMHOtOs_3gcL", + "outputId": "6e7b3bbe-fa4f-458a-f6b7-6b42960b9dd9", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# We're making a copy of our data in case we mess something up.\n", + "national_processed = national.copy()\n", + "\n", + "# Categorically Encode our Variables:\n", + "# They need to all be numeric before we do PCA.\n", + "# https://pbpython.com/categorical-encoding.html\n", + "\n", + "# Cast categorical columns to \"category\" data type\n", + "national_processed[cat_cols] = 
national_processed[cat_cols].astype('category')\n", + "\n", + "national_processed.dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "CONTROL category\n", + "AGE1 int64\n", + "METRO3 category\n", + "REGION category\n", + "LMED int64\n", + " ... \n", + "FMTINCRELAMICAT category\n", + "FMTASSISTED category\n", + "FMTBURDEN category\n", + "FMTREGION category\n", + "FMTSTATUS category\n", + "Length: 99, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ymxU-aPtQ1eq", + "outputId": "501ec0af-55bf-4f8b-a5dc-3ab5793e6e14", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 266 + } + }, + "source": [ + "# Replace all category cell values with their numeric category codes\n", + "for col in cat_cols:\n", + " national_processed[col] = national_processed[col].cat.codes\n", + "\n", + "print(national_processed.shape)\n", + "national_processed.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(64535, 99)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CONTROLAGE1METRO3REGIONLMEDFMRL30L50L80IPOVBEDRMSBUILTSTATUSTYPEVALUEVACANCYTENURENUNITSROOMSWEIGHTPERZINC2ZADEQZSMHCSTRUCTURETYPEOWNRENTUTILITYOTHERCOSTCOST06COST12COST08COSTMEDTOTSALASSISTEDGLMEDGL30GL50GL80APLMEDABL30...COST08RELPOVCATCOST08RELFMRPCTCOST08RELFMRCATCOST12RELAMIPCTCOST12RELAMICATCOST12RELPOVPCTCOST12RELPOVCATCOST12RELFMRPCTCOST12RELFMRCATCOSTMedRELAMIPCTCOSTMedRELAMICATCOSTMedRELPOVPCTCOSTMedRELPOVCATCOSTMedRELFMRPCTCOSTMedRELFMRCATFMTZADEQFMTMETRO3FMTBUILTFMTSTRUCTURETYPEFMTBEDRMSFMTOWNRENTFMTCOST06RELPOVCATFMTCOST08RELPOVCATFMTCOST12RELPOVCATFMTCOSTMEDRELPOVCATFMTINCRELPOVCATFMTCOST06RELFMRCATFMTCOST08RELFMRCATFMTCOST12RELFMRCATFMTCOSTMEDRELFMRCATFMTINCRELFMRCATFMTCOST06RELAMICATFMTCOST08RELAMICATFMTCOST12RELAMICATFMTCOSTMEDRELAMICATFMTINCRELAMICATFMTASSISTEDFMTBURDENFMTREGIONFMTSTATUS
0082207373895615738262134032211067220060140000-61163117.394239118021153310169.000000213.750000648.588189803.050535696.905247615.1567120-97373815738262134032251616.620234.571429...472.898038248.4026352290.250487484.001102237.0776242222.339102464.34693621051204444311111111120200
115042558461100171652860445744242184198001130000-61162150.7255444122961148710245.33333358.3333331167.6407811669.6434051324.6712181058.988479123000-95584617165286044574455846.019911.400000...4120.4246563103.0940636275.7689994151.785764365.3884684174.909320396.27168021031403443422213335370100
225342558461100137502289736614154704198501150000-61172213.7894042279741140510159.00000037.5000001193.3932091772.6270061374.5821751068.02516828000-95584613750228973661444676.819937.500000...4124.9620163109.4529056458.3392394161.147910365.9464494276.153890497.09319721031404444322212345340300
33674255846949137502289736614139643198501200000-61162364.585097232220127910179.00000070.6666671578.8576122351.1693411820.4429001411.7002240-95584613750228973661444676.817875.000000...4191.8274923161.9267097673.4945124247.752301397.2248015404.3827634148.75661031031304444422222566440100
44260260991737148012462839421154922198001-6-6210042314.524902296874175951146.00000012.500000759.000000759.000000759.000000759.0000009690006099114801246283942148792.816651.125000...3102.985075355.3087073195.9721153102.985075355.3087073195.9721153102.98507531135213333422223222271100
\n", + "

5 rows × 99 columns

\n", + "
" + ], + "text/plain": [ + " CONTROL AGE1 METRO3 REGION ... FMTASSISTED FMTBURDEN FMTREGION FMTSTATUS\n", + "0 0 82 2 0 ... 0 2 0 0\n", + "1 1 50 4 2 ... 0 1 0 0\n", + "2 2 53 4 2 ... 0 3 0 0\n", + "3 3 67 4 2 ... 0 1 0 0\n", + "4 4 26 0 2 ... 1 1 0 0\n", + "\n", + "[5 rows x 99 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6NS5AQpcQ8Je", + "outputId": "c137d54d-1fb7-4252-b868-4b8483abc826", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "# Now we only have numeric columns (ints and floats)\n", + "national_processed.dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "CONTROL int32\n", + "AGE1 int64\n", + "METRO3 int8\n", + "REGION int8\n", + "LMED int64\n", + " ... \n", + "FMTINCRELAMICAT int8\n", + "FMTASSISTED int8\n", + "FMTBURDEN int8\n", + "FMTREGION int8\n", + "FMTSTATUS int8\n", + "Length: 99, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WAJ8bjVcEwhA", + "outputId": "ec77ee6e-0bd1-4e93-d275-7a97519d1930", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 351 + } + }, + "source": [ + "# Standardize the data\n", + "scaler = StandardScaler()\n", + "z = scaler.fit_transform(national_processed)\n", + "\n", + "# Extract features\n", + "pca = PCA()\n", + "new_features = pca.fit_transform(z)\n", + "\n", + "# Turn the processed data into a dataframe\n", + "national_pca = pd.DataFrame(new_features)\n", + "print(national_pca.shape)\n", + "national_pca.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(64535, 99)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789101112131415161718192021222324252627282930313233343536373839...59606162636465666768697071727374757677787980818283848586878889909192939495969798
0-2.576570-1.376127-1.253835-0.6198821.592855-2.529629-1.0243842.0389710.743631-0.981357-0.958024-0.983633-0.1243202.130272-0.232511-1.259668-1.8112150.8002300.1574960.0578910.076233-0.3038510.2346310.3116921.4148470.6655750.2060750.297168-0.885343-1.1008120.017568-0.296827-0.0452260.328718-0.227059-0.2065670.1051830.209632-0.4529490.487132...-0.022755-0.0053800.0170970.0175090.007771-0.001771-0.0056350.0024380.000044-1.555674e-12-2.275710e-12-7.244823e-12-2.792285e-121.391321e-132.899610e-12-1.398466e-125.030034e-12-1.453494e-12-7.971580e-15-8.595370e-161.397316e-15-4.201700e-15-2.629575e-151.662437e-152.281487e-16-8.200807e-157.638178e-152.129641e-151.126562e-154.763201e-15-1.111901e-151.643459e-15-2.481185e-151.627337e-154.206682e-162.517816e-154.136666e-156.081237e-15-5.551242e-16-7.942743e-16
12.040643-0.988061-2.237999-2.9874032.1603201.2208751.076368-0.6813690.948055-0.182382-0.6903090.670541-0.3129500.1607160.063940-0.032742-1.310611-0.029583-0.320912-1.2132950.082541-0.438037-0.140961-0.4171530.129375-0.3732540.1624620.426850-0.2946721.1966200.0344420.739189-0.334534-0.349723-0.0511910.1070430.9526030.1994570.0227680.199969...-0.0654890.030766-0.021506-0.0141760.0100880.000578-0.010665-0.002009-0.0000831.985665e-111.697024e-11-1.820959e-11-4.569093e-12-8.943151e-12-1.002888e-113.043135e-12-1.867900e-12-7.525301e-12-1.236942e-13-2.811725e-14-2.859341e-15-1.245108e-146.380007e-15-3.271522e-146.700829e-143.538613e-14-3.039974e-14-1.604190e-14-5.531218e-157.315701e-15-9.227129e-15-2.709105e-15-3.293959e-154.693389e-15-3.784732e-157.904892e-15-1.024321e-14-7.866356e-15-6.322116e-16-2.619343e-16
21.2152700.370168-3.248606-2.1238991.229951-2.5599450.9842910.088488-0.209300-0.715159-0.2439961.168680-0.1361000.4912520.358669-0.462415-1.4945700.114951-0.160159-0.8894620.678651-0.0022220.162761-0.738769-0.8445790.332651-0.0146020.525549-0.1347050.832489-0.3668750.429854-0.639612-0.112777-0.1140590.2588930.9746110.1678550.177374-0.033658...-0.0371080.019828-0.0322030.0032940.0061820.000272-0.0083070.0002100.000053-3.011104e-11-3.448156e-11-2.696771e-12-1.342672e-113.055270e-11-1.853116e-116.526787e-121.171080e-112.918130e-111.978506e-14-6.917647e-15-8.690510e-153.585682e-15-4.658132e-15-8.145799e-15-4.673181e-16-5.639147e-15-2.337729e-15-5.838861e-152.205696e-15-2.530473e-15-4.543976e-16-6.528784e-15-2.303640e-15-2.500232e-152.893038e-152.086186e-151.598883e-153.318875e-158.989284e-16-3.788372e-17
32.8218371.723711-4.375157-1.4793430.085327-1.7550500.710440-0.0648430.487051-0.175074-0.9478110.273949-0.083829-0.606772-0.0587910.293943-1.3190741.225816-0.688633-0.437567-0.155818-0.9341890.0707810.0828820.259157-0.305200-0.1574190.136505-0.6786510.363593-0.4972620.4535850.0991060.302715-0.3191960.502472-0.0562090.0439860.2844940.152548...0.051131-0.0164790.032716-0.001735-0.006142-0.0000830.0032330.0003690.0000471.570527e-11-2.956002e-11-4.246039e-11-3.625665e-11-3.904595e-11-8.910011e-122.530238e-127.069996e-123.740412e-138.286938e-156.592696e-16-1.280837e-159.213680e-16-3.869037e-171.861092e-16-3.535306e-162.137629e-15-3.969571e-16-1.665115e-152.604798e-15-3.388406e-153.325411e-16-1.605029e-15-2.185016e-15-2.579443e-153.379958e-151.414309e-15-3.038426e-15-1.618005e-15-2.137277e-16-7.535854e-16
4-1.570106-0.878362-3.3070620.785881-2.3104003.881736-0.6459520.464725-0.0921470.4395020.6804271.297333-2.1765270.873973-1.2587290.078789-1.274484-1.278863-0.049121-1.107579-1.3157070.183007-0.463981-0.126541-0.632876-0.4903560.2270400.4487140.3620580.192815-0.0611160.4236520.0602510.0662550.0872480.049826-0.007892-0.222354-0.0177750.482700...0.0295080.017652-0.0079010.005932-0.0081800.0011590.0049480.003858-0.000114-1.210293e-12-7.386035e-131.444818e-12-7.904880e-13-9.223507e-14-6.033953e-135.752033e-14-3.907023e-138.020451e-13-3.283658e-135.266361e-143.634224e-151.832897e-144.621295e-14-3.589585e-14-1.776807e-14-2.737603e-14-8.598189e-15-3.794475e-14-1.916136e-14-8.228730e-15-6.056303e-15-1.204971e-14-1.099331e-147.764073e-15-3.602489e-159.824194e-153.711774e-155.388211e-15-7.548762e-16-5.272142e-17
\n", + "

5 rows × 99 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 ... 96 97 98\n", + "0 -2.576570 -1.376127 -1.253835 ... 6.081237e-15 -5.551242e-16 -7.942743e-16\n", + "1 2.040643 -0.988061 -2.237999 ... -7.866356e-15 -6.322116e-16 -2.619343e-16\n", + "2 1.215270 0.370168 -3.248606 ... 3.318875e-15 8.989284e-16 -3.788372e-17\n", + "3 2.821837 1.723711 -4.375157 ... -1.618005e-15 -2.137277e-16 -7.535854e-16\n", + "4 -1.570106 -0.878362 -3.307062 ... 5.388211e-15 -7.548762e-16 -5.272142e-17\n", + "\n", + "[5 rows x 99 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 22 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RQx7UvsucIrL" + }, + "source": [ + "# Stretch Goals\n", + "\n", + "## 1) Perform further data exploration on the HADS national dataset (the version before we one-hot encoded it). Make scatterplots and see if you can see any resemblance between the original scatterplots and the plot of the principal components that you made in 7.1. \n", + "\n", + "(You may or may not see very much resemblance depending on the variables you choose, and that's ok!)\n", + "\n", + "## 2) Study \"Scree Plots\" and then try and make one for your PCA dataset. How many principal components do you need to retain in order for your PCs to contain 90% of the explained variance? \n", + "\n", + "We will present this topic formally at the beginning of tomorrow's lecture, so if you figure this stretch goal out, you're ahead of the game. 
\n", + "\n", + "## 3) Explore further the intuition behind eigenvalues and eigenvectors by creating your very own eigenfaces:\n", + "\n", + "Prioritize self-study over this stretch goal if you are not semi-comfortable with the topics of PCA, Eigenvalues, and Eigenvectors.\n", + "\n", + "![Eigenfaces](https://i.pinimg.com/236x/1c/f1/01/1cf101a9859437a5d096a04b05be06b4--faces-tattoo.jpg)\n", + "\n", + "You don't necessarily have to use this resource, but this will get you started: \n", + "[Eigenface Tutorial](https://sandipanweb.wordpress.com/2018/01/06/eigenfaces-and-a-simple-face-detector-with-pca-svd-in-python/)" + ] + } + ] +} \ No newline at end of file diff --git a/module4-clustering/Jacob_Torres_LS_DS_134_Clustering_Assignment.ipynb b/module4-clustering/Jacob_Torres_LS_DS_134_Clustering_Assignment.ipynb new file mode 100644 index 00000000..e5b7cd31 --- /dev/null +++ b/module4-clustering/Jacob_Torres_LS_DS_134_Clustering_Assignment.ipynb @@ -0,0 +1,2570 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Jacob_Torres_LS_DS_134_Clustering_Assignment.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y-3rVFtGLMJM" + }, + "source": [ + "# K-Means Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_VS3FFSFLR3a" + }, + "source": [ + "# 1) Use the \"Breast Cancer Wisconsin (Diagnostic) Data Set\" from Kaggle to try and cluster types of cancer cells. 
\n", + "\n", + "Here's the original dataset for your reference:\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "899RK3bBn4OE" + }, + "source": [ + "## This is a supervised learning dataset\n", + "\n", + "(Because it has **labels** - The \"diagnosis\" column.)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ws5R9X6hLJQ2", + "outputId": "4778414d-a7a1-45fa-954f-e727910bffad", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 254 + } + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.decomposition import PCA # You don't necessarily have to use this\n", + "from sklearn.cluster import KMeans # You don't necessarily have to use this\n", + "from sklearn.preprocessing import StandardScaler # You don't necessarily have to use this\n", + "\n", + "df_original = pd.read_csv(\"https://raw.githubusercontent.com/ryanleeallred/datasets/master/Cancer_Cells.csv\")\n", + "print(df_original.shape)\n", + "df_original.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(569, 33)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_meanfractal_dimension_meanradius_setexture_seperimeter_searea_sesmoothness_secompactness_seconcavity_seconcave points_sesymmetry_sefractal_dimension_seradius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
0842302M17.9910.38122.801001.00.118400.277600.30010.147100.24190.078711.09500.90538.589153.400.0063990.049040.053730.015870.030030.00619325.3817.33184.602019.00.16220.66560.71190.26540.46010.11890NaN
1842517M20.5717.77132.901326.00.084740.078640.08690.070170.18120.056670.54350.73393.39874.080.0052250.013080.018600.013400.013890.00353224.9923.41158.801956.00.12380.18660.24160.18600.27500.08902NaN
284300903M19.6921.25130.001203.00.109600.159900.19740.127900.20690.059990.74560.78694.58594.030.0061500.040060.038320.020580.022500.00457123.5725.53152.501709.00.14440.42450.45040.24300.36130.08758NaN
384348301M11.4220.3877.58386.10.142500.283900.24140.105200.25970.097440.49561.15603.44527.230.0091100.074580.056610.018670.059630.00920814.9126.5098.87567.70.20980.86630.68690.25750.66380.17300NaN
484358402M20.2914.34135.101297.00.100300.132800.19800.104300.18090.058830.75720.78135.43894.440.0114900.024610.056880.018850.017560.00511522.5416.67152.201575.00.13740.20500.40000.16250.23640.07678NaN
\n", + "
" + ], + "text/plain": [ + " id diagnosis ... fractal_dimension_worst Unnamed: 32\n", + "0 842302 M ... 0.11890 NaN\n", + "1 842517 M ... 0.08902 NaN\n", + "2 84300903 M ... 0.08758 NaN\n", + "3 84348301 M ... 0.17300 NaN\n", + "4 84358402 M ... 0.07678 NaN\n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 250 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IHDDqaU-ove4" + }, + "source": [ + "## Now it's an unsupervised learning dataset\n", + "\n", + "(Because we've removed the diagnosis label) - Use this version." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "86MHoPJon_aC", + "outputId": "92b1badf-3389-4a71-edac-1b2457d5c4f5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 236 + } + }, + "source": [ + "df = df_original.drop(columns=['diagnosis', 'Unnamed: 32'])\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_meanfractal_dimension_meanradius_setexture_seperimeter_searea_sesmoothness_secompactness_seconcavity_seconcave points_sesymmetry_sefractal_dimension_seradius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
084230217.9910.38122.801001.00.118400.277600.30010.147100.24190.078711.09500.90538.589153.400.0063990.049040.053730.015870.030030.00619325.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
184251720.5717.77132.901326.00.084740.078640.08690.070170.18120.056670.54350.73393.39874.080.0052250.013080.018600.013400.013890.00353224.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
28430090319.6921.25130.001203.00.109600.159900.19740.127900.20690.059990.74560.78694.58594.030.0061500.040060.038320.020580.022500.00457123.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
38434830111.4220.3877.58386.10.142500.283900.24140.105200.25970.097440.49561.15603.44527.230.0091100.074580.056610.018670.059630.00920814.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
48435840220.2914.34135.101297.00.100300.132800.19800.104300.18090.058830.75720.78135.43894.440.0114900.024610.056880.018850.017560.00511522.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", + "
" + ], + "text/plain": [ + " id radius_mean ... symmetry_worst fractal_dimension_worst\n", + "0 842302 17.99 ... 0.4601 0.11890\n", + "1 842517 20.57 ... 0.2750 0.08902\n", + "2 84300903 19.69 ... 0.3613 0.08758\n", + "3 84348301 11.42 ... 0.6638 0.17300\n", + "4 84358402 20.29 ... 0.2364 0.07678\n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 251 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rskC80k3OKMA" + }, + "source": [ + "## Let's do it!\n", + "\n", + "- You might want to do some data exploration to see if you can find specific columns that will help you find distinct clusters of cells\n", + "- You might want to use the elbow method to decide on the number of clusters to use.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "U92Y3jNKPpjJ", + "outputId": "6bb313a9-9dfb-498d-df2b-136c10c2da9e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + } + }, + "source": [ + "df.dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "id int64\n", + "radius_mean float64\n", + "texture_mean float64\n", + "perimeter_mean float64\n", + "area_mean float64\n", + "smoothness_mean float64\n", + "compactness_mean float64\n", + "concavity_mean float64\n", + "concave points_mean float64\n", + "symmetry_mean float64\n", + "fractal_dimension_mean float64\n", + "radius_se float64\n", + "texture_se float64\n", + "perimeter_se float64\n", + "area_se float64\n", + "smoothness_se float64\n", + "compactness_se float64\n", + "concavity_se float64\n", + "concave points_se float64\n", + "symmetry_se float64\n", + "fractal_dimension_se float64\n", + "radius_worst float64\n", + "texture_worst float64\n", + "perimeter_worst float64\n", + "area_worst float64\n", + "smoothness_worst float64\n", + "compactness_worst float64\n", + "concavity_worst float64\n", + "concave points_worst float64\n", + "symmetry_worst 
float64\n", + "fractal_dimension_worst float64\n", + "dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 252 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EpsV4xHDlb3P", + "outputId": "cbd37d0d-da32-40f7-fa3c-8f063133725f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 327 + } + }, + "source": [ + "df = df.drop('id', axis=1)\n", + "df.describe()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
radius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_meansymmetry_meanfractal_dimension_meanradius_setexture_seperimeter_searea_sesmoothness_secompactness_seconcavity_seconcave points_sesymmetry_sefractal_dimension_seradius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
count569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000
mean14.12729219.28964991.969033654.8891040.0963600.1043410.0887990.0489190.1811620.0627980.4051721.2168532.86605940.3370790.0070410.0254780.0318940.0117960.0205420.00379516.26919025.677223107.261213880.5831280.1323690.2542650.2721880.1146060.2900760.083946
std3.5240494.30103624.298981351.9141290.0140640.0528130.0797200.0388030.0274140.0070600.2773130.5516482.02185545.4910060.0030030.0179080.0301860.0061700.0082660.0026464.8332426.14625833.602542569.3569930.0228320.1573360.2086240.0657320.0618670.018061
min6.9810009.71000043.790000143.5000000.0526300.0193800.0000000.0000000.1060000.0499600.1115000.3602000.7570006.8020000.0017130.0022520.0000000.0000000.0078820.0008957.93000012.02000050.410000185.2000000.0711700.0272900.0000000.0000000.1565000.055040
25%11.70000016.17000075.170000420.3000000.0863700.0649200.0295600.0203100.1619000.0577000.2324000.8339001.60600017.8500000.0051690.0130800.0150900.0076380.0151600.00224813.01000021.08000084.110000515.3000000.1166000.1472000.1145000.0649300.2504000.071460
50%13.37000018.84000086.240000551.1000000.0958700.0926300.0615400.0335000.1792000.0615400.3242001.1080002.28700024.5300000.0063800.0204500.0258900.0109300.0187300.00318714.97000025.41000097.660000686.5000000.1313000.2119000.2267000.0999300.2822000.080040
75%15.78000021.800000104.100000782.7000000.1053000.1304000.1307000.0740000.1957000.0661200.4789001.4740003.35700045.1900000.0081460.0324500.0420500.0147100.0234800.00455818.79000029.720000125.4000001084.0000000.1460000.3391000.3829000.1614000.3179000.092080
max28.11000039.280000188.5000002501.0000000.1634000.3454000.4268000.2012000.3040000.0974402.8730004.88500021.980000542.2000000.0311300.1354000.3960000.0527900.0789500.02984036.04000049.540000251.2000004254.0000000.2226001.0580001.2520000.2910000.6638000.207500
\n", + "
" + ], + "text/plain": [ + " radius_mean texture_mean ... symmetry_worst fractal_dimension_worst\n", + "count 569.000000 569.000000 ... 569.000000 569.000000\n", + "mean 14.127292 19.289649 ... 0.290076 0.083946\n", + "std 3.524049 4.301036 ... 0.061867 0.018061\n", + "min 6.981000 9.710000 ... 0.156500 0.055040\n", + "25% 11.700000 16.170000 ... 0.250400 0.071460\n", + "50% 13.370000 18.840000 ... 0.282200 0.080040\n", + "75% 15.780000 21.800000 ... 0.317900 0.092080\n", + "max 28.110000 39.280000 ... 0.663800 0.207500\n", + "\n", + "[8 rows x 30 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 253 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OUWrdtgbYyjy", + "outputId": "a635f544-3584-4da3-90b1-96834f752c8f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + } + }, + "source": [ + "# Subset the data for kmeans clustering\n", + "df = df.copy()[\n", + " ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean',\n", + " 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst']\n", + "]\n", + "df.describe()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
radius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanradius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worst
count569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000569.000000
mean14.12729219.28964991.969033654.8891040.0963600.10434116.26919025.677223107.261213880.5831280.1323690.254265
std3.5240494.30103624.298981351.9141290.0140640.0528134.8332426.14625833.602542569.3569930.0228320.157336
min6.9810009.71000043.790000143.5000000.0526300.0193807.93000012.02000050.410000185.2000000.0711700.027290
25%11.70000016.17000075.170000420.3000000.0863700.06492013.01000021.08000084.110000515.3000000.1166000.147200
50%13.37000018.84000086.240000551.1000000.0958700.09263014.97000025.41000097.660000686.5000000.1313000.211900
75%15.78000021.800000104.100000782.7000000.1053000.13040018.79000029.720000125.4000001084.0000000.1460000.339100
max28.11000039.280000188.5000002501.0000000.1634000.34540036.04000049.540000251.2000004254.0000000.2226001.058000
\n", + "
" + ], + "text/plain": [ + " radius_mean texture_mean ... smoothness_worst compactness_worst\n", + "count 569.000000 569.000000 ... 569.000000 569.000000\n", + "mean 14.127292 19.289649 ... 0.132369 0.254265\n", + "std 3.524049 4.301036 ... 0.022832 0.157336\n", + "min 6.981000 9.710000 ... 0.071170 0.027290\n", + "25% 11.700000 16.170000 ... 0.116600 0.147200\n", + "50% 13.370000 18.840000 ... 0.131300 0.211900\n", + "75% 15.780000 21.800000 ... 0.146000 0.339100\n", + "max 28.110000 39.280000 ... 0.222600 1.058000\n", + "\n", + "[8 rows x 12 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 254 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Lm2lCM9Z3n-8", + "outputId": "7a7b1dcd-11c6-4c62-fa13-6e285b0181fe", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 316 + } + }, + "source": [ + "# Find the best value for k:\n", + "sum_of_squared_distances = []\n", + "K = range(1, 13)\n", + "for k in K:\n", + " km = KMeans(n_clusters=k)\n", + " km = km.fit(df)\n", + " sum_of_squared_distances.append(km.inertia_)\n", + "\n", + "# Print the inertia values\n", + "print(sum_of_squared_distances)\n", + "\n", + "# Plot the elbow graph\n", + "plt.xlabel('Number of clusters (K)')\n", + "plt.ylabel('Sum of squared distances (in millions)')\n", + "plt.xticks(ticks=range(1, 13))\n", + "plt.yticks(ticks=range(0, 350, 50))\n", + "plt.plot(K, sum_of_squared_distances, 'bx-')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[255499233.1336417, 77282652.76919657, 46878484.46069864, 28781127.702141505, 20180353.514803693, 16176871.156065524, 12859545.712581772, 10952676.698887236, 9164492.365829851, 8167911.523061521, 7315384.09659356, 6536910.992331707]\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "g2c9yWOyIb-G", + "outputId": "7223c5cb-44c7-493b-cf75-6367599a49a8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "# Audio graph imports\n", + "!pip install sonipy\n", + "from sonipy.sonify import SonifyTool" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: sonipy in /usr/local/lib/python3.6/dist-packages (1.0)\n", + "Requirement already satisfied: pydub in /usr/local/lib/python3.6/dist-packages (from sonipy) (0.24.1)\n", + "Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages (from sonipy) (0.11.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from sonipy) (1.4.1)\n", + "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.6/dist-packages (from sonipy) (1.18.5)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from sonipy) (3.2.2)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from sonipy) (1.1.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->sonipy) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->sonipy) (1.2.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->sonipy) (2.4.7)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->sonipy) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->sonipy) (2018.9)\n", + "Requirement already satisfied: six in 
/usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib->sonipy) (1.15.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1G7AlDcZIm89" + }, + "source": [ + "# Audio graph\n", + "duration_args = {'time_total': 3000}\n", + "\n", + "# Uncomment the line below to play the audio graph\n", + "#SonifyTool(x=K, y=sum_of_squared_distances, duration_args=duration_args).play()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "TjpOhJjLHuCQ", + "outputId": "15fe2709-d415-48d1-fe78-ad09d81b218e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "# Perform kmeans clustering with k=2\n", + "kmeans = KMeans(n_clusters=2)\n", + "kmeans.fit(df)\n", + "df['clusters'] = kmeans.labels_\n", + "df['clusters'].value_counts()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 439\n", + "0 130\n", + "Name: clusters, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 258 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I4gKoEpZpOCR", + "outputId": "4b87f9da-e9ae-4aff-dbd3-9c855ff782bb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "# Visualize the clusters\n", + "plt.scatter(x=df['radius_mean'], y=df['radius_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "n4hhaq_yk2l2", + "outputId": "793f0c43-73d4-438b-c422-95576863371c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "plt.scatter(x=df['texture_mean'], y=df['texture_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IHktbyTVrXEW", + "outputId": "a5e48808-88bf-4dec-f86d-e413e7189a97", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "plt.scatter(x=df['area_mean'], y=df['area_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uC-XZp_7j_3k", + "outputId": "7e8151f8-c9c9-477c-98c3-f8f770abca03", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "plt.scatter(x=df['perimeter_mean'], y=df['perimeter_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nOzdd3hU1dbA4d+aPpNCKAERaUoRQaRj99oLFvBaUK8VBSxYrhXUK7YrduyKnasifCKKXSyogFQFBFSKIIKUUNKTqfv7YwZIMjOpk0wS1vs8eTKzT1szkDVn9llnbzHGoJRSqnGxJDsApZRSiafJXSmlGiFN7kop1QhpcldKqUZIk7tSSjVCtmQHANCiRQvToUOHZIehlFINyqJFi7YZYzJjLasXyb1Dhw4sXLgw2WEopVSDIiJ/xlum3TJKKdUIaXJXSqlGSJO7Uko1QprclVKqEdLkrpRSdWTnlmyWfr+CrA3ba/1Y9aJaRimlGrNgMMhTIyfw1Vs/4HDZ8Xn9HDqoD3f873ocLketHLPCM3cRaSsi34rIChFZLiI3RNrHishGEVkc+TmtxDajRWS1iPwuIifXSuRKKdVAvDvuA76ZNAu/109BTiH+Yj/zPv2JCbf+r9aOWZlumQBwszHmIOBQ4FoROSiy7EljTK/Iz6cAkWVDge7AKcDzImKthdiVUqpB+PDZz/AW+kq1+Yr8fPbaN4RCoVo5ZoXJ3RizyRjzU+RxHvAr0KacTc4C3jXGeI0xa4HVwIBEBKuUUg1RQU5RzHa/108wEKyVY1bpgqqIdAB6A/MiTdeJyFIReU1Emkba2gB/ldhsAzE+DERkuIgsFJGFWVlZVQ5cKaUaiu5HdI3Z3qF7W+wOe60cs9LJXURSganAjcaYXOAF4ACgF7AJeLwqBzbGTDDG9DPG9MvMjDk0glJKNQpXP3Ep7jQXVlu4h9piteD0OLn++atq7ZiVqpYRETvhxP62MeZ9AGPMlhLLXwY+jjzdCLQtsfl+kTallNordTy4PS8tfoz3Hv+I3+avpuPB7Tj3ljNp322/WjtmhcldRAR4FfjVGPNEifbWxphNkadDgGWRx9OBd0TkCWBfoDMwP6FRK6VUA9O6YytGPXtlnR2vMmfuRwAXA7+IyOJI2xjgAhHpBRhgHTACwBizXESmACsIV9pca4ypnSsGSimlYqowuRtjZgESY9Gn5WzzIPBgDeJSSilVAzr8gFJKNUKa3JVSqhHS5K6UUo2QJnellGqENLkrpVQjpMldKaUaIU3uSinVCGlyV0qpRkiTu1JKNUKa3JVSqhHS5K6UUo2QTpCtlNpr+Yp9zJj4HT9MnUtqs1TOvPpkeh59UMUbNgCa3JVSeyWf18+NR93N+l834i30IgJzP1rExfecy/m3npXs8GpMu2WUUnulb96ZxV+/hRM7gDHgLfQy8Z7J5G7PS3J0NafJXSm1V5o9bR7FB
d6odpvDxrJZvyUhosTS5K6U2is1yUxHLNFTVRhjSG2akoSIEqvC5C4ibUXkWxFZISLLReSGSPujIvKbiCwVkWkikhFp7yAiRSKyOPLzYm2/CKWUqqozRp6Ew2kv1SYCnnQPPY48MElRJU5lztwDwM3GmIOAQ4FrReQgYAbQwxjTE1gJjC6xzRpjTK/Iz8iER62UUjXUtX8nRj5xKU63A0+6G3eaixb7NefhL+/GYmn4nRqVmWZvE7Ap8jhPRH4F2hhjviyx2lzgnNoJUSmlasfpI07i2AuOZMWPK0lJd3PgwM6NIrFDFUshRaQD0BuYV2bRFcDkEs87isjPQC5wlzHmhxj7Gg4MB2jXrl1VwlBKqYRJSffQ/+ReNdrH2mXr2bhqEx0PbkebTq0TFFnNVDq5i0gqMBW40RiTW6L9TsJdN29HmjYB7Ywx20WkL/CBiHQvuQ2AMWYCMAGgX79+pmYvQyml6l5hXhF3nf4QKxetwWqzEvAFGHBqH8ZMugG7w17xDmpRpb5/iIidcGJ/2xjzfon2y4DTgYuMMQbAGOM1xmyPPF4ErAG6JDhupZRKumeue4Xf5q/CW+ijMLcIX7Gf+Z//zDsPvl/xxrWsMtUyArwK/GqMeaJE+ynAbcCZxpjCEu2ZImKNPN4f6Az8kejAlVIqmYKBIN9NnoPfGyjV7ivy8fFLM5IU1R6V6ZY5ArgY+EVEFkfaxgBPA05gRjj/MzdSGXM0cJ+I+IEQMNIYsyPhkSulVBIFA0GCwVDMZcUFxXUcTbTKVMvMAqIr/eHTOOtPJdyFo5RSjZbD5WD/nu1Z/fPaUu1iEfqc0DNJUe3ROGp+lFIqCW6aMAJ3qgubI3ye7HDZSc1IYcRjlyQ5Mh0VUqkGKxgMMueDBcyaNo+UJh5OHXY8nfvsn+yw9ipd+h7AK8uf5MNnP2ftL39y0GFdOH3kSWRkNkl2aEikyCWp+vXrZxYuXJjsMJRqMILBIGNO+y8r5qykuKAYi0Wwu+xcOe5fDL7u1GSHp+qIiCwyxvSLtUy7ZZRqgGZPm8+KOb/vvnAXChm8hT4m3Pa/pA1Xm7sjj/W/bcTn9Sfl+BXxef3M/XgR3747m51bc5IdTq3TbhmlGqDv35sbc7hau93GkpnLOeqfh9ZZLMWFXh4b9jxzPliAzWEFA1f89wIGX3dancVQkV/nrWLMaQ8SCoQwGIL+IJfeN5Tzbjkz2aHVGj1zV6oB8qS7Yw5Xi4ArxVmnsTxx1Yv8+OEC/F4/RXnFFOUX88od7zDnwwV1Gkc8fp+fOwf9l/ydBRTmFVGUV4yv2M/EsZNZMXdlssOrNZrclWqATrvyeByu6NvbrTYrvY7rUWdx5GcXMOv9ufiKS3fFeAu9TBo3rc7iKM/ib5cTDASj2n1Ffj579eskRFQ3NLkr1QAdOKAzl957Pg6XHXeaG0+6m7SmKTz4yZg6HdMkd3seVps15rLtG+vHvYuxuq8gPClHYW5RHUdTd7TPXakG6tybz+TES45h8TfLcKe56XPCwXU+WFXLdi2w2W1A6QQqInTuWz/KMnsd252ALxDV7kpxcvQ5hyUhorqhZ+5KNWAZmU34x/lHMPC0PkkZhdBmtzH8sYtxehyl2o0xzPlwAWP/+SjZWYmvTAmFQiz4/Gf+7/GP+PGjhTG7XXZJa5rKiMcuwel2YLGGU54r1cVBh3flyLMHJDy2+kLr3JVSNfb63e8y6aH3MaHofNK0VRNeWvI4TVsm5sae/OwCbjr6brasy8Lv9WN32cnIbMJTsx+gaauMuNut/nktn736NfnZhRz1z4EcdmY/rNbYXUoNhda5K6Vq1bpl62Mmdgj3y09+5IOEHeulWyeyYeUmivKLCfiDFOUVs3X9Np665uVyt+vUuyOjnr2S0W9dz5FDBjb4xF4RTe5KqRqz2eMnymAgxOxp89n2d2IusH43ZU5UH3owEGTuR4sIhWKP0rg30uSulKqxky8/DqfbEXf51vXbuKTTddx1x
kMU5tWsQiUUjP0NwRhDfehmri80uSulaqz/Kb045YrjYt9YBYSCIfzFfn76aimPXPZsjY51xOD+UeWXFquFPicc3Oi7WqqiMjMxtRWRb0VkhYgsF5EbIu3NRGSGiKyK/G4aaRcReVpEVovIUhHpU9svQilVse2bdjJz8mwWfrmk3OqS6hARrntmGE98fx/7dGyJzW6Nmej93gDzP/2Z3B3VH/9m5OOX0rxNU9ypLgBcqU6atEjnxhdHVHufjVGF1TIi0hpobYz5SUTSgEXAYOAyYIcxZpyI3AE0NcbcLiKnAaOA04CBwFPGmIHlHUOrZZSqXRPvncK74z7AHhl33Olx8PCM/9CxR7taOd6mP7Zw/RF3kr0lugzSleLkxZ8fpU2n1tXev8/rZ9bUufyx9E/aHtiGY847HJenboddqA9qVC1jjNlkjPkp8jgP+BVoA5wFvBlZ7U3CCZ9I+0QTNhfIiHxAKKWSYNGMJfzfo9Pxe/0U5hVRmFfEzi05jDn1wVq7ANl6/1YMOKX37rrykmwOG/t0aFmj/Tucdo678CiuHPcvTr7s2L0ysVekSn3uItIB6A3MA1oZYzZFFm0GWkUetwH+KrHZhkibUioJPnrxS4oLo2/BL8gt5Lf5q2vtuJeMPQ9PurtUJY3T4+Tap6+IO2SBSpxKDz8gIqmE50a90RiTG5kUGwBjjBGRKl2mFpHhwHCAdu1q56uhUgoKcgpjtotI3HFXdpk1bR7/9/hHZG/NYcApvRk6egjNWzet1HFbtc/kpcWPMfmRD1ny7TJadWjJ+bedRc+jD6rya1BVV6nkLiJ2won9bWPM+5HmLSLS2hizKdLtsjXSvhFoW2Lz/SJtpRhjJgATINznXs34lVIVOObcw/l17iq8Zc7eg4EQ3Q7tHHe7N+95l0kPTSMYCHfdfPjcZ3wz6QdeWT6+0nebtmzbglHPDKt+8KraKlMtI8CrwK/GmCdKLJoOXBp5fCnwYYn2SyJVM4cCOSW6b5RSdeyky/5Bx4Pb7R7n3WK14HQ7uOGFq3CnuGJuk59TwFsPTN2d2AGMgdzt+fzf49PrJG5VM5U5cz8CuBj4RUQWR9rGAOOAKSIyDPgTOC+y7FPClTKrgULg8oRGrJSqEofTzhPf3csP781l9ocLyGjZhEFXncD+PdvH3WbGm99BnO/TP7w3l+EPX1xL0apEqTC5G2NmAbHvTIDjY6xvgGtrGJdSKoHsjnB1yXEXHlWp9XduyY67zGrTex8bAv1XUkpFOXBA55hljABHnl1387Oq6tPkrpSKMuC03jRrHT18rjvVxUV3np2EiFRVaXJXqhEJBoMEgzUfWsBmt/HMj/+l30mHYLVZsFgtdD+iK88vegR3qjsBkaqVi9YwcexkJo2bxqa1WxK+f52sQ6lGYNvG7YwfOYGFXyzGGOh74iHcNGEEmfs1r/G+fcU+QiGjd4EmiDGG5254jc9f+wZfsR+r1YLFZmXUs8M45fLjqrQvnaxDqQYgZ1suz9/4Ohe2H8kVB93AB89+VqmzcL/Pz/WH38mCzxcTDIQIBUMsmrGEUYeOxuf11zguh8uhiT2Bls/+jS9e/xZvoQ8TMgT8QXxFPp659hVytuUm7Dia3JWqB4ryi7im/+189OKXZP21nb9++5tX7nibhy+peHjcH6cvJH9nAaHgnpr0UDBEYW4Rs96fV5thq2qYOWUO3kJfVLvVZmX+pz8n7DiVHn5AKVWxwrwiPnvlK+Z/vpjMNs0YPOo0OvXuWOF2MyZ+T25WXqkZhryFXmZPm8eGVZvYr3P8sfc2rNwUdfcpQFF+MRtXbuKnr5byyYQZFBV4OXboERw79Ahs9sT/6a9dtp5tG7bTqXfHcucy3dtZLJZwcXmMHvF4FUrVocldqQTJzy7gmn63s2PTTrxFPiwWYebkOdz86jUcO/SIcrdd8t3ymIN7WW1WVi5cU25y79CjLc4UJ0V5xaXa3Wku/lj6J1Me+3D3GDK/fL+CL9+Yy
bgv70rYxBa52/MYc9qDrFu+AZvdgq84wBkjT2TkE5dRcgwqFXb8RUfx6StfRZ29B4MhBg5K3PQX2i2jVIK8/9QnbP97B96i8B9tKGTwFvl46uoJ+H3l933ve0ArbI7Yybaii6IDT+tD832blRp90Wa3kt4sjXmf/lRqcLDiAi+/LVjN3I8WVfZlVei/F45nzeJ1eAu9FOQU4ff6+eTlr/nyzZkJO0Zj0rV/J865+QwcLjt2px2nx4HD7eD2N68jNSMlYcfR5K5UgsyeNh9fcXQSNyHD2l/Wl7vt6SNOiuoqsVgtNG/TjB5HHljutlabladmP8AJ/zoad5oLd6qL4y48inNvORObPfpPvDi/mB+nL6jEK6pYzrZcln6/goC/9IVfb6GXqeM/TsgxKhLwBwj4AxWvWI9cdu9QXlryOFc+dBEjHruUt9Y+z9HnHJbQY2i3jFIJktYsNWZ7MBCMe0a26Y8t/DZ/NZn7NeOBj0fz6OXPsXNLDiYUotuhXRj99g2V6tpIb5bGza9ew82vXrO7bc6HCxCJTu5Wm4X0FmmVfFXlK8wtittPnL+zICHHiCdrw3aevOpFFn29FIA+xx3MTS+PpGXbFrV63ETZr3Nr9rtxUK3tX5O7Ugky5PrT+H3B6lLdIBarhXbd2rDvAfuUWjcUCvHYsOf5bvIcrHYrGGi+b1PG/3A/oWAIpyc8L2hN9DulV8xxYGx2G6dccTzGGNYsWUf+zgK69j+gWjcnteqQiSfdE9V/bLVbOfT0vtWOvSI+r5/rDx/Djk3Zu6uEfvr6F64/bAwT1zyHw2mvtWM3FNoto1SCHH5Wf86+6XTsTjsp6W7cqS7adG7NvR/cHrXuJxO+4vv/m4uv2E9RXjFF+cX8vWYLDwx9kpbtMmMm9pWL1nDvOY9xZY+bePTy59iwqvyRtB1OOw99cTcZmel40tx40t04PU5unDACu9PGFd1u5Kaj7uaeIY9ybqsrmf7CF1V+zRaLhZtfuRqnx7H7DN7hspPePI2L7jqnyvurrDkfzKcguzC6/DOviNnT5tfacRsSvUNV7RUC/gDrlv9FakZKjefvrEh2Vg6/z19NRqsMuvTdP2a3ylU9/826ZX9FtdudNt5e90JUKeGiGUu4Z/Aj+Ir9GGN2j8k+ftYD5Q7dC+EhCZbP/h1vkY8eRx6Iy+Pk8gNv4O81mzGhPX//To+Th7+8m+6Hd63ya177y5+8/9SnbPpjC72P78EZI08mvXnlun6KCsIlm81aZ9Bsn8rN8vT2A1N5c+zkUvEDIHDJPedx8X/OrepLaJDKu0NVu2VUozdz8mzGXz2BUDBEMBCi48HtGPv+rbTYt1mtHC8jswkDB5XfJRFveruAL0j2ttxSyd0Yw9PXvrK7CgfCZ6lF+cVMuHUi4764u9xjWa3WUlPbrVy0hm0bd0QlRl+Rlw+e/axayb3jwe25+ZWrq7zd5Ec+4H/3vYfVZsHvDdD3pEMY8/b1FXYRdewZnnwkqvwz1VXhh93eQrtlVKO26qc/eGzY8xRkF1KUV4yvyMeqRX8w+pQHSea31iMG94/Zboxh7vSFFOYVsXndVoKBIMUFxWxZlxVz/eVzfq/ysdcuWx/zpidjyh/HPdG+f+9H3rr/PbyFXgpzwyWUi2Ys4dErnq9w24Gn9aFFm+bYHHvOT212K81bN6vVvv6GpDLT7L0mIltFZFmJtskisjjys27XDE0i0kFEikose7E2g1eqItOe+RR/mfLEUDDE5rVbWLN4XXKCAo4cMjBuFcyUx6ZzTqthXHXwvzmn1TC+mTSrVA17SZXt+ijph6lzYy8QOOKs2B86teHdhz+I+gbjL/Yz96OF5GeXX2mzq/zzxEuOIaWJB0+6mxMuOYan5jyA1ZaYm7Mausp0y7wBPAtM3NVgjDl/12MReRzIKbH+GmNMr0QFqFRNZK3fTqhsvyzhcsAdm+vuLLWsjFYZ2J22mHXxBdmFGGPwAxR4eeGmN+hzQk8WfbUUX
4muGafHyTk3n1HlY//y/W+xFxg4+tzDq7y/6tq5JSdmu9VmJW9HfoU39KQ1TeXfE0by7wkjayO8Bq/CM3djzPfAjljLIpNnnwdMSnBcSiVE/1N74XQ7otr93gBd+x+QhIjC2nTah306tox59l62u8hb6CNnRy5HDhmA3WnHk+7G7rJzxtUnMfi6U6t87JQmsfuzLTYLfyxZR1FBcczlidbr2O4xa+QdLjst2zWMWvX6rKZ97kcBW4wxq0q0dRSRn0XkOxGJO2GjiAwXkYUisjArK3Z/olI1NeiqE2jSMh27c8+XVFeKk3NuObPGdeQ1ISLcO+02mu6TgSfNjSvFic1uRSyxu2q2/bWD0W/dwNt/vsAjX93DlL9fZsSjlxAMBJn78SI+f+0bNqz8u1LHHjzqNJwxhvAVhAeGPsm5ra7kize+rdHrq4xLxp6HO81VqhvF6XFy9fjLtWslASpVCikiHYCPjTE9yrS/AKw2xjweee4EUo0x20WkL/AB0N0YU+4gxVoKqWpT7o48pj7xMbM/mE968zSG3DCII4cMqNNBrYLBID9/vYzNa7fSue/+dO0X/tYQDARZ+OUSpj//BT/NWBJ1Gz+AxSIcc/7hjHn7xlLt63/byM3/uAdfkY9QKDyO+3EXHsW/Xx5Z7msLBoOMHzGBr9/5AZvdGlVxAuB0O3hqzoMccEiHmr3wCmxdn8Wkh6ax9PtfadWhBUNvH1KqskeVr7xSyGondxGxARuBvsaYDXG2mwncYowpN3NrcleN2ba/d/Dvo/9DdlYOoWAIEaHboV144OPROJx2fvpqKXedMQ5/nIk13Gkunl/wMPt12Xd3mzEmXKu+ehMl/4RdKU5ufHEEx18U90vzbn/9vpGbjv4POVnR514Wq4XTrjyeG14YXvUXrOpMbc3EdALwW8nELiKZImKNPN4f6Az8UYNjKNXgPXLps2z5M4uivGK8hT6KC7wsn/M7kx56H4CPXvoybmLft9M+PDd/XKnEDuGz9u0bd1D23Ky4wMvkRz/gtTvf4a0H3mPj6vh3sX79zqy447+EgiGyt8a+4KkahsqUQk4CfgS6isgGERkWWTSU6AupRwNLI6WR7wEjjTExL8YqtTcozCvil+9XlLpNHsBX5OPzV78BYMffO+Nub3faaNu1TVS7r8iHkdjfutf98hfvjvuAt+5/j+E9b+azV7+Oud53k2cTDMSexs/pdnD4WQPixqXqvwpLIY0xF8RpvyxG21Rgas3DUqrh27h6E38sXR/3Zqnc7XkM73kzwTKJv6RYt+OvXLSGB4c+ibcgeqo22FNtE/QHCfqDPDvqVQ4/q//uC8g7t2Tzw9R5FOXHr4rZt9M+HHN+3ZVFqsTT4QeUSrDCvCLGnv0Iy+esxO6wxU3efq+ftcvij/Nuc1ijSh3zduZz6/H3UphbFLW+xWYhFIg+VjAY4ss3Z3LuzWfyw9S5jLvkGQTwxxkDPbNtc56d95COrNjAaXJXKsGeHPESy2b9ht8bKHXT0a7ka7FaCAVDUf3lCJESQIOIhTOvOZnDzix9reybd2YRjFFRY7VZ6dB9P9b+sj7qpq2gP8jrd02iQ492PHzJM6Vi2sXhdmC1WUhrmspj347F4Yq+N0A1LJrclUogb5GXWe/PKzXR9S4p6R6OGNyfr9+eFdUHD+Fx1m959WpCQcPBR3eLOXpl1l/bSg0gtotYhEP+0Z2/fv875l2vfm+Ax4c9jyXG+O4iQvfDunD+7YPpdVyPhM2tqpJLk7tSCVR20oqSQqEQi2YsjZn4AUTgiCEDcZW5wSgYCDL9+c/5dtJs8nMKEZGofnyb3UrzNs3K7b/fsSUbpzv65iUwtOmyL31PPCT+C1MNjiZ3pRIorVkq6S3SoipgLBbBneJix6adMS+wikXoffzBUYk9a8M2hve8pcKBtDr16si7D02L2WWzJwYLoRjVMU6Pk+MuOLLc/auGR5O7Ugn0/A2vkbc9P6o9FDJs2xi/KtiEDIu/X
cacjxbw9Vs/MPejhYjFAsbE7IYpyWIV2nRtzZqlf8Zdx+60ccTgAQwc1Icnh79EKGQIBYLYXXZOvvzYCifhVg2PJnelEmTF3JV89tq3cW9IqoivyM995zxe7tl3LKGgIXtLTvzjChw4oDM3vjSClHQPBx91EN9NmYO32Mdhp/ejU++O1YpX1W+a3JVKkB+mzo1ZiVIVVU3su3Tp34n5n/0cc1n/k3vx30/v3P28VftMzrv1rGodRzUcmtxVgxAKhZj3yU/Men8ennQ3J19+LJ161a8zzl2jOppg3c/wNO3pT3C6HTGn70vmuPUqeTS5q3ovFArxn8GPsOTbZRQXeLFYhM9e+ZqrHvkXZ11b9fHMa8txFxzJtKc+rbCPvDbk7yjA7op901Fa0/InvVCNk86hquq9uR8tYsnMZbvPSkOh8EXGCbf+j9ztebVyzD9/3cD95z/Bhe1GcuNRd7Hg89hdHiV1PLg9l9x7Pg6XHafbEXPM9FicHmfccdyrIr1ZKpYy+3GlODmrGhN6qIZPk7uq9777vx8pzo/ubrDarfz01dKEH+/PFX8xauBofpg6l6wN21k++3fuPecxPn/9mwq3Pe+WM3n996cZ8dglXPvU5Tw3fxy9jusRTt5CqZmHXClOTrr0H/zzxkGkpMeeHakqbn71alp1bIk71RWerclpZ/B1p3LEYB0AbG+k3TKq3nOnusJ92WVuqxcEV4or4cd7/a53KS7wlqpH9xaGvymcePExGGP49OWv+eKNbxERTr78WE4ddhw7t+Qw6/15hIIhDjuzH/sesA8Aj351D8FAEL8/wIw3ZjJz8hw86W5sDhszp8zBhEIEY4wJUxX7HtCK/if35s2Vz7Dix5Vkb82h26GdYw48pvYOlZqso7bpZB2qPL8vWM3Nx94TdfdnShMPUza/UqUBrtb+8ifTnv6MzWu30OfEQzh9xIm7J2IuKihm46pNjD7lAbK3Rk9g4XQ7ePXX8Tw5PDx2jLcw/G3C5rBhsQr+Yj8WqwWLxYJYhEvGnsf5tw2O2s+OzTt58ILxLP1uRVXeht3EKmDY/WHnSnVy95SbGXBK73K3y92ex/QXvmDpzOXs27k1Q64/jfbd9qtWDKp+qPFMTLVNk7uqyP89Pp037n4Xq92KiCAiPPDxaHocUfmbb+ZMX8B/LxyP3xsgFAzhcNtJb57GC4se4bNXvubtB9/HarPEHHERwkn83vdv5f7zn4hZlVKWw2Xn1tevpecxB+0+g/Z5/Vze9Xq2/rUNqvmnF3P4AaeN/0y5mcPOiPl3zra/d3B1n9sozC3EF/kQsjvt3DvtVh12oAHT5K4ahZ1bsvnpq19wpTjpf0qvKo1cGAwGOb/1cHK2lT4jtzls9D2hJ0tmLqe4sPyEnd48jcGjTmXivVMqnZitdiuC0P2Irtz7wW3M++Qnxo94qdyx1KureZtmTFr/Ysz5U58Y/iJfvjEzanKOzLbNeXvdC3U6n6xKnBpNsycir4nIVhFZVqJtrIhsFJHFkZ/TSiwbLSKrReR3ETk5MS9B7c3+XrOZe4Y8wqWdRzHh1on88cufpS5MVsbGVZvxFkUn74AvwKKvllaY2AH8Pj9NW2XEGXwrtqA/SMAfYMnM5Zzbahg/fjqAOw8AACAASURBVLSwVhI7QPaWHPJ2RA99ADDvk59izrqUk5Vb7rAIquGqzF/IG8ApMdqfNMb0ivx8CiAiBxGefq97ZJvnd82pqlR17NySzbUD7tidFHdszmbyuA8Yd/EzVdpPShNP3BETy16ojadNp9b84/zDkWrWmPm9AWa9P6/SJZJVJRbBlRr7AnNqE0/MdmMM7jjbqIatwv+mxpjvgcp+tJ8FvGuM8Rpj1gKrAa3DUtX24fNfhOcLLZGAvUU+5kxfwKa1Wyq9n+atm3LggM6RyTD2cHmcdOm7f1R9eFlOt4MrHryA1IwURj5+WfW7MYwBTOm6doHUpim4UmuW9E+8+Oi4F5eH3HBa1IeKzW6l9
/EH776grBqXmtS5XyciSyPdNrvqrdoAf5VYZ0OkLYqIDBeRhSKyMCsrqwZhqMbst7krY04+YXfYWLfsrxhbxHf35Jvo0L0trhQnKU082J12Bo08kdsmXocr1YW1xEQWNrsVT7obJFxmePOrV9O1fycADj6qGw539aagC/iDBHyB3cndarNw9DmH8dqvT8WcCLssq90aPtMu89nS75ReXPfMsNgbAadddQInXnIMDpedlCYenB4nnXp35PaJo6r1OlT9V9069xeA+wlfVrofeBy4oio7MMZMACZA+IJqNeNQjVyHHm1ZMnMFgTLzfQb9QfY9oFWV9tW0VQYv/PQIa5asY/vfO+nSd3+atsoA4MWfH2XSQ9NY9sOv7LN/K4bePpieRx9EzrZcHhv2Ao9c9hwArTu25JbXrqFVh5asX7GhWq+pZE271WZl2EMXsv7XDWz6o5xvIhKeycnv9e/us7c5rPQ5vic3ThhBZpvm5R7TYrFww/NXcdFd/2TN4nW0bNeCjj3aVSt+1TBUqlpGRDoAHxtjepS3TERGAxhjHoos+wIYa4z5sbz9a7WMimfT2i0MP+QWiktchLQ7bRx0aBce+/beSu/HmGLwfguhbHAcitgqHnTMGMM1/W5n3bL1BEqM1uhwOfA0cZG9JboWvqpsDiv792zP+l83xi2vFBH+edMgPn5pRtQ6dpedt9c+v/tDSu1dalQtE2eHrUs8HQLsqqSZDgwVEaeIdAQ6A/OrcwylAFp3bMWjX/2H/Q9pH67Ndtg45rzDuW/6HZXeh/H/gtl6JCZnDCb3IZZ+fD6jBlzCGekXc0W3G/huypyY261cuIYNK/8uldgBfMW+hCR2gIAvyJrF62ImdrGEk/c9U28ha8OOmOvY7DaWfv9rQmJRjUuF3TIiMgn4B9BCRDYA9wD/EJFehLtl1gEjAIwxy0VkCrACCADXGmOqN0C1UhGt2mfS44gDyduRv3uyicpWeBgTxOwcASacjJfNS+HOi9riLQrfqPTX73/z6BXPk59TwKCrTiy17eZ1WVgstTv8ktVujTmGu1iE7od35aYJI2l3YBsWzViCxWqJmlhbCFcCKVWW3sSk6rW8nflc2ePf5GzL3Z0EnR4nJ132D65/9soKtze+nzE7rwATnoP0prMOYMWC1JjrulNdHHfhkVz18L9IaZLChlWbGHHIzTEv6FaHzW7FarfuHkbBleIktWkq2zZsj17XYeOa8ZdzxsiTAFi9eC03HnFX1HDCGZnpvLtxQlQVkNo7JLxbRqm68smEr8jPLih1dust9PL5a9+w7e/KVOj6KVlasnZF/NEXi/KL+eKNmVx/+J3M++xnrFYLh57RD0eccdLLU3YIX6fHyb/+cy5j3rmRgYP6cMix3TnnpjNiTlgN4ZurDjtzz99sp14dufbpK3C6HXjSPbjT3DRrncG4L+/WxK5i0lEhVb22+NtlMaeusztsrFr0By32bVb+Duy7eg8hFAKft/z69IAvwPpfN/LAeY8TCobofdzBXHjn2Uwc+39RXSLlsVgFsYW7XDJapnPB6LMZPOpURITDz+zPxHunMOXRD6MGQytp6XcrOO6CI3c/P3XY8Rxz3uEsn/0b7lQX3Q7rgtWqiV3Fpsld1Wv7HtCKn2P0NYeCITLbll/+ByDigCaPY7JvZPl8J1abIRiocLPdFy9//uYXWrXPpGu/A/h13qpKxx30h3C4bVx+/9Co+Uo3rNrE5Ic/qLC7Z8uf0fd/eNLc9K9g9EelQLtlVD131nWnYi9z16XVZiGjZRPSm8XuOw/4A8ycPJsHLxjPc9e/xrpVByCZn5FXNAibvWpdLL5iPx9PmMHvC9dUOXZfkY/vp86Nap//yU9RozqWZbVZ6dS7fs0RqxoWTe6qXmvfbT/umXoLzfdtisPtAIGQMeRk5XLZgTfw5MiXCIX2nNX7fX5uOW4sj1/5IjMnz2b6C18w6tAxfPnWSroffxMBf9W7MULBULldMg53/NEp0
2J8ANmdtnKrcMQitO3Whr4n9qxaoEqVoMld1Xv9T+7FO+tfpGOPtlgsggkaivKL8Rf7+fqtH/j8tW93r/v127MidePhm55CwRDeIh/PXPsKLo+DC0YPwZVStTFcKhp3JuAPYHNEf2i4UpwMGXVaVPuRZw+MO2KwO83FGVefzPgf7q/1MkzVuOn/HtUg5GTl8sfSPwkFS6dFb6GXD575dPfzmZNnx7zZx2K18M5D09ivy77c/uYomu+buOnnQoEQAV8QJDxBhyfdjcNl5/zbzmLAqdH9401bZXDbG9fhcDtwp7pwpbpwuOz8+5WRTM/5H6OeGUZKutauq5rRC6qqQSjKL8ZitRK+N660wrzwDUn52QXs3JIde/u8YqY99QlWq5VAIIg7rfLD3IYqOSQwBprv25Rrxl9Bt0M706RFetxVjzn3MPqccDBzP15EKBhi4KA+ZGQ2qXRMSlVEk7tqEPbp2JLUDM/ueUt3sTlsHDlkIH+v2cyoQ8eUOxFGybLDWOWVibBt4w4OPb1vpdZNa5rKiRcfUytxKKXdMqpBsFgs3PLatbg8zt037Tg9Dprtk8HZNw7ioYueIndHHn5vmfLCOp49LrNti7o9oFJx6Jm7qtc2rt7Ezi057N+zPf1OOoQXfn6U6c99zqY/ttD7+INBYHjP6xlw3FYuu2U7rfbz4S2yMuuTJnz4WguCAaFDt2J2brWx6c/EzoDkcDtKfQNwehxcdt/QhB5DqerS5K4SxgS3gPer8GxDruMRa+uKN4ojOyuHewY/wprF63YPrnXx2HM4b5SHf93wA2uXbWXay4uY8wlYbSFatfXS87ACLBYQ8dO2UzFnD8/CZjf4/YLdbli11M3YKzqSt7Pm/+1tditXPnQhb46dQkF2YbjNYWPbhu2EQiGtdFFJp/8DVUKECqdgsk7A5D6MyXsEk3USoYK3qrQPE8rHmPDF0fvOeZyVC9fgLfJRmFuEt8hL230ewpd1M6nuWXTvu5Lbxi/l4ls2IRY4/7osrFbYNfudzQ6etBBOtyE1Pfy7a+9C7pqwrlKxWO0Wjv/XUbRsnxlz+RGDB9D/lN6lJt4oyC7kzbFTeOGmN6r0upWqDZrcVY2Z4CbIvR/wAsWRHy/kPYwJrK94e/9vhLYNwWwdgNnSl82/XMHvC1aXGke9zzH5HHJ4LnZ7uBvEYgGXx3DO1Vn0PjKPWDd8lp3m1O6Ag/oW0nyfikd5DPpDfDtpFtlxqm+2b87m3Yc/iLow6y308smEr8jdkVfhMZSqTZrcVc0Vz4izIATFX5S7qQntwOy4CALLCZc5BsjdvBibrURftjvIiLEbcXmi7xK12Q2dDi7E4ajcoF7BoJCWUYnBZYBQ0JQ6My/p71WbWLlgTcw7V+1OG3+v3lypYyhVWypM7pEJsLeKyLISbY+KyG+RCbKniUhGpL2DiBSJyOLIz4u1GbyqLwIQ857LELHq0iE8hZ0JbsUUvAOm9Jl0uy4FmMj+uvYu4L0Vy2nfxRt1Jg7hM/hDjiggGKxcWYzNbtiwZs+FVREpt6ImFIw9JO/+h3Sgfff9oob2BfB7A7Tq0LJS8ShVWypz5v4GcEqZthlAD2NMT2AlMLrEsjXGmF6Rn5GJCVPVa84TiJ0hbeAKz25k/CswBf/DFH9OyDsfs+0kTNbxUPAM4W6cPRxOw9X3bcKdGuKBt9bicJqYiR3CXS+HHF6Aw1W5G42ys6wE/Hv+29tddm6cMCLu+mKx4PSUrrJxuh1cdt/5DL1jCA6XI2rZUf8cSNOWekOSSq4Kk7sx5ntgR5m2L40xu07J5gL71UJsqoEQWztIHQW4AGvkxwWpw8HakdDOUZjtQzF5D2Ny7oCd/4Lgn4T76GMn5ZOHbuPJD1fF7IqJOr6Ez+ArYgysWb5nsg6nx8nAQX0YNOwEWrSJPS58u25tuPapy9mnY0ucHgcHHdaFh2f8h679O3HAIR148OPRtOvWBotFcHqcDBpxIre8dk3FwShVyyo1z
Z6IdAA+Nsb0iLHsI2CyMeatyHrLCZ/N5wJ3GWN+iLPP4cBwgHbt2vX9888/q/cKVL1hAqsxRZ8BBnGdgti7ECqYDHn/BYqSHR7GQDAgTH6uHd9/cjCnjzyR00eehNVq5Yepc3n40mdK3cXqdDu45/1b6X9yrwr37fP6sdmtWgKp6lR50+zVKLmLyJ1AP+BsY4wRESeQaozZLiJ9gQ+A7saYcqeK1zlUG6/QtrMhsKziFRPORrz+fhAkYzziOrVU67xP5vDGXa/z99o82nZpyRX/vZI+J+iwu6r+Ki+5V/tuDhG5DDgdON5EPiGMMV7C37UxxiwSkTVAF0Azdx0zoYJwpUpoM9h7guNwRJJxVpmYyaVjsxPuAio7nowVSAFy4mxnMPkvlkruxreE/v2uo/9nQcAHOMDpw5hnEdF7/VTDU63/tSJyCnAbcIwxprBEeyawwxgTFJH9gc7AHwmJVFWa8a/C7LgwUoVSBOIGWxdoNhGRyo+GmBCuMyE/+qJpYoTAfgj4VwCFJdqDxE/suzbdsvuhMSFM9rVg8kusUATeH6HoA/Cck8CYlaoblSmFnAT8CHQVkQ0iMgx4FkgDZpQpeTwaWCoii4H3gJHGmMpMUa8SyGTfCCaHcMIzYArB/yum4NW6D8bzT5Cajk0ugDtGexAC66HJI2DtSuVHCROw99nzNPBbmcS+SxGm6P+qHK1S9UGFZ+7GmAtiNMfMEsaYqcDUmgalqs8EN0cqUcryQtE0SL22cvsxASj6EFM0FRDEcw64zkSk8tPUGWNgx1VQ/iWXyuyJuBdkzVYIrAh3P8Wd36gkC4gLSb2pTHu8D4bK3RylVH2jnYkqijEGk30NeOexK6manF+gcDqm6StYLBUn+FBgM+wYCqG/azlaoOAFKpfY3eA8FkkbhdgO2NNsOzD87cIURK/v0i4Z1TBp3VYjI9Z9wNqe6DNRF7jPrtxO/AvAN5/SZ8vF4J8NWYdhvLNibmZChZhQHqGiz2HbcQlO7OV1uVSU2F1gyUQyv8TSdHzpxA6IWJCMpyPdR+7wscQDjn6I5581jFup5NAz90ZIMsaXuKBaDOICW1ck5YrK7cA3H0y8bpBszM7hGPf5iK0juE4BJHxzku9Hwok2ROXOpKuimvuzdgXP+Yh7MGJJjbuaOPpC5kwo/hRC28E+ABwDwsMTKNUAaXJvhMTeGTK/A+/nENwSKYU8rNxSSGNCe5ZLBuAkfoVLAIrexmCDvEdB0sHsJH5deRIF14GlZbmJfRexZIDnwtqPSak6oMm9kRKLp8JuGGNCmIIJUPAqmByMtQOSfie4B0H+Y5U4WQ6P4ojJSlTYtcAbvkPWfVKyA1GqTmmf+17M5D0B+S9EyiaB4DrMzlEQWI00fRloJINfhTZjTO1MiK1UfaXJfS9lTDEUTiS6xLAYk/8M4uiPtJoLtoOSEV6EhfDdpjUkKYTvZlVq76HJfW8V3EbcCpTAGoBwTXuTp0lIgq2OtHsh9Toqf3NSLG5IuUovjKq9jib3vZU1M37OtHXZ/dBibweey6n7BC+Q/3S44iflWqp+echCOLFfgaQMr4X4lKrf9ILqXkrEiUm5CvJfpnTXjAtJvaH0yqk3QPHnENpQhxGa8IXaglfAkgnuoVA0mfIHIrOCvS94hoK9P2JtioijnPWVarw0ue/FJOVajDSBgpcgtANsnZG0MRjr/pjtw8C/ELCB2CKljsngC9ed29ohmd9jvN9A/hPhtrLEiTR7lfDI00rt3TS578VEBEm5GFIu3t0WKvoUssoMD53o+5GqrBi83yEplyGeczH27pjtQyldh++GlOGa2JWK0OS+FzPGgPcLTOGk8B2p9oFQ+FKyw4pBwNp6zzP7QdBsIiZvXHi4X0szSLka8ZyXxBiVql80ue+ljPFicu4H70d7hhrwL0luUHHZEM/FpVrE0Qtp/m6S4lGq/tPkvpcxoR2YnDvB+x3RwwUkvf8lttTbEXu3ZEehVIOiyX0vYozB7
Lg0UsdeD8eBieIE9xlYUi9JdiBKNTiVqnMXkddEZKuILCvR1kxEZojIqsjvppF2EZGnRWS1iCwVkT7x96xqiwluxXh/wERuSALA/xME/6L+JnYrSAuwdgF7L6TJA0j6g8kOSqkGqbJn7m8QnlpvYom2O4CvjTHjROSOyPPbgVMJz53aGRgIvBD5reqAMSFM7lgoeh/EASaAsfdAmr4EwfXJDq8cDrAfgmSMR6yZyQ5GqQavUmfuxpjvgbJzoZ4FvBl5/CYwuET7RBM2F8gQkdaoOmEK34GiDwFfZF7QYvAvxeSMAVs3MPVw2jjbQUjmTCzN39bErlSC1GT4gVbGmE2Rx5uBVpHHbYC/Sqy3IdJWiogMF5GFIrIwK6s+DxnbwBS+SfRgYD7wfgPWtuAYANSnuzZdSPrdiLVFsgNRqlFJyNgyxhhDFUstjDETjDH9jDH9MjP1bC1hQnlxFggmVAiWJkBdDn/rCs9qZNmX0uPTCFgPCN9R6uhbh/EotXeoSXLfsqu7JfJ7a6R9I9C2xHr7RdpUXXAeRcx/VmkGeQ9A8Ud1FIgADvAMRZpNBM854WsAuxkIbgRTXy/uKtWw1SS5TwcujTy+FPiwRPslkaqZQ4GcEt03qpZJ2k0gTYi6Vm6ywPtF3QThOh+avYe0/BFL+hggEB4ALGpe1mJM/lN1E5NSe5nKlkJOAn4EuorIBhEZBowDThSRVcAJkecAnwJ/AKuBl4FrEh71Xi5mmWOEWPeF5tOJ7iULEp64ug54P8PiOBixpIWfh3bGv5AbWFs3MSm1l6lUKaQx5oI4i46Psa4Brq1JUCq2PWWO00DspcocdydSgLxxhJN5kpgy/f6WZiDW2FdlbAfUSUhK7W10so4GwPh/JZQzGrPtVCh6D/CWKHNcEi5z3LVuKB+8nyYtVgDEU/qp2CHlasBdZkVXuBtJKZVwOvxAPRcq+gJybiVc4RKra8MP3m8IBbPB9x0U/0DSx4hJuS6qSVKuwogHCl6MjM9+AJI2GnH0T0KASjV+mtzrMWMCkHsXpcctj7kmZB1J3ZY47mJjz3AGVvAMx5I6LGqt8Njx/4KUf9VpdErtrTS512eBPyh/WrndK9Z2JPHZ+yDN3gBTDJKiE1ErVU9ocq/PLOn1pA5cCN+AVDYWF5J6LSI2kNQkxKWUikeTez0m1n0wkgqm7LA+dcmKpN6Ise4H3nlQPA0w4Q+etDGI87AkxqaUikeTez0SCm6B/OcguAVcpyDuM0j+ODBWJHUEAuAehDF3QigfLE0R0WIrpeorTe71RKhwKuSO3tPg+zY8RyieuNvUCXuPUk9FnGDVSaiVqu80udcDoWAe5I6JXmB2Atl1Hs8ediR9bBKPr5SqLv1enUTGBAjlPgRZA4lfm56kmnVbV6TZG4j9wOQcXylVI3rmnkQm71EonET9mvbOCZ6rsKRfn+xAlFI1oGfuSWKML5LYK7pBqTYIOM8DSyalx1i3gHiQVL3RSKmGTs/ckyWUTdK6XNIfwOI5F2OCmML/QeH/whUwzqORtJsQS7PkxKWUShhN7sliaQ7iAuOt4wOnIO4hAIhYkZTLIOWyOo5BKVXbtFumhkxwM6H8Fwjl3Icp/hpjoofaNaFCTGA1JpSPCWZh/KuAEKTdQvjuz7pgA1xI02fDd5QqpRo1/SuvAeOdjdl5DeGx032Y4vfB1g2avYmIA2MMJu+/UPgu4b7tIkDCZ+xYIfVOar1rxn0B4AZrc8R9JmJtVeEmSqmGr9rJXUS6ApNLNO0P/AfIAK4CsiLtY4wxSR5gPPGMCWCybyKcsHc1FoJ/OaZwCjj6YXYOg1BW2S3D6wHk3VmLEboh7SYs2uWi1F6p2sndGPM70AtARKyEJ8GeBlwOPGmMeSwhEdZXgRXEHrGxGIqmYvKfjJ6RKEoiZ0uKDO4lTcHWHkm5CnEdm8D9K6UakkR1yxwPrDHG/Ln3DPlqJ26XSnAjmII6DOWo8FjpzqMJf84qpfZ2ibqgO
hSYVOL5dSKyVEReE5GmsTYQkeEislBEFmZlle26aABsB4JkxF5msqmTyajFA64hWJq/iriO1cSulNpNwvNZ12AHIg7gb6C7MWaLiLQCthE+rb0faG2MuaK8ffTr188sXLiwRnEkg/H/itlxCRg/UFg3B7X2A4sHxIp4/gnOE3WCDKX2UiKyyBjTL9ayRHTLnAr8ZIzZArDrd+TALwMfJ+AY9ZOtM6TdDXkP77lIWpssXbFkvlP7x1FKNXiJSO4XUKJLRkRaG2M2RZ4OAZYl4Bj1jjG+8Fm7fzF10gWDDdwn1cFxlFKNQY2Su4ikACcCI0o0PyIivQh3y6wrs6zxKJoO/iXUTWIHxI54zq2bYymlGrwaJXdjTAHQvEzbxTWKqJ4yxgeB1SDpiG0/TMHrJLaUMULSAVek2sYLWMCSiWQ8hlj3SfzxlFKNkt6hWgET3IjJexqKPwWxggliLK0gtL52DpjxLBbnoZFjbwO8YNlXL5oqpapEk3sMxhhM0fuQ92jpyal3FRbVSmK3gH0g4hi4u0WsLWrhOEqpvYEm9zJMYDVmx6Uxhg1INCtY2oCkgdgQz3ngHqJn6EqphNDkXoIxgXAFTGhbLezdCpjwjUfGD87jkIxHwhNOK6VUgmlyL8n3I4RyErhDD1AUTujuoZByJRLaBJZ9tMtFKVWrGn1yN8HN4YRt2x8RewXrbif2YGDVIeA+HUm/v3RXi7V5/E2UUipBGm1yN8HtmOxR4P8FxAZYMGn3YPGcGX8j8SQyAghu1T50pVRSNN7kvnNEZFjewJ6p7HLvwtjaI45DwuuYEPgXYYLZ4F8Qnks0Ydzg/EcC96eUUpXXKJO7CayGwEogUGZJMabgdcQxPlIVczmY/MjAX74qHMFBuPum5KBrUuK5E6z7IJ4h1X4NSilVE40yuRPcRnRijwgsx5gQZscwCG2l6tPcWcCSAc3fg+KvwyWTrtOQ4AZM4UQI5YLrZMRzESLuGr4QpZSqnkaZ3I21LXGTe6gA/EvB5FK1xG4J/9h7I00eDg8FkHLRnsX2LojruOoHrZRSCdQok7tgMNiImeBNLiaUR7gbpfJ7pPkXiLUFYklJUJRKKVV7EjUTU/1ibQ2kxlkYDI+9bqpQ8mjrhsXeXhO7UqrBaJTJXcQK7jPiLA2C7ztIGwO42PMW2Il9Nu9E0v9TG2EqpVStabDdMiZUgCn6CALLwdYJcQ9BLOm7l4tzAKboPaCozJYWwI0l5QKMozumcBIEtyOukzGOPrDzaghtDq9ngpB2B+LoU4evTCmlaq7GyV1E1gF5hAc3Dxhj+olIM2Ay0IHwhB3nGWN21vRYu5jgFsz2f0Ioj3DydmPyn4XmUxBbx/BKzqNBJMY1UwfiOTscu70n0qTnntcCmBafQeC38F2t9oO1K0Yp1SAlqlvmWGNMrxITtd4BfG2M6Qx8HXmeMCZvHIS2s+esvCh8oTTnrt3riLiRjBfCd51KSuTuUwek3YDYu8fdt4gg9m6I81BN7EqpBqu2umXOAv4RefwmMBO4PWF7935D9CxIJny3qfHvHkNGnIdB5pxwH7spBseRiDUzYWEopVR9lYjkboAvRcQALxljJgCtSkySvRloVXYjERkODAdo165dFQ8ZL+xILXrJ41g84Dq1ivtXSqmGLRHdMkcaY/oApwLXisjRJRcaYwwxer6NMROMMf2MMf0yM6t4Nu0eTHgIgJJs4DwhXCmjlFJ7uRond2PMxsjvrcA0YACwRURaA0R+b63pcUqS1H+D/aBIP7or3Kdu7YA0GZvIwyilVINVo24ZEUkBLMaYvMjjk4D7gOnApcC4yO8PaxpoqeNaUqDZZPD/HB4gzNoeHAMRaZRl+0opVWU17XNvBUyLjFluA94xxnwuIguAKSIyDPgTOK+Gx4kiIuDoE/5RSilVSo2SuzHmD+CQGO3bgeNrsm+llFLVp/0YSinVCGlyV0qpRkiTu1JKNUKa3JVSqhGS8D1GSQ5CJItwVU08L
YBtdRROTWiciaVxJpbGmVj1Ic72xpiYd4HWi+ReERFZWGJQsnpL40wsjTOxNM7Equ9xareMUko1QprclVKqEWooyX1CsgOoJI0zsTTOxNI4E6tex9kg+tyVUkpVTUM5c1dKKVUFmtyVUqoRqpfJXUSsIvKziHwced5RROaJyGoRmSwiZWfqSEaMGSLynoj8JiK/ishhItJMRGaIyKrI76b1IM6bRGS5iCwTkUki4qoP76eIvCYiW0VkWYm2mO+fhD0diXepiNTZUKBx4nw08u++VESmiUhGiWWjI3H+LiIn11Wc8WItsexmETEi0iLyvF69p5H2UZH3dbmIPFKiPSnvaZx/+14iMldEFovIQhEZEGlP2vsZlzGm3v0A/wbeAT6OPJ8CDI08fhG4uh7E+CZwZeSxA8gAHgHuiLTdATyc5BjbAGsBd4n38bL68H4CRwN9gGUl2mK+f8BpwGeAAIcC85Ic50mALfL44RJxHgQsAZxAR2ANYE1mrJH2tsAXhG8UbFFP39Njga8AZ+R5y2S/p3Hi/BI4tcR7ODPZ72e8n3p35i4i+wGDgFciiBUMtQAAA4hJREFUzwU4DngvssqbwODkRBcmIk0I/8O/CmCM8RljsglPDP5mZLWkxxlhA9wiYgM8wCbqwftpjPke2FGmOd77dxYw0YTNBTJ2zfSVjDiNMV8aYwKRp3OB/UrE+a4xxmuMWQusJjwzWZ2I854CPAncRunpLuvVewpcDYwzxngj6+yavS1p72mcOA2QHnncBPi7RJxJeT/jqXfJHRhP+D9iKPK8OZBd4o9pA+Ez0mTqCGQBr0e6j16JzERV4cTgdcmEp0B8DFhPOKnnAIuof+/nLvHevzbAXyXWq08xX0H4jA3qYZwichaw0RizpMyi+hZrF+CoSHfhdyLSP9Je3+K8EXhURP4i/Lc1OtJe3+KsX8ld5P/bO3vWKoIoDD8HgoFYGURErhAVtfWrCCgo0SJFiI1FIKCClWUKCw0I/gQLib2KghIktR91ggZjRBQDBrwBURsLbSK8Fmdu7kWyJJHoDMt5YGH37hYv786c2TlzuGNDwBdJL3NrWYMufLo2Iekw8ANPI6wgn6tlrTNNOeuz+GC0C9gKDObUtF5K8G8tzGwc+AXcy61lNcysB7gGXM+tZR10Ab14SuMKvpOb5ZW0KpeBMUm7gTHS7L1EigruwHFg2MwWgQd4+uAmPsVp7RrVAJbyyFuhCTQlTafrR3iw/6cbg/8FZ4CPkr5KWgYmcY9L87NFlX9LeN64RXbNZnYRGAJG00AE5enchw/sc6lPNYBZM9tJeVqbwGRKa8zgM/ftlKfzAt6PAB7SThGVprOs4C7pqqSGpD5gBHgmaRR4DpxLj236htsbRdJn4JOZHUw/nQbe0t4YHArQiadj+s2sJ30FtXQW5WcHVf5NAedTRUI/8L0jffPfMbNBPHU4LOlnx60pYMTMus1sD7AfmMmhEUDSvKQdkvpSn2oCR1L7LcpT4DG+qIqZHcCLFL5RmKd4jv1kOh8APqTz0vwss1omfQidol0tsxd/oQv4aNldgL5DwAvgNd4wt+HrA0/xF/4E6C1A5w3gHfAGuINXHWT3E7iPrwMs40HnUpV/eAXCLbxSYh44llnnAp5ffZWO2x3Pjyed70lVFTm1/nF/kXa1TGmebgHupnY6Cwzk9rRC5wl83WoOmAaO5vaz6oi/HwiCIKghRaVlgiAIgs0hgnsQBEENieAeBEFQQyK4B0EQ1JAI7kEQBDUkgnsQBEENieAeBEFQQ34DUykgwbJN5kQAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dyq2jHY0tkpJ", + "outputId": "93e5c466-1915-409f-f539-fc0ec44454b6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "plt.scatter(x=df['smoothness_mean'], y=df['smoothness_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "N5UYIYrM67NY", + "outputId": "1e043c1b-c513-4adc-fe07-feeebf98ef59", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "plt.scatter(x=df['compactness_mean'], y=df['compactness_worst'], c=df['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ghqYSxrP_FE" + }, + "source": [ + "## Check you work: \n", + "\n", + "This is something that in a truly unsupervised learning situation **WOULD NOT BE POSSIBLE**. But for educational purposes go back and grab the true diagnosis column (label) from the original dataset. Take your cluster labels and compare them to the original diagnosis column. You can make scatterplots for each to see how they compare or you can calculate a percent accuracy score like: \n", + "\\begin{align}\n", + "\\frac{\\text{Num Correct Labels}}{\\text{Num Total Observations}}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OIG7-yGLP-eA", + "outputId": "24e91b4b-8be6-46b9-e701-1d40bb0b4dc7", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + } + }, + "source": [ + "df['diagnosis'] = df_original['diagnosis']\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
radius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanradius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstclustersdiagnosis
017.9910.38122.801001.00.118400.2776025.3817.33184.602019.00.16220.66560M
120.5717.77132.901326.00.084740.0786424.9923.41158.801956.00.12380.18660M
219.6921.25130.001203.00.109600.1599023.5725.53152.501709.00.14440.42450M
311.4220.3877.58386.10.142500.2839014.9126.5098.87567.70.20980.86631M
420.2914.34135.101297.00.100300.1328022.5416.67152.201575.00.13740.20500M
\n", + "
" + ], + "text/plain": [ + " radius_mean texture_mean ... clusters diagnosis\n", + "0 17.99 10.38 ... 0 M\n", + "1 20.57 17.77 ... 0 M\n", + "2 19.69 21.25 ... 0 M\n", + "3 11.42 20.38 ... 1 M\n", + "4 20.29 14.34 ... 0 M\n", + "\n", + "[5 rows x 14 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 265 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gCATuBE5Jfgk", + "outputId": "83e3c2b4-c0c8-4e79-ce05-a4dc88133b5f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "df['diagnosis'].value_counts()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "B 357\n", + "M 212\n", + "Name: diagnosis, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 266 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TIwU2tnDKQg0", + "outputId": "128920b1-e642-417c-c027-c8e31af74319", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 169 + } + }, + "source": [ + "(pd.crosstab(index=df['clusters'],\n", + " columns=df['diagnosis'],\n", + " normalize=True, margins=True)) * 100" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisBMAll
clusters
00.17574722.67135322.8471
162.56590514.58699577.1529
All62.74165237.258348100.0000
\n", + "
" + ], + "text/plain": [ + "diagnosis B M All\n", + "clusters \n", + "0 0.175747 22.671353 22.8471\n", + "1 62.565905 14.586995 77.1529\n", + "All 62.741652 37.258348 100.0000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 284 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BedOTS0eJ9_K" + }, + "source": [ + "# 2) Perform PCA on your dataset first and *then* use k-means clustering. \n", + "\n", + "- You need to standardize your data before PCA.\n", + "- First try clustering just on PC1 and PC2 so that you can make a scatterplot of your clustering.\n", + "- Then use use a scree plot to decide how many principal components to include in your clustering, and use however many principal components you need in order to retain 90% of the variation of the original dataset\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dW1AeAK8PNah", + "outputId": "a8a10ac5-aa89-4d44-f8b1-83555977c275", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 217 + } + }, + "source": [ + "# Standardize the data\n", + "principle_components = df.drop('diagnosis', axis=1)\n", + "scaler = StandardScaler()\n", + "pca = PCA(n_components=0.9)\n", + "Z = scaler.fit_transform(principle_components)\n", + "principle_components = pd.DataFrame(pca.fit_transform(Z))\n", + "\n", + "print(principle_components.shape)\n", + "principle_components.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(569, 4)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
05.0695692.399980-3.4321811.555644
13.747829-2.669786-1.423654-0.560089
24.4239040.135107-0.756282-0.089172
31.3433766.9033620.1784111.265439
43.438885-1.149978-2.827432-0.460856
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + "0 5.069569 2.399980 -3.432181 1.555644\n", + "1 3.747829 -2.669786 -1.423654 -0.560089\n", + "2 4.423904 0.135107 -0.756282 -0.089172\n", + "3 1.343376 6.903362 0.178411 1.265439\n", + "4 3.438885 -1.149978 -2.827432 -0.460856" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 268 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Yo-6WlnFm10p", + "outputId": "67bc147a-0b3f-4443-eb29-6ecba171a5c0", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 199 + } + }, + "source": [ + "# Rename principle component columns\n", + "pc_cols = []\n", + "for i in principle_components.columns:\n", + " pc_cols.append(f\"PC{i+1}\")\n", + "\n", + "principle_components.columns = pc_cols\n", + "principle_components.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PC1PC2PC3PC4
05.0695692.399980-3.4321811.555644
13.747829-2.669786-1.423654-0.560089
24.4239040.135107-0.756282-0.089172
31.3433766.9033620.1784111.265439
43.438885-1.149978-2.827432-0.460856
\n", + "
" + ], + "text/plain": [ + " PC1 PC2 PC3 PC4\n", + "0 5.069569 2.399980 -3.432181 1.555644\n", + "1 3.747829 -2.669786 -1.423654 -0.560089\n", + "2 4.423904 0.135107 -0.756282 -0.089172\n", + "3 1.343376 6.903362 0.178411 1.265439\n", + "4 3.438885 -1.149978 -2.827432 -0.460856" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 269 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RM7MFu70wZLY", + "outputId": "f3ea05be-7532-4208-db6e-127e0bf03aeb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 290 + } + }, + "source": [ + "principle_components.describe()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PC1PC2PC3PC4
count5.690000e+025.690000e+025.690000e+025.690000e+02
mean-7.024258e-17-1.139491e-16-4.760886e-173.258475e-17
std2.778672e+001.519665e+001.284437e+007.701293e-01
min-4.260721e+00-3.246944e+00-3.432181e+00-1.693007e+00
25%-2.054315e+00-1.073269e+00-8.626120e-01-5.033166e-01
50%-1.008188e+00-2.037317e-01-1.121198e-01-7.409345e-02
75%1.625827e+009.362490e-017.179154e-014.322451e-01
max1.116828e+016.903362e+004.707096e+003.152067e+00
\n", + "
" + ], + "text/plain": [ + " PC1 PC2 PC3 PC4\n", + "count 5.690000e+02 5.690000e+02 5.690000e+02 5.690000e+02\n", + "mean -7.024258e-17 -1.139491e-16 -4.760886e-17 3.258475e-17\n", + "std 2.778672e+00 1.519665e+00 1.284437e+00 7.701293e-01\n", + "min -4.260721e+00 -3.246944e+00 -3.432181e+00 -1.693007e+00\n", + "25% -2.054315e+00 -1.073269e+00 -8.626120e-01 -5.033166e-01\n", + "50% -1.008188e+00 -2.037317e-01 -1.121198e-01 -7.409345e-02\n", + "75% 1.625827e+00 9.362490e-01 7.179154e-01 4.322451e-01\n", + "max 1.116828e+01 6.903362e+00 4.707096e+00 3.152067e+00" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 270 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bGxXcvPxxCFf", + "outputId": "be89c6b6-1cc0-4d05-fc7b-d858ceaf0ba5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "print(f\"\"\"\n", + "Variance percentages: {pca.explained_variance_ratio_ *100}\n", + "\"\"\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "Variance percentages: [59.28804874 17.7332576 12.66829158 4.55428267]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "N4gc7OWqxTHd", + "outputId": "49c56380-ce0f-42e3-8aba-0488da14b012", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 316 + } + }, + "source": [ + "# Perform kmeans clustering of the principle components, just the first two\n", + "first_two_pcs = principle_components[['PC1', 'PC2']]\n", + "\n", + "# Find best value of k\n", + "K = range(1, 11)\n", + "sum_of_squared_distances = []\n", + "for k in K:\n", + " km = KMeans(n_clusters=k)\n", + " km = km.fit(first_two_pcs)\n", + " sum_of_squared_distances.append(km.inertia_)\n", + "\n", + "# Print the inertia values\n", + "print(sum_of_squared_distances)\n", + "\n", + "# Plot the inertia values\n", + "plt.plot(K, sum_of_squared_distances, 'bx-')\n", + "plt.xticks(ticks=range(0, 11))\n", + 
"plt.yticks(ticks=range(0, 6))\n", + "plt.xlabel('Number of Clusters (K)')\n", + "plt.ylabel('Sum of Squared Distances (in thousands)')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[5697.266029995058, 2423.676286604752, 1713.8943983617223, 1375.3921122569516, 1114.4039466094957, 936.8421262303988, 832.7017907802748, 739.0867598940645, 675.184734885664, 610.2855450466519]\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Rw-kjyyuuD0o" + }, + "source": [ + "# Audio graph\n", + "# Uncomment the line below to play the audio graph\n", + "#SonifyTool(x=K, y=sum_of_squared_distances, duration_args=duration_args).play()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xKkW1XX6OtNt", + "outputId": "07c6e9c2-a487-44a1-8d58-70c1f3422237", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "# Perform kmeans clustering\n", + "kmeans = KMeans(n_clusters=2)\n", + "kmeans.fit(first_two_pcs)\n", + "principle_components['clusters'] = kmeans.labels_\n", + "principle_components['clusters'].value_counts()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 413\n", + "0 156\n", + "Name: clusters, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 273 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "W-un6yCp98ZA", + "outputId": "3f39633f-ad65-4239-e942-969e74c8b60f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "# Perform kmeans clustering on all components\n", + "kmeans = KMeans(n_clusters=2)\n", + "kmeans.fit(principle_components)\n", + "principle_components['clusters'] = kmeans.labels_\n", + "principle_components['clusters'].value_counts()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 413\n", + "0 156\n", + "Name: clusters, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 274 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z92hsuqBDsqN", + "outputId": "6ced3a8e-effe-4df0-9337-7c796944e4eb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 169 + } + }, + "source": [ + 
"principle_components['diagnosis'] = df_original['diagnosis']\n", + "\n", + "# Crosstab of clusters and diagnosis\n", + "(pd.crosstab(index=principle_components['clusters'],\n", + " columns=principle_components['diagnosis'],\n", + " normalize=True, margins=True)) * 100" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisBMAll
clusters
00.00000027.41652027.41652
162.7416529.84182872.58348
All62.74165237.258348100.00000
\n", + "
" + ], + "text/plain": [ + "diagnosis B M All\n", + "clusters \n", + "0 0.000000 27.416520 27.41652\n", + "1 62.741652 9.841828 72.58348\n", + "All 62.741652 37.258348 100.00000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 275 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BIixS1Nry87O", + "outputId": "bbce4913-0fd2-470d-a412-99fb29d79b93", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 265 + } + }, + "source": [ + "# Visualize the clusters\n", + "plt.scatter(x=principle_components['PC1'], y=principle_components['PC2'],\n", + " c=principle_components['clusters'])\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PkrfbzfBROpP" + }, + "source": [ + "## Check your work: \n", + "\n", + "- Compare your PC1, PC2 clustering scatterplot to the clustering scatterplots you made on the raw data\n", + "- Calculate accuracy scores for both the PC1,PC2 Principal component clustering and the 90% of explained variance clustering.\n", + "\n", + "How do your accuracy scores -when preprocessing the data with PCA- compare to the accuracy when simply clustering on the raw data?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLc95J3ChKkN" + }, + "source": [ + "The crosstabs above show the split between cells labeled B (benign) and cells labeled M (malignant,) as it relates to the clusters that kmeans placed them in.\n", + "\n", + "The first table was produced using the raw data, and shows slightly less accuracy in the kmeans clusters than the second table, which was produced using the principle components. In table 1, about 0.2% of benign cells and about 15% of the malignant cells were placed in the wrong cluster. In table 2, all benign cells were clustered correctly. Only about 10% of malignant cells were misplaced as opposed to 15." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wKBwVaGOOYsq" + }, + "source": [ + "# Stretch Goals:\n", + "\n", + "- Study for the Sprint Challenge\n", + "- Work on your Data Storytelling Project" + ] + } + ] +} \ No newline at end of file