|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 11, |
| 6 | + "metadata": { |
| 7 | + "collapsed": true |
| 8 | + }, |
| 9 | + "outputs": [], |
| 10 | + "source": [ |
| 11 | + "import pandas as pd\n", |
| 12 | + "import logging\n", |
| 13 | + "import glob\n", |
| 14 | + "from sklearn.model_selection import train_test_split\n", |
| 15 | + "pd.options.display.max_colwidth = 500  # let DataFrame previews show up to 500 characters per cell\n", |
| 16 | + "logger = logging.getLogger()  # no name given, so this is the root logger\n", |
| 17 | + "logger.setLevel(logging.WARNING)  # root logger threshold: WARNING" |
| 18 | + ] |
| 19 | + }, |
| 20 | + { |
| 21 | + "cell_type": "markdown", |
| 22 | + "metadata": {}, |
| 23 | + "source": [ |
| 24 | + "## 1. Process Data" |
| 25 | + ] |
| 26 | + }, |
| 27 | + { |
| 28 | + "cell_type": "markdown", |
| 29 | + "metadata": {}, |
| 30 | + "source": [ |
| 31 | + "List the working directory to see the CSV files extracted from BigQuery." |
| 32 | + ] |
| 33 | + }, |
| 34 | + { |
| 35 | + "cell_type": "code", |
| 36 | + "execution_count": 3, |
| 37 | + "metadata": {}, |
| 38 | + "outputs": [ |
| 39 | + { |
| 40 | + "name": "stdout", |
| 41 | + "output_type": "stream", |
| 42 | + "text": [ |
| 43 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:58 results000000000000.csv\r\n", |
| 44 | + "-rw-r--r-- 1 40294 40294 280M Jan 14 00:59 results000000000001.csv\r\n", |
| 45 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000002.csv\r\n", |
| 46 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000003.csv\r\n", |
| 47 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000004.csv\r\n", |
| 48 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000005.csv\r\n", |
| 49 | + "-rw-r--r-- 1 40294 40294 282M Jan 14 00:59 results000000000006.csv\r\n", |
| 50 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000007.csv\r\n", |
| 51 | + "-rw-r--r-- 1 40294 40294 280M Jan 14 00:59 results000000000008.csv\r\n", |
| 52 | + "-rw-r--r-- 1 40294 40294 281M Jan 14 00:59 results000000000009.csv\r\n" |
| 53 | + ] |
| 54 | + } |
| 55 | + ], |
| 56 | + "source": [ |
| 57 | + "!ls -lah | grep results" |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "markdown", |
| 62 | + "metadata": {}, |
| 63 | + "source": [ |
| 64 | + "Split the data into train and test sets, then preview the training data." |
| 65 | + ] |
| 66 | + }, |
| 67 | + { |
| 68 | + "cell_type": "code", |
| 69 | + "execution_count": 14, |
| 70 | + "metadata": {}, |
| 71 | + "outputs": [ |
| 72 | + { |
| 73 | + "name": "stdout", |
| 74 | + "output_type": "stream", |
| 75 | + "text": [ |
| 76 | + "Train: 4,297,104 rows 3 columns\n", |
| 77 | + "Test: 1,074,277 rows 3 columns\n" |
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "data": { |
| 82 | + "text/html": [ |
| 83 | + "<div>\n", |
| 84 | + "<style scoped>\n", |
| 85 | + " .dataframe tbody tr th:only-of-type {\n", |
| 86 | + " vertical-align: middle;\n", |
| 87 | + " }\n", |
| 88 | + "\n", |
| 89 | + " .dataframe tbody tr th {\n", |
| 90 | + " vertical-align: top;\n", |
| 91 | + " }\n", |
| 92 | + "\n", |
| 93 | + " .dataframe thead th {\n", |
| 94 | + " text-align: right;\n", |
| 95 | + " }\n", |
| 96 | + "</style>\n", |
| 97 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 98 | + " <thead>\n", |
| 99 | + " <tr style=\"text-align: right;\">\n", |
| 100 | + " <th></th>\n", |
| 101 | + " <th>issue_url</th>\n", |
| 102 | + " <th>issue_title</th>\n", |
| 103 | + " <th>body</th>\n", |
| 104 | + " </tr>\n", |
| 105 | + " </thead>\n", |
| 106 | + " <tbody>\n", |
| 107 | + " <tr>\n", |
| 108 | + " <th>222500</th>\n", |
| 109 | + " <td>\"https://github.com/zenoss/ZenPacks.zenoss.Microsoft.Windows/issues/845\"</td>\n", |
| 110 | + " <td>\"zwinrmkrb5includedir is not honored\"</td>\n", |
| 111 | + " <td>\"i am seeing a persistent issue where setting zwinrmkrb5includedir does not result in correct behavior. if i manually create a krb5.conf file with rdns set to flase, i can successfully model a device, but on subsequent runs modeling fails and the setting disappears. is there a secret to preventing reverse dns lookups?\"</td>\n", |
| 112 | + " </tr>\n", |
| 113 | + " <tr>\n", |
| 114 | + " <th>264960</th>\n", |
| 115 | + " <td>\"https://github.com/uf-feedback/vertpaleo/issues/38\"</td>\n", |
| 116 | + " <td>\"portal usage statistics are almost back\"</td>\n", |
| 117 | + " <td>\"thanks to the financial support of the museum of vertebrate zoology at berkeley, we have fixed the issues that were preventing us from logging the vertnet statistics of data use. usage statistics are being collected once again. we are now working on the reporting and visualization of those stats, so that we can bring those back to the natural history collections community in a friendly, useful modality. we expect all of this to be up and running before the end of the year. we apologize for ...</td>\n", |
| 118 | + " </tr>\n", |
| 119 | + " <tr>\n", |
| 120 | + " <th>448395</th>\n", |
| 121 | + " <td>\"https://github.com/airesvsg/wp-rest-api-cache/issues/10\"</td>\n", |
| 122 | + " <td>\"uncaught exception: serialization of 'closure' is not allowed\"</td>\n", |
| 123 | + " <td>\"i'm getting this kind of error just by installing & enabling the plugin: 21-mar-2017 08:46:03 utc php fatal error: uncaught exception: serialization of 'closure' is not allowed in .../wordpress/wp-includes/functions.php:435 stack trace: 0 ... /wordpress/wp-includes/functions.php 435 : serialize object wp_rest_response 1 ... /wordpress/wp-includes/option.php 427 : maybe_serialize object wp_rest_response 2 ... /wordpress/wp-includes/option.php 730 : add_option '_transient_rest...', object wp_...</td>\n", |
| 124 | + " </tr>\n", |
| 125 | + " </tbody>\n", |
| 126 | + "</table>\n", |
| 127 | + "</div>" |
| 128 | + ], |
| 129 | + "text/plain": [ |
| 130 | + " issue_url \\\n", |
| 131 | + "222500 \"https://github.com/zenoss/ZenPacks.zenoss.Microsoft.Windows/issues/845\" \n", |
| 132 | + "264960 \"https://github.com/uf-feedback/vertpaleo/issues/38\" \n", |
| 133 | + "448395 \"https://github.com/airesvsg/wp-rest-api-cache/issues/10\" \n", |
| 134 | + "\n", |
| 135 | + " issue_title \\\n", |
| 136 | + "222500 \"zwinrmkrb5includedir is not honored\" \n", |
| 137 | + "264960 \"portal usage statistics are almost back\" \n", |
| 138 | + "448395 \"uncaught exception: serialization of 'closure' is not allowed\" \n", |
| 139 | + "\n", |
| 140 | + " body \n", |
| 141 | + "222500 \"i am seeing a persistent issue where setting zwinrmkrb5includedir does not result in correct behavior. if i manually create a krb5.conf file with rdns set to flase, i can successfully model a device, but on subsequent runs modeling fails and the setting disappears. is there a secret to preventing reverse dns lookups?\" \n", |
| 142 | + "264960 \"thanks to the financial support of the museum of vertebrate zoology at berkeley, we have fixed the issues that were preventing us from logging the vertnet statistics of data use. usage statistics are being collected once again. we are now working on the reporting and visualization of those stats, so that we can bring those back to the natural history collections community in a friendly, useful modality. we expect all of this to be up and running before the end of the year. we apologize for ... \n", |
| 143 | + "448395 \"i'm getting this kind of error just by installing & enabling the plugin: 21-mar-2017 08:46:03 utc php fatal error: uncaught exception: serialization of 'closure' is not allowed in .../wordpress/wp-includes/functions.php:435 stack trace: 0 ... /wordpress/wp-includes/functions.php 435 : serialize object wp_rest_response 1 ... /wordpress/wp-includes/option.php 427 : maybe_serialize object wp_rest_response 2 ... /wordpress/wp-includes/option.php 730 : add_option '_transient_rest...', object wp_... " |
| 144 | + ] |
| 145 | + }, |
| 146 | + "execution_count": 14, |
| 147 | + "metadata": {}, |
| 148 | + "output_type": "execute_result" |
| 149 | + } |
| 150 | + ], |
| 151 | + "source": [ |
| 152 | + "# Load only the results*.csv export shards, in sorted order so the combined frame is the same on every run.\n", |
| 153 | + "traindf, testdf = train_test_split(\n", |
| 154 | + "    pd.concat(\n", |
| 155 | + "        [pd.read_csv(f) for f in sorted(glob.glob('results*.csv'))],\n", |
| 156 | + "        ignore_index=True),  # each shard's index restarts at 0; renumber so row labels are unique\n", |
| 157 | + "    test_size=.20, random_state=42)  # fixed seed keeps the 80/20 split reproducible\n", |
| 158 | + "\n", |
| 159 | + "\n", |
| 160 | + "# print row and column counts for each split\n", |
| 161 | + "print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')\n", |
| 162 | + "print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')\n", |
| 163 | + "\n", |
| 164 | + "# preview the first three training rows\n", |
| 165 | + "traindf.head(3)" |
| 166 | + ] |
| 167 | + }, |
| 168 | + { |
| 169 | + "cell_type": "markdown", |
| 170 | + "metadata": {}, |
| 171 | + "source": [ |
| 172 | + "Convert the training issue bodies and titles to lists in preparation for modeling" |
| 173 | + ] |
| 174 | + }, |
| 175 | + { |
| 176 | + "cell_type": "code", |
| 177 | + "execution_count": 15, |
| 178 | + "metadata": {}, |
| 179 | + "outputs": [ |
| 180 | + { |
| 181 | + "data": { |
| 182 | + "text/plain": [ |
| 183 | + "'\"i am seeing a persistent issue where setting zwinrmkrb5includedir does not result in correct behavior. if i manually create a krb5.conf file with rdns set to flase, i can successfully model a device, but on subsequent runs modeling fails and the setting disappears. is there a secret to preventing reverse dns lookups?\"'" |
| 184 | + ] |
| 185 | + }, |
| 186 | + "execution_count": 15, |
| 187 | + "metadata": {}, |
| 188 | + "output_type": "execute_result" |
| 189 | + } |
| 190 | + ], |
| 191 | + "source": [ |
| 192 | + "train_body_raw = traindf['body'].tolist()  # issue bodies of the training split as a plain list\n", |
| 193 | + "train_title_raw = traindf['issue_title'].tolist()  # issue titles, in the same row order\n", |
| 194 | + "train_body_raw[0]  # display the first body as a spot check" |
| 195 | + ] |
| 196 | + }, |
| 197 | + { |
| 198 | + "cell_type": "code", |
| 199 | + "execution_count": null, |
| 200 | + "metadata": { |
| 201 | + "collapsed": true |
| 202 | + }, |
| 203 | + "outputs": [], |
| 204 | + "source": [] |
| 205 | + } |
| 206 | + ], |
| 207 | + "metadata": { |
| 208 | + "kernelspec": { |
| 209 | + "display_name": "Python 3", |
| 210 | + "language": "python", |
| 211 | + "name": "python3" |
| 212 | + }, |
| 213 | + "language_info": { |
| 214 | + "codemirror_mode": { |
| 215 | + "name": "ipython", |
| 216 | + "version": 3 |
| 217 | + }, |
| 218 | + "file_extension": ".py", |
| 219 | + "mimetype": "text/x-python", |
| 220 | + "name": "python", |
| 221 | + "nbconvert_exporter": "python", |
| 222 | + "pygments_lexer": "ipython3", |
| 223 | + "version": "3.6.2" |
| 224 | + } |
| 225 | + }, |
| 226 | + "nbformat": 4, |
| 227 | + "nbformat_minor": 2 |
| 228 | +} |
0 commit comments