|
26 | 26 | }, |
27 | 27 | { |
28 | 28 | "cell_type": "code", |
29 | | - "execution_count": 0, |
| 29 | + "execution_count": 1, |
30 | 30 | "metadata": { |
31 | 31 | "colab": {}, |
32 | 32 | "colab_type": "code", |
|
53 | 53 | }, |
54 | 54 | { |
55 | 55 | "cell_type": "code", |
56 | | - "execution_count": 0, |
| 56 | + "execution_count": 2, |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [], |
| 59 | + "source": [ |
| 60 | + "path = os.getcwd()\n", |
| 61 | + "path = path + '\\Data'\n", |
| 62 | + "\n", |
| 63 | + "fil = 'sentiment_sentences.txt'" |
| 64 | + ] |
| 65 | + }, |
| 66 | + { |
| 67 | + "cell_type": "code", |
| 68 | + "execution_count": 3, |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [ |
| 71 | + { |
| 72 | + "name": "stdout", |
| 73 | + "output_type": "stream", |
| 74 | + "text": [ |
| 75 | + "File already exists\n" |
| 76 | + ] |
| 77 | + } |
| 78 | + ], |
| 79 | + "source": [ |
| 80 | + "if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n", |
| 81 | + " file = open(os.path.join(path, fil), 'w')\n", |
| 82 | + " file.close()\n", |
| 83 | + " \n", |
| 84 | + " # combined the three files to make sentiment_sentences.txt\n", |
| 85 | + " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", |
| 86 | + "\n", |
| 87 | + " with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n", |
| 88 | + " for fname in filenames:\n", |
| 89 | + " with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n", |
| 90 | + " outfile.write(infile.read())\n", |
| 91 | + " print(\"File created\")\n", |
| 92 | + "else:\n", |
| 93 | + " print(\"File already exists\")" |
| 94 | + ] |
| 95 | + }, |
| 96 | + { |
| 97 | + "cell_type": "code", |
| 98 | + "execution_count": 4, |
57 | 99 | "metadata": { |
58 | 100 | "colab": {}, |
59 | 101 | "colab_type": "code", |
|
66 | 108 | "name": "stdout", |
67 | 109 | "output_type": "stream", |
68 | 110 | "text": [ |
69 | | - "CPU times: user 1min 36s, sys: 4.23 s, total: 1min 40s\n", |
70 | | - "Wall time: 1min 40s\n", |
| 111 | + "Wall time: 15.7 s\n", |
71 | 112 | "done loading Word2Vec\n" |
72 | 113 | ] |
73 | 114 | } |
74 | 115 | ], |
75 | 116 | "source": [ |
76 | 117 | "#Load the pre-trained word2vec model and the dataset\n", |
77 | | - "data_path= \"DATAPATH\"\n", |
78 | | - "path_to_model = os.path.join(data_path,'GoogleNews-vectors-negative300.bin')\n", |
79 | | - "training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", |
| 118 | + "try:\n", |
| 119 | + " from google.colab import files\n", |
| 120 | + " data_path= \"DATAPATH\" \n", |
| 121 | + " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", |
| 122 | + " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", |
| 123 | + "except ModuleNotFoundError:\n", |
| 124 | + " data_path= \"Data\" \n", |
| 125 | + " \n", |
| 126 | + " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", |
| 127 | + " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", |
| 128 | + " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", |
| 129 | + " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", |
| 130 | + " else:\n", |
| 131 | + " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", |
| 132 | + " \n", |
| 133 | + " else:\n", |
| 134 | + " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", |
| 135 | + " \n", |
| 136 | + " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", |
| 137 | + "\n", |
| 138 | + "\n", |
80 | 139 | "\n", |
81 | 140 | "#Load W2V model. This will take some time. \n", |
82 | 141 | "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", |
|
90 | 149 | "for line in fh:\n", |
91 | 150 | " text, sentiment = line.split(\"\\t\")\n", |
92 | 151 | " texts.append(text)\n", |
93 | | - " cats.append(sentiment)\n" |
| 152 | + " cats.append(sentiment)" |
94 | 153 | ] |
95 | 154 | }, |
96 | 155 | { |
97 | 156 | "cell_type": "code", |
98 | | - "execution_count": 0, |
| 157 | + "execution_count": 5, |
99 | 158 | "metadata": { |
100 | 159 | "colab": {}, |
101 | 160 | "colab_type": "code", |
|
120 | 179 | }, |
121 | 180 | { |
122 | 181 | "cell_type": "code", |
123 | | - "execution_count": 0, |
| 182 | + "execution_count": 6, |
124 | 183 | "metadata": { |
125 | 184 | "colab": {}, |
126 | 185 | "colab_type": "code", |
|
148 | 207 | }, |
149 | 208 | { |
150 | 209 | "cell_type": "code", |
151 | | - "execution_count": 0, |
| 210 | + "execution_count": 7, |
152 | 211 | "metadata": { |
153 | 212 | "colab": {}, |
154 | 213 | "colab_type": "code", |
|
186 | 245 | }, |
187 | 246 | { |
188 | 247 | "cell_type": "code", |
189 | | - "execution_count": 0, |
| 248 | + "execution_count": 8, |
190 | 249 | "metadata": { |
191 | 250 | "colab": {}, |
192 | 251 | "colab_type": "code", |
|
215 | 274 | " if token in w2v_model:\n", |
216 | 275 | " feat_for_this += w2v_model[token]\n", |
217 | 276 | " count_for_this +=1\n", |
218 | | - " feats.append(feat_for_this/count_for_this) \n", |
| 277 | + " if(count_for_this!=0):\n", |
| 278 | + " feats.append(feat_for_this/count_for_this) \n", |
| 279 | + " else:\n", |
| 280 | + " feats.append(zero_vector)\n", |
219 | 281 | " return feats\n", |
220 | 282 | "\n", |
221 | 283 | "\n", |
222 | 284 | "train_vectors = embedding_feats(texts_processed)\n", |
223 | | - "print(len(train_vectors))\n" |
| 285 | + "print(len(train_vectors))" |
224 | 286 | ] |
225 | 287 | }, |
226 | 288 | { |
227 | 289 | "cell_type": "code", |
228 | | - "execution_count": 0, |
| 290 | + "execution_count": 9, |
229 | 291 | "metadata": { |
230 | 292 | "colab": {}, |
231 | 293 | "colab_type": "code", |
|
237 | 299 | "name": "stdout", |
238 | 300 | "output_type": "stream", |
239 | 301 | "text": [ |
240 | | - "Accuracy: 0.812\n", |
241 | | - " precision recall f1-score support\n", |
| 302 | + "Accuracy: 0.8173333333333334\n", |
| 303 | + " precision recall f1-score support\n", |
242 | 304 | "\n", |
243 | | - " 0\n", |
244 | | - " 0.82 0.80 0.81 374\n", |
245 | | - " 1\n", |
246 | | - " 0.80 0.83 0.82 376\n", |
| 305 | + " 0\n", |
| 306 | + " 0.79 0.82 0.81 350\n", |
| 307 | + " 1\n", |
| 308 | + " 0.84 0.81 0.83 400\n", |
247 | 309 | "\n", |
248 | | - "avg / total 0.81 0.81 0.81 750\n", |
| 310 | + " accuracy 0.82 750\n", |
| 311 | + " macro avg 0.82 0.82 0.82 750\n", |
| 312 | + "weighted avg 0.82 0.82 0.82 750\n", |
249 | 313 | "\n" |
250 | 314 | ] |
251 | 315 | } |
|
269 | 333 | "source": [ |
270 | 334 | "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!" |
271 | 335 | ] |
272 | | - }, |
273 | | - { |
274 | | - "cell_type": "code", |
275 | | - "execution_count": 0, |
276 | | - "metadata": { |
277 | | - "colab": {}, |
278 | | - "colab_type": "code", |
279 | | - "id": "8z3tJJJkb_JB" |
280 | | - }, |
281 | | - "outputs": [], |
282 | | - "source": [] |
283 | 336 | } |
284 | 337 | ], |
285 | 338 | "metadata": { |
|
302 | 355 | "name": "python", |
303 | 356 | "nbconvert_exporter": "python", |
304 | 357 | "pygments_lexer": "ipython3", |
305 | | - "version": "3.6.10" |
| 358 | + "version": "3.7.0" |
306 | 359 | } |
307 | 360 | }, |
308 | 361 | "nbformat": 4, |
|