|
16 | 16 | "execution_count": 1, |
17 | 17 | "metadata": { |
18 | 18 | "ExecuteTime": { |
19 | | - "end_time": "2021-04-03T08:59:14.632353Z", |
20 | | - "start_time": "2021-04-03T08:59:14.188951Z" |
| 19 | + "end_time": "2021-04-05T21:26:40.863650Z", |
| 20 | + "start_time": "2021-04-05T21:26:40.339123Z" |
21 | 21 | }, |
22 | 22 | "colab": {}, |
23 | 23 | "colab_type": "code", |
|
35 | 35 | "execution_count": 2, |
36 | 36 | "metadata": { |
37 | 37 | "ExecuteTime": { |
38 | | - "end_time": "2021-04-03T08:59:14.664303Z", |
39 | | - "start_time": "2021-04-03T08:59:14.633352Z" |
| 38 | + "end_time": "2021-04-05T21:26:40.894143Z", |
| 39 | + "start_time": "2021-04-05T21:26:40.865114Z" |
40 | 40 | }, |
41 | 41 | "colab": {}, |
42 | 42 | "colab_type": "code", |
|
70 | 70 | "execution_count": 3, |
71 | 71 | "metadata": { |
72 | 72 | "ExecuteTime": { |
73 | | - "end_time": "2021-04-03T08:59:14.679318Z", |
74 | | - "start_time": "2021-04-03T08:59:14.665306Z" |
| 73 | + "end_time": "2021-04-05T21:26:56.724662Z", |
| 74 | + "start_time": "2021-04-05T21:26:56.712651Z" |
75 | 75 | }, |
76 | 76 | "colab": { |
77 | 77 | "base_uri": "https://localhost:8080/", |
|
88 | 88 | "text": [ |
89 | 89 | "Word2Vec(vocab=6, size=100, alpha=0.025)\n", |
90 | 90 | "['dog', 'bites', 'man', 'eats', 'meat', 'food']\n", |
91 | | - "[-1.1890394e-04 -2.7612262e-04 3.0016506e-03 3.3397041e-03\n", |
92 | | - " 2.6973987e-03 2.5381467e-03 -4.4830954e-03 -3.8807455e-03\n", |
93 | | - " -2.7481976e-03 -3.5091466e-03 -1.0663099e-03 3.8600836e-03\n", |
94 | | - " -1.8223912e-03 -1.8985753e-03 2.5068773e-03 5.8603484e-05\n", |
95 | | - " -1.8388843e-03 3.4894156e-03 -1.9410843e-03 1.9663016e-03\n", |
96 | | - " -1.9262581e-04 -1.8321032e-04 4.6618818e-03 2.0332152e-03\n", |
97 | | - " -5.5621512e-04 -5.0049595e-04 4.4950778e-03 -2.3708560e-03\n", |
98 | | - " -4.1250056e-03 -8.1713696e-04 -1.5846886e-03 2.6569276e-03\n", |
99 | | - " -2.4425923e-03 3.3881937e-03 4.8663849e-03 -3.1806210e-03\n", |
100 | | - " 6.0354080e-04 2.6283797e-03 3.2367259e-03 -4.4542220e-03\n", |
101 | | - " -4.3623694e-03 -4.9372590e-03 3.1183651e-03 2.6437298e-03\n", |
102 | | - " -3.1073038e-03 7.5010926e-04 3.5182503e-03 -2.6689377e-03\n", |
103 | | - " 4.2944783e-03 1.2430353e-03 2.1388694e-03 1.5726388e-03\n", |
104 | | - " -3.4201301e-03 -3.5607379e-03 3.4647183e-03 -9.6110179e-04\n", |
105 | | - " -2.5040556e-03 -9.6717122e-04 1.0441509e-03 -3.4992509e-03\n", |
106 | | - " -9.8467432e-04 2.5085383e-03 3.4381317e-03 -8.5586461e-04\n", |
107 | | - " -4.3379996e-04 2.0993554e-03 -3.3381197e-03 3.6710135e-03\n", |
108 | | - " 2.4826424e-03 7.7588746e-04 -3.6549675e-03 2.5771847e-03\n", |
109 | | - " -3.9825556e-03 -6.0248183e-04 -5.7223073e-04 -1.7433831e-03\n", |
110 | | - " -1.0604414e-03 -2.1816064e-03 -4.6085631e-03 2.3315020e-03\n", |
111 | | - " 2.3816996e-03 1.9949675e-03 -4.0842607e-03 -2.8094815e-04\n", |
112 | | - " -4.2685810e-03 -1.3998528e-03 1.7278946e-03 -2.2190765e-03\n", |
113 | | - " -2.3720833e-04 -4.0732473e-03 -5.0638389e-04 -2.4232429e-03\n", |
114 | | - " -1.9645202e-03 -2.8262585e-03 7.5944123e-04 1.1781134e-03\n", |
115 | | - " 4.9539114e-04 -1.1337005e-03 -3.3781745e-03 1.0580849e-03]\n" |
| 91 | + "[-3.1667745e-03 2.5268614e-03 -4.9504861e-03 2.3797194e-03\n", |
| 92 | + " -3.3511904e-03 1.7659335e-03 -9.6838089e-04 3.6862001e-03\n", |
| 93 | + " 3.3760078e-03 -1.1944126e-03 -4.7475514e-03 -4.6677454e-03\n", |
| 94 | + " 4.7231275e-03 2.1875298e-03 4.9989321e-03 -4.7024325e-04\n", |
| 95 | + " 4.6936749e-03 4.5417100e-03 -4.8383311e-03 4.5522186e-03\n", |
| 96 | + " 9.4010920e-04 -2.8778350e-03 -2.3938445e-03 7.6240452e-04\n", |
| 97 | + " 2.8537741e-05 -1.0585956e-03 1.5203804e-03 1.1994856e-04\n", |
| 98 | + " 4.3881699e-03 3.5755127e-04 1.9964906e-03 -3.3893189e-03\n", |
| 99 | + " 2.5362791e-03 -3.8559963e-03 -4.6814438e-03 -1.0485576e-03\n", |
| 100 | + " 1.9576577e-03 -5.4296525e-04 2.5505766e-03 1.4563937e-03\n", |
| 101 | + " 1.1214090e-03 3.1200200e-03 3.5230191e-03 4.4931062e-03\n", |
| 102 | + " -5.5389071e-04 1.6268899e-03 -4.6736463e-03 -1.9612674e-04\n", |
| 103 | + " 1.5486709e-03 -3.5581242e-03 1.5163666e-03 2.2859944e-03\n", |
| 104 | + " -3.5728619e-03 -3.5505979e-03 7.8282715e-04 -4.8093311e-03\n", |
| 105 | + " -3.1324120e-03 -3.6213300e-03 -1.4478542e-03 3.4006054e-03\n", |
| 106 | + " 2.2276146e-03 -4.1698264e-03 -3.6997625e-03 -4.1264743e-03\n", |
| 107 | + " -4.9103238e-03 -2.2635974e-03 -3.9036905e-03 3.8846405e-03\n", |
| 108 | + " -7.9726276e-05 -2.0692295e-03 -3.0645117e-04 -3.0288144e-03\n", |
| 109 | + " -3.4682599e-03 -3.1768843e-03 -1.1148058e-03 -2.8012963e-03\n", |
| 110 | + " -6.5973290e-04 -2.3705217e-03 4.3961490e-03 3.2166531e-03\n", |
| 111 | + " 3.6933657e-04 -6.2054797e-04 2.0661615e-04 3.7390803e-04\n", |
| 112 | + " -3.5061471e-03 3.6587315e-03 2.1328868e-03 -2.5964181e-03\n", |
| 113 | + " 4.3381471e-03 4.0168604e-03 1.8054987e-03 -1.2192487e-03\n", |
| 114 | + " 1.5615283e-03 -1.8635839e-03 2.9529419e-03 -3.3825964e-03\n", |
| 115 | + " -3.2592549e-03 -4.7523994e-04 -5.3210353e-04 -9.8173530e-04]\n" |
116 | 116 | ] |
117 | 117 | } |
118 | 118 | ], |
|
133 | 133 | "execution_count": 4, |
134 | 134 | "metadata": { |
135 | 135 | "ExecuteTime": { |
136 | | - "end_time": "2021-04-03T08:59:14.694928Z", |
137 | | - "start_time": "2021-04-03T08:59:14.680319Z" |
| 136 | + "end_time": "2021-04-05T21:26:57.420196Z", |
| 137 | + "start_time": "2021-04-05T21:26:57.417193Z" |
138 | 138 | }, |
139 | 139 | "colab": { |
140 | 140 | "base_uri": "https://localhost:8080/", |
|
149 | 149 | "name": "stdout", |
150 | 150 | "output_type": "stream", |
151 | 151 | "text": [ |
152 | | - "Similarity between eats and bites: -0.13728619\n", |
153 | | - "Similarity between eats and man: -0.19164583\n" |
| 152 | + "Similarity between eats and bites: -0.09852024\n", |
| 153 | + "Similarity between eats and man: -0.17088428\n" |
154 | 154 | ] |
155 | 155 | } |
156 | 156 | ], |
|
175 | 175 | "execution_count": 5, |
176 | 176 | "metadata": { |
177 | 177 | "ExecuteTime": { |
178 | | - "end_time": "2021-04-03T08:59:14.710944Z", |
179 | | - "start_time": "2021-04-03T08:59:14.695930Z" |
| 178 | + "end_time": "2021-04-05T21:26:59.635831Z", |
| 179 | + "start_time": "2021-04-05T21:26:59.621818Z" |
180 | 180 | }, |
181 | 181 | "colab": { |
182 | 182 | "base_uri": "https://localhost:8080/", |
|
190 | 190 | { |
191 | 191 | "data": { |
192 | 192 | "text/plain": [ |
193 | | - "[('man', 0.12813393771648407),\n", |
194 | | - " ('dog', 0.11004816740751266),\n", |
195 | | - " ('food', 0.005883853882551193),\n", |
196 | | - " ('bites', -0.056721072643995285),\n", |
197 | | - " ('eats', -0.09314743429422379)]" |
| 193 | + "[('bites', 0.1353721022605896),\n", |
| 194 | + " ('man', 0.1094527617096901),\n", |
| 195 | + " ('food', -0.02215239405632019),\n", |
| 196 | + " ('dog', -0.1444159597158432),\n", |
| 197 | + " ('eats', -0.16309654712677002)]" |
198 | 198 | ] |
199 | 199 | }, |
200 | 200 | "execution_count": 5, |
|
212 | 212 | "execution_count": 6, |
213 | 213 | "metadata": { |
214 | 214 | "ExecuteTime": { |
215 | | - "end_time": "2021-04-03T08:59:14.726958Z", |
216 | | - "start_time": "2021-04-03T08:59:14.711944Z" |
| 215 | + "end_time": "2021-04-05T21:26:59.855822Z", |
| 216 | + "start_time": "2021-04-05T21:26:59.841810Z" |
217 | 217 | }, |
218 | 218 | "colab": { |
219 | 219 | "base_uri": "https://localhost:8080/", |
|
257 | 257 | "execution_count": 7, |
258 | 258 | "metadata": { |
259 | 259 | "ExecuteTime": { |
260 | | - "end_time": "2021-04-03T08:59:14.742937Z", |
261 | | - "start_time": "2021-04-03T08:59:14.727959Z" |
| 260 | + "end_time": "2021-04-05T21:27:00.517046Z", |
| 261 | + "start_time": "2021-04-05T21:27:00.508038Z" |
262 | 262 | }, |
263 | 263 | "colab": { |
264 | 264 | "base_uri": "https://localhost:8080/", |
|
275 | 275 | "text": [ |
276 | 276 | "Word2Vec(vocab=6, size=100, alpha=0.025)\n", |
277 | 277 | "['dog', 'bites', 'man', 'eats', 'meat', 'food']\n", |
278 | | - "[-1.1890394e-04 -2.7612262e-04 3.0016506e-03 3.3397041e-03\n", |
279 | | - " 2.6973987e-03 2.5381467e-03 -4.4830954e-03 -3.8807455e-03\n", |
280 | | - " -2.7481976e-03 -3.5091466e-03 -1.0663099e-03 3.8600836e-03\n", |
281 | | - " -1.8223912e-03 -1.8985753e-03 2.5068773e-03 5.8603484e-05\n", |
282 | | - " -1.8388843e-03 3.4894156e-03 -1.9410843e-03 1.9663016e-03\n", |
283 | | - " -1.9262581e-04 -1.8321032e-04 4.6618818e-03 2.0332152e-03\n", |
284 | | - " -5.5621512e-04 -5.0049595e-04 4.4950778e-03 -2.3708560e-03\n", |
285 | | - " -4.1250056e-03 -8.1713696e-04 -1.5846886e-03 2.6569276e-03\n", |
286 | | - " -2.4425923e-03 3.3881937e-03 4.8663849e-03 -3.1806210e-03\n", |
287 | | - " 6.0354080e-04 2.6283797e-03 3.2367259e-03 -4.4542220e-03\n", |
288 | | - " -4.3623694e-03 -4.9372590e-03 3.1183651e-03 2.6437298e-03\n", |
289 | | - " -3.1073038e-03 7.5010926e-04 3.5182503e-03 -2.6689377e-03\n", |
290 | | - " 4.2944783e-03 1.2430353e-03 2.1388694e-03 1.5726388e-03\n", |
291 | | - " -3.4201301e-03 -3.5607379e-03 3.4647183e-03 -9.6110179e-04\n", |
292 | | - " -2.5040556e-03 -9.6717122e-04 1.0441509e-03 -3.4992509e-03\n", |
293 | | - " -9.8467432e-04 2.5085383e-03 3.4381317e-03 -8.5586461e-04\n", |
294 | | - " -4.3379996e-04 2.0993554e-03 -3.3381197e-03 3.6710135e-03\n", |
295 | | - " 2.4826424e-03 7.7588746e-04 -3.6549675e-03 2.5771847e-03\n", |
296 | | - " -3.9825556e-03 -6.0248183e-04 -5.7223073e-04 -1.7433831e-03\n", |
297 | | - " -1.0604414e-03 -2.1816064e-03 -4.6085631e-03 2.3315020e-03\n", |
298 | | - " 2.3816996e-03 1.9949675e-03 -4.0842607e-03 -2.8094815e-04\n", |
299 | | - " -4.2685810e-03 -1.3998528e-03 1.7278946e-03 -2.2190765e-03\n", |
300 | | - " -2.3720833e-04 -4.0732473e-03 -5.0638389e-04 -2.4232429e-03\n", |
301 | | - " -1.9645202e-03 -2.8262585e-03 7.5944123e-04 1.1781134e-03\n", |
302 | | - " 4.9539114e-04 -1.1337005e-03 -3.3781745e-03 1.0580849e-03]\n" |
| 278 | + "[-3.1667745e-03 2.5268614e-03 -4.9504861e-03 2.3797194e-03\n", |
| 279 | + " -3.3511904e-03 1.7659335e-03 -9.6838089e-04 3.6862001e-03\n", |
| 280 | + " 3.3760078e-03 -1.1944126e-03 -4.7475514e-03 -4.6677454e-03\n", |
| 281 | + " 4.7231275e-03 2.1875298e-03 4.9989321e-03 -4.7024325e-04\n", |
| 282 | + " 4.6936749e-03 4.5417100e-03 -4.8383311e-03 4.5522186e-03\n", |
| 283 | + " 9.4010920e-04 -2.8778350e-03 -2.3938445e-03 7.6240452e-04\n", |
| 284 | + " 2.8537741e-05 -1.0585956e-03 1.5203804e-03 1.1994856e-04\n", |
| 285 | + " 4.3881699e-03 3.5755127e-04 1.9964906e-03 -3.3893189e-03\n", |
| 286 | + " 2.5362791e-03 -3.8559963e-03 -4.6814438e-03 -1.0485576e-03\n", |
| 287 | + " 1.9576577e-03 -5.4296525e-04 2.5505766e-03 1.4563937e-03\n", |
| 288 | + " 1.1214090e-03 3.1200200e-03 3.5230191e-03 4.4931062e-03\n", |
| 289 | + " -5.5389071e-04 1.6268899e-03 -4.6736463e-03 -1.9612674e-04\n", |
| 290 | + " 1.5486709e-03 -3.5581242e-03 1.5163666e-03 2.2859944e-03\n", |
| 291 | + " -3.5728619e-03 -3.5505979e-03 7.8282715e-04 -4.8093311e-03\n", |
| 292 | + " -3.1324120e-03 -3.6213300e-03 -1.4478542e-03 3.4006054e-03\n", |
| 293 | + " 2.2276146e-03 -4.1698264e-03 -3.6997625e-03 -4.1264743e-03\n", |
| 294 | + " -4.9103238e-03 -2.2635974e-03 -3.9036905e-03 3.8846405e-03\n", |
| 295 | + " -7.9726276e-05 -2.0692295e-03 -3.0645117e-04 -3.0288144e-03\n", |
| 296 | + " -3.4682599e-03 -3.1768843e-03 -1.1148058e-03 -2.8012963e-03\n", |
| 297 | + " -6.5973290e-04 -2.3705217e-03 4.3961490e-03 3.2166531e-03\n", |
| 298 | + " 3.6933657e-04 -6.2054797e-04 2.0661615e-04 3.7390803e-04\n", |
| 299 | + " -3.5061471e-03 3.6587315e-03 2.1328868e-03 -2.5964181e-03\n", |
| 300 | + " 4.3381471e-03 4.0168604e-03 1.8054987e-03 -1.2192487e-03\n", |
| 301 | + " 1.5615283e-03 -1.8635839e-03 2.9529419e-03 -3.3825964e-03\n", |
| 302 | + " -3.2592549e-03 -4.7523994e-04 -5.3210353e-04 -9.8173530e-04]\n" |
303 | 303 | ] |
304 | 304 | } |
305 | 305 | ], |
|
320 | 320 | "execution_count": 8, |
321 | 321 | "metadata": { |
322 | 322 | "ExecuteTime": { |
323 | | - "end_time": "2021-04-03T08:59:14.758953Z", |
324 | | - "start_time": "2021-04-03T08:59:14.743938Z" |
| 323 | + "end_time": "2021-04-05T21:27:02.660747Z", |
| 324 | + "start_time": "2021-04-05T21:27:02.642866Z" |
325 | 325 | }, |
326 | 326 | "colab": { |
327 | 327 | "base_uri": "https://localhost:8080/", |
|
336 | 336 | "name": "stdout", |
337 | 337 | "output_type": "stream", |
338 | 338 | "text": [ |
339 | | - "Similarity between eats and bites: -0.13728109\n", |
340 | | - "Similarity between eats and man: -0.19165389\n" |
| 339 | + "Similarity between eats and bites: -0.09852936\n", |
| 340 | + "Similarity between eats and man: -0.17089055\n" |
341 | 341 | ] |
342 | 342 | } |
343 | 343 | ], |
|
362 | 362 | "execution_count": 9, |
363 | 363 | "metadata": { |
364 | 364 | "ExecuteTime": { |
365 | | - "end_time": "2021-04-03T08:59:14.774064Z", |
366 | | - "start_time": "2021-04-03T08:59:14.759954Z" |
| 365 | + "end_time": "2021-04-05T21:27:03.419546Z", |
| 366 | + "start_time": "2021-04-05T21:27:03.414541Z" |
367 | 367 | }, |
368 | 368 | "colab": { |
369 | 369 | "base_uri": "https://localhost:8080/", |
|
377 | 377 | { |
378 | 378 | "data": { |
379 | 379 | "text/plain": [ |
380 | | - "[('man', 0.12813392281532288),\n", |
381 | | - " ('dog', 0.11004817485809326),\n", |
382 | | - " ('food', 0.005883842706680298),\n", |
383 | | - " ('bites', -0.056721076369285583),\n", |
384 | | - " ('eats', -0.09321994334459305)]" |
| 380 | + "[('bites', 0.1353721022605896),\n", |
| 381 | + " ('man', 0.10945276916027069),\n", |
| 382 | + " ('food', -0.022152386605739594),\n", |
| 383 | + " ('dog', -0.1444159746170044),\n", |
| 384 | + " ('eats', -0.16317100822925568)]" |
385 | 385 | ] |
386 | 386 | }, |
387 | 387 | "execution_count": 9, |
|
399 | 399 | "execution_count": 10, |
400 | 400 | "metadata": { |
401 | 401 | "ExecuteTime": { |
402 | | - "end_time": "2021-04-03T08:59:14.790080Z", |
403 | | - "start_time": "2021-04-03T08:59:14.775066Z" |
| 402 | + "end_time": "2021-04-05T21:27:03.973454Z", |
| 403 | + "start_time": "2021-04-05T21:27:03.950433Z" |
404 | 404 | }, |
405 | 405 | "colab": { |
406 | 406 | "base_uri": "https://localhost:8080/", |
|
448 | 448 | }, |
449 | 449 | { |
450 | 450 | "cell_type": "code", |
451 | | - "execution_count": 11, |
| 451 | + "execution_count": 12, |
452 | 452 | "metadata": { |
453 | 453 | "ExecuteTime": { |
454 | | - "end_time": "2021-04-03T08:59:14.806094Z", |
455 | | - "start_time": "2021-04-03T08:59:14.791080Z" |
| 454 | + "end_time": "2021-04-05T21:27:58.596845Z", |
| 455 | + "start_time": "2021-04-05T21:27:58.585833Z" |
456 | 456 | } |
457 | 457 | }, |
458 | 458 | "outputs": [ |
459 | 459 | { |
460 | 460 | "name": "stdout", |
461 | 461 | "output_type": "stream", |
462 | 462 | "text": [ |
463 | | - "file already exists, skipping download\n" |
| 463 | + "file already exists, skipping download\n", |
| 464 | + "File at: data/en/enwiki-latest-pages-articles-multistream14.xml-p13159683p14324602.bz2\n" |
464 | 465 | ] |
465 | 466 | } |
466 | 467 | ], |
|
483 | 484 | " if token:\n", |
484 | 485 | " params = { 'id' : id, 'confirm' : token }\n", |
485 | 486 | " response = session.get(URL, params = params, stream = True)\n", |
486 | | - " print(len(response.content))\n", |
487 | 487 | "\n", |
488 | 488 | " save_response_content(response, destination) \n", |
489 | 489 | "\n", |
|
505 | 505 | "if not os.path.exists(file_name):\n", |
506 | 506 | " download_file_from_google_drive(file_id, file_name)\n", |
507 | 507 | "else:\n", |
508 | | - " print(\"file already exists, skipping download\")" |
| 508 | + " print(\"file already exists, skipping download\")\n", |
| 509 | + "\n", |
| 510 | + "print(f\"File at: {file_name}\")" |
509 | 511 | ] |
510 | 512 | }, |
511 | 513 | { |
|
549 | 551 | "#if you get a memory error executing the lines above\n", |
550 | 552 | "#comment the lines out and uncomment the lines below. \n", |
551 | 553 | "#loading will be slower, but stable.\n", |
552 | | - "wiki = WikiCorpus(file_name, processes=4, lemmatize=False, dictionary={})\n", |
553 | | - "sentences = list(wiki.get_texts())\n", |
| 554 | + "# wiki = WikiCorpus(file_name, processes=4, lemmatize=False, dictionary={})\n", |
| 555 | + "# sentences = list(wiki.get_texts())\n", |
554 | 556 | "\n", |
555 | 557 | "#if you still get a memory error, try settings processes to 1 or 2 and then run it again." |
556 | 558 | ] |
|
0 commit comments