Commit 59df15c

Fixed some errors and modified a path

Created 'sentiment_sentences.txt' by merging three labelled-sentence files. Modified the path for loading the Google News vectors file. Added a condition for appending a zero vector when a text has no in-vocabulary tokens.
1 parent 2109c9c commit 59df15c

File tree

1 file changed: +87 -34 lines changed


Ch4/03_Word2Vec_Example.ipynb

Lines changed: 87 additions & 34 deletions
@@ -26,7 +26,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 1,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -53,7 +53,49 @@
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = os.getcwd()\n",
+ "path = path + '\\Data'\n",
+ "\n",
+ "fil = 'sentiment_sentences.txt'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File already exists\n"
+ ]
+ }
+ ],
+ "source": [
+ "if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n",
+ "    file = open(os.path.join(path, fil), 'w')\n",
+ "    file.close()\n",
+ "    \n",
+ "    # combined the three files to make sentiment_sentences.txt\n",
+ "    filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n",
+ "\n",
+ "    with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n",
+ "        for fname in filenames:\n",
+ "            with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n",
+ "                outfile.write(infile.read())\n",
+ "    print(\"File created\")\n",
+ "else:\n",
+ "    print(\"File already exists\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -66,17 +108,34 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "CPU times: user 1min 36s, sys: 4.23 s, total: 1min 40s\n",
- "Wall time: 1min 40s\n",
+ "Wall time: 15.7 s\n",
  "done loading Word2Vec\n"
  ]
  }
  ],
  "source": [
  "#Load the pre-trained word2vec model and the dataset\n",
- "data_path= \"DATAPATH\"\n",
- "path_to_model = os.path.join(data_path,'GoogleNews-vectors-negative300.bin')\n",
- "training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n",
+ "try:\n",
+ "    from google.colab import files\n",
+ "    data_path= \"DATAPATH\" \n",
+ "    path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n",
+ "    training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n",
+ "except ModuleNotFoundError:\n",
+ "    data_path= \"Data\" \n",
+ "    \n",
+ "    if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n",
+ "        if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n",
+ "            wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n",
+ "            path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
+ "        else:\n",
+ "            path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n",
+ "        \n",
+ "    else:\n",
+ "        path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n",
+ "    \n",
+ "    training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n",
+ "\n",
+ "\n",
  "\n",
  "#Load W2V model. This will take some time. \n",
  "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n",
@@ -90,12 +149,12 @@
  "for line in fh:\n",
  "    text, sentiment = line.split(\"\\t\")\n",
  "    texts.append(text)\n",
- "    cats.append(sentiment)\n"
+ "    cats.append(sentiment)"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 5,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -120,7 +179,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 6,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -148,7 +207,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 7,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -186,7 +245,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 8,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -215,17 +274,20 @@
  "            if token in w2v_model:\n",
  "                feat_for_this += w2v_model[token]\n",
  "                count_for_this +=1\n",
- "        feats.append(feat_for_this/count_for_this) \n",
+ "        if(count_for_this!=0):\n",
+ "            feats.append(feat_for_this/count_for_this) \n",
+ "        else:\n",
+ "            feats.append(zero_vector)\n",
  "    return feats\n",
  "\n",
  "\n",
  "train_vectors = embedding_feats(texts_processed)\n",
- "print(len(train_vectors))\n"
+ "print(len(train_vectors))"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 0,
+ "execution_count": 9,
  "metadata": {
  "colab": {},
  "colab_type": "code",
@@ -237,15 +299,17 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Accuracy: 0.812\n",
- "             precision    recall  f1-score   support\n",
+ "Accuracy: 0.8173333333333334\n",
+ "              precision    recall  f1-score   support\n",
  "\n",
- "          0\n",
- "       0.82      0.80      0.81       374\n",
- "          1\n",
- "       0.80      0.83      0.82       376\n",
+ "           0\n",
+ "       0.79      0.82      0.81       350\n",
+ "           1\n",
+ "       0.84      0.81      0.83       400\n",
  "\n",
- "avg / total       0.81      0.81      0.81       750\n",
+ "    accuracy                           0.82       750\n",
+ "   macro avg       0.82      0.82      0.82       750\n",
+ "weighted avg       0.82      0.82      0.82       750\n",
  "\n"
  ]
  }
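The new numbers reflect scikit-learn's current classification_report layout, which adds accuracy, macro avg, and weighted avg rows. A hedged sketch of an evaluation step that would produce output of this shape; the logistic-regression classifier, the default 75/25 split, and the random_state are illustrative assumptions, not taken from the diff:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Default test_size=0.25 gives a 750-example test set for a 3,000-sentence corpus.
train_data, test_data, train_cats, test_cats = train_test_split(
    train_vectors, cats, random_state=1234
)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_data, train_cats)
preds = classifier.predict(test_data)

print("Accuracy:", accuracy_score(test_cats, preds))
print(classification_report(test_cats, preds))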
@@ -269,17 +333,6 @@
  "source": [
  "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!"
  ]
- },
- {
- "cell_type": "code",
- "execution_count": 0,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "8z3tJJJkb_JB"
- },
- "outputs": [],
- "source": []
  }
  ],
  "metadata": {
@@ -302,7 +355,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.6.10"
+ "version": "3.7.0"
  }
  },
  "nbformat": 4,
