|
26 | 26 | }, |
27 | 27 | { |
28 | 28 | "cell_type": "code", |
29 | | - "execution_count": 0, |
| 29 | + "execution_count": 1, |
30 | 30 | "metadata": { |
31 | 31 | "colab": {}, |
32 | 32 | "colab_type": "code", |
|
53 | 53 | }, |
54 | 54 | { |
55 | 55 | "cell_type": "code", |
56 | | - "execution_count": 0, |
| 56 | + "execution_count": 2, |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [], |
| 59 | + "source": [ |
| 60 | + "path = os.getcwd()\n", |
| 61 | + "path = path + '\\Data'\n", |
| 62 | + "\n", |
| 63 | + "fil = 'sentiment_sentences.txt'" |
| 64 | + ] |
| 65 | + }, |
| 66 | + { |
| 67 | + "cell_type": "code", |
| 68 | + "execution_count": 3, |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [ |
| 71 | + { |
| 72 | + "name": "stdout", |
| 73 | + "output_type": "stream", |
| 74 | + "text": [ |
| 75 | + "File already exists\n" |
| 76 | + ] |
| 77 | + } |
| 78 | + ], |
| 79 | + "source": [ |
| 80 | + "if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n", |
| 81 | + " file = open(os.path.join(path, fil), 'w')\n", |
| 82 | + " file.close()\n", |
| 83 | + " \n", |
| 84 | + " # combined the three files to make sentiment_sentences.txt\n", |
| 85 | + " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", |
| 86 | + "\n", |
| 87 | + " with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n", |
| 88 | + " for fname in filenames:\n", |
| 89 | + " with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n", |
| 90 | + " outfile.write(infile.read())\n", |
| 91 | + " print(\"File created\")\n", |
| 92 | + "else:\n", |
| 93 | + " print(\"File already exists\")" |
| 94 | + ] |
| 95 | + }, |
| 96 | + { |
| 97 | + "cell_type": "code", |
| 98 | + "execution_count": 4, |
57 | 99 | "metadata": { |
58 | 100 | "colab": {}, |
59 | 101 | "colab_type": "code", |
|
66 | 108 | "name": "stdout", |
67 | 109 | "output_type": "stream", |
68 | 110 | "text": [ |
69 | | - "CPU times: user 1min 36s, sys: 4.23 s, total: 1min 40s\n", |
70 | | - "Wall time: 1min 40s\n", |
| 111 | + "Wall time: 15.7 s\n", |
71 | 112 | "done loading Word2Vec\n" |
72 | 113 | ] |
73 | 114 | } |
74 | 115 | ], |
75 | 116 | "source": [ |
76 | 117 | "#Load the pre-trained word2vec model and the dataset\n", |
77 | | - "data_path= \"DATAPATH\"\n", |
78 | | - "path_to_model = os.path.join(data_path,'GoogleNews-vectors-negative300.bin')\n", |
79 | | - "training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", |
| 118 | + "try:\n", |
| 119 | + " from google.colab import files\n", |
| 120 | + " data_path= \"DATAPATH\" \n", |
| 121 | + " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", |
| 122 | + " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", |
| 123 | + "except ModuleNotFoundError:\n", |
| 124 | + " data_path= \"Data\" \n", |
| 125 | + " \n", |
| 126 | + " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", |
| 127 | + " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", |
| 128 | + " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", |
| 129 | + " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", |
| 130 | + " else:\n", |
| 131 | + " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", |
| 132 | + " \n", |
| 133 | + " else:\n", |
| 134 | + " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", |
| 135 | + " \n", |
| 136 | + " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", |
| 137 | + "\n", |
| 138 | + "\n", |
80 | 139 | "\n", |
81 | 140 | "#Load W2V model. This will take some time. \n", |
82 | 141 | "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", |
|
90 | 149 | "for line in fh:\n", |
91 | 150 | " text, sentiment = line.split(\"\\t\")\n", |
92 | 151 | " texts.append(text)\n", |
93 | | - " cats.append(sentiment)\n" |
| 152 | + " cats.append(sentiment)" |
94 | 153 | ] |
95 | 154 | }, |
96 | 155 | { |
97 | 156 | "cell_type": "code", |
98 | | - "execution_count": 0, |
| 157 | + "execution_count": 5, |
99 | 158 | "metadata": { |
100 | 159 | "colab": {}, |
101 | 160 | "colab_type": "code", |
|
120 | 179 | }, |
121 | 180 | { |
122 | 181 | "cell_type": "code", |
123 | | - "execution_count": 0, |
| 182 | + "execution_count": 6, |
124 | 183 | "metadata": { |
125 | 184 | "colab": {}, |
126 | 185 | "colab_type": "code", |
|
148 | 207 | }, |
149 | 208 | { |
150 | 209 | "cell_type": "code", |
151 | | - "execution_count": 0, |
| 210 | + "execution_count": 7, |
152 | 211 | "metadata": { |
153 | 212 | "colab": {}, |
154 | 213 | "colab_type": "code", |
|
186 | 245 | }, |
187 | 246 | { |
188 | 247 | "cell_type": "code", |
189 | | - "execution_count": 0, |
| 248 | + "execution_count": 8, |
190 | 249 | "metadata": { |
191 | 250 | "colab": {}, |
192 | 251 | "colab_type": "code", |
|
215 | 274 | " if token in w2v_model:\n", |
216 | 275 | " feat_for_this += w2v_model[token]\n", |
217 | 276 | " count_for_this +=1\n", |
218 | | - " feats.append(feat_for_this/count_for_this) \n", |
| 277 | + " if(count_for_this!=0):\n", |
| 278 | + " feats.append(feat_for_this/count_for_this) \n", |
| 279 | + " else:\n", |
| 280 | + " feats.append(zero_vector)\n", |
219 | 281 | " return feats\n", |
220 | 282 | "\n", |
221 | 283 | "\n", |
222 | 284 | "train_vectors = embedding_feats(texts_processed)\n", |
223 | | - "print(len(train_vectors))\n" |
| 285 | + "print(len(train_vectors))" |
224 | 286 | ] |
225 | 287 | }, |
226 | 288 | { |
227 | 289 | "cell_type": "code", |
228 | | - "execution_count": 0, |
| 290 | + "execution_count": 9, |
229 | 291 | "metadata": { |
230 | 292 | "colab": {}, |
231 | 293 | "colab_type": "code", |
|
237 | 299 | "name": "stdout", |
238 | 300 | "output_type": "stream", |
239 | 301 | "text": [ |
240 | | - "Accuracy: 0.812\n", |
241 | | - " precision recall f1-score support\n", |
| 302 | + "Accuracy: 0.8173333333333334\n", |
| 303 | + " precision recall f1-score support\n", |
242 | 304 | "\n", |
243 | | - " 0\n", |
244 | | - " 0.82 0.80 0.81 374\n", |
245 | | - " 1\n", |
246 | | - " 0.80 0.83 0.82 376\n", |
| 305 | + " 0\n", |
| 306 | + " 0.79 0.82 0.81 350\n", |
| 307 | + " 1\n", |
| 308 | + " 0.84 0.81 0.83 400\n", |
247 | 309 | "\n", |
248 | | - "avg / total 0.81 0.81 0.81 750\n", |
| 310 | + " accuracy 0.82 750\n", |
| 311 | + " macro avg 0.82 0.82 0.82 750\n", |
| 312 | + "weighted avg 0.82 0.82 0.82 750\n", |
249 | 313 | "\n" |
250 | 314 | ] |
251 | 315 | } |
|
269 | 333 | "source": [ |
270 | 334 | "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!" |
271 | 335 | ] |
272 | | - }, |
273 | | - { |
274 | | - "cell_type": "code", |
275 | | - "execution_count": 0, |
276 | | - "metadata": { |
277 | | - "colab": {}, |
278 | | - "colab_type": "code", |
279 | | - "id": "8z3tJJJkb_JB" |
280 | | - }, |
281 | | - "outputs": [], |
282 | | - "source": [] |
283 | 336 | } |
284 | 337 | ], |
285 | 338 | "metadata": { |
|
302 | 355 | "name": "python", |
303 | 356 | "nbconvert_exporter": "python", |
304 | 357 | "pygments_lexer": "ipython3", |
305 | | - "version": "3.6.10" |
| 358 | + "version": "3.7.0" |
306 | 359 | } |
307 | 360 | }, |
308 | 361 | "nbformat": 4, |
|