|
37 | 37 | "metadata": {}, |
38 | 38 | "outputs": [], |
39 | 39 | "source": [ |
40 | | - "#install the required packages\n", |
41 | | - "\n", |
42 | | - "!pip install -q \"labelbox[data]\"\n", |
| 40 | + "!pip install -q \"labelbox\"\n", |
43 | 41 | "!pip install -q transformers" |
44 | 42 | ] |
45 | 43 | }, |
|
56 | 54 | "metadata": {}, |
57 | 55 | "outputs": [], |
58 | 56 | "source": [ |
59 | | - "# import libraries\n", |
60 | | - "\n", |
61 | 57 | "import labelbox as lb\n", |
62 | 58 | "import transformers\n", |
63 | 59 | "transformers.logging.set_verbosity(50)\n", |
|
100 | 96 | "metadata": {}, |
101 | 97 | "outputs": [], |
102 | 98 | "source": [ |
103 | | - "# get images from a Labelbox dataset, those images needs to be available so you may need a token from your cloud provider\n", |
| 99 | + "# Get images from a Labelbox dataset,\n", |
| 100 | + "# Ensure the images are available by obtaining a token from your cloud provider if necessary\n", |
104 | 101 | "DATASET_ID = \"\"" |
105 | 102 | ] |
106 | 103 | }, |
|
126 | 123 | "\tprint(export_task.errors)\n", |
127 | 124 | "export_json = export_task.result\n", |
128 | 125 | "\n", |
129 | | - "data_row_urls = [i['data_row']['row_data'] for i in export_json]" |
| 126 | + "data_row_urls = [dr_url['data_row']['row_data'] for dr_url in export_json]" |
130 | 127 | ] |
131 | 128 | }, |
132 | 129 | { |
|
142 | 139 | "metadata": {}, |
143 | 140 | "outputs": [], |
144 | 141 | "source": [ |
145 | | - "# get ResNet-50 from HuggingFace\n", |
| 142 | + "# Get ResNet-50 from HuggingFace\n", |
146 | 143 | "image_processor = transformers.AutoImageProcessor.from_pretrained(\"microsoft/resnet-50\")\n", |
147 | 144 | "model = transformers.ResNetModel.from_pretrained(\"microsoft/resnet-50\")" |
148 | 145 | ] |
|
160 | 157 | "metadata": {}, |
161 | 158 | "outputs": [], |
162 | 159 | "source": [ |
163 | | - "#create a new embedding in your workspace, use the right dimensions to your use case, here we use 2048 for ResNet-50\n", |
| 160 | + "# Create a new embedding in your workspace, use the right dimensions to your use case, here we use 2048 for ResNet-50\n", |
164 | 161 | "new_custom_embedding_id = client.create_embedding(name=\"My new awesome embedding\", dims=2048).id\n", |
165 | 162 | "\n", |
166 | | - "#or use an existing embedding from your workspace\n", |
167 | | - "#existing_embedding_id = client.get_embedding_by_name(name=\"ResNet img 2048\").id" |
| 163 | + "# Or use an existing embedding from your workspace\n", |
| 164 | + "# existing_embedding_id = client.get_embedding_by_name(name=\"ResNet img 2048\").id" |
168 | 165 | ] |
169 | 166 | }, |
170 | 167 | { |
|
180 | 177 | "metadata": {}, |
181 | 178 | "outputs": [], |
182 | 179 | "source": [ |
183 | | - "data_rows = []\n", |
184 | 180 | "img_emb = []\n", |
185 | 181 | "\n", |
186 | 182 | "for url in tqdm(data_row_urls):\n", |
187 | 183 | " try:\n", |
188 | 184 | " response = requests.get(url, stream=True)\n", |
189 | 185 | " if response.status_code == 200:\n", |
| 186 | + " # Open the image, convert to RGB, and resize to 224x224\n", |
190 | 187 | " image = Image.open(response.raw).convert('RGB').resize((224, 224))\n", |
| 188 | + "\n", |
| 189 | + " # Preprocess the image for model input\n", |
191 | 190 | " img_hf = image_processor(image, return_tensors=\"pt\")\n", |
| 191 | + "\n", |
| 192 | + " # Pass the image through the model to get embeddings\n", |
192 | 193 | " with torch.no_grad():\n", |
193 | 194 | " last_layer = model(**img_hf, output_hidden_states=True).last_hidden_state\n", |
194 | 195 | " resnet_embeddings = F.adaptive_avg_pool2d(last_layer, (1, 1))\n", |
|
199 | 200 | " except Exception as e:\n", |
200 | 201 | " print(f\"Error processing URL: {url}. Exception: {e}\")\n", |
201 | 202 | " continue\n", |
| 203 | + "\n", |
| 204 | + "data_rows = []\n", |
202 | 205 | " \n", |
203 | | - "# create data rows payload to send to a dataset\n", |
| 206 | + "# Create data rows payload to send to a dataset\n", |
204 | 207 | "for url, embedding in tqdm(zip(data_row_urls, img_emb)):\n", |
205 | 208 | " data_rows.append({\n", |
206 | 209 | " \"row_data\": url,\n", |
207 | | - " \"embeddings\": [{\"embedding_id\": existing_embedding_id, \"vector\": embedding[0].tolist()}]\n", |
| 210 | + " \"embeddings\": [{\"embedding_id\": new_custom_embedding_id, \"vector\": embedding[0].tolist()}]\n", |
208 | 211 | " })" |
209 | 212 | ] |
210 | 213 | }, |
|
214 | 217 | "metadata": {}, |
215 | 218 | "outputs": [], |
216 | 219 | "source": [ |
217 | | - "#upload to a new dataset\n", |
| 220 | + "# Upload to a new dataset\n", |
218 | 221 | "dataset = client.create_dataset(name='image_custom_embedding_resnet', iam_integration=None)\n", |
219 | 222 | "task = dataset.create_data_rows(data_rows)\n", |
220 | 223 | "print(task.errors)" |
|