|
37 | 37 | "metadata": {}, |
38 | 38 | "outputs": [], |
39 | 39 | "source": [ |
40 | | - "#install the required packages\n", |
41 | | - "\n", |
42 | | - "!pip install -q \"labelbox[data]\"\n", |
| 40 | + "!pip install -q \"labelbox\"\n", |
43 | 41 | "!pip install -q transformers" |
44 | 42 | ] |
45 | 43 | }, |
|
56 | 54 | "metadata": {}, |
57 | 55 | "outputs": [], |
58 | 56 | "source": [ |
59 | | - "# import libraries\n", |
60 | | - "\n", |
61 | 57 | "import labelbox as lb\n", |
62 | 58 | "import transformers\n", |
63 | 59 | "transformers.logging.set_verbosity(50)\n", |
|
100 | 96 | "metadata": {}, |
101 | 97 | "outputs": [], |
102 | 98 | "source": [ |
103 | | - "# get images from a Labelbox dataset, those images needs to be available so you may need a token from your cloud provider\n", |
| 99 | + "# Get images from a Labelbox dataset,\n", |
| 100 | + "# Ensure the images are available by obtaining a token from your cloud provider if necessary\n", |
104 | 101 | "DATASET_ID = \"\"" |
105 | 102 | ] |
106 | 103 | }, |
|
126 | 123 | "\tprint(export_task.errors)\n", |
127 | 124 | "export_json = export_task.result\n", |
128 | 125 | "\n", |
129 | | - "data_row_urls = [i['data_row']['row_data'] for i in export_json]" |
| 126 | + "data_row_urls = [dr_url['data_row']['row_data'] for dr_url in export_json]" |
130 | 127 | ] |
131 | 128 | }, |
132 | 129 | { |
|
142 | 139 | "metadata": {}, |
143 | 140 | "outputs": [], |
144 | 141 | "source": [ |
145 | | - "# get ResNet-50 from HuggingFace\n", |
| 142 | + "# Get ResNet-50 from HuggingFace\n", |
146 | 143 | "image_processor = transformers.AutoImageProcessor.from_pretrained(\"microsoft/resnet-50\")\n", |
147 | 144 | "model = transformers.ResNetModel.from_pretrained(\"microsoft/resnet-50\")" |
148 | 145 | ] |
|
160 | 157 | "metadata": {}, |
161 | 158 | "outputs": [], |
162 | 159 | "source": [ |
163 | | - "#create a new embedding in your workspace, use the right dimensions to your use case, here we use 2048 for ResNet-50\n", |
| 160 | + "# Create a new embedding in your workspace, use the right dimensions to your use case, here we use 2048 for ResNet-50\n", |
164 | 161 | "new_custom_embedding_id = client.create_embedding(name=\"My new awesome embedding\", dims=2048).id\n", |
165 | 162 | "\n", |
166 | | - "#or use an existing embedding from your workspace\n", |
167 | | - "#existing_embedding_id = client.get_embedding_by_name(name=\"ResNet img 2048\").id" |
| 163 | + "# Or use an existing embedding from your workspace\n", |
| 164 | + "# existing_embedding_id = client.get_embedding_by_name(name=\"ResNet img 2048\").id" |
168 | 165 | ] |
169 | 166 | }, |
170 | 167 | { |
|
180 | 177 | "metadata": {}, |
181 | 178 | "outputs": [], |
182 | 179 | "source": [ |
183 | | - "data_rows = []\n", |
184 | 180 | "img_emb = []\n", |
185 | 181 | "\n", |
186 | 182 | "for url in tqdm(data_row_urls):\n", |
187 | 183 | " try:\n", |
188 | 184 | " response = requests.get(url, stream=True)\n", |
189 | 185 | " if response.status_code == 200:\n", |
| 186 | + " # Open the image, convert to RGB, and resize to 224x224\n", |
190 | 187 | " image = Image.open(response.raw).convert('RGB').resize((224, 224))\n", |
| 188 | + "\n", |
| 189 | + " # Preprocess the image for model input\n", |
191 | 190 | " img_hf = image_processor(image, return_tensors=\"pt\")\n", |
| 191 | + "\n", |
| 192 | + " # Pass the image through the model to get embeddings\n", |
192 | 193 | " with torch.no_grad():\n", |
193 | 194 | " last_layer = model(**img_hf, output_hidden_states=True).last_hidden_state\n", |
194 | 195 | " resnet_embeddings = F.adaptive_avg_pool2d(last_layer, (1, 1))\n", |
|
199 | 200 | " except Exception as e:\n", |
200 | 201 | " print(f\"Error processing URL: {url}. Exception: {e}\")\n", |
201 | 202 | " continue\n", |
| 203 | + "\n", |
| 204 | + "data_rows = []\n", |
202 | 205 | " \n", |
203 | | - "# create data rows payload to send to a dataset\n", |
| 206 | + "# Create data rows payload to send to a dataset\n", |
204 | 207 | "for url, embedding in tqdm(zip(data_row_urls, img_emb)):\n", |
205 | 208 | " data_rows.append({\n", |
206 | 209 | " \"row_data\": url,\n", |
207 | | - " \"embeddings\": [{\"embedding_id\": existing_embedding_id, \"vector\": embedding[0].tolist()}]\n", |
| 210 | + " \"embeddings\": [{\"embedding_id\": new_custom_embedding_id, \"vector\": embedding[0].tolist()}]\n", |
208 | 211 | " })" |
209 | 212 | ] |
210 | 213 | }, |
|
214 | 217 | "metadata": {}, |
215 | 218 | "outputs": [], |
216 | 219 | "source": [ |
217 | | - "#upload to a new dataset\n", |
| 220 | + "# Upload to a new dataset\n", |
218 | 221 | "dataset = client.create_dataset(name='image_custom_embedding_resnet', iam_integration=None)\n", |
219 | 222 | "task = dataset.create_data_rows(data_rows)\n", |
220 | 223 | "print(task.errors)" |
|