|
39 | 39 | { |
40 | 40 | "metadata": {}, |
41 | 41 | "source": [ |
42 | | - "# Setup" |
| 42 | + "# Set up " |
43 | 43 | ], |
44 | 44 | "cell_type": "markdown" |
45 | 45 | }, |
46 | 46 | { |
47 | 47 | "metadata": {}, |
48 | 48 | "source": [ |
49 | | - "!pip3 install -q \"labelbox\"" |
| 49 | + "!pip3 install -q \"labelbox[data]\"" |
50 | 50 | ], |
51 | 51 | "cell_type": "code", |
52 | 52 | "outputs": [], |
|
57 | 57 | "source": [ |
58 | 58 | "import labelbox as lb\n", |
59 | 59 | "import numpy as np\n", |
60 | | - "import json" |
| 60 | + "import json\n", |
| 61 | + "import uuid\n", |
| 62 | + "import random" |
61 | 63 | ], |
62 | 64 | "cell_type": "code", |
63 | 65 | "outputs": [], |
64 | 66 | "execution_count": null |
65 | 67 | }, |
| 68 | + { |
| 69 | + "metadata": {}, |
| 70 | + "source": [ |
| 71 | + "# Replace with your API key" |
| 72 | + ], |
| 73 | + "cell_type": "markdown" |
| 74 | + }, |
66 | 75 | { |
67 | 76 | "metadata": {}, |
68 | 77 | "source": [ |
|
76 | 85 | { |
77 | 86 | "metadata": {}, |
78 | 87 | "source": [ |
79 | | - "# Select data rows in Labelbox for custom embeddings" |
| 88 | + "# Select data rows" |
80 | 89 | ], |
81 | 90 | "cell_type": "markdown" |
82 | 91 | }, |
83 | 92 | { |
84 | 93 | "metadata": {}, |
85 | 94 | "source": [ |
86 | | - "client.enable_experimental = True\n", |
87 | | - "\n", |
88 | | - "# get images from a Labelbox dataset\n", |
89 | | - "# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n", |
90 | | - "dataset = client.get_dataset(\"<DATASET-ID>\")\n", |
91 | | - "\n", |
| 95 | + "- Get images from a Labelbox dataset\n", |
| 96 | + "- To improve similarity search, you need to upload custom embeddings to at least 1,000 data rows.\n" |
| 97 | + ], |
| 98 | + "cell_type": "markdown" |
| 99 | + }, |
| 100 | + { |
| 101 | + "metadata": {}, |
| 102 | + "source": [ |
| 103 | + "DATASET_ID = \"\"" |
| 104 | + ], |
| 105 | + "cell_type": "code", |
| 106 | + "outputs": [], |
| 107 | + "execution_count": null |
| 108 | + }, |
| 109 | + { |
| 110 | + "metadata": {}, |
| 111 | + "source": [ |
| 112 | + "dataset = client.get_dataset(dataset_id=DATASET_ID)\n", |
92 | 113 | "export_task = dataset.export()\n", |
93 | 114 | "export_task.wait_till_done()" |
94 | 115 | ], |
|
124 | 145 | { |
125 | 146 | "metadata": {}, |
126 | 147 | "source": [ |
127 | | - "data_row_ids = [dr[\"data_row\"][\"id\"] for dr in data_rows]\n", |
128 | | - "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo" |
| 148 | + "data_row_dict = [{\"data_row_id\": dr[\"data_row\"][\"id\"], \"row_data\": dr[\"data_row\"][\"row_data\"]} for dr in data_rows]\n", |
| 149 | + "data_row_dict = data_row_dict[:1000] # keep the first 1000 examples for the sake of this demo" |
129 | 150 | ], |
130 | 151 | "cell_type": "code", |
131 | 152 | "outputs": [], |
|
134 | 155 | { |
135 | 156 | "metadata": {}, |
136 | 157 | "source": [ |
137 | | - "# Create the payload for custom embeddings\n", |
138 | | - "-- It should be a .ndjson file. \n", |
139 | | - "-- Every line is a json file that finishes with a \\n character. \n", |
140 | | - "-- It does not have to be created through Python. " |
| 158 | + "# Create custom embedding payload " |
141 | 159 | ], |
142 | 160 | "cell_type": "markdown" |
143 | 161 | }, |
144 | 162 | { |
145 | 163 | "metadata": {}, |
146 | 164 | "source": [ |
147 | | - "nb_data_rows = len(data_row_ids)\n", |
| 165 | + "Generate random vectors to use as embeddings (maximum: 2048 dimensions)" |
| 166 | + ], |
| 167 | + "cell_type": "markdown" |
| 168 | + }, |
| 169 | + { |
| 170 | + "metadata": {}, |
| 171 | + "source": [ |
| 172 | + "nb_data_rows = len(data_row_dict)\n", |
148 | 173 | "print(\"Number of data rows: \", nb_data_rows)\n", |
149 | | - "# Generate random vectors, of dimension 2048 each\n", |
150 | 174 | "# Labelbox supports custom embedding vectors of dimension up to 2048\n", |
151 | 175 | "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]" |
152 | 176 | ], |
|
157 | 181 | { |
158 | 182 | "metadata": {}, |
159 | 183 | "source": [ |
160 | | - "# Create the payload for custom embeddings\n", |
161 | | - "payload = []\n", |
162 | | - "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n", |
163 | | - " payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n", |
164 | | - "\n", |
165 | | - "print('payload', len(payload),payload[:1])" |
| 184 | + "List all custom embeddings available in your Labelbox workspace" |
| 185 | + ], |
| 186 | + "cell_type": "markdown" |
| 187 | + }, |
| 188 | + { |
| 189 | + "metadata": {}, |
| 190 | + "source": [ |
| 191 | + "embeddings = client.get_embeddings()" |
166 | 192 | ], |
167 | 193 | "cell_type": "code", |
168 | 194 | "outputs": [], |
|
171 | 197 | { |
172 | 198 | "metadata": {}, |
173 | 199 | "source": [ |
174 | | - "# Delete any pre-existing file\n", |
175 | | - "import os\n", |
176 | | - "if os.path.exists(\"payload.ndjson\"):\n", |
177 | | - " os.remove(\"payload.ndjson\")\n", |
178 | | - "\n", |
179 | | - "# Convert the payload to a JSON file\n", |
180 | | - "with open('payload.ndjson', 'w') as f:\n", |
181 | | - " for p in payload:\n", |
182 | | - " f.write(json.dumps(p) + \"\\n\")\n", |
183 | | - " # sanity_check_payload = json.dump(payload, f)" |
| 200 | + "Choose an existing embedding type or create a new one" |
| 201 | + ], |
| 202 | + "cell_type": "markdown" |
| 203 | + }, |
| 204 | + { |
| 205 | + "metadata": {}, |
| 206 | + "source": [ |
| 207 | + "# Name of the custom embedding must be unique\n", |
| 208 | + "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)" |
184 | 209 | ], |
185 | 210 | "cell_type": "code", |
186 | 211 | "outputs": [], |
|
189 | 214 | { |
190 | 215 | "metadata": {}, |
191 | 216 | "source": [ |
192 | | - "# Sanity check that you can read/load the file and the payload is correct\n", |
193 | | - "with open('payload.ndjson') as f:\n", |
194 | | - " sanity_check_payload = [json.loads(l) for l in f.readlines()]\n", |
195 | | - "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))" |
| 217 | + "Create payload" |
| 218 | + ], |
| 219 | + "cell_type": "markdown" |
| 220 | + }, |
| 221 | + { |
| 222 | + "metadata": {}, |
| 223 | + "source": [ |
| 224 | + "- The payload should include all the data you want to retain on each data row, along with the new embedding vectors.\n", |
| 225 | + "- `row_data` and `key` are required when using `dataset.upsert_data_rows()`." |
| 226 | + ], |
| 227 | + "cell_type": "markdown" |
| 228 | + }, |
| 229 | + { |
| 230 | + "metadata": {}, |
| 231 | + "source": [ |
| 232 | + "payload = []\n", |
| 233 | + "for data_row_dict, custom_embedding in zip(data_row_dict,custom_embeddings):\n", |
| 234 | + " payload.append({\"key\": lb.UniqueId(data_row_dict['data_row_id']), \"row_data\": data_row_dict['row_data'], \"embeddings\": [{\"embedding_id\": embedding.id, \"vector\": custom_embedding}]})\n", |
| 235 | + "\n", |
| 236 | + "print('payload', len(payload),payload[:1])" |
196 | 237 | ], |
197 | 238 | "cell_type": "code", |
198 | 239 | "outputs": [], |
|
201 | 242 | { |
202 | 243 | "metadata": {}, |
203 | 244 | "source": [ |
204 | | - "# See all custom embeddings available in your Labelbox workspace\n", |
205 | | - "embeddings = client.get_embeddings()" |
| 245 | + "# Upload payload" |
| 246 | + ], |
| 247 | + "cell_type": "markdown" |
| 248 | + }, |
| 249 | + { |
| 250 | + "metadata": {}, |
| 251 | + "source": [ |
| 252 | + "Upsert data rows with custom embeddings" |
| 253 | + ], |
| 254 | + "cell_type": "markdown" |
| 255 | + }, |
| 256 | + { |
| 257 | + "metadata": {}, |
| 258 | + "source": [ |
| 259 | + "task = dataset.upsert_data_rows(payload)\n", |
| 260 | + "task.wait_till_done()\n", |
| 261 | + "print(task.errors)\n", |
| 262 | + "print(task.status)" |
206 | 263 | ], |
207 | 264 | "cell_type": "code", |
208 | 265 | "outputs": [], |
|
211 | 268 | { |
212 | 269 | "metadata": {}, |
213 | 270 | "source": [ |
214 | | - "# Create a new custom embedding, unless you want to re-use one\n", |
215 | | - "# Name of the custom embedding must be unique\n", |
216 | | - "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)" |
| 271 | + "Get the count of imported vectors for a custom embedding" |
| 272 | + ], |
| 273 | + "cell_type": "markdown" |
| 274 | + }, |
| 275 | + { |
| 276 | + "metadata": {}, |
| 277 | + "source": [ |
| 278 | + "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n", |
| 279 | + "count = embedding.get_imported_vector_count()\n", |
| 280 | + "print(count)" |
217 | 281 | ], |
218 | 282 | "cell_type": "code", |
219 | 283 | "outputs": [], |
|
222 | 286 | { |
223 | 287 | "metadata": {}, |
224 | 288 | "source": [ |
225 | | - "# Delete a custom embedding\n", |
| 289 | + "Delete custom embedding type" |
| 290 | + ], |
| 291 | + "cell_type": "markdown" |
| 292 | + }, |
| 293 | + { |
| 294 | + "metadata": {}, |
| 295 | + "source": [ |
226 | 296 | "#embedding.delete()" |
227 | 297 | ], |
228 | 298 | "cell_type": "code", |
|
232 | 302 | { |
233 | 303 | "metadata": {}, |
234 | 304 | "source": [ |
235 | | - "# Upload the payload to Labelbox" |
| 305 | + "# Upload custom embeddings during data row creation" |
| 306 | + ], |
| 307 | + "cell_type": "markdown" |
| 308 | + }, |
| 309 | + { |
| 310 | + "metadata": {}, |
| 311 | + "source": [ |
| 312 | + "Create a dataset" |
236 | 313 | ], |
237 | 314 | "cell_type": "markdown" |
238 | 315 | }, |
239 | 316 | { |
240 | 317 | "metadata": {}, |
241 | 318 | "source": [ |
242 | | - "# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id\n", |
243 | | - "embedding.import_vectors_from_file(\"./payload.ndjson\")" |
| 319 | + "# Create a dataset\n", |
| 320 | + "dataset_new = client.create_dataset(name=\"data_rows_with_embeddings\")" |
244 | 321 | ], |
245 | 322 | "cell_type": "code", |
246 | 323 | "outputs": [], |
|
249 | 326 | { |
250 | 327 | "metadata": {}, |
251 | 328 | "source": [ |
252 | | - "# Get the count of imported vectors for a custom embedding" |
| 329 | + "Fetch an embedding (2048 dimensions)" |
253 | 330 | ], |
254 | 331 | "cell_type": "markdown" |
255 | 332 | }, |
256 | 333 | { |
257 | 334 | "metadata": {}, |
258 | 335 | "source": [ |
259 | | - "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n", |
260 | | - "count = embedding.get_imported_vector_count()" |
| 336 | + "embedding = client.get_embedding_by_name(\"my_custom_embedding_2048_dimensions\")\n", |
| 337 | + "vector = [random.uniform(1.0, 2.0) for _ in range(embedding.dims)]" |
| 338 | + ], |
| 339 | + "cell_type": "code", |
| 340 | + "outputs": [], |
| 341 | + "execution_count": null |
| 342 | + }, |
| 343 | + { |
| 344 | + "metadata": {}, |
| 345 | + "source": [ |
| 346 | + "Upload data rows with embeddings" |
| 347 | + ], |
| 348 | + "cell_type": "markdown" |
| 349 | + }, |
| 350 | + { |
| 351 | + "metadata": {}, |
| 352 | + "source": [ |
| 353 | + "\n", |
| 354 | + "uploads = []\n", |
| 355 | + "# Generate data rows\n", |
| 356 | + "for i in range(1,9):\n", |
| 357 | + " uploads.append({\n", |
| 358 | + " \"row_data\": f\"https://storage.googleapis.com/labelbox-datasets/People_Clothing_Segmentation/jpeg_images/IMAGES/img_000{i}.jpeg\",\n", |
| 359 | + " \"global_key\": \"TEST-ID-%id\" % uuid.uuid1(),\n", |
| 360 | + " \"embeddings\": [{\n", |
| 361 | + " \"embedding_id\": embedding.id,\n", |
| 362 | + " \"vector\": vector\n", |
| 363 | + " }]\n", |
| 364 | + " })\n", |
| 365 | + "\n", |
| 366 | + "task1 = dataset_new.create_data_rows(uploads)\n", |
| 367 | + "task1.wait_till_done()\n", |
| 368 | + "print(\"ERRORS: \" , task1.errors)\n", |
| 369 | + "print(\"RESULTS:\" , task1.result)" |
261 | 370 | ], |
262 | 371 | "cell_type": "code", |
263 | 372 | "outputs": [], |
|
0 commit comments